changed ImageHandsOn from "gaussian" to "heNormal" initialization, and also most layer defaults in CNTK.core.bs

Frank Seide 2016-08-19 23:34:17 -07:00
Parent e343e71db7
Commit db74d6b468
3 changed files with 19 additions and 21 deletions

View file

@@ -30,7 +30,7 @@
# LinearLayer -- create a fully-connected linear projection layer
# Note: outDim may describe a tensor as well.
-LinearLayer {outDim, bias = true, init='uniform', initValueScale=1, inputRank=0} =
+LinearLayer {outDim, bias = true, init='heNormal', initValueScale=1, inputRank=0} =
{
W = ParameterTensor {_ConcatArrays (outDim, Inferred), init=init, initValueScale=initValueScale}
b = ParameterTensor {outDim, initValue=0}
@@ -42,7 +42,7 @@ LinearLayer {outDim, bias = true, init='uniform', initValueScale=1, inputRank=0}
}.apply
# DenseLayer -- create a fully-connected layer with optional non-linearity
-DenseLayer{outDim, bias = true, activation=(x=>x), init='uniform', initValueScale=1, inputRank=0} = Sequential ( LinearLayer{outDim, bias=bias, init=init, initValueScale=initValueScale, inputRank=inputRank} : activation )
+DenseLayer{outDim, bias = true, activation=(x=>x), init='heNormal', initValueScale=1, inputRank=0} = Sequential ( LinearLayer{outDim, bias=bias, init=init, initValueScale=initValueScale, inputRank=inputRank} : activation )
# EmbeddingLayer -- create a linear embedding layer
EmbeddingLayer {outDim, # dimension of embedding
@@ -65,7 +65,7 @@ ConvolutionalLayer {numOutputChannels, # e.g. (1) or BS.Constants.None
filterShape, # e.g. (3:3)
bias = true,
activation = (x=>x),
init = "uniform",
init = "heNormal",
initValueScale = 1, # TODO: rename to initScale
#reductionRank = 1, # TODO: support this
stride = 1, pad = false,
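
[Editor's note] The new default, "heNormal", is He initialization (He et al., 2015): each weight is drawn from a zero-mean Gaussian with standard deviation sqrt(2/fanIn), which keeps activation variance roughly constant through ReLU layers, whereas the old uniform default needed a hand-tuned initValueScale per layer. Below is a minimal C++ sketch of the sampling rule; treating initValueScale as a plain multiplier on the standard deviation is an assumption of mine, since the diff does not show how CNTK combines the two internally.

#include <cmath>
#include <cstddef>
#include <random>
#include <vector>

// He-normal initialization: w ~ N(0, scale * sqrt(2 / fanIn)).
// 'scale' stands in for initValueScale (an assumption; this diff does
// not show the exact formula CNTK applies internally).
std::vector<float> HeNormalInit(std::size_t fanIn, std::size_t fanOut, float scale = 1.0f)
{
    std::mt19937 rng(std::random_device{}());
    std::normal_distribution<float> dist(0.0f, scale * std::sqrt(2.0f / fanIn));
    std::vector<float> weights(fanIn * fanOut);
    for (float& w : weights)
        w = dist(rng);
    return weights;
}

// Example: a 5x5 convolution kernel over 32 input channels, 64 output maps:
//   auto W = HeNormalInit(5 * 5 * 32, 64);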
@@ -346,11 +346,11 @@ CNTK2 = [
Tanh(_, tag='') = new ComputationNode [ operation = 'Tanh' ; inputs = _ /*plus the function args*/ ]
// 6. Reductions
-ReduceSum (_, axis=None, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; axis = if BS.Constants.IsNone (axis) then 0 else axis ; reductionOp = "Sum" /*plus the function args*/ ]
-ReduceLogSum(_, axis=None, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; axis = if BS.Constants.IsNone (axis) then 0 else axis ; reductionOp = "LogSum" /*plus the function args*/ ]
-ReduceMin (_, axis=None, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; axis = if BS.Constants.IsNone (axis) then 0 else axis ; reductionOp = "Min" /*plus the function args*/ ]
-ReduceMax (_, axis=None, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; axis = if BS.Constants.IsNone (axis) then 0 else axis ; reductionOp = "Max" /*plus the function args*/ ]
-#ReduceMean (_, axis=None, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; axis = if BS.Constants.IsNone (axis) then 0 else axis ; reductionOp = "Mean" /*plus the function args*/ ]
+ReduceSum (_, axis=None, tag='') = { axis1 = if BS.Constants.IsNone (axis) then 0 else axis ; r = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; axis = axis1 ; reductionOp = "Sum" /*plus the function args*/ ]}.r
+ReduceLogSum(_, axis=None, tag='') = { axis1 = if BS.Constants.IsNone (axis) then 0 else axis ; r = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; axis = axis1 ; reductionOp = "LogSum" /*plus the function args*/ ]}.r
+ReduceMin (_, axis=None, tag='') = { axis1 = if BS.Constants.IsNone (axis) then 0 else axis ; r = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; axis = axis1 ; reductionOp = "Min" /*plus the function args*/ ]}.r
+ReduceMax (_, axis=None, tag='') = { axis1 = if BS.Constants.IsNone (axis) then 0 else axis ; r = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; axis = axis1 ; reductionOp = "Max" /*plus the function args*/ ]}.r
+#ReduceMean (_, axis=None, tag='') = { axis1 = if BS.Constants.IsNone (axis) then 0 else axis ; r = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; axis = axis1 ; reductionOp = "Mean" /*plus the function args*/ ]}.r
// 7. Control flow (if, composite etc.)
// None so far
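
[Editor's note] The reductions were rewritten from defaulting axis inside the node record to binding axis1 outside it, presumably because BrainScript record members resolve names within the record's own scope: a member `axis = if ... else axis` would refer to itself rather than to the function's axis parameter. (That reading is mine; the commit does not state the motivation.) The same resolve-the-default-before-constructing pattern in C++, with hypothetical stand-in types:

#include <optional>

// Stand-in for the node the BrainScript expression builds.
struct ReduceNode
{
    int axis;                // already-resolved axis; no sentinel stored
    const char* reductionOp;
};

// Mirrors the diff's rewrite: bind the resolved value (axis1) *outside*
// the aggregate, so the member named 'axis' never has to reference a
// like-named value from inside its own initializer.
ReduceNode MakeReduceSum(std::optional<int> axis = std::nullopt)
{
    int axis1 = axis.value_or(0); // the 'axis1' binding from the diff
    return ReduceNode{axis1, "Sum"};
}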

View file

@@ -866,7 +866,7 @@ std::unique_ptr<ConvolutionEngine<ElemType>> ConvolutionEngine<ElemType>::Create
if (!isEnabled(ConvolutionEngineKind::Legacy))
RuntimeError("Trying to use Legacy convolution engine when it's disabled.");
// REVIEW alexeyk: should honor m_traceLevel here.
fprintf(stderr, "\n%lsusing legacy convolution engine for geometry: %s.\n", logPrefix.c_str(), engStr.c_str());
fprintf(stderr, "%lsusing legacy convolution engine for geometry: %s.\n", logPrefix.c_str(), engStr.c_str());
return std::make_unique<LegacyConvolutionEngine<ElemType>>(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind);
}
@@ -874,19 +874,19 @@ std::unique_ptr<ConvolutionEngine<ElemType>> ConvolutionEngine<ElemType>::Create
if (isEnabled(ConvolutionEngineKind::CuDnn) &&
CuDnnConvolutionEngineFactory<ElemType>::IsSupported(deviceId, geometry, poolKind))
{
fprintf(stderr, "\n%lsusing cuDNN convolution engine for geometry: %s.\n", logPrefix.c_str(), engStr.c_str());
fprintf(stderr, "%lsusing cuDNN convolution engine for geometry: %s.\n", logPrefix.c_str(), engStr.c_str());
return CuDnnConvolutionEngineFactory<ElemType>::Create(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind);
}
if (isEnabled(ConvolutionEngineKind::Gemm) && GemmConvolutionEngine<ElemType>::IsSupported(deviceId, geometry))
{
fprintf(stderr, "\n%lsusing GEMM convolution engine for geometry: %s.\n", logPrefix.c_str(), engStr.c_str());
fprintf(stderr, "%lsusing GEMM convolution engine for geometry: %s.\n", logPrefix.c_str(), engStr.c_str());
return std::make_unique<GemmConvolutionEngine<ElemType>>(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind);
}
if (!isEnabled(ConvolutionEngineKind::Reference))
RuntimeError("Reference convolution is disabled and no other engine supports such configuratin (or disabled).");
fprintf(stderr, "\n%lsusing reference convolution engine for geometry: %s.\n", logPrefix.c_str(), engStr.c_str());
fprintf(stderr, "%lsusing reference convolution engine for geometry: %s.\n", logPrefix.c_str(), engStr.c_str());
return std::make_unique<ReferenceConvolutionEngine<ElemType>>(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind);
}
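
[Editor's note] Aside from dropping the leading '\n' so each engine announcement no longer opens with a blank line, the surrounding code is a preference-ordered fallback chain: cuDNN if supported, then GEMM, then the reference engine, with an error if even that is disabled. A compressed sketch of that control flow; the predicates and the Engine enum are stand-ins, not CNTK's API:

#include <cstdio>
#include <stdexcept>

enum class Engine { CuDnn, Gemm, Reference };

// Try each engine in order of preference and log the winner on a single
// line (matching the diff's removal of the leading '\n').
Engine PickEngine(bool cudnnSupported, bool gemmSupported, bool referenceEnabled)
{
    if (cudnnSupported)
    {
        fprintf(stderr, "using cuDNN convolution engine.\n");
        return Engine::CuDnn;
    }
    if (gemmSupported)
    {
        fprintf(stderr, "using GEMM convolution engine.\n");
        return Engine::Gemm;
    }
    if (!referenceEnabled)
        throw std::runtime_error("no enabled convolution engine supports this configuration");
    fprintf(stderr, "using reference convolution engine.\n");
    return Engine::Reference;
}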

View file

@@ -18,18 +18,16 @@ TrainConvNet = {
labelDim = 10
model (features) = {
-featNorm = features - Constant (128)
-l1 = ConvolutionalLayer {32, (5:5), pad=true, activation=ReLU,
-init="gaussian", initValueScale=0.0043} (featNorm)
+# TODO: update these new config values in all _Solution files and web page
+featNorm = features - Constant (128) # TODO: suspicious that we don't normalize by stddev (~74 if uniform distr.)
+l1 = ConvolutionalLayer {32, (5:5), pad=true, activation=ReLU, initValueScale=0.1557/256} (featNorm)
p1 = MaxPoolingLayer {(3:3), stride=(2:2)} (l1)
-l2 = ConvolutionalLayer {32, (5:5), pad=true, activation=ReLU,
-init="gaussian", initValueScale=1.414} (p1)
+l2 = ConvolutionalLayer {32, (5:5), pad=true, activation=ReLU, initValueScale=0.2} (p1)
p2 = MaxPoolingLayer {(3:3), stride=(2:2)} (l2)
-l3 = ConvolutionalLayer {64, (5:5), pad=true, activation=ReLU,
-init="gaussian", initValueScale=1.414} (p2)
+l3 = ConvolutionalLayer {64, (5:5), pad=true, activation=ReLU, initValueScale=0.2} (p2)
p3 = MaxPoolingLayer {(3:3), stride=(2:2)} (l3)
-d1 = DenseLayer {64, activation=ReLU, init="gaussian", initValueScale=12} (p3)
-z = LinearLayer {10, init="gaussian", initValueScale=1.5} (d1)
+d1 = DenseLayer {64, activation=ReLU, initValueScale=1.697} (p3)
+z = LinearLayer {10, initValueScale=0.212} (d1)
}.z
# inputs
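
[Editor's note] If heNormal scales as stddev = initValueScale * sqrt(2/fanIn) (the same assumption flagged earlier, not confirmed by this diff), the effective weight stddevs of the layers above can be checked with a few lines. The fan-in values are my reconstruction for CIFAR-10-shaped input (5x5 kernels over 3, then 32, then 32 channels); d1 is omitted because its fan-in depends on the pooled feature-map geometry.

#include <cmath>
#include <cstdio>

// Effective stddev under the assumed rule stddev = scale * sqrt(2 / fanIn).
double EffectiveStd(double scale, int fanIn)
{
    return scale * std::sqrt(2.0 / fanIn);
}

int main()
{
    printf("l1: %g\n", EffectiveStd(0.1557 / 256, 5 * 5 * 3));  // 5x5 conv over RGB
    printf("l2: %g\n", EffectiveStd(0.2,          5 * 5 * 32)); // 5x5 conv over 32 maps
    printf("l3: %g\n", EffectiveStd(0.2,          5 * 5 * 32));
    printf("z : %g\n", EffectiveStd(0.212,        64));         // linear layer after d1 (64 units)
    return 0;
}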