changed the layers' apply function from f() to apply() for clarity and familiarity;

factored reporting of layout mismatches
Frank Seide 2016-08-11 15:09:09 -07:00
Parent 072327b52e
Commit b950b9ede4
3 changed files with 60 additions and 50 deletions
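Background for the rename: in BrainScript, a layer factory evaluates a record of parameters and then selects its function-valued member with `}.f`; this commit renames that member to `apply`, so every layer now ends with `}.apply`. Purely as a loose analogy (not CNTK code), the following self-contained C++ sketch shows the same idea of a factory that builds its parameters once and hands back the function that applies them; MakeScaleShiftLayer, scale, and shift are hypothetical stand-ins.

#include <functional>
#include <iostream>

// Hypothetical stand-in for a network value; CNTK layers operate on computation nodes instead.
using Tensor = double;

// Analogy of "SomeLayer { ... }.apply": build the parameters once, then return
// the 'apply' function that closes over them.
std::function<Tensor(Tensor)> MakeScaleShiftLayer(double scale, double shift)
{
    auto apply = [scale, shift](Tensor x) { return scale * x + shift; };
    return apply;
}

int main()
{
    auto layer1 = MakeScaleShiftLayer(2.0, 1.0);  // like instantiating a layer
    auto layer2 = MakeScaleShiftLayer(0.5, 0.0);
    std::cout << layer2(layer1(3.0)) << "\n";     // composition, as Sequential() does: prints 3.5
    return 0;
}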

View file

@@ -37,8 +37,8 @@ TrainConvNet = [
{
C = ConvolutionalLayer {dim, (5:5), pad = true, activation = ReLU, init = "gaussian", initValueScale = initValueScale}
P = MaxPoolingLayer {(3:3), stride = (2:2)}
f(x) = P(C(x))
}.f
apply (x) = P(C(x))
}.apply
model_layers (features) =
{
featNorm = features - Constant (128)
@@ -75,14 +75,14 @@ TrainConvNet = [
C = ConvolutionalLayer {dim, (5:5), pad = true, bias = false, init = "gaussian", initValueScale = initValueScale}
B = BatchNormalizationLayer {spatialRank = 2, normalizationTimeConstant = 4096}
P = MaxPoolingLayer {(3:3), stride = (2:2)}
f(x) = P(ReLU(B(C(x))))
}.f
apply (x) = P(ReLU(B(C(x))))
}.apply
MyDenseBNReLULayer {dim, initValueScale} =
{
D = DenseLayer {dim, bias = false, init = "gaussian", initValueScale = initValueScale}
B = BatchNormalizationLayer {normalizationTimeConstant = 4096}
f(x) = ReLU(B(D(x)))
}.f
apply (x) = ReLU(B(D(x)))
}.apply
model_withBatchNorm (features) =
{
featNorm = features - Constant (128)
@@ -99,15 +99,16 @@ TrainConvNet = [
# note: (3:3), while the macro above is (5:5)
C = ConvolutionalLayer {dim, (3:3), pad = true, stride = (stride:stride), bias = false, init = "gaussian", initValueScale = initValueScale}
B = BatchNormalizationLayer {spatialRank = 2, normalizationTimeConstant = 4096}
f(x) = B(C(x))
}.f
apply (x) = B(C(x))
}.apply
ResNetNode {dim, initValueScale} =
{
C1 = MyConvBNLayer {dim, initValueScale, 1} # first convolution layer
C2 = MyConvBNLayer {dim, initValueScale, 1} # second convolution layer
#B = BatchNormalizationLayer {spatialRank = 2, normalizationTimeConstant = 4096} # TODO: Having this works better, it seems
f(x) = ReLU (x + C2(ReLU(C1(x)))) # ReLU between C1 and C2 and after summation
}.f
#B = BatchNormalizationLayer {spatialRank = 2, normalizationTimeConstant = 4096}
# ^^ Note: Adding an extra BN to 'x' trains slightly better.
apply (x) = ReLU (x + C2(ReLU(C1(x)))) # ReLU between C1 and C2 and after summation
}.apply
ResNetIncNode {dim, initValueScale} =
{
# first branch. This doubles the #channels but halves the image size
@@ -123,8 +124,8 @@ TrainConvNet = [
B = BatchNormalizationLayer {spatialRank = 2, normalizationTimeConstant = 4096}
# layer sums both branches and rectifies the result
f(x) = ReLU (B(P(x)) + C2(ReLU(C1(x)))) # ReLU between C1 and C2 and after summation
}.f
apply (x) = ReLU (B(P(x)) + C2(ReLU(C1(x)))) # ReLU between C1 and C2 and after summation
}.apply
model_resNet (features) =
{
conv1 = MyConvBNLayer {16, 0.26, 1} (features)

View file

@@ -35,11 +35,11 @@ LinearLayer {outDim, bias = true, init='uniform', initValueScale=1} =
W = ParameterTensor {_ConcatArrays (outDim, 0), init=init, initValueScale=initValueScale}
b = ParameterTensor {outDim, initValue=0}
outRank = Length (_AsArray (outDim)) # support outputs with tensor layouts
f(x) =
apply (x) =
if bias
then Times (W, x, outputRank = outRank) + b
else Times (W, x, outputRank = outRank)
}.f
}.apply
# DenseLayer -- create a fully-connected layer with optional non-linearity
DenseLayer{outDim, bias = true, activation=(x=>x), init='uniform', initValueScale=1} = Sequential ( LinearLayer{outDim, bias = bias, init = init, initValueScale = initValueScale} : activation )
@@ -53,8 +53,8 @@ EmbeddingLayer {outDim, # dimension of embeddi
then ParameterTensor {shape, init='uniform'} # learnable
else ParameterTensor {shape, initFromFilePath = embeddingPath, learningRateMultiplier = 0} # fixed from file
TimesOp = if transpose then TransposeTimes else Times
f(x) = TimesOp (E, x) # x is expected to be sparse one-hot
}.f
apply (x) = TimesOp (E, x) # x is expected to be sparse one-hot
}.apply
# ConvolutionalLayer -- create a convolution layer with optional non-linearity
# [ (shifting dims) | (reduction dim) | (output dim) | (sample dims) ]
@@ -83,11 +83,11 @@ ConvolutionalLayer {numOutputChannels, # e.g. (1) or BS.Constants.None
b = ParameterTensor(_ConcatArrays (Repeat (Length (filterShape), 1), outputChannelsShape), initValue = 0) # [ 1 x 1 x K ]
sharing = true # TODO: support this
transpose = false # TODO: support this
f(x) = {
apply (x) = {
c = Convolution (W, x, filterShape, mapDims = numOutputChannels, stride = stride, sharing = sharing, autoPadding = pad, lowerPad = lowerPad, upperPad = upperPad, transpose = transpose, maxTempMemSizeInSamples = maxTempMemSizeInSamples)
res = activation (if bias then c + b else c)
}.res
}.f
}.apply
# MaxPoolingLayer, AveragePoolingLayer -- create a max- or average-pooling layer
_PoolingLayer {poolKind, # "max" or "average"
@@ -95,8 +95,8 @@ _PoolingLayer {poolKind, # "max" or "average"
stride = 1, pad = false,
lowerPad = 0, upperPad = 0} = # TODO: support this
{
f(x) = Pooling (x, poolKind, filterShape, stride = stride, autoPadding = pad, lowerPad = lowerPad, upperPad = upperPad)
}.f
apply (x) = Pooling (x, poolKind, filterShape, stride = stride, autoPadding = pad, lowerPad = lowerPad, upperPad = upperPad)
}.apply
MaxPoolingLayer {filterShape, stride = 1, pad = false, lowerPad = 0, upperPad = 0} =
_PoolingLayer {"max", filterShape, stride = stride, pad = pad, lowerPad = lowerPad, upperPad = upperPad}
AveragePoolingLayer {filterShape, stride = 1, pad = false, lowerPad = 0, upperPad = 0} =
@@ -110,30 +110,30 @@ RecurrentLSTMLayer {outputDim,
{
previousHook = if goBackwards then BS.RNNs.NextHC else BS.RNNs.PreviousHC
lstm = BS.RNNs.LSTMBlock {outputDim, cellShape = cellShape, enableSelfStabilization = enableSelfStabilization}
f(x) = {
apply (x) = {
prevState = previousHook (lstmState) # recurrent memory. E.g. Previous or Next, with or without initial state, beam reordering etc.
#auxInput = augmentInputHook(x, prevState) # optionally augment input. Constants.None if none.
lstmState = lstm (x, prevState)
}.lstmState.h // that's the value we return
}.f
}.apply
# DelayLayer -- delay input
DelayLayer {T=1, defaultHiddenActivation=0} =
{
f(x) =
apply (x) =
if T > 0 then PastValue (0, x, timeStep=T, defaultHiddenActivation=defaultHiddenActivation)
else if T < 0 then FutureValue (0, x, timeStep=-T, defaultHiddenActivation=defaultHiddenActivation)
else x
}.f
}.apply
# DropoutLayer -- create a drop-out layer
# Not yet supported with this interface; just use Dropout directly.
#DropoutLayer {prob = BS.Constants.None} = if !BS.Constants.IsNone (prob) then Fail ("DropoutLayer: Dropout probability can currently not be specified per-layer.") else
#{
# f(x) = Dropout (x)
#}.f
# apply (x) = Dropout (x)
#}.apply
# BatchNormalizationLayer -- create a batch-normalization layer
BatchNormalizationLayer {spatialRank = 0, # reduce over these dims. E.g. 2 to reduce over (w,h) in a [W x H x C]-shaped input
@@ -147,8 +147,8 @@ BatchNormalizationLayer {spatialRank = 0, # reduce over these dims. E.g. 2 to r
bias = ParameterTensor {normShape, initValue = 0}
runMean = ParameterTensor {normShape, initValue = 0, learningRateMultiplier = 0} # note: disable learning since these are updated differently
runInvStdDev = ParameterTensor {normShape, initValue = 0, learningRateMultiplier = 0}
f(x) = BatchNormalization (x, scale, bias, runMean, runInvStdDev, spatialRank > 0, normalizationTimeConstant = normalizationTimeConstant, blendTimeConstant = blendTimeConstant, epsilon = epsilon, useCntkEngine = useCntkEngine)
}.f
apply (x) = BatchNormalization (x, scale, bias, runMean, runInvStdDev, spatialRank > 0, normalizationTimeConstant = normalizationTimeConstant, blendTimeConstant = blendTimeConstant, epsilon = epsilon, useCntkEngine = useCntkEngine)
}.apply
# LayerNormalizationLayer -- create a layer-normalization layer
LayerNormalizationLayer {dim = BS.Constants.None, initScale = 1, initBias = 0} = if BS.Constants.IsNone (dim) then Fail ("LayerNormalizationLayer: 'dim' parameter is currently required.") else
@@ -156,7 +156,7 @@ LayerNormalizationLayer {dim = BS.Constants.None, initScale = 1, initBias = 0} =
gain = ParameterTensor{(1), initValue = initScale}
bias = ParameterTensor{(1), initValue = initBias}
f(x) = {
apply (x) = {
div = Constant (1.0 / dim)
# normalize w.r.t. actual sample statistics
@@ -168,14 +168,14 @@ LayerNormalizationLayer {dim = BS.Constants.None, initScale = 1, initBias = 0} =
# denormalize with learned parameters
val = xHat .* gain + bias
}.val
}.f
}.apply
# StabilizerLayer -- create a scalar stabilizer [J. Droppo, 2014 -- TODO: get the reference]
StabilizerLayer {} =
{
# BUGBUG: Calling f(x) twice will create a second set of parameters. Needs to refactor Stabilize() for this.
f(x) = Stabilize (x)
}.f
apply (x) = Stabilize (x)
}.apply
# FeatureMVNLayer -- create a corpus-level feature-normalization layer
# This can only be applied to features. Statistics are not shared across invocations,
@@ -197,20 +197,20 @@ Sequential (arrayOfFunctions) =
{
fs = _AsArray (arrayOfFunctions) # make sure it works with a single function that is not an array
Apply (x, N) = if N == 0 then x else fs[N-1](Apply (x, N-1)) # we do that recursively
f(x) = Apply (x, Length (fs))
}.f
apply (x) = Apply (x, Length (fs))
}.apply
# Parallel -- composite that applies several functions to the same input and combines the result
Parallel (arrayOfFunctions, combineFunction) =
{
fs = _AsArray (arrayOfFunctions)
f(x) = combineFunction (array[0..Length (fs)-1] (i => fs[i](x)))
}.f
apply (x) = combineFunction (array[0..Length (fs)-1] (i => fs[i](x)))
}.apply
# MergeBinary -- apply two functions and combine them with a binary function, e.g. Plus
MergeBinary (arrayOfFunctions, combineFunction) =
if Length (arrayOfFunctions) != 2 then Fail ("Merge() is currently limited to binary functions.") else
{
f(x,y) = combineFunction (arrayOfFunctions[0](x), arrayOfFunctions[1](y))
}.f
apply (x, y) = combineFunction (arrayOfFunctions[0](x), arrayOfFunctions[1](y))
}.apply
# LayerStack -- generate a stack of models from a lambda of the form (i => some expression of i)
# e.g. h3 = LayerStack {3, i => MyConvLayer {(32:32:64)[i], (0.0043:1.414:1.414)[i]} } (featNorm)
LayerStack {n, c} = Sequential (array[0..n-1] (c))
@@ -857,7 +857,7 @@ RNNs =
S(x) = Parameters.Stabilize (x, enabled=enableSelfStabilization)
# BUGBUG: S() must not be a macro either, but also an object instance
f(x, prevState, aux=Constants.None) = [
apply (x, prevState, aux=Constants.None) = [
_ = [ // encapsulate the inner workings
dh = prevState.h // previous values
@@ -890,8 +890,8 @@ RNNs =
then Wmr * S(_.ht) // project
else _.ht // no projection
dim = outputDim
] // end of f(x)
].f
] // end of apply(x)
].apply
# LSTMP -- LSTM function with projection and self-stabilization
# Projection is enabled by passing different values for outputDim and cellDim.
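
The Sequential() composite changed above composes an array of functions by recursing on the index: Apply (x, N) = if N == 0 then x else fs[N-1](Apply (x, N-1)). A minimal self-contained C++ sketch of that same fold, assuming std::function as a generic callable (names here are hypothetical, not CNTK API):

#include <functional>
#include <iostream>
#include <vector>

using Fn = std::function<double(double)>;

// Sketch of Sequential(): apply fs[0], then fs[1], ..., to the input,
// the iterative form of the recursive Apply() above.
Fn Sequential(std::vector<Fn> fs)
{
    return [fs](double x) {
        for (const Fn& f : fs)
            x = f(x);
        return x;
    };
}

int main()
{
    Fn model = Sequential({ [](double x) { return x * 2; },
                            [](double x) { return x + 1; } });
    std::cout << model(3) << "\n"; // (3*2)+1 = 7
    return 0;
}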

View file

@@ -87,6 +87,15 @@ void ComputationNode<ElemType>::Backprop(const FrameRange& fr, bool childrenInTh
// subroutines for Validate() implementations
// -----------------------------------------------------------------------
static void InconsistentMBLayout(const ComputationNodeBase& us, const ComputationNodeBase& which, ComputationNodeBase& vsWhich)
{
#if 1
RuntimeError("%ls: Dynamic axes mismatch between %ls and %ls. If this is by design, use ReconcileDynamicAxis().",
us.NodeDescription().c_str(), which.NodeDescription().c_str(), vsWhich.NodeDescription().c_str());
#else
#endif
}
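The new InconsistentMBLayout() helper centralizes the layout-mismatch error that was previously duplicated across InferMBLayoutFromInputsForStandardCase(), ValidateBinaryZip(), and ValidateNaryZip(). A minimal standalone sketch of this factoring, using a simplified Node type in place of ComputationNodeBase (all names here are illustrative placeholders):

#include <stdexcept>
#include <string>

// Simplified stand-in for ComputationNodeBase.
struct Node
{
    std::string name;
    const std::string& NodeDescription() const { return name; }
};

// One shared reporter instead of several copies of the same error string.
static void ReportInconsistentLayout(const Node& us, const Node& which, const Node& vsWhich)
{
    throw std::runtime_error(us.NodeDescription() + ": dynamic axes mismatch between " +
                             which.NodeDescription() + " and " + vsWhich.NodeDescription() +
                             ". If this is by design, use ReconcileDynamicAxis().");
}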
// helper function to infer the MBLayout for this node from inputs, for the *standard case*
// the standard case is:
// - all inputs must share the same layout (e.g. adding two minibatches)
@@ -105,8 +114,7 @@ void ComputationNodeBase::InferMBLayoutFromInputsForStandardCase(bool isFinalVal
else if (!pMBLayout) // first non-NULL layout: just copy it
pMBLayout = child->m_pMBLayout;
else if (pMBLayout != child->m_pMBLayout && isFinalValidationPass) // got a layout--compare whether it is the same
RuntimeError("%ls: InferMBLayoutFromInputsForStandardCase: Expected minibatch layouts to be the same between all children. Child '%ls' (%ls) uses a different layout than previously checked children and might get out of sync during runtime. If this is by design, use ReconcileDynamicAxis() to forward layouts between nodes.",
NodeDescription().c_str(), child->NodeName().c_str(), child->OperationName().c_str());
InconsistentMBLayout(*this, *this, *child);
}
// all are consistent: install it
LinkToMBLayout(pMBLayout);
@@ -133,9 +141,10 @@ void ComputationNodeBase::ValidateBinaryZip(bool isFinalValidationPass, bool all
ValidateInferBinaryInputDims();
if (isFinalValidationPass &&
Input(0)->GetMBLayout() != Input(1)->GetMBLayout() && Input(0)->HasMBLayout() && Input(1)->HasMBLayout())
Input(0)->HasMBLayout() && Input(1)->HasMBLayout() &&
Input(0)->GetMBLayout() != Input(1)->GetMBLayout())
{
LogicError("%ls: Minibatch layouts are not the same between arguments and might get out of sync during runtime. If this is by design, use ReconcileDynamicAxis() to forward layouts between nodes.", NodeDescription().c_str());
InconsistentMBLayout(*this, *Input(0), *Input(1));
}
// result has tensor shape with dimensions being the max over both
@@ -176,10 +185,10 @@ void ComputationNodeBase::ValidateNaryZip(bool isFinalValidationPass, bool allow
// check minibatch layout consistency for all possible pairs (n choose 2)
if (isFinalValidationPass)
for (size_t i = 0; i < numInputs; i++)
for (size_t j = i+1; j < numInputs; j++)
if (Input(i)->GetMBLayout() != Input(j)->GetMBLayout() && Input(i)->HasMBLayout() && Input(j)->HasMBLayout())
LogicError("%ls: Minibatch layouts are not the same between arguments and might get out of sync during runtime. If this is by design, use ReconcileDynamicAxis() to forward layouts between nodes.", NodeDescription().c_str());
for (size_t i = 0; i < numInputs; i++)
for (size_t j = i + 1; j < numInputs; j++)
if (Input(i)->HasMBLayout() && Input(j)->HasMBLayout() && Input(i)->GetMBLayout() != Input(j)->GetMBLayout())
InconsistentMBLayout(*this, *Input(i), *Input(j));
// result has tensor shape with dimensions being the max over all inputs
let shape0 = GetInputSampleLayout(0);
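
For reference, the pairwise (n choose 2) consistency check that ValidateNaryZip() now performs can be sketched in isolation as follows; MBLayoutPtr and the throwing report are simplified placeholders for the real CNTK types, and only the check ordering (HasMBLayout() guards before the pointer comparison) mirrors the change in this commit:

#include <memory>
#include <stdexcept>
#include <string>
#include <vector>

struct MBLayout {};                                  // placeholder for CNTK's minibatch layout
using MBLayoutPtr = std::shared_ptr<MBLayout>;

// Compare every pair (i, j) with i < j, but only when both inputs actually carry a layout.
void CheckPairwiseLayoutConsistency(const std::vector<MBLayoutPtr>& inputs)
{
    for (size_t i = 0; i < inputs.size(); i++)
        for (size_t j = i + 1; j < inputs.size(); j++)
            if (inputs[i] && inputs[j] && inputs[i] != inputs[j])
                throw std::runtime_error("inputs " + std::to_string(i) + " and " + std::to_string(j) +
                                         " use different minibatch layouts");
}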