From f88fcdb3f2b381c8c8de50c3f865a9eba1d5fb7d Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Thu, 11 Aug 2016 17:29:21 -0700
Subject: [PATCH] now accepts mismatching MBLayouts during validation, relying
 on runtime checks only. Updated ATIS sample.

---
 Examples/Text/ATIS/TutorialAll.cntk               | 52 ++++++-------------
 .../CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs     | 11 ++--
 Source/Common/Include/Sequences.h                 |  6 +--
 .../ComputationNetworkLib/ComputationNode.cpp     | 46 ++++++++--------
 .../ComputationNetworkLib/ComputationNode.h       |  1 +
 5 files changed, 52 insertions(+), 64 deletions(-)

diff --git a/Examples/Text/ATIS/TutorialAll.cntk b/Examples/Text/ATIS/TutorialAll.cntk
index 094d67b85..9b1075fb1 100644
--- a/Examples/Text/ATIS/TutorialAll.cntk
+++ b/Examples/Text/ATIS/TutorialAll.cntk
@@ -11,14 +11,14 @@ parallelTrain = true
 
 #stderr = $WorkDir$/log
 
-command = TrainSlotTagger:RunSlotTagger:EvalSlotTagger
+command = TrainATIS:RunATIS:EvalATIS
 
 vocabSize = 943  # number of words
 numLabels = 129  # number of slot labels
 numIntents = 26  # number of intent labels
 
 # The command to train the LSTM model
-TrainSlotTagger = [
+TrainATIS = [
     action = "train"
     BrainScriptNetworkBuilder = [
         inputDim = $vocabSize$
@@ -53,7 +53,7 @@ TrainSlotTagger = [
        evaluationNodes = (errs)
        outputNodes     = (z)
    ]
-    # enable this one instead for intent classification
+    # rename this to BrainScriptNetworkBuilder to switch to the intent-classification task
    Intent_BrainScriptNetworkBuilder = [
        inputDim = $vocabSize$
        labelDim = $numIntents$
@@ -61,18 +61,12 @@ TrainSlotTagger = [
 
        #hiddenDim = 300
        hiddenDim = 150
 
-       RecSplice (a2) = [ # splice with reconciliation
-           i1 = a2[0]
-           i2 = ReconcileDynamicAxis (a2[1], i1)
-           res = Splice (i1 : i2)
-       ].res
-
        model = Sequential (
            Parallel ((DelayLayer{T=1} : Identity : DelayLayer{T=-1}), Splice) :  # 3-word window
            EmbeddingLayer {embDim} :                                             # embedding
            RecurrentLSTMLayer {hiddenDim} : BS.Sequences.Last :                  # LSTM state, final state
-           #Parallel ((Sequential (RecurrentLSTMLayer {hiddenDim} : BS.Sequences.Last):
-           #    Sequential (RecurrentLSTMLayer {hiddenDim, goBackwards=true}: BS.Sequences.First)), RecSplice) :  # bidirectional LSTM
+           #Parallel ((Sequential (RecurrentLSTMLayer {hiddenDim} : BS.Sequences.Last):
+           #          Sequential (RecurrentLSTMLayer {hiddenDim, goBackwards=true} : BS.Sequences.First)), Splice) :  # bidirectional LSTM
            DenseLayer {labelDim, initValueScale=7}                               # output layer
        )
@@ -82,7 +76,7 @@ TrainSlotTagger = [
        intentLabels = Input {labelDim}
 
        # model application
-       z = ReconcileDynamicAxis (model (query), intentLabels)
+       z = model (query)
 
        # loss and metric
        ce = CrossEntropyWithSoftmax (intentLabels, z)
@@ -96,30 +90,18 @@ TrainSlotTagger = [
    ]
 
    SGD = [
-       # maximum number of epochs
-       maxEpochs = 20 # set to 1 so this can be added to regression test. Increase to 20 get a good accuracy
-       #maxEpochs = 200 # set to 1 so this can be added to regression test. Increase to 20 get a good accuracy
+       maxEpochs = 20 ; epochSize = 36000
 
-       # for each epoch, maximum number of input samples(words) is set below
-       epochSize = 36000
-
-       # minibatchSize should be larger than the maximum sentence length
        minibatchSize = 70
 
        learningRatesPerSample = 0.01*2:0.005*12:0.001
-       #learningRatesPerSample = 0.01*20:0.005*120:0.001
+       gradUpdateType = "FSAdaGrad"
 
-       gradientClippingWithTruncation = true
-       clippingThresholdPerSample = 15.0
+       gradientClippingWithTruncation = true ; clippingThresholdPerSample = 15.0
 
        # number of minibatches to report progress
-       firstMBsToShowResult = 10
-       numMBsToShowResult = 100
-
-       # if validation shows that the model has no improvement, then do back-up to the previously
-       # estimated model and reduce learning rate
-       loadBestModel = true
+       firstMBsToShowResult = 10 ; numMBsToShowResult = 100
 
        parallelTrain = [
            parallelizationMethod = "DataParallelSGD"
@@ -144,21 +126,19 @@ TrainSlotTagger = [
 ]
 
 # Run the model to predict slot labels
-RunSlotTagger = [
+RunATIS = [
    action = "write"
    BrainScriptNetworkBuilder = [
        modelAsTrained = BS.Network.Load ("$modelPath$")
        final = Hardmax (modelAsTrained.z) # make a decision
-       labels = Pass (modelAsTrained.slotLabels)
+       #labels = Pass (modelAsTrained.slotLabels)
        # enable this for intent classification:
-       #labels = Pass (modelAsTrained.intentLabels)
-       #t = DynamicAxis()
+       labels = Pass (modelAsTrained.intentLabels)
+       t = DynamicAxis()
    ]
    outputPath = $WorkDir$/model.writeaction
-   outputNodeNames = slotLabels:final
-   # enable this for intent classification:
-   #outputNodeNames = intentLabels:final
+   outputNodeNames = intentLabels:slotLabels:final
 
    reader = [
        readerType = "CNTKTextFormatReader"
@@ -173,7 +153,7 @@ RunSlotTagger = [
 ]
 
 # Evaluate the model's slot-tagging accuracy (as an error count)
-EvalSlotTagger = [
+EvalATIS = [
    action = "eval"
    modelPath = $modelPath$ # from outside
    reader = [
diff --git a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs
index 74691028a..79f4a0f45 100644
--- a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs
+++ b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs
@@ -239,7 +239,7 @@ Exp = CNTK2.Exp
 Floor = CNTK2.Floor
 Log = CNTK2.Log
 Minus = CNTK2.Minus
-Pass = CNTK2.Identity
+Pass = CNTK2.Pass
 Plus = CNTK2.Plus
 RectifiedLinear = CNTK2.ReLU # deprecated
 ReLU = CNTK2.ReLU
@@ -380,10 +380,11 @@ CNTK2 = [
     LessEqual(_, y, tag='') = new ComputationNode [ operation = 'LessEqual' ; inputs = (_ : y) /*plus the function args*/ ]
 
     // 13. Others
-    Identity(_, tag='') = new ComputationNode [ operation = 'Pass' ; inputs = _ /*plus the function args*/ ]
+    Pass(_, tag='') = new ComputationNode [ operation = 'Pass' ; inputs = _ /*plus the function args*/ ]
+    Identity = Pass
 ]
 
 # Parameter{} can do several forms of initialization.
 #  - initValue=scalar, value=array --> initialize from this value --array form not implemented yet
 #  - initFromFilePath="..." --> read from a data file
 #  - init="uniform|gaussian" (random init scaled by initValueScale). Warning: This has magic scaling factors. TODO: document them here
@@ -393,7 +394,9 @@ CNTK2 = [
 #  - init="fixedValue", value from 'value'
 # Warning: Current config will behave unexpected if user mistypes 'initValue' as 'value' (which will be ignored, defaulting to "uniform" init)
 Parameter {outputDim, inputDim, learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0/*deprecated*/, initValue = '', initFromFilePath = '', initFromLiteral = ''/*deprecated*/, initOnCPUOnly=true, randomSeed=-1, tag=''} = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ dims = (outputDim : inputDim) ] /*plus the function args*/ ]
-LearnableParameter = Parameter // deprecated
+
+LearnableParameter = Parameter // deprecated
+
 # TODO: make Parameter take tensor dims?
 ParameterTensor {dims, learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0, initValue = '', initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag=''} = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]
 ConstantFromString(literal, tag='') = ParameterTensor((0)/*dim, will be inferred*/, initFromLiteral = literal, learningRateMultiplier = 0.0)
diff --git a/Source/Common/Include/Sequences.h b/Source/Common/Include/Sequences.h
index 20b25c1d4..dc7cf2470 100644
--- a/Source/Common/Include/Sequences.h
+++ b/Source/Common/Include/Sequences.h
@@ -962,7 +962,7 @@ static inline std::pair<size_t, size_t> ColumnRangeWithMBLayoutFor(size_t numCol
     if (fr.m_broadcastAllowed && !pMBLayout && numCols == 1)
         return std::pair<size_t, size_t>(0, numCols);
     if (fr.m_pMBLayout && pMBLayout && *fr.m_pMBLayout == *pMBLayout)
-        LogicError("DataFor: FrameRange's dynamic axis is inconsistent with matrix. They are compatible though--are you missing a ReconcileDynamicAxis operation?");
+        ; // LogicError("DataFor: FrameRange's dynamic axis is inconsistent with matrix. They are compatible though--are you missing a ReconcileDynamicAxis operation?");
     else
         LogicError("DataFor: FrameRange's dynamic axis is inconsistent with matrix.");
 }
@@ -1055,8 +1055,8 @@ static inline std::pair<DimensionVector, DimensionVector> TensorSliceWithMBLayou
     if (fr.m_pMBLayout /*get data for a loop*/ && !pMBLayout /*'data' is not samples*/ && fr.m_broadcastAllowed /*we're OK with that*/)
         ; // the time dimension is broadcasting--leave it as is
     else if (fr.m_pMBLayout && pMBLayout && *fr.m_pMBLayout == *pMBLayout)
-        LogicError("DataFor: FrameRange's dynamic axis is inconsistent with matrix. They are compatible though--are you missing a ReconcileDynamicAxis operation? %s vs. %s",
-                   static_cast<std::string>(*(fr.m_pMBLayout)).c_str(), static_cast<std::string>(*(pMBLayout)).c_str());
+        ; // LogicError("DataFor: FrameRange's dynamic axis is inconsistent with matrix. They are compatible though--are you missing a ReconcileDynamicAxis operation? %s vs. %s",
+          //            static_cast<std::string>(*(fr.m_pMBLayout)).c_str(), static_cast<std::string>(*(pMBLayout)).c_str());
     else
         LogicError("DataFor: FrameRange's dynamic axis is inconsistent with matrix: %s vs. %s",
                    static_cast<std::string>(*(fr.m_pMBLayout)).c_str(), static_cast<std::string>(*(pMBLayout)).c_str());
%s", static_cast(*(fr.m_pMBLayout)).c_str(), static_cast(*(pMBLayout)).c_str()); diff --git a/Source/ComputationNetworkLib/ComputationNode.cpp b/Source/ComputationNetworkLib/ComputationNode.cpp index ee0437f2c..f9feda69b 100644 --- a/Source/ComputationNetworkLib/ComputationNode.cpp +++ b/Source/ComputationNetworkLib/ComputationNode.cpp @@ -87,12 +87,21 @@ void ComputationNode::Backprop(const FrameRange& fr, bool childrenInTh // subroutines for Validate() implementations // ----------------------------------------------------------------------- -static void InconsistentMBLayout(const ComputationNodeBase& us, const ComputationNodeBase& which, ComputationNodeBase& vsWhich) +// compare two MBLayouts, and alert if they are different +void ComputationNodeBase::ValidateMBLayout(const ComputationNodeBasePtr which, const ComputationNodeBasePtr vsWhich) const { -#if 1 - RuntimeError("%ls: Dynamic axes mismatches between %ls and %ls. If this is by design, use ReconcileDynamicAxis().", - us.NodeDescription().c_str(), which.NodeDescription().c_str(), vsWhich.NodeDescription()); + if (!which->HasMBLayout() || !vsWhich->HasMBLayout() || which->GetMBLayout() == vsWhich->GetMBLayout()) + return; + // MBLayouts are inconsistent +#if 0 + // can't have that + RuntimeError("%ls: Dynamic axes mismatch between %ls and %ls. If this is by design, use ReconcileDynamicAxis().", + NodeDescription().c_str(), which->NodeDescription().c_str(), vsWhich->NodeDescription()); #else + // We will let this slip with a reminder, assuming that this will be caught at runtime. + // By allowing this, users will not need ReconcileDynamicAxis() for reductions over a sequence like BS.Sequences.Last(). + fprintf(stderr, "WARNING: %ls: Dynamic axes mismatch between %ls and %ls. If they are incompatible, this will fail later. If this is by design, use ReconcileDynamicAxis().\n", + NodeDescription().c_str(), which->NodeDescription().c_str(), vsWhich->NodeDescription().c_str()); #endif } @@ -104,20 +113,20 @@ static void InconsistentMBLayout(const ComputationNodeBase& us, const Computatio // - if there are more than one different layouts involved, this function will fail void ComputationNodeBase::InferMBLayoutFromInputsForStandardCase(bool isFinalValidationPass) { - MBLayoutPtr pMBLayout; // start with NULL layout - for (auto child : m_inputs) + ComputationNodeBasePtr firstInputWithMBLayout; + for (auto input : m_inputs) { - if (!child) // node not set yet (DelayedValueNodeBase seems to allow this)--BUGBUG: Then this function won't operate correctly. + if (!input) // node not set yet (DelayedValueNodeBase seems to allow this)--BUGBUG: Then this function won't operate correctly. ; - else if (!child->m_pMBLayout) // NULL layout (typical for parameter nodes) + else if (!input->m_pMBLayout) // NULL layout (typical for parameter nodes) ; - else if (!pMBLayout) // first non-NULL layout: just copy it - pMBLayout = child->m_pMBLayout; - else if (pMBLayout != child->m_pMBLayout && isFinalValidationPass) // got a layout--compare whether it is the same - InconsistentMBLayout(*this, *this, *child); + else if (!firstInputWithMBLayout) // first input with layout: remember this child + firstInputWithMBLayout = input; + else if (isFinalValidationPass) // got a layout--compare whether it is the same + ValidateMBLayout(firstInputWithMBLayout, input); } // all are consistent: install it - LinkToMBLayout(pMBLayout); + LinkToMBLayout(firstInputWithMBLayout ? 
 }
 
 // single input that maps its input element-wise (e.g. Sigmoid)
@@ -140,12 +149,8 @@ void ComputationNodeBase::ValidateBinaryZip(bool isFinalValidationPass, bool all
 
     ValidateInferBinaryInputDims();
 
-    if (isFinalValidationPass &&
-        Input(0)->HasMBLayout() && Input(1)->HasMBLayout() &&
-        Input(0)->GetMBLayout() != Input(1)->GetMBLayout())
-    {
-        InconsistentMBLayout(*this, *Input(0), *Input(1));
-    }
+    if (isFinalValidationPass)
+        ValidateMBLayout(Input(0), Input(1));
 
     // result has tensor shape with dimensions being the max over both
     let shape0 = GetInputSampleLayout(0);
@@ -187,8 +192,7 @@ void ComputationNodeBase::ValidateNaryZip(bool isFinalValidationPass, bool allow
     if (isFinalValidationPass)
         for (size_t i = 0; i < numInputs; i++)
             for (size_t j = i + 1; j < numInputs; j++)
-                if (Input(i)->HasMBLayout() && Input(j)->HasMBLayout() && Input(i)->GetMBLayout() != Input(j)->GetMBLayout())
-                    InconsistentMBLayout(*this, *Input(i), *Input(j));
+                ValidateMBLayout(Input(i), Input(j));
 
     // result has tensor shape with dimensions being the max over all inputs
     let shape0 = GetInputSampleLayout(0);
diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h
index 6b5680dc6..853c9a159 100644
--- a/Source/ComputationNetworkLib/ComputationNode.h
+++ b/Source/ComputationNetworkLib/ComputationNode.h
@@ -679,6 +679,7 @@ protected:
     void ValidateBinaryZip(bool isFinalValidationPass, bool allowBroadcast);
     void ValidateBinaryReduce(bool isFinalValidationPass);
     void ValidateNaryZip(bool isFinalValidationPass, bool allowBroadcast, size_t numInputs);
+    void ValidateMBLayout(const ComputationNodeBasePtr which, const ComputationNodeBasePtr vsWhich) const;
     void InferMBLayoutFromInputsForStandardCase(bool isFinalValidationPass);
     virtual void ValidateInferInputDimsFrom(const TensorShape&) = 0; // (implemented by ComputationNode)
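
Usage note: the user-visible effect of this patch is the one exercised by the ATIS sample above. A minimal BrainScript sketch of what now passes validation with only a WARNING (names and dimensions here are illustrative, not from the sample; the runtime layout checks in Sequences.h still guard against truly incompatible axes):

    query  = Input {943}   # word sequence, carries the per-word dynamic axis
    labels = Input {26}    # one label per sequence, i.e. a different dynamic axis
    h = RecurrentLSTMLayer {150} (EmbeddingLayer {150} (query))
    z = DenseLayer {26} (BS.Sequences.Last (h))   # the reduction drops the word axis
    # before this patch, validation required an explicit reconciliation:
    #   ce = CrossEntropyWithSoftmax (labels, ReconcileDynamicAxis (z, labels))
    # now the mismatch is only warned about at validation time:
    ce = CrossEntropyWithSoftmax (labels, z)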