Validation now accepts mismatching MBLayouts, relying on runtime checks only. Updated the ATIS sample.
Parent: b950b9ede4
Commit: f88fcdb3f2
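The gist of the change: when two inputs of a node carry different MBLayout (dynamic-axis) objects, validation no longer hard-fails with a RuntimeError; it prints a warning and defers to the runtime checks, which compare layout contents when data is actually sliced. Below is a minimal sketch of that policy in isolation — Layout, Node, validateLayouts, and sliceAt are stand-ins invented for this example, not CNTK's classes:

// Minimal sketch of "warn at validation, fail at runtime" for shared layout objects.
// Layout, Node, validateLayouts, and sliceAt are stand-ins for this example only;
// they are not the CNTK classes touched by this commit.
#include <cstdio>
#include <memory>
#include <stdexcept>
#include <vector>

struct Layout
{
    std::vector<size_t> seqLengths; // one entry per sequence in the minibatch
    bool operator==(const Layout& other) const { return seqLengths == other.seqLengths; }
};
using LayoutPtr = std::shared_ptr<Layout>;

struct Node
{
    const char* name;
    LayoutPtr layout; // null for parameters/constants (no dynamic axis)
};

// Validation-time check: pointer identity only. Mismatching objects are tolerated
// with a warning, because content-equal layouts will still work at runtime.
void validateLayouts(const Node& a, const Node& b)
{
    if (!a.layout || !b.layout || a.layout == b.layout)
        return; // consistent, or no dynamic axis involved
    fprintf(stderr, "WARNING: %s vs. %s: dynamic axes mismatch; if incompatible, this fails later.\n",
            a.name, b.name);
}

// Runtime check: content equality is what actually matters when slicing data.
void sliceAt(const Node& n, const Layout& frameLayout)
{
    if (n.layout && !(*n.layout == frameLayout))
        throw std::runtime_error("dynamic axis is inconsistent with matrix");
    // ... locate the requested columns here ...
}

int main()
{
    auto l1 = std::make_shared<Layout>(Layout{{5, 3}});
    auto l2 = std::make_shared<Layout>(Layout{{5, 3}}); // distinct object, equal content
    Node x{"x", l1}, y{"y", l2};
    validateLayouts(x, y); // warns, but does not throw
    sliceAt(y, *l1);       // succeeds: contents match
    return 0;
}

The validation-time test is deliberately weak (pointer identity), because two distinct layout objects can still describe the same sequence packing; only the content comparison at slice time can tell genuinely incompatible axes apart.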
@@ -11,14 +11,14 @@ parallelTrain = true
 #stderr = $WorkDir$/log
 
-command = TrainSlotTagger:RunSlotTagger:EvalSlotTagger
+command = TrainATIS:RunATIS:EvalATIS
 
 vocabSize = 943 # number of words
 numLabels = 129 # number of slot labels
 numIntents = 26 # number of intent labels
 
 # The command to train the LSTM model
-TrainSlotTagger = [
+TrainATIS = [
     action = "train"
     BrainScriptNetworkBuilder = [
         inputDim = $vocabSize$
 
@@ -53,7 +53,7 @@ TrainSlotTagger = [
         evaluationNodes = (errs)
         outputNodes = (z)
     ]
-    # enable this one instead for intent classification
+    # rename this to BrainScriptNetworkBuilder to switch to intent-classification task
     Intent_BrainScriptNetworkBuilder = [
         inputDim = $vocabSize$
         labelDim = $numIntents$
 
@@ -61,18 +61,12 @@ TrainSlotTagger = [
         #hiddenDim = 300
         hiddenDim = 150
 
-        RecSplice (a2) = [ # splice with reconciliation
-            i1 = a2[0]
-            i2 = ReconcileDynamicAxis (a2[1], i1)
-            res = Splice (i1 : i2)
-        ].res
-
         model = Sequential (
             Parallel ((DelayLayer{T=1} : Identity : DelayLayer{T=-1}), Splice) : # 3-word window
             EmbeddingLayer {embDim} :                            # embedding
             RecurrentLSTMLayer {hiddenDim} : BS.Sequences.Last : # LSTM state, final state
-            #Parallel ((Sequential (RecurrentLSTMLayer {hiddenDim} : BS.Sequences.Last):
-            #          Sequential (RecurrentLSTMLayer {hiddenDim, goBackwards=true}: BS.Sequences.First)), RecSplice) : # bidirectional LSTM
+            #Parallel ((Sequential (RecurrentLSTMLayer {hiddenDim} : BS.Sequences.Last):
+            #          Sequential (RecurrentLSTMLayer {hiddenDim, goBackwards=true} : BS.Sequences.First)), Splice) : # bidirectional LSTM
             DenseLayer {labelDim, initValueScale=7}              # output layer
         )
 
@@ -82,7 +76,7 @@ TrainSlotTagger = [
         intentLabels = Input {labelDim}
 
         # model application
-        z = ReconcileDynamicAxis (model (query), intentLabels)
+        z = model (query)
 
         # loss and metric
         ce = CrossEntropyWithSoftmax (intentLabels, z)
 
@@ -96,30 +90,18 @@ TrainSlotTagger = [
     ]
 
     SGD = [
-        # maximum number of epochs
-        maxEpochs = 20 # set to 1 so this can be added to regression test. Increase to 20 get a good accuracy
-        #maxEpochs = 200 # set to 1 so this can be added to regression test. Increase to 20 get a good accuracy
-
-        # for each epoch, maximum number of input samples(words) is set below
-        epochSize = 36000
+        maxEpochs = 20 ; epochSize = 36000
 
         # minibatchSize should be larger than the maximum sentence length
         minibatchSize = 70
 
         learningRatesPerSample = 0.01*2:0.005*12:0.001
         #learningRatesPerSample = 0.01*20:0.005*120:0.001
 
         gradUpdateType = "FSAdaGrad"
 
-        gradientClippingWithTruncation = true
-        clippingThresholdPerSample = 15.0
+        gradientClippingWithTruncation = true ; clippingThresholdPerSample = 15.0
 
-        # number of minibatches to report progress
-        firstMBsToShowResult = 10
-        numMBsToShowResult = 100
-
-        # if validation shows that the model has no improvement, then do back-up to the previously
-        # estimated model and reduce learning rate
-        loadBestModel = true
+        firstMBsToShowResult = 10 ; numMBsToShowResult = 100
 
         parallelTrain = [
             parallelizationMethod = "DataParallelSGD"
 
@@ -144,21 +126,19 @@ TrainSlotTagger = [
 ]
 
 # Run the model to predict slot labels
-RunSlotTagger = [
+RunATIS = [
     action = "write"
     BrainScriptNetworkBuilder = [
         modelAsTrained = BS.Network.Load ("$modelPath$")
         final = Hardmax (modelAsTrained.z) # make a decision
-        labels = Pass (modelAsTrained.slotLabels)
-        # enable this for intent classification:
-        #labels = Pass (modelAsTrained.intentLabels)
-        #t = DynamicAxis()
+        #labels = Pass (modelAsTrained.slotLabels)
+        labels = Pass (modelAsTrained.intentLabels)
+        t = DynamicAxis()
     ]
 
     outputPath = $WorkDir$/model.writeaction
-    outputNodeNames = slotLabels:final
-    # enable this for intent classification:
-    #outputNodeNames = intentLabels:final
+    outputNodeNames = intentLabels:slotLabels:final
 
     reader = [
         readerType = "CNTKTextFormatReader"
 
@@ -173,7 +153,7 @@ RunSlotTagger = [
 ]
 
 # Evaluate the model's slot-tagging accuracy (as an error count)
-EvalSlotTagger = [
+EvalATIS = [
     action = "eval"
     modelPath = $modelPath$ # from outside
     reader = [
@@ -239,7 +239,7 @@ Exp = CNTK2.Exp
 Floor = CNTK2.Floor
 Log = CNTK2.Log
 Minus = CNTK2.Minus
-Pass = CNTK2.Identity
+Pass = CNTK2.Pass
 Plus = CNTK2.Plus
 RectifiedLinear = CNTK2.ReLU # deprecated
 ReLU = CNTK2.ReLU
 
@@ -380,10 +380,11 @@ CNTK2 = [
     LessEqual(_, y, tag='') = new ComputationNode [ operation = 'LessEqual' ; inputs = (_ : y) /*plus the function args*/ ]
 
     // 13. Others
-    Identity(_, tag='') = new ComputationNode [ operation = 'Pass' ; inputs = _ /*plus the function args*/ ]
+    Pass(_, tag='') = new ComputationNode [ operation = 'Pass' ; inputs = _ /*plus the function args*/ ]
+    Identity = Pass
 ]
 
-# Parameter{} can do several forms of initialization. It is no longer required to say 'init="kind"', so we can clean these up a bit.
+# Parameter{} can do several forms of initialization.
 # - initValue=scalar, value=array --> initialize from this value --array form not implemented yet
 # - initFromFilePath="..." --> read from a data file
 # - init="uniform|gaussian" (random init scaled by initValueScale). Warning: This has magic scaling factors. TODO: document them here
 
@@ -393,7 +394,9 @@ CNTK2 = [
 # - init="fixedValue", value from 'value'
 # Warning: Current config will behave unexpected if user mistypes 'initValue' as 'value' (which will be ignored, defaulting to "uniform" init)
 Parameter {outputDim, inputDim, learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0/*deprecated*/, initValue = '', initFromFilePath = '', initFromLiteral = ''/*deprecated*/, initOnCPUOnly=true, randomSeed=-1, tag=''} = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ dims = (outputDim : inputDim) ] /*plus the function args*/ ]
-LearnableParameter = Parameter // deprecated
+
+LearnableParameter = Parameter // deprecated
+
 # TODO: make Parameter take tensor dims?
 ParameterTensor {dims, learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0, initValue = '', initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag=''} = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]
 ConstantFromString(literal, tag='') = ParameterTensor((0)/*dim, will be inferred*/, initFromLiteral = literal, learningRateMultiplier = 0.0)
@@ -962,7 +962,7 @@ static inline std::pair<size_t, size_t> ColumnRangeWithMBLayoutFor(size_t numCols,
     if (fr.m_broadcastAllowed && !pMBLayout && numCols == 1)
         return std::pair<size_t, size_t>(0, numCols);
     if (fr.m_pMBLayout && pMBLayout && *fr.m_pMBLayout == *pMBLayout)
-        LogicError("DataFor: FrameRange's dynamic axis is inconsistent with matrix. They are compatible though--are you missing a ReconcileDynamicAxis operation?");
+        ;// LogicError("DataFor: FrameRange's dynamic axis is inconsistent with matrix. They are compatible though--are you missing a ReconcileDynamicAxis operation?");
     else
         LogicError("DataFor: FrameRange's dynamic axis is inconsistent with matrix.");
 }
 
@@ -1055,8 +1055,8 @@ static inline std::pair<DimensionVector, DimensionVector> TensorSliceWithMBLayoutFor(
     if (fr.m_pMBLayout /*get data for a loop*/ && !pMBLayout /*'data' is not samples*/ && fr.m_broadcastAllowed /*we're OK with that*/)
         ; // the time dimension is broadcasting--leave it as is
     else if (fr.m_pMBLayout && pMBLayout && *fr.m_pMBLayout == *pMBLayout)
-        LogicError("DataFor: FrameRange's dynamic axis is inconsistent with matrix. They are compatible though--are you missing a ReconcileDynamicAxis operation? %s vs. %s",
-                   static_cast<string>(*(fr.m_pMBLayout)).c_str(), static_cast<string>(*(pMBLayout)).c_str());
+        ; //LogicError("DataFor: FrameRange's dynamic axis is inconsistent with matrix. They are compatible though--are you missing a ReconcileDynamicAxis operation? %s vs. %s",
+          //           static_cast<string>(*(fr.m_pMBLayout)).c_str(), static_cast<string>(*(pMBLayout)).c_str());
     else
         LogicError("DataFor: FrameRange's dynamic axis is inconsistent with matrix: %s vs. %s",
                    static_cast<string>(*(fr.m_pMBLayout)).c_str(), static_cast<string>(*(pMBLayout)).c_str());
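With the two LogicErrors above commented out, the slicing helpers in this file now tolerate the "compatible though" case: a FrameRange and a matrix whose layout objects differ by identity but agree in content. A self-contained restatement of the resulting decision — MBLayoutish and checkSliceLayouts are hypothetical names for this sketch, which models only the branch reached once validation has let a pointer mismatch through:

// Decision now made when slicing data whose layout pointer differs from the
// FrameRange's. MBLayoutish and checkSliceLayouts are stand-ins for this
// sketch only, not the types in Sequences.h.
#include <stdexcept>
#include <vector>

struct MBLayoutish
{
    std::vector<size_t> seqLengths; // packing description: one length per sequence
    bool operator==(const MBLayoutish& o) const { return seqLengths == o.seqLengths; }
};

enum class SliceCheck { Broadcast, CompatibleButDistinct };

SliceCheck checkSliceLayouts(const MBLayoutish* frameLayout, const MBLayoutish* dataLayout,
                             bool broadcastAllowed)
{
    if (broadcastAllowed && !dataLayout)
        return SliceCheck::Broadcast; // time dimension broadcasts; leave the data as is
    if (frameLayout && dataLayout && *frameLayout == *dataLayout)
        return SliceCheck::CompatibleButDistinct; // formerly a LogicError, now accepted
    // contents genuinely differ (or a layout is missing where one is required)
    throw std::logic_error("FrameRange's dynamic axis is inconsistent with matrix");
}

int main()
{
    MBLayoutish a{{5, 3}}, b{{5, 3}}; // distinct objects, equal packing
    return checkSliceLayouts(&a, &b, /*broadcastAllowed=*/false)
               == SliceCheck::CompatibleButDistinct ? 0 : 1;
}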
@@ -87,12 +87,21 @@ void ComputationNode<ElemType>::Backprop(const FrameRange& fr, bool childrenInThisLoop
 // subroutines for Validate() implementations
 // -----------------------------------------------------------------------
 
-static void InconsistentMBLayout(const ComputationNodeBase& us, const ComputationNodeBase& which, ComputationNodeBase& vsWhich)
+// compare two MBLayouts, and alert if they are different
+void ComputationNodeBase::ValidateMBLayout(const ComputationNodeBasePtr which, const ComputationNodeBasePtr vsWhich) const
 {
-#if 1
-    RuntimeError("%ls: Dynamic axes mismatches between %ls and %ls. If this is by design, use ReconcileDynamicAxis().",
-                 us.NodeDescription().c_str(), which.NodeDescription().c_str(), vsWhich.NodeDescription());
+    if (!which->HasMBLayout() || !vsWhich->HasMBLayout() || which->GetMBLayout() == vsWhich->GetMBLayout())
+        return;
+    // MBLayouts are inconsistent
+#if 0
+    // can't have that
+    RuntimeError("%ls: Dynamic axes mismatch between %ls and %ls. If this is by design, use ReconcileDynamicAxis().",
+                 NodeDescription().c_str(), which->NodeDescription().c_str(), vsWhich->NodeDescription());
 #else
+    // We will let this slip with a reminder, assuming that this will be caught at runtime.
+    // By allowing this, users will not need ReconcileDynamicAxis() for reductions over a sequence like BS.Sequences.Last().
+    fprintf(stderr, "WARNING: %ls: Dynamic axes mismatch between %ls and %ls. If they are incompatible, this will fail later. If this is by design, use ReconcileDynamicAxis().\n",
+            NodeDescription().c_str(), which->NodeDescription().c_str(), vsWhich->NodeDescription().c_str());
 #endif
 }
 
@@ -104,20 +113,20 @@ static void InconsistentMBLayout(const ComputationNodeBase& us, const ComputationNodeBase& which, ComputationNodeBase& vsWhich)
 // - if there are more than one different layouts involved, this function will fail
 void ComputationNodeBase::InferMBLayoutFromInputsForStandardCase(bool isFinalValidationPass)
 {
-    MBLayoutPtr pMBLayout; // start with NULL layout
-    for (auto child : m_inputs)
+    ComputationNodeBasePtr firstInputWithMBLayout;
+    for (auto input : m_inputs)
     {
-        if (!child) // node not set yet (DelayedValueNodeBase seems to allow this)--BUGBUG: Then this function won't operate correctly.
+        if (!input) // node not set yet (DelayedValueNodeBase seems to allow this)--BUGBUG: Then this function won't operate correctly.
             ;
-        else if (!child->m_pMBLayout) // NULL layout (typical for parameter nodes)
+        else if (!input->m_pMBLayout) // NULL layout (typical for parameter nodes)
             ;
-        else if (!pMBLayout) // first non-NULL layout: just copy it
-            pMBLayout = child->m_pMBLayout;
-        else if (pMBLayout != child->m_pMBLayout && isFinalValidationPass) // got a layout--compare whether it is the same
-            InconsistentMBLayout(*this, *this, *child);
+        else if (!firstInputWithMBLayout) // first input with layout: remember this child
+            firstInputWithMBLayout = input;
+        else if (isFinalValidationPass) // got a layout--compare whether it is the same
+            ValidateMBLayout(firstInputWithMBLayout, input);
     }
     // all are consistent: install it
-    LinkToMBLayout(pMBLayout);
+    LinkToMBLayout(firstInputWithMBLayout ? firstInputWithMBLayout->m_pMBLayout : nullptr);
 }
 
 // single input that maps its input element-wise (e.g. Sigmoid)
 
@@ -140,12 +149,8 @@ void ComputationNodeBase::ValidateBinaryZip(bool isFinalValidationPass, bool allowBroadcast)
 
     ValidateInferBinaryInputDims();
 
-    if (isFinalValidationPass &&
-        Input(0)->HasMBLayout() && Input(1)->HasMBLayout() &&
-        Input(0)->GetMBLayout() != Input(1)->GetMBLayout())
-    {
-        InconsistentMBLayout(*this, *Input(0), *Input(1));
-    }
+    if (isFinalValidationPass)
+        ValidateMBLayout(Input(0), Input(1));
 
     // result has tensor shape with dimensions being the max over both
     let shape0 = GetInputSampleLayout(0);
 
@@ -187,8 +192,7 @@ void ComputationNodeBase::ValidateNaryZip(bool isFinalValidationPass, bool allowBroadcast, size_t numInputs)
     if (isFinalValidationPass)
         for (size_t i = 0; i < numInputs; i++)
             for (size_t j = i + 1; j < numInputs; j++)
-                if (Input(i)->HasMBLayout() && Input(j)->HasMBLayout() && Input(i)->GetMBLayout() != Input(j)->GetMBLayout())
-                    InconsistentMBLayout(*this, *Input(i), *Input(j));
+                ValidateMBLayout(Input(i), Input(j));
 
     // result has tensor shape with dimensions being the max over all inputs
     let shape0 = GetInputSampleLayout(0);
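The refactored InferMBLayoutFromInputsForStandardCase above remembers the first input that carries a layout, compares every later layout-bearing input against it through the now-lenient ValidateMBLayout, and links that reference layout (or none) to the node. The same selection pattern as a self-contained sketch — DynAxis, NodeRef, and pickReferenceLayout are illustrative names, not CNTK's:

// Selection pattern used by the refactored layout inference: remember the first
// input that carries a layout, compare the rest against it, then adopt it.
// DynAxis, NodeRef, and pickReferenceLayout are illustrative names for this sketch only.
#include <cstdio>
#include <memory>
#include <vector>

struct DynAxis { int id; };
using DynAxisPtr = std::shared_ptr<DynAxis>;

struct NodeRef
{
    const char* name;
    DynAxisPtr axis; // null for inputs without a dynamic axis (e.g. parameters)
};

DynAxisPtr pickReferenceLayout(const std::vector<NodeRef>& inputs, bool finalPass)
{
    const NodeRef* first = nullptr; // first input that has a layout
    for (const auto& in : inputs)
    {
        if (!in.axis)
            continue; // no dynamic axis: nothing to compare
        if (!first)
            first = &in; // remember the reference input
        else if (finalPass && first->axis != in.axis) // identity check, as in validation
            fprintf(stderr, "WARNING: %s vs. %s: dynamic axes mismatch\n", first->name, in.name);
    }
    return first ? first->axis : nullptr; // what the node would LinkToMBLayout()
}

int main()
{
    auto a = std::make_shared<DynAxis>(DynAxis{1});
    auto b = std::make_shared<DynAxis>(DynAxis{1}); // distinct object
    std::vector<NodeRef> inputs = {{"W", nullptr}, {"x", a}, {"h", b}};
    DynAxisPtr chosen = pickReferenceLayout(inputs, /*finalPass=*/true); // warns, returns a
    return chosen == a ? 0 : 1;
}

Keeping the whole reference input, rather than just its layout pointer, is what lets the eventual warning name both offending inputs.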
@@ -679,6 +679,7 @@ protected:
     void ValidateBinaryZip(bool isFinalValidationPass, bool allowBroadcast);
     void ValidateBinaryReduce(bool isFinalValidationPass);
     void ValidateNaryZip(bool isFinalValidationPass, bool allowBroadcast, size_t numInputs);
+    void ValidateMBLayout(const ComputationNodeBasePtr which, const ComputationNodeBasePtr vsWhich) const;
     void InferMBLayoutFromInputsForStandardCase(bool isFinalValidationPass);
     virtual void ValidateInferInputDimsFrom(const TensorShape&) = 0; // (implemented by ComputationNode<ElemType>)