From f88fcdb3f2b381c8c8de50c3f865a9eba1d5fb7d Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Thu, 11 Aug 2016 17:29:21 -0700
Subject: [PATCH] now accepts mismatching MBLayouts during validation, relying
 on runtime checks only. Updated ATIS sample.

---
 Examples/Text/ATIS/TutorialAll.cntk               | 52 ++++++-------------
 .../CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs     | 11 ++--
 Source/Common/Include/Sequences.h                 |  6 +--
 .../ComputationNetworkLib/ComputationNode.cpp     | 46 ++++++++--------
 .../ComputationNetworkLib/ComputationNode.h       |  1 +
 5 files changed, 52 insertions(+), 64 deletions(-)

diff --git a/Examples/Text/ATIS/TutorialAll.cntk b/Examples/Text/ATIS/TutorialAll.cntk
index 094d67b85..9b1075fb1 100644
--- a/Examples/Text/ATIS/TutorialAll.cntk
+++ b/Examples/Text/ATIS/TutorialAll.cntk
@@ -11,14 +11,14 @@ parallelTrain = true
 
 #stderr = $WorkDir$/log
 
-command = TrainSlotTagger:RunSlotTagger:EvalSlotTagger
+command = TrainATIS:RunATIS:EvalATIS
 
 vocabSize = 943  # number of words
 numLabels = 129  # number of slot labels
 numIntents = 26  # number of intent labels
 
 # The command to train the LSTM model
-TrainSlotTagger = [
+TrainATIS = [
     action = "train"
     BrainScriptNetworkBuilder = [
         inputDim = $vocabSize$
@@ -53,7 +53,7 @@ TrainSlotTagger = [
        evaluationNodes = (errs)
        outputNodes     = (z)
    ]
-    # enable this one instead for intent classification
+    # rename this to BrainScriptNetworkBuilder to switch to the intent-classification task
    Intent_BrainScriptNetworkBuilder = [
        inputDim = $vocabSize$
        labelDim = $numIntents$
@@ -61,18 +61,12 @@ TrainSlotTagger = [
 
        #hiddenDim = 300
        hiddenDim = 150
 
-       RecSplice (a2) = [ # splice with reconciliation
-           i1 = a2[0]
-           i2 = ReconcileDynamicAxis (a2[1], i1)
-           res = Splice (i1 : i2)
-       ].res
-
        model = Sequential (
            Parallel ((DelayLayer{T=1} : Identity : DelayLayer{T=-1}), Splice) :  # 3-word window
            EmbeddingLayer {embDim} :                                             # embedding
            RecurrentLSTMLayer {hiddenDim} : BS.Sequences.Last :                  # LSTM state, final state
-           #Parallel ((Sequential (RecurrentLSTMLayer {hiddenDim} : BS.Sequences.Last):
-           #    Sequential (RecurrentLSTMLayer {hiddenDim, goBackwards=true}: BS.Sequences.First)), RecSplice) :  # bidirectional LSTM
+           #Parallel ((Sequential (RecurrentLSTMLayer {hiddenDim} : BS.Sequences.Last):
+           #          Sequential (RecurrentLSTMLayer {hiddenDim, goBackwards=true} : BS.Sequences.First)), Splice) :  # bidirectional LSTM
            DenseLayer {labelDim, initValueScale=7}                               # output layer
        )
@@ -82,7 +76,7 @@ TrainSlotTagger = [
        intentLabels = Input {labelDim}
 
        # model application
-       z = ReconcileDynamicAxis (model (query), intentLabels)
+       z = model (query)
 
        # loss and metric
        ce = CrossEntropyWithSoftmax (intentLabels, z)
@@ -96,30 +90,18 @@ TrainSlotTagger = [
    ]
 
    SGD = [
-       # maximum number of epochs
-       maxEpochs = 20 # set to 1 so this can be added to regression test. Increase to 20 get a good accuracy
-       #maxEpochs = 200 # set to 1 so this can be added to regression test. Increase to 20 get a good accuracy
+       maxEpochs = 20 ; epochSize = 36000
 
-       # for each epoch, maximum number of input samples(words) is set below
-       epochSize = 36000
-
-       # minibatchSize should be larger than the maximum sentence length
        minibatchSize = 70
 
        learningRatesPerSample = 0.01*2:0.005*12:0.001
-       #learningRatesPerSample = 0.01*20:0.005*120:0.001
+       gradUpdateType = "FSAdaGrad"
 
-       gradientClippingWithTruncation = true
-       clippingThresholdPerSample = 15.0
+       gradientClippingWithTruncation = true ; clippingThresholdPerSample = 15.0
 
        # number of minibatches to report progress
-       firstMBsToShowResult = 10
-       numMBsToShowResult = 100
-
-       # if validation shows that the model has no improvement, then do back-up to the previously
-       # estimated model and reduce learning rate
-       loadBestModel = true
+       firstMBsToShowResult = 10 ; numMBsToShowResult = 100
 
        parallelTrain = [
            parallelizationMethod = "DataParallelSGD"
@@ -144,21 +126,19 @@ TrainSlotTagger = [
 ]
 
 # Run the model to predict slot labels
-RunSlotTagger = [
+RunATIS = [
    action = "write"
    BrainScriptNetworkBuilder = [
        modelAsTrained = BS.Network.Load ("$modelPath$")
        final = Hardmax (modelAsTrained.z) # make a decision
-       labels = Pass (modelAsTrained.slotLabels)
+       #labels = Pass (modelAsTrained.slotLabels)
        # enable this for intent classification:
-       #labels = Pass (modelAsTrained.intentLabels)
-       #t = DynamicAxis()
+       labels = Pass (modelAsTrained.intentLabels)
+       t = DynamicAxis()
    ]
    outputPath = $WorkDir$/model.writeaction
-   outputNodeNames = slotLabels:final
-   # enable this for intent classification:
-   #outputNodeNames = intentLabels:final
+   outputNodeNames = intentLabels:slotLabels:final
 
    reader = [
        readerType = "CNTKTextFormatReader"
@@ -173,7 +153,7 @@ RunSlotTagger = [
 ]
 
 # Evaluate the model's slot-tagging accuracy (as an error count)
-EvalSlotTagger = [
+EvalATIS = [
    action = "eval"
    modelPath = $modelPath$ # from outside
    reader = [
diff --git a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs
index 74691028a..79f4a0f45 100644
--- a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs
+++ b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs
@@ -239,7 +239,7 @@ Exp = CNTK2.Exp
 Floor = CNTK2.Floor
 Log = CNTK2.Log
 Minus = CNTK2.Minus
-Pass = CNTK2.Identity
+Pass = CNTK2.Pass
 Plus = CNTK2.Plus
 RectifiedLinear = CNTK2.ReLU # deprecated
 ReLU = CNTK2.ReLU
@@ -380,10 +380,11 @@ CNTK2 = [
     LessEqual(_, y, tag='') = new ComputationNode [ operation = 'LessEqual' ; inputs = (_ : y) /*plus the function args*/ ]
 
     // 13. Others
-    Identity(_, tag='') = new ComputationNode [ operation = 'Pass' ; inputs = _ /*plus the function args*/ ]
+    Pass(_, tag='') = new ComputationNode [ operation = 'Pass' ; inputs = _ /*plus the function args*/ ]
+    Identity = Pass
 ]
 
 # Parameter{} can do several forms of initialization.
 #  - initValue=scalar, value=array --> initialize from this value --array form not implemented yet
 #  - initFromFilePath="..." --> read from a data file
 #  - init="uniform|gaussian" (random init scaled by initValueScale). Warning: This has magic scaling factors. TODO: document them here
@@ -393,7 +394,9 @@ CNTK2 = [
 #  - init="fixedValue", value from 'value'
 # Warning: Current config will behave unexpected if user mistypes 'initValue' as 'value' (which will be ignored, defaulting to "uniform" init)
 Parameter {outputDim, inputDim, learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0/*deprecated*/, initValue = '', initFromFilePath = '', initFromLiteral = ''/*deprecated*/, initOnCPUOnly=true, randomSeed=-1, tag=''} = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ dims = (outputDim : inputDim) ] /*plus the function args*/ ]
-LearnableParameter = Parameter // deprecated
+
+LearnableParameter = Parameter // deprecated
+
 # TODO: make Parameter take tensor dims?
 ParameterTensor {dims, learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0, initValue = '', initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag=''} = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]
 ConstantFromString(literal, tag='') = ParameterTensor((0)/*dim, will be inferred*/, initFromLiteral = literal, learningRateMultiplier = 0.0)
diff --git a/Source/Common/Include/Sequences.h b/Source/Common/Include/Sequences.h
index 20b25c1d4..dc7cf2470 100644
--- a/Source/Common/Include/Sequences.h
+++ b/Source/Common/Include/Sequences.h
@@ -962,7 +962,7 @@ static inline std::pair<size_t, size_t> ColumnRangeWithMBLayoutFor(size_t numCol
     if (fr.m_broadcastAllowed && !pMBLayout && numCols == 1)
         return std::pair<size_t, size_t>(0, numCols);
     if (fr.m_pMBLayout && pMBLayout && *fr.m_pMBLayout == *pMBLayout)
-        LogicError("DataFor: FrameRange's dynamic axis is inconsistent with matrix. They are compatible though--are you missing a ReconcileDynamicAxis operation?");
+        ; // LogicError("DataFor: FrameRange's dynamic axis is inconsistent with matrix. They are compatible though--are you missing a ReconcileDynamicAxis operation?");
     else
         LogicError("DataFor: FrameRange's dynamic axis is inconsistent with matrix.");
 }
@@ -1055,8 +1055,8 @@ static inline std::pair<DimensionVector, DimensionVector> TensorSliceWithMBLayou
     if (fr.m_pMBLayout /*get data for a loop*/ && !pMBLayout /*'data' is not samples*/ && fr.m_broadcastAllowed /*we're OK with that*/)
         ; // the time dimension is broadcasting--leave it as is
     else if (fr.m_pMBLayout && pMBLayout && *fr.m_pMBLayout == *pMBLayout)
-        LogicError("DataFor: FrameRange's dynamic axis is inconsistent with matrix. They are compatible though--are you missing a ReconcileDynamicAxis operation? %s vs. %s",
-                   static_cast<std::string>(*(fr.m_pMBLayout)).c_str(), static_cast<std::string>(*(pMBLayout)).c_str());
+        ; // LogicError("DataFor: FrameRange's dynamic axis is inconsistent with matrix. They are compatible though--are you missing a ReconcileDynamicAxis operation? %s vs. %s",
+          //            static_cast<std::string>(*(fr.m_pMBLayout)).c_str(), static_cast<std::string>(*(pMBLayout)).c_str());
     else
         LogicError("DataFor: FrameRange's dynamic axis is inconsistent with matrix: %s vs. %s",
                    static_cast<std::string>(*(fr.m_pMBLayout)).c_str(), static_cast<std::string>(*(pMBLayout)).c_str());
%s", static_cast(*(fr.m_pMBLayout)).c_str(), static_cast(*(pMBLayout)).c_str()); diff --git a/Source/ComputationNetworkLib/ComputationNode.cpp b/Source/ComputationNetworkLib/ComputationNode.cpp index ee0437f2c..f9feda69b 100644 --- a/Source/ComputationNetworkLib/ComputationNode.cpp +++ b/Source/ComputationNetworkLib/ComputationNode.cpp @@ -87,12 +87,21 @@ void ComputationNode::Backprop(const FrameRange& fr, bool childrenInTh // subroutines for Validate() implementations // ----------------------------------------------------------------------- -static void InconsistentMBLayout(const ComputationNodeBase& us, const ComputationNodeBase& which, ComputationNodeBase& vsWhich) +// compare two MBLayouts, and alert if they are different +void ComputationNodeBase::ValidateMBLayout(const ComputationNodeBasePtr which, const ComputationNodeBasePtr vsWhich) const { -#if 1 - RuntimeError("%ls: Dynamic axes mismatches between %ls and %ls. If this is by design, use ReconcileDynamicAxis().", - us.NodeDescription().c_str(), which.NodeDescription().c_str(), vsWhich.NodeDescription()); + if (!which->HasMBLayout() || !vsWhich->HasMBLayout() || which->GetMBLayout() == vsWhich->GetMBLayout()) + return; + // MBLayouts are inconsistent +#if 0 + // can't have that + RuntimeError("%ls: Dynamic axes mismatch between %ls and %ls. If this is by design, use ReconcileDynamicAxis().", + NodeDescription().c_str(), which->NodeDescription().c_str(), vsWhich->NodeDescription()); #else + // We will let this slip with a reminder, assuming that this will be caught at runtime. + // By allowing this, users will not need ReconcileDynamicAxis() for reductions over a sequence like BS.Sequences.Last(). + fprintf(stderr, "WARNING: %ls: Dynamic axes mismatch between %ls and %ls. If they are incompatible, this will fail later. If this is by design, use ReconcileDynamicAxis().\n", + NodeDescription().c_str(), which->NodeDescription().c_str(), vsWhich->NodeDescription().c_str()); #endif } @@ -104,20 +113,20 @@ static void InconsistentMBLayout(const ComputationNodeBase& us, const Computatio // - if there are more than one different layouts involved, this function will fail void ComputationNodeBase::InferMBLayoutFromInputsForStandardCase(bool isFinalValidationPass) { - MBLayoutPtr pMBLayout; // start with NULL layout - for (auto child : m_inputs) + ComputationNodeBasePtr firstInputWithMBLayout; + for (auto input : m_inputs) { - if (!child) // node not set yet (DelayedValueNodeBase seems to allow this)--BUGBUG: Then this function won't operate correctly. + if (!input) // node not set yet (DelayedValueNodeBase seems to allow this)--BUGBUG: Then this function won't operate correctly. ; - else if (!child->m_pMBLayout) // NULL layout (typical for parameter nodes) + else if (!input->m_pMBLayout) // NULL layout (typical for parameter nodes) ; - else if (!pMBLayout) // first non-NULL layout: just copy it - pMBLayout = child->m_pMBLayout; - else if (pMBLayout != child->m_pMBLayout && isFinalValidationPass) // got a layout--compare whether it is the same - InconsistentMBLayout(*this, *this, *child); + else if (!firstInputWithMBLayout) // first input with layout: remember this child + firstInputWithMBLayout = input; + else if (isFinalValidationPass) // got a layout--compare whether it is the same + ValidateMBLayout(firstInputWithMBLayout, input); } // all are consistent: install it - LinkToMBLayout(pMBLayout); + LinkToMBLayout(firstInputWithMBLayout ? 
 }
 
 // single input that maps its input element-wise (e.g. Sigmoid)
@@ -140,12 +149,8 @@ void ComputationNodeBase::ValidateBinaryZip(bool isFinalValidationPass, bool all
 
     ValidateInferBinaryInputDims();
 
-    if (isFinalValidationPass &&
-        Input(0)->HasMBLayout() && Input(1)->HasMBLayout() &&
-        Input(0)->GetMBLayout() != Input(1)->GetMBLayout())
-    {
-        InconsistentMBLayout(*this, *Input(0), *Input(1));
-    }
+    if (isFinalValidationPass)
+        ValidateMBLayout(Input(0), Input(1));
 
     // result has tensor shape with dimensions being the max over both
     let shape0 = GetInputSampleLayout(0);
@@ -187,8 +192,7 @@ void ComputationNodeBase::ValidateNaryZip(bool isFinalValidationPass, bool allow
     if (isFinalValidationPass)
         for (size_t i = 0; i < numInputs; i++)
             for (size_t j = i + 1; j < numInputs; j++)
-                if (Input(i)->HasMBLayout() && Input(j)->HasMBLayout() && Input(i)->GetMBLayout() != Input(j)->GetMBLayout())
-                    InconsistentMBLayout(*this, *Input(i), *Input(j));
+                ValidateMBLayout(Input(i), Input(j));
 
     // result has tensor shape with dimensions being the max over all inputs
     let shape0 = GetInputSampleLayout(0);
diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h
index 6b5680dc6..853c9a159 100644
--- a/Source/ComputationNetworkLib/ComputationNode.h
+++ b/Source/ComputationNetworkLib/ComputationNode.h
@@ -679,6 +679,7 @@ protected:
     void ValidateBinaryZip(bool isFinalValidationPass, bool allowBroadcast);
     void ValidateBinaryReduce(bool isFinalValidationPass);
     void ValidateNaryZip(bool isFinalValidationPass, bool allowBroadcast, size_t numInputs);
+    void ValidateMBLayout(const ComputationNodeBasePtr which, const ComputationNodeBasePtr vsWhich) const;
     void InferMBLayoutFromInputsForStandardCase(bool isFinalValidationPass);
     virtual void ValidateInferInputDimsFrom(const TensorShape&) = 0; // (implemented by ComputationNode)
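
Usage note: the user-visible effect of this patch is the one exercised by the ATIS sample above. A minimal BrainScript sketch of what now passes validation with only a WARNING (names and dimensions here are illustrative, not from the sample; the runtime layout checks in Sequences.h still guard against truly incompatible axes):

    query  = Input {943}   # word sequence, carries the per-word dynamic axis
    labels = Input {26}    # one label per sequence, i.e. a different dynamic axis
    h = RecurrentLSTMLayer {150} (EmbeddingLayer {150} (query))
    z = DenseLayer {26} (BS.Sequences.Last (h))   # the reduction drops the word axis
    # before this patch, validation required an explicit reconciliation:
    #   ce = CrossEntropyWithSoftmax (labels, ReconcileDynamicAxis (z, labels))
    # now the mismatch is only warned about at validation time:
    ce = CrossEntropyWithSoftmax (labels, z)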