diff --git a/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_50_ndl_deprecated.cntk b/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_50_ndl_deprecated.cntk index c6478384d..6482e2afb 100644 --- a/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_50_ndl_deprecated.cntk +++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_50_ndl_deprecated.cntk @@ -88,11 +88,11 @@ Train=[ ] PBN=[ - action="pbn" + action="bnstat" modelPath="$ModelDir$/ResNet_50" # Set minibatch size for testing. minibatchSize=256 - iters=30 + itersPerNode=30 reader=[ readerType="ImageReader" diff --git a/Makefile b/Makefile index 895d6be6c..6ea6062e4 100644 --- a/Makefile +++ b/Makefile @@ -439,7 +439,8 @@ EVAL:=eval SGDLIB_SRC=\ $(SOURCEDIR)/SGDLib/Profiler.cpp \ - $(SOURCEDIR)/SGDLib/SGD.cpp + $(SOURCEDIR)/SGDLib/SGD.cpp \ + $(SOURCEDIR)/SGDLib/PostComputingActions.cpp \ EVAL_SRC=\ $(SOURCEDIR)/EvalDll/CNTKEval.cpp \ diff --git a/Source/ActionsLib/Actions.h b/Source/ActionsLib/Actions.h index 2a1c83e9a..b991e1b95 100644 --- a/Source/ActionsLib/Actions.h +++ b/Source/ActionsLib/Actions.h @@ -42,11 +42,11 @@ template void DoDumpNodes(const ConfigParameters& config); template void DoEdit(const ConfigParameters& config); +template +void DoBatchNormalizationStat(const ConfigParameters& config); // evaluation (EvalActions.cpp) template -void DoEvalBN(const ConfigParameters& config); -template void DoEval(const ConfigParameters& config); template void DoCrossValidate(const ConfigParameters& config); diff --git a/Source/ActionsLib/EvalActions.cpp b/Source/ActionsLib/EvalActions.cpp index cbba55207..b2c8c1f1b 100644 --- a/Source/ActionsLib/EvalActions.cpp +++ b/Source/ActionsLib/EvalActions.cpp @@ -78,62 +78,6 @@ static void DoEvalBase(const ConfigParameters& config, IDataReader& reader) eval.Evaluate(&reader, evalNodeNamesVector, mbSize[0], epochSize); } -// =========================================================================== -// DoEvalBNBase() - implements CNTK "pbn" command -// =========================================================================== - -template -static void DoEvalBNBase(const ConfigParameters& config, IDataReader& reader) -{ - // DEVICEID_TYPE deviceId = DeviceFromConfig(config); - ConfigArray minibatchSize = config(L"minibatchSize", "40960"); - size_t epochSize = config(L"epochSize", "0"); - if (epochSize == 0) - { - epochSize = requestDataSize; - } - wstring modelPath = config(L"modelPath"); - wstring exportPath = modelPath + L".PBN"; - intargvector mbSize = minibatchSize; - - int iters = config(L"iters", 240); - - int traceLevel = config(L"traceLevel", "0"); - size_t numMBsToShowResult = config(L"numMBsToShowResult", "100"); - size_t firstMBsToShowResult = config(L"firstMBsToShowResult", "0"); - size_t maxSamplesInRAM = config(L"maxSamplesInRAM", (size_t)SIZE_MAX); - size_t numSubminiBatches = config(L"numSubminibatches", (size_t)1); - - bool enableDistributedMBReading = config(L"distributedMBReading", false); - - vector evalNodeNamesVector; - - let net = GetModelFromConfig(config, L"evalNodeNames", evalNodeNamesVector); - - // set tracing flags - net->EnableNodeTracing(config(L"traceNodeNamesReal", ConfigParameters::Array(stringargvector())), - config(L"traceNodeNamesCategory", ConfigParameters::Array(stringargvector())), - config(L"traceNodeNamesSparse", ConfigParameters::Array(stringargvector()))); - - SimpleEvaluator eval(net, MPIWrapper::GetInstance(), enableDistributedMBReading, numMBsToShowResult, - firstMBsToShowResult, traceLevel, maxSamplesInRAM, numSubminiBatches); - 
eval.EvaluateBN(&reader, evalNodeNamesVector, exportPath, mbSize[0], iters, epochSize); -} - -template -void DoEvalBN(const ConfigParameters& config) -{ - // evaluate batch normalization mean and various - ConfigParameters readerConfig(config(L"reader")); - - // Should trace level to zero in Post BN? - //readerConfig.Insert("traceLevel", config(L"traceLevel", "0")); - - DataReader evaBNDataReader(readerConfig); - - DoEvalBNBase(config, evaBNDataReader); -} - template void DoEval(const ConfigParameters& config) { @@ -146,8 +90,6 @@ void DoEval(const ConfigParameters& config) DoEvalBase(config, testDataReader); } -template void DoEvalBN(const ConfigParameters& config); -template void DoEvalBN(const ConfigParameters& config); template void DoEval(const ConfigParameters& config); template void DoEval(const ConfigParameters& config); diff --git a/Source/ActionsLib/TrainActions.cpp b/Source/ActionsLib/TrainActions.cpp index eaaec303e..97b818c67 100644 --- a/Source/ActionsLib/TrainActions.cpp +++ b/Source/ActionsLib/TrainActions.cpp @@ -26,6 +26,7 @@ #include "ScriptableObjects.h" #include "BrainScriptEvaluator.h" #include "BrainScriptParser.h" +#include "PostComputingActions.h" #include #include @@ -235,3 +236,46 @@ void DoEdit(const ConfigParameters& config) template void DoEdit(const ConfigParameters& config); template void DoEdit(const ConfigParameters& config); + +// =========================================================================== +// DoBatchNormalizationStat() - implements CNTK "bnstat" command +// =========================================================================== + +template +void DoBatchNormalizationStat(const ConfigParameters& config) +{ + ConfigParameters readerConfig(config(L"reader")); + readerConfig.Insert("traceLevel", config(L"traceLevel", "0")); + + auto dataReader = make_shared(readerConfig); + + int traceLevel = config(L"traceLevel", "0"); + int itersPerNode = config(L"itersPerNode", 30); + + ConfigArray minibatchSize = config(L"minibatchSize", "40960"); + intargvector mbSize = minibatchSize; + + bool enableDistributedMBReading = config(L"enableDistributedMBReading", false); + + wstring curModelPath = config(L"modelPath", L""); + wstring newModelPath = config(L"newModelPath", L""); + if (newModelPath == L"") + { + newModelPath = curModelPath + L".PBN"; + } + + std::vector evalNodeNames; + let net = GetModelFromConfig(config, L"evalNodeNames", evalNodeNames); + // set tracing flags + net->EnableNodeTracing(config(L"traceNodeNamesReal", ConfigParameters::Array(stringargvector())), + config(L"traceNodeNamesCategory", ConfigParameters::Array(stringargvector())), + config(L"traceNodeNamesSparse", ConfigParameters::Array(stringargvector()))); + + PostComputingActions postComputingActions(net, MPIWrapper::GetInstance(), enableDistributedMBReading, traceLevel); + + postComputingActions.BatchNormalizationStatistics(dataReader.get(), evalNodeNames, newModelPath, mbSize[0], itersPerNode); +} + +template void DoBatchNormalizationStat(const ConfigParameters& config); +template void DoBatchNormalizationStat(const ConfigParameters& config); + diff --git a/Source/CNTK/CNTK.cpp b/Source/CNTK/CNTK.cpp index 9d07bb5f7..175713156 100644 --- a/Source/CNTK/CNTK.cpp +++ b/Source/CNTK/CNTK.cpp @@ -154,7 +154,7 @@ static void DisableLegacyUsage(const ConfigParameters& TopLevelConfig, const Con // When running in parallel with MPI, only commands in 'commandstoRunOnAllRanks' should // be run in parallel across multiple ranks. 
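For reference (a consolidated view, not an additional hunk): the renamed action block from the ResNet_50 example above, limited to the keys that DoBatchNormalizationStat() actually reads. The reader body is elided as in the example file, and the commented-out newModelPath line is only illustrative of the default, which the code derives as modelPath + ".PBN".

PBN=[
    action="bnstat"
    modelPath="$ModelDir$/ResNet_50"
    # Set minibatch size for testing.
    minibatchSize=256
    # Number of minibatches evaluated per BatchNormalization node (defaults to 30 in the code).
    itersPerNode=30
    # Optional output path for the re-estimated model; defaults to "<modelPath>.PBN".
    # newModelPath="$ModelDir$/ResNet_50.PBN"
    reader=[
        readerType="ImageReader"
        # ... reader configuration as in the example file ...
    ]
]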
Others should only run on rank 0 -const std::set commandstoRunOnAllRanks = { "train", "trainRNN", "adapt", "test", "eval", "cv", "devtest", "pbn" }; +const std::set commandstoRunOnAllRanks = { "train", "trainRNN", "adapt", "test", "eval", "cv", "devtest", "bnstat" }; // process the command template @@ -243,9 +243,9 @@ void DoCommands(const ConfigParameters& config, const shared_ptr& mp LOGPRINTF(stderr, "CNTKCommandTrainEnd: %s\n", command[i].c_str()); fullEpochsOffset += GetMaxEpochs(commandParams); } - else if (thisAction == "pbn") + else if (thisAction == "bnstat") { - DoEvalBN(commandParams); + DoBatchNormalizationStat(commandParams); } else if (thisAction == "adapt") { diff --git a/Source/ComputationNetworkLib/ComputationNetwork.h b/Source/ComputationNetworkLib/ComputationNetwork.h index 9b072e6a6..7c934aef2 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.h +++ b/Source/ComputationNetworkLib/ComputationNetwork.h @@ -136,10 +136,6 @@ public: // main entry point for backprop void Backprop(const ComputationNodeBasePtr rootNode); - // partial forward entry - void ForwardProp(const ComputationNodeBasePtr rootNode, const ComputationNodeBasePtr startNode, - const ComputationNodeBasePtr endNode); - template // version that takes multiple nodes void ForwardProp(const NODESET& nodes) { @@ -678,6 +674,44 @@ public: return nodesWithType; } + // Get the eval nodes with names + // if evalNodeNames are not specified, return all the default evalnodes and training criterion nodes. + std::vector GetEvalNodesWithName(const std::vector evalNodeNames) + { + // determine nodes to evaluate + std::vector evalNodes; + + set criteriaLogged; // (keeps track ot duplicates to avoid we don't double-log critera) + if (evalNodeNames.size() == 0) + { + fprintf(stderr, "evalNodeNames are not specified, using all the default evalnodes and training criterion nodes.\n"); + if (EvaluationNodes().empty() && FinalCriterionNodes().empty()) + InvalidArgument("There is no default evaluation node or training criterion specified in the network."); + + for (const auto& node : EvaluationNodes()) + if (criteriaLogged.insert(node).second) + evalNodes.push_back(node); + + for (const auto& node : FinalCriterionNodes()) + if (criteriaLogged.insert(node).second) + evalNodes.push_back(node); + } + else + { + for (int i = 0; i < evalNodeNames.size(); i++) + { + const auto& node = GetNodeFromName(evalNodeNames[i]); + if (!criteriaLogged.insert(node).second) + continue; + if (node->GetSampleLayout().GetNumElements() != 1) + InvalidArgument("Criterion nodes to evaluate must have dimension 1x1."); + evalNodes.push_back(node); + } + } + + return evalNodes; + } + public: // return list of nodes that require precomputation and not precomputed yet std::list GetNodesRequiringPreComputation(const ComputationNodeBasePtr& rootNode = nullptr, bool checkComputed = true); @@ -1014,7 +1048,7 @@ protected: virtual const std::wstring OperationName() const override { return L"PARTraversalFlowControlNode"; - } + } virtual void BeginForwardProp() override { } @@ -1038,8 +1072,6 @@ protected: virtual void AllocateGradientMatricesForInputs(MatrixPool& matrixPool); virtual void RequestMatricesBeforeBackprop(MatrixPool& matrixPool); virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool); - - virtual void ForwardProp(const FrameRange&, const ComputationNodeBasePtr, const ComputationNodeBasePtr) override; public: // this special constructor constructs the top-level network node diff --git 
a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp index 6501f948b..e830a35f5 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp @@ -79,17 +79,6 @@ void ComputationNetwork::Backprop(const ComputationNodeBasePtr rootNode) // trai GetNestedNetwork(rootNode)->Backprop(FrameRange(nullptr), true, true); } -void ComputationNetwork::ForwardProp(const ComputationNodeBasePtr rootNode, const ComputationNodeBasePtr startNode, const ComputationNodeBasePtr endNode) -{ - VerifyIsCompiled("ForwardProp"); - - // traverse partial nodes as inputs - shared_ptr network = dynamic_pointer_cast(GetNestedNetwork(rootNode)); - assert(network); - - network->ForwardProp(FrameRange(nullptr), startNode, endNode); -} - void ComputationNetwork::FormNestedNetwork(const ComputationNodeBasePtr& rootNode) { if (m_nestedNetworks.find(rootNode) != m_nestedNetworks.end()) @@ -158,7 +147,6 @@ ComputationNetwork::PARTraversalFlowControlNode::PARTraversalFlowControlNode(con } } } - /*virtual*/ void ComputationNetwork::PARTraversalFlowControlNode::Backprop(const FrameRange& fr, bool childrenInThisLoop, bool childrenInOuterLoop) /*override*/ { childrenInThisLoop, childrenInOuterLoop; // TODO: think through what these mean when coming from PAR mode @@ -187,36 +175,7 @@ ComputationNetwork::PARTraversalFlowControlNode::PARTraversalFlowControlNode(con /*virtual*/ void ComputationNetwork::PARTraversalFlowControlNode::ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) /*override*/ { } -/*virtual*/ void ComputationNetwork::PARTraversalFlowControlNode::ForwardProp(const FrameRange & fr, ComputationNodeBasePtr startNode, ComputationNodeBasePtr endNode) -{ - // if start node is nullptr, forward will be enable - bool enableForward = startNode ? 
false : true; - for (auto& node : m_nestedNodes) - { -#if 0 - if (dynamic_pointer_cast>(node)) - dynamic_pointer_cast>(node)->DebugLogMinibatch(); -#endif - if (node->IsOutOfDateWrtInputs() && enableForward) - { - node->BeginForwardProp(); - node->ForwardProp(fr.WithLayout(node->GetMBLayout())); - node->EndForwardProp(); - - node->BumpEvalTimeStamp(); - } - - if (node == startNode) - { - enableForward = true; - } - else if (node == endNode) - { - break; - } - } -} // ----------------------------------------------------------------------- // SEQTraversalFlowControlNode methods -- implements SEQ traversal (loop unrolling) diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index e441adfb2..b0444dece 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -1878,10 +1878,6 @@ public: virtual void DumpNodeInfo(const bool /*printValues*/, const bool /*printMetadata*/, File& fstream) const override {} virtual std::set> GetMatrixInfo() const override { NOT_IMPLEMENTED; } - virtual void ForwardProp(const FrameRange&, const ComputationNodeBasePtr, const ComputationNodeBasePtr) { NOT_IMPLEMENTED; } - - std::vector GetNestedNodes() { return m_nestedNodes; } - protected: public: // needed in ComputationNetwork::FindInRecurrentLoops(), which really should be part of SEQTraversalFlowControlNode std::vector m_nestedNodes; // nodes tucked away in this node, in evaluation order }; diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h index 76496ccd6..95119d60e 100644 --- a/Source/ComputationNetworkLib/InputAndParamNodes.h +++ b/Source/ComputationNetworkLib/InputAndParamNodes.h @@ -37,6 +37,7 @@ public: MarkValueNonSharable(); m_initString = L"fromValue"; // default init is with 0; typically overwritten m_initValue = 0; + m_regMultiplier = 1.0f; // enable reg in update by default } LearnableParameter(DEVICEID_TYPE deviceId, const wstring& name, const TensorShape& shape) : LearnableParameter(deviceId, name) @@ -101,6 +102,14 @@ public: // called from CloneFunction(..., parameters="constant") virtual void FreezeParameters() override; // from IFreezable + // Setting the reg multiplier for a learnable node, effecting L1Reg and L2Reg both. + void SetRegMultiplier(float regMultiplier) + { + m_regMultiplier = regMultiplier; + } + // called from SGD UpdateWeights, to adjust the reg for each node + float GetRegMultiplier() const { return m_regMultiplier; } + private: // init parameters for deferred initialization (which happens in Validate()) std::wstring m_initString; // if non-empty then deferred initialization is needed. Gets cleared upon completion of deferred init. 
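Not part of the patch: a minimal sketch of how the new per-node multiplier is intended to combine with the global regularization weights once UpdateWeights() picks it up (see the SGD.cpp hunk further down); the helper name is hypothetical.

#include <utility>

// Effective per-node penalties: the node-level multiplier simply scales both weights,
// so DisableRegInBatchNormalization(), which sets the multiplier of the scale and bias
// parameters to 0, switches L1 and L2 regularization off for those nodes only.
static std::pair<double, double> EffectiveRegWeights(double l2RegWeight, double l1RegWeight, float regMultiplier)
{
    return { l2RegWeight * regMultiplier, l1RegWeight * regMultiplier };
}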
@@ -109,6 +118,9 @@ private: int m_initOutputRank; bool m_initOnCPUOnly; ElemType m_initValue; + + // flags related to gradient update + float m_regMultiplier; // The multiplier to adjust the L1Reg and L2Reg for Learnable node }; // ----------------------------------------------------------------------- diff --git a/Source/ComputationNetworkLib/TrainingNodes.h b/Source/ComputationNetworkLib/TrainingNodes.h index b687d88f8..31bc26913 100644 --- a/Source/ComputationNetworkLib/TrainingNodes.h +++ b/Source/ComputationNetworkLib/TrainingNodes.h @@ -8,6 +8,7 @@ #include "ComputationNode.h" #include "BatchNormalizationEngine.h" #include "RNGHandle.h" +#include "InputAndParamNodes.h" #define __STDC_FORMAT_MACROS #include @@ -1587,15 +1588,15 @@ class BatchNormalizationNode : public ComputationNodeNonLooping, publi public: BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name) : Base(deviceId, name), m_spatial(false), m_normTimeConst(0), m_blendTimeConst(0), m_epsilon(0), m_useCntkEngine(true), - m_samplesSeen(0), m_imageLayoutKind(ImageLayoutKind::CHW), m_postBatchNormalization(false), m_swapNormTimeConst(0), - m_swapBlendTimeConst(0), m_convertRunningVariancePending(false) + m_samplesSeen(0), m_imageLayoutKind(ImageLayoutKind::CHW), + m_convertRunningVariancePending(false) { } BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name, bool spatial, double normalizationTimeConstant, double blendTimeConstant, double epsilon, bool useCntkEngine, ImageLayoutKind imageLayoutKind) : Base(deviceId, name), m_spatial(spatial), m_normTimeConst(normalizationTimeConstant), m_blendTimeConst(blendTimeConstant), - m_epsilon(epsilon), m_useCntkEngine(useCntkEngine), m_imageLayoutKind(imageLayoutKind), m_samplesSeen(0), m_postBatchNormalization(false), - m_swapNormTimeConst(0), m_swapBlendTimeConst(0), m_convertRunningVariancePending(false) + m_epsilon(epsilon), m_useCntkEngine(useCntkEngine), m_imageLayoutKind(imageLayoutKind), m_samplesSeen(0), + m_convertRunningVariancePending(false) { } BatchNormalizationNode(const ScriptableObjects::IConfigRecordPtr configp) : @@ -1605,9 +1606,6 @@ public: ImageLayoutKindFrom(configp->Get(L"imageLayout"))) { AttachInputsFromConfig(configp, this->GetExpectedNumInputs()); - m_postBatchNormalization = false; - m_swapNormTimeConst = 0; - m_swapBlendTimeConst = 0; } void Save(File& fstream) const override @@ -1724,7 +1722,7 @@ private: // time-constant conversions double ComputeExpAvgFactor() const { // in inference mode, only use long-term mean and do not update running estimates - if (!Environment().IsTraining() && !m_postBatchNormalization) + if (!Environment().IsTraining()) { if (m_samplesSeen == 0) RuntimeError("%ls: inference mode is used, but nothing has been trained.", NodeName().c_str()); @@ -1756,7 +1754,7 @@ private: // time-constant conversions double ComputeBlendFactor() const { // in inference mode, only use long-term mean and do not update running estimates - if (!Environment().IsTraining() && !m_postBatchNormalization) + if (!Environment().IsTraining()) { if (m_samplesSeen == 0) RuntimeError("%ls: inference mode is used, but nothing has been trained.", NodeName().c_str()); @@ -1805,7 +1803,7 @@ public: // In inference-only mode, m_savedMean and m_saveInvStdDev will not be // produced and BackpropToNonLooping() may not be called. In // non-inference (training) mode, saved statistics must be produced. 
- bool inferenceOnly = !Environment().IsTraining() && !m_postBatchNormalization; + bool inferenceOnly = !Environment().IsTraining(); m_bnEng->Forward(/*in=*/ sliceInputValue, scale, bias, // (in) inferenceOnly, expAvgFactor, blendFactor, runMean, runVariance, // (in/out) running estimates, updated from the current MB mean/variance @@ -1870,14 +1868,6 @@ public: } virtual void EndForwardProp() override - { - if(m_postBatchNormalization) - m_samplesSeen += GetMBLayout()->GetActualNumSamples(); - - Base::EndForwardProp(); - } - - virtual void EndBackprop() override { // Update samples if not locked. double expAvgFactor = ComputeExpAvgFactor(); // weight for the new MB statistics in the running estimate. The previous value of the running statistics is kept with weight (1-this) @@ -2019,28 +2009,29 @@ public: m_blendTimeConst = std::numeric_limits::infinity(); } + // ResetStatisticsState will set the batch normal statistics into initial state + // used for re-statistics the mean and variance of BN + // any others use may lead undependable results, please be careful + void ResetStatisticsState() + { + m_samplesSeen = 0; + m_normTimeConst = 0; + m_blendTimeConst = 0; + } + // Turn off the L1 and L2 regularization + void DisableRegInBatchNormalization() + { + let scaleNode = dynamic_pointer_cast>(Input(1)); + let biasNode = dynamic_pointer_cast>(Input(2)); + scaleNode->SetRegMultiplier(0.f); + biasNode->SetRegMultiplier(0.f); + } double NormalizationTimeConstant() const { return m_normTimeConst; } double BlendTimeConstant() const { return m_blendTimeConst; } bool Spatial() const { return m_spatial; } double Epsilon() const { return m_epsilon; } bool UseCNTKEngine() const { return m_useCntkEngine; } - void SetPostBatchNormalizationBegin() - { - m_postBatchNormalization = true; - m_samplesSeen = 0; - m_swapNormTimeConst = m_normTimeConst; - m_swapBlendTimeConst = m_blendTimeConst; - m_normTimeConst = -1; - m_blendTimeConst = 0; - } - void SetPostBatchNormalizationEnd() - { - m_postBatchNormalization = false; - m_normTimeConst = m_swapNormTimeConst; - m_blendTimeConst = m_swapBlendTimeConst; - } - private: // Old versioning - do not use. Do not remove until we're sure there are no old models around. struct VersionInfo @@ -2104,11 +2095,6 @@ private: std::unique_ptr> m_bnEng; - // post batch normalization process mark - bool m_postBatchNormalization; - - double m_swapNormTimeConst; - double m_swapBlendTimeConst; bool m_convertRunningVariancePending; }; diff --git a/Source/SGDLib/PostComputingActions.cpp b/Source/SGDLib/PostComputingActions.cpp new file mode 100644 index 000000000..03b2513a7 --- /dev/null +++ b/Source/SGDLib/PostComputingActions.cpp @@ -0,0 +1,161 @@ +// +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. +// +// PostStat.cpp -- CNTK post statistics related actions +// + +#include "PostComputingActions.h" + +#include "TrainingNodes.h" +#include "ProgressTracing.h" +#include "DataReaderHelpers.h" +#include "SimpleDistGradAggregator.h" + +#include + +namespace Microsoft { namespace MSR{ namespace CNTK { + +template +void PostComputingActions::BatchNormalizationStatistics(IDataReader * dataReader, const vector& evalNodeNames, + const wstring newModelPath, const size_t mbSize, const int iters) +{ + // since the mean and variance of bn will be modified in statistics, + // training mode will make it work. And there is no back prop, other parameters + // are fixed during computing. 
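+    // (In other words: this is a forward-only pass. Training mode is needed so that each
+    // BatchNormalizationNode updates its running estimates during ForwardProp,
+    //     runMean <- (1 - expAvgFactor) * runMean + expAvgFactor * batchMean
+    // and likewise for the running variance; since Backprop is never called, all learnable
+    // parameters stay fixed. ResetStatisticsState() below resets the sample count and time
+    // constants, so the estimates are re-computed from the minibatches seen here rather than
+    // blended with the values accumulated during training.)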
+ ScopedNetworkOperationMode modeGuard(m_net, NetworkOperationMode::training); + + // bn nodes need to be computed from bottom to top with evaluating order + let evalNodes = m_net->GetEvalNodesWithName(evalNodeNames); + + // find all the BN nodes by evalOrder + std::vector bnNodes; + std::set bnNodesLogged; // (avoid double record of batch normalization nodes) + for (auto& evalNode : evalNodes) + { + for (auto& node : m_net->GetEvalOrder(evalNode)) + { + let bnNode = dynamic_pointer_cast>(node); + if (bnNode) + { + if (bnNodesLogged.insert(node).second) + { + // reset the statistics states of bn nodes + bnNode->ResetStatisticsState(); + bnNode->SetNormalizationTimeConstants(-1, bnNode->NormalizationTimeConstant(), + 0, bnNode->BlendTimeConstant()); + bnNodes.push_back(node); + // add BN nodes into the evaluation group, then they will be added into root nodes when + // the network re-compile + m_net->AddToNodeGroup(L"evaluation", bnNode); + } + } + } + } + + // re-compile the network to add bn nodes as rootNodes. + m_net->CompileNetwork(); + + // allocate memory for all bnNodes evalOrder + m_net->AllocateAllMatrices(bnNodes, std::vector(), nullptr); + + // prepare features + auto& featureNodes = m_net->FeatureNodes(); + + StreamMinibatchInputs inputMatrices; + for (auto& node : featureNodes) + inputMatrices.AddInput(node->NodeName(), node->ValuePtr(), node->GetMBLayout(), node->GetSampleLayout()); + + bool useParallelTrain = (m_mpi != nullptr); + bool useDistributedMBReading = useParallelTrain && m_enableDistributedMBReading && dataReader->SupportsDistributedMBRead(); + size_t totalEpochSize = bnNodes.size() * mbSize * iters; + + m_net->StartEvaluateMinibatchLoop(bnNodes); + + if (useDistributedMBReading) + dataReader->StartDistributedMinibatchLoop(mbSize, 0, m_mpi->CurrentNodeRank(), m_mpi->NumNodesInUse(), totalEpochSize); + else + dataReader->StartMinibatchLoop(mbSize, 0, totalEpochSize); + + for (auto& node : bnNodes) + { + let bnNode = static_pointer_cast>(node); + size_t actualMBSize = 0; + + LOGPRINTF(stderr, "Estimating Statistics --> %ls\n", bnNode->GetName().c_str()); + + + // for every single bn node, the statistics is the average of mean and variance for several times in forward prop + // the forward prop is from the feature to the current bn node + for (int iter = 0; iter < iters; iter++) + { + // during the bn stat, dataRead must be ensured + bool wasDataRead = DataReaderHelpers::GetMinibatchIntoNetwork(*dataReader, m_net, + nullptr, useDistributedMBReading, useParallelTrain, inputMatrices, actualMBSize, m_mpi); + + if (!wasDataRead) LogicError("DataRead Failure in batch normalization statistics"); + + ComputationNetwork::BumpEvalTimeStamp(featureNodes); + + // forward prop till reaching the current bn node + m_net->ForwardProp(node); + } + + // after finished statistics, the mean and variance of the bn node should be freezd. 
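+        // Freezing stops any further updates to the running mean/variance, so subsequent
+        // forward passes and the model saved at the end keep exactly the statistics
+        // estimated above.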
+ bnNode->FreezeParameters(); + + // Sync during or after all iters of a BN node are equivalent + if (useParallelTrain) + { + if (m_gradHeader == nullptr) + { + m_gradHeader.reset(DistGradHeader::Create(evalNodes.size()), [](DistGradHeader* ptr) + { + DistGradHeader::Destroy(ptr); + }); + } + + // push the statistics results of mean and variance of bn nodes into mpi updating vector + std::vector*> learnParamsValues(2, nullptr); + + SimpleDistGradAggregator distGradAgg(m_mpi, false /*useAsyncAggregation*/, 0 /*syncStatsTrace*/); + + auto runMeanParameterPtr = node->Input(3); + auto runStdParameterPtr = node->Input(4); + + shared_ptr> runMeanNode = static_pointer_cast>(runMeanParameterPtr); + shared_ptr> runStdNode = static_pointer_cast>(runStdParameterPtr); + + learnParamsValues[0] = &(runMeanNode->Value()); + learnParamsValues[1] = &(runStdNode->Value()); + + m_gradHeader->numSamples = actualMBSize ? 1 : actualMBSize; + distGradAgg.AggregateGradients(learnParamsValues, m_gradHeader.get(), 0); + + // get the average mean and variance across all the workers + for (auto& parameter : learnParamsValues) + { + (*parameter) /= (ElemType)m_mpi->NumNodesInUse(); + } + } + } + + dataReader->DataEnd(); + + // remove all the added BN nodes from evaluation group + for (auto& bnNode : bnNodes) + { + m_net->RemoveFromNodeGroup(L"evaluation", bnNode); + } + + // save model + if (!useParallelTrain || m_mpi->CurrentNodeRank() == m_mpi->MainNodeRank()) + m_net->Save(newModelPath); + + return; +} + +template class PostComputingActions; +template class PostComputingActions; + +}}} diff --git a/Source/SGDLib/PostComputingActions.h b/Source/SGDLib/PostComputingActions.h new file mode 100644 index 000000000..9dfc95bb6 --- /dev/null +++ b/Source/SGDLib/PostComputingActions.h @@ -0,0 +1,65 @@ +// +// Copyright (c) Microsoft. All rights reserved. +// Licensed under the MIT license. See LICENSE.md file in the project root for full license information. +// +// PostStat.h -- CNTK post statistics related actions +// + +#pragma once +#include "ComputationNode.h" +#include "ComputationNetwork.h" +#include "MPIWrapper.h" +#include "DataReader.h" +#include "IDistGradAggregator.h" +#include "DistGradHeader.h" + +using namespace std; + +namespace Microsoft { namespace MSR { namespace CNTK { + +template +class IDistGradAggregator; + +// Post statistics normally called between training and evaluating, to generate the statistics results used by evaluating +// For now, the application is only with statistics mean and variance of Batch Normalization nodes after training +template +class PostComputingActions +{ +public: + PostComputingActions(ComputationNetworkPtr net, const MPIWrapperPtr& mpi, bool enableDistributedMBReading = false, const int traceLevel = 0) : + m_net(net), + m_traceLevel(traceLevel), + m_mpi(mpi), + m_distGradAgg(nullptr), + m_gradHeader(nullptr), + m_enableDistributedMBReading(enableDistributedMBReading) + { + } + + // This function is used for evaluating the mean and variance of all batch normalization nodes after training. + // Details will link to the wiki https://github.com/Microsoft/CNTK/wiki/Post-Batch-Normalization-Statistics + // The reason why put it into evalute is the action take place after trainning and non-backprop processing, which makes me believe + // this function is like a kind of evaluate function. + // In this function, + // 1. since all other weights are fix except the un-pbn nodes, I set the networkoperationMode into inferring. + // 2. 
The next thing is to load the network model and data source, I follow the Evaluate function to do so, however, I delete something + // seem useless, like error statistics etc. + // 3. Finding the BN nodes in the network and put them into a vector with evaluate order (This links the nestedNode vector I got in + // ControlFlowNetwork) + // 4. From node to node in the BN vector to generate the mean and various (This links to the changes of BatchNormalizationNode + // in TrainingNodes.h, since I need to make the nodes "learn" mean and variance in inferring mode) + // 5. Consider the multi-GPU, we need to sync up the BN results between all the worker and average the value. + void BatchNormalizationStatistics(IDataReader* dataReader, const vector& evalNodeNames, const wstring newModelPath, + const size_t mbSize, const int iters = 30); + +private: + ComputationNetworkPtr m_net; + MPIWrapperPtr m_mpi; + bool m_enableDistributedMBReading; + + int m_traceLevel; + + std::shared_ptr> m_distGradAgg; + std::shared_ptr m_gradHeader; +}; +}}} diff --git a/Source/SGDLib/SGD.cpp b/Source/SGDLib/SGD.cpp index b5a590d8a..61cb07091 100644 --- a/Source/SGDLib/SGD.cpp +++ b/Source/SGDLib/SGD.cpp @@ -8,6 +8,7 @@ #include "SpecialPurposeNodes.h" // for SequenceWithSoftmaxNode #include "DataReaderHelpers.h" #include "MatrixQuantizerImpl.h" +#include "InputAndParamNodes.h" #ifdef CNTK_PARALLEL_TRAINING_SUPPORT //static inline bool operator==(const std::pair& a, double b) { assert(b==0); return a.first == b; } @@ -875,23 +876,13 @@ size_t SGD::TrainOneEpoch(ComputationNetworkPtr net, EpochCriterion epochCriterionLastLogged = epochCriterion; vector epochEvalErrorsLastLogged = epochEvalErrors; - // Now, we need to use a switch to enable/disable wk in BatchNormalization. - // If we can determine whether wk added or not for each node, then, discard this - std::unordered_set batchNormalizationWeights; - if (m_disableWkInBatchNormal) { - for (auto& evalNode : evaluationNodes) + // NOTE: For ResNet, the regularization in BatchNormalization should be disable. + if (m_disableRegInBatchNormalization) { + let bnNodes = net->GetNodesWithType(L"BatchNormalization"); + for (auto &node : bnNodes) { - shared_ptr nestedNetwork = static_pointer_cast(net->GetNestedNetwork(evalNode)); - for (auto& node : nestedNetwork->GetNestedNodes()) - { - shared_ptr> castNode = - dynamic_pointer_cast>(node); - if (castNode) - { - batchNormalizationWeights.insert(castNode->GetInputs()[1]); - batchNormalizationWeights.insert(castNode->GetInputs()[2]); - } - } + let bnNode = dynamic_pointer_cast>(node); + bnNode->DisableRegInBatchNormalization(); } } @@ -1110,11 +1101,10 @@ size_t SGD::TrainOneEpoch(ComputationNetworkPtr net, if (smoothedGradient.HasNan("TrainOneEpoch/UpdateWeights(): ")) LogicError("%ls %ls operation has NaNs in smoothedGradient.", node->NodeName().c_str(), node->OperationName().c_str()); #endif - double l2Factor = batchNormalizationWeights.find(node) == batchNormalizationWeights.end() ? 
1.0 : 0.0; // BUGBUG (Issue #95): Access to net MBLayout can no longer be done if we have multiple input layouts UpdateWeights(node, smoothedGradient, learnRatePerSample, GetMomentumPerSample(epochNumber /*BUGBUG workaround:*/, net->GetMBLayoutPtrOfNetwork()->GetNumParallelSequences()), numSamplesInMinibatch, - m_L2RegWeight * l2Factor, m_L1RegWeight, + m_L2RegWeight, m_L1RegWeight, m_needAveMultiplier, m_useNesterovMomentum); #ifdef _DEBUG if (dynamic_pointer_cast>(node)->Value().HasNan("TrainOneEpoch/UpdateWeights(): ")) @@ -2017,9 +2007,10 @@ void SGD::UpdateWeights(const ComputationNodeBasePtr& node, LogicError("UpdateWeights() called for a learnable ComputationNode which has m_learningRateMultiplier == 0!"); double nodeDependentLearningRatePerSample = learnRatePerSample * node->GetLearningRateMultiplier(); + double nodeDependentRegMultiplier = dynamic_pointer_cast>(node)->GetRegMultiplier(); UpdateWeightsS(this, dynamic_pointer_cast>(node)->Value(), dynamic_pointer_cast>(node)->Gradient(), smoothedGradient, nodeDependentLearningRatePerSample, momentumPerSample, - actualMBSize, L2RegWeight, L1RegWeight, + actualMBSize, L2RegWeight * nodeDependentRegMultiplier, L1RegWeight * nodeDependentRegMultiplier, needAveMultiplier, m_useNesterovMomentum); node->BumpEvalTimeStamp(); } @@ -2475,7 +2466,7 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType) m_seqGammarCalcbMMIFactor = configSGD(L"seqGammarBMMIFactor", 0.0); m_seqGammarCalcWP = configSGD(L"seqGammarWordPen", 0.0); - m_disableWkInBatchNormal = configSGD(L"disableWkInBatchNormal", false); + m_disableRegInBatchNormalization = configSGD(L"disableRegInBatchNormalization", false); m_dropoutRates = configSGD(L"dropoutRate", ConfigRecordType::Array(doubleargvector(vector{0.0}))); m_batchNormalizationTimeConstant = configSGD(L"batchNormalizationTimeConstant", ConfigRecordType::Array(doubleargvector(vector{0}))); diff --git a/Source/SGDLib/SGD.h b/Source/SGDLib/SGD.h index 9e6bc4474..064338ff8 100644 --- a/Source/SGDLib/SGD.h +++ b/Source/SGDLib/SGD.h @@ -291,7 +291,10 @@ protected: double m_seqGammarCalcbMMIFactor; bool m_seqGammarCalcUsesMBR; - bool m_disableWkInBatchNormal; + // decide whether should apply L2 regularization into BatchNormalizationNode + // true: disable L2 Regularization + // false: enable L2 Regularization (default) + bool m_disableRegInBatchNormalization; }; template diff --git a/Source/SGDLib/SGDLib.vcxproj b/Source/SGDLib/SGDLib.vcxproj index 03d11b4db..4b6e0dada 100644 --- a/Source/SGDLib/SGDLib.vcxproj +++ b/Source/SGDLib/SGDLib.vcxproj @@ -124,6 +124,7 @@ + @@ -132,10 +133,11 @@ + - + \ No newline at end of file diff --git a/Source/SGDLib/SGDLib.vcxproj.filters b/Source/SGDLib/SGDLib.vcxproj.filters index 5d0cb116e..fc4f156a2 100644 --- a/Source/SGDLib/SGDLib.vcxproj.filters +++ b/Source/SGDLib/SGDLib.vcxproj.filters @@ -1,32 +1,17 @@  - - Common - - - Common - - - Common - - - Common - Misc - - Common - GPU Interfacing SGD - - Common + + Stat @@ -144,6 +129,9 @@ SGD + + Stat + @@ -182,5 +170,8 @@ {b866d513-7bd0-497c-98c2-f62dbcd4cde4} + + {f406217f-5a11-44ca-bb34-52254dbee8af} + - + \ No newline at end of file diff --git a/Source/SGDLib/SimpleEvaluator.h b/Source/SGDLib/SimpleEvaluator.h index 813fb382a..a2dda2927 100644 --- a/Source/SGDLib/SimpleEvaluator.h +++ b/Source/SGDLib/SimpleEvaluator.h @@ -52,36 +52,7 @@ public: { ScopedNetworkOperationMode modeGuard(m_net, NetworkOperationMode::inferring); - // determine nodes to evaluate - std::vector evalNodes; - - set criteriaLogged; // 
(keeps track ot duplicates to avoid we don't double-log critera) - if (evalNodeNames.size() == 0) - { - fprintf(stderr, "evalNodeNames are not specified, using all the default evalnodes and training criterion nodes.\n"); - if (m_net->EvaluationNodes().empty() && m_net->FinalCriterionNodes().empty()) - InvalidArgument("There is no default evaluation node or training criterion specified in the network."); - - for (const auto& node : m_net->EvaluationNodes()) - if (criteriaLogged.insert(node).second) - evalNodes.push_back(node); - - for (const auto& node : m_net->FinalCriterionNodes()) - if (criteriaLogged.insert(node).second) - evalNodes.push_back(node); - } - else - { - for (int i = 0; i < evalNodeNames.size(); i++) - { - const auto& node = m_net->GetNodeFromName(evalNodeNames[i]); - if (!criteriaLogged.insert(node).second) - continue; - if (node->GetSampleLayout().GetNumElements() != 1) - InvalidArgument("Criterion nodes to evaluate must have dimension 1x1."); - evalNodes.push_back(node); - } - } + let evalNodes = m_net->GetEvalNodesWithName(evalNodeNames); // initialize eval results std::vector evalResults(evalNodes.size(), EpochCriterion(0)); @@ -112,7 +83,7 @@ public: if (useDistributedMBReading) dataReader->StartDistributedMinibatchLoop(mbSize, 0, m_mpi->CurrentNodeRank(), m_mpi->NumNodesInUse(), testSize); else - dataReader->StartMinibatchLoop(mbSize, 0, testSize); + dataReader->StartMinibatchLoop(mbSize, 0, testSize); m_net->StartEvaluateMinibatchLoop(evalNodes); @@ -257,153 +228,6 @@ public: return evalResults; } - void EvaluateBN(IDataReader* dataReader, const vector& evalNodeNames, const wstring exportPath, const size_t mbSize, const int iters = 240, const size_t testSize = requestDataSize) - { - ScopedNetworkOperationMode modeGuard(m_net, NetworkOperationMode::inferring); - - // determine nodes to evaluate - std::vector evalNodes; - - set criteriaLogged; // (keeps track ot duplicates to avoid we don't double-log critera) - if (evalNodeNames.size() == 0) - { - fprintf(stderr, "evalNodeNames are not specified, using all the default evalnodes and training criterion nodes.\n"); - if (m_net->EvaluationNodes().empty() && m_net->FinalCriterionNodes().empty()) - InvalidArgument("There is no default evaluation node or training criterion specified in the network."); - - for (const auto& node : m_net->EvaluationNodes()) - if (criteriaLogged.insert(node).second) - evalNodes.push_back(node); - - for (const auto& node : m_net->FinalCriterionNodes()) - if (criteriaLogged.insert(node).second) - evalNodes.push_back(node); - } - else - { - for (int i = 0; i < evalNodeNames.size(); i++) - { - const auto& node = m_net->GetNodeFromName(evalNodeNames[i]); - if (!criteriaLogged.insert(node).second) - continue; - if (node->GetSampleLayout().GetNumElements() != 1) - InvalidArgument("Criterion nodes to evaluate must have dimension 1x1."); - evalNodes.push_back(node); - } - } - - // allocate memory for forward computation - m_net->AllocateAllMatrices(evalNodes, {}, nullptr); - - // prepare features and labels - auto& featureNodes = m_net->FeatureNodes(); - auto& labelNodes = m_net->LabelNodes(); - - StreamMinibatchInputs inputMatrices; - for (auto& node : featureNodes) - inputMatrices.AddInput(node->NodeName(), node->ValuePtr(), node->GetMBLayout(), node->GetSampleLayout()); - for (auto& node : labelNodes) - inputMatrices.AddInput(node->NodeName(), node->ValuePtr(), node->GetMBLayout(), node->GetSampleLayout()); - - bool useParallelTrain = (m_mpi != nullptr); - bool useDistributedMBReading = 
useParallelTrain && m_enableDistributedMBReading && dataReader->SupportsDistributedMBRead(); - if (useDistributedMBReading) - dataReader->StartDistributedMinibatchLoop(mbSize, 0, m_mpi->CurrentNodeRank(), m_mpi->NumNodesInUse(), testSize); - else - dataReader->StartMinibatchLoop(mbSize, 0, testSize); - - m_net->StartEvaluateMinibatchLoop(evalNodes); - - // Passing in two empty node lists so the dispatcher can work for the evalNodes. - std::list learnableNodes; - std::vector criterionNodes; - - // First, all batch normalization nodes should be marked. - std::vector batchNormalNodes; - shared_ptr nestedNetwork = static_pointer_cast(m_net->GetNestedNetwork(evalNodes[0])); - for (auto& node : nestedNetwork->GetNestedNodes()) - { - shared_ptr> castNode = - dynamic_pointer_cast>(node); - if (castNode) - { - batchNormalNodes.push_back(node); - } - } - - // Push all batch normalization mean and std into learn params values for mpi update - std::vector*> learnParamsValues(2, nullptr); - - bool noMoreSamplesToProcess = false; - for (auto& node : batchNormalNodes) - { - shared_ptr> batchNode = - static_pointer_cast>(node); - batchNode->SetPostBatchNormalizationBegin(); - size_t actualMBSize = 0; - - LOGPRINTF(stderr, "Start evaluating: %ls\n", batchNode->GetName().c_str()); - - // Post batch normal iters - for (int iter = 0; iter < iters; iter++) - { - bool wasDataRead = DataReaderHelpers::GetMinibatchIntoNetwork(*dataReader, m_net, - nullptr, useDistributedMBReading, useParallelTrain, inputMatrices, actualMBSize, m_mpi); - - if (!wasDataRead && (!useDistributedMBReading || noMoreSamplesToProcess)) - break; - - // TODO should handle it, since post BN exist no samples in iters - if (!wasDataRead) - actualMBSize = 0; - - // Batch Normalization Evaluate don't need to support subMinibatches - ComputationNetwork::BumpEvalTimeStamp(featureNodes); - ComputationNetwork::BumpEvalTimeStamp(labelNodes); - - m_net->ForwardProp(evalNodes[0], nullptr, node); - dataReader->DataEnd(); - } - batchNode->SetPostBatchNormalizationEnd(); - - // Sync during or after all iters of a BN node are equivalent - if (useParallelTrain) - { - if (m_gradHeader == nullptr) - { - m_gradHeader.reset(DistGradHeader::Create(evalNodes.size()), [](DistGradHeader* ptr) - { - DistGradHeader::Destroy(ptr); - }); - } - SimpleDistGradAggregator distGradAgg(m_mpi, false /*useAsyncAggregation*/, 0 /*syncStatsTrace*/); - - auto runMeanParameterPtr = node->GetInputs()[3]; - auto runStdParameterPtr = node->GetInputs()[4]; - - shared_ptr> runMeanNode = static_pointer_cast>(runMeanParameterPtr); - shared_ptr> runStdNode = static_pointer_cast>(runStdParameterPtr); - - learnParamsValues[0] = &(runMeanNode->Value()); - learnParamsValues[1] = &(runStdNode->Value()); - - m_gradHeader->numSamples = actualMBSize ? 1 : actualMBSize; - distGradAgg.AggregateGradients(learnParamsValues, m_gradHeader.get(), 0); - - for (auto& parameter : learnParamsValues) - { - (*parameter) /= (ElemType)m_mpi->NumNodesInUse(); - } - } - } - - // Save Model - if (!useParallelTrain || m_mpi->CurrentNodeRank() == m_mpi->MainNodeRank()) - m_net->Save(exportPath); - - return; - } - protected: void DisplayEvalStatistics(const size_t startMBNum, const size_t endMBNum, const size_t numSamplesLastLogged, const vector& evalNodes,