From 4003c087a19669d1c75806aaf958991222fb1a71 Mon Sep 17 00:00:00 2001 From: rpengms <40006668+rpengms@users.noreply.github.com> Date: Wed, 20 Mar 2019 11:36:16 -0700 Subject: [PATCH] Merge fp16 brainscript work (#3606) * FP16 BrainScript - address code review comments * Remove Tab and fix debug build breaks * Fix Linux Build breaks * fp16 brain script - add _CRT_SECURE_NO_WARNINGS * fp16 brain script - fix NetworkTests * Update tests for model version change * Remove changes for InputAndParamNodes * Fix typo * Remove redundant code * Fix optional parameters --- Makefile | 1 + Source/1BitSGD/BlockMomentumSGD.h | 12 +- Source/1BitSGD/V2BlockMomentumSGD.h | 50 ++-- Source/ActionsLib/NDLNetworkBuilder.cpp | 117 ++++++++- Source/ActionsLib/NDLNetworkBuilder.h | 6 +- .../ActionsLib/NetworkDescriptionLanguage.cpp | 7 + .../ActionsLib/NetworkDescriptionLanguage.h | 1 + Source/ActionsLib/NetworkFactory.cpp | 3 + Source/ActionsLib/SimpleNetworkBuilder.cpp | 1 + Source/ActionsLib/SimpleNetworkBuilder.h | 10 +- Source/ActionsLib/TrainActions.cpp | 8 +- Source/CNTK/CNTK.cpp | 161 ++++++++----- Source/Common/Include/Config.h | 7 + .../ComputationNetwork.cpp | 28 +++ .../ComputationNetwork.h | 8 + .../ComputationNetworkBuilder.cpp | 63 ++++- .../ComputationNetworkBuilder.h | 19 +- .../ComputationNetworkLib/ComputationNode.h | 4 +- Source/EvalDll/CNTKEval.cpp | 3 + Source/SGDLib/ASGDHelper.cpp | 14 ++ Source/SGDLib/AccumulatorAggregation.h | 22 +- Source/SGDLib/DataReaderHelpers.h | 4 +- Source/SGDLib/MASGD.h | 12 +- Source/SGDLib/SGD.cpp | 228 +++++++++++++++--- Source/SGDLib/SGD.h | 36 ++- Source/SGDLib/SGDLib.vcxproj | 2 + Source/SGDLib/SGDLib.vcxproj.filters | 6 + .../SGDLib/SimpleDistGradAggregatorHelper.cpp | 82 +++++++ .../SGDLib/SimpleDistGradAggregatorHelper.h | 24 ++ Source/SGDLib/SimpleEvaluator.h | 9 +- Source/SGDLib/V2SimpleDistGradAggregator.h | 4 +- .../UnitTests/EvalTests/EvalExtendedTests.cpp | 2 +- .../NetworkTests/NetworkTests.vcxproj | 2 +- 33 files changed, 769 insertions(+), 187 deletions(-) create mode 100644 Source/SGDLib/SimpleDistGradAggregatorHelper.cpp create mode 100644 Source/SGDLib/SimpleDistGradAggregatorHelper.h diff --git a/Makefile b/Makefile index bbdee61b2..cb651d080 100644 --- a/Makefile +++ b/Makefile @@ -707,6 +707,7 @@ SGDLIB_SRC=\ $(SOURCEDIR)/SGDLib/Profiler.cpp \ $(SOURCEDIR)/SGDLib/SGD.cpp \ $(SOURCEDIR)/SGDLib/PostComputingActions.cpp \ + $(SOURCEDIR)/SGDLib/SimpleDistGradAggregatorHelper.cpp \ SGDLIB_SRC+=$(CNTKLIBRARY_COMMON_SRC) diff --git a/Source/1BitSGD/BlockMomentumSGD.h b/Source/1BitSGD/BlockMomentumSGD.h index b96476973..ca2d1c388 100644 --- a/Source/1BitSGD/BlockMomentumSGD.h +++ b/Source/1BitSGD/BlockMomentumSGD.h @@ -94,15 +94,15 @@ namespace Microsoft { namespace MSR { namespace CNTK { ); } /*virtual*/ void OnEpochEnd(const std::list& LearnableNodes, - std::list>& smoothedGradient, + std::list& smoothedGradients, size_t samplesSinceLastSync) override { - Base::OnEpochEnd(LearnableNodes, smoothedGradient, samplesSinceLastSync); + Base::OnEpochEnd(LearnableNodes, smoothedGradients, samplesSinceLastSync); } /*virtual*/ void ModelAggregationProcessing( size_t samplesSinceLastSync, const std::list& learnableNodes, - std::list>& smoothedGradient, + std::list& smoothedGradients, size_t& totalSamplesProcessed, float& secondsOnCommunication ) override @@ -181,9 +181,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { //---------------------------------------- if (m_resetSGDMomentumAfterAggregation) { - for (Matrix& x : smoothedGradient) + for 
(auto sg : smoothedGradients)
             {
-                x.SetValue((ElemType)0);
+                auto x = dynamic_pointer_cast<Matrix<ElemType>>(sg);
+                if (x != nullptr)
+                    x->SetValue((ElemType)0);
             }
         }
     }
diff --git a/Source/1BitSGD/V2BlockMomentumSGD.h b/Source/1BitSGD/V2BlockMomentumSGD.h
index e9ededc92..805f238d3 100644
--- a/Source/1BitSGD/V2BlockMomentumSGD.h
+++ b/Source/1BitSGD/V2BlockMomentumSGD.h
@@ -108,7 +108,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     bool OnArrivingAtSyncPoint(
         const std::list<ComputationNodeBasePtr>& learnableNodes, /* input/output: */
-        std::list<Matrix<ElemType>>& smoothedGradient,  /* input/output: under some setup, it will reset to zero*/
+        std::list<MatrixBasePtr>& smoothedGradients,    /* input/output: under some setup, it will reset to zero*/
         size_t samplesSinceLastSync                     /* input: samples processed since last sync on this worker only */
         ) override
     {
@@ -130,12 +130,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         // Otherwise, let's update the weights.
         float secondsOnCommunication = 0.0f;
         size_t totalSamples = 0;
-        ModelAggregationProcessing(samplesSinceLastSync, learnableNodes, smoothedGradient, totalSamples, secondsOnCommunication);
+        ModelAggregationProcessing(samplesSinceLastSync, learnableNodes, smoothedGradients, totalSamples, secondsOnCommunication);
         return true;
     }

     /*virtual*/ void OnEpochEnd(const std::list<ComputationNodeBasePtr>& learnableNodes,
-                                std::list<Matrix<ElemType>>& smoothedGradient,
+                                std::list<MatrixBasePtr>& smoothedGradients,
                                 size_t samplesSinceLastSync) override
     {
         if (!m_someWorkerHasFinished)
@@ -152,13 +152,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         // Let's update our weights no matter what.
         float secondsOnCommunication = 0.0f;
         size_t totalSamples = 0;
-        ModelAggregationProcessing(samplesSinceLastSync, learnableNodes, smoothedGradient, totalSamples, secondsOnCommunication);
+        ModelAggregationProcessing(samplesSinceLastSync, learnableNodes, smoothedGradients, totalSamples, secondsOnCommunication);
     }

     /*virtual*/ void ModelAggregationProcessing(
         size_t /*samplesSinceLastSync*/,
         const std::list<ComputationNodeBasePtr>& learnableNodes,
-        std::list<Matrix<ElemType>>& smoothedGradient,
+        std::list<MatrixBasePtr>& smoothedGradients,
         size_t& /*totalSamplesProcessed*/, /* out */
         float& secondsOnCommunication      /* out */
         ) override
@@ -196,8 +196,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         m_communicator->AggregateInPlace(aggregatedWeightsPrepared, m_communicator->Workers());

         // 2. Let's update the model
-        for (auto& pBaseNode : learnableNodes)
+        auto smoothedGradientIter = smoothedGradients.begin();
+        for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++, smoothedGradientIter++)
         {
+            ComputationNodeBasePtr pBaseNode = *nodeIter;
             if (!pBaseNode->IsParameterUpdateRequired())
                 continue;
@@ -235,15 +237,35 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                 // 2.2.4 update bookkeeping
                 prevWeight.SetValue(currentWeight);
             }
-        }
-        //----------------------------------------
-        // 3. reset SGD momentum if necessary
-        //----------------------------------------
-        if (m_resetSGDMomentumAfterAggregation)
-        {
-            for (Matrix<ElemType>& x : smoothedGradient)
-            {
-                x.SetValue((ElemType)0);
-            }
-        }
+
+            //----------------------------------------
+            // 3. reset SGD momentum if necessary
+            //----------------------------------------
+            {
+                // For half, we keep a copy of float weights; update that too
+                if (std::is_same<ElemType, half>())
+                {
+                    auto compoundMatrixPtr = dynamic_pointer_cast<Matrix<float>>(*smoothedGradientIter);
+                    size_t numCols = currentWeight.GetNumCols();
+
+                    auto parameterMatrix = compoundMatrixPtr->ColumnSlice(2 * numCols, numCols);
+                    parameterMatrix.CastAssignValuesOf(currentWeight);
+
+                    if (m_resetSGDMomentumAfterAggregation)
+                    {
+                        // Only reset smoothed gradients
+                        auto smoothedGradientMatrix = compoundMatrixPtr->ColumnSlice(0, numCols);
+                        smoothedGradientMatrix.SetValue(0.0f);
+                    }
+                }
+                else
+                {
+                    if (m_resetSGDMomentumAfterAggregation)
+                    {
+                        auto x = dynamic_pointer_cast<Matrix<ElemType>>(*smoothedGradientIter);
+                        x->SetValue((ElemType)0);
+                    }
+                }
+            }
         }
     }
diff --git a/Source/ActionsLib/NDLNetworkBuilder.cpp b/Source/ActionsLib/NDLNetworkBuilder.cpp
index dbcf036a2..734ece489 100644
--- a/Source/ActionsLib/NDLNetworkBuilder.cpp
+++ b/Source/ActionsLib/NDLNetworkBuilder.cpp
@@ -22,6 +22,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {

 using namespace std;

+template <typename ElemType>
+static inline bool isprecision(std::wstring& str)
+{
+    if ((str == L"") && std::is_same<ElemType, float>())
+        return true;
+    if (std::is_same<ElemType, half>())
+        return EqualCI(str, L"float16");
+    else if (std::is_same<ElemType, float>())
+        return EqualCI(str, L"float");
+    else if (std::is_same<ElemType, double>())
+        return EqualCI(str, L"double");
+    return false;
+}
+
 template <class ElemType>
 void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wstring& baseName, const NDLPass pass)
 {
@@ -48,7 +62,7 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
     std::wstring cnNodeType = Microsoft::MSR::CNTK::ToFixedWStringFromMultiByte(node->GetValue());

-    ComputationNodePtr nodePtr;
+    ComputationNodeBasePtr nodePtr;

     // get the node pointer for the node, should be stored in the EvalValue;
     if (pass > ndlPassInitial)
     {
         nodePtr = ComputationNode<ElemType>::FromVoidPtr(node->GetEvalValue());
         if (!nodePtr)
         {
-            nodePtr = dynamic_pointer_cast<ComputationNode<ElemType>>(m_net->GetNodeFromName(name));
+            nodePtr = m_net->GetNodeFromName(name);
             node->SetEvalValue(nodePtr.get());
         }
     }
@@ -75,15 +89,48 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
     auto tensorShape = ProcessTensorShapeParameters(node, params, i, /*isImage=*/false, cnNodeType);
     wstring dynamicAxis = node->GetOptionalParameter("dynamicAxis", "");
+    wstring precision = node->GetOptionalParameter("precision", "");
+
     // TODO: Map dynamicAxis from name to node at this point, where that node is memoized inside NDL.

     // first look for this node already existing in the network
     // BUGBUG: How does this set the dimensions then?
if (m_net->NodeNameExists(name)) - nodePtr = dynamic_pointer_cast>(m_net->GetNodeFromName(name)); - else if (isSparse) - nodePtr = builder.CreateSparseInputNode(name, tensorShape, dynamicAxis); + nodePtr = m_net->GetNodeFromName(name); else - nodePtr = builder.CreateInputNode(name, tensorShape, dynamicAxis); + { + if (precision == L"") + { + if (isSparse) + nodePtr = builder.CreateSparseInputNode(name, tensorShape, dynamicAxis); + else + nodePtr = builder.CreateInputNode(name, tensorShape, dynamicAxis); + } + else if (EqualCI(precision, L"float")) + { + if (isSparse) + nodePtr = builder.template TypedCreateSparseInputNode(name, tensorShape, dynamicAxis); + else + nodePtr = builder.template TypedCreateInputNode(name, tensorShape, dynamicAxis); + } + else if (EqualCI(precision, L"double")) + { + if (isSparse) + nodePtr = builder.template TypedCreateSparseInputNode(name, tensorShape, dynamicAxis); + else + nodePtr = builder.template TypedCreateInputNode(name, tensorShape, dynamicAxis); + } + else if (EqualCI(precision, L"float16")) + { + if (isSparse) + nodePtr = builder.template TypedCreateSparseInputNode(name, tensorShape, dynamicAxis); + else + nodePtr = builder.template TypedCreateInputNode(name, tensorShape, dynamicAxis); + } + else + { + RuntimeError("NDLNetworkBuilder: Input: the 'precision' parameter if specified, must be 'float', 'double' or 'float16'."); + } + } } } else if (cnNodeType == L"ImageInput" || cnNodeType == L"SparseImageInput") @@ -193,7 +240,7 @@ void NDLNodeEvaluatorImpl::Evaluate(NDLNode* node, const wst nodePtr = builder.CreateLearnableParameter(name, rows, cols); nodePtr->SetLearningRateMultiplier(0); } - else if (pass == ndlPassFinal || nodePtr->Value().GetNumElements() != 0) + else if (pass == ndlPassFinal || (dynamic_pointer_cast> (nodePtr))->Value().GetNumElements() != 0) { ElemType val = parameter[0]->GetScalar(); m_net->InitLearnableParameters(nodePtr, L"fixedValue", val); @@ -607,6 +654,56 @@ void NDLNodeEvaluatorImpl::Evaluate(NDLNode* node, const wst nodeParamCount = nodePtr->GetNumInputs(); } } + else if (cnNodeType == OperationName2Of(CastNode)) + { + if (parameter.size() < 1) + RuntimeError("%ls should have 1 or more parameters (node and cast precision).", cnNodeType.c_str()); + + // setup the parameter position of children so we can hook them up later + nodeParamCount = 1; + nodeParamStart = 0; + + if (pass == ndlPassInitial) + { + // evaluate only scalar parameters + vector params = EvaluateParameters(node, baseName, 0, parameter.size(), pass); + auto sourceNode = (NDLNode*) params[0]; + wstring sourcePrecision = sourceNode->GetOptionalParameter("precision", ""); + wstring targetPrecision = node->GetOptionalParameter("precision", ""); + if (EqualCI(targetPrecision, L"float16")) + { + ComputationNetworkBuilder builder2(*m_net); + if (isprecision(sourcePrecision)) + nodePtr = builder2.CreateCastNode(name); + else if (isprecision(sourcePrecision)) + nodePtr = builder2.CreateCastNode(name); + else + RuntimeError("NDLNetworkBuilder: for CastNode to cast to half, input must be 'float' or 'double'"); + } + else if (EqualCI(targetPrecision, L"float")) + { + ComputationNetworkBuilder builder2(*m_net); + if (isprecision(sourcePrecision)) + nodePtr = builder2.CreateCastNode(name); + else if (isprecision(sourcePrecision)) + nodePtr = builder2.CreateCastNode(name); + else + RuntimeError("NDLNetworkBuilder: for CastNode to cast to float, input must be 'float16' or 'double'"); + } + else if (EqualCI(targetPrecision, L"double")) + { + ComputationNetworkBuilder 
builder2(*m_net); + if (isprecision(sourcePrecision)) + nodePtr = builder2.CreateCastNode(name); + else if (isprecision(sourcePrecision)) + nodePtr = builder2.CreateCastNode(name); + else + RuntimeError("NDLNetworkBuilder: for CastNode to cast to double, input must be 'float' or 'float16'"); + } + else + RuntimeError("NDLNetworkBuilder: CastNode - need to specify 'precision' parameter: 'float', 'double' or 'float16'."); + } + } else { @@ -645,7 +742,10 @@ void NDLNodeEvaluatorImpl::Evaluate(NDLNode* node, const wst #if 1 vector inputNodes; for (let& in : inputs) - inputNodes.push_back(ComputationNode::FromVoidPtr(in)); + { + ComputationNodeBase *p = (ComputationNodeBase *)in; + inputNodes.push_back(p ? p->shared_from_this() : nullptr); + } nodePtr->AttachInputs(inputNodes); #else // TODO: delete this @@ -714,6 +814,7 @@ TensorShape NDLNodeEvaluatorImpl::ProcessTensorShapeParameters(const N return TensorShape(dims); } +template class NDLBuilderImpl; template class NDLBuilderImpl; template class NDLBuilderImpl; diff --git a/Source/ActionsLib/NDLNetworkBuilder.h b/Source/ActionsLib/NDLNetworkBuilder.h index 1f40ef1fd..53b2727af 100644 --- a/Source/ActionsLib/NDLNetworkBuilder.h +++ b/Source/ActionsLib/NDLNetworkBuilder.h @@ -269,10 +269,11 @@ public: } // ProcessOptionalParameters - Process the optional parameters of a node - virtual void ProcessOptionalParameters(NDLNode* node) + virtual void ProcessOptionalParameters(NDLNode* node) override { vector*> params = node->GetParameters(true); // get all the optional parameters only - auto compNode = ComputationNode::FromVoidPtr(node->GetEvalValue()); + ComputationNodeBase* compNodePtr = (ComputationNodeBase *) (node->GetEvalValue()); + ComputationNodeBasePtr compNode = compNodePtr ? compNodePtr->shared_from_this() : nullptr; std::string empty; // loop through all the optional parameters processing them as necessary @@ -582,6 +583,7 @@ private: DEVICEID_TYPE m_deviceId; }; +template class NDLBuilder; template class NDLBuilder; template class NDLBuilder; diff --git a/Source/ActionsLib/NetworkDescriptionLanguage.cpp b/Source/ActionsLib/NetworkDescriptionLanguage.cpp index 9f9976a72..401a76985 100644 --- a/Source/ActionsLib/NetworkDescriptionLanguage.cpp +++ b/Source/ActionsLib/NetworkDescriptionLanguage.cpp @@ -160,6 +160,7 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable) else if (EqualInsensitive(nodeType, OperationNameOf(AtanhNode))) ret = true; else if (EqualInsensitive(nodeType, OperationNameOf(AveragePoolingNode))) ret = true; else if (EqualInsensitive(nodeType, OperationNameOf(BatchNormalizationNode))) ret = true; + else if (EqualInsensitive(nodeType, OperationName2Of(CastNode))) ret = true; #ifdef COMING_SOON else if (EqualInsensitive(nodeType, OperationNameOf(CRFNode), L"CRF")) ret = true; #endif @@ -267,18 +268,24 @@ NDLScript NDLScript::s_global("global"); // declare the static variables from the classes template <> +NDLScript NDLScript::s_global{}; +template <> NDLScript NDLScript::s_global{}; template <> NDLScript NDLScript::s_global{}; +template <> +int NDLNode::s_nameCounter = 0; template <> int NDLNode::s_nameCounter = 0; template <> int NDLNode::s_nameCounter = 0; +template class NDLNode; template class NDLNode; template class NDLNode; +template class NDLScript; template class NDLScript; template class NDLScript; diff --git a/Source/ActionsLib/NetworkDescriptionLanguage.h b/Source/ActionsLib/NetworkDescriptionLanguage.h index ce44c96db..322f45082 100644 --- 
a/Source/ActionsLib/NetworkDescriptionLanguage.h +++ b/Source/ActionsLib/NetworkDescriptionLanguage.h @@ -98,6 +98,7 @@ public: } }; +template class NDLNodeEvaluator; template class NDLNodeEvaluator; template class NDLNodeEvaluator; diff --git a/Source/ActionsLib/NetworkFactory.cpp b/Source/ActionsLib/NetworkFactory.cpp index 21c9b9346..281e311bd 100644 --- a/Source/ActionsLib/NetworkFactory.cpp +++ b/Source/ActionsLib/NetworkFactory.cpp @@ -188,9 +188,12 @@ ComputationNetworkPtr GetModelFromConfig(const ConfigRecordType& config, const w return net; } +template function GetNetworkFactory(const ScriptableObjects::IConfigRecord& config); template function GetNetworkFactory(const ScriptableObjects::IConfigRecord& config); template function GetNetworkFactory(const ScriptableObjects::IConfigRecord& config); +template function GetNetworkFactory(const ConfigParameters& config); template function GetNetworkFactory(const ConfigParameters& config); template function GetNetworkFactory(const ConfigParameters& config); +template ComputationNetworkPtr GetModelFromConfig (const ConfigParameters& config, const wstring&, vector& outputNodeNamesVector); template ComputationNetworkPtr GetModelFromConfig (const ConfigParameters& config, const wstring&, vector& outputNodeNamesVector); template ComputationNetworkPtr GetModelFromConfig(const ConfigParameters& config, const wstring&, vector& outputNodeNamesVector); diff --git a/Source/ActionsLib/SimpleNetworkBuilder.cpp b/Source/ActionsLib/SimpleNetworkBuilder.cpp index ecc793744..dc19b0056 100644 --- a/Source/ActionsLib/SimpleNetworkBuilder.cpp +++ b/Source/ActionsLib/SimpleNetworkBuilder.cpp @@ -1775,6 +1775,7 @@ shared_ptr> SimpleNetworkBuilder::AddTrainAn return output; } +template class SimpleNetworkBuilder; template class SimpleNetworkBuilder; template class SimpleNetworkBuilder; diff --git a/Source/ActionsLib/SimpleNetworkBuilder.h b/Source/ActionsLib/SimpleNetworkBuilder.h index ff823597f..b3fc39f72 100644 --- a/Source/ActionsLib/SimpleNetworkBuilder.h +++ b/Source/ActionsLib/SimpleNetworkBuilder.h @@ -159,9 +159,13 @@ public: m_constInputGateValue = config("constInputGateValue", "false"); m_constOutputGateValue = config("constOutputGateValue", "false"); - m_forgetGateInitVal = config("forgetGateInitVal", "-1"); - m_inputGateInitVal = config("inputGateInitVal", "-1"); - m_outputGateInitVal = config("outputGateInitVal", "-1"); + ElemType forgetGateInitVal = config("forgetGateInitVal", "-1"); + ElemType inputGateInitVal = config("inputGateInitVal", "-1"); + ElemType outputGateInitVal = config("outputGateInitVal", "-1"); + + m_forgetGateInitVal = forgetGateInitVal; + m_inputGateInitVal = inputGateInitVal; + m_outputGateInitVal = outputGateInitVal; m_sparse_input = config("sparseinput", "false"); diff --git a/Source/ActionsLib/TrainActions.cpp b/Source/ActionsLib/TrainActions.cpp index e3b75654e..a3b178d3f 100644 --- a/Source/ActionsLib/TrainActions.cpp +++ b/Source/ActionsLib/TrainActions.cpp @@ -142,12 +142,14 @@ shared_ptr MakeRuntimeObject(const IConfigRecordPtr configp { const IConfigRecord& config = *configp; wstring precision = config[L"precision"]; // dispatch on ElemType - if (precision == L"float") + if (precision == L"float16") + DoTrain(config); + else if (precision == L"float") DoTrain(config); else if (precision == L"double") DoTrain(config); else - RuntimeError("invalid value '%ls' for 'precision', must be 'float' or 'double'", precision.c_str()); + RuntimeError("invalid value '%ls' for 'precision', must be 'float16' or 'float' or 'double'", 
precision.c_str()); return make_shared(); // return a dummy object } @@ -156,8 +158,10 @@ shared_ptr MakeRuntimeObject(const IConfigRecordPtr configp ScriptableObjects::ConfigurableRuntimeTypeRegister::Add registerTrainAction(L"TrainAction"); }}} +template void DoTrain(const ScriptableObjects::IConfigRecord& config); template void DoTrain(const ScriptableObjects::IConfigRecord& config); template void DoTrain(const ScriptableObjects::IConfigRecord& config); +template void DoTrain(const ConfigParameters& config); template void DoTrain(const ConfigParameters& config); template void DoTrain(const ConfigParameters& config); diff --git a/Source/CNTK/CNTK.cpp b/Source/CNTK/CNTK.cpp index 892f667a5..d58f36be2 100644 --- a/Source/CNTK/CNTK.cpp +++ b/Source/CNTK/CNTK.cpp @@ -171,6 +171,91 @@ static void DisableLegacyUsage(const ConfigParameters& TopLevelConfig, const Con // be run in parallel across multiple ranks. Others should only run on rank 0 const std::set commandstoRunOnAllRanks = { "train", "trainRNN", "adapt", "test", "eval", "cv", "devtest", "bnstat" }; + +template +bool DispatchThisAction(const string &thisAction, const ConfigParameters &commandParams, const ConfigParameters& config) +{ + if (thisAction == "train" || thisAction == "trainRNN") + { + DoTrain(commandParams); + } + else if (thisAction == "bnstat") + { + DoBatchNormalizationStat(commandParams); + } + else if (thisAction == "adapt") + { + DoAdapt(commandParams); + } + else if (thisAction == "test" || thisAction == "eval") + { + DoEval(commandParams); + } + else if (thisAction == "edit") + { + DoEdit(commandParams); + } + else if (thisAction == "cv") + { + DoCrossValidate(commandParams); + } + else if (thisAction == "write") + { + DoWriteOutput(commandParams); + } + else if (thisAction == "devtest") + { + TestCn(config); // for "devtest" action pass the root config instead + } + else if (thisAction == "dumpNodes" /*deprecated:*/ || thisAction == "dumpNode" || thisAction == "dumpnode") + { + DoDumpNodes(commandParams); + } + else if (thisAction == "convertdbn") + { + DoConvertFromDbn(commandParams); + } + else if (thisAction == "exportdbn") + { + DoExportToDbn(commandParams); + } + else if (thisAction == "createLabelMap") + { + DoCreateLabelMap(commandParams); + } + else if (thisAction == "writeWordAndClass") + { + DoWriteWordAndClassInfo(commandParams); + } + else if (thisAction == "plot") + { + DoTopologyPlot(commandParams); + } + else if (thisAction == "SVD") + { + DoParameterSVD(commandParams); + } + else + { + return false; + } + return true; +} + +template <> +bool DispatchThisAction(const string &thisAction, const ConfigParameters &commandParams, const ConfigParameters& ) +{ + if (thisAction == "train" || thisAction == "trainRNN") + { + DoTrain(commandParams); + } + else + { + RuntimeError("half only supported for action train or trainRNN!"); + } + return true; +} + // process the command template void DoCommands(const ConfigParameters& config, const shared_ptr& mpi) @@ -270,73 +355,21 @@ void DoCommands(const ConfigParameters& config, const shared_ptr& mp { LOGPRINTF(stderr, "CNTKCommandTrainBegin: %s\n", command[i].c_str()); } - DoTrain(commandParams); + } + + if (!DispatchThisAction(thisAction, commandParams, config)) + { + RuntimeError("unknown action: %s in command set: %s", thisAction.c_str(), command[i].c_str()); + } + + if (thisAction == "train" || thisAction == "trainRNN") + { if (progressTracing) { LOGPRINTF(stderr, "CNTKCommandTrainEnd: %s\n", command[i].c_str()); } fullEpochsOffset += 
GetMaxEpochs(commandParams); } - else if (thisAction == "bnstat") - { - DoBatchNormalizationStat(commandParams); - } - else if (thisAction == "adapt") - { - DoAdapt(commandParams); - } - else if (thisAction == "test" || thisAction == "eval") - { - DoEval(commandParams); - } - else if (thisAction == "edit") - { - DoEdit(commandParams); - } - else if (thisAction == "cv") - { - DoCrossValidate(commandParams); - } - else if (thisAction == "write") - { - DoWriteOutput(commandParams); - } - else if (thisAction == "devtest") - { - TestCn(config); // for "devtest" action pass the root config instead - } - else if (thisAction == "dumpNodes" /*deprecated:*/ || thisAction == "dumpNode" || thisAction == "dumpnode") - { - DoDumpNodes(commandParams); - } - else if (thisAction == "convertdbn") - { - DoConvertFromDbn(commandParams); - } - else if (thisAction == "exportdbn") - { - DoExportToDbn(commandParams); - } - else if (thisAction == "createLabelMap") - { - DoCreateLabelMap(commandParams); - } - else if (thisAction == "writeWordAndClass") - { - DoWriteWordAndClassInfo(commandParams); - } - else if (thisAction == "plot") - { - DoTopologyPlot(commandParams); - } - else if (thisAction == "SVD") - { - DoParameterSVD(commandParams); - } - else - { - RuntimeError("unknown action: %s in command set: %s", thisAction.c_str(), command[i].c_str()); - } } fprintf(stderr, "\n"); @@ -740,12 +773,14 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) LOGPRINTF(stderr, "precision = \"%s\"\n", type.c_str()); } - if (type == "float") + if (type == "float16") + DoCommands(config, mpi); + else if (type == "float") DoCommands(config, mpi); else if (type == "double") DoCommands(config, mpi); else - RuntimeError("CNTK: Invalid precision string: \"%s\", must be \"float\" or \"double\"", type.c_str()); + RuntimeError("CNTK: Invalid precision string: \"%s\", must be \"float16\" or \"float\" or \"double\"", type.c_str()); // if completed then write a doneFile if requested if (!doneFile.empty()) diff --git a/Source/Common/Include/Config.h b/Source/Common/Include/Config.h index 0adf2e357..e9f1c09c1 100644 --- a/Source/Common/Include/Config.h +++ b/Source/Common/Include/Config.h @@ -8,6 +8,8 @@ #include #include #include +#include "File.h" +#include "half.hpp" using namespace std; @@ -150,6 +152,11 @@ public: return (float) (double) *this; } + operator half() const + { + return (half)(double)*this; + } + private: long tolong() const { diff --git a/Source/ComputationNetworkLib/ComputationNetwork.cpp b/Source/ComputationNetworkLib/ComputationNetwork.cpp index 6801f5b45..ecdaeb243 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.cpp +++ b/Source/ComputationNetworkLib/ComputationNetwork.cpp @@ -150,6 +150,20 @@ void ComputationNetwork::SaveToFileImpl(const wstring& fileName, const FileOptio else if (nodePtr->Is>()) precision = ElemTypeName(); else LogicError("Unexpected node type."); +#if CURRENT_CNTK_MODEL_VERSION >= CNTK_MODEL_VERSION_31 + if (nodePtr->Is>()) + precision = ElemTypeName2(); + else if (nodePtr->Is>()) + precision = ElemTypeName2(); + else if (nodePtr->Is>()) + precision = ElemTypeName2(); + else if (nodePtr->Is>()) + precision = ElemTypeName2(); + else if (nodePtr->Is>()) + precision = ElemTypeName2(); + else if (nodePtr->Is>()) + precision = ElemTypeName2(); +#endif fstream << precision; #endif fstream << nodePtr->OperationName(); @@ -265,6 +279,20 @@ void ComputationNetwork::ReadPersistableParameters(size_t modelVersion, File& fs node = ComputationNetworkBuilder::NewNode(opName, m_deviceId, nodeName); 
else if (precision == L"") // old file format: default to node = ComputationNetworkBuilder::NewNode(opName, m_deviceId, nodeName); +#if CURRENT_CNTK_MODEL_VERSION >= CNTK_MODEL_VERSION_31 + else if (precision == L"half,float") + node = ComputationNetworkBuilder::NewNode2(opName, m_deviceId, nodeName); + else if (precision == L"half,double") + node = ComputationNetworkBuilder::NewNode2(opName, m_deviceId, nodeName); + else if (precision == L"float,half") + node = ComputationNetworkBuilder::NewNode2(opName, m_deviceId, nodeName); + else if (precision == L"float,double") + node = ComputationNetworkBuilder::NewNode2(opName, m_deviceId, nodeName); + else if (precision == L"double,half") + node = ComputationNetworkBuilder::NewNode2(opName, m_deviceId, nodeName); + else if (precision == L"double,float") + node = ComputationNetworkBuilder::NewNode2(opName, m_deviceId, nodeName); +#endif else RuntimeError("Read: Unexpected precision tag '%ls'", precision.c_str()); diff --git a/Source/ComputationNetworkLib/ComputationNetwork.h b/Source/ComputationNetworkLib/ComputationNetwork.h index 85d6922da..64975e585 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.h +++ b/Source/ComputationNetworkLib/ComputationNetwork.h @@ -1313,6 +1313,14 @@ template <> /*static*/ inline const wchar_t* ElemTypeName() { return L"f template <> /*static*/ inline const wchar_t* ElemTypeName() { return L"double"; } template <> /*static*/ inline const wchar_t* ElemTypeName() { return L"half"; } +template static inline const wchar_t* ElemTypeName2(); +template <> /*static*/ inline const wchar_t* ElemTypeName2() { return L"float,half"; } +template <> /*static*/ inline const wchar_t* ElemTypeName2() { return L"float,double"; } +template <> /*static*/ inline const wchar_t* ElemTypeName2() { return L"double,half"; } +template <> /*static*/ inline const wchar_t* ElemTypeName2() { return L"double,float"; } +template <> /*static*/ inline const wchar_t* ElemTypeName2() { return L"half,float"; } +template <> /*static*/ inline const wchar_t* ElemTypeName2() { return L"half,double"; } + // The following emits the class and enables the BaseMatrix to be available (used by EvalDll) // The corresponding Matrix is emitted in the SetDeviceId function above. template class Matrix; diff --git a/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp b/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp index 113cdb22b..2d79ddfea 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkBuilder.cpp @@ -175,6 +175,13 @@ static shared_ptr> CreateNode(const std::wstring& node else return CreateStandardNode(nodeType, forward<_Types>(_Args)...); } +template +static shared_ptr> CreateNode2(const std::wstring& nodeType, _Types&&... 
_Args) +{ + // check more types + if (nodeType == OperationName2Of(CastNode)) return New>(forward<_Types>(_Args)...); + else RuntimeError("CreateNode2: unsupported nodeType - %S", nodeType.c_str()); +} // this function is called from SimpleNetworkBuilder and old NDL template /*static*/ shared_ptr> ComputationNetworkBuilder::NewStandardNode(const std::wstring& nodeType, DEVICEID_TYPE deviceId, const wstring& name) @@ -189,6 +196,13 @@ template return CreateNode(nodeType, deviceId, name); } +template +template +/*static*/ shared_ptr> ComputationNetworkBuilder::NewNode2(const std::wstring& nodeType, DEVICEID_TYPE deviceId, const wstring& name) +{ + return CreateNode2(nodeType, deviceId, name); +} + shared_ptr NewComputationNodeFromConfig(const Microsoft::MSR::ScriptableObjects::IConfigRecordPtr configp) { wstring precision = configp->Get(L"precision"); // dispatch on ElemType @@ -247,15 +261,17 @@ shared_ptr> ComputationNetworkBuilder::Creat } template -shared_ptr> ComputationNetworkBuilder::CreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName) +template +shared_ptr> ComputationNetworkBuilder::TypedCreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName) { - return net.AddNodeToNetWithElemType(New>(net.GetDeviceId(), inputName, sampleLayout, dynamicAxisName)); + return net.AddNodeToNetWithElemType(New>(net.GetDeviceId(), inputName, sampleLayout, dynamicAxisName)); } template -shared_ptr> ComputationNetworkBuilder::CreateSparseInputNode(const std::wstring& inputName, const TensorShape& imageLayout, const wstring& dynamicAxisName) +template +shared_ptr> ComputationNetworkBuilder::TypedCreateSparseInputNode(const std::wstring& inputName, const TensorShape& imageLayout, const wstring& dynamicAxisName) { - return net.AddNodeToNetWithElemType(New>(net.GetDeviceId(), inputName, imageLayout, dynamicAxisName)); + return net.AddNodeToNetWithElemType(New>(net.GetDeviceId(), inputName, imageLayout, dynamicAxisName)); } template @@ -318,6 +334,12 @@ shared_ptr> ComputationNetworkBuilder::Creat { return net.AddNodeToNetWithElemType(New>(net.GetDeviceId(), nodeName)); } +template +template +shared_ptr> ComputationNetworkBuilder::CreateCastNode(const std::wstring& nodeName) +{ + return net.AddNodeToNetWithElemType(New>(net.GetDeviceId(), nodeName)); +} // this is the catch-all for all cases not covered as special cases above // Unlike the specialized ones above, this one creates nodes by type given as a string. 
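Note: the NewNode2/CreateNode2 path above mirrors the existing single-precision factory, but dispatches on an (ElemType, ElemType2) pair so that a node parameterized on two element types, such as the new CastNode, can be constructed by name. A minimal, self-contained sketch of that dispatch shape, assuming hypothetical names (FooNode and CreateNode2Sketch are illustrative, not CNTK's actual classes):

#include <memory>
#include <stdexcept>
#include <string>

template <class From, class To>
struct FooNode // stand-in for a node parameterized on two element types
{
    static std::wstring TypeName() { return L"Cast"; }
};

template <class From, class To>
std::shared_ptr<FooNode<From, To>> CreateNode2Sketch(const std::wstring& nodeType)
{
    // the real factory checks the requested type name before constructing
    if (nodeType == FooNode<From, To>::TypeName())
        return std::make_shared<FooNode<From, To>>();
    throw std::runtime_error("CreateNode2Sketch: unsupported nodeType");
}

The design point is that the (From, To) pair is fixed at the call site (from the precision tags), so each instantiation only needs to recognize the node types that actually exist for two precisions.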
@@ -997,4 +1019,37 @@ template shared_ptr> ComputationNetworkBuilder::Typ template shared_ptr> ComputationNetworkBuilder::TypedCreateLearnableParameter(const std::wstring& paramName, const TensorShape& tensorShape); template shared_ptr> ComputationNetworkBuilder::TypedCreateLearnableParameter(const std::wstring& paramName, const TensorShape& tensorShape); +template shared_ptr> ComputationNetworkBuilder::TypedCreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); +template shared_ptr> ComputationNetworkBuilder::TypedCreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); +template shared_ptr> ComputationNetworkBuilder::TypedCreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); +template shared_ptr> ComputationNetworkBuilder::TypedCreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); +template shared_ptr> ComputationNetworkBuilder::TypedCreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); +template shared_ptr> ComputationNetworkBuilder::TypedCreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); +template shared_ptr> ComputationNetworkBuilder::TypedCreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); +template shared_ptr> ComputationNetworkBuilder::TypedCreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); +template shared_ptr> ComputationNetworkBuilder::TypedCreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); + +template shared_ptr> ComputationNetworkBuilder::TypedCreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); +template shared_ptr> ComputationNetworkBuilder::TypedCreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); +template shared_ptr> ComputationNetworkBuilder::TypedCreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); +template shared_ptr> ComputationNetworkBuilder::TypedCreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); +template shared_ptr> ComputationNetworkBuilder::TypedCreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); +template shared_ptr> ComputationNetworkBuilder::TypedCreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); +template shared_ptr> ComputationNetworkBuilder::TypedCreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); +template shared_ptr> ComputationNetworkBuilder::TypedCreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); +template shared_ptr> ComputationNetworkBuilder::TypedCreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); + +template shared_ptr> ComputationNetworkBuilder::CreateCastNode(const std::wstring& nodeName); +template shared_ptr> 
ComputationNetworkBuilder::CreateCastNode(const std::wstring& nodeName); +template shared_ptr> ComputationNetworkBuilder::CreateCastNode(const std::wstring& nodeName); +template shared_ptr> ComputationNetworkBuilder::CreateCastNode(const std::wstring& nodeName); +template shared_ptr> ComputationNetworkBuilder::CreateCastNode(const std::wstring& nodeName); +template shared_ptr> ComputationNetworkBuilder::CreateCastNode(const std::wstring& nodeName); + +template shared_ptr> ComputationNetworkBuilder::NewNode2(const std::wstring& nodeName, DEVICEID_TYPE deviceId, const wstring& name); +template shared_ptr> ComputationNetworkBuilder::NewNode2(const std::wstring& nodeName, DEVICEID_TYPE deviceId, const wstring& name); +template shared_ptr> ComputationNetworkBuilder::NewNode2(const std::wstring& nodeName, DEVICEID_TYPE deviceId, const wstring& name); +template shared_ptr> ComputationNetworkBuilder::NewNode2(const std::wstring& nodeName, DEVICEID_TYPE deviceId, const wstring& name); +template shared_ptr> ComputationNetworkBuilder::NewNode2(const std::wstring& nodeName, DEVICEID_TYPE deviceId, const wstring& name); +template shared_ptr> ComputationNetworkBuilder::NewNode2(const std::wstring& nodeName, DEVICEID_TYPE deviceId, const wstring& name); }}} diff --git a/Source/ComputationNetworkLib/ComputationNetworkBuilder.h b/Source/ComputationNetworkLib/ComputationNetworkBuilder.h index 826c582c3..604dcab58 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkBuilder.h +++ b/Source/ComputationNetworkLib/ComputationNetworkBuilder.h @@ -38,6 +38,8 @@ public: // TODO: move into a separate header/class, to decouple from this class which would then be only used by old NDL and SimpleNetworkBuilder. static ComputationNodePtr NewStandardNode(const std::wstring& nodeType, DEVICEID_TYPE deviceId, const wstring& name); static ComputationNodePtr NewNode(const std::wstring& nodeType, DEVICEID_TYPE deviceId, const wstring& name); + template + static ComputationNodePtr NewNode2(const std::wstring& nodeType, DEVICEID_TYPE deviceId, const wstring& name); // The following functions create nodes and add them to the net, but don't attach inputs (some don't have inputs). // There are special versions for nodes with custom constructors, and a catch-all, CreateComputationNode(), for all others. 
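Note: the header change below keeps the old CreateInputNode/CreateSparseInputNode signatures and turns them into thin wrappers over new template members, with ElemType2 defaulting to the builder's own ElemType. A minimal sketch of the pattern, with illustrative names (BuilderSketch is not the actual CNTK class):

#include <string>

template <class ElemType>
struct BuilderSketch
{
    template <class ElemType2>
    void TypedCreateInput(const std::wstring& /*name*/)
    {
        // the real code constructs an input node of element type ElemType2 here
    }

    // legacy entry point keeps its old signature: same precision as the builder
    void CreateInput(const std::wstring& name)
    {
        this->template TypedCreateInput<ElemType>(name); // 'template' keyword is required for a dependent member template
    }
};

This is why the diff writes `this->template TypedCreateInputNode<ElemType>(...)`: inside a class template, calling a member template through a dependent name needs the disambiguating `template` keyword.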
@@ -53,12 +55,25 @@ public: template shared_ptr> TypedCreateLearnableParameter(const std::wstring& paramName, const TensorShape& tensorShape); // V2 + template + shared_ptr> CreateCastNode(const std::wstring& nodeName); + // sparse matrix size is optionally specified // ComputationNodePtr CreateSparseLearnableParameter(const std::wstring & paramName, const size_t rows, const size_t cols, const size_t size = 0); ComputationNodePtr CreateInputNode(const std::wstring& inputName, const size_t rows, const wstring& dynamicAxisName = L""); ComputationNodePtr CreateSparseInputNode(const std::wstring& inputName, const size_t rows, const wstring& dynamicAxisName = L""); - ComputationNodePtr CreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName = L""); - ComputationNodePtr CreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName = L""); + shared_ptr> CreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName = L"") + { + return this->template TypedCreateInputNode(inputName, sampleLayout, dynamicAxisName); + } + template + shared_ptr> TypedCreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); + shared_ptr> CreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName = L"") + { + return this->template TypedCreateSparseInputNode(inputName, sampleLayout, dynamicAxisName); + } + template + shared_ptr> TypedCreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName); ComputationNodePtr CreateConvolutionNode(const std::wstring& nodeName, const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& strideShape, const std::vector& sharing, const std::vector& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad, bool transpose, const TensorShape& outputShape, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples); diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 6c2e9e9ae..8d9e25d68 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -61,7 +61,8 @@ #define CNTK_MODEL_VERSION_28 28 // Padding op #define CNTK_MODEL_VERSION_29 29 // Expose StopGradient in BS #define CNTK_MODEL_VERSION_30 30 // LatticeWithSequenceSoftmax node -#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_30 +#define CNTK_MODEL_VERSION_31 31 // Cast node +#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_31 // helper mode for debugging // If TRACK_GAP_NANS is defined then initialize layout gaps to NaN and do NaN checks. Also do detailed logging of node computations. @@ -95,6 +96,7 @@ struct /*interface*/ IComputationNode // TODO: OperationName calls static TypeName which does not match the actual type names in that the 'Node' is missing. virtual const std::wstring OperationName() const = 0; #define OperationNameOf(T) (T::TypeName()) // convenience macro +#define OperationName2Of(T) (T::TypeName()) // convenience macro virtual void UpdateFunctionMBSize() = 0; // recalculate our column dimensions from MBLayout. Override to update temps. 
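Note: CNTK_MODEL_VERSION_31 gates the new two-precision tags (e.g. L"float,half") written by SaveToFileImpl and read back in ReadPersistableParameters above; files older than v31 only carry single-precision tags. A small sketch of the version-gated dispatch idea (the helper name is illustrative):

#include <string>

// Decide whether a node's precision tag uses the dual-precision format,
// given the file's model version and the tag read from it.
bool UsesDualPrecisionTag(size_t modelVersion, const std::wstring& tag)
{
    const size_t kDualPrecisionVersion = 31;     // CNTK_MODEL_VERSION_31
    return modelVersion >= kDualPrecisionVersion
        && tag.find(L',') != std::wstring::npos; // e.g. L"float,half"
}

A true result routes the load to the NewNode2 factory; otherwise the legacy single-type NewNode path is used, which keeps old model files loadable.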
diff --git a/Source/EvalDll/CNTKEval.cpp b/Source/EvalDll/CNTKEval.cpp index 736b0e75f..f4058c161 100644 --- a/Source/EvalDll/CNTKEval.cpp +++ b/Source/EvalDll/CNTKEval.cpp @@ -4,6 +4,9 @@ // // CNTKEval.cpp : Defines the exported functions for the CNTK DLL. // +#ifndef _CRT_SECURE_NO_WARNINGS +#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings +#endif #define __STDC_FORMAT_MACROS #include diff --git a/Source/SGDLib/ASGDHelper.cpp b/Source/SGDLib/ASGDHelper.cpp index aeca17930..4ae5238d7 100644 --- a/Source/SGDLib/ASGDHelper.cpp +++ b/Source/SGDLib/ASGDHelper.cpp @@ -646,6 +646,20 @@ ASGDHelper* NewASGDHelper( #endif } +template<> ASGDHelper* NewASGDHelper( + const std::list & learnableNodes, + size_t nodeNumRanks, + bool useAsyncBuffer, + bool isSimulatedModelAveragingSGD, + AdjustLearningRateAtBeginning adjusttype, + double adjustCoef, + size_t adjustPerMinibatches, + int traceLevel, + int syncPerfStats) +{ + RuntimeError("NewASGDHelper - half not supported!"); +} + template ASGDHelper* NewASGDHelper( const std::list & learnableNodes, size_t nodeNumRanks, diff --git a/Source/SGDLib/AccumulatorAggregation.h b/Source/SGDLib/AccumulatorAggregation.h index 4c3ff6095..a24c1d3b9 100644 --- a/Source/SGDLib/AccumulatorAggregation.h +++ b/Source/SGDLib/AccumulatorAggregation.h @@ -18,6 +18,7 @@ #include "Matrix.h" #include "SimpleDistGradAggregator.h" #include "V2SimpleDistGradAggregator.h" +#include "SimpleDistGradAggregatorHelper.h" namespace Microsoft { namespace MSR { namespace CNTK { @@ -46,21 +47,12 @@ void AggregateAccumulatorValuesAndUpdateEvaluation( } // Prepare aggregator. - std::shared_ptr> distGradAgg; - if (Globals::UseV2Aggregator()) - distGradAgg = make_shared>( - mpi, - false /*useAsyncAggregation*/, - net->GetDeviceId(), - 0 /*syncStatsTrace*/, - ::CNTK::MPICommunicator(packThresholdSizeInBytes)); - else - distGradAgg = make_shared>( - mpi, - false /*useAsyncAggregation*/, - net->GetDeviceId(), - 0 /*syncStatsTrace*/, - packThresholdSizeInBytes); + std::shared_ptr> distGradAgg = GetSimpleDistGradAggregator( + mpi, + false /*useAsyncAggregation*/, + net->GetDeviceId(), + 0 /*syncStatsTrace*/, + packThresholdSizeInBytes); // Prepare header. 
const size_t c_evalNodes = 1; diff --git a/Source/SGDLib/DataReaderHelpers.h b/Source/SGDLib/DataReaderHelpers.h index 3fb8c750a..d4ceeb360 100644 --- a/Source/SGDLib/DataReaderHelpers.h +++ b/Source/SGDLib/DataReaderHelpers.h @@ -24,10 +24,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (const auto& iter : inputMatrices) matrices.insert(iter.second.matrix); for (auto& node : net->FeatureNodes()) - if (matrices.find(node->As>()->ValuePtr()) != matrices.end()) + if (matrices.find(node->ValuePtr()) != matrices.end()) node->NotifyFunctionValuesMBSizeModified(); for (auto& node : net->LabelNodes()) - if (matrices.find(node->As>()->ValuePtr()) != matrices.end()) + if (matrices.find(node->ValuePtr()) != matrices.end()) node->NotifyFunctionValuesMBSizeModified(); } diff --git a/Source/SGDLib/MASGD.h b/Source/SGDLib/MASGD.h index 1a828e990..a9f1708ee 100644 --- a/Source/SGDLib/MASGD.h +++ b/Source/SGDLib/MASGD.h @@ -150,7 +150,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } virtual void OnEpochEnd(const std::list& LearnableNodes, - std::list>& smoothedGradient, + std::list& smoothedGradients, size_t samplesSinceLastSync ) { @@ -165,7 +165,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (read2sync) { m_numSyncPerformed++; - ModelAggregationProcessing(samplesSinceLastSync, LearnableNodes, smoothedGradient, totalSamplesProcessed, secondsOnCommunication); + ModelAggregationProcessing(samplesSinceLastSync, LearnableNodes, smoothedGradients, totalSamplesProcessed, secondsOnCommunication); m_perfReporter.OnMAPerformed(samplesSinceLastSync, totalSamplesProcessed, secondsOnCommunication); } @@ -175,7 +175,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual bool OnArrivingAtSyncPoint( const std::list& LearnableNodes, /* input/output: */ - std::list>& smoothedGradient, /* input/output: under some setup, it will reset to zero*/ + std::list& smoothedGradients, /* input/output: under some setup, it will reset to zero*/ size_t samplesSinceLastSync /* input: samples processed since last sync on this worker only */ ) { @@ -190,7 +190,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (read2Sync) { m_numSyncPerformed++; - ModelAggregationProcessing(samplesSinceLastSync, LearnableNodes, smoothedGradient, totalSamplesProcessed, secondsOnCommunication); + ModelAggregationProcessing(samplesSinceLastSync, LearnableNodes, smoothedGradients, totalSamplesProcessed, secondsOnCommunication); m_perfReporter.OnMAPerformed(samplesSinceLastSync, totalSamplesProcessed, secondsOnCommunication); } return read2Sync; @@ -199,7 +199,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void ModelAggregationProcessing( size_t samplesSinceLastSync, /* in: */ const std::list& learnableNodes, /* in/out */ - std::list>& smoothedGradient, /* in/out */ + std::list& smoothedGradients, /* in/out */ size_t& totalSamplesProcessed, /* out */ float& secondsOnCommunication /* out */) = 0; @@ -346,7 +346,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { void ModelAggregationProcessing( size_t samplesSinceLastSync, /* in */ const std::list& learnableNodes, /* in/out */ - std::list>& smoothedGradient, /* in/out */ + std::list& smoothedGradients, /* in/out */ size_t& totalSamplesProcessed, /* out */ float& secondsOnCommunication /* out */) override // NOTE: the variable type is determined by the interface in SGD::TrainOneEpoch diff --git a/Source/SGDLib/SGD.cpp b/Source/SGDLib/SGD.cpp index 9a09a9698..dcee286fb 100644 --- a/Source/SGDLib/SGD.cpp +++ 
b/Source/SGDLib/SGD.cpp @@ -31,6 +31,7 @@ #include "ASGDHelper.h" #include "CNTKLibraryInternals.h" +#include "SimpleDistGradAggregatorHelper.h" #include "SimpleDistGradAggregator.h" #include "V2SimpleDistGradAggregator.h" #include "ProgressTracing.h" @@ -47,8 +48,10 @@ using namespace std; // class SGD // ======================================================================= +template SGD::SGD(const ConfigParameters&); template SGD::SGD(const ConfigParameters&); template SGD::SGD(const ConfigParameters&); +template SGD::SGD(const ScriptableObjects::IConfigRecord&); template SGD::SGD(const ScriptableObjects::IConfigRecord&); template SGD::SGD(const ScriptableObjects::IConfigRecord&); @@ -223,6 +226,11 @@ void SGD::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net, } } + if (criterionNodes.front()->template Is>()) + { + InvalidArgument("TrainOrAdaptModel: using Float16 for loss function may cause overflow, please cast to float."); + } + // This code is only relevant for the new (V2) readers. It exists because of // a shortcoming in DecimateMinibatchInPlace, which does not yet work when inputs // in the same minibatch have different layouts, which is something only V2 readers can @@ -333,7 +341,7 @@ void SGD::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net, // initializing weights and gradient holder // only one criterion so far TODO: support multiple ones? auto& learnableNodes = net->LearnableParameterNodes(criterionNodes[0]); - list> smoothedGradients; + list smoothedGradients; vector smoothedCounts; // currently used by FSAdaGradUpdate() size_t numParameters = 0; @@ -344,9 +352,30 @@ void SGD::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net, // Note: We don't actually need the smoothedGradients if !IsParameterUpdateRequired(). // However, this is hard to fix since lots of code assumes smoothedGradients to be in the same order as learnableNodes. // V2 API fixes this. - smoothedGradients.push_back(Matrix(node->Value().GetNumRows(), - node->Value().GetNumCols(), - net->GetDeviceId())); + MatrixBasePtr smoothedGradientPtr; + size_t numRows = node->Value().GetNumRows(); + size_t numCols = node->Value().GetNumCols(); + if (std::is_same()) + { + // For half parameters, we use float smoothed gradients + // Allocate 3 times the size for casting parameter and gradients to float + const size_t c_smoothed_gradients_factor = 3; + shared_ptr> compoundMatrixPtr = std::make_shared>(numRows, + numCols * c_smoothed_gradients_factor, + net->GetDeviceId()); + // Initialize float parameters + auto parameterMatrix = compoundMatrixPtr->ColumnSlice(2 * numCols, numCols); + parameterMatrix.CastAssignValuesOf(node->Value()); + + smoothedGradientPtr = compoundMatrixPtr; + } + else + { + smoothedGradientPtr = std::make_shared>(numRows, + numCols, + net->GetDeviceId()); + } + smoothedGradients.push_back(smoothedGradientPtr); smoothedCounts.push_back(0); if (node->IsParameterUpdateRequired()) { @@ -987,7 +1016,7 @@ size_t SGD::TrainOneEpoch(ComputationNetworkPtr net, const std::vector& evaluationNodes, StreamMinibatchInputs* inputMatrices, // TODO: why is this a pointer? 
const std::list& learnableNodes, - std::list>& smoothedGradients, vector& smoothedCounts, + std::list& smoothedGradients, vector& smoothedCounts, /*out*/ EpochCriterion& epochCriterion, /*out*/ std::vector& epochEvalErrors, const std::string& prefixMsg, @@ -1389,7 +1418,25 @@ size_t SGD::TrainOneEpoch(ComputationNetworkPtr net, if (node->IsParameterUpdateRequired()) { #ifdef _DEBUG - if (smoothedGradientIter->HasNan("TrainOneEpoch/UpdateWeights(): ")) + bool hasNan = false; + if (std::is_same()) + { + // Get metrix from compound metrix + auto compoundMatrixPtr = dynamic_pointer_cast> (*smoothedGradientIter); + if (compoundMatrixPtr) + { + size_t numCols = dynamic_pointer_cast>(node)->Value().GetNumCols(); + + auto smoothedGradient = compoundMatrixPtr->ColumnSlice(0, numCols); + hasNan = smoothedGradient.HasNan("TrainOneEpoch/UpdateWeights(): "); + } + } + else + { + auto smoothedGradient = dynamic_pointer_cast> (*smoothedGradientIter); + hasNan = smoothedGradient && smoothedGradient->HasNan("TrainOneEpoch/UpdateWeights(): "); + } + if (hasNan) LogicError("%ls %ls operation has NaNs in smoothedGradient.", node->NodeName().c_str(), node->OperationName().c_str()); #endif double nodeDependentLearningRatePerSample = learnRatePerSample * node->GetLearningRateMultiplier(); @@ -1811,7 +1858,7 @@ double SGD::SearchForBestLearnRate(ComputationNetworkPtr net, const std::vector& evaluationNodes, StreamMinibatchInputs* inputMatrices, const std::list& learnableNodes, - std::list>& smoothedGradients, vector smoothedCounts, + std::list& smoothedGradients, vector smoothedCounts, const bool learnRateInitialized, const double largestPrevLearnRatePerSample) { @@ -1985,7 +2032,7 @@ size_t SGD::AdaptiveMinibatchSizing(ComputationNetworkPtr net, const std::vector& evaluationNodes, StreamMinibatchInputs* inputMatrices, const std::list& learnableNodes, - std::list>& smoothedGradients, vector smoothedCounts, + std::list& smoothedGradients, vector smoothedCounts, const double learningRateAdjustmentFactor) { size_t minMinibatchSize = initialMinibatchSize; @@ -2086,7 +2133,7 @@ size_t SGD::SearchForBestMinibatchSize(ComputationNetworkPtr net, const std::vector& evaluationNodes, StreamMinibatchInputs* inputMatrices, const std::list& learnableNodes, - std::list>& smoothedGradients, std::vector smoothedCounts, + std::list& smoothedGradients, std::vector smoothedCounts, const size_t minMinibatchSize, const size_t maxMinibatchSize) { // may happen for automatically reduced learning rates @@ -2190,7 +2237,7 @@ void SGD::TrainOneMiniEpochAndReloadModel(ComputationNetworkPtr net, const std::vector& evaluationNodes, StreamMinibatchInputs* inputMatrices, const std::list& learnableNodes, - std::list>& smoothedGradients, vector smoothedCounts, + std::list& smoothedGradients, vector smoothedCounts, /*out*/ EpochCriterion& epochCriterion, /*out*/ std::vector& epochEvalErrors, std::string prefixMsg, @@ -2264,6 +2311,24 @@ void SGD::AttemptUtteranceDerivativeFeatures(ComputationNetworkPtr net } } +template +std::shared_ptr> _GetAllReduceDistGradAggregator(const MPIWrapperPtr& mpi, int nBits, bool zeroThresholdFor1Bit, bool useAsyncAggregation, int traceLevel, int syncStatsTrace) +{ + if (Globals::UseV2Aggregator()) + { + auto communicator = ::CNTK::QuantizedMPICommunicator(zeroThresholdFor1Bit, true /*useQuantizationForSelfStripe*/, nBits); + return std::make_shared>(communicator, useAsyncAggregation, traceLevel, syncStatsTrace); + } + else + return std::make_shared>(mpi, nBits, zeroThresholdFor1Bit, true 
/*useQuantizationForSelfStripe*/, useAsyncAggregation, traceLevel, syncStatsTrace); +} + +template <> +std::shared_ptr> _GetAllReduceDistGradAggregator(const MPIWrapperPtr& mpi, int nBits, bool zeroThresholdFor1Bit, bool useAsyncAggregation, int traceLevel, int syncStatsTrace) +{ + RuntimeError("SGD - half not supported for quantization!"); +} + template void SGD::InitDistGradAgg(int numEvalNodes, int numGradientBits, int deviceId, int traceLevel) { @@ -2274,13 +2339,7 @@ void SGD::InitDistGradAgg(int numEvalNodes, int numGradientBits, int d if (traceLevel > 0) fprintf(stderr, "Initializing dataParallelSGD for %d-bit quantization.\n", numGradientBits); #ifdef CNTK_PARALLEL_TRAINING_SUPPORT - if (Globals::UseV2Aggregator()) - { - auto communicator = ::CNTK::QuantizedMPICommunicator(m_zeroThresholdFor1Bit, true, numGradientBits); - m_distGradAgg = std::make_shared>(communicator, m_bufferedAsyncGradientAggregation, traceLevel, m_syncStatsTrace); - } - else - m_distGradAgg = std::make_shared>(m_mpi, numGradientBits, m_zeroThresholdFor1Bit, true /*useQuantizationForSelfStripe*/, m_bufferedAsyncGradientAggregation, traceLevel, m_syncStatsTrace); + m_distGradAgg = _GetAllReduceDistGradAggregator(m_mpi, numGradientBits, m_zeroThresholdFor1Bit, m_bufferedAsyncGradientAggregation, traceLevel, m_syncStatsTrace); #else RuntimeError("Gradient quantization is unsupported in CNTK binaries built without quantized gradient aggregation support!"); #endif // !CNTK_PARALLEL_TRAINING_SUPPORT @@ -2289,15 +2348,38 @@ void SGD::InitDistGradAgg(int numEvalNodes, int numGradientBits, int d { if (traceLevel > 0) fprintf(stderr, "Initializing dataParallelSGD with FP%d aggregation.\n", numGradientBits); - if (Globals::UseV2Aggregator()) // Currently used to check V2 against baselines. 
- m_distGradAgg = std::make_shared>(m_mpi, m_bufferedAsyncGradientAggregation, deviceId, m_syncStatsTrace, ::CNTK::MPICommunicator(m_packThresholdSizeInBytes, m_useFP16AllReduce)); - else - m_distGradAgg = std::make_shared>(m_mpi, m_bufferedAsyncGradientAggregation, deviceId, m_syncStatsTrace, m_packThresholdSizeInBytes); + m_distGradAgg = GetSimpleDistGradAggregator(m_mpi, m_bufferedAsyncGradientAggregation, deviceId, m_syncStatsTrace, m_packThresholdSizeInBytes, m_useFP16AllReduce); } m_gradHeader.reset(DistGradHeader::Create(numEvalNodes), [](DistGradHeader* ptr) { DistGradHeader::Destroy(ptr); }); } +template +shared_ptr> _GetBlockMomentumSGD(const MPIWrapperPtr& mpi, size_t traceLevel, DEVICEID_TYPE devID, bool useNesterovBlockMomentum, bool resetSGDMomentum, double blockLearningRate, double blockMomentumAsTimeConstant, size_t modelAggregationBlockSize) +{ + assert(!Globals::UseV2Aggregator()); + return make_shared>(mpi, traceLevel, devID, useNesterovBlockMomentum, resetSGDMomentum, blockLearningRate, blockMomentumAsTimeConstant, modelAggregationBlockSize); +} + +template <> +shared_ptr> _GetBlockMomentumSGD(const MPIWrapperPtr& mpi, size_t traceLevel, DEVICEID_TYPE devID, bool useNesterovBlockMomentum, bool resetSGDMomentum, double blockLearningRate, double blockMomentumAsTimeConstant, size_t modelAggregationBlockSize) +{ + assert(!Globals::UseV2Aggregator()); + RuntimeError("SGD - half not supported when useV2Aggregator is false!"); +} + +template +shared_ptr> _GetBasicModelAveragingSGD(const MPIWrapperPtr& mpi, size_t traceLevel, DEVICEID_TYPE devID) +{ + return make_shared>(mpi, traceLevel, devID); +} + +template <> +shared_ptr> _GetBasicModelAveragingSGD(const MPIWrapperPtr& mpi, size_t traceLevel, DEVICEID_TYPE devID) +{ + RuntimeError("SGD - half not supported for modelAveragingSGD"); +} + template void SGD::InitModelAggregationHandler(int traceLevel, DEVICEID_TYPE devID) { @@ -2307,7 +2389,7 @@ void SGD::InitModelAggregationHandler(int traceLevel, DEVICEID_TYPE de } if (GetParallelizationMethod() == ParallelizationMethod::modelAveragingSGD) { - m_pMASGDHelper = make_shared>(m_mpi, traceLevel, devID); + m_pMASGDHelper = _GetBasicModelAveragingSGD(m_mpi, traceLevel, devID); } else if (GetParallelizationMethod() == ParallelizationMethod::blockMomentumSGD) { @@ -2329,7 +2411,7 @@ void SGD::InitModelAggregationHandler(int traceLevel, DEVICEID_TYPE de m_modelAggregationBlockSize); } else - m_pMASGDHelper = make_shared>(m_mpi, traceLevel, devID, + m_pMASGDHelper = _GetBlockMomentumSGD(m_mpi, traceLevel, devID, m_useNesterovBlockMomentum, m_resetSGDMomentum, m_blockLearningRate, m_blockMomentumAsTimeConstant, m_modelAggregationBlockSize); @@ -2341,6 +2423,47 @@ void SGD::InitModelAggregationHandler(int traceLevel, DEVICEID_TYPE de // UpdateWeights() - actual weight update, implementing various update rules template void SGD::UpdateWeights(Matrix& functionValues, Matrix& gradientValues, + MatrixBasePtr& smoothedGradientValues, double& smoothedCount, + const double learnRatePerSample, const double momentumPerSample, + size_t actualMBSize, + const double L2RegWeight, const double L1RegWeight, + const bool needAveMultiplier, + const bool useNesterovMomentum) const +{ + if (std::is_same()) + { + // Get metrix from compound metrix + auto compoundMatrixPtr = dynamic_pointer_cast> (smoothedGradientValues); + size_t numCols = functionValues.GetNumCols(); + + auto smoothedGradientMatrix = compoundMatrixPtr->ColumnSlice(0, numCols); + auto tempGradientMatrix = 
+
+template <class ElemType>
+template <typename GradType>
+void SGD<ElemType>::TypedUpdateWeights(Matrix<GradType>& functionValues, Matrix<GradType>& gradientValues,
+                                       Matrix<GradType>& smoothedGradientValues, double& smoothedCount,
                                   const double learnRatePerSample, const double momentumPerSample,
                                   size_t actualMBSize,
@@ -2363,7 +2486,7 @@ void SGD<ElemType>::UpdateWeights(Matrix<ElemType>& functionValues, Matrix<Elem
     assert(actualMBSize > 0);
 
     // clipping gradients to prevent outliers
-    ClipGradient(gradientValues, actualMBSize);
+    ClipGradient<GradType>(gradientValues, actualMBSize);
 
     GradientsUpdateType adpType = GradUpdateType();
     double noiseStd = GradientUpdateNoiseStd();
@@ -2453,8 +2576,9 @@ void SGD<ElemType>::UpdateWeights(Matrix<ElemType>& functionValues, Matrix<Elem
     }
 }
 
 template <class ElemType>
-void SGD<ElemType>::ClipGradient(Matrix<ElemType>& gradient, const size_t actualMBSize) const
+template <typename GradType>
+void SGD<ElemType>::ClipGradient(Matrix<GradType>& gradient, const size_t actualMBSize) const
 {
     if (m_clippingThresholdPerSample != std::numeric_limits<double>::infinity())
     {
@@ -2474,10 +2598,30 @@ void SGD<ElemType>::ClipGradient(Matrix<ElemType>& gradient, const size_t actual
     }
 }
 
+template <class ElemType>
+static void SaveSmoothedGradient(File& fstream, MatrixBasePtr& smoothedGradient)
+{
+    auto smoothedGradientPtr = dynamic_pointer_cast<Matrix<ElemType>> (smoothedGradient);
+    if (!smoothedGradientPtr)
+        RuntimeError("Failed to cast, type mismatch");
+    const Matrix<ElemType>& smoothedGradientValues = *smoothedGradientPtr;
+    fstream << smoothedGradientValues;
+}
+
+template <class ElemType>
+static void LoadSmoothedGradient(File& fstream, MatrixBasePtr& smoothedGradient)
+{
+    auto smoothedGradientPtr = dynamic_pointer_cast<Matrix<ElemType>> (smoothedGradient);
+    if (!smoothedGradientPtr)
+        RuntimeError("Failed to cast, type mismatch");
+    Matrix<ElemType>& smoothedGradientValues = *smoothedGradientPtr;
+    fstream >> smoothedGradientValues;
+}
+
 template <class ElemType>
 void SGD<ElemType>::SaveCheckPointInfo(const size_t epoch, const size_t totalSamplesSeen,
                                        const double learnRatePerSample,
-                                       const std::list<Matrix<ElemType>>& smoothedGradients,
+                                       const std::list<MatrixBasePtr>& smoothedGradients,
                                        const std::vector<double>& smoothedCounts,
                                        const double prevCriterion,
                                        const size_t minibatchSize)
@@ -2510,10 +2654,12 @@ void SGD<ElemType>::SaveCheckPointInfo(const size_t epoch, const size_t totalSam
 
         fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BGradient");
 
-        for (auto smoothedGradientIter = smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++)
+        for (auto smoothedGradient : smoothedGradients)
         {
-            const Matrix<ElemType>& smoothedGradientValues = *smoothedGradientIter;
-            fstream << smoothedGradientValues;
+            if (std::is_same<ElemType, half>())
+                SaveSmoothedGradient<float>(fstream, smoothedGradient);
+            else
+                SaveSmoothedGradient<ElemType>(fstream, smoothedGradient);
         }
 
         fstream.PutMarker(FileMarker::fileMarkerEndSection, L"EGradient");
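Because an fp16 model keeps its smoothed gradients in float, a checkpoint written by a half model serializes float matrices; that on-disk change is what CNTK_CHECKPOINT_VERSION_3 marks. A standalone sketch of the element-type choice (SmoothedGradientType is an illustrative alias, not part of the sources):

    #include <type_traits>

    struct half {}; // stand-in for CNTK's half

    // For half parameters the optimizer state is kept and serialized as float;
    // float and double models keep their native precision.
    template <class ElemType>
    using SmoothedGradientType =
        typename std::conditional<std::is_same<ElemType, half>::value, float, ElemType>::type;

    static_assert(std::is_same<SmoothedGradientType<half>, float>::value, "half -> float state");
    static_assert(std::is_same<SmoothedGradientType<float>, float>::value, "native otherwise");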
@@ -2553,7 +2699,7 @@ template <class ElemType>
 bool SGD<ElemType>::TryLoadCheckPointInfo(const size_t epochNumber,
                                           /*out*/ size_t& totalSamplesSeen,
                                           /*out*/ double& learnRatePerSample,
-                                          std::list<Matrix<ElemType>>& smoothedGradients,
+                                          std::list<MatrixBasePtr>& smoothedGradients,
                                           std::vector<double>& smoothedCounts,
                                           /*out*/ double& prevCriterion,
                                           /*out*/ size_t& minibatchSize)
@@ -2582,7 +2728,7 @@ template <class ElemType>
 void SGD<ElemType>::LoadCheckPointInfo(const size_t epochNumber,
                                        /*out*/ size_t& totalSamplesSeen,
                                        /*out*/ double& learnRatePerSample,
-                                       std::list<Matrix<ElemType>>& smoothedGradients,
+                                       std::list<MatrixBasePtr>& smoothedGradients,
                                        std::vector<double>& smoothedCounts,
                                        /*out*/ double& prevCriterion,
                                        /*out*/ size_t& minibatchSize)
@@ -2600,6 +2746,9 @@ void SGD<ElemType>::LoadCheckPointInfo(const size_t epochNumber,
         fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EVersion");
     }
 
+    if (ckpVersion > CURRENT_CNTK_CHECKPOINT_VERSION)
+        RuntimeError("The checkpoint file has a newer format version (%d) than this CNTK version can handle (%d).", (int)ckpVersion, (int)CURRENT_CNTK_CHECKPOINT_VERSION);
+
     fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BCKP");
 
     fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BLearnRate");
@@ -2618,10 +2767,12 @@ void SGD<ElemType>::LoadCheckPointInfo(const size_t epochNumber,
 
     fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BGradient");
 
-    for (auto smoothedGradientIter = smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++)
+    for (auto smoothedGradient : smoothedGradients)
     {
-        Matrix<ElemType>& smoothedGradientValues = *smoothedGradientIter;
-        fstream >> smoothedGradientValues;
+        if (std::is_same<ElemType, half>())
+            LoadSmoothedGradient<float>(fstream, smoothedGradient);
+        else
+            LoadSmoothedGradient<ElemType>(fstream, smoothedGradient);
     }
 
     fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EGradient");
@@ -2824,6 +2975,7 @@ void SGD<ElemType>::MarkDropoutNodesEvalTimeStampAsOutdated(const ComputationNet
         nodeIter->SetEvalTimeStampOutdatedWrtAll();
 }
 
+template class SGD<half>;
 template class SGD<float>;
 template class SGD<double>;
 
@@ -2881,7 +3033,7 @@ static AdjustLearningRateAtBeginning AdjustLearningRateAtBeginningType(const wst
     else
         InvalidArgument("AdjustLearningRateatBeginningType: Invalid Type. Valid values are (None | Linearly | Staircase)");
 }
 #endif
-    
+
 template <class ConfigRecordType>
 SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
 {
@@ -3306,12 +3458,14 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
 static size_t GetSizeOfPrecision(const ScriptableObjects::IConfigRecordPtr configp)
 {
     wstring precision = configp->Get(L"precision");
-    if (precision == L"float")
+    if (precision == L"float16")
+        return sizeof(half);
+    else if (precision == L"float")
         return sizeof(float);
     else if (precision == L"double")
         return sizeof(double);
     else
-        RuntimeError("invalid value '%ls' for 'precision', must be 'float' or 'double'", precision.c_str());
+        RuntimeError("invalid value '%ls' for 'precision', must be 'float16' or 'float' or 'double'", precision.c_str());
 }
 
 SGDParams::SGDParams(const ScriptableObjects::IConfigRecordPtr configp)
diff --git a/Source/SGDLib/SGD.h b/Source/SGDLib/SGD.h
index 623f1f114..b95419400 100644
--- a/Source/SGDLib/SGD.h
+++ b/Source/SGDLib/SGD.h
@@ -25,7 +25,8 @@ using namespace std; // ugh! TODO: get rid of this from .h files!!!
 
 #define CNTK_CHECKPOINT_VERSION_1 1 // 1 -> no version number
 #define CNTK_CHECKPOINT_VERSION_2 2
-#define CURRENT_CNTK_CHECKPOINT_VERSION CNTK_CHECKPOINT_VERSION_2
+#define CNTK_CHECKPOINT_VERSION_3 3 // float smoothed gradients for float16/half parameters
+#define CURRENT_CNTK_CHECKPOINT_VERSION CNTK_CHECKPOINT_VERSION_3
 
 namespace CNTK { namespace Internal {
 // Forward declarations.
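The GetSizeOfPrecision() change above simply maps the new precision value "float16" onto sizeof(half), i.e. two bytes. A self-contained sketch of the same mapping (the half struct is a stand-in for CNTK's half type):

    #include <cstddef>
    #include <stdexcept>
    #include <string>

    struct half { unsigned short bits; }; // 2-byte stand-in for CNTK's half

    size_t SizeOfPrecision(const std::wstring& precision)
    {
        if (precision == L"float16") return sizeof(half);   // 2
        if (precision == L"float")   return sizeof(float);  // 4
        if (precision == L"double")  return sizeof(double); // 8
        throw std::invalid_argument("precision must be 'float16', 'float' or 'double'");
    }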
@@ -442,7 +443,7 @@ protected:
                            const std::vector<ComputationNodeBasePtr>& evaluationNodes,
                            StreamMinibatchInputs* inputMatrices,
                            const std::list<ComputationNodeBasePtr>& learnableNodes,
-                           std::list<Matrix<ElemType>>& smoothedGradients, std::vector<double> smoothedCounts,
+                           std::list<MatrixBasePtr>& smoothedGradients, std::vector<double> smoothedCounts,
                            const bool learnRateInitialized,
                            const double largestPrevLearnRatePerSample);
@@ -458,7 +459,7 @@ protected:
                                 const std::vector<ComputationNodeBasePtr>& evaluationNodes,
                                 StreamMinibatchInputs* inputMatrices,
                                 const std::list<ComputationNodeBasePtr>& learnableNodes,
-                                std::list<Matrix<ElemType>>& smoothedGradients, std::vector<double> smoothedCounts,
+                                std::list<MatrixBasePtr>& smoothedGradients, std::vector<double> smoothedCounts,
                                 /*out*/ EpochCriterion& epochCriterion,
                                 /*out*/ std::vector<EpochCriterion>& epochEvalErrors,
                                 std::string prefixMsg,
@@ -478,7 +479,7 @@ protected:
                                        const std::vector<ComputationNodeBasePtr>& evaluationNodes,
                                        StreamMinibatchInputs* inputMatrices,
                                        const std::list<ComputationNodeBasePtr>& learnableNodes,
-                                       std::list<Matrix<ElemType>>& smoothedGradients, std::vector<double> smoothedCounts,
+                                       std::list<MatrixBasePtr>& smoothedGradients, std::vector<double> smoothedCounts,
                                        const double learningRateAdjustmentFactor);
 
     // uses a small percentage of training data of minibatch to
@@ -496,7 +497,7 @@ protected:
                                              const std::vector<ComputationNodeBasePtr>& evaluationNodes,
                                              StreamMinibatchInputs* inputMatrices,
                                              const std::list<ComputationNodeBasePtr>& learnableNodes,
-                                             std::list<Matrix<ElemType>>& smoothedGradients, std::vector<double> smoothedCounts,
+                                             std::list<MatrixBasePtr>& smoothedGradients, std::vector<double> smoothedCounts,
                                              const size_t minMinibatchSize, const size_t maxMinibatchSize);
 
     // Attempts to compute the error signal for the whole utterance, which will
@@ -523,7 +524,7 @@ protected:
                             const std::vector<ComputationNodeBasePtr>& evaluationNodes,
                             StreamMinibatchInputs* inputMatrices,
                             const std::list<ComputationNodeBasePtr>& learnableNodes,
-                            std::list<Matrix<ElemType>>& smoothedGradients, std::vector<double>& smoothedCounts,
+                            std::list<MatrixBasePtr>& smoothedGradients, std::vector<double>& smoothedCounts,
                             /*out*/ EpochCriterion& epochCriterion,
                             /*out*/ std::vector<EpochCriterion>& epochEvalErrors,
                             const std::string& prefixMsg = "",
@@ -534,26 +535,37 @@ protected:
     void InitDistGradAgg(int numEvalNodes, int numGradientBits, int deviceId, int traceLevel);
     void InitModelAggregationHandler(int traceLevel, DEVICEID_TYPE devID);
 
-public:
+private:
     // UpdateWeights() - actual weight update, implementing various update rules
     void UpdateWeights(Matrix<ElemType>& functionValues, Matrix<ElemType>& gradientValues,
-                       Matrix<ElemType>& smoothedGradient, double& smoothedCount,
+                       MatrixBasePtr& smoothedGradient, double& smoothedCount,
+                       const double learnRatePerSample, const double momentumPerSample,
+                       size_t actualMBSize,
+                       const double L2RegWeight, const double L1RegWeight,
+                       const bool needAveMultiplier,
+                       const bool useNesterovMomentum) const;
+
+    template <typename GradType>
+    void TypedUpdateWeights(Matrix<GradType>& functionValues, Matrix<GradType>& gradientValues,
+                            Matrix<GradType>& smoothedGradient, double& smoothedCount,
                        const double learnRatePerSample, const double momentumPerSample,
                        size_t actualMBSize,
                        const double L2RegWeight, const double L1RegWeight,
                        const bool needAveMultiplier,
                        const bool useNesterovMomentum) const;
 
+public:
     // return -1 if nothing exists
     int DetermineStartEpoch(const bool makeMode);
 
     wstring GetModelNameForEpoch(const int epoch, bool bLastModel = false) const;
 
 protected:
-    void ClipGradient(Matrix<ElemType>& gradient, const size_t actualMBSize) const;
+    template <typename GradType>
+    void ClipGradient(Matrix<GradType>& gradient, const size_t actualMBSize) const;
 
     void SaveCheckPointInfo(const size_t epoch, const size_t totalSamplesSeen, // TODO: combine totalSamplesSeen and prevCriterion into a EpochCriterion type
                             const double learnRatePerSample,
-                            const std::list<Matrix<ElemType>>& smoothedGradients,
+                            const std::list<MatrixBasePtr>& smoothedGradients,
                             const std::vector<double>& smoothedCounts,
                             const double prevCriterion,
                             const size_t minibatchSize);
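ClipGradient and TypedUpdateWeights are now member templates of a class template, so their out-of-class definitions in SGD.cpp need two template parameter lists, with the enclosing class's list first. A generic sketch of that syntax (not the CNTK class):

    template <class ElemType>
    struct SGDLike
    {
        template <typename GradType>
        void Clip(GradType& gradient) const;
    };

    template <class ElemType>    // enclosing class template parameters first
    template <typename GradType> // then the member template's own parameters
    void SGDLike<ElemType>::Clip(GradType& gradient) const
    {
        // clipping logic would go here
    }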
@@ -561,14 +573,14 @@ protected:
     bool TryLoadCheckPointInfo(const size_t epochNumber,
                                /*out*/ size_t& totalSamplesSeen,
                                /*out*/ double& learnRatePerSample,
-                               std::list<Matrix<ElemType>>& smoothedGradients,
+                               std::list<MatrixBasePtr>& smoothedGradients,
                                std::vector<double>& smoothedCounts,
                                /*out*/ double& prevCriterion,
                                /*out*/ size_t& minibatchSize);
     void LoadCheckPointInfo(const size_t epochNumber,
                             /*out*/ size_t& totalSamplesSeen,
                             /*out*/ double& learnRatePerSample,
-                            std::list<Matrix<ElemType>>& smoothedGradients,
+                            std::list<MatrixBasePtr>& smoothedGradients,
                             std::vector<double>& smoothedCounts,
                             /*out*/ double& prevCriterion,
                             /*out*/ size_t& minibatchSize);
diff --git a/Source/SGDLib/SGDLib.vcxproj b/Source/SGDLib/SGDLib.vcxproj
index e55d6a427..cacc70a99 100644
--- a/Source/SGDLib/SGDLib.vcxproj
+++ b/Source/SGDLib/SGDLib.vcxproj
@@ -137,6 +137,7 @@
+    <ClInclude Include="SimpleDistGradAggregatorHelper.h" />
@@ -149,6 +150,7 @@
+    <ClCompile Include="SimpleDistGradAggregatorHelper.cpp" />
diff --git a/Source/SGDLib/SGDLib.vcxproj.filters b/Source/SGDLib/SGDLib.vcxproj.filters
index 16d52d17c..0133ff25d 100644
--- a/Source/SGDLib/SGDLib.vcxproj.filters
+++ b/Source/SGDLib/SGDLib.vcxproj.filters
@@ -16,6 +16,9 @@
       <Filter>Parallelization</Filter>
     </ClCompile>
+    <ClCompile Include="SimpleDistGradAggregatorHelper.cpp">
+      <Filter>Parallelization</Filter>
+    </ClCompile>
@@ -144,6 +147,9 @@
       <Filter>Parallelization</Filter>
     </ClInclude>
+    <ClInclude Include="SimpleDistGradAggregatorHelper.h">
+      <Filter>Parallelization</Filter>
+    </ClInclude>
diff --git a/Source/SGDLib/SimpleDistGradAggregatorHelper.cpp b/Source/SGDLib/SimpleDistGradAggregatorHelper.cpp
new file mode 100644
index 000000000..4e9b84f7b
--- /dev/null
+++ b/Source/SGDLib/SimpleDistGradAggregatorHelper.cpp
@@ -0,0 +1,82 @@
+//
+// Copyright (c) Microsoft. All rights reserved.
+// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
+//
+
+#pragma warning(disable : 4267) // conversion from size_t to int or other types
+
+#include "Basics.h"
+#include "MPIWrapper.h"
+#include "Matrix.h"
+#include "SimpleDistGradAggregatorHelper.h"
+#include "DistGradHeader.h"
+#include "IDistGradAggregator.h"
+#include "SimpleDistGradAggregator.h"
+#include "V2SimpleDistGradAggregator.h"
+
+namespace Microsoft { namespace MSR { namespace CNTK {
+
+template <class ElemType>
+std::shared_ptr<IDistGradAggregator<ElemType>> GetSimpleDistGradAggregator(
+    const MPIWrapperPtr& mpi,
+    bool useAsyncAggregation,
+    int deviceId,
+    int syncStatsTrace,
+    size_t packThresholdSizeInBytes,
+    bool useFP16AllReduce)
+{
+    if (Globals::UseV2Aggregator())
+        return std::make_shared<V2SimpleDistGradAggregator<ElemType>>(
+            mpi,
+            useAsyncAggregation,
+            deviceId,
+            syncStatsTrace,
+            ::CNTK::MPICommunicator(packThresholdSizeInBytes, useFP16AllReduce));
+    else
+        return std::make_shared<SimpleDistGradAggregator<ElemType>>(
+            mpi,
+            useAsyncAggregation,
+            deviceId,
+            syncStatsTrace,
+            packThresholdSizeInBytes);
+}
+
+template <>
+std::shared_ptr<IDistGradAggregator<half>> GetSimpleDistGradAggregator<half>(
+    const MPIWrapperPtr& mpi,
+    bool useAsyncAggregation,
+    int deviceId,
+    int syncStatsTrace,
+    size_t packThresholdSizeInBytes,
+    bool useFP16AllReduce)
+{
+    if (Globals::UseV2Aggregator())
+        return std::make_shared<V2SimpleDistGradAggregator<half>>(
+            mpi,
+            useAsyncAggregation,
+            deviceId,
+            syncStatsTrace,
+            ::CNTK::MPICommunicator(packThresholdSizeInBytes, useFP16AllReduce));
+    else
+        RuntimeError("SGD - half not supported when useV2Aggregator is false!");
+}
+
+template std::shared_ptr<IDistGradAggregator<float>> GetSimpleDistGradAggregator<float>(
+    const MPIWrapperPtr& mpi,
+    bool useAsyncAggregation,
+    int deviceId,
+    int syncStatsTrace,
+    size_t packThresholdSizeInBytes,
+    bool useFP16AllReduce);
+
+template std::shared_ptr<IDistGradAggregator<double>> GetSimpleDistGradAggregator<double>(
+    const MPIWrapperPtr& mpi,
+    bool useAsyncAggregation,
+    int deviceId,
+    int syncStatsTrace,
+    size_t packThresholdSizeInBytes,
+    bool useFP16AllReduce);
+
+}}}
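A hypothetical call site for the helper defined above; it assumes an MPIWrapperPtr named mpi is in scope, and lets the trailing parameters fall back to the defaults declared in SimpleDistGradAggregatorHelper.h:

    auto aggregator = GetSimpleDistGradAggregator<float>(
        mpi,
        /*useAsyncAggregation=*/false,
        /*deviceId=*/0,
        /*syncStatsTrace=*/0); // packThresholdSizeInBytes and useFP16AllReduce use defaults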
diff --git a/Source/SGDLib/SimpleDistGradAggregatorHelper.h b/Source/SGDLib/SimpleDistGradAggregatorHelper.h
new file mode 100644
index 000000000..21302980a
--- /dev/null
+++ b/Source/SGDLib/SimpleDistGradAggregatorHelper.h
@@ -0,0 +1,24 @@
+//
+// Copyright (c) Microsoft. All rights reserved.
+// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
+// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
+//
+
+#pragma once
+
+#include "Constants.h"
+#include "IDistGradAggregator.h"
+
+namespace Microsoft { namespace MSR { namespace CNTK {
+
+template <class ElemType>
+std::shared_ptr<IDistGradAggregator<ElemType>> GetSimpleDistGradAggregator(
+    const MPIWrapperPtr& mpi,
+    bool useAsyncAggregation,
+    int deviceId,
+    int syncStatsTrace,
+    size_t packThresholdSizeInBytes = DEFAULT_PACK_THRESHOLD_SIZE_IN_BYTES,
+    bool useFP16AllReduce = false);
+
+}}}
diff --git a/Source/SGDLib/SimpleEvaluator.h b/Source/SGDLib/SimpleEvaluator.h
index 2941c26d4..eadd2f2ee 100644
--- a/Source/SGDLib/SimpleEvaluator.h
+++ b/Source/SGDLib/SimpleEvaluator.h
@@ -5,8 +5,6 @@
 
 #pragma once
 
-#include "V2SimpleDistGradAggregator.h"
-
 #include "AccumulatorAggregation.h"
 #include "Basics.h"
 #include "DataReader.h"
@@ -18,7 +16,7 @@
 #include "ProgressTracing.h"
 #include "DistGradHeader.h"
 #include "IDistGradAggregator.h"
-#include "SimpleDistGradAggregator.h"
+#include "SimpleDistGradAggregatorHelper.h"
 #include "Criterion.h"
 #include "Globals.h"
 
@@ -167,10 +165,7 @@ public:
                 DistGradHeader::Destroy(ptr);
             });
 
-            if (Globals::UseV2Aggregator())
-                m_distGradAgg = make_shared<V2SimpleDistGradAggregator<ElemType>>(m_mpi, false /*useAsyncAggregation*/, m_net->GetDeviceId(), 0 /*syncStatsTrace*/, ::CNTK::MPICommunicator());
-            else
-                m_distGradAgg = make_shared<SimpleDistGradAggregator<ElemType>>(m_mpi, false /*useAsyncAggregation*/, m_net->GetDeviceId(), 0 /*syncStatsTrace*/);
+            m_distGradAgg = GetSimpleDistGradAggregator<ElemType>(m_mpi, false /*useAsyncAggregation*/, m_net->GetDeviceId(), 0 /*syncStatsTrace*/);
         }
 
         m_gradHeader->numEvalNode = evalNodes.size();
diff --git a/Source/SGDLib/V2SimpleDistGradAggregator.h b/Source/SGDLib/V2SimpleDistGradAggregator.h
index 1ca3569db..586626e46 100644
--- a/Source/SGDLib/V2SimpleDistGradAggregator.h
+++ b/Source/SGDLib/V2SimpleDistGradAggregator.h
@@ -109,7 +109,7 @@ public:
 
         // Synchronize the Quantization compute stream with the completion of
         // compute of the gradient matrices on the main compute stream
-        mainStreamSyncEvent->SynchronizeDataTransferFetchStreamWithEvent<ElemType>();
+        mainStreamSyncEvent->SynchronizeDataTransferFetchStreamWithEvent<float>();
         delete mainStreamSyncEvent;
 
         AggregateGradientsImpl(newGradients, newGradHeader, showSyncPerfStats);
@@ -185,7 +185,7 @@ private:
         if (m_useAsyncAggregation)
        {
             std::unique_ptr<MatrixComputeStreamEvent> mainStreamSyncEvent(MatrixComputeStreamEvent::Create(deviceId));
-            mainStreamSyncEvent->SynchronizeDataTransferFetchStreamWithEvent<ElemType>();
+            mainStreamSyncEvent->SynchronizeDataTransferFetchStreamWithEvent<float>();
         }
     }
diff --git a/Tests/UnitTests/EvalTests/EvalExtendedTests.cpp b/Tests/UnitTests/EvalTests/EvalExtendedTests.cpp
index ca6b76f58..9e07b7495 100644
--- a/Tests/UnitTests/EvalTests/EvalExtendedTests.cpp
+++ b/Tests/UnitTests/EvalTests/EvalExtendedTests.cpp
@@ -60,7 +60,7 @@ BOOST_AUTO_TEST_CASE(CheckModelVersion)
     // This is a watch guard to make sure that any change in the model version will be detected.
     // If you change the CNTK model version, please do not silently adapt this test.
     // Instead, please do notify the CNTK release team (AlexeyO, Wolfgang, Zhou, Mark) to prepare required steps for the next release.
-    BOOST_REQUIRE_MESSAGE(CURRENT_CNTK_MODEL_VERSION == 30, "The model version has been changed. Before making changes in this test, please first notify the CNTK release team to prepare required steps in the next release. Thanks!\n");
+    BOOST_REQUIRE_MESSAGE(CURRENT_CNTK_MODEL_VERSION == 31, "The model version has been changed. Before making changes in this test, please first notify the CNTK release team to prepare required steps in the next release. Thanks!\n");
 }
 
 BOOST_AUTO_TEST_CASE(EvalConstantPlusTest)
diff --git a/Tests/UnitTests/NetworkTests/NetworkTests.vcxproj b/Tests/UnitTests/NetworkTests/NetworkTests.vcxproj
index acdb12c04..26397e43a 100644
--- a/Tests/UnitTests/NetworkTests/NetworkTests.vcxproj
+++ b/Tests/UnitTests/NetworkTests/NetworkTests.vcxproj
@@ -61,7 +61,7 @@
       <SubSystem>Console</SubSystem>
       <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>Cntk.Core-$(CntkComponentVersion).lib;Cntk.Math-$(CntkComponentVersion).lib;Cntk.Common-$(CntkComponentVersion).lib;Cntk.Actions-$(CntkComponentVersion).lib;Cntk.ComputationNetwork-$(CntkComponentVersion).lib;Cntk.SequenceTrainingLib-$(CntkComponentVersion).lib;%(AdditionalDependencies)</AdditionalDependencies>
+      <AdditionalDependencies>Cntk.Core-$(CntkComponentVersion).lib;Cntk.Math-$(CntkComponentVersion).lib;Cntk.Common-$(CntkComponentVersion).lib;Cntk.Actions-$(CntkComponentVersion).lib;Cntk.ComputationNetwork-$(CntkComponentVersion).lib;Cntk.SequenceTrainingLib-$(CntkComponentVersion).lib;Cntk.SGD-$(CntkComponentVersion).lib;%(AdditionalDependencies)</AdditionalDependencies>
       <Profile>true</Profile>
       <AdditionalLibraryDirectories>$(MSMPI_LIB64);$(OutDir);$(BOOST_LIB_PATH);$(NvmlLibPath)</AdditionalLibraryDirectories>
       <DelayLoadDLLs>Cntk.Math-$(CntkComponentVersion).dll;msmpi.dll