Merge fp16 brainscript work (#3606)
* FP16 BrainScript - address code review comments
* Remove Tab and fix debug build breaks
* Fix Linux Build breaks
* fp16 brain script - add _CRT_SECURE_NO_WARNINGS
* fp16 brain script - fix NetworkTests
* Update tests for model version change
* Remove changes for InputAndParamNodes
* Fix typo
* Remove redundant code
* Fix optional parameters
This commit is contained in:
Parent: 45ae386bc8
Commit: 4003c087a1
Makefile
@@ -707,6 +707,7 @@ SGDLIB_SRC=\
$(SOURCEDIR)/SGDLib/Profiler.cpp \
$(SOURCEDIR)/SGDLib/SGD.cpp \
$(SOURCEDIR)/SGDLib/PostComputingActions.cpp \
$(SOURCEDIR)/SGDLib/SimpleDistGradAggregatorHelper.cpp \

SGDLIB_SRC+=$(CNTKLIBRARY_COMMON_SRC)
@@ -94,15 +94,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
);
}
/*virtual*/ void OnEpochEnd(const std::list<ComputationNodeBasePtr>& LearnableNodes,
std::list<Matrix<ElemType>>& smoothedGradient,
std::list<MatrixBasePtr>& smoothedGradients,
size_t samplesSinceLastSync) override
{
Base::OnEpochEnd(LearnableNodes, smoothedGradient, samplesSinceLastSync);
Base::OnEpochEnd(LearnableNodes, smoothedGradients, samplesSinceLastSync);
}
/*virtual*/ void ModelAggregationProcessing(
size_t samplesSinceLastSync,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradient,
std::list<MatrixBasePtr>& smoothedGradients,
size_t& totalSamplesProcessed,
float& secondsOnCommunication
) override

@@ -181,9 +181,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//----------------------------------------
if (m_resetSGDMomentumAfterAggregation)
{
for (Matrix<ElemType>& x : smoothedGradient)
for (auto sg : smoothedGradients)
{
x.SetValue((ElemType)0);
auto x = dynamic_pointer_cast<Matrix<ElemType>>(sg);
if (x != nullptr)
x->SetValue((ElemType)0);
}
}
}
@@ -108,7 +108,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {

bool OnArrivingAtSyncPoint(
const std::list<ComputationNodeBasePtr>& learnableNodes, /* input/output: */
std::list<Matrix<ElemType>>& smoothedGradient, /* input/output: under some setup, it will reset to zero*/
std::list<MatrixBasePtr>& smoothedGradients, /* input/output: under some setup, it will reset to zero*/
size_t samplesSinceLastSync /* input: samples processed since last sync on this worker only */
) override
{

@@ -130,12 +130,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Otherwise let update the weights.
float secondsOnCommunication = 0.0f;
size_t totalSamples = 0;
ModelAggregationProcessing(samplesSinceLastSync, learnableNodes, smoothedGradient, totalSamples, secondsOnCommunication);
ModelAggregationProcessing(samplesSinceLastSync, learnableNodes, smoothedGradients, totalSamples, secondsOnCommunication);
return true;
}

/*virtual*/ void OnEpochEnd(const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradient,
std::list<MatrixBasePtr>& smoothedGradients,
size_t samplesSinceLastSync) override
{
if (!m_someWorkerHasFinished)

@@ -152,13 +152,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Let's update our weights no matter what.
float secondsOnCommunication = 0.0f;
size_t totalSamples = 0;
ModelAggregationProcessing(samplesSinceLastSync, learnableNodes, smoothedGradient, totalSamples, secondsOnCommunication);
ModelAggregationProcessing(samplesSinceLastSync, learnableNodes, smoothedGradients, totalSamples, secondsOnCommunication);
}

/*virtual*/ void ModelAggregationProcessing(
size_t /*samplesSinceLastSync*/,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradient,
std::list<MatrixBasePtr>& smoothedGradients,
size_t& /*totalSamplesProcessed*/, /* out */
float& secondsOnCommunication /* out */
) override

@@ -196,8 +196,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_communicator->AggregateInPlace(aggregatedWeightsPrepared, m_communicator->Workers());

// 2. Let's update the model
for (auto& pBaseNode : learnableNodes)
auto smoothedGradientIter = smoothedGradients.begin();
for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++, smoothedGradientIter++)
{
ComputationNodeBasePtr pBaseNode = *nodeIter;
if (!pBaseNode->IsParameterUpdateRequired())
continue;

@@ -235,15 +237,35 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// 2.2.4 update bookkeeping
prevWeight.SetValue(currentWeight);
}
}

//----------------------------------------
// 3. reset SGD momentum if necessary
//----------------------------------------
{
// For half, we keep a copy of float weights, update that too
if (std::is_same<ElemType, half>())
{
auto compoundMatrixPtr = dynamic_pointer_cast<Matrix<float>> (*smoothedGradientIter);
size_t numCols = currentWeight.GetNumCols();

auto parameterMatrix = compoundMatrixPtr->ColumnSlice(2 * numCols, numCols);
parameterMatrix.CastAssignValuesOf(currentWeight);

if (m_resetSGDMomentumAfterAggregation)
{
for (Matrix<ElemType>& x : smoothedGradient)
// Only reset smoothed gradients
auto smoothedGradientMatrix = compoundMatrixPtr->ColumnSlice(0, numCols);
smoothedGradientMatrix.SetValue(0.0f);
}
}
else
{
x.SetValue((ElemType)0);
if (m_resetSGDMomentumAfterAggregation)
{
auto x = dynamic_pointer_cast<Matrix<ElemType>> (*smoothedGradientIter);
x->SetValue((ElemType)0);
}
}
}
}
}
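Illustrative sketch (not part of this change): since smoothedGradients is now a std::list<MatrixBasePtr>, consumers such as the aggregation code above must recover the concrete matrix type per parameter, and for half parameters the entry is a compound Matrix<float> rather than a Matrix<half>, so a dynamic_pointer_cast to Matrix<ElemType> can legitimately return nullptr. A minimal C++ sketch of that consumption pattern follows; the helper name and free-function form are illustrative assumptions, and CNTK's Matrix<T>, MatrixBasePtr and half types are assumed to be in scope.

// Sketch only: reset one smoothedGradients entry, mirroring the logic above.
template <class ElemType>
void ResetSmoothedGradientEntry(const MatrixBasePtr& sg, size_t numCols)
{
    if (auto typed = std::dynamic_pointer_cast<Matrix<ElemType>>(sg)) // float/double case
    {
        typed->SetValue((ElemType)0);
        return;
    }
    // half case: the entry is a compound Matrix<float>; only the first numCols
    // columns hold the smoothed gradient, so reset just that slice.
    if (auto compound = std::dynamic_pointer_cast<Matrix<float>>(sg))
    {
        auto smoothed = compound->ColumnSlice(0, numCols);
        smoothed.SetValue(0.0f);
    }
}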
@@ -22,6 +22,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {

using namespace std;

template <class ElemType, class TargetType>
static inline bool isprecision(std::wstring& str)
{
if ((str == L"") && std::is_same<ElemType, TargetType>())
return true;
if (std::is_same<TargetType, half>())
return EqualCI(str, L"float16");
else if (std::is_same<TargetType, float>())
return EqualCI(str, L"float");
else if (std::is_same<TargetType, double>())
return EqualCI(str, L"double");
return false;
}

template <class ElemType>
void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wstring& baseName, const NDLPass pass)
{

@@ -48,7 +62,7 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst

std::wstring cnNodeType = Microsoft::MSR::CNTK::ToFixedWStringFromMultiByte(node->GetValue());

ComputationNodePtr nodePtr;
ComputationNodeBasePtr nodePtr;

// get the node pointer for the node, should be stored in the EvalValue;
if (pass > ndlPassInitial)

@@ -56,7 +70,7 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
nodePtr = ComputationNode<ElemType>::FromVoidPtr(node->GetEvalValue());
if (!nodePtr)
{
nodePtr = dynamic_pointer_cast<ComputationNode<ElemType>>(m_net->GetNodeFromName(name));
nodePtr = m_net->GetNodeFromName(name);
node->SetEvalValue(nodePtr.get());
}
}

@@ -75,16 +89,49 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
auto tensorShape = ProcessTensorShapeParameters(node, params, i, /*isImage=*/false, cnNodeType);

wstring dynamicAxis = node->GetOptionalParameter("dynamicAxis", "");
wstring precision = node->GetOptionalParameter("precision", "");

// TODO: Map dynamicAxis from name to node at this point, where that node is memoized inside NDL.
// first look for this node already existing in the network
// BUGBUG: How does this set the dimensions then?
if (m_net->NodeNameExists(name))
nodePtr = dynamic_pointer_cast<ComputationNode<ElemType>>(m_net->GetNodeFromName(name));
else if (isSparse)
nodePtr = m_net->GetNodeFromName(name);
else
{
if (precision == L"")
{
if (isSparse)
nodePtr = builder.CreateSparseInputNode(name, tensorShape, dynamicAxis);
else
nodePtr = builder.CreateInputNode(name, tensorShape, dynamicAxis);
}
else if (EqualCI(precision, L"float"))
{
if (isSparse)
nodePtr = builder.template TypedCreateSparseInputNode<float>(name, tensorShape, dynamicAxis);
else
nodePtr = builder.template TypedCreateInputNode<float>(name, tensorShape, dynamicAxis);
}
else if (EqualCI(precision, L"double"))
{
if (isSparse)
nodePtr = builder.template TypedCreateSparseInputNode<double>(name, tensorShape, dynamicAxis);
else
nodePtr = builder.template TypedCreateInputNode<double>(name, tensorShape, dynamicAxis);
}
else if (EqualCI(precision, L"float16"))
{
if (isSparse)
nodePtr = builder.template TypedCreateSparseInputNode<half>(name, tensorShape, dynamicAxis);
else
nodePtr = builder.template TypedCreateInputNode<half>(name, tensorShape, dynamicAxis);
}
else
{
RuntimeError("NDLNetworkBuilder: Input: the 'precision' parameter if specified, must be 'float', 'double' or 'float16'.");
}
}
}
}
else if (cnNodeType == L"ImageInput" || cnNodeType == L"SparseImageInput")
{
@@ -193,7 +240,7 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
nodePtr = builder.CreateLearnableParameter(name, rows, cols);
nodePtr->SetLearningRateMultiplier(0);
}
else if (pass == ndlPassFinal || nodePtr->Value().GetNumElements() != 0)
else if (pass == ndlPassFinal || (dynamic_pointer_cast<ComputationNode<ElemType>> (nodePtr))->Value().GetNumElements() != 0)
{
ElemType val = parameter[0]->GetScalar();
m_net->InitLearnableParameters(nodePtr, L"fixedValue", val);

@@ -607,6 +654,56 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
nodeParamCount = nodePtr->GetNumInputs();
}
}
else if (cnNodeType == OperationName2Of(CastNode))
{
if (parameter.size() < 1)
RuntimeError("%ls should have 1 or more parameters (node and cast precision).", cnNodeType.c_str());

// setup the parameter position of children so we can hook them up later
nodeParamCount = 1;
nodeParamStart = 0;

if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
auto sourceNode = (NDLNode<ElemType>*) params[0];
wstring sourcePrecision = sourceNode->GetOptionalParameter("precision", "");
wstring targetPrecision = node->GetOptionalParameter("precision", "");
if (EqualCI(targetPrecision, L"float16"))
{
ComputationNetworkBuilder<half> builder2(*m_net);
if (isprecision<ElemType, float>(sourcePrecision))
nodePtr = builder2.CreateCastNode<float>(name);
else if (isprecision<ElemType, double>(sourcePrecision))
nodePtr = builder2.CreateCastNode<double>(name);
else
RuntimeError("NDLNetworkBuilder: for CastNode to cast to half, input must be 'float' or 'double'");
}
else if (EqualCI(targetPrecision, L"float"))
{
ComputationNetworkBuilder<float> builder2(*m_net);
if (isprecision<ElemType, half>(sourcePrecision))
nodePtr = builder2.CreateCastNode<half>(name);
else if (isprecision<ElemType, double>(sourcePrecision))
nodePtr = builder2.CreateCastNode<double>(name);
else
RuntimeError("NDLNetworkBuilder: for CastNode to cast to float, input must be 'float16' or 'double'");
}
else if (EqualCI(targetPrecision, L"double"))
{
ComputationNetworkBuilder<double> builder2(*m_net);
if (isprecision<ElemType, float>(sourcePrecision))
nodePtr = builder2.CreateCastNode<float>(name);
else if (isprecision<ElemType, half>(sourcePrecision))
nodePtr = builder2.CreateCastNode<half>(name);
else
RuntimeError("NDLNetworkBuilder: for CastNode to cast to double, input must be 'float' or 'float16'");
}
else
RuntimeError("NDLNetworkBuilder: CastNode - need to specify 'precision' parameter: 'float', 'double' or 'float16'.");
}
}
else
{
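Illustrative sketch (not part of this change): the Cast handling above reduces to a small dispatch in which the Cast node's own 'precision' optional parameter selects the target element type (and hence the ComputationNetworkBuilder instantiation), while the source node's 'precision' selects the template argument of CreateCastNode, i.e. the input type of the resulting CastNode<To, From>. At the NDL level this presumably corresponds to something like Cast(x, precision="float16"), assuming the usual NDL optional-parameter syntax. A condensed C++ sketch for the float16 target only; the helper name and free-function form are illustrative assumptions.

// Sketch only: one branch of the dispatch above, pulled out as a helper.
template <class ElemType>
ComputationNodeBasePtr CreateCastToHalfSketch(ComputationNetwork& net, const std::wstring& name, std::wstring sourcePrecision)
{
    ComputationNetworkBuilder<half> builder2(net);         // target precision "float16" -> ElemType half
    if (isprecision<ElemType, float>(sourcePrecision))     // source is 'float' (or unspecified when ElemType is float)
        return builder2.CreateCastNode<float>(name);       // creates CastNode<half, float>
    if (isprecision<ElemType, double>(sourcePrecision))
        return builder2.CreateCastNode<double>(name);      // creates CastNode<half, double>
    RuntimeError("NDLNetworkBuilder: for CastNode to cast to half, input must be 'float' or 'double'");
    return nullptr; // not reached; RuntimeError throws
}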
@@ -645,7 +742,10 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
#if 1
vector<ComputationNodeBasePtr> inputNodes;
for (let& in : inputs)
inputNodes.push_back(ComputationNode<ElemType>::FromVoidPtr(in));
{
ComputationNodeBase *p = (ComputationNodeBase *)in;
inputNodes.push_back(p ? p->shared_from_this() : nullptr);
}

nodePtr->AttachInputs(inputNodes);
#else // TODO: delete this

@@ -714,6 +814,7 @@ TensorShape NDLNodeEvaluatorImpl<ElemType>::ProcessTensorShapeParameters(const N
return TensorShape(dims);
}

template class NDLBuilderImpl<half>;
template class NDLBuilderImpl<float>;
template class NDLBuilderImpl<double>;
@@ -269,10 +269,11 @@ public:
}

// ProcessOptionalParameters - Process the optional parameters of a node
virtual void ProcessOptionalParameters(NDLNode<ElemType>* node)
virtual void ProcessOptionalParameters(NDLNode<ElemType>* node) override
{
vector<NDLNode<ElemType>*> params = node->GetParameters(true); // get all the optional parameters only
auto compNode = ComputationNode<ElemType>::FromVoidPtr(node->GetEvalValue());
ComputationNodeBase* compNodePtr = (ComputationNodeBase *) (node->GetEvalValue());
ComputationNodeBasePtr compNode = compNodePtr ? compNodePtr->shared_from_this() : nullptr;
std::string empty;

// loop through all the optional parameters processing them as necessary

@@ -582,6 +583,7 @@ private:
DEVICEID_TYPE m_deviceId;
};

template class NDLBuilder<half>;
template class NDLBuilder<float>;
template class NDLBuilder<double>;
@@ -160,6 +160,7 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable)
else if (EqualInsensitive(nodeType, OperationNameOf(AtanhNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(AveragePoolingNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(BatchNormalizationNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationName2Of(CastNode))) ret = true;
#ifdef COMING_SOON
else if (EqualInsensitive(nodeType, OperationNameOf(CRFNode), L"CRF")) ret = true;
#endif

@@ -267,18 +268,24 @@ NDLScript<ElemType> NDLScript<ElemType>::s_global("global");

// declare the static variables from the classes
template <>
NDLScript<half> NDLScript<half>::s_global{};
template <>
NDLScript<float> NDLScript<float>::s_global{};
template <>
NDLScript<double> NDLScript<double>::s_global{};

template <>
int NDLNode<half>::s_nameCounter = 0;
template <>
int NDLNode<float>::s_nameCounter = 0;
template <>
int NDLNode<double>::s_nameCounter = 0;

template class NDLNode<half>;
template class NDLNode<float>;
template class NDLNode<double>;

template class NDLScript<half>;
template class NDLScript<float>;
template class NDLScript<double>;
@@ -98,6 +98,7 @@ public:
}
};

template class NDLNodeEvaluator<half>;
template class NDLNodeEvaluator<float>;
template class NDLNodeEvaluator<double>;

@@ -188,9 +188,12 @@ ComputationNetworkPtr GetModelFromConfig(const ConfigRecordType& config, const w
return net;
}

template function<ComputationNetworkPtr(DEVICEID_TYPE)> GetNetworkFactory<ScriptableObjects::IConfigRecord, half>(const ScriptableObjects::IConfigRecord& config);
template function<ComputationNetworkPtr(DEVICEID_TYPE)> GetNetworkFactory<ScriptableObjects::IConfigRecord, float>(const ScriptableObjects::IConfigRecord& config);
template function<ComputationNetworkPtr(DEVICEID_TYPE)> GetNetworkFactory<ScriptableObjects::IConfigRecord, double>(const ScriptableObjects::IConfigRecord& config);
template function<ComputationNetworkPtr(DEVICEID_TYPE)> GetNetworkFactory<ConfigParameters, half>(const ConfigParameters& config);
template function<ComputationNetworkPtr(DEVICEID_TYPE)> GetNetworkFactory<ConfigParameters, float>(const ConfigParameters& config);
template function<ComputationNetworkPtr(DEVICEID_TYPE)> GetNetworkFactory<ConfigParameters, double>(const ConfigParameters& config);
template ComputationNetworkPtr GetModelFromConfig<ConfigParameters, half> (const ConfigParameters& config, const wstring&, vector<wstring>& outputNodeNamesVector);
template ComputationNetworkPtr GetModelFromConfig<ConfigParameters, float> (const ConfigParameters& config, const wstring&, vector<wstring>& outputNodeNamesVector);
template ComputationNetworkPtr GetModelFromConfig<ConfigParameters, double>(const ConfigParameters& config, const wstring&, vector<wstring>& outputNodeNamesVector);
@@ -1775,6 +1775,7 @@ shared_ptr<ComputationNode<ElemType>> SimpleNetworkBuilder<ElemType>::AddTrainAn
return output;
}

template class SimpleNetworkBuilder<half>;
template class SimpleNetworkBuilder<float>;
template class SimpleNetworkBuilder<double>;

@@ -159,9 +159,13 @@ public:
m_constInputGateValue = config("constInputGateValue", "false");
m_constOutputGateValue = config("constOutputGateValue", "false");

m_forgetGateInitVal = config("forgetGateInitVal", "-1");
m_inputGateInitVal = config("inputGateInitVal", "-1");
m_outputGateInitVal = config("outputGateInitVal", "-1");
ElemType forgetGateInitVal = config("forgetGateInitVal", "-1");
ElemType inputGateInitVal = config("inputGateInitVal", "-1");
ElemType outputGateInitVal = config("outputGateInitVal", "-1");

m_forgetGateInitVal = forgetGateInitVal;
m_inputGateInitVal = inputGateInitVal;
m_outputGateInitVal = outputGateInitVal;

m_sparse_input = config("sparseinput", "false");
@@ -142,12 +142,14 @@ shared_ptr<Object> MakeRuntimeObject<TrainAction>(const IConfigRecordPtr configp
{
const IConfigRecord& config = *configp;
wstring precision = config[L"precision"]; // dispatch on ElemType
if (precision == L"float")
if (precision == L"float16")
DoTrain<IConfigRecord, half>(config);
else if (precision == L"float")
DoTrain<IConfigRecord, float>(config);
else if (precision == L"double")
DoTrain<IConfigRecord, double>(config);
else
RuntimeError("invalid value '%ls' for 'precision', must be 'float' or 'double'", precision.c_str());
RuntimeError("invalid value '%ls' for 'precision', must be 'float16' or 'float' or 'double'", precision.c_str());

return make_shared<Object>(); // return a dummy object
}

@@ -156,8 +158,10 @@ shared_ptr<Object> MakeRuntimeObject<TrainAction>(const IConfigRecordPtr configp
ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<TrainAction> registerTrainAction(L"TrainAction");
}}}

template void DoTrain<ScriptableObjects::IConfigRecord, half>(const ScriptableObjects::IConfigRecord& config);
template void DoTrain<ScriptableObjects::IConfigRecord, float>(const ScriptableObjects::IConfigRecord& config);
template void DoTrain<ScriptableObjects::IConfigRecord, double>(const ScriptableObjects::IConfigRecord& config);
template void DoTrain<ConfigParameters, half>(const ConfigParameters& config);
template void DoTrain<ConfigParameters, float>(const ConfigParameters& config);
template void DoTrain<ConfigParameters, double>(const ConfigParameters& config);
@ -171,6 +171,91 @@ static void DisableLegacyUsage(const ConfigParameters& TopLevelConfig, const Con
|
|||
// be run in parallel across multiple ranks. Others should only run on rank 0
|
||||
const std::set<std::string> commandstoRunOnAllRanks = { "train", "trainRNN", "adapt", "test", "eval", "cv", "devtest", "bnstat" };
|
||||
|
||||
|
||||
template <typename ElemType>
|
||||
bool DispatchThisAction(const string &thisAction, const ConfigParameters &commandParams, const ConfigParameters& config)
|
||||
{
|
||||
if (thisAction == "train" || thisAction == "trainRNN")
|
||||
{
|
||||
DoTrain<ConfigParameters, ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "bnstat")
|
||||
{
|
||||
DoBatchNormalizationStat<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "adapt")
|
||||
{
|
||||
DoAdapt<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "test" || thisAction == "eval")
|
||||
{
|
||||
DoEval<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "edit")
|
||||
{
|
||||
DoEdit<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "cv")
|
||||
{
|
||||
DoCrossValidate<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "write")
|
||||
{
|
||||
DoWriteOutput<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "devtest")
|
||||
{
|
||||
TestCn<ElemType>(config); // for "devtest" action pass the root config instead
|
||||
}
|
||||
else if (thisAction == "dumpNodes" /*deprecated:*/ || thisAction == "dumpNode" || thisAction == "dumpnode")
|
||||
{
|
||||
DoDumpNodes<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "convertdbn")
|
||||
{
|
||||
DoConvertFromDbn<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "exportdbn")
|
||||
{
|
||||
DoExportToDbn<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "createLabelMap")
|
||||
{
|
||||
DoCreateLabelMap<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "writeWordAndClass")
|
||||
{
|
||||
DoWriteWordAndClassInfo<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "plot")
|
||||
{
|
||||
DoTopologyPlot<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "SVD")
|
||||
{
|
||||
DoParameterSVD<ElemType>(commandParams);
|
||||
}
|
||||
else
|
||||
{
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
template <>
|
||||
bool DispatchThisAction<half>(const string &thisAction, const ConfigParameters &commandParams, const ConfigParameters& )
|
||||
{
|
||||
if (thisAction == "train" || thisAction == "trainRNN")
|
||||
{
|
||||
DoTrain<ConfigParameters, half>(commandParams);
|
||||
}
|
||||
else
|
||||
{
|
||||
RuntimeError("half only supported for action train or trainRNN!");
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
// process the command
|
||||
template <typename ElemType>
|
||||
void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mpi)
|
||||
|
@ -270,73 +355,21 @@ void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mp
|
|||
{
|
||||
LOGPRINTF(stderr, "CNTKCommandTrainBegin: %s\n", command[i].c_str());
|
||||
}
|
||||
DoTrain<ConfigParameters, ElemType>(commandParams);
|
||||
}
|
||||
|
||||
if (!DispatchThisAction<ElemType>(thisAction, commandParams, config))
|
||||
{
|
||||
RuntimeError("unknown action: %s in command set: %s", thisAction.c_str(), command[i].c_str());
|
||||
}
|
||||
|
||||
if (thisAction == "train" || thisAction == "trainRNN")
|
||||
{
|
||||
if (progressTracing)
|
||||
{
|
||||
LOGPRINTF(stderr, "CNTKCommandTrainEnd: %s\n", command[i].c_str());
|
||||
}
|
||||
fullEpochsOffset += GetMaxEpochs(commandParams);
|
||||
}
|
||||
else if (thisAction == "bnstat")
|
||||
{
|
||||
DoBatchNormalizationStat<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "adapt")
|
||||
{
|
||||
DoAdapt<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "test" || thisAction == "eval")
|
||||
{
|
||||
DoEval<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "edit")
|
||||
{
|
||||
DoEdit<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "cv")
|
||||
{
|
||||
DoCrossValidate<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "write")
|
||||
{
|
||||
DoWriteOutput<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "devtest")
|
||||
{
|
||||
TestCn<ElemType>(config); // for "devtest" action pass the root config instead
|
||||
}
|
||||
else if (thisAction == "dumpNodes" /*deprecated:*/ || thisAction == "dumpNode" || thisAction == "dumpnode")
|
||||
{
|
||||
DoDumpNodes<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "convertdbn")
|
||||
{
|
||||
DoConvertFromDbn<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "exportdbn")
|
||||
{
|
||||
DoExportToDbn<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "createLabelMap")
|
||||
{
|
||||
DoCreateLabelMap<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "writeWordAndClass")
|
||||
{
|
||||
DoWriteWordAndClassInfo<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "plot")
|
||||
{
|
||||
DoTopologyPlot<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "SVD")
|
||||
{
|
||||
DoParameterSVD<ElemType>(commandParams);
|
||||
}
|
||||
else
|
||||
{
|
||||
RuntimeError("unknown action: %s in command set: %s", thisAction.c_str(), command[i].c_str());
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(stderr, "\n");
|
||||
|
@@ -740,12 +773,14 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])
LOGPRINTF(stderr, "precision = \"%s\"\n", type.c_str());
}

if (type == "float")
if (type == "float16")
DoCommands<half>(config, mpi);
else if (type == "float")
DoCommands<float>(config, mpi);
else if (type == "double")
DoCommands<double>(config, mpi);
else
RuntimeError("CNTK: Invalid precision string: \"%s\", must be \"float\" or \"double\"", type.c_str());
RuntimeError("CNTK: Invalid precision string: \"%s\", must be \"float16\" or \"float\" or \"double\"", type.c_str());

// if completed then write a doneFile if requested
if (!doneFile.empty())
@@ -8,6 +8,8 @@
#include <map>
#include <stdexcept>
#include <stdint.h>
#include "File.h"
#include "half.hpp"

using namespace std;

@@ -150,6 +152,11 @@ public:
return (float) (double) *this;
}

operator half() const
{
return (half)(double)*this;
}

private:
long tolong() const
{
@@ -150,6 +150,20 @@ void ComputationNetwork::SaveToFileImpl(const wstring& fileName, const FileOptio
else if (nodePtr->Is<ComputationNode<half>>())
precision = ElemTypeName<half>();
else LogicError("Unexpected node type.");
#if CURRENT_CNTK_MODEL_VERSION >= CNTK_MODEL_VERSION_31
if (nodePtr->Is<CastNode<half,float>>())
precision = ElemTypeName2<half,float>();
else if (nodePtr->Is<CastNode<half, double>>())
precision = ElemTypeName2<half, double>();
else if (nodePtr->Is<CastNode<float, half>>())
precision = ElemTypeName2<float, half>();
else if (nodePtr->Is<CastNode<float, double>>())
precision = ElemTypeName2<float, double>();
else if (nodePtr->Is<CastNode<double, half>>())
precision = ElemTypeName2<double, half>();
else if (nodePtr->Is<CastNode<double, float>>())
precision = ElemTypeName2<double, float>();
#endif
fstream << precision;
#endif
fstream << nodePtr->OperationName();

@@ -265,6 +279,20 @@ void ComputationNetwork::ReadPersistableParameters(size_t modelVersion, File& fs
node = ComputationNetworkBuilder<half>::NewNode(opName, m_deviceId, nodeName);
else if (precision == L"") // old file format: default to <ElemType>
node = ComputationNetworkBuilder<ElemType>::NewNode(opName, m_deviceId, nodeName);
#if CURRENT_CNTK_MODEL_VERSION >= CNTK_MODEL_VERSION_31
else if (precision == L"half,float")
node = ComputationNetworkBuilder<half>::NewNode2<float>(opName, m_deviceId, nodeName);
else if (precision == L"half,double")
node = ComputationNetworkBuilder<half>::NewNode2<double>(opName, m_deviceId, nodeName);
else if (precision == L"float,half")
node = ComputationNetworkBuilder<float>::NewNode2<half>(opName, m_deviceId, nodeName);
else if (precision == L"float,double")
node = ComputationNetworkBuilder<float>::NewNode2<double>(opName, m_deviceId, nodeName);
else if (precision == L"double,half")
node = ComputationNetworkBuilder<double>::NewNode2<half>(opName, m_deviceId, nodeName);
else if (precision == L"double,float")
node = ComputationNetworkBuilder<double>::NewNode2<float>(opName, m_deviceId, nodeName);
#endif
else
RuntimeError("Read: Unexpected precision tag '%ls'", precision.c_str());

@@ -1313,6 +1313,14 @@ template <> /*static*/ inline const wchar_t* ElemTypeName<float>() { return L"f
template <> /*static*/ inline const wchar_t* ElemTypeName<double>() { return L"double"; }
template <> /*static*/ inline const wchar_t* ElemTypeName<half>() { return L"half"; }

template <typename ElemType, typename ElemType2> static inline const wchar_t* ElemTypeName2();
template <> /*static*/ inline const wchar_t* ElemTypeName2<float,half>() { return L"float,half"; }
template <> /*static*/ inline const wchar_t* ElemTypeName2<float,double>() { return L"float,double"; }
template <> /*static*/ inline const wchar_t* ElemTypeName2<double,half>() { return L"double,half"; }
template <> /*static*/ inline const wchar_t* ElemTypeName2<double,float>() { return L"double,float"; }
template <> /*static*/ inline const wchar_t* ElemTypeName2<half,float>() { return L"half,float"; }
template <> /*static*/ inline const wchar_t* ElemTypeName2<half,double>() { return L"half,double"; }

// The following emits the class and enables the BaseMatrix<double> to be available (used by EvalDll)
// The corresponding Matrix<float> is emitted in the SetDeviceId function above.
template class Matrix<double>;
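Illustrative sketch (not part of this change): the save/load changes above round-trip a cast node's two element types through a comma-separated precision tag. Given the builder declaration elsewhere in this commit (CreateCastNode<InputNodeType> on ComputationNetworkBuilder<ElemType> creates CastNode<ElemType, InputNodeType>), the first name in the tag is the node's output type and the second its input type, so "half,float" denotes a cast from float to half. A minimal C++ sketch of the reload side; the free-function form is an illustrative assumption.

// Sketch only: mapping a precision tag back to a typed cast node on load.
ComputationNodeBasePtr LoadCastNodeSketch(const std::wstring& precision, const std::wstring& opName,
                                          DEVICEID_TYPE deviceId, const std::wstring& nodeName)
{
    if (precision == L"half,float")   // output half, input float
        return ComputationNetworkBuilder<half>::NewNode2<float>(opName, deviceId, nodeName);
    if (precision == L"float,half")   // output float, input half
        return ComputationNetworkBuilder<float>::NewNode2<half>(opName, deviceId, nodeName);
    // ...the remaining four tag/type combinations follow the same pattern...
    RuntimeError("Read: Unexpected precision tag '%ls'", precision.c_str());
    return nullptr; // not reached; RuntimeError throws
}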
@ -175,6 +175,13 @@ static shared_ptr<ComputationNode<ElemType>> CreateNode(const std::wstring& node
|
|||
else return CreateStandardNode<ElemType>(nodeType, forward<_Types>(_Args)...);
|
||||
}
|
||||
|
||||
template <class ElemType, class ElemType2, class... _Types>
|
||||
static shared_ptr<ComputationNode<ElemType>> CreateNode2(const std::wstring& nodeType, _Types&&... _Args)
|
||||
{
|
||||
// check more types
|
||||
if (nodeType == OperationName2Of(CastNode)) return New<CastNode<ElemType, ElemType2>>(forward<_Types>(_Args)...);
|
||||
else RuntimeError("CreateNode2: unsupported nodeType - %S", nodeType.c_str());
|
||||
}
|
||||
// this function is called from SimpleNetworkBuilder and old NDL
|
||||
template <class ElemType>
|
||||
/*static*/ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::NewStandardNode(const std::wstring& nodeType, DEVICEID_TYPE deviceId, const wstring& name)
|
||||
|
@ -189,6 +196,13 @@ template <class ElemType>
|
|||
return CreateNode<ElemType>(nodeType, deviceId, name);
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
template <class ElemType2>
|
||||
/*static*/ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::NewNode2(const std::wstring& nodeType, DEVICEID_TYPE deviceId, const wstring& name)
|
||||
{
|
||||
return CreateNode2<ElemType, ElemType2>(nodeType, deviceId, name);
|
||||
}
|
||||
|
||||
shared_ptr<ComputationNodeBase> NewComputationNodeFromConfig(const Microsoft::MSR::ScriptableObjects::IConfigRecordPtr configp)
|
||||
{
|
||||
wstring precision = configp->Get(L"precision"); // dispatch on ElemType
|
||||
|
@ -247,15 +261,17 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Creat
|
|||
}
|
||||
|
||||
template <class ElemType>
|
||||
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName)
|
||||
template <class ValueType>
|
||||
shared_ptr<ComputationNode<ValueType>> ComputationNetworkBuilder<ElemType>::TypedCreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName)
|
||||
{
|
||||
return net.AddNodeToNetWithElemType(New<InputValue<ElemType>>(net.GetDeviceId(), inputName, sampleLayout, dynamicAxisName));
|
||||
return net.AddNodeToNetWithElemType(New<InputValue<ValueType>>(net.GetDeviceId(), inputName, sampleLayout, dynamicAxisName));
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateSparseInputNode(const std::wstring& inputName, const TensorShape& imageLayout, const wstring& dynamicAxisName)
|
||||
template <class ValueType>
|
||||
shared_ptr<ComputationNode<ValueType>> ComputationNetworkBuilder<ElemType>::TypedCreateSparseInputNode(const std::wstring& inputName, const TensorShape& imageLayout, const wstring& dynamicAxisName)
|
||||
{
|
||||
return net.AddNodeToNetWithElemType(New<SparseInputValue<ElemType>>(net.GetDeviceId(), inputName, imageLayout, dynamicAxisName));
|
||||
return net.AddNodeToNetWithElemType(New<SparseInputValue<ValueType>>(net.GetDeviceId(), inputName, imageLayout, dynamicAxisName));
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
|
@ -318,6 +334,12 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Creat
|
|||
{
|
||||
return net.AddNodeToNetWithElemType(New<ReconcileDynamicAxisNode<ElemType>>(net.GetDeviceId(), nodeName));
|
||||
}
|
||||
template <class ElemType>
|
||||
template <class InputNodeType>
|
||||
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateCastNode(const std::wstring& nodeName)
|
||||
{
|
||||
return net.AddNodeToNetWithElemType(New<CastNode<ElemType, InputNodeType>>(net.GetDeviceId(), nodeName));
|
||||
}
|
||||
|
||||
// this is the catch-all for all cases not covered as special cases above
|
||||
// Unlike the specialized ones above, this one creates nodes by type given as a string.
|
||||
|
@ -997,4 +1019,37 @@ template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<half>::Typ
|
|||
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<half>::TypedCreateLearnableParameter<double>(const std::wstring& paramName, const TensorShape& tensorShape);
|
||||
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<half>::TypedCreateLearnableParameter<half>(const std::wstring& paramName, const TensorShape& tensorShape);
|
||||
|
||||
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<float>::TypedCreateInputNode<float>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<float>::TypedCreateInputNode<double>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<float>::TypedCreateInputNode<half>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<double>::TypedCreateInputNode<float>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<double>::TypedCreateInputNode<double>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<double>::TypedCreateInputNode<half>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<half>::TypedCreateInputNode<float>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<half>::TypedCreateInputNode<double>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<half>::TypedCreateInputNode<half>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
|
||||
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<float>::TypedCreateSparseInputNode<float>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<float>::TypedCreateSparseInputNode<double>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<float>::TypedCreateSparseInputNode<half>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<double>::TypedCreateSparseInputNode<float>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<double>::TypedCreateSparseInputNode<double>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<double>::TypedCreateSparseInputNode<half>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<half>::TypedCreateSparseInputNode<float>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<half>::TypedCreateSparseInputNode<double>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<half>::TypedCreateSparseInputNode<half>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
|
||||
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<float>::CreateCastNode<half>(const std::wstring& nodeName);
|
||||
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<float>::CreateCastNode<double>(const std::wstring& nodeName);
|
||||
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<double>::CreateCastNode<half>(const std::wstring& nodeName);
|
||||
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<double>::CreateCastNode<float>(const std::wstring& nodeName);
|
||||
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<half>::CreateCastNode<float>(const std::wstring& nodeName);
|
||||
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<half>::CreateCastNode<double>(const std::wstring& nodeName);
|
||||
|
||||
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<float>::NewNode2<half>(const std::wstring& nodeName, DEVICEID_TYPE deviceId, const wstring& name);
|
||||
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<float>::NewNode2<double>(const std::wstring& nodeName, DEVICEID_TYPE deviceId, const wstring& name);
|
||||
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<double>::NewNode2<half>(const std::wstring& nodeName, DEVICEID_TYPE deviceId, const wstring& name);
|
||||
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<double>::NewNode2<float>(const std::wstring& nodeName, DEVICEID_TYPE deviceId, const wstring& name);
|
||||
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<half>::NewNode2<float>(const std::wstring& nodeName, DEVICEID_TYPE deviceId, const wstring& name);
|
||||
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<half>::NewNode2<double>(const std::wstring& nodeName, DEVICEID_TYPE deviceId, const wstring& name);
|
||||
}}}
|
||||
|
|
|
@ -38,6 +38,8 @@ public:
|
|||
// TODO: move into a separate header/class, to decouple from this class which would then be only used by old NDL and SimpleNetworkBuilder.
|
||||
static ComputationNodePtr NewStandardNode(const std::wstring& nodeType, DEVICEID_TYPE deviceId, const wstring& name);
|
||||
static ComputationNodePtr NewNode(const std::wstring& nodeType, DEVICEID_TYPE deviceId, const wstring& name);
|
||||
template <class ElemType2>
|
||||
static ComputationNodePtr NewNode2(const std::wstring& nodeType, DEVICEID_TYPE deviceId, const wstring& name);
|
||||
|
||||
// The following functions create nodes and add them to the net, but don't attach inputs (some don't have inputs).
|
||||
// There are special versions for nodes with custom constructors, and a catch-all, CreateComputationNode(), for all others.
|
||||
|
@ -53,12 +55,25 @@ public:
|
|||
template<class ValueType>
|
||||
shared_ptr<ComputationNode<ValueType>> TypedCreateLearnableParameter(const std::wstring& paramName, const TensorShape& tensorShape); // V2
|
||||
|
||||
template <class InputNodeType>
|
||||
shared_ptr<ComputationNode<ElemType>> CreateCastNode(const std::wstring& nodeName);
|
||||
|
||||
// sparse matrix size is optionally specified
|
||||
// ComputationNodePtr CreateSparseLearnableParameter(const std::wstring & paramName, const size_t rows, const size_t cols, const size_t size = 0);
|
||||
ComputationNodePtr CreateInputNode(const std::wstring& inputName, const size_t rows, const wstring& dynamicAxisName = L"");
|
||||
ComputationNodePtr CreateSparseInputNode(const std::wstring& inputName, const size_t rows, const wstring& dynamicAxisName = L"");
|
||||
ComputationNodePtr CreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName = L"");
|
||||
ComputationNodePtr CreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName = L"");
|
||||
shared_ptr<ComputationNode<ElemType>> CreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName = L"")
|
||||
{
|
||||
return this->template TypedCreateInputNode<ElemType>(inputName, sampleLayout, dynamicAxisName);
|
||||
}
|
||||
template<class ValueType>
|
||||
shared_ptr<ComputationNode<ValueType>> TypedCreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
shared_ptr<ComputationNode<ElemType>> CreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName = L"")
|
||||
{
|
||||
return this->template TypedCreateSparseInputNode<ElemType>(inputName, sampleLayout, dynamicAxisName);
|
||||
}
|
||||
template<class ValueType>
|
||||
shared_ptr<ComputationNode<ValueType>> TypedCreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
|
||||
ComputationNodePtr CreateConvolutionNode(const std::wstring& nodeName, const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& strideShape,
|
||||
const std::vector<bool>& sharing, const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
|
||||
bool transpose, const TensorShape& outputShape, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples);
|
||||
|
|
|
@@ -61,7 +61,8 @@
#define CNTK_MODEL_VERSION_28 28 // Padding op
#define CNTK_MODEL_VERSION_29 29 // Expose StopGradient in BS
#define CNTK_MODEL_VERSION_30 30 // LatticeWithSequenceSoftmax node
#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_30
#define CNTK_MODEL_VERSION_31 31 // Cast node
#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_31

// helper mode for debugging
// If TRACK_GAP_NANS is defined then initialize layout gaps to NaN and do NaN checks. Also do detailed logging of node computations.

@@ -95,6 +96,7 @@ struct /*interface*/ IComputationNode
// TODO: OperationName calls static TypeName which does not match the actual type names in that the 'Node' is missing.
virtual const std::wstring OperationName() const = 0;
#define OperationNameOf(T) (T<float>::TypeName()) // convenience macro
#define OperationName2Of(T) (T<double,float>::TypeName()) // convenience macro

virtual void UpdateFunctionMBSize() = 0; // recalculate our column dimensions from MBLayout. Override to update temps.
@ -4,6 +4,9 @@
|
|||
//
|
||||
// CNTKEval.cpp : Defines the exported functions for the CNTK DLL.
|
||||
//
|
||||
#ifndef _CRT_SECURE_NO_WARNINGS
|
||||
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
|
||||
#endif
|
||||
|
||||
#define __STDC_FORMAT_MACROS
|
||||
#include <inttypes.h>
|
||||
|
|
|
@ -646,6 +646,20 @@ ASGDHelper<ElemType>* NewASGDHelper(
|
|||
#endif
|
||||
}
|
||||
|
||||
template<> ASGDHelper<half>* NewASGDHelper<half>(
|
||||
const std::list<ComputationNodeBasePtr> & learnableNodes,
|
||||
size_t nodeNumRanks,
|
||||
bool useAsyncBuffer,
|
||||
bool isSimulatedModelAveragingSGD,
|
||||
AdjustLearningRateAtBeginning adjusttype,
|
||||
double adjustCoef,
|
||||
size_t adjustPerMinibatches,
|
||||
int traceLevel,
|
||||
int syncPerfStats)
|
||||
{
|
||||
RuntimeError("NewASGDHelper - half not supported!");
|
||||
}
|
||||
|
||||
template ASGDHelper<float>* NewASGDHelper<float>(
|
||||
const std::list<ComputationNodeBasePtr> & learnableNodes,
|
||||
size_t nodeNumRanks,
|
||||
|
|
|
@ -18,6 +18,7 @@
|
|||
#include "Matrix.h"
|
||||
#include "SimpleDistGradAggregator.h"
|
||||
#include "V2SimpleDistGradAggregator.h"
|
||||
#include "SimpleDistGradAggregatorHelper.h"
|
||||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK {
|
||||
|
||||
|
@ -46,16 +47,7 @@ void AggregateAccumulatorValuesAndUpdateEvaluation(
|
|||
}
|
||||
|
||||
// Prepare aggregator.
|
||||
std::shared_ptr<IDistGradAggregator<ElemType>> distGradAgg;
|
||||
if (Globals::UseV2Aggregator())
|
||||
distGradAgg = make_shared<V2SimpleDistGradAggregator<ElemType>>(
|
||||
mpi,
|
||||
false /*useAsyncAggregation*/,
|
||||
net->GetDeviceId(),
|
||||
0 /*syncStatsTrace*/,
|
||||
::CNTK::MPICommunicator(packThresholdSizeInBytes));
|
||||
else
|
||||
distGradAgg = make_shared<SimpleDistGradAggregator<ElemType>>(
|
||||
std::shared_ptr<IDistGradAggregator<ElemType>> distGradAgg = GetSimpleDistGradAggregator<ElemType>(
|
||||
mpi,
|
||||
false /*useAsyncAggregation*/,
|
||||
net->GetDeviceId(),
|
||||
|
|
|
@ -24,10 +24,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
for (const auto& iter : inputMatrices)
|
||||
matrices.insert(iter.second.matrix);
|
||||
for (auto& node : net->FeatureNodes())
|
||||
if (matrices.find(node->As<ComputationNode<ElemType>>()->ValuePtr()) != matrices.end())
|
||||
if (matrices.find(node->ValuePtr()) != matrices.end())
|
||||
node->NotifyFunctionValuesMBSizeModified();
|
||||
for (auto& node : net->LabelNodes())
|
||||
if (matrices.find(node->As<ComputationNode<ElemType>>()->ValuePtr()) != matrices.end())
|
||||
if (matrices.find(node->ValuePtr()) != matrices.end())
|
||||
node->NotifyFunctionValuesMBSizeModified();
|
||||
}
|
||||
|
||||
|
|
|
@ -150,7 +150,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
}
|
||||
|
||||
virtual void OnEpochEnd(const std::list<ComputationNodeBasePtr>& LearnableNodes,
|
||||
std::list<Matrix<ElemType>>& smoothedGradient,
|
||||
std::list<MatrixBasePtr>& smoothedGradients,
|
||||
size_t samplesSinceLastSync
|
||||
)
|
||||
{
|
||||
|
@ -165,7 +165,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
if (read2sync)
|
||||
{
|
||||
m_numSyncPerformed++;
|
||||
ModelAggregationProcessing(samplesSinceLastSync, LearnableNodes, smoothedGradient, totalSamplesProcessed, secondsOnCommunication);
|
||||
ModelAggregationProcessing(samplesSinceLastSync, LearnableNodes, smoothedGradients, totalSamplesProcessed, secondsOnCommunication);
|
||||
m_perfReporter.OnMAPerformed(samplesSinceLastSync, totalSamplesProcessed, secondsOnCommunication);
|
||||
}
|
||||
|
||||
|
@ -175,7 +175,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
|
||||
virtual bool OnArrivingAtSyncPoint(
|
||||
const std::list<ComputationNodeBasePtr>& LearnableNodes, /* input/output: */
|
||||
std::list<Matrix<ElemType>>& smoothedGradient, /* input/output: under some setup, it will reset to zero*/
|
||||
std::list<MatrixBasePtr>& smoothedGradients, /* input/output: under some setup, it will reset to zero*/
|
||||
size_t samplesSinceLastSync /* input: samples processed since last sync on this worker only */
|
||||
)
|
||||
{
|
||||
|
@ -190,7 +190,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
if (read2Sync)
|
||||
{
|
||||
m_numSyncPerformed++;
|
||||
ModelAggregationProcessing(samplesSinceLastSync, LearnableNodes, smoothedGradient, totalSamplesProcessed, secondsOnCommunication);
|
||||
ModelAggregationProcessing(samplesSinceLastSync, LearnableNodes, smoothedGradients, totalSamplesProcessed, secondsOnCommunication);
|
||||
m_perfReporter.OnMAPerformed(samplesSinceLastSync, totalSamplesProcessed, secondsOnCommunication);
|
||||
}
|
||||
return read2Sync;
|
||||
|
@ -199,7 +199,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
virtual void ModelAggregationProcessing(
|
||||
size_t samplesSinceLastSync, /* in: */
|
||||
const std::list<ComputationNodeBasePtr>& learnableNodes, /* in/out */
|
||||
std::list<Matrix<ElemType>>& smoothedGradient, /* in/out */
|
||||
std::list<MatrixBasePtr>& smoothedGradients, /* in/out */
|
||||
size_t& totalSamplesProcessed, /* out */
|
||||
float& secondsOnCommunication /* out */) = 0;
|
||||
|
||||
|
@ -346,7 +346,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
void ModelAggregationProcessing(
|
||||
size_t samplesSinceLastSync, /* in */
|
||||
const std::list<ComputationNodeBasePtr>& learnableNodes, /* in/out */
|
||||
std::list<Matrix<ElemType>>& smoothedGradient, /* in/out */
|
||||
std::list<MatrixBasePtr>& smoothedGradients, /* in/out */
|
||||
size_t& totalSamplesProcessed, /* out */
|
||||
float& secondsOnCommunication /* out */) override
|
||||
// NOTE: the variable type is determined by the interface in SGD::TrainOneEpoch
|
||||
|
|
|
@ -31,6 +31,7 @@
|
|||
#include "ASGDHelper.h"
|
||||
|
||||
#include "CNTKLibraryInternals.h"
|
||||
#include "SimpleDistGradAggregatorHelper.h"
|
||||
#include "SimpleDistGradAggregator.h"
|
||||
#include "V2SimpleDistGradAggregator.h"
|
||||
#include "ProgressTracing.h"
|
||||
|
@ -47,8 +48,10 @@ using namespace std;
|
|||
// class SGD
|
||||
// =======================================================================
|
||||
|
||||
template SGD<half>::SGD(const ConfigParameters&);
|
||||
template SGD<float>::SGD(const ConfigParameters&);
|
||||
template SGD<double>::SGD(const ConfigParameters&);
|
||||
template SGD<half>::SGD(const ScriptableObjects::IConfigRecord&);
|
||||
template SGD<float>::SGD(const ScriptableObjects::IConfigRecord&);
|
||||
template SGD<double>::SGD(const ScriptableObjects::IConfigRecord&);
|
||||
|
||||
|
@ -223,6 +226,11 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
|
|||
}
|
||||
}
|
||||
|
||||
if (criterionNodes.front()->template Is<ComputationNode<half>>())
|
||||
{
|
||||
InvalidArgument("TrainOrAdaptModel: using Float16 for loss function may cause overflow, please cast to float.");
|
||||
}
|
||||
|
||||
// This code is only relevant for the new (V2) readers. It exists because of
|
||||
// a shortcoming in DecimateMinibatchInPlace, which does not yet work when inputs
|
||||
// in the same minibatch have different layouts, which is something only V2 readers can
|
||||
|
@ -333,7 +341,7 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
|
|||
// initializing weights and gradient holder
|
||||
// only one criterion so far TODO: support multiple ones?
|
||||
auto& learnableNodes = net->LearnableParameterNodes(criterionNodes[0]);
|
||||
list<Matrix<ElemType>> smoothedGradients;
|
||||
list<MatrixBasePtr> smoothedGradients;
|
||||
vector<double> smoothedCounts; // currently used by FSAdaGradUpdate()
|
||||
size_t numParameters = 0;
|
||||
|
||||
|
@@ -344,9 +352,30 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
// Note: We don't actually need the smoothedGradients if !IsParameterUpdateRequired().
// However, this is hard to fix since lots of code assumes smoothedGradients to be in the same order as learnableNodes.
// V2 API fixes this.
smoothedGradients.push_back(Matrix<ElemType>(node->Value().GetNumRows(),
node->Value().GetNumCols(),
net->GetDeviceId()));
MatrixBasePtr smoothedGradientPtr;
size_t numRows = node->Value().GetNumRows();
size_t numCols = node->Value().GetNumCols();
if (std::is_same<ElemType, half>())
{
// For half parameters, we use float smoothed gradients
// Allocate 3 times the size for casting parameter and gradients to float
const size_t c_smoothed_gradients_factor = 3;
shared_ptr<Matrix<float>> compoundMatrixPtr = std::make_shared<Matrix<float>>(numRows,
numCols * c_smoothed_gradients_factor,
net->GetDeviceId());
// Initialize float parameters
auto parameterMatrix = compoundMatrixPtr->ColumnSlice(2 * numCols, numCols);
parameterMatrix.CastAssignValuesOf(node->Value());

smoothedGradientPtr = compoundMatrixPtr;
}
else
{
smoothedGradientPtr = std::make_shared<Matrix<ElemType>>(numRows,
numCols,
net->GetDeviceId());
}
smoothedGradients.push_back(smoothedGradientPtr);
smoothedCounts.push_back(0);
if (node->IsParameterUpdateRequired())
{
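Illustrative sketch (not part of this change): for half parameters the smoothedGradients entry allocated above is a single fp32 "compound" matrix of 3 * numCols columns. This diff only shows column block 0 (the smoothed gradient, reset after model aggregation) and block 2 (an fp32 master copy of the weights, kept in sync via CastAssignValuesOf) being accessed; the purpose of block 1 is not visible here and is assumed to be fp32 scratch for the gradient during the update. A minimal C++ sketch of that layout, reusing the variable names from the loop above.

// Sketch only: column-block layout of the per-parameter compound Matrix<float> for half training.
shared_ptr<Matrix<float>> compound = std::make_shared<Matrix<float>>(numRows, 3 * numCols, net->GetDeviceId());
auto smoothedGradient = compound->ColumnSlice(0 * numCols, numCols); // block 0: smoothed (momentum) gradient
auto gradientScratch  = compound->ColumnSlice(1 * numCols, numCols); // block 1: assumed fp32 gradient scratch (not shown in this diff)
auto masterWeights    = compound->ColumnSlice(2 * numCols, numCols); // block 2: fp32 master copy of the half weights
masterWeights.CastAssignValuesOf(node->Value());                     // cast the half parameter into the fp32 master copy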
@ -987,7 +1016,7 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
|
|||
const std::vector<ComputationNodeBasePtr>& evaluationNodes,
|
||||
StreamMinibatchInputs* inputMatrices, // TODO: why is this a pointer?
|
||||
const std::list<ComputationNodeBasePtr>& learnableNodes,
|
||||
std::list<Matrix<ElemType>>& smoothedGradients, vector<double>& smoothedCounts,
|
||||
std::list<MatrixBasePtr>& smoothedGradients, vector<double>& smoothedCounts,
|
||||
/*out*/ EpochCriterion& epochCriterion,
|
||||
/*out*/ std::vector<EpochCriterion>& epochEvalErrors,
|
||||
const std::string& prefixMsg,
|
||||
|
@@ -1389,7 +1418,25 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
if (node->IsParameterUpdateRequired())
{
#ifdef _DEBUG
if (smoothedGradientIter->HasNan("TrainOneEpoch/UpdateWeights(): "))
bool hasNan = false;
if (std::is_same<ElemType, half>())
{
// Get matrix from compound matrix
auto compoundMatrixPtr = dynamic_pointer_cast<Matrix<float>>(*smoothedGradientIter);
if (compoundMatrixPtr)
{
size_t numCols = dynamic_pointer_cast<ComputationNode<ElemType>>(node)->Value().GetNumCols();

auto smoothedGradient = compoundMatrixPtr->ColumnSlice(0, numCols);
hasNan = smoothedGradient.HasNan("TrainOneEpoch/UpdateWeights(): ");
}
}
else
{
auto smoothedGradient = dynamic_pointer_cast<Matrix<ElemType>>(*smoothedGradientIter);
hasNan = smoothedGradient && smoothedGradient->HasNan("TrainOneEpoch/UpdateWeights(): ");
}
if (hasNan)
LogicError("%ls %ls operation has NaNs in smoothedGradient.", node->NodeName().c_str(), node->OperationName().c_str());
#endif
double nodeDependentLearningRatePerSample = learnRatePerSample * node->GetLearningRateMultiplier();
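As an illustration of the debug-only NaN scan above (a standalone sketch, not the CNTK Matrix API), only the first cols-wide block of the compound buffer needs to be checked:

    #include <cmath>
    #include <cstddef>

    // Return true if any value in the first `cols` columns of a column-major
    // rows x totalCols buffer is NaN; the scratch-gradient and master-weight blocks are skipped.
    bool HasNanInFirstBlock(const float* data, size_t rows, size_t totalCols, size_t cols)
    {
        (void)totalCols;
        for (size_t i = 0; i < rows * cols; ++i)
            if (std::isnan(data[i]))
                return true;
        return false;
    }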
@@ -1811,7 +1858,7 @@ double SGD<ElemType>::SearchForBestLearnRate(ComputationNetworkPtr net,
const std::vector<ComputationNodeBasePtr>& evaluationNodes,
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients, vector<double> smoothedCounts,
std::list<MatrixBasePtr>& smoothedGradients, vector<double> smoothedCounts,
const bool learnRateInitialized,
const double largestPrevLearnRatePerSample)
{
@@ -1985,7 +2032,7 @@ size_t SGD<ElemType>::AdaptiveMinibatchSizing(ComputationNetworkPtr net,
const std::vector<ComputationNodeBasePtr>& evaluationNodes,
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients, vector<double> smoothedCounts,
std::list<MatrixBasePtr>& smoothedGradients, vector<double> smoothedCounts,
const double learningRateAdjustmentFactor)
{
size_t minMinibatchSize = initialMinibatchSize;
@@ -2086,7 +2133,7 @@ size_t SGD<ElemType>::SearchForBestMinibatchSize(ComputationNetworkPtr net,
const std::vector<ComputationNodeBasePtr>& evaluationNodes,
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients, std::vector<double> smoothedCounts,
std::list<MatrixBasePtr>& smoothedGradients, std::vector<double> smoothedCounts,
const size_t minMinibatchSize, const size_t maxMinibatchSize)
{
// may happen for automatically reduced learning rates
@@ -2190,7 +2237,7 @@ void SGD<ElemType>::TrainOneMiniEpochAndReloadModel(ComputationNetworkPtr net,
const std::vector<ComputationNodeBasePtr>& evaluationNodes,
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients, vector<double> smoothedCounts,
std::list<MatrixBasePtr>& smoothedGradients, vector<double> smoothedCounts,
/*out*/ EpochCriterion& epochCriterion,
/*out*/ std::vector<EpochCriterion>& epochEvalErrors,
std::string prefixMsg,
@@ -2264,6 +2311,24 @@ void SGD<ElemType>::AttemptUtteranceDerivativeFeatures(ComputationNetworkPtr net
}
}

template <class ElemType>
std::shared_ptr<IDistGradAggregator<ElemType>> _GetAllReduceDistGradAggregator(const MPIWrapperPtr& mpi, int nBits, bool zeroThresholdFor1Bit, bool useAsyncAggregation, int traceLevel, int syncStatsTrace)
{
if (Globals::UseV2Aggregator())
{
auto communicator = ::CNTK::QuantizedMPICommunicator(zeroThresholdFor1Bit, true /*useQuantizationForSelfStripe*/, nBits);
return std::make_shared<V2AllReduceDistGradAggregator<ElemType>>(communicator, useAsyncAggregation, traceLevel, syncStatsTrace);
}
else
return std::make_shared<AllReduceDistGradAggregator<ElemType>>(mpi, nBits, zeroThresholdFor1Bit, true /*useQuantizationForSelfStripe*/, useAsyncAggregation, traceLevel, syncStatsTrace);
}

template <>
std::shared_ptr<IDistGradAggregator<half>> _GetAllReduceDistGradAggregator<half>(const MPIWrapperPtr& mpi, int nBits, bool zeroThresholdFor1Bit, bool useAsyncAggregation, int traceLevel, int syncStatsTrace)
{
RuntimeError("SGD - half not supported for quantization!");
}

template <class ElemType>
void SGD<ElemType>::InitDistGradAgg(int numEvalNodes, int numGradientBits, int deviceId, int traceLevel)
{
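The pattern above — a generic factory plus a full specialization for half that only raises an error — can be sketched in isolation like this (illustrative only; Aggregator, QuantizedAggregator and the exception are local stand-ins, not the CNTK types):

    #include <memory>
    #include <stdexcept>

    struct half { unsigned short bits; };   // stand-in for the real half type

    struct Aggregator { virtual ~Aggregator() = default; };
    template <class ElemType> struct QuantizedAggregator : Aggregator {};

    // Generic path: quantized aggregation is available.
    template <class ElemType>
    std::shared_ptr<Aggregator> MakeQuantizedAggregator(int nBits)
    {
        (void)nBits;
        return std::make_shared<QuantizedAggregator<ElemType>>();
    }

    // Full specialization: half is rejected at run time with a clear message.
    template <>
    std::shared_ptr<Aggregator> MakeQuantizedAggregator<half>(int /*nBits*/)
    {
        throw std::runtime_error("half not supported for quantization");
    }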
@@ -2274,13 +2339,7 @@ void SGD<ElemType>::InitDistGradAgg(int numEvalNodes, int numGradientBits, int d
if (traceLevel > 0)
fprintf(stderr, "Initializing dataParallelSGD for %d-bit quantization.\n", numGradientBits);
#ifdef CNTK_PARALLEL_TRAINING_SUPPORT
if (Globals::UseV2Aggregator())
{
auto communicator = ::CNTK::QuantizedMPICommunicator(m_zeroThresholdFor1Bit, true, numGradientBits);
m_distGradAgg = std::make_shared<V2AllReduceDistGradAggregator<ElemType>>(communicator, m_bufferedAsyncGradientAggregation, traceLevel, m_syncStatsTrace);
}
else
m_distGradAgg = std::make_shared<AllReduceDistGradAggregator<ElemType>>(m_mpi, numGradientBits, m_zeroThresholdFor1Bit, true /*useQuantizationForSelfStripe*/, m_bufferedAsyncGradientAggregation, traceLevel, m_syncStatsTrace);
m_distGradAgg = _GetAllReduceDistGradAggregator<ElemType>(m_mpi, numGradientBits, m_zeroThresholdFor1Bit, m_bufferedAsyncGradientAggregation, traceLevel, m_syncStatsTrace);
#else
RuntimeError("Gradient quantization is unsupported in CNTK binaries built without quantized gradient aggregation support!");
#endif // !CNTK_PARALLEL_TRAINING_SUPPORT
@@ -2289,15 +2348,38 @@ void SGD<ElemType>::InitDistGradAgg(int numEvalNodes, int numGradientBits, int d
{
if (traceLevel > 0)
fprintf(stderr, "Initializing dataParallelSGD with FP%d aggregation.\n", numGradientBits);
if (Globals::UseV2Aggregator()) // Currently used to check V2 against baselines.
m_distGradAgg = std::make_shared<V2SimpleDistGradAggregator<ElemType>>(m_mpi, m_bufferedAsyncGradientAggregation, deviceId, m_syncStatsTrace, ::CNTK::MPICommunicator(m_packThresholdSizeInBytes, m_useFP16AllReduce));
else
m_distGradAgg = std::make_shared<SimpleDistGradAggregator<ElemType>>(m_mpi, m_bufferedAsyncGradientAggregation, deviceId, m_syncStatsTrace, m_packThresholdSizeInBytes);
m_distGradAgg = GetSimpleDistGradAggregator<ElemType>(m_mpi, m_bufferedAsyncGradientAggregation, deviceId, m_syncStatsTrace, m_packThresholdSizeInBytes, m_useFP16AllReduce);
}

m_gradHeader.reset(DistGradHeader::Create(numEvalNodes), [](DistGradHeader* ptr) { DistGradHeader::Destroy(ptr); });
}

template <class ElemType>
shared_ptr<IMASGD<ElemType>> _GetBlockMomentumSGD(const MPIWrapperPtr& mpi, size_t traceLevel, DEVICEID_TYPE devID, bool useNesterovBlockMomentum, bool resetSGDMomentum, double blockLearningRate, double blockMomentumAsTimeConstant, size_t modelAggregationBlockSize)
{
assert(!Globals::UseV2Aggregator());
return make_shared<BlockMomentumSGD<ElemType>>(mpi, traceLevel, devID, useNesterovBlockMomentum, resetSGDMomentum, blockLearningRate, blockMomentumAsTimeConstant, modelAggregationBlockSize);
}

template <>
shared_ptr<IMASGD<half>> _GetBlockMomentumSGD<half>(const MPIWrapperPtr& mpi, size_t traceLevel, DEVICEID_TYPE devID, bool useNesterovBlockMomentum, bool resetSGDMomentum, double blockLearningRate, double blockMomentumAsTimeConstant, size_t modelAggregationBlockSize)
{
assert(!Globals::UseV2Aggregator());
RuntimeError("SGD - half not supported when useV2Aggregator is false!");
}

template <class ElemType>
shared_ptr<IMASGD<ElemType>> _GetBasicModelAveragingSGD(const MPIWrapperPtr& mpi, size_t traceLevel, DEVICEID_TYPE devID)
{
return make_shared<BasicModelAveragingSGD<ElemType>>(mpi, traceLevel, devID);
}

template <>
shared_ptr<IMASGD<half>> _GetBasicModelAveragingSGD<half>(const MPIWrapperPtr& mpi, size_t traceLevel, DEVICEID_TYPE devID)
{
RuntimeError("SGD - half not supported for modelAveragingSGD");
}

template <class ElemType>
void SGD<ElemType>::InitModelAggregationHandler(int traceLevel, DEVICEID_TYPE devID)
{
@@ -2307,7 +2389,7 @@ void SGD<ElemType>::InitModelAggregationHandler(int traceLevel, DEVICEID_TYPE de
}
if (GetParallelizationMethod() == ParallelizationMethod::modelAveragingSGD)
{
m_pMASGDHelper = make_shared<BasicModelAveragingSGD<ElemType>>(m_mpi, traceLevel, devID);
m_pMASGDHelper = _GetBasicModelAveragingSGD<ElemType>(m_mpi, traceLevel, devID);
}
else if (GetParallelizationMethod() == ParallelizationMethod::blockMomentumSGD)
{
@@ -2329,7 +2411,7 @@ void SGD<ElemType>::InitModelAggregationHandler(int traceLevel, DEVICEID_TYPE de
m_modelAggregationBlockSize);
}
else
m_pMASGDHelper = make_shared<BlockMomentumSGD<ElemType>>(m_mpi, traceLevel, devID,
m_pMASGDHelper = _GetBlockMomentumSGD<ElemType>(m_mpi, traceLevel, devID,
m_useNesterovBlockMomentum, m_resetSGDMomentum,
m_blockLearningRate, m_blockMomentumAsTimeConstant,
m_modelAggregationBlockSize);
@@ -2341,6 +2423,47 @@ void SGD<ElemType>::InitModelAggregationHandler(int traceLevel, DEVICEID_TYPE de
// UpdateWeights() - actual weight update, implementing various update rules
template <class ElemType>
void SGD<ElemType>::UpdateWeights(Matrix<ElemType>& functionValues, Matrix<ElemType>& gradientValues,
MatrixBasePtr& smoothedGradientValues, double& smoothedCount,
const double learnRatePerSample, const double momentumPerSample,
size_t actualMBSize,
const double L2RegWeight, const double L1RegWeight,
const bool needAveMultiplier,
const bool useNesterovMomentum) const
{
if (std::is_same<ElemType, half>())
{
// Get matrix from compound matrix
auto compoundMatrixPtr = dynamic_pointer_cast<Matrix<float>>(smoothedGradientValues);
size_t numCols = functionValues.GetNumCols();

auto smoothedGradientMatrix = compoundMatrixPtr->ColumnSlice(0, numCols);
auto tempGradientMatrix = compoundMatrixPtr->ColumnSlice(numCols, numCols);
auto parameterMatrix = compoundMatrixPtr->ColumnSlice(2 * numCols, numCols);

// Cast gradients to float
tempGradientMatrix.CastAssignValuesOf(gradientValues);

// Update
TypedUpdateWeights<float>(parameterMatrix, tempGradientMatrix, smoothedGradientMatrix, smoothedCount,
learnRatePerSample, momentumPerSample, actualMBSize, L2RegWeight, L1RegWeight,
needAveMultiplier, useNesterovMomentum);

// Cast parameter back to half
functionValues.CastAssignValuesOf(parameterMatrix);
}
else
{
auto sgv = dynamic_pointer_cast<Matrix<ElemType>>(smoothedGradientValues);
TypedUpdateWeights<>(functionValues, gradientValues, *sgv, smoothedCount,
learnRatePerSample, momentumPerSample, actualMBSize, L2RegWeight, L1RegWeight,
needAveMultiplier, useNesterovMomentum);
}
}

template <class ElemType1>
template <class ElemType>
void SGD<ElemType1>::TypedUpdateWeights(Matrix<ElemType>& functionValues, Matrix<ElemType>& gradientValues,
Matrix<ElemType>& smoothedGradientValues, double& smoothedCount,
const double learnRatePerSample, const double momentumPerSample,
size_t actualMBSize,
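A self-contained sketch of the mixed-precision update path shown above, with plain float arrays standing in for the Matrix column slices (the momentum rule here is simplified SGD-with-momentum, not the full CNTK update):

    #include <cstddef>
    #include <vector>

    // One step of momentum SGD on a float master copy of half parameters.
    // grad:      this minibatch's gradient already cast to float ("tempGradientMatrix")
    // smoothed:  running momentum buffer ("smoothedGradientMatrix")
    // master:    float master weights ("parameterMatrix")
    // halfParams: the half copy (modelled as floats here) refreshed from the master at the end
    void MixedPrecisionUpdate(std::vector<float>& halfParams,
                              const std::vector<float>& grad,
                              std::vector<float>& smoothed,
                              std::vector<float>& master,
                              float learnRate, float momentum)
    {
        for (size_t i = 0; i < master.size(); ++i)
        {
            smoothed[i] = momentum * smoothed[i] + (1.0f - momentum) * grad[i];
            master[i] -= learnRate * smoothed[i];
            halfParams[i] = master[i]; // plays the role of CastAssignValuesOf back to half
        }
    }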
@@ -2363,7 +2486,7 @@ void SGD<ElemType>::UpdateWeights(Matrix<ElemType>& functionValues, Matrix<ElemT
assert(actualMBSize > 0);

// clipping gradients to prevent outliers
ClipGradient(gradientValues, actualMBSize);
ClipGradient<ElemType>(gradientValues, actualMBSize);

GradientsUpdateType adpType = GradUpdateType();
double noiseStd = GradientUpdateNoiseStd();
@@ -2453,8 +2576,9 @@ void SGD<ElemType>::UpdateWeights(Matrix<ElemType>& functionValues, Matrix<ElemT
}

// protected:
template <class ElemType1>
template <class ElemType>
void SGD<ElemType>::ClipGradient(Matrix<ElemType>& gradient, const size_t actualMBSize) const
void SGD<ElemType1>::ClipGradient(Matrix<ElemType>& gradient, const size_t actualMBSize) const
{
if (m_clippingThresholdPerSample != std::numeric_limits<double>::infinity())
{
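For reference, a minimal sketch of per-sample norm clipping in the spirit of ClipGradient (standalone, operating on a flat float gradient rather than the CNTK Matrix type; the exact clipping rule in CNTK may differ):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Scale the gradient so that its L2 norm does not exceed thresholdPerSample * actualMBSize.
    void ClipGradientByNorm(std::vector<float>& gradient, size_t actualMBSize, double thresholdPerSample)
    {
        double maxNorm = thresholdPerSample * actualMBSize; // the threshold is expressed per sample
        double sumSq = 0.0;
        for (float g : gradient)
            sumSq += static_cast<double>(g) * g;
        double norm = std::sqrt(sumSq);
        if (norm > maxNorm)
        {
            float scale = static_cast<float>(maxNorm / norm);
            for (float& g : gradient)
                g *= scale;
        }
    }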
@@ -2474,10 +2598,30 @@ void SGD<ElemType>::ClipGradient(Matrix<ElemType>& gradient, const size_t actual
}
}

template <class ElemType>
static void SaveSmoothedGradient(File& fstream, MatrixBasePtr& smoothedGradient)
{
auto smoothedGradientPtr = dynamic_pointer_cast<Matrix<ElemType>>(smoothedGradient);
if (!smoothedGradientPtr)
RuntimeError("Failed to cast, type mismatch");
const Matrix<ElemType>& smoothedGradientValues = *smoothedGradientPtr;
fstream << smoothedGradientValues;
}

template <class ElemType>
static void LoadSmoothedGradient(File& fstream, MatrixBasePtr& smoothedGradient)
{
auto smoothedGradientPtr = dynamic_pointer_cast<Matrix<ElemType>>(smoothedGradient);
if (!smoothedGradientPtr)
RuntimeError("Failed to cast, type mismatch");
Matrix<ElemType>& smoothedGradientValues = *smoothedGradientPtr;
fstream >> smoothedGradientValues;
}

template <class ElemType>
void SGD<ElemType>::SaveCheckPointInfo(const size_t epoch, const size_t totalSamplesSeen,
const double learnRatePerSample,
const std::list<Matrix<ElemType>>& smoothedGradients,
const std::list<MatrixBasePtr>& smoothedGradients,
const std::vector<double>& smoothedCounts,
const double prevCriterion,
const size_t minibatchSize)
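A small sketch of the idea behind SaveSmoothedGradient/LoadSmoothedGradient — verify the runtime element type behind the base pointer before serializing (illustrative; a plain struct hierarchy and iostreams are used instead of the CNTK Matrix and File classes). In the checkpoint loop, a half model dispatches with ElemType = float because its smoothed state is stored in float.

    #include <iostream>
    #include <memory>
    #include <stdexcept>
    #include <vector>

    struct MatrixBase { virtual ~MatrixBase() = default; };
    template <class T> struct DenseMatrix : MatrixBase { std::vector<T> values; };
    using MatrixBasePtr = std::shared_ptr<MatrixBase>;

    // Serialize the buffer, failing loudly if the stored type does not match the expected one.
    template <class ElemType>
    void SaveSmoothedGradient(std::ostream& out, const MatrixBasePtr& smoothedGradient)
    {
        auto typed = std::dynamic_pointer_cast<DenseMatrix<ElemType>>(smoothedGradient);
        if (!typed)
            throw std::runtime_error("Failed to cast, type mismatch");
        out.write(reinterpret_cast<const char*>(typed->values.data()),
                  typed->values.size() * sizeof(ElemType));
    }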
@@ -2510,10 +2654,12 @@ void SGD<ElemType>::SaveCheckPointInfo(const size_t epoch, const size_t totalSam

fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BGradient");

for (auto smoothedGradientIter = smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++)
for (auto smoothedGradient : smoothedGradients)
{
const Matrix<ElemType>& smoothedGradientValues = *smoothedGradientIter;
fstream << smoothedGradientValues;
if (std::is_same<ElemType, half>())
SaveSmoothedGradient<float>(fstream, smoothedGradient);
else
SaveSmoothedGradient<ElemType>(fstream, smoothedGradient);
}

fstream.PutMarker(FileMarker::fileMarkerEndSection, L"EGradient");
@@ -2553,7 +2699,7 @@ template <class ElemType>
bool SGD<ElemType>::TryLoadCheckPointInfo(const size_t epochNumber,
/*out*/ size_t& totalSamplesSeen,
/*out*/ double& learnRatePerSample,
std::list<Matrix<ElemType>>& smoothedGradients,
std::list<MatrixBasePtr>& smoothedGradients,
std::vector<double>& smoothedCounts,
/*out*/ double& prevCriterion,
/*out*/ size_t& minibatchSize)
@@ -2582,7 +2728,7 @@ template <class ElemType>
void SGD<ElemType>::LoadCheckPointInfo(const size_t epochNumber,
/*out*/ size_t& totalSamplesSeen,
/*out*/ double& learnRatePerSample,
std::list<Matrix<ElemType>>& smoothedGradients,
std::list<MatrixBasePtr>& smoothedGradients,
std::vector<double>& smoothedCounts,
/*out*/ double& prevCriterion,
/*out*/ size_t& minibatchSize)
@@ -2600,6 +2746,9 @@ void SGD<ElemType>::LoadCheckPointInfo(const size_t epochNumber,
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EVersion");
}

if (ckpVersion > CURRENT_CNTK_CHECKPOINT_VERSION)
RuntimeError("The checkpoint file has a newer format version (%d) than this CNTK version can handle (%d).", (int)ckpVersion, (int)CURRENT_CNTK_CHECKPOINT_VERSION);

fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BCKP");

fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BLearnRate");
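A tiny sketch of this forward-compatibility gate (standalone; the real code reads the version from the checkpoint's version section and uses the CNTK error macros):

    #include <stdexcept>

    const int kCurrentCheckpointVersion = 3; // version 3 adds float smoothed gradients for half parameters

    void CheckCheckpointVersion(int fileVersion)
    {
        if (fileVersion > kCurrentCheckpointVersion)
            throw std::runtime_error("The checkpoint file has a newer format version than this build can handle.");
        // Older versions fall through; the loader continues with backward-compatible defaults.
    }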
@@ -2618,10 +2767,12 @@ void SGD<ElemType>::LoadCheckPointInfo(const size_t epochNumber,

fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BGradient");

for (auto smoothedGradientIter = smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++)
for (auto smoothedGradient : smoothedGradients)
{
Matrix<ElemType>& smoothedGradientValues = *smoothedGradientIter;
fstream >> smoothedGradientValues;
if (std::is_same<ElemType, half>())
LoadSmoothedGradient<float>(fstream, smoothedGradient);
else
LoadSmoothedGradient<ElemType>(fstream, smoothedGradient);
}
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EGradient");
@@ -2824,6 +2975,7 @@ void SGD<ElemType>::MarkDropoutNodesEvalTimeStampAsOutdated(const ComputationNet
nodeIter->SetEvalTimeStampOutdatedWrtAll();
}

template class SGD<half>;
template class SGD<float>;
template class SGD<double>;
@@ -3306,12 +3458,14 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
static size_t GetSizeOfPrecision(const ScriptableObjects::IConfigRecordPtr configp)
{
wstring precision = configp->Get(L"precision");
if (precision == L"float")
if (precision == L"float16")
return sizeof(half);
else if (precision == L"float")
return sizeof(float);
else if (precision == L"double")
return sizeof(double);
else
RuntimeError("invalid value '%ls' for 'precision', must be 'float' or 'double'", precision.c_str());
RuntimeError("invalid value '%ls' for 'precision', must be 'float16' or 'float' or 'double'", precision.c_str());
}

SGDParams::SGDParams(const ScriptableObjects::IConfigRecordPtr configp)
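A standalone sketch of this precision-to-size mapping, including the new float16 branch (plain std::string instead of the BrainScript config record, and a hard-coded 2 for sizeof(half)):

    #include <cstddef>
    #include <stdexcept>
    #include <string>

    // Map the BrainScript 'precision' value to the element size used by SGD.
    size_t GetSizeOfPrecision(const std::string& precision)
    {
        if (precision == "float16")
            return 2; // sizeof(half)
        else if (precision == "float")
            return sizeof(float);
        else if (precision == "double")
            return sizeof(double);
        throw std::invalid_argument("invalid value '" + precision + "' for 'precision', must be 'float16' or 'float' or 'double'");
    }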
@@ -25,7 +25,8 @@ using namespace std; // ugh! TODO: get rid of this from .h files!!!

#define CNTK_CHECKPOINT_VERSION_1 1 // 1 -> no version number
#define CNTK_CHECKPOINT_VERSION_2 2
#define CURRENT_CNTK_CHECKPOINT_VERSION CNTK_CHECKPOINT_VERSION_2
#define CNTK_CHECKPOINT_VERSION_3 3 // float smoothed gradients for float16/half parameters
#define CURRENT_CNTK_CHECKPOINT_VERSION CNTK_CHECKPOINT_VERSION_3

namespace CNTK { namespace Internal {
// Forward declarations.
@@ -442,7 +443,7 @@ protected:
const std::vector<ComputationNodeBasePtr>& evaluationNodes,
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients, std::vector<double> smoothedCounts,
std::list<MatrixBasePtr>& smoothedGradients, std::vector<double> smoothedCounts,
const bool learnRateInitialized,
const double largestPrevLearnRatePerSample);
@@ -458,7 +459,7 @@ protected:
const std::vector<ComputationNodeBasePtr>& evaluationNodes,
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients, std::vector<double> smoothedCounts,
std::list<MatrixBasePtr>& smoothedGradients, std::vector<double> smoothedCounts,
/*out*/ EpochCriterion& epochCriterion,
/*out*/ std::vector<EpochCriterion>& epochEvalErrors,
std::string prefixMsg,
@@ -478,7 +479,7 @@ protected:
const std::vector<ComputationNodeBasePtr>& evaluationNodes,
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients, std::vector<double> smoothedCounts,
std::list<MatrixBasePtr>& smoothedGradients, std::vector<double> smoothedCounts,
const double learningRateAdjustmentFactor);

// uses a small percentage of training data of minibatch to
@@ -496,7 +497,7 @@ protected:
const std::vector<ComputationNodeBasePtr>& evaluationNodes,
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients, std::vector<double> smoothedCounts,
std::list<MatrixBasePtr>& smoothedGradients, std::vector<double> smoothedCounts,
const size_t minMinibatchSize, const size_t maxMinibatchSize);

// Attempts to compute the error signal for the whole utterance, which will
@@ -523,7 +524,7 @@ protected:
const std::vector<ComputationNodeBasePtr>& evaluationNodes,
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients, std::vector<double>& smoothedCounts,
std::list<MatrixBasePtr>& smoothedGradients, std::vector<double>& smoothedCounts,
/*out*/ EpochCriterion& epochCriterion,
/*out*/ std::vector<EpochCriterion>& epochEvalErrors,
const std::string& prefixMsg = "",
@@ -534,26 +535,37 @@ protected:

void InitDistGradAgg(int numEvalNodes, int numGradientBits, int deviceId, int traceLevel);
void InitModelAggregationHandler(int traceLevel, DEVICEID_TYPE devID);
public:
private:
// UpdateWeights() - actual weight update, implementing various update rules
void UpdateWeights(Matrix<ElemType>& functionValues, Matrix<ElemType>& gradientValues,
Matrix<ElemType>& smoothedGradient, double& smoothedCount,
MatrixBasePtr& smoothedGradient, double& smoothedCount,
const double learnRatePerSample, const double momentumPerSample,
size_t actualMBSize,
const double L2RegWeight, const double L1RegWeight,
const bool needAveMultiplier,
const bool useNesterovMomentum) const;

template<class ElemType2 = ElemType>
void TypedUpdateWeights(Matrix<ElemType2>& functionValues, Matrix<ElemType2>& gradientValues,
Matrix<ElemType2>& smoothedGradient, double& smoothedCount,
const double learnRatePerSample, const double momentumPerSample,
size_t actualMBSize,
const double L2RegWeight, const double L1RegWeight,
const bool needAveMultiplier,
const bool useNesterovMomentum) const;
public:
// return -1 if nothing exists
int DetermineStartEpoch(const bool makeMode);

wstring GetModelNameForEpoch(const int epoch, bool bLastModel = false) const;

protected:
void ClipGradient(Matrix<ElemType>& gradient, const size_t actualMBSize) const;
template<class ElemType2 = ElemType>
void ClipGradient(Matrix<ElemType2>& gradient, const size_t actualMBSize) const;

void SaveCheckPointInfo(const size_t epoch, const size_t totalSamplesSeen, // TODO: combine totalSamplesSeen and prevCriterion into a EpochCriterion type
const double learnRatePerSample,
const std::list<Matrix<ElemType>>& smoothedGradients,
const std::list<MatrixBasePtr>& smoothedGradients,
const std::vector<double>& smoothedCounts,
const double prevCriterion,
const size_t minibatchSize);
@@ -561,14 +573,14 @@ protected:
bool TryLoadCheckPointInfo(const size_t epochNumber,
/*out*/ size_t& totalSamplesSeen,
/*out*/ double& learnRatePerSample,
std::list<Matrix<ElemType>>& smoothedGradients,
std::list<MatrixBasePtr>& smoothedGradients,
std::vector<double>& smoothedCounts,
/*out*/ double& prevCriterion,
/*out*/ size_t& minibatchSize);
void LoadCheckPointInfo(const size_t epochNumber,
/*out*/ size_t& totalSamplesSeen,
/*out*/ double& learnRatePerSample,
std::list<Matrix<ElemType>>& smoothedGradients,
std::list<MatrixBasePtr>& smoothedGradients,
std::vector<double>& smoothedCounts,
/*out*/ double& prevCriterion,
/*out*/ size_t& minibatchSize);
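A compact sketch of one plausible way such a Try/Load pairing can relate (illustrative only; not necessarily CNTK's exact semantics, and plain bool plus an exception replace the CNTK error macros):

    #include <cstddef>
    #include <stdexcept>
    #include <string>

    // Illustrative stub: report that no checkpoint file exists for this epoch.
    bool TryLoadCheckPointInfo(size_t epochNumber, double& learnRatePerSample)
    {
        (void)epochNumber; (void)learnRatePerSample;
        return false;
    }

    // Hard-load variant: a missing checkpoint becomes an error instead of a silent no-op.
    void LoadCheckPointInfo(size_t epochNumber, double& learnRatePerSample)
    {
        if (!TryLoadCheckPointInfo(epochNumber, learnRatePerSample))
            throw std::runtime_error("checkpoint for epoch " + std::to_string(epochNumber) + " not found");
    }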
@@ -137,6 +137,7 @@
<ClInclude Include="MASGD.h" />
<ClInclude Include="PostComputingActions.h" />
<ClInclude Include="SimpleDistGradAggregator.h" />
<ClInclude Include="SimpleDistGradAggregatorHelper.h" />
<ClInclude Include="SimpleEvaluator.h" />
<ClInclude Include="SimpleOutputWriter.h" />
<ClInclude Include="SGD.h" />
@@ -149,6 +150,7 @@
<ClCompile Include="PostComputingActions.cpp" />
<ClCompile Include="Profiler.cpp" />
<ClCompile Include="SGD.cpp" />
<ClCompile Include="SimpleDistGradAggregatorHelper.cpp" />
<ClCompile Include="stdafx.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
@@ -16,6 +16,9 @@
<ClCompile Include="ASGDHelper.cpp">
<Filter>Parallelization</Filter>
</ClCompile>
<ClCompile Include="SimpleDistGradAggregatorHelper.cpp">
<Filter>Parallelization</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\Common\Include\fileutil.h">
@@ -144,6 +147,9 @@
<ClInclude Include="AccumulatorAggregation.h">
<Filter>Parallelization</Filter>
</ClInclude>
<ClInclude Include="SimpleDistGradAggregatorHelper.h">
<Filter>Parallelization</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Filter Include="Common">
@@ -0,0 +1,82 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//

#pragma warning(disable : 4267) // conversion from size_t to int or other types

#include "Basics.h"
#include "MPIWrapper.h"
#include "Matrix.h"
#include "SimpleDistGradAggregatorHelper.h"
#include "DistGradHeader.h"
#include "IDistGradAggregator.h"
#include "SimpleDistGradAggregator.h"
#include "V2SimpleDistGradAggregator.h"

namespace Microsoft { namespace MSR { namespace CNTK {

template <class ElemType>
std::shared_ptr<IDistGradAggregator<ElemType>> GetSimpleDistGradAggregator(
const MPIWrapperPtr& mpi,
bool useAsyncAggregation,
int deviceId,
int syncStatsTrace,
size_t packThresholdSizeInBytes,
bool useFP16AllReduce)
{
if (Globals::UseV2Aggregator())
return std::make_shared<V2SimpleDistGradAggregator<ElemType>>(
mpi,
useAsyncAggregation,
deviceId,
syncStatsTrace,
::CNTK::MPICommunicator(packThresholdSizeInBytes, useFP16AllReduce));
else
return std::make_shared<SimpleDistGradAggregator<ElemType>>(
mpi,
useAsyncAggregation,
deviceId,
syncStatsTrace,
packThresholdSizeInBytes);
}

template <>
std::shared_ptr<IDistGradAggregator<half>> GetSimpleDistGradAggregator<half>(
const MPIWrapperPtr& mpi,
bool useAsyncAggregation,
int deviceId,
int syncStatsTrace,
size_t packThresholdSizeInBytes,
bool useFP16AllReduce)
{
if (Globals::UseV2Aggregator())
return std::make_shared<V2SimpleDistGradAggregator<half>>(
mpi,
useAsyncAggregation,
deviceId,
syncStatsTrace,
::CNTK::MPICommunicator(packThresholdSizeInBytes, useFP16AllReduce));
else
RuntimeError("SGD - half not supported when useV2Aggregator is false!");
}

template std::shared_ptr<IDistGradAggregator<float>> GetSimpleDistGradAggregator<float>(
const MPIWrapperPtr& mpi,
bool useAsyncAggregation,
int deviceId,
int syncStatsTrace,
size_t packThresholdSizeInBytes,
bool useFP16AllReduce);

template std::shared_ptr<IDistGradAggregator<double>> GetSimpleDistGradAggregator<double>(
const MPIWrapperPtr& mpi,
bool useAsyncAggregation,
int deviceId,
int syncStatsTrace,
size_t packThresholdSizeInBytes,
bool useFP16AllReduce);

}}}
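The trailing `template ...` lines above are explicit instantiations, needed because the helper is defined in a .cpp rather than a header. A minimal standalone illustration of the same pattern (generic names, not CNTK's):

    // helper.h (declaration visible to all callers)
    template <class T>
    T Twice(T value);

    // helper.cpp (definition hidden in one translation unit)
    template <class T>
    T Twice(T value) { return value + value; }

    // Explicit instantiations: without these, callers in other translation units
    // would get unresolved symbols for Twice<float> and Twice<double>.
    template float Twice<float>(float);
    template double Twice<double>(double);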
@@ -0,0 +1,24 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//

#pragma once

#include "Constants.h"
#include "IDistGradAggregator.h"

namespace Microsoft { namespace MSR { namespace CNTK {

template <class ElemType>
std::shared_ptr<IDistGradAggregator<ElemType>> GetSimpleDistGradAggregator(
const MPIWrapperPtr& mpi,
bool useAsyncAggregation,
int deviceId,
int syncStatsTrace,
size_t packThresholdSizeInBytes = DEFAULT_PACK_THRESHOLD_SIZE_IN_BYTES,
bool useFP16AllReduce = false);

}}}
@@ -5,8 +5,6 @@

#pragma once

#include "V2SimpleDistGradAggregator.h"

#include "AccumulatorAggregation.h"
#include "Basics.h"
#include "DataReader.h"
@@ -18,7 +16,7 @@
#include "ProgressTracing.h"
#include "DistGradHeader.h"
#include "IDistGradAggregator.h"
#include "SimpleDistGradAggregator.h"
#include "SimpleDistGradAggregatorHelper.h"
#include "Criterion.h"
#include "Globals.h"
@@ -167,10 +165,7 @@ public:
DistGradHeader::Destroy(ptr);
});

if (Globals::UseV2Aggregator())
m_distGradAgg = make_shared<V2SimpleDistGradAggregator<ElemType>>(m_mpi, false /*useAsyncAggregation*/, m_net->GetDeviceId(), 0 /*syncStatsTrace*/, ::CNTK::MPICommunicator());
else
m_distGradAgg = make_shared<SimpleDistGradAggregator<ElemType>>(m_mpi, false /*useAsyncAggregation*/, m_net->GetDeviceId(), 0 /*syncStatsTrace*/);
m_distGradAgg = GetSimpleDistGradAggregator<ElemType>(m_mpi, false /*useAsyncAggregation*/, m_net->GetDeviceId(), 0 /*syncStatsTrace*/);
}

m_gradHeader->numEvalNode = evalNodes.size();
@@ -109,7 +109,7 @@ public:

// Synchronize the Quantization compute stream with the completion of
// compute of the gradient matrices on the main compute stream
mainStreamSyncEvent->SynchronizeDataTransferFetchStreamWithEvent<ElemType>();
mainStreamSyncEvent->SynchronizeDataTransferFetchStreamWithEvent<float>();
delete mainStreamSyncEvent;

AggregateGradientsImpl(newGradients, newGradHeader, showSyncPerfStats);
@ -185,7 +185,7 @@ private:
|
|||
if (m_useAsyncAggregation)
|
||||
{
|
||||
std::unique_ptr<MatrixComputeStreamEvent> mainStreamSyncEvent(MatrixComputeStreamEvent::Create(deviceId));
|
||||
mainStreamSyncEvent->SynchronizeDataTransferFetchStreamWithEvent<ElemType>();
|
||||
mainStreamSyncEvent->SynchronizeDataTransferFetchStreamWithEvent<float>();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@@ -60,7 +60,7 @@ BOOST_AUTO_TEST_CASE(CheckModelVersion)
// This is a watch guard to make sure that any change in the model version will be detected.
// If you change the CNTK model version, please do not silently adapt this test.
// Instead, please do notify the CNTK release team (AlexeyO, Wolfgang, Zhou, Mark) to prepare required steps for the next release.
BOOST_REQUIRE_MESSAGE(CURRENT_CNTK_MODEL_VERSION == 30, "The model version has been changed. Before making changes in this test, please first notify the CNTK release team to prepare required steps in the next release. Thanks!\n");
BOOST_REQUIRE_MESSAGE(CURRENT_CNTK_MODEL_VERSION == 31, "The model version has been changed. Before making changes in this test, please first notify the CNTK release team to prepare required steps in the next release. Thanks!\n");
}

BOOST_AUTO_TEST_CASE(EvalConstantPlusTest)
@@ -61,7 +61,7 @@
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Cntk.Core-$(CntkComponentVersion).lib;Cntk.Math-$(CntkComponentVersion).lib;Cntk.Common-$(CntkComponentVersion).lib;Cntk.Actions-$(CntkComponentVersion).lib;Cntk.ComputationNetwork-$(CntkComponentVersion).lib;Cntk.SequenceTrainingLib-$(CntkComponentVersion).lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>Cntk.Core-$(CntkComponentVersion).lib;Cntk.Math-$(CntkComponentVersion).lib;Cntk.Common-$(CntkComponentVersion).lib;Cntk.Actions-$(CntkComponentVersion).lib;Cntk.ComputationNetwork-$(CntkComponentVersion).lib;Cntk.SequenceTrainingLib-$(CntkComponentVersion).lib;Cntk.SGD-$(CntkComponentVersion).lib;%(AdditionalDependencies)</AdditionalDependencies>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalLibraryDirectories>$(MSMPI_LIB64);$(OutDir);$(BOOST_LIB_PATH);$(NvmlLibPath)</AdditionalLibraryDirectories>
<DelayLoadDLLs>Cntk.Math-$(CntkComponentVersion).dll;msmpi.dll</DelayLoadDLLs>