* FP16 BrainScript - address code review comments

* Remove Tab and fix debug build breaks

* Fix Linux Build breaks

* FP16 BrainScript - add _CRT_SECURE_NO_WARNINGS

* FP16 BrainScript - fix NetworkTests

* Update tests for model version change

* Remove changes for InputAndParamNodes

* Fix typo

* Remove redundant code

* Fix optional parameters
rpengms 2019-03-20 11:36:16 -07:00 committed by GitHub
Parent 45ae386bc8
Commit 4003c087a1
No key found matching this signature
GPG key ID: 4AEE18F83AFDEB23
33 changed files with 769 additions and 187 deletions

View file

@ -707,6 +707,7 @@ SGDLIB_SRC=\
$(SOURCEDIR)/SGDLib/Profiler.cpp \
$(SOURCEDIR)/SGDLib/SGD.cpp \
$(SOURCEDIR)/SGDLib/PostComputingActions.cpp \
$(SOURCEDIR)/SGDLib/SimpleDistGradAggregatorHelper.cpp \
SGDLIB_SRC+=$(CNTKLIBRARY_COMMON_SRC)

View file

@ -94,15 +94,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
);
}
/*virtual*/ void OnEpochEnd(const std::list<ComputationNodeBasePtr>& LearnableNodes,
std::list<Matrix<ElemType>>& smoothedGradient,
std::list<MatrixBasePtr>& smoothedGradients,
size_t samplesSinceLastSync) override
{
Base::OnEpochEnd(LearnableNodes, smoothedGradient, samplesSinceLastSync);
Base::OnEpochEnd(LearnableNodes, smoothedGradients, samplesSinceLastSync);
}
/*virtual*/ void ModelAggregationProcessing(
size_t samplesSinceLastSync,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradient,
std::list<MatrixBasePtr>& smoothedGradients,
size_t& totalSamplesProcessed,
float& secondsOnCommunication
) override
@ -181,9 +181,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//----------------------------------------
if (m_resetSGDMomentumAfterAggregation)
{
for (Matrix<ElemType>& x : smoothedGradient)
for (auto sg : smoothedGradients)
{
x.SetValue((ElemType)0);
auto x = dynamic_pointer_cast<Matrix<ElemType>>(sg);
if (x != nullptr)
x->SetValue((ElemType)0);
}
}
}

View file

@ -108,7 +108,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
bool OnArrivingAtSyncPoint(
const std::list<ComputationNodeBasePtr>& learnableNodes, /* input/output: */
std::list<Matrix<ElemType>>& smoothedGradient, /* input/output: under some setup, it will reset to zero*/
std::list<MatrixBasePtr>& smoothedGradients, /* input/output: under some setup, it will reset to zero*/
size_t samplesSinceLastSync /* input: samples processed since last sync on this worker only */
) override
{
@ -130,12 +130,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Otherwise let update the weights.
float secondsOnCommunication = 0.0f;
size_t totalSamples = 0;
ModelAggregationProcessing(samplesSinceLastSync, learnableNodes, smoothedGradient, totalSamples, secondsOnCommunication);
ModelAggregationProcessing(samplesSinceLastSync, learnableNodes, smoothedGradients, totalSamples, secondsOnCommunication);
return true;
}
/*virtual*/ void OnEpochEnd(const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradient,
std::list<MatrixBasePtr>& smoothedGradients,
size_t samplesSinceLastSync) override
{
if (!m_someWorkerHasFinished)
@ -152,13 +152,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Let's update our weights no matter what.
float secondsOnCommunication = 0.0f;
size_t totalSamples = 0;
ModelAggregationProcessing(samplesSinceLastSync, learnableNodes, smoothedGradient, totalSamples, secondsOnCommunication);
ModelAggregationProcessing(samplesSinceLastSync, learnableNodes, smoothedGradients, totalSamples, secondsOnCommunication);
}
/*virtual*/ void ModelAggregationProcessing(
size_t /*samplesSinceLastSync*/,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradient,
std::list<MatrixBasePtr>& smoothedGradients,
size_t& /*totalSamplesProcessed*/, /* out */
float& secondsOnCommunication /* out */
) override
@ -196,8 +196,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_communicator->AggregateInPlace(aggregatedWeightsPrepared, m_communicator->Workers());
// 2. Let's update the model
for (auto& pBaseNode : learnableNodes)
auto smoothedGradientIter = smoothedGradients.begin();
for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++, smoothedGradientIter++)
{
ComputationNodeBasePtr pBaseNode = *nodeIter;
if (!pBaseNode->IsParameterUpdateRequired())
continue;
@ -235,15 +237,35 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// 2.2.4 update bookkeeping
prevWeight.SetValue(currentWeight);
}
}
//----------------------------------------
// 3. reset SGD momentum if necessary
//----------------------------------------
{
// For half, we also keep a float copy of the weights in the compound matrix; keep it in sync
if (std::is_same<ElemType, half>())
{
auto compoundMatrixPtr = dynamic_pointer_cast<Matrix<float>> (*smoothedGradientIter);
size_t numCols = currentWeight.GetNumCols();
auto parameterMatrix = compoundMatrixPtr->ColumnSlice(2 * numCols, numCols);
parameterMatrix.CastAssignValuesOf(currentWeight);
if (m_resetSGDMomentumAfterAggregation)
{
for (Matrix<ElemType>& x : smoothedGradient)
// Only reset smoothed gradients
auto smoothedGradientMatrix = compoundMatrixPtr->ColumnSlice(0, numCols);
smoothedGradientMatrix.SetValue(0.0f);
}
}
else
{
x.SetValue((ElemType)0);
if (m_resetSGDMomentumAfterAggregation)
{
auto x = dynamic_pointer_cast<Matrix<ElemType>> (*smoothedGradientIter);
x->SetValue((ElemType)0);
}
}
}
}
}

View file

@ -22,6 +22,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {
using namespace std;
template <class ElemType, class TargetType>
static inline bool isprecision(std::wstring& str)
{
if ((str == L"") && std::is_same<ElemType, TargetType>())
return true;
if (std::is_same<TargetType, half>())
return EqualCI(str, L"float16");
else if (std::is_same<TargetType, float>())
return EqualCI(str, L"float");
else if (std::is_same<TargetType, double>())
return EqualCI(str, L"double");
return false;
}
template <class ElemType>
void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wstring& baseName, const NDLPass pass)
{
@ -48,7 +62,7 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
std::wstring cnNodeType = Microsoft::MSR::CNTK::ToFixedWStringFromMultiByte(node->GetValue());
ComputationNodePtr nodePtr;
ComputationNodeBasePtr nodePtr;
// get the node pointer for the node, should be stored in the EvalValue;
if (pass > ndlPassInitial)
@ -56,7 +70,7 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
nodePtr = ComputationNode<ElemType>::FromVoidPtr(node->GetEvalValue());
if (!nodePtr)
{
nodePtr = dynamic_pointer_cast<ComputationNode<ElemType>>(m_net->GetNodeFromName(name));
nodePtr = m_net->GetNodeFromName(name);
node->SetEvalValue(nodePtr.get());
}
}
@ -75,16 +89,49 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
auto tensorShape = ProcessTensorShapeParameters(node, params, i, /*isImage=*/false, cnNodeType);
wstring dynamicAxis = node->GetOptionalParameter("dynamicAxis", "");
wstring precision = node->GetOptionalParameter("precision", "");
// TODO: Map dynamicAxis from name to node at this point, where that node is memoized inside NDL.
// first look for this node already existing in the network
// BUGBUG: How does this set the dimensions then?
if (m_net->NodeNameExists(name))
nodePtr = dynamic_pointer_cast<ComputationNode<ElemType>>(m_net->GetNodeFromName(name));
else if (isSparse)
nodePtr = m_net->GetNodeFromName(name);
else
{
if (precision == L"")
{
if (isSparse)
nodePtr = builder.CreateSparseInputNode(name, tensorShape, dynamicAxis);
else
nodePtr = builder.CreateInputNode(name, tensorShape, dynamicAxis);
}
else if (EqualCI(precision, L"float"))
{
if (isSparse)
nodePtr = builder.template TypedCreateSparseInputNode<float>(name, tensorShape, dynamicAxis);
else
nodePtr = builder.template TypedCreateInputNode<float>(name, tensorShape, dynamicAxis);
}
else if (EqualCI(precision, L"double"))
{
if (isSparse)
nodePtr = builder.template TypedCreateSparseInputNode<double>(name, tensorShape, dynamicAxis);
else
nodePtr = builder.template TypedCreateInputNode<double>(name, tensorShape, dynamicAxis);
}
else if (EqualCI(precision, L"float16"))
{
if (isSparse)
nodePtr = builder.template TypedCreateSparseInputNode<half>(name, tensorShape, dynamicAxis);
else
nodePtr = builder.template TypedCreateInputNode<half>(name, tensorShape, dynamicAxis);
}
else
{
RuntimeError("NDLNetworkBuilder: Input: the 'precision' parameter if specified, must be 'float', 'double' or 'float16'.");
}
}
}
}
else if (cnNodeType == L"ImageInput" || cnNodeType == L"SparseImageInput")
{
@ -193,7 +240,7 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
nodePtr = builder.CreateLearnableParameter(name, rows, cols);
nodePtr->SetLearningRateMultiplier(0);
}
else if (pass == ndlPassFinal || nodePtr->Value().GetNumElements() != 0)
else if (pass == ndlPassFinal || (dynamic_pointer_cast<ComputationNode<ElemType>> (nodePtr))->Value().GetNumElements() != 0)
{
ElemType val = parameter[0]->GetScalar();
m_net->InitLearnableParameters(nodePtr, L"fixedValue", val);
@ -607,6 +654,56 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
nodeParamCount = nodePtr->GetNumInputs();
}
}
else if (cnNodeType == OperationName2Of(CastNode))
{
if (parameter.size() < 1)
RuntimeError("%ls should have 1 or more parameters (node and cast precision).", cnNodeType.c_str());
// setup the parameter position of children so we can hook them up later
nodeParamCount = 1;
nodeParamStart = 0;
if (pass == ndlPassInitial)
{
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, 0, parameter.size(), pass);
auto sourceNode = (NDLNode<ElemType>*) params[0];
wstring sourcePrecision = sourceNode->GetOptionalParameter("precision", "");
wstring targetPrecision = node->GetOptionalParameter("precision", "");
if (EqualCI(targetPrecision, L"float16"))
{
ComputationNetworkBuilder<half> builder2(*m_net);
if (isprecision<ElemType, float>(sourcePrecision))
nodePtr = builder2.CreateCastNode<float>(name);
else if (isprecision<ElemType, double>(sourcePrecision))
nodePtr = builder2.CreateCastNode<double>(name);
else
RuntimeError("NDLNetworkBuilder: for CastNode to cast to half, input must be 'float' or 'double'");
}
else if (EqualCI(targetPrecision, L"float"))
{
ComputationNetworkBuilder<float> builder2(*m_net);
if (isprecision<ElemType, half>(sourcePrecision))
nodePtr = builder2.CreateCastNode<half>(name);
else if (isprecision<ElemType, double>(sourcePrecision))
nodePtr = builder2.CreateCastNode<double>(name);
else
RuntimeError("NDLNetworkBuilder: for CastNode to cast to float, input must be 'float16' or 'double'");
}
else if (EqualCI(targetPrecision, L"double"))
{
ComputationNetworkBuilder<double> builder2(*m_net);
if (isprecision<ElemType, float>(sourcePrecision))
nodePtr = builder2.CreateCastNode<float>(name);
else if (isprecision<ElemType, half>(sourcePrecision))
nodePtr = builder2.CreateCastNode<half>(name);
else
RuntimeError("NDLNetworkBuilder: for CastNode to cast to double, input must be 'float' or 'float16'");
}
else
RuntimeError("NDLNetworkBuilder: CastNode - need to specify 'precision' parameter: 'float', 'double' or 'float16'.");
}
}
else
{
@ -645,7 +742,10 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
#if 1
vector<ComputationNodeBasePtr> inputNodes;
for (let& in : inputs)
inputNodes.push_back(ComputationNode<ElemType>::FromVoidPtr(in));
{
ComputationNodeBase *p = (ComputationNodeBase *)in;
inputNodes.push_back(p ? p->shared_from_this() : nullptr);
}
nodePtr->AttachInputs(inputNodes);
#else // TODO: delete this
@ -714,6 +814,7 @@ TensorShape NDLNodeEvaluatorImpl<ElemType>::ProcessTensorShapeParameters(const N
return TensorShape(dims);
}
template class NDLBuilderImpl<half>;
template class NDLBuilderImpl<float>;
template class NDLBuilderImpl<double>;
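
For reference, the NDL surface this file wires up is the optional 'precision' parameter on Input/SparseInput and the new Cast node. A minimal, hypothetical NDL sketch (the dimension name and the 'tag' parameter are illustrative assumptions, not part of this commit) could look like:

    featDim = 784
    f16 = Input(featDim, precision="float16", tag="feature")   # creates a half InputValue
    f32 = Cast(f16, precision="float")                          # CastNode<float, half>: half -> float

Cast takes the source node as its only positional parameter; the source precision is read from that node's own 'precision' optional parameter (or defaults to the network precision), and the target precision from Cast's own 'precision' parameter. Unsupported combinations raise the RuntimeErrors shown above.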

View file

@ -269,10 +269,11 @@ public:
}
// ProcessOptionalParameters - Process the optional parameters of a node
virtual void ProcessOptionalParameters(NDLNode<ElemType>* node)
virtual void ProcessOptionalParameters(NDLNode<ElemType>* node) override
{
vector<NDLNode<ElemType>*> params = node->GetParameters(true); // get all the optional parameters only
auto compNode = ComputationNode<ElemType>::FromVoidPtr(node->GetEvalValue());
ComputationNodeBase* compNodePtr = (ComputationNodeBase *) (node->GetEvalValue());
ComputationNodeBasePtr compNode = compNodePtr ? compNodePtr->shared_from_this() : nullptr;
std::string empty;
// loop through all the optional parameters processing them as necessary
@ -582,6 +583,7 @@ private:
DEVICEID_TYPE m_deviceId;
};
template class NDLBuilder<half>;
template class NDLBuilder<float>;
template class NDLBuilder<double>;

View file

@ -160,6 +160,7 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable)
else if (EqualInsensitive(nodeType, OperationNameOf(AtanhNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(AveragePoolingNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(BatchNormalizationNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationName2Of(CastNode))) ret = true;
#ifdef COMING_SOON
else if (EqualInsensitive(nodeType, OperationNameOf(CRFNode), L"CRF")) ret = true;
#endif
@ -267,18 +268,24 @@ NDLScript<ElemType> NDLScript<ElemType>::s_global("global");
// declare the static variables from the classes
template <>
NDLScript<half> NDLScript<half>::s_global{};
template <>
NDLScript<float> NDLScript<float>::s_global{};
template <>
NDLScript<double> NDLScript<double>::s_global{};
template <>
int NDLNode<half>::s_nameCounter = 0;
template <>
int NDLNode<float>::s_nameCounter = 0;
template <>
int NDLNode<double>::s_nameCounter = 0;
template class NDLNode<half>;
template class NDLNode<float>;
template class NDLNode<double>;
template class NDLScript<half>;
template class NDLScript<float>;
template class NDLScript<double>;

View file

@ -98,6 +98,7 @@ public:
}
};
template class NDLNodeEvaluator<half>;
template class NDLNodeEvaluator<float>;
template class NDLNodeEvaluator<double>;

View file

@ -188,9 +188,12 @@ ComputationNetworkPtr GetModelFromConfig(const ConfigRecordType& config, const w
return net;
}
template function<ComputationNetworkPtr(DEVICEID_TYPE)> GetNetworkFactory<ScriptableObjects::IConfigRecord, half>(const ScriptableObjects::IConfigRecord& config);
template function<ComputationNetworkPtr(DEVICEID_TYPE)> GetNetworkFactory<ScriptableObjects::IConfigRecord, float>(const ScriptableObjects::IConfigRecord& config);
template function<ComputationNetworkPtr(DEVICEID_TYPE)> GetNetworkFactory<ScriptableObjects::IConfigRecord, double>(const ScriptableObjects::IConfigRecord& config);
template function<ComputationNetworkPtr(DEVICEID_TYPE)> GetNetworkFactory<ConfigParameters, half>(const ConfigParameters& config);
template function<ComputationNetworkPtr(DEVICEID_TYPE)> GetNetworkFactory<ConfigParameters, float>(const ConfigParameters& config);
template function<ComputationNetworkPtr(DEVICEID_TYPE)> GetNetworkFactory<ConfigParameters, double>(const ConfigParameters& config);
template ComputationNetworkPtr GetModelFromConfig<ConfigParameters, half> (const ConfigParameters& config, const wstring&, vector<wstring>& outputNodeNamesVector);
template ComputationNetworkPtr GetModelFromConfig<ConfigParameters, float> (const ConfigParameters& config, const wstring&, vector<wstring>& outputNodeNamesVector);
template ComputationNetworkPtr GetModelFromConfig<ConfigParameters, double>(const ConfigParameters& config, const wstring&, vector<wstring>& outputNodeNamesVector);

View file

@ -1775,6 +1775,7 @@ shared_ptr<ComputationNode<ElemType>> SimpleNetworkBuilder<ElemType>::AddTrainAn
return output;
}
template class SimpleNetworkBuilder<half>;
template class SimpleNetworkBuilder<float>;
template class SimpleNetworkBuilder<double>;

View file

@ -159,9 +159,13 @@ public:
m_constInputGateValue = config("constInputGateValue", "false");
m_constOutputGateValue = config("constOutputGateValue", "false");
m_forgetGateInitVal = config("forgetGateInitVal", "-1");
m_inputGateInitVal = config("inputGateInitVal", "-1");
m_outputGateInitVal = config("outputGateInitVal", "-1");
ElemType forgetGateInitVal = config("forgetGateInitVal", "-1");
ElemType inputGateInitVal = config("inputGateInitVal", "-1");
ElemType outputGateInitVal = config("outputGateInitVal", "-1");
m_forgetGateInitVal = forgetGateInitVal;
m_inputGateInitVal = inputGateInitVal;
m_outputGateInitVal = outputGateInitVal;
m_sparse_input = config("sparseinput", "false");

View file

@ -142,12 +142,14 @@ shared_ptr<Object> MakeRuntimeObject<TrainAction>(const IConfigRecordPtr configp
{
const IConfigRecord& config = *configp;
wstring precision = config[L"precision"]; // dispatch on ElemType
if (precision == L"float")
if (precision == L"float16")
DoTrain<IConfigRecord, half>(config);
else if (precision == L"float")
DoTrain<IConfigRecord, float>(config);
else if (precision == L"double")
DoTrain<IConfigRecord, double>(config);
else
RuntimeError("invalid value '%ls' for 'precision', must be 'float' or 'double'", precision.c_str());
RuntimeError("invalid value '%ls' for 'precision', must be 'float16' or 'float' or 'double'", precision.c_str());
return make_shared<Object>(); // return a dummy object
}
@ -156,8 +158,10 @@ shared_ptr<Object> MakeRuntimeObject<TrainAction>(const IConfigRecordPtr configp
ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<TrainAction> registerTrainAction(L"TrainAction");
}}}
template void DoTrain<ScriptableObjects::IConfigRecord, half>(const ScriptableObjects::IConfigRecord& config);
template void DoTrain<ScriptableObjects::IConfigRecord, float>(const ScriptableObjects::IConfigRecord& config);
template void DoTrain<ScriptableObjects::IConfigRecord, double>(const ScriptableObjects::IConfigRecord& config);
template void DoTrain<ConfigParameters, half>(const ConfigParameters& config);
template void DoTrain<ConfigParameters, float>(const ConfigParameters& config);
template void DoTrain<ConfigParameters, double>(const ConfigParameters& config);

View file

@ -171,6 +171,91 @@ static void DisableLegacyUsage(const ConfigParameters& TopLevelConfig, const Con
// be run in parallel across multiple ranks. Others should only run on rank 0
const std::set<std::string> commandstoRunOnAllRanks = { "train", "trainRNN", "adapt", "test", "eval", "cv", "devtest", "bnstat" };
template <typename ElemType>
bool DispatchThisAction(const string &thisAction, const ConfigParameters &commandParams, const ConfigParameters& config)
{
if (thisAction == "train" || thisAction == "trainRNN")
{
DoTrain<ConfigParameters, ElemType>(commandParams);
}
else if (thisAction == "bnstat")
{
DoBatchNormalizationStat<ElemType>(commandParams);
}
else if (thisAction == "adapt")
{
DoAdapt<ElemType>(commandParams);
}
else if (thisAction == "test" || thisAction == "eval")
{
DoEval<ElemType>(commandParams);
}
else if (thisAction == "edit")
{
DoEdit<ElemType>(commandParams);
}
else if (thisAction == "cv")
{
DoCrossValidate<ElemType>(commandParams);
}
else if (thisAction == "write")
{
DoWriteOutput<ElemType>(commandParams);
}
else if (thisAction == "devtest")
{
TestCn<ElemType>(config); // for "devtest" action pass the root config instead
}
else if (thisAction == "dumpNodes" /*deprecated:*/ || thisAction == "dumpNode" || thisAction == "dumpnode")
{
DoDumpNodes<ElemType>(commandParams);
}
else if (thisAction == "convertdbn")
{
DoConvertFromDbn<ElemType>(commandParams);
}
else if (thisAction == "exportdbn")
{
DoExportToDbn<ElemType>(commandParams);
}
else if (thisAction == "createLabelMap")
{
DoCreateLabelMap<ElemType>(commandParams);
}
else if (thisAction == "writeWordAndClass")
{
DoWriteWordAndClassInfo<ElemType>(commandParams);
}
else if (thisAction == "plot")
{
DoTopologyPlot<ElemType>(commandParams);
}
else if (thisAction == "SVD")
{
DoParameterSVD<ElemType>(commandParams);
}
else
{
return false;
}
return true;
}
template <>
bool DispatchThisAction<half>(const string &thisAction, const ConfigParameters &commandParams, const ConfigParameters& )
{
if (thisAction == "train" || thisAction == "trainRNN")
{
DoTrain<ConfigParameters, half>(commandParams);
}
else
{
RuntimeError("half only supported for action train or trainRNN!");
}
return true;
}
// process the command
template <typename ElemType>
void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mpi)
@ -270,73 +355,21 @@ void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mp
{
LOGPRINTF(stderr, "CNTKCommandTrainBegin: %s\n", command[i].c_str());
}
DoTrain<ConfigParameters, ElemType>(commandParams);
}
if (!DispatchThisAction<ElemType>(thisAction, commandParams, config))
{
RuntimeError("unknown action: %s in command set: %s", thisAction.c_str(), command[i].c_str());
}
if (thisAction == "train" || thisAction == "trainRNN")
{
if (progressTracing)
{
LOGPRINTF(stderr, "CNTKCommandTrainEnd: %s\n", command[i].c_str());
}
fullEpochsOffset += GetMaxEpochs(commandParams);
}
else if (thisAction == "bnstat")
{
DoBatchNormalizationStat<ElemType>(commandParams);
}
else if (thisAction == "adapt")
{
DoAdapt<ElemType>(commandParams);
}
else if (thisAction == "test" || thisAction == "eval")
{
DoEval<ElemType>(commandParams);
}
else if (thisAction == "edit")
{
DoEdit<ElemType>(commandParams);
}
else if (thisAction == "cv")
{
DoCrossValidate<ElemType>(commandParams);
}
else if (thisAction == "write")
{
DoWriteOutput<ElemType>(commandParams);
}
else if (thisAction == "devtest")
{
TestCn<ElemType>(config); // for "devtest" action pass the root config instead
}
else if (thisAction == "dumpNodes" /*deprecated:*/ || thisAction == "dumpNode" || thisAction == "dumpnode")
{
DoDumpNodes<ElemType>(commandParams);
}
else if (thisAction == "convertdbn")
{
DoConvertFromDbn<ElemType>(commandParams);
}
else if (thisAction == "exportdbn")
{
DoExportToDbn<ElemType>(commandParams);
}
else if (thisAction == "createLabelMap")
{
DoCreateLabelMap<ElemType>(commandParams);
}
else if (thisAction == "writeWordAndClass")
{
DoWriteWordAndClassInfo<ElemType>(commandParams);
}
else if (thisAction == "plot")
{
DoTopologyPlot<ElemType>(commandParams);
}
else if (thisAction == "SVD")
{
DoParameterSVD<ElemType>(commandParams);
}
else
{
RuntimeError("unknown action: %s in command set: %s", thisAction.c_str(), command[i].c_str());
}
}
fprintf(stderr, "\n");
@ -740,12 +773,14 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])
LOGPRINTF(stderr, "precision = \"%s\"\n", type.c_str());
}
if (type == "float")
if (type == "float16")
DoCommands<half>(config, mpi);
else if (type == "float")
DoCommands<float>(config, mpi);
else if (type == "double")
DoCommands<double>(config, mpi);
else
RuntimeError("CNTK: Invalid precision string: \"%s\", must be \"float\" or \"double\"", type.c_str());
RuntimeError("CNTK: Invalid precision string: \"%s\", must be \"float16\" or \"float\" or \"double\"", type.c_str());
// if completed then write a doneFile if requested
if (!doneFile.empty())
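
With the dispatch above, "float16" becomes a valid top-level precision, but DispatchThisAction<half> limits it to training. A hypothetical old-style config fragment (key names follow the usual CNTK config conventions and are not taken from this diff):

    precision = "float16"   # routes to DoCommands<half>
    command = train         # only train / trainRNN are dispatched for half;
                            # any other action hits the "half is only supported..." RuntimeError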

View file

@ -8,6 +8,8 @@
#include <map>
#include <stdexcept>
#include <stdint.h>
#include "File.h"
#include "half.hpp"
using namespace std;
@ -150,6 +152,11 @@ public:
return (float) (double) *this;
}
operator half() const
{
return (half)(double)*this;
}
private:
long tolong() const
{

View file

@ -150,6 +150,20 @@ void ComputationNetwork::SaveToFileImpl(const wstring& fileName, const FileOptio
else if (nodePtr->Is<ComputationNode<half>>())
precision = ElemTypeName<half>();
else LogicError("Unexpected node type.");
#if CURRENT_CNTK_MODEL_VERSION >= CNTK_MODEL_VERSION_31
if (nodePtr->Is<CastNode<half,float>>())
precision = ElemTypeName2<half,float>();
else if (nodePtr->Is<CastNode<half, double>>())
precision = ElemTypeName2<half, double>();
else if (nodePtr->Is<CastNode<float, half>>())
precision = ElemTypeName2<float, half>();
else if (nodePtr->Is<CastNode<float, double>>())
precision = ElemTypeName2<float, double>();
else if (nodePtr->Is<CastNode<double, half>>())
precision = ElemTypeName2<double, half>();
else if (nodePtr->Is<CastNode<double, float>>())
precision = ElemTypeName2<double, float>();
#endif
fstream << precision;
#endif
fstream << nodePtr->OperationName();
@ -265,6 +279,20 @@ void ComputationNetwork::ReadPersistableParameters(size_t modelVersion, File& fs
node = ComputationNetworkBuilder<half>::NewNode(opName, m_deviceId, nodeName);
else if (precision == L"") // old file format: default to <ElemType>
node = ComputationNetworkBuilder<ElemType>::NewNode(opName, m_deviceId, nodeName);
#if CURRENT_CNTK_MODEL_VERSION >= CNTK_MODEL_VERSION_31
else if (precision == L"half,float")
node = ComputationNetworkBuilder<half>::NewNode2<float>(opName, m_deviceId, nodeName);
else if (precision == L"half,double")
node = ComputationNetworkBuilder<half>::NewNode2<double>(opName, m_deviceId, nodeName);
else if (precision == L"float,half")
node = ComputationNetworkBuilder<float>::NewNode2<half>(opName, m_deviceId, nodeName);
else if (precision == L"float,double")
node = ComputationNetworkBuilder<float>::NewNode2<double>(opName, m_deviceId, nodeName);
else if (precision == L"double,half")
node = ComputationNetworkBuilder<double>::NewNode2<half>(opName, m_deviceId, nodeName);
else if (precision == L"double,float")
node = ComputationNetworkBuilder<double>::NewNode2<float>(opName, m_deviceId, nodeName);
#endif
else
RuntimeError("Read: Unexpected precision tag '%ls'", precision.c_str());

View file

@ -1313,6 +1313,14 @@ template <> /*static*/ inline const wchar_t* ElemTypeName<float>() { return L"f
template <> /*static*/ inline const wchar_t* ElemTypeName<double>() { return L"double"; }
template <> /*static*/ inline const wchar_t* ElemTypeName<half>() { return L"half"; }
template <typename ElemType, typename ElemType2> static inline const wchar_t* ElemTypeName2();
template <> /*static*/ inline const wchar_t* ElemTypeName2<float,half>() { return L"float,half"; }
template <> /*static*/ inline const wchar_t* ElemTypeName2<float,double>() { return L"float,double"; }
template <> /*static*/ inline const wchar_t* ElemTypeName2<double,half>() { return L"double,half"; }
template <> /*static*/ inline const wchar_t* ElemTypeName2<double,float>() { return L"double,float"; }
template <> /*static*/ inline const wchar_t* ElemTypeName2<half,float>() { return L"half,float"; }
template <> /*static*/ inline const wchar_t* ElemTypeName2<half,double>() { return L"half,double"; }
// The following emits the class and enables the BaseMatrix<double> to be available (used by EvalDll)
// The corresponding Matrix<float> is emitted in the SetDeviceId function above.
template class Matrix<double>;

View file

@ -175,6 +175,13 @@ static shared_ptr<ComputationNode<ElemType>> CreateNode(const std::wstring& node
else return CreateStandardNode<ElemType>(nodeType, forward<_Types>(_Args)...);
}
template <class ElemType, class ElemType2, class... _Types>
static shared_ptr<ComputationNode<ElemType>> CreateNode2(const std::wstring& nodeType, _Types&&... _Args)
{
// check more types
if (nodeType == OperationName2Of(CastNode)) return New<CastNode<ElemType, ElemType2>>(forward<_Types>(_Args)...);
else RuntimeError("CreateNode2: unsupported nodeType - %S", nodeType.c_str());
}
// this function is called from SimpleNetworkBuilder and old NDL
template <class ElemType>
/*static*/ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::NewStandardNode(const std::wstring& nodeType, DEVICEID_TYPE deviceId, const wstring& name)
@ -189,6 +196,13 @@ template <class ElemType>
return CreateNode<ElemType>(nodeType, deviceId, name);
}
template <class ElemType>
template <class ElemType2>
/*static*/ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::NewNode2(const std::wstring& nodeType, DEVICEID_TYPE deviceId, const wstring& name)
{
return CreateNode2<ElemType, ElemType2>(nodeType, deviceId, name);
}
shared_ptr<ComputationNodeBase> NewComputationNodeFromConfig(const Microsoft::MSR::ScriptableObjects::IConfigRecordPtr configp)
{
wstring precision = configp->Get(L"precision"); // dispatch on ElemType
@ -247,15 +261,17 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Creat
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName)
template <class ValueType>
shared_ptr<ComputationNode<ValueType>> ComputationNetworkBuilder<ElemType>::TypedCreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName)
{
return net.AddNodeToNetWithElemType(New<InputValue<ElemType>>(net.GetDeviceId(), inputName, sampleLayout, dynamicAxisName));
return net.AddNodeToNetWithElemType(New<InputValue<ValueType>>(net.GetDeviceId(), inputName, sampleLayout, dynamicAxisName));
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateSparseInputNode(const std::wstring& inputName, const TensorShape& imageLayout, const wstring& dynamicAxisName)
template <class ValueType>
shared_ptr<ComputationNode<ValueType>> ComputationNetworkBuilder<ElemType>::TypedCreateSparseInputNode(const std::wstring& inputName, const TensorShape& imageLayout, const wstring& dynamicAxisName)
{
return net.AddNodeToNetWithElemType(New<SparseInputValue<ElemType>>(net.GetDeviceId(), inputName, imageLayout, dynamicAxisName));
return net.AddNodeToNetWithElemType(New<SparseInputValue<ValueType>>(net.GetDeviceId(), inputName, imageLayout, dynamicAxisName));
}
template <class ElemType>
@ -318,6 +334,12 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Creat
{
return net.AddNodeToNetWithElemType(New<ReconcileDynamicAxisNode<ElemType>>(net.GetDeviceId(), nodeName));
}
template <class ElemType>
template <class InputNodeType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateCastNode(const std::wstring& nodeName)
{
return net.AddNodeToNetWithElemType(New<CastNode<ElemType, InputNodeType>>(net.GetDeviceId(), nodeName));
}
// this is the catch-all for all cases not covered as special cases above
// Unlike the specialized ones above, this one creates nodes by type given as a string.
@ -997,4 +1019,37 @@ template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<half>::Typ
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<half>::TypedCreateLearnableParameter<double>(const std::wstring& paramName, const TensorShape& tensorShape);
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<half>::TypedCreateLearnableParameter<half>(const std::wstring& paramName, const TensorShape& tensorShape);
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<float>::TypedCreateInputNode<float>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<float>::TypedCreateInputNode<double>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<float>::TypedCreateInputNode<half>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<double>::TypedCreateInputNode<float>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<double>::TypedCreateInputNode<double>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<double>::TypedCreateInputNode<half>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<half>::TypedCreateInputNode<float>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<half>::TypedCreateInputNode<double>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<half>::TypedCreateInputNode<half>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<float>::TypedCreateSparseInputNode<float>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<float>::TypedCreateSparseInputNode<double>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<float>::TypedCreateSparseInputNode<half>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<double>::TypedCreateSparseInputNode<float>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<double>::TypedCreateSparseInputNode<double>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<double>::TypedCreateSparseInputNode<half>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<half>::TypedCreateSparseInputNode<float>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<half>::TypedCreateSparseInputNode<double>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<half>::TypedCreateSparseInputNode<half>(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<float>::CreateCastNode<half>(const std::wstring& nodeName);
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<float>::CreateCastNode<double>(const std::wstring& nodeName);
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<double>::CreateCastNode<half>(const std::wstring& nodeName);
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<double>::CreateCastNode<float>(const std::wstring& nodeName);
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<half>::CreateCastNode<float>(const std::wstring& nodeName);
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<half>::CreateCastNode<double>(const std::wstring& nodeName);
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<float>::NewNode2<half>(const std::wstring& nodeName, DEVICEID_TYPE deviceId, const wstring& name);
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<float>::NewNode2<double>(const std::wstring& nodeName, DEVICEID_TYPE deviceId, const wstring& name);
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<double>::NewNode2<half>(const std::wstring& nodeName, DEVICEID_TYPE deviceId, const wstring& name);
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<double>::NewNode2<float>(const std::wstring& nodeName, DEVICEID_TYPE deviceId, const wstring& name);
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<half>::NewNode2<float>(const std::wstring& nodeName, DEVICEID_TYPE deviceId, const wstring& name);
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<half>::NewNode2<double>(const std::wstring& nodeName, DEVICEID_TYPE deviceId, const wstring& name);
}}}

View file

@ -38,6 +38,8 @@ public:
// TODO: move into a separate header/class, to decouple from this class which would then be only used by old NDL and SimpleNetworkBuilder.
static ComputationNodePtr NewStandardNode(const std::wstring& nodeType, DEVICEID_TYPE deviceId, const wstring& name);
static ComputationNodePtr NewNode(const std::wstring& nodeType, DEVICEID_TYPE deviceId, const wstring& name);
template <class ElemType2>
static ComputationNodePtr NewNode2(const std::wstring& nodeType, DEVICEID_TYPE deviceId, const wstring& name);
// The following functions create nodes and add them to the net, but don't attach inputs (some don't have inputs).
// There are special versions for nodes with custom constructors, and a catch-all, CreateComputationNode(), for all others.
@ -53,12 +55,25 @@ public:
template<class ValueType>
shared_ptr<ComputationNode<ValueType>> TypedCreateLearnableParameter(const std::wstring& paramName, const TensorShape& tensorShape); // V2
template <class InputNodeType>
shared_ptr<ComputationNode<ElemType>> CreateCastNode(const std::wstring& nodeName);
// sparse matrix size is optionally specified
// ComputationNodePtr CreateSparseLearnableParameter(const std::wstring & paramName, const size_t rows, const size_t cols, const size_t size = 0);
ComputationNodePtr CreateInputNode(const std::wstring& inputName, const size_t rows, const wstring& dynamicAxisName = L"");
ComputationNodePtr CreateSparseInputNode(const std::wstring& inputName, const size_t rows, const wstring& dynamicAxisName = L"");
ComputationNodePtr CreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName = L"");
ComputationNodePtr CreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName = L"");
shared_ptr<ComputationNode<ElemType>> CreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName = L"")
{
return this->template TypedCreateInputNode<ElemType>(inputName, sampleLayout, dynamicAxisName);
}
template<class ValueType>
shared_ptr<ComputationNode<ValueType>> TypedCreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
shared_ptr<ComputationNode<ElemType>> CreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName = L"")
{
return this->template TypedCreateSparseInputNode<ElemType>(inputName, sampleLayout, dynamicAxisName);
}
template<class ValueType>
shared_ptr<ComputationNode<ValueType>> TypedCreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName);
ComputationNodePtr CreateConvolutionNode(const std::wstring& nodeName, const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& strideShape,
const std::vector<bool>& sharing, const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
bool transpose, const TensorShape& outputShape, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples);

View file

@ -61,7 +61,8 @@
#define CNTK_MODEL_VERSION_28 28 // Padding op
#define CNTK_MODEL_VERSION_29 29 // Expose StopGradient in BS
#define CNTK_MODEL_VERSION_30 30 // LatticeWithSequenceSoftmax node
#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_30
#define CNTK_MODEL_VERSION_31 31 // Cast node
#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_31
// helper mode for debugging
// If TRACK_GAP_NANS is defined then initialize layout gaps to NaN and do NaN checks. Also do detailed logging of node computations.
@ -95,6 +96,7 @@ struct /*interface*/ IComputationNode
// TODO: OperationName calls static TypeName which does not match the actual type names in that the 'Node' is missing.
virtual const std::wstring OperationName() const = 0;
#define OperationNameOf(T) (T<float>::TypeName()) // convenience macro
#define OperationName2Of(T) (T<double,float>::TypeName()) // convenience macro
virtual void UpdateFunctionMBSize() = 0; // recalculate our column dimensions from MBLayout. Override to update temps.

View file

@ -4,6 +4,9 @@
//
// CNTKEval.cpp : Defines the exported functions for the CNTK DLL.
//
#ifndef _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
#endif
#define __STDC_FORMAT_MACROS
#include <inttypes.h>

View file

@ -646,6 +646,20 @@ ASGDHelper<ElemType>* NewASGDHelper(
#endif
}
template<> ASGDHelper<half>* NewASGDHelper<half>(
const std::list<ComputationNodeBasePtr> & learnableNodes,
size_t nodeNumRanks,
bool useAsyncBuffer,
bool isSimulatedModelAveragingSGD,
AdjustLearningRateAtBeginning adjusttype,
double adjustCoef,
size_t adjustPerMinibatches,
int traceLevel,
int syncPerfStats)
{
RuntimeError("NewASGDHelper - half not supported!");
}
template ASGDHelper<float>* NewASGDHelper<float>(
const std::list<ComputationNodeBasePtr> & learnableNodes,
size_t nodeNumRanks,

View file

@ -18,6 +18,7 @@
#include "Matrix.h"
#include "SimpleDistGradAggregator.h"
#include "V2SimpleDistGradAggregator.h"
#include "SimpleDistGradAggregatorHelper.h"
namespace Microsoft { namespace MSR { namespace CNTK {
@ -46,16 +47,7 @@ void AggregateAccumulatorValuesAndUpdateEvaluation(
}
// Prepare aggregator.
std::shared_ptr<IDistGradAggregator<ElemType>> distGradAgg;
if (Globals::UseV2Aggregator())
distGradAgg = make_shared<V2SimpleDistGradAggregator<ElemType>>(
mpi,
false /*useAsyncAggregation*/,
net->GetDeviceId(),
0 /*syncStatsTrace*/,
::CNTK::MPICommunicator(packThresholdSizeInBytes));
else
distGradAgg = make_shared<SimpleDistGradAggregator<ElemType>>(
std::shared_ptr<IDistGradAggregator<ElemType>> distGradAgg = GetSimpleDistGradAggregator<ElemType>(
mpi,
false /*useAsyncAggregation*/,
net->GetDeviceId(),

View file

@ -24,10 +24,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (const auto& iter : inputMatrices)
matrices.insert(iter.second.matrix);
for (auto& node : net->FeatureNodes())
if (matrices.find(node->As<ComputationNode<ElemType>>()->ValuePtr()) != matrices.end())
if (matrices.find(node->ValuePtr()) != matrices.end())
node->NotifyFunctionValuesMBSizeModified();
for (auto& node : net->LabelNodes())
if (matrices.find(node->As<ComputationNode<ElemType>>()->ValuePtr()) != matrices.end())
if (matrices.find(node->ValuePtr()) != matrices.end())
node->NotifyFunctionValuesMBSizeModified();
}

View file

@ -150,7 +150,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
virtual void OnEpochEnd(const std::list<ComputationNodeBasePtr>& LearnableNodes,
std::list<Matrix<ElemType>>& smoothedGradient,
std::list<MatrixBasePtr>& smoothedGradients,
size_t samplesSinceLastSync
)
{
@ -165,7 +165,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (read2sync)
{
m_numSyncPerformed++;
ModelAggregationProcessing(samplesSinceLastSync, LearnableNodes, smoothedGradient, totalSamplesProcessed, secondsOnCommunication);
ModelAggregationProcessing(samplesSinceLastSync, LearnableNodes, smoothedGradients, totalSamplesProcessed, secondsOnCommunication);
m_perfReporter.OnMAPerformed(samplesSinceLastSync, totalSamplesProcessed, secondsOnCommunication);
}
@ -175,7 +175,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
virtual bool OnArrivingAtSyncPoint(
const std::list<ComputationNodeBasePtr>& LearnableNodes, /* input/output: */
std::list<Matrix<ElemType>>& smoothedGradient, /* input/output: under some setup, it will reset to zero*/
std::list<MatrixBasePtr>& smoothedGradients, /* input/output: under some setup, it will reset to zero*/
size_t samplesSinceLastSync /* input: samples processed since last sync on this worker only */
)
{
@ -190,7 +190,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (read2Sync)
{
m_numSyncPerformed++;
ModelAggregationProcessing(samplesSinceLastSync, LearnableNodes, smoothedGradient, totalSamplesProcessed, secondsOnCommunication);
ModelAggregationProcessing(samplesSinceLastSync, LearnableNodes, smoothedGradients, totalSamplesProcessed, secondsOnCommunication);
m_perfReporter.OnMAPerformed(samplesSinceLastSync, totalSamplesProcessed, secondsOnCommunication);
}
return read2Sync;
@ -199,7 +199,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
virtual void ModelAggregationProcessing(
size_t samplesSinceLastSync, /* in: */
const std::list<ComputationNodeBasePtr>& learnableNodes, /* in/out */
std::list<Matrix<ElemType>>& smoothedGradient, /* in/out */
std::list<MatrixBasePtr>& smoothedGradients, /* in/out */
size_t& totalSamplesProcessed, /* out */
float& secondsOnCommunication /* out */) = 0;
@ -346,7 +346,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void ModelAggregationProcessing(
size_t samplesSinceLastSync, /* in */
const std::list<ComputationNodeBasePtr>& learnableNodes, /* in/out */
std::list<Matrix<ElemType>>& smoothedGradient, /* in/out */
std::list<MatrixBasePtr>& smoothedGradients, /* in/out */
size_t& totalSamplesProcessed, /* out */
float& secondsOnCommunication /* out */) override
// NOTE: the variable type is determined by the interface in SGD::TrainOneEpoch

View file

@ -31,6 +31,7 @@
#include "ASGDHelper.h"
#include "CNTKLibraryInternals.h"
#include "SimpleDistGradAggregatorHelper.h"
#include "SimpleDistGradAggregator.h"
#include "V2SimpleDistGradAggregator.h"
#include "ProgressTracing.h"
@ -47,8 +48,10 @@ using namespace std;
// class SGD
// =======================================================================
template SGD<half>::SGD(const ConfigParameters&);
template SGD<float>::SGD(const ConfigParameters&);
template SGD<double>::SGD(const ConfigParameters&);
template SGD<half>::SGD(const ScriptableObjects::IConfigRecord&);
template SGD<float>::SGD(const ScriptableObjects::IConfigRecord&);
template SGD<double>::SGD(const ScriptableObjects::IConfigRecord&);
@ -223,6 +226,11 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
}
}
if (criterionNodes.front()->template Is<ComputationNode<half>>())
{
InvalidArgument("TrainOrAdaptModel: using Float16 for loss function may cause overflow, please cast to float.");
}
// This code is only relevant for the new (V2) readers. It exists because of
// a shortcoming in DecimateMinibatchInPlace, which does not yet work when inputs
// in the same minibatch have different layouts, which is something only V2 readers can
@ -333,7 +341,7 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
// initializing weights and gradient holder
// only one criterion so far TODO: support multiple ones?
auto& learnableNodes = net->LearnableParameterNodes(criterionNodes[0]);
list<Matrix<ElemType>> smoothedGradients;
list<MatrixBasePtr> smoothedGradients;
vector<double> smoothedCounts; // currently used by FSAdaGradUpdate()
size_t numParameters = 0;
@ -344,9 +352,30 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
// Note: We don't actually need the smoothedGradients if !IsParameterUpdateRequired().
// However, this is hard to fix since lots of code assumes smoothedGradients to be in the same order as learnableNodes.
// V2 API fixes this.
smoothedGradients.push_back(Matrix<ElemType>(node->Value().GetNumRows(),
node->Value().GetNumCols(),
net->GetDeviceId()));
MatrixBasePtr smoothedGradientPtr;
size_t numRows = node->Value().GetNumRows();
size_t numCols = node->Value().GetNumCols();
if (std::is_same<ElemType, half>())
{
// For half parameters, keep the SGD state in float.
// Allocate 3x the columns: [0,n) smoothed gradient, [n,2n) float gradient scratch, [2n,3n) float master copy of the parameters
const size_t c_smoothed_gradients_factor = 3;
shared_ptr<Matrix<float>> compoundMatrixPtr = std::make_shared<Matrix<float>>(numRows,
numCols * c_smoothed_gradients_factor,
net->GetDeviceId());
// Initialize float parameters
auto parameterMatrix = compoundMatrixPtr->ColumnSlice(2 * numCols, numCols);
parameterMatrix.CastAssignValuesOf(node->Value());
smoothedGradientPtr = compoundMatrixPtr;
}
else
{
smoothedGradientPtr = std::make_shared<Matrix<ElemType>>(numRows,
numCols,
net->GetDeviceId());
}
smoothedGradients.push_back(smoothedGradientPtr);
smoothedCounts.push_back(0);
if (node->IsParameterUpdateRequired())
{
@ -987,7 +1016,7 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
const std::vector<ComputationNodeBasePtr>& evaluationNodes,
StreamMinibatchInputs* inputMatrices, // TODO: why is this a pointer?
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients, vector<double>& smoothedCounts,
std::list<MatrixBasePtr>& smoothedGradients, vector<double>& smoothedCounts,
/*out*/ EpochCriterion& epochCriterion,
/*out*/ std::vector<EpochCriterion>& epochEvalErrors,
const std::string& prefixMsg,
@ -1389,7 +1418,25 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
if (node->IsParameterUpdateRequired())
{
#ifdef _DEBUG
if (smoothedGradientIter->HasNan("TrainOneEpoch/UpdateWeights(): "))
bool hasNan = false;
if (std::is_same<ElemType, half>())
{
// Get the smoothed gradient slice from the compound matrix
auto compoundMatrixPtr = dynamic_pointer_cast<Matrix<float>> (*smoothedGradientIter);
if (compoundMatrixPtr)
{
size_t numCols = dynamic_pointer_cast<ComputationNode<ElemType>>(node)->Value().GetNumCols();
auto smoothedGradient = compoundMatrixPtr->ColumnSlice(0, numCols);
hasNan = smoothedGradient.HasNan("TrainOneEpoch/UpdateWeights(): ");
}
}
else
{
auto smoothedGradient = dynamic_pointer_cast<Matrix<ElemType>> (*smoothedGradientIter);
hasNan = smoothedGradient && smoothedGradient->HasNan("TrainOneEpoch/UpdateWeights(): ");
}
if (hasNan)
LogicError("%ls %ls operation has NaNs in smoothedGradient.", node->NodeName().c_str(), node->OperationName().c_str());
#endif
double nodeDependentLearningRatePerSample = learnRatePerSample * node->GetLearningRateMultiplier();
@ -1811,7 +1858,7 @@ double SGD<ElemType>::SearchForBestLearnRate(ComputationNetworkPtr net,
const std::vector<ComputationNodeBasePtr>& evaluationNodes,
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients, vector<double> smoothedCounts,
std::list<MatrixBasePtr>& smoothedGradients, vector<double> smoothedCounts,
const bool learnRateInitialized,
const double largestPrevLearnRatePerSample)
{
@ -1985,7 +2032,7 @@ size_t SGD<ElemType>::AdaptiveMinibatchSizing(ComputationNetworkPtr net,
const std::vector<ComputationNodeBasePtr>& evaluationNodes,
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients, vector<double> smoothedCounts,
std::list<MatrixBasePtr>& smoothedGradients, vector<double> smoothedCounts,
const double learningRateAdjustmentFactor)
{
size_t minMinibatchSize = initialMinibatchSize;
@ -2086,7 +2133,7 @@ size_t SGD<ElemType>::SearchForBestMinibatchSize(ComputationNetworkPtr net,
const std::vector<ComputationNodeBasePtr>& evaluationNodes,
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients, std::vector<double> smoothedCounts,
std::list<MatrixBasePtr>& smoothedGradients, std::vector<double> smoothedCounts,
const size_t minMinibatchSize, const size_t maxMinibatchSize)
{
// may happen for automatically reduced learning rates
@ -2190,7 +2237,7 @@ void SGD<ElemType>::TrainOneMiniEpochAndReloadModel(ComputationNetworkPtr net,
const std::vector<ComputationNodeBasePtr>& evaluationNodes,
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients, vector<double> smoothedCounts,
std::list<MatrixBasePtr>& smoothedGradients, vector<double> smoothedCounts,
/*out*/ EpochCriterion& epochCriterion,
/*out*/ std::vector<EpochCriterion>& epochEvalErrors,
std::string prefixMsg,
@ -2264,6 +2311,24 @@ void SGD<ElemType>::AttemptUtteranceDerivativeFeatures(ComputationNetworkPtr net
}
}
template <class ElemType>
std::shared_ptr<IDistGradAggregator<ElemType>> _GetAllReduceDistGradAggregator(const MPIWrapperPtr& mpi, int nBits, bool zeroThresholdFor1Bit, bool useAsyncAggregation, int traceLevel, int syncStatsTrace)
{
if (Globals::UseV2Aggregator())
{
auto communicator = ::CNTK::QuantizedMPICommunicator(zeroThresholdFor1Bit, true /*useQuantizationForSelfStripe*/, nBits);
return std::make_shared<V2AllReduceDistGradAggregator<ElemType>>(communicator, useAsyncAggregation, traceLevel, syncStatsTrace);
}
else
return std::make_shared<AllReduceDistGradAggregator<ElemType>>(mpi, nBits, zeroThresholdFor1Bit, true /*useQuantizationForSelfStripe*/, useAsyncAggregation, traceLevel, syncStatsTrace);
}
template <>
std::shared_ptr<IDistGradAggregator<half>> _GetAllReduceDistGradAggregator<half>(const MPIWrapperPtr& mpi, int nBits, bool zeroThresholdFor1Bit, bool useAsyncAggregation, int traceLevel, int syncStatsTrace)
{
RuntimeError("SGD - half not supported for quantization!");
}
template <class ElemType>
void SGD<ElemType>::InitDistGradAgg(int numEvalNodes, int numGradientBits, int deviceId, int traceLevel)
{
@ -2274,13 +2339,7 @@ void SGD<ElemType>::InitDistGradAgg(int numEvalNodes, int numGradientBits, int d
if (traceLevel > 0)
fprintf(stderr, "Initializing dataParallelSGD for %d-bit quantization.\n", numGradientBits);
#ifdef CNTK_PARALLEL_TRAINING_SUPPORT
if (Globals::UseV2Aggregator())
{
auto communicator = ::CNTK::QuantizedMPICommunicator(m_zeroThresholdFor1Bit, true, numGradientBits);
m_distGradAgg = std::make_shared<V2AllReduceDistGradAggregator<ElemType>>(communicator, m_bufferedAsyncGradientAggregation, traceLevel, m_syncStatsTrace);
}
else
m_distGradAgg = std::make_shared<AllReduceDistGradAggregator<ElemType>>(m_mpi, numGradientBits, m_zeroThresholdFor1Bit, true /*useQuantizationForSelfStripe*/, m_bufferedAsyncGradientAggregation, traceLevel, m_syncStatsTrace);
m_distGradAgg = _GetAllReduceDistGradAggregator<ElemType>(m_mpi, numGradientBits, m_zeroThresholdFor1Bit, m_bufferedAsyncGradientAggregation, traceLevel, m_syncStatsTrace);
#else
RuntimeError("Gradient quantization is unsupported in CNTK binaries built without quantized gradient aggregation support!");
#endif // !CNTK_PARALLEL_TRAINING_SUPPORT
@ -2289,15 +2348,38 @@ void SGD<ElemType>::InitDistGradAgg(int numEvalNodes, int numGradientBits, int d
{
if (traceLevel > 0)
fprintf(stderr, "Initializing dataParallelSGD with FP%d aggregation.\n", numGradientBits);
if (Globals::UseV2Aggregator()) // Currently used to check V2 against baselines.
m_distGradAgg = std::make_shared<V2SimpleDistGradAggregator<ElemType>>(m_mpi, m_bufferedAsyncGradientAggregation, deviceId, m_syncStatsTrace, ::CNTK::MPICommunicator(m_packThresholdSizeInBytes, m_useFP16AllReduce));
else
m_distGradAgg = std::make_shared<SimpleDistGradAggregator<ElemType>>(m_mpi, m_bufferedAsyncGradientAggregation, deviceId, m_syncStatsTrace, m_packThresholdSizeInBytes);
m_distGradAgg = GetSimpleDistGradAggregator<ElemType>(m_mpi, m_bufferedAsyncGradientAggregation, deviceId, m_syncStatsTrace, m_packThresholdSizeInBytes, m_useFP16AllReduce);
}
m_gradHeader.reset(DistGradHeader::Create(numEvalNodes), [](DistGradHeader* ptr) { DistGradHeader::Destroy(ptr); });
}
template <class ElemType>
shared_ptr<IMASGD<ElemType>> _GetBlockMomentumSGD(const MPIWrapperPtr& mpi, size_t traceLevel, DEVICEID_TYPE devID, bool useNesterovBlockMomentum, bool resetSGDMomentum, double blockLearningRate, double blockMomentumAsTimeConstant, size_t modelAggregationBlockSize)
{
assert(!Globals::UseV2Aggregator());
return make_shared<BlockMomentumSGD<ElemType>>(mpi, traceLevel, devID, useNesterovBlockMomentum, resetSGDMomentum, blockLearningRate, blockMomentumAsTimeConstant, modelAggregationBlockSize);
}
template <>
shared_ptr<IMASGD<half>> _GetBlockMomentumSGD<half>(const MPIWrapperPtr& mpi, size_t traceLevel, DEVICEID_TYPE devID, bool useNesterovBlockMomentum, bool resetSGDMomentum, double blockLearningRate, double blockMomentumAsTimeConstant, size_t modelAggregationBlockSize)
{
assert(!Globals::UseV2Aggregator());
RuntimeError("SGD - half not supported when useV2Aggregator is false!");
}
template <class ElemType>
shared_ptr<IMASGD<ElemType>> _GetBasicModelAveragingSGD(const MPIWrapperPtr& mpi, size_t traceLevel, DEVICEID_TYPE devID)
{
return make_shared<BasicModelAveragingSGD<ElemType>>(mpi, traceLevel, devID);
}
template <>
shared_ptr<IMASGD<half>> _GetBasicModelAveragingSGD<half>(const MPIWrapperPtr& mpi, size_t traceLevel, DEVICEID_TYPE devID)
{
RuntimeError("SGD - half not supported for modelAveragingSGD");
}
template <class ElemType>
void SGD<ElemType>::InitModelAggregationHandler(int traceLevel, DEVICEID_TYPE devID)
{
@ -2307,7 +2389,7 @@ void SGD<ElemType>::InitModelAggregationHandler(int traceLevel, DEVICEID_TYPE de
}
if (GetParallelizationMethod() == ParallelizationMethod::modelAveragingSGD)
{
m_pMASGDHelper = make_shared<BasicModelAveragingSGD<ElemType>>(m_mpi, traceLevel, devID);
m_pMASGDHelper = _GetBasicModelAveragingSGD<ElemType>(m_mpi, traceLevel, devID);
}
else if (GetParallelizationMethod() == ParallelizationMethod::blockMomentumSGD)
{
@ -2329,7 +2411,7 @@ void SGD<ElemType>::InitModelAggregationHandler(int traceLevel, DEVICEID_TYPE de
m_modelAggregationBlockSize);
}
else
m_pMASGDHelper = make_shared<BlockMomentumSGD<ElemType>>(m_mpi, traceLevel, devID,
m_pMASGDHelper = _GetBlockMomentumSGD<ElemType>(m_mpi, traceLevel, devID,
m_useNesterovBlockMomentum, m_resetSGDMomentum,
m_blockLearningRate, m_blockMomentumAsTimeConstant,
m_modelAggregationBlockSize);
@ -2341,6 +2423,47 @@ void SGD<ElemType>::InitModelAggregationHandler(int traceLevel, DEVICEID_TYPE de
// UpdateWeights() - actual weight update, implementing various update rules
template <class ElemType>
void SGD<ElemType>::UpdateWeights(Matrix<ElemType>& functionValues, Matrix<ElemType>& gradientValues,
MatrixBasePtr& smoothedGradientValues, double& smoothedCount,
const double learnRatePerSample, const double momentumPerSample,
size_t actualMBSize,
const double L2RegWeight, const double L1RegWeight,
const bool needAveMultiplier,
const bool useNesterovMomentum) const
{
if (std::is_same<ElemType, half>())
{
// Get matrices from the compound matrix
auto compoundMatrixPtr = dynamic_pointer_cast<Matrix<float>> (smoothedGradientValues);
size_t numCols = functionValues.GetNumCols();
auto smoothedGradientMatrix = compoundMatrixPtr->ColumnSlice(0, numCols);
auto tempGradientMatrix = compoundMatrixPtr->ColumnSlice(numCols, numCols);
auto parameterMatrix = compoundMatrixPtr->ColumnSlice(2 * numCols, numCols);
// Cast gradients to float
tempGradientMatrix.CastAssignValuesOf(gradientValues);
// Update
TypedUpdateWeights<float>(parameterMatrix, tempGradientMatrix, smoothedGradientMatrix, smoothedCount,
learnRatePerSample, momentumPerSample, actualMBSize, L2RegWeight, L1RegWeight,
needAveMultiplier, useNesterovMomentum);
// Cast parameter back to half
functionValues.CastAssignValuesOf(parameterMatrix);
}
else
{
auto sgv = dynamic_pointer_cast<Matrix<ElemType>> (smoothedGradientValues);
TypedUpdateWeights<>(functionValues, gradientValues, *sgv, smoothedCount,
learnRatePerSample, momentumPerSample, actualMBSize, L2RegWeight, L1RegWeight,
needAveMultiplier, useNesterovMomentum);
}
}
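For orientation, the half branch above keeps a float master copy of the parameters inside the compound smoothed-gradient matrix, performs the update in float, and casts the result back to half. The snippet below is a minimal standalone sketch of that pattern only; it is not CNTK code, and the names and the fp16 emulation are illustrative assumptions.

// Sketch of the float-master-weights pattern used for half parameters above.
// "Half" is emulated by truncating the float mantissa; in CNTK the real half type
// and Matrix<>::ColumnSlice / CastAssignValuesOf do this work.
#include <cstdint>
#include <cstring>
#include <vector>

static float toHalfAndBack(float v)                  // lossy round-trip standing in for fp16 storage
{
    std::uint32_t bits;
    std::memcpy(&bits, &v, sizeof(bits));
    bits &= 0xFFFFE000u;                             // keep ~10 mantissa bits, roughly fp16 precision
    std::memcpy(&v, &bits, sizeof(bits));
    return v;
}

void sgdStepMixedPrecision(std::vector<float>& halfWeights,         // fp16 model parameters (emulated)
                           const std::vector<float>& halfGradients, // fp16 gradients (emulated)
                           std::vector<float>& masterWeights,       // float master copy kept by the trainer
                           float learnRatePerSample)
{
    for (size_t i = 0; i < masterWeights.size(); ++i)
    {
        float g = halfGradients[i];                              // cast the gradient up to float
        masterWeights[i] -= learnRatePerSample * g;              // update in float, as TypedUpdateWeights<float> does
        halfWeights[i] = toHalfAndBack(masterWeights[i]);        // cast the updated parameter back down to half
    }
}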
template <class ElemType1>
template <class ElemType>
void SGD<ElemType1>::TypedUpdateWeights(Matrix<ElemType>& functionValues, Matrix<ElemType>& gradientValues,
Matrix<ElemType>& smoothedGradientValues, double& smoothedCount,
const double learnRatePerSample, const double momentumPerSample,
size_t actualMBSize,
@ -2363,7 +2486,7 @@ void SGD<ElemType>::UpdateWeights(Matrix<ElemType>& functionValues, Matrix<ElemT
assert(actualMBSize > 0);
// clipping gradients to prevent outliers
ClipGradient(gradientValues, actualMBSize);
ClipGradient<ElemType>(gradientValues, actualMBSize);
GradientsUpdateType adpType = GradUpdateType();
double noiseStd = GradientUpdateNoiseStd();
@ -2453,8 +2576,9 @@ void SGD<ElemType>::UpdateWeights(Matrix<ElemType>& functionValues, Matrix<ElemT
}
// protected:
template <class ElemType1>
template <class ElemType>
void SGD<ElemType>::ClipGradient(Matrix<ElemType>& gradient, const size_t actualMBSize) const
void SGD<ElemType1>::ClipGradient(Matrix<ElemType>& gradient, const size_t actualMBSize) const
{
if (m_clippingThresholdPerSample != std::numeric_limits<double>::infinity())
{
@ -2474,10 +2598,30 @@ void SGD<ElemType>::ClipGradient(Matrix<ElemType>& gradient, const size_t actual
}
}
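The body of ClipGradient is elided in this hunk; for orientation, one common form of per-sample clipping scales the configured threshold by the minibatch size and truncates each gradient element. The sketch below illustrates only that assumption and is not the CNTK implementation, which may instead rescale by the gradient norm depending on configuration; all names here are placeholders.

#include <algorithm>
#include <limits>
#include <vector>

// Clamp each gradient element to +/- (perSampleThreshold * actualMBSize).
// Element-wise truncation only; norm-based rescaling is the other common variant.
void clipGradientElementwise(std::vector<double>& gradient,
                             size_t actualMBSize,
                             double perSampleThreshold)
{
    if (perSampleThreshold == std::numeric_limits<double>::infinity())
        return;                                                  // clipping disabled, same early-out as above
    const double maxAbs = perSampleThreshold * static_cast<double>(actualMBSize);
    for (double& g : gradient)
        g = std::max(-maxAbs, std::min(maxAbs, g));
}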
template <class ElemType>
static void SaveSmoothedGradient(File& fstream, MatrixBasePtr& smoothedGradient)
{
auto smoothedGradientPtr = dynamic_pointer_cast<Matrix<ElemType>> (smoothedGradient);
if (!smoothedGradientPtr)
RuntimeError("Failed to cast, type mismatch");
const Matrix<ElemType>& smoothedGradientValues = *smoothedGradientPtr;
fstream << smoothedGradientValues;
}
template <class ElemType>
static void LoadSmoothedGradient(File& fstream, MatrixBasePtr& smoothedGradient)
{
auto smoothedGradientPtr = dynamic_pointer_cast<Matrix<ElemType>> (smoothedGradient);
if (!smoothedGradientPtr)
RuntimeError("Failed to cast, type mismatch");
Matrix<ElemType>& smoothedGradientValues = *smoothedGradientPtr;
fstream >> smoothedGradientValues;
}
template <class ElemType>
void SGD<ElemType>::SaveCheckPointInfo(const size_t epoch, const size_t totalSamplesSeen,
const double learnRatePerSample,
const std::list<Matrix<ElemType>>& smoothedGradients,
const std::list<MatrixBasePtr>& smoothedGradients,
const std::vector<double>& smoothedCounts,
const double prevCriterion,
const size_t minibatchSize)
@ -2510,10 +2654,12 @@ void SGD<ElemType>::SaveCheckPointInfo(const size_t epoch, const size_t totalSam
fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BGradient");
for (auto smoothedGradientIter = smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++)
for (auto smoothedGradient : smoothedGradients)
{
const Matrix<ElemType>& smoothedGradientValues = *smoothedGradientIter;
fstream << smoothedGradientValues;
if (std::is_same<ElemType, half>())
SaveSmoothedGradient<float>(fstream, smoothedGradient);
else
SaveSmoothedGradient<ElemType>(fstream, smoothedGradient);
}
fstream.PutMarker(FileMarker::fileMarkerEndSection, L"EGradient");
@ -2553,7 +2699,7 @@ template <class ElemType>
bool SGD<ElemType>::TryLoadCheckPointInfo(const size_t epochNumber,
/*out*/ size_t& totalSamplesSeen,
/*out*/ double& learnRatePerSample,
std::list<Matrix<ElemType>>& smoothedGradients,
std::list<MatrixBasePtr>& smoothedGradients,
std::vector<double>& smoothedCounts,
/*out*/ double& prevCriterion,
/*out*/ size_t& minibatchSize)
@ -2582,7 +2728,7 @@ template <class ElemType>
void SGD<ElemType>::LoadCheckPointInfo(const size_t epochNumber,
/*out*/ size_t& totalSamplesSeen,
/*out*/ double& learnRatePerSample,
std::list<Matrix<ElemType>>& smoothedGradients,
std::list<MatrixBasePtr>& smoothedGradients,
std::vector<double>& smoothedCounts,
/*out*/ double& prevCriterion,
/*out*/ size_t& minibatchSize)
@ -2600,6 +2746,9 @@ void SGD<ElemType>::LoadCheckPointInfo(const size_t epochNumber,
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EVersion");
}
if (ckpVersion > CURRENT_CNTK_CHECKPOINT_VERSION)
RuntimeError("The checkpoint file has a newer format version (%d) than this CNTK version can handle (%d).", (int)ckpVersion, (int)CURRENT_CNTK_CHECKPOINT_VERSION);
fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BCKP");
fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BLearnRate");
@ -2618,10 +2767,12 @@ void SGD<ElemType>::LoadCheckPointInfo(const size_t epochNumber,
fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BGradient");
for (auto smoothedGradientIter = smoothedGradients.begin(); smoothedGradientIter != smoothedGradients.end(); smoothedGradientIter++)
for (auto smoothedGradient : smoothedGradients)
{
Matrix<ElemType>& smoothedGradientValues = *smoothedGradientIter;
fstream >> smoothedGradientValues;
if (std::is_same<ElemType, half>())
LoadSmoothedGradient<float>(fstream, smoothedGradient);
else
LoadSmoothedGradient<ElemType>(fstream, smoothedGradient);
}
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EGradient");
@ -2824,6 +2975,7 @@ void SGD<ElemType>::MarkDropoutNodesEvalTimeStampAsOutdated(const ComputationNet
nodeIter->SetEvalTimeStampOutdatedWrtAll();
}
template class SGD<half>;
template class SGD<float>;
template class SGD<double>;
@ -3306,12 +3458,14 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
static size_t GetSizeOfPrecision(const ScriptableObjects::IConfigRecordPtr configp)
{
wstring precision = configp->Get(L"precision");
if (precision == L"float")
if (precision == L"float16")
return sizeof(half);
else if (precision == L"float")
return sizeof(float);
else if (precision == L"double")
return sizeof(double);
else
RuntimeError("invalid value '%ls' for 'precision', must be 'float' or 'double'", precision.c_str());
RuntimeError("invalid value '%ls' for 'precision', must be 'float16' or 'float' or 'double'", precision.c_str());
}
SGDParams::SGDParams(const ScriptableObjects::IConfigRecordPtr configp)

View file

@ -25,7 +25,8 @@ using namespace std; // ugh! TODO: get rid of this from .h files!!!
#define CNTK_CHECKPOINT_VERSION_1 1 // 1 -> no version number
#define CNTK_CHECKPOINT_VERSION_2 2
#define CURRENT_CNTK_CHECKPOINT_VERSION CNTK_CHECKPOINT_VERSION_2
#define CNTK_CHECKPOINT_VERSION_3 3 // float smoothed gradients for float16/half parameters
#define CURRENT_CNTK_CHECKPOINT_VERSION CNTK_CHECKPOINT_VERSION_3
namespace CNTK { namespace Internal {
// Forward declarations.
@ -442,7 +443,7 @@ protected:
const std::vector<ComputationNodeBasePtr>& evaluationNodes,
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients, std::vector<double> smoothedCounts,
std::list<MatrixBasePtr>& smoothedGradients, std::vector<double> smoothedCounts,
const bool learnRateInitialized,
const double largestPrevLearnRatePerSample);
@ -458,7 +459,7 @@ protected:
const std::vector<ComputationNodeBasePtr>& evaluationNodes,
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients, std::vector<double> smoothedCounts,
std::list<MatrixBasePtr>& smoothedGradients, std::vector<double> smoothedCounts,
/*out*/ EpochCriterion& epochCriterion,
/*out*/ std::vector<EpochCriterion>& epochEvalErrors,
std::string prefixMsg,
@ -478,7 +479,7 @@ protected:
const std::vector<ComputationNodeBasePtr>& evaluationNodes,
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients, std::vector<double> smoothedCounts,
std::list<MatrixBasePtr>& smoothedGradients, std::vector<double> smoothedCounts,
const double learningRateAdjustmentFactor);
// uses a small percentage of training data of minibatch to
@ -496,7 +497,7 @@ protected:
const std::vector<ComputationNodeBasePtr>& evaluationNodes,
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients, std::vector<double> smoothedCounts,
std::list<MatrixBasePtr>& smoothedGradients, std::vector<double> smoothedCounts,
const size_t minMinibatchSize, const size_t maxMinibatchSize);
// Attempts to compute the error signal for the whole utterance, which will
@ -523,7 +524,7 @@ protected:
const std::vector<ComputationNodeBasePtr>& evaluationNodes,
StreamMinibatchInputs* inputMatrices,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradients, std::vector<double>& smoothedCounts,
std::list<MatrixBasePtr>& smoothedGradients, std::vector<double>& smoothedCounts,
/*out*/ EpochCriterion& epochCriterion,
/*out*/ std::vector<EpochCriterion>& epochEvalErrors,
const std::string& prefixMsg = "",
@ -534,26 +535,37 @@ protected:
void InitDistGradAgg(int numEvalNodes, int numGradientBits, int deviceId, int traceLevel);
void InitModelAggregationHandler(int traceLevel, DEVICEID_TYPE devID);
public:
private:
// UpdateWeights() - actual weight update, implementing various update rules
void UpdateWeights(Matrix<ElemType>& functionValues, Matrix<ElemType>& gradientValues,
Matrix<ElemType>& smoothedGradient, double& smoothedCount,
MatrixBasePtr& smoothedGradient, double& smoothedCount,
const double learnRatePerSample, const double momentumPerSample,
size_t actualMBSize,
const double L2RegWeight, const double L1RegWeight,
const bool needAveMultiplier,
const bool useNesterovMomentum) const;
template<class ElemType2 = ElemType>
void TypedUpdateWeights(Matrix<ElemType2>& functionValues, Matrix<ElemType2>& gradientValues,
Matrix<ElemType2>& smoothedGradient, double& smoothedCount,
const double learnRatePerSample, const double momentumPerSample,
size_t actualMBSize,
const double L2RegWeight, const double L1RegWeight,
const bool needAveMultiplier,
const bool useNesterovMomentum) const;
public:
// return -1 if nothing exists
int DetermineStartEpoch(const bool makeMode);
wstring GetModelNameForEpoch(const int epoch, bool bLastModel = false) const;
protected:
void ClipGradient(Matrix<ElemType>& gradient, const size_t actualMBSize) const;
template<class ElemType2 = ElemType>
void ClipGradient(Matrix<ElemType2>& gradient, const size_t actualMBSize) const;
void SaveCheckPointInfo(const size_t epoch, const size_t totalSamplesSeen, // TODO: combine totalSamplesSeen and prevCriterion into a EpochCriterion type
const double learnRatePerSample,
const std::list<Matrix<ElemType>>& smoothedGradients,
const std::list<MatrixBasePtr>& smoothedGradients,
const std::vector<double>& smoothedCounts,
const double prevCriterion,
const size_t minibatchSize);
@ -561,14 +573,14 @@ protected:
bool TryLoadCheckPointInfo(const size_t epochNumber,
/*out*/ size_t& totalSamplesSeen,
/*out*/ double& learnRatePerSample,
std::list<Matrix<ElemType>>& smoothedGradients,
std::list<MatrixBasePtr>& smoothedGradients,
std::vector<double>& smoothedCounts,
/*out*/ double& prevCriterion,
/*out*/ size_t& minibatchSize);
void LoadCheckPointInfo(const size_t epochNumber,
/*out*/ size_t& totalSamplesSeen,
/*out*/ double& learnRatePerSample,
std::list<Matrix<ElemType>>& smoothedGradients,
std::list<MatrixBasePtr>& smoothedGradients,
std::vector<double>& smoothedCounts,
/*out*/ double& prevCriterion,
/*out*/ size_t& minibatchSize);

View file

@ -137,6 +137,7 @@
<ClInclude Include="MASGD.h" />
<ClInclude Include="PostComputingActions.h" />
<ClInclude Include="SimpleDistGradAggregator.h" />
<ClInclude Include="SimpleDistGradAggregatorHelper.h" />
<ClInclude Include="SimpleEvaluator.h" />
<ClInclude Include="SimpleOutputWriter.h" />
<ClInclude Include="SGD.h" />
@ -149,6 +150,7 @@
<ClCompile Include="PostComputingActions.cpp" />
<ClCompile Include="Profiler.cpp" />
<ClCompile Include="SGD.cpp" />
<ClCompile Include="SimpleDistGradAggregatorHelper.cpp" />
<ClCompile Include="stdafx.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />

View file

@ -16,6 +16,9 @@
<ClCompile Include="ASGDHelper.cpp">
<Filter>Parallelization</Filter>
</ClCompile>
<ClCompile Include="SimpleDistGradAggregatorHelper.cpp">
<Filter>Parallelization</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\Common\Include\fileutil.h">
@ -144,6 +147,9 @@
<ClInclude Include="AccumulatorAggregation.h">
<Filter>Parallelization</Filter>
</ClInclude>
<ClInclude Include="SimpleDistGradAggregatorHelper.h">
<Filter>Parallelization</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Filter Include="Common">

View file

@ -0,0 +1,82 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma warning(disable : 4267) // conversion from size_t to int or other types
#include "Basics.h"
#include "MPIWrapper.h"
#include "Matrix.h"
#include "SimpleDistGradAggregatorHelper.h"
#include "DistGradHeader.h"
#include "IDistGradAggregator.h"
#include "SimpleDistGradAggregator.h"
#include "V2SimpleDistGradAggregator.h"
namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
std::shared_ptr<IDistGradAggregator<ElemType>> GetSimpleDistGradAggregator(
const MPIWrapperPtr& mpi,
bool useAsyncAggregation,
int deviceId,
int syncStatsTrace,
size_t packThresholdSizeInBytes,
bool useFP16AllReduce)
{
if (Globals::UseV2Aggregator())
return std::make_shared<V2SimpleDistGradAggregator<ElemType>>(
mpi,
useAsyncAggregation,
deviceId,
syncStatsTrace,
::CNTK::MPICommunicator(packThresholdSizeInBytes, useFP16AllReduce));
else
return std::make_shared<SimpleDistGradAggregator<ElemType>>(
mpi,
useAsyncAggregation,
deviceId,
syncStatsTrace,
packThresholdSizeInBytes);
}
template <>
std::shared_ptr<IDistGradAggregator<half>> GetSimpleDistGradAggregator<half>(
const MPIWrapperPtr& mpi,
bool useAsyncAggregation,
int deviceId,
int syncStatsTrace,
size_t packThresholdSizeInBytes,
bool useFP16AllReduce)
{
if (Globals::UseV2Aggregator())
return std::make_shared<V2SimpleDistGradAggregator<half>>(
mpi,
useAsyncAggregation,
deviceId,
syncStatsTrace,
::CNTK::MPICommunicator(packThresholdSizeInBytes, useFP16AllReduce));
else
RuntimeError("SGD - half not supported when useV2Aggregator is false!");
}
template std::shared_ptr<IDistGradAggregator<float>> GetSimpleDistGradAggregator<float>(
const MPIWrapperPtr& mpi,
bool useAsyncAggregation,
int deviceId,
int syncStatsTrace,
size_t packThresholdSizeInBytes,
bool useFP16AllReduce);
template std::shared_ptr<IDistGradAggregator<double>> GetSimpleDistGradAggregator<double>(
const MPIWrapperPtr& mpi,
bool useAsyncAggregation,
int deviceId,
int syncStatsTrace,
size_t packThresholdSizeInBytes,
bool useFP16AllReduce);
}}}

View file

@ -0,0 +1,24 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "Constants.h"
#include "IDistGradAggregator.h"
namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
std::shared_ptr<IDistGradAggregator<ElemType>> GetSimpleDistGradAggregator(
const MPIWrapperPtr& mpi,
bool useAsyncAggregation,
int deviceId,
int syncStatsTrace,
size_t packThresholdSizeInBytes = DEFAULT_PACK_THRESHOLD_SIZE_IN_BYTES,
bool useFP16AllReduce = false);
}}}
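A hypothetical call site for the helper declared above; mpi and deviceId are assumed to come from the surrounding trainer, and the last two arguments fall back to the header defaults:

#include <memory>
#include "SimpleDistGradAggregatorHelper.h"   // declares GetSimpleDistGradAggregator

using namespace Microsoft::MSR::CNTK;

// Illustrative only: the helper hides the V1-vs-V2 aggregator choice from the caller.
std::shared_ptr<IDistGradAggregator<float>> MakeAggregator(const MPIWrapperPtr& mpi, int deviceId)
{
    return GetSimpleDistGradAggregator<float>(
        mpi,
        /*useAsyncAggregation=*/ false,
        deviceId,
        /*syncStatsTrace=*/ 0);               // packThresholdSizeInBytes and useFP16AllReduce use the defaults above
}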

View file

@ -5,8 +5,6 @@
#pragma once
#include "V2SimpleDistGradAggregator.h"
#include "AccumulatorAggregation.h"
#include "Basics.h"
#include "DataReader.h"
@ -18,7 +16,7 @@
#include "ProgressTracing.h"
#include "DistGradHeader.h"
#include "IDistGradAggregator.h"
#include "SimpleDistGradAggregator.h"
#include "SimpleDistGradAggregatorHelper.h"
#include "Criterion.h"
#include "Globals.h"
@ -167,10 +165,7 @@ public:
DistGradHeader::Destroy(ptr);
});
if (Globals::UseV2Aggregator())
m_distGradAgg = make_shared<V2SimpleDistGradAggregator<ElemType>>(m_mpi, false /*useAsyncAggregation*/, m_net->GetDeviceId(), 0 /*syncStatsTrace*/, ::CNTK::MPICommunicator());
else
m_distGradAgg = make_shared<SimpleDistGradAggregator<ElemType>>(m_mpi, false /*useAsyncAggregation*/, m_net->GetDeviceId(), 0 /*syncStatsTrace*/);
m_distGradAgg = GetSimpleDistGradAggregator<ElemType>(m_mpi, false /*useAsyncAggregation*/, m_net->GetDeviceId(), 0 /*syncStatsTrace*/);
}
m_gradHeader->numEvalNode = evalNodes.size();

View file

@ -109,7 +109,7 @@ public:
// Synchronize the Quantization compute stream with the completion of
// compute of the gradient matrices on the main compute stream
mainStreamSyncEvent->SynchronizeDataTransferFetchStreamWithEvent<ElemType>();
mainStreamSyncEvent->SynchronizeDataTransferFetchStreamWithEvent<float>();
delete mainStreamSyncEvent;
AggregateGradientsImpl(newGradients, newGradHeader, showSyncPerfStats);
@ -185,7 +185,7 @@ private:
if (m_useAsyncAggregation)
{
std::unique_ptr<MatrixComputeStreamEvent> mainStreamSyncEvent(MatrixComputeStreamEvent::Create(deviceId));
mainStreamSyncEvent->SynchronizeDataTransferFetchStreamWithEvent<ElemType>();
mainStreamSyncEvent->SynchronizeDataTransferFetchStreamWithEvent<float>();
}
}

View file

@ -60,7 +60,7 @@ BOOST_AUTO_TEST_CASE(CheckModelVersion)
// This is a watch guard to make sure that any change in the model version will be detected.
// If you change the CNTK model version, please do not silently adapt this test.
// Instead, please do notify the CNTK release team (AlexeyO, Wolfgang, Zhou, Mark) to prepare required steps for the next release.
BOOST_REQUIRE_MESSAGE(CURRENT_CNTK_MODEL_VERSION == 30, "The model version has been changed. Before making changes in this test, please first notify the CNTK release team to prepare required steps in the next release. Thanks!\n");
BOOST_REQUIRE_MESSAGE(CURRENT_CNTK_MODEL_VERSION == 31, "The model version has been changed. Before making changes in this test, please first notify the CNTK release team to prepare required steps in the next release. Thanks!\n");
}
BOOST_AUTO_TEST_CASE(EvalConstantPlusTest)

View file

@ -61,7 +61,7 @@
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Cntk.Core-$(CntkComponentVersion).lib;Cntk.Math-$(CntkComponentVersion).lib;Cntk.Common-$(CntkComponentVersion).lib;Cntk.Actions-$(CntkComponentVersion).lib;Cntk.ComputationNetwork-$(CntkComponentVersion).lib;Cntk.SequenceTrainingLib-$(CntkComponentVersion).lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>Cntk.Core-$(CntkComponentVersion).lib;Cntk.Math-$(CntkComponentVersion).lib;Cntk.Common-$(CntkComponentVersion).lib;Cntk.Actions-$(CntkComponentVersion).lib;Cntk.ComputationNetwork-$(CntkComponentVersion).lib;Cntk.SequenceTrainingLib-$(CntkComponentVersion).lib;Cntk.SGD-$(CntkComponentVersion).lib;%(AdditionalDependencies)</AdditionalDependencies>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalLibraryDirectories>$(MSMPI_LIB64);$(OutDir);$(BOOST_LIB_PATH);$(NvmlLibPath)</AdditionalLibraryDirectories>
<DelayLoadDLLs>Cntk.Math-$(CntkComponentVersion).dll;msmpi.dll</DelayLoadDLLs>