diff --git a/Common/Eval.cpp b/Common/Eval.cpp
index 3704f27c2..d29d04584 100644
--- a/Common/Eval.cpp
+++ b/Common/Eval.cpp
@@ -122,4 +122,4 @@ void Eval<ElemType>::ResetState()
 
 template class Eval<float>;
 template class Eval<double>;
-}}}
\ No newline at end of file
+}}}
diff --git a/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h b/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h
index 27d9cab86..4d3aa2191 100644
--- a/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h
+++ b/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h
@@ -204,15 +204,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        typedef ComputationNodeNonLooping<ElemType> Base; UsingComputationNodeMembers;
    public:
        //virtual ComputationNodeBase * NewThis(DEVICEID_TYPE deviceId, const wstring & name) = 0;
-        PreComputedNode(DEVICEID_TYPE deviceId, const wstring & name) : Base(deviceId, name)
-        {
-            // further initializations
-            m_hasComputed = false;
-        }
+        PreComputedNode(DEVICEID_TYPE deviceId, const wstring & name) :
+            Base(deviceId, name),
+            m_hasComputed(false)
+        { }
 
        // the interface through which this node is operated on consists of these two functions
+
+        // check whether the node has already undergone precomputation
        virtual bool HasComputed() const { return m_hasComputed; }
-        virtual void MarkComputed(const bool hasComputed)   // override this for further finalizing operation
+
+        // call this with 'false' at the start of accumulation and with 'true' at the end
+        // This is used for resetting and updating from accumulators.
+        virtual void MarkComputed(const bool hasComputed)
        {
            m_hasComputed = hasComputed;
        }
@@ -223,7 +227,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        {
            Base::SaveToFile(fstream);
            fstream << m_hasComputed;
-            fstream << m_functionValues;
+            fstream << m_functionValues;    // TODO: why serialize if not yet computed?
        }
 
        virtual void LoadFromFile(File& fstream, size_t modelVersion) override
@@ -271,34 +275,46 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        bool m_hasComputed;
    };
 
-#define UsingPreComputedNodeMembers UsingComputationNodeMembersBoilerplate; using Base::m_hasComputed
+#define UsingPreComputedNodeMembers UsingComputationNodeMembers; using Base::m_hasComputed
 
    // -----------------------------------------------------------------------
-    // MeanNode (features)
+    // MeanInvStdDevNodeBase (features) -- common base class for Mean and InvStdDev
    // -----------------------------------------------------------------------
 
    template<class ElemType>
-    class MeanNode : public PreComputedNode<ElemType>, public NumInputs<1>
+    class MeanInvStdDevNodeBase : public PreComputedNode<ElemType>, public NumInputs<1>
    {
        typedef PreComputedNode<ElemType> Base; UsingPreComputedNodeMembers;
-        static const std::wstring TypeName() { return L"Mean"; }
+        //static const std::wstring TypeName() { return L"MeanInvStdDev (base)"; }
    public:
-        MeanNode(DEVICEID_TYPE deviceId, const wstring & name) :
+        MeanInvStdDevNodeBase(DEVICEID_TYPE deviceId, const wstring & name) :
            PreComputedNode<ElemType>(deviceId, name),
-            m_numSamples(0)
+            m_numSamples(SIZE_MAX)
        { }
 
        virtual void LoadFromFile(File& fstream, size_t modelVersion) override
        {
            Base::LoadFromFile(fstream, modelVersion);
-            m_numSamples = 0;   // TODO: intended? Not loaded from file?
+            m_numSamples = SIZE_MAX;
        }
 
        virtual void /*PreComputedNode::*/MarkComputed(const bool hasComputed)
        {
            Base::MarkComputed(hasComputed);
-            if (m_hasComputed)
+            if (!m_hasComputed)     // initialize
+            {
+                if (IsAccumulating())
+                    LogicError("%ls %ls operation: MarkComputed(false) has been called while accumulating.", NodeName().c_str(), OperationName().c_str());
                m_numSamples = 0;
+            }
+            else                    // finalize
+            {
+                if (!IsAccumulating())
+                    LogicError("%ls %ls operation: MarkComputed(true) has been called without MarkComputed(false) first.", NodeName().c_str(), OperationName().c_str());
+                if (m_numSamples == 0)
+                    LogicError("%ls %ls operation: No data accumulated during precomputation.", NodeName().c_str(), OperationName().c_str());
+                m_numSamples = SIZE_MAX;
+            }
        }
 
        virtual void ComputeInputPartial(const size_t /*inputIndex*/)
@@ -306,51 +322,68 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        {
            LogicError("Mean operation should not be involved in the gradient calculation.");
        }
 
-        virtual void /*ComputationNodeNonLooping::*/EvaluateThisNodeNonLooping() override
-        {
-            if (!m_hasComputed)
-            {
-                Matrix<ElemType> &samples = Inputs(0)->FunctionValues();
-                Matrix<ElemType> &avg = FunctionValues();
-#if 1//NANCHECK
-                samples.HasNan("Mean-Samples");
-#endif
-
-                size_t numNewSamples = samples.GetNumCols();
-                Matrix<ElemType>::MultiplyAndWeightedAdd(1.0f / (m_numSamples + numNewSamples), samples, false,
-                                                         ConstOnes(numNewSamples, 1, samples.GetDeviceId()),
-                                                         false, (ElemType)m_numSamples / (m_numSamples + numNewSamples), avg);
-
-#if 1//NANCHECK
-                avg.HasNan("Mean-avg");
-                //ones.HasNan("Mean-ones");
-#endif
-
-                m_numSamples += numNewSamples;
-            }
-        }
-
-        virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
-        {
-            Base::Validate(isFinalValidationPass);
-            if (!m_hasComputed)
-            {
-                FunctionValues().SetValue(0);   // reset accumulator
-                fprintf(stderr, "Mean: SetValue(0)\n");
-            }
-        }
-
        virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const
        {
            Base::CopyTo(nodeP, newName, flags);
            if (flags & CopyNodeFlags::copyNodeValue)
            {
-                auto node = dynamic_pointer_cast<MeanNode<ElemType>>(nodeP);
-                node->m_numSamples = m_numSamples;
+                if (m_numSamples != SIZE_MAX)
+                    LogicError("%ls %ls operation: CopyTo() called while accumulating.", NodeName().c_str(), OperationName().c_str());
+                auto node = dynamic_pointer_cast<MeanInvStdDevNodeBase<ElemType>>(nodeP);
+                node->m_numSamples = SIZE_MAX;
            }
        }
-    private:
-        size_t m_numSamples;    // TODO: move to base class?
+    protected:
+        size_t m_numSamples;    // (SIZE_MAX while outside the accumulation state)
+        bool IsAccumulating() const { return m_numSamples != SIZE_MAX; }
+    };
+
+    // -----------------------------------------------------------------------
+    // MeanNode (features)
+    // -----------------------------------------------------------------------
+
+    template<class ElemType>
+    class MeanNode : public MeanInvStdDevNodeBase<ElemType>
+    {
+        typedef MeanInvStdDevNodeBase<ElemType> Base; ComputationNodeBoilerplate; UsingPreComputedNodeMembers;
+        static const std::wstring TypeName() { return L"Mean"; }
+    public:
+        MeanNode(DEVICEID_TYPE deviceId, const wstring & name) :
+            Base(deviceId, name)
+        { }
+
+        virtual void /*PreComputedNode::*/MarkComputed(const bool hasComputed)
+        {
+            Base::MarkComputed(hasComputed);
+            if (!m_hasComputed)     // initialize accumulation
+                FunctionValues().SetValue(0);
+            // no else branch because EvaluateThisNodeNonLooping() already leaves a valid mean in m_functionValues
+        }
+
+        virtual void /*ComputationNodeNonLooping::*/EvaluateThisNodeNonLooping() override
+        {
+            if (m_hasComputed)
+                return;     // not accumulating
+
+            if (!IsAccumulating())
+                LogicError("%ls %ls operation: MarkComputed(false) has not been called.", NodeName().c_str(), OperationName().c_str());
+
+            Matrix<ElemType> &samples = Inputs(0)->FunctionValues();
+            Matrix<ElemType> &avg = FunctionValues();
+
+#if 1//NANCHECK
+            samples.HasNan("Mean-Samples");
+#endif
+            size_t numNewSamples = samples.GetNumCols();
+            Matrix<ElemType>::MultiplyAndWeightedAdd(1.0f / (m_numSamples + numNewSamples), samples, false,
+                                                     ConstOnes(numNewSamples, 1, samples.GetDeviceId()),
+                                                     false, (ElemType)m_numSamples / (m_numSamples + numNewSamples), avg);
+#if 1//NANCHECK
+            avg.HasNan("Mean-avg");
+#endif
+
+            m_numSamples += numNewSamples;
+        }
    };
 
    template class MeanNode<float>;
    template class MeanNode<double>;
 
@@ -362,32 +395,34 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    // -----------------------------------------------------------------------
 
    template<class ElemType>
-    class InvStdDevNode : public PreComputedNode<ElemType>, public NumInputs<1>
+    class InvStdDevNode : public MeanInvStdDevNodeBase<ElemType>
    {
-        typedef PreComputedNode<ElemType> Base; UsingPreComputedNodeMembers;
+        typedef MeanInvStdDevNodeBase<ElemType> Base; ComputationNodeBoilerplate; UsingPreComputedNodeMembers;
        static const std::wstring TypeName() { return L"InvStdDev"; }
    public:
        InvStdDevNode(DEVICEID_TYPE deviceId, const wstring & name) :
-            PreComputedNode<ElemType>(deviceId, name),
-            m_mean(deviceId), m_var(deviceId), m_temp(deviceId),
-            m_numSamples(0)
+            Base(deviceId, name),
+            m_mean(deviceId), m_var(deviceId), m_temp(deviceId)
        { }
 
-        virtual void LoadFromFile(File& fstream, size_t modelVersion) override
-        {
-            Base::LoadFromFile(fstream, modelVersion);
-            m_numSamples = 0;   // TODO: intended? not loading from file?
-        }
-
        virtual void /*PreComputedNode::*/MarkComputed(const bool hasComputed) override
        {
            Base::MarkComputed(hasComputed);
-            if (m_hasComputed && m_numSamples > 0)  //m_numSamples>0 means it's not called from model loading
+            if (!m_hasComputed)     // initialize
+            {
+                // reset accumulators
+                size_t inputDim = Inputs(0)->GetNumRows();
+                m_mean.Resize(inputDim, 1);
+                m_var.Resize(inputDim, 1);
+                m_mean.SetValue(0);
+                m_var.SetValue(0);
+                FunctionValues().SetValue(0);   // also set this, since leaving it unset may trip checks during debugging; avoids special-casing
+            }
+            else                    // finalize
            {
                ElemType sqrtFloor = 1e-10f;
-
-                m_var.InplaceTruncateBottom(sqrtFloor); //prevent too small variance (and negative square roots)
+                m_var.InplaceTruncateBottom(sqrtFloor); // prevent too-small variances (and negative square roots due to numeric inaccuracy)
#if 1//NANCHECK
                m_var.HasNan("MarkComputed-InplaceTruncateBottom");
#endif
@@ -402,63 +437,43 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                m_var.HasNan("MarkComputed-ElementInverse()");
#endif
                FunctionValues().SetValue(m_var);
-
-                m_numSamples = 0;
            }
        }
 
-        virtual void ComputeInputPartial(const size_t /*inputIndex*/) override
-        {
-            LogicError("InvStdDev operation should not be involved in the gradient calculation.");
-        }
-
        virtual void /*ComputationNodeNonLooping::*/EvaluateThisNodeNonLooping() override
        {
-            if (!m_hasComputed)
-            {
-                Matrix<ElemType> &samples = Inputs(0)->FunctionValues();
+            if (m_hasComputed)
+                return;     // not accumulating
+
+            if (!IsAccumulating())
+                LogicError("%ls %ls operation: MarkComputed(false) has not been called.", NodeName().c_str(), OperationName().c_str());
+
+            Matrix<ElemType> &samples = Inputs(0)->FunctionValues();
#if 1//NANCHECK
-                samples.HasNan("InvStdDev-Samples");
+            samples.HasNan("InvStdDev-Samples");
#endif
-                m_temp.SetValue(m_mean);
-                size_t numNewSample = samples.GetNumCols();
-                Matrix<ElemType>::MultiplyAndWeightedAdd(1.0f / (m_numSamples + numNewSample), samples, false,
-                                                         ConstOnes(numNewSample, 1, samples.GetDeviceId()),
-                                                         false, (ElemType)m_numSamples / (m_numSamples + numNewSample), m_mean);
+            m_temp.SetValue(m_mean);
+            size_t numNewSample = samples.GetNumCols();
+            Matrix<ElemType>::MultiplyAndWeightedAdd(1.0f / (m_numSamples + numNewSample), samples, false,
+                                                     ConstOnes(numNewSample, 1, samples.GetDeviceId()),
+                                                     false, (ElemType)m_numSamples / (m_numSamples + numNewSample), m_mean);
 
-                m_temp -= m_mean;
-                m_temp.AssignElementPowerOf(m_temp, 2);
-                m_var += m_temp;
+            m_temp -= m_mean;
+            m_temp.AssignElementPowerOf(m_temp, 2);
+            m_var += m_temp;
 
-                m_temp.AssignDifferenceOf(samples, m_mean);
-                m_temp.AssignElementPowerOf(m_temp, 2);
+            m_temp.AssignDifferenceOf(samples, m_mean);
+            m_temp.AssignElementPowerOf(m_temp, 2);
 
-                Matrix<ElemType>::MultiplyAndWeightedAdd(1.0f / (m_numSamples + numNewSample), m_temp, false,
-                                                         ConstOnes(numNewSample, 1, samples.GetDeviceId()),
-                                                         false, (ElemType)m_numSamples / (m_numSamples + numNewSample), m_var);
+            Matrix<ElemType>::MultiplyAndWeightedAdd(1.0f / (m_numSamples + numNewSample), m_temp, false,
+                                                     ConstOnes(numNewSample, 1, samples.GetDeviceId()),
+                                                     false, (ElemType)m_numSamples / (m_numSamples + numNewSample), m_var);
 
#if 1//NANCHECK
-                m_var.HasNan("InvStdDev-m_var");
+            m_var.HasNan("InvStdDev-m_var");
#endif
-                m_numSamples += samples.GetNumCols();
-            }
-        }
-
-        virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
-        {
-            Base::Validate(isFinalValidationPass);
-
-            if (!m_hasComputed)
-            {
-                size_t inputDim = Inputs(0)->GetNumRows();
-                m_mean.Resize(inputDim, 1);
-                m_var.Resize(inputDim, 1);
-                // reset accumulators
-                m_mean.SetValue(0);
-                m_var.SetValue(0);
-                fprintf(stderr, "InvStdDev: SetValue(0)\n");
-            }
+            m_numSamples += samples.GetNumCols();
        }
 
        virtual void MoveMatricesToDevice(const DEVICEID_TYPE deviceId) override
@@ -475,15 +490,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            if (flags & CopyNodeFlags::copyNodeValue)
            {
                auto node = dynamic_pointer_cast<InvStdDevNode<ElemType>>(nodeP);
-                node->m_numSamples = m_numSamples;
-
                node->m_mean = m_mean;
                node->m_var = m_var;
-                node-> m_temp = m_temp;
+                node->m_temp = m_temp;
            }
        }
 
    private:
-        size_t m_numSamples;
        Matrix<ElemType> m_mean;
        Matrix<ElemType> m_var;
        Matrix<ElemType> m_temp;
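The net effect of the refactoring above is a three-phase accumulation protocol: MarkComputed(false) resets the accumulator and enters the accumulation state, each EvaluateThisNodeNonLooping() call folds one minibatch into the running estimate, and MarkComputed(true) validates and freezes the result, with m_numSamples == SIZE_MAX serving as the "not accumulating" sentinel. A minimal standalone sketch of the same protocol and the incremental-mean update (Accumulator and its members are illustrative stand-ins, not CNTK types):

    // three-phase accumulation: MarkComputed(false) -> Evaluate()* -> MarkComputed(true)
    #include <cstdint>
    #include <stdexcept>
    #include <vector>

    class Accumulator
    {
        static constexpr size_t notAccumulating = SIZE_MAX; // sentinel, as in MeanInvStdDevNodeBase
        size_t m_numSamples = notAccumulating;
        double m_mean = 0;
    public:
        void MarkComputed(bool hasComputed)
        {
            if (!hasComputed)               // initialize: reset and enter accumulation state
                m_numSamples = 0, m_mean = 0;
            else if (m_numSamples == 0)     // finalize: must have seen data
                throw std::logic_error("no data accumulated during precomputation");
            else
                m_numSamples = notAccumulating;
        }
        void Evaluate(const std::vector<double> & minibatch)
        {
            if (m_numSamples == notAccumulating)
                throw std::logic_error("MarkComputed(false) has not been called");
            double sum = 0;
            for (double v : minibatch)
                sum += v;
            size_t k = minibatch.size();
            // same weighting as the MultiplyAndWeightedAdd() calls above:
            // newMean = oldMean * n/(n+k) + batchSum * 1/(n+k)
            m_mean = m_mean * m_numSamples / (m_numSamples + k) + sum / (m_numSamples + k);
            m_numSamples += k;
        }
        double Mean() const { return m_mean; }
    };

    int main()
    {
        Accumulator acc;
        acc.MarkComputed(false);
        acc.Evaluate({ 1, 2, 3 });
        acc.Evaluate({ 5 });
        acc.MarkComputed(true);
        return acc.Mean() == 2.75 ? 0 : 1;  // (1+2+3+5)/4
    }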
diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp
index 92985017b..efb6eea51 100644
--- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp
+++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp
@@ -430,7 +430,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        }
    }
 
-    // prepares the network for computation
+    bool ComputationNetwork::BuiltAndValidatedSubNetwork(const ComputationNodeBasePtr & rootNode)
+    {
+        return m_built.find(rootNode) != m_built.end();
+    }
+
+    // prepare to compute with the subnetwork that this rootNode depends on, including
+    //  - auto-detecting recurrent loops
+    //  - collecting input and learnable nodes
+    //  - calling Validate() on all nodes lazily, which sizes all matrices (column dimensions get updated to the MB size)
    // Done lazily, called for every minibatch's invocation of EvaluateNode(), but memoizing which nodes were done already.
    // BUGBUG? Lazy triggers on the root node. I.e. for two different root nodes (training, eval), it validates twice.
    void ComputationNetwork::BuildAndValidateSubNetwork(const ComputationNodeBasePtr rootNode)
@@ -558,53 +566,53 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    }
#endif
 
-    template<class N> void ComputationNetwork::GetNodesRequiringX(std::list<ComputationNodeBasePtr> & nodesRequirePreComputation, const ComputationNodeBasePtr rootNode, bool checkComputed)
+    template<class N> void ComputationNetwork::GetNodesRequiringX(std::list<ComputationNodeBasePtr> & nodesRequiringX, const ComputationNodeBasePtr rootNode, bool checkComputed)
    {
-        if (rootNode == nullptr)    // find nodes from all available nodes
+        if (!rootNode)              // find nodes from all available nodes
        {
-            for (auto nodeIter = m_nameToNodeMap.begin(); nodeIter != m_nameToNodeMap.end(); nodeIter++)
+            for (const auto & nodep : m_nameToNodeMap)
            {
-                ComputationNodeBasePtr node = nodeIter->second;
-                if (node->RequiresPreCompute())     // TODO: why not check directly for the type with a dynamic_cast?
+                auto node = dynamic_pointer_cast<N>(nodep.second);
+                if (node)
                {
-                    auto preComputedNode = static_pointer_cast<N>(node);
-                    if (!checkComputed || !preComputedNode->HasComputed())
-                        nodesRequirePreComputation.push_back(node);
+                    assert(node->RequiresPreCompute());
+                    if (!checkComputed || !node->HasComputed())
+                        nodesRequiringX.push_back(node);
                }
            }
        }
        else                        // or for calculating a specific node
        {
-            const auto & nodes = GetEvalOrder(rootNode, false);
-            for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++)
+            for (const auto & nodei : GetEvalOrder(rootNode, false))
            {
-                ComputationNodeBasePtr node = *nodeIter;
-                if (node->RequiresPreCompute())     // TODO: why not check directly for the type with a dynamic_cast?
+                auto node = dynamic_pointer_cast<N>(nodei);
+                if (node)
                {
-                    auto preComputedNode = static_pointer_cast<N>(node);
-                    if (!checkComputed || !preComputedNode->HasComputed())
-                        nodesRequirePreComputation.push_back(node);
+                    assert(node->RequiresPreCompute());
+                    if (!checkComputed || !node->HasComputed())
+                        nodesRequiringX.push_back(node);
                }
            }
        }
+        nodesRequiringX.unique();
    }
 
    // return the list of nodes that require precomputation and have not been precomputed yet
    std::list<ComputationNodeBasePtr> ComputationNetwork::GetNodesRequiringPreComputation(const ComputationNodeBasePtr rootNode, bool checkComputed)
    {
-        std::list<ComputationNodeBasePtr> nodesRequirePreComputation;
-        GetNodesRequiringX<PreComputedNode<float>>(nodesRequirePreComputation, rootNode, checkComputed);
-        GetNodesRequiringX<PreComputedNode<double>>(nodesRequirePreComputation, rootNode, checkComputed);
-        return nodesRequirePreComputation;
+        std::list<ComputationNodeBasePtr> nodesRequiringX;
+        GetNodesRequiringX<PreComputedNode<float>>(nodesRequiringX, rootNode, checkComputed);
+        GetNodesRequiringX<PreComputedNode<double>>(nodesRequiringX, rootNode, checkComputed);
+        return nodesRequiringX;
    }
 
    // return the list of nodes that require batch mode and have not been precomputed yet
    std::list<ComputationNodeBasePtr> ComputationNetwork::GetNodesRequiringBatchMode(const ComputationNodeBasePtr rootNode, bool checkComputed)
    {
-        std::list<ComputationNodeBasePtr> nodesRequirePreComputation;
-        GetNodesRequiringX<BatchModeNode<float>>(nodesRequirePreComputation, rootNode, checkComputed);
-        GetNodesRequiringX<BatchModeNode<double>>(nodesRequirePreComputation, rootNode, checkComputed);
-        return nodesRequirePreComputation;
+        std::list<ComputationNodeBasePtr> nodesRequiringX;
+        GetNodesRequiringX<BatchModeNode<float>>(nodesRequiringX, rootNode, checkComputed);
+        GetNodesRequiringX<BatchModeNode<double>>(nodesRequiringX, rootNode, checkComputed);
+        return nodesRequiringX;
    }
 
    // The methods below determine evaluation order, which is tricky in presence of recurrent loops.
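The rewritten GetNodesRequiringX() replaces the RequiresPreCompute() flag test plus static_pointer_cast with a single dynamic_pointer_cast, which filters and downcasts in one step. A self-contained sketch of that pattern (NodeBase, PreComputeLike, and FilterNodes are illustrative names, not the CNTK classes):

    #include <list>
    #include <memory>

    struct NodeBase { virtual ~NodeBase() { } };
    struct PreComputeLike : NodeBase
    {
        bool m_hasComputed = false;
        bool HasComputed() const { return m_hasComputed; }
    };

    // collect all nodes of dynamic type N, optionally skipping already-computed ones
    template <class N>
    std::list<std::shared_ptr<NodeBase>> FilterNodes(const std::list<std::shared_ptr<NodeBase>> & all, bool checkComputed)
    {
        std::list<std::shared_ptr<NodeBase>> result;
        for (const auto & p : all)
        {
            auto node = std::dynamic_pointer_cast<N>(p);    // nullptr if p is not an N
            if (node && (!checkComputed || !node->HasComputed()))
                result.push_back(node);
        }
        result.unique();
        return result;
    }

    int main()
    {
        std::list<std::shared_ptr<NodeBase>> all = { std::make_shared<PreComputeLike>(), std::make_shared<NodeBase>() };
        return (int)FilterNodes<PreComputeLike>(all, /*checkComputed=*/true).size() - 1;    // expect exactly one match
    }

One caveat worth knowing: std::list::unique() removes only adjacent equal elements, so the added nodesRequiringX.unique() call deduplicates fully only when repeated nodes end up next to each other in the list.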
diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h
index 5bb60da78..0cca9c821 100644
--- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h
+++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h
@@ -565,13 +565,12 @@ public:
    //  - these must be executed frame by frame rather than as a map
    //  - such a loop is treated as if it were a little nested network; this is done inside here
    //  - these little nested networks are defined in m_recurrentInfo[]
-    void Evaluate(const ComputationNodeBasePtr rootNode)
+    void Evaluate(const ComputationNodeBasePtr & rootNode)
    {
-        // prepare to compute with the subnetwork that this rootNode depends on, including
-        //  - auto-detecting recurrent loops
-        //  - collect input and learnable nodes
-        //  - calling Validate() on all nodes lazily, which sizes all matrices (column dimensions get updated to MB size)
-        BuildAndValidateSubNetwork(rootNode);
+        // The caller must call BuildAndValidateSubNetwork() before this.
+        // TODO: Some call sites are hard to fix, e.g. the encoder-decoder best-path functions. Those may be broken; this message will tell you.
+        if (!BuiltAndValidatedSubNetwork(rootNode))
+            LogicError("Evaluate for node %ls %ls: BuildAndValidateSubNetwork() has not been called on this node.", rootNode->NodeName().c_str(), rootNode->OperationName().c_str());
 
        // determines order of evaluation, such that children get evaluated before their parent nodes
        std::list<ComputationNodeBasePtr>& allNodes = GetEvalOrder(rootNode, false);
@@ -692,6 +691,12 @@ public:
            }
        }
    }
+    template<class NODESET>
+    void Evaluate(const NODESET & nodes)
+    {
+        for (auto & node : nodes)
+            Evaluate(node);
+    }
 
    // propagate the features' MB size to all nodes of the network
    // TODO: This function should go. Resizing is now part of Validate() and EvaluateThisNode().
@@ -1233,6 +1238,24 @@ private:
public:
    // prepares the network for computation
    void BuildAndValidateSubNetwork(const ComputationNodeBasePtr rootNode);
+    // and for a set of nodes
+    void StartEvaluateMinibatchLoop(const ComputationNodeBasePtr & rootNode)    // (ugly name; meant to be unique so we can rename it if needed)
+    {
+        BuildAndValidateSubNetwork(rootNode);
+    }
+    template<class NODESET>
+    void StartEvaluateMinibatchLoop(const NODESET & nodes)                      // (ugly name; meant to be unique so we can rename it if needed)
+    {
+        for (auto & node : nodes)
+            StartEvaluateMinibatchLoop(node);
+    }
+    template<class NODESET>
+    void StartEvaluateMinibatchLoop(const NODESET & nodes1, const NODESET & nodes2)     // often needed for two sets (training & evaluation criteria)
+    {
+        StartEvaluateMinibatchLoop(nodes1);
+        StartEvaluateMinibatchLoop(nodes2);
+    }
+    bool BuiltAndValidatedSubNetwork(const ComputationNodeBasePtr & rootNode);
 
    //this function will need to be called before actual validation and execution to
    //predetermine how to share matrices to reduce memory usage.
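The new NODESET overloads are duck-typed: Evaluate(const NODESET &) and StartEvaluateMinibatchLoop(const NODESET &) only require that the container support range-for, so both std::vector and std::list of node pointers work unchanged. A small self-contained illustration of the dispatch (std::string stands in for a node pointer):

    #include <cstdio>
    #include <list>
    #include <string>
    #include <vector>

    void Evaluate(const std::string & node) { printf("eval %s\n", node.c_str()); }

    template <class NODESET>
    void Evaluate(const NODESET & nodes)    // range-for only needs begin()/end()
    {
        for (auto & node : nodes)
            Evaluate(node);                 // non-template overload wins for a single node
    }

    int main()
    {
        std::vector<std::string> evalNodes = { "errs", "ce" };
        std::list<std::string>   criterionNodes = { "ce" };
        Evaluate(evalNodes);                // one template serves both container types
        Evaluate(criterionNodes);
    }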
diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h
index 265baea67..885cb5b7b 100644
--- a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h
+++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h
@@ -503,9 +503,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
        virtual void /*IComputationNode::*/OnEvaluateBeginIteration()  // called before first iteration step of EvaluateThisNode()
        {
-            fprintf(stderr, "Trace: %ls %ls operation\n", NodeName().c_str(), OperationName().c_str());
+            fprintf(stderr, "OnEvaluateBeginIteration: %ls %ls operation\n", NodeName().c_str(), OperationName().c_str());
+        }
+        virtual void /*IComputationNode::*/OnEvaluateEndIteration()    // called after last iteration step of EvaluateThisNode()
+        {
+            fprintf(stderr, "OnEvaluateEndIteration: %ls %ls operation\n", NodeName().c_str(), OperationName().c_str());
        }
-        virtual void /*IComputationNode::*/OnEvaluateEndIteration() { }    // called after last iteration step of EvaluateThisNode()
 
    protected:
@@ -823,6 +826,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    template<class ElemType>
    class ComputationNode : public ComputationNodeBase  // abstract class that cannot be instantiated
    {
+        typedef ComputationNodeBase Base;
    protected:
        //std containers such as list and map do not support class references, so we need to use pointers
        typedef shared_ptr<ComputationNode<ElemType>> ComputationNodePtr;
@@ -1249,6 +1253,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#ifdef _DEBUG
        virtual void /*IComputationNode::*/OnEvaluateEndIteration()    // called after last iteration step of EvaluateThisNode()
        {
+            Base::OnEvaluateEndIteration();
            MaskMissingValuesColumnsToZero();
            if (m_functionValues.HasNan("OnEvaluateEndIteration"))
                LogicError("%ls %ls operation unexpectedly produced NaN values.", NodeName().c_str(), OperationName().c_str());
@@ -1487,11 +1492,13 @@ public: \
    using Base::SaveToFile; using Base::UpdateFunctionAndGradientMBSize; using Base::SetInput; \
    using Base::Validate; using Base::ValidateUnaryMap; using Base::ValidateBinaryZip; using Base::ValidateUnaryReduce; using Base::ValidateBinaryReduce; using Base::ValidateInferBinaryChildren; using Base::ValidateInferInputSize
 
-#define UsingComputationNodeMembersBoilerplate \
+#define ComputationNodeBoilerplate \
protected:    /* some boilerplate goes here */ \
    virtual const std::wstring OperationName() const override { return TypeName(); } \
-    virtual ComputationNodeBase * NewThis(DEVICEID_TYPE deviceId, const wstring & name) override { return new typename std::remove_reference<decltype(*this)>::type(deviceId, name); } \
-    UsingComputationNodeMembers
+    virtual ComputationNodeBase * NewThis(DEVICEID_TYPE deviceId, const wstring & name) override { return new typename std::remove_reference<decltype(*this)>::type(deviceId, name); }
+
+#define UsingComputationNodeMembersBoilerplate \
+    ComputationNodeBoilerplate; UsingComputationNodeMembers
 
#pragma endregion base computation class
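Because the base-class OnEvaluateEndIteration() now emits a trace, an override that only does its own work would silently swallow that trace; the added Base::OnEvaluateEndIteration() call in the _DEBUG override restores the chain. A minimal illustration of the pattern (names are placeholders):

    #include <cstdio>

    struct NodeBase
    {
        virtual ~NodeBase() { }
        virtual void OnEvaluateEndIteration() { printf("trace: end of iteration\n"); }
    };

    struct DebugNode : NodeBase
    {
        virtual void OnEvaluateEndIteration() override
        {
            NodeBase::OnEvaluateEndIteration();         // keep the base-class tracing
            printf("debug: NaN/masking checks run here\n");
        }
    };

    int main()
    {
        DebugNode n;
        n.OnEvaluateEndIteration();     // prints both lines
    }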
diff --git a/MachineLearning/CNTKEval/CNTKEvalTest/CNTKEvalTest.cpp b/MachineLearning/CNTKEval/CNTKEvalTest/CNTKEvalTest.cpp
index f7983cf30..9f236e11c 100644
--- a/MachineLearning/CNTKEval/CNTKEvalTest/CNTKEvalTest.cpp
+++ b/MachineLearning/CNTKEval/CNTKEvalTest/CNTKEvalTest.cpp
@@ -49,6 +49,7 @@ void DoCommand(const ConfigParameters& configRoot)
    DataReader<ElemType>* dataReader = new DataReader<ElemType>(readerConfig);
    eval.LoadModel(modelPath);
    dataReader->StartMinibatchLoop(mbSize, 0, epochSize);
+    eval.StartEvaluateMinibatchLoop(outputName);
    while (dataReader->GetMinibatch(inputMatrices))
    {
        void* data = (void*)arr->data();
diff --git a/MachineLearning/CNTKSGDLib/SGD.cpp b/MachineLearning/CNTKSGDLib/SGD.cpp
index b9aa6b3d5..9964f2a89 100644
--- a/MachineLearning/CNTKSGDLib/SGD.cpp
+++ b/MachineLearning/CNTKSGDLib/SGD.cpp
@@ -1145,7 +1145,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                             std::vector<ComputationNodeBasePtr> & labelNodes,
                             std::map<std::wstring, Matrix<ElemType>*>* inputMatrices)
    {
-        std::list<ComputationNodeBasePtr> nodes = net.GetNodesRequiringPreComputation();
+        std::list<ComputationNodeBasePtr> nodes = net.GetNodesRequiringPreComputation();    // this tests all HasComputed() flags
 
        if (nodes.size() == 0)
        {
@@ -1169,39 +1169,28 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0);
        else    // using only one epoch
            trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0, m_epochSize);
-#if 1
-        size_t actualMBSize;
-        while (DataReaderHelpers::GetMinibatchIntoNetwork(*trainSetDataReader, net, nullptr, false, false, *inputMatrices, actualMBSize))
+        net.StartEvaluateMinibatchLoop(nodes);
+
+        // initialize
+        for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++)
+        {
+            auto node = static_pointer_cast<PreComputedNode<ElemType>>(*nodeIter);
+            node->MarkComputed(false/*begin accumulating*/);
+        }
+
+        size_t actualMBSizeDummy;
+        while (DataReaderHelpers::GetMinibatchIntoNetwork(*trainSetDataReader, net, nullptr, false, false, *inputMatrices, actualMBSizeDummy))
        {
            // TODO: move these into GetMinibatchIntoNetwork() --but those are passed around; necessary? Can't we get them from 'net'?
            ComputationNetwork::UpdateEvalTimeStamps(featureNodes);
            ComputationNetwork::UpdateEvalTimeStamps(labelNodes);
 
-            for (auto & node : nodes)   // this loops over all pertinent PreComputeNodes
-                net.Evaluate(node);
+            net.Evaluate(nodes);
        }
-#else
-        while (trainSetDataReader->GetMinibatch(*inputMatrices))
-        {
-            // TODO: use GetMinibatchIntoNetwork(), should be easy
-            ComputationNetwork::UpdateEvalTimeStamps(featureNodes);
-            ComputationNetwork::UpdateEvalTimeStamps(labelNodes);
-
-            net.SetActualMiniBatchSizeFromFeatures();
-            trainSetDataReader->CopyMBLayoutTo(net.GetMBLayoutPtr());
-            net.VerifyActualNumParallelSequences(trainSetDataReader->GetNumParallelSequences());
-
-            // TODO: Exactly this loop should be INSIDE ComputationNetwork--pass the nodes array instead!
-            for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++)
-                net.Evaluate(*nodeIter);
-        }
-#endif
-
-        // mark done
+        // finalize
        for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++)
        {
            auto node = static_pointer_cast<PreComputedNode<ElemType>>(*nodeIter);
-            node->MarkComputed(true);
+            node->MarkComputed(true/*done accumulating*/);
        }
 
        return true;
@@ -1714,13 +1703,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
        int numMBsRun = 0;
 
-        size_t numEvalNodes = epochEvalErrors.size();
-
        // NOTE: the following two local matrices are not used in distGradAgg path
        // assume only one training criterion node for each epoch.
        // The criterion values are accumulated here over the minibatches (without having to pull them off the GPU).
        Matrix<ElemType> localEpochCriterion(1, 1, net.GetDeviceId());
-        Matrix<ElemType> localEpochEvalErrors(1, numEvalNodes, net.GetDeviceId());
+        Matrix<ElemType> localEpochEvalErrors(1, epochEvalErrors.size(), net.GetDeviceId());
 
        localEpochCriterion.SetValue(0);
        localEpochEvalErrors.SetValue(0);
@@ -1740,7 +1727,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        if (useGradientAggregation)
        {
            epochCriterion = double(0.0);
-            epochEvalErrors.assign(numEvalNodes, double(0.0));
+            epochEvalErrors.assign(epochEvalErrors.size(), double(0.0));
        }
 
        Profiler profiler(m_numMBsToCUDAProfile);
@@ -1752,13 +1739,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            m_enableDistributedMBReading && trainSetDataReader->SupportsDistributedMBRead();
        if (useDistributedMBReading)
-        {
            trainSetDataReader->StartDistributedMinibatchLoop(tunedMBSize, epochNumber, g_mpi->CurrentNodeRank(), g_mpi->NumNodesInUse(), m_epochSize);
-        }
        else
-        {
            trainSetDataReader->StartMinibatchLoop(tunedMBSize, epochNumber, m_epochSize);
-        }
+
+        net.StartEvaluateMinibatchLoop(evaluationNodes);
+        net.StartEvaluateMinibatchLoop(criterionNodes);
+        if (m_needAdaptRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode)
+            refNet.StartEvaluateMinibatchLoop(refNode);
+        // TODO: what is this??
 
        AttemptUtteranceDerivativeFeatures(net, trainSetDataReader, featureNodes, inputMatrices);
@@ -1829,10 +1816,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
                //compute eval node first since when gradient is computed the forward function values
                //may be changed and need to be recomputed when gradient and function value share the same matrix
-                for (size_t i = 0; i < numEvalNodes; i++)
-                {
-                    net.Evaluate(evaluationNodes[i]);
-                }
+                net.Evaluate(evaluationNodes);
 
                // only compute gradient when learning rate is large enough
                if (learnRatePerSample > m_minLearnRate * 0.01)
@@ -1872,7 +1856,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                        // criteria are in FunctionValues()(0,0), we accumulate into another 1x1 Matrix (to avoid having to pull the values off the GPU)
                        Matrix<ElemType>::AddElementToElement(dynamic_pointer_cast<ComputationNode<ElemType>>(criterionNodes[0])->FunctionValues(), 0, 0, localEpochCriterion, 0, 0);
 
-                        for (size_t i = 0; i < numEvalNodes; i++)
+                        for (size_t i = 0; i < evaluationNodes.size(); i++)
                        {
                            Matrix<ElemType>::AddElementToElement(dynamic_pointer_cast<ComputationNode<ElemType>>(evaluationNodes[i])->FunctionValues(), 0, 0, localEpochEvalErrors, 0, i);
                        }
                    }
                    else
                    {
                        //distributed gradient aggregation
-                        LazyInitDistGradAgg(learnableNodes, numEvalNodes, m_traceLevel);
+                        LazyInitDistGradAgg(learnableNodes, evaluationNodes.size(), m_traceLevel);
 
                        //prepare the header
-                        m_gradHeader->numEvalNode = numEvalNodes;
+                        m_gradHeader->numEvalNode = evaluationNodes.size();
                        m_gradHeader->numSamples = actualMBSize;
                        m_gradHeader->numSamplesWithLabel = numSamplesWithLabel;
                        m_gradHeader->criterion = actualMBSize > 0 ? criterionNodes[0]->Get00Element() : 0.0;
-                        for (size_t i = 0; i < numEvalNodes; i++)
+                        for (size_t i = 0; i < evaluationNodes.size(); i++)
                            m_gradHeader->evalErrors[i] = actualMBSize > 0 ? evaluationNodes[i]->Get00Element() : 0.0;
 
                        m_distGradAgg->AggregateGradients(m_gradHeader, epochNumber);
@@ -1897,7 +1881,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                        aggregateNumSamples = m_gradHeader->numSamples;
                        aggregateNumSamplesWithLabel = m_gradHeader->numSamplesWithLabel;
                        epochCriterion += m_gradHeader->criterion;
-                        for (size_t i = 0; i < numEvalNodes; i++)
+                        for (size_t i = 0; i < epochEvalErrors.size(); i++)
                            epochEvalErrors[i] += m_gradHeader->evalErrors[i];
                    }
@@ -1963,7 +1947,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                {
                    timer.Restart();
                    epochCriterion = localEpochCriterion.Get00Element();
-                    for (size_t i = 0; i < numEvalNodes; i++)
+                    for (size_t i = 0; i < epochEvalErrors.size(); i++)
                        epochEvalErrors[i] = localEpochEvalErrors(0, i);
                    timer.Stop();
@@ -1991,7 +1975,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                        m_maxComputedEpochSize = numMBsRun * numSamplesLastMBs / m_numMBsToShowResult;
                    }
 
-                    for (size_t i = 0; i < numEvalNodes; i++)
+                    for (size_t i = 0; i < epochEvalErrors.size(); i++)
                    {
                        double evalError = (epochEvalErrors[i] - epochEvalErrorsLastMBs[i]) / numSamplesLastMBs;
                        string formatString = "EvalErr[%lu]PerSample = " + GeneratePaddedFloatOrExpFormat(0, 8, evalError) + "; ";
@@ -2012,7 +1996,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                    numSamplesLastMBs = 0;
 
                    epochCriterionLastMBs = epochCriterion;
-                    for (size_t i = 0; i < numEvalNodes; i++)
+                    for (size_t i = 0; i < epochEvalErrorsLastMBs.size(); i++)
                        epochEvalErrorsLastMBs[i] = epochEvalErrors[i];
 
                if (std::isnan(epochCriterion))
@@ -2057,7 +2041,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        {
            // with parallelization, we have them in regular variables
            epochCriterion /= float(totalEpochSamples);
-            for (size_t i = 0; i < numEvalNodes; i++)
+            for (size_t i = 0; i < epochEvalErrors.size(); i++)
                epochEvalErrors[i] /= totalEpochSamples;
        }
        else
@@ -2067,7 +2051,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            localEpochEvalErrors /= float(totalEpochSamples);
 
            epochCriterion = localEpochCriterion.Get00Element();
-            for (size_t i = 0; i < numEvalNodes; i++)
+            for (size_t i = 0; i < epochEvalErrors.size(); i++)
                epochEvalErrors[i] = localEpochEvalErrors(0, i);
        }
@@ -2495,6 +2479,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
#define EPSILON 1e-5
 
+    // this probes the automatic gradient computation with random inputs
    template<class ElemType>
    bool SGD<ElemType>::GradientCheck(ComputationNetwork& net,
                                      const std::vector<ComputationNodeBasePtr> & criterionNodes,
@@ -2503,6 +2488,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    {
        vector<string> errMsgs;
 
+        net.StartEvaluateMinibatchLoop(criterionNodes[npos]);
+
        // gradient checking
        for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++)
        {
@@ -2524,7 +2511,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
            node->UpdateEvalTimeStamp();
 
-            // use only the first criterion.
            net.ComputeGradient<ElemType>(criterionNodes[npos]);
 
            if (node->GradientValues().GetMatrixType() == MatrixType::SPARSE)
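The GradientCheck() routine touched above compares the gradient that ComputeGradient() produces against a numeric estimate; the standard central-difference form is grad ≈ (f(x+eps) - f(x-eps)) / (2·eps), with eps on the order of the EPSILON constant above. A toy, self-contained version of that idea (not the CNTK implementation, which perturbs the network's learnable parameters):

    #include <cmath>
    #include <cstdio>

    int main()
    {
        const double eps = 1e-5;                        // same order as EPSILON above
        auto f = [](double x) { return x * x * x; };    // toy criterion
        double x = 0.7;
        double analytic = 3 * x * x;                    // hand-derived df/dx
        double numeric = (f(x + eps) - f(x - eps)) / (2 * eps);
        double relErr = std::fabs(analytic - numeric) / std::fabs(analytic);
        printf("analytic=%g numeric=%g relErr=%g\n", analytic, numeric, relErr);
        return relErr > 1e-7;                           // nonzero exit if they disagree
    }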
diff --git a/MachineLearning/CNTKSGDLib/SimpleEvaluator.h b/MachineLearning/CNTKSGDLib/SimpleEvaluator.h
index 666c7dbe7..142e0a327 100644
--- a/MachineLearning/CNTKSGDLib/SimpleEvaluator.h
+++ b/MachineLearning/CNTKSGDLib/SimpleEvaluator.h
@@ -122,6 +122,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            evalResultsLastMBs.push_back((ElemType)0);
 
        dataReader->StartMinibatchLoop(mbSize, 0, testSize);
+        m_net.StartEvaluateMinibatchLoop(evalNodes);
 
        while (DataReaderHelpers::GetMinibatchIntoNetwork(*dataReader, m_net, nullptr, false, false, inputMatrices, actualMBSize))
        {
@@ -191,6 +192,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        }
 
        //returns error rate
+        // TODO: What does this function do?
        double EvaluateUnroll(IDataReader<ElemType>* dataReader, const size_t mbSize, double &evalSetCrossEntropy, const wchar_t* output = nullptr, const size_t testSize = requestDataSize)
        {
            std::vector<ComputationNodeBasePtr> & featureNodes = m_net.FeatureNodes();
@@ -211,6 +213,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            inputMatrices[L"numberobs"] = new Matrix<ElemType>(1, 1, m_net.GetDeviceId());
 
            dataReader->StartMinibatchLoop(mbSize, 0, testSize);
+            m_net.StartEvaluateMinibatchLoop(criterionNodes, evaluationNodes);
 
            double epochEvalError = 0;
            double epochCrossEntropy = 0;
@@ -415,9 +418,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            double evalResultsLastMBs = (double)0;
 
            for (auto ptr = dataReaders.begin(); ptr != dataReaders.end(); ptr++)
-            {
                (*ptr)->StartMinibatchLoop(mbSize, 0, testSize);
-            }
+            // BUGBUG: Code below will fail because we now must call StartEvaluateMinibatchLoop(), but I can't tell from the code below which nodes to call it for.
+            //for (auto & ptr : nets)
+            //    ptr->StartEvaluateMinibatchLoop(xxx);
 
            bool bContinueDecoding = true;
            while (bContinueDecoding)
@@ -743,7 +747,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            }
        }
 
-        //return true if precomputation is executed.
+        // (only called by FindBestPath...())
        void ResetPreCompute()
        {
            //mark false
@@ -767,6 +771,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
            ComputationNetwork::UpdateEvalTimeStamps(featureNodes);
 
+            net.StartEvaluateMinibatchLoop(batchComputeNodes);  // TODO: Is this correct? There is no StartMinibatchLoop() for a reader.
+
            net.SetActualMiniBatchSizeFromFeatures();
            for (auto nodeIter = batchComputeNodes.begin(); nodeIter != batchComputeNodes.end(); nodeIter++)
                net.Evaluate(*nodeIter);
diff --git a/MachineLearning/CNTKSGDLib/SimpleOutputWriter.h b/MachineLearning/CNTKSGDLib/SimpleOutputWriter.h
index 8bdb18041..c60e1ef2b 100644
--- a/MachineLearning/CNTKSGDLib/SimpleOutputWriter.h
+++ b/MachineLearning/CNTKSGDLib/SimpleOutputWriter.h
@@ -61,10 +61,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        //Matrix<ElemType> endOfFile = Matrix<ElemType>((size_t)1, (size_t)1);
        //endOfFile(0,0) = 0;
 
-        //evaluate with minibatches
+        // evaluate with minibatches
        dataReader.StartMinibatchLoop(mbSize, 0, numOutputSamples);
        dataReader.SetNumParallelSequences(1);
 
+        m_net.StartEvaluateMinibatchLoop(outputNodes);
+
        size_t totalEpochSamples = 0;
        std::map<std::wstring, void*, nocase_compare> outputMatrices;
 
@@ -107,7 +109,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
            //clean up
        }
-
        void WriteOutput(IDataReader<ElemType>& dataReader, size_t mbSize, std::wstring outputPath, const std::vector<std::wstring>& outputNodeNames, size_t numOutputSamples = requestDataSize)
        {
@@ -142,10 +143,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            std::map<std::wstring, Matrix<ElemType>*> inputMatrices;
            for (size_t i = 0; i < featureNodes.size(); i++)
                inputMatrices[featureNodes[i]->NodeName()] = &dynamic_pointer_cast<ComputationNode<ElemType>>(featureNodes[i])->FunctionValues();
-
-            //evaluate with minibatches
+
+            // evaluate with minibatches
            dataReader.StartMinibatchLoop(mbSize, 0, numOutputSamples);
 
+            m_net.StartEvaluateMinibatchLoop(outputNodes);
+
            size_t totalEpochSamples = 0;
            size_t numMBsRun = 0;
            size_t tempArraySize = 0;
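Taken together, every evaluation path in this patch now follows the same contract: start the reader's minibatch loop, call StartEvaluateMinibatchLoop() once for the root nodes, then call Evaluate() per minibatch. A runnable toy driver of that contract (Net and its m_built set are illustrative stand-ins for ComputationNetwork and its built/validated bookkeeping):

    #include <cstdio>
    #include <set>
    #include <stdexcept>
    #include <string>

    struct Net
    {
        std::set<std::string> m_built;      // roots already built/validated
        void StartEvaluateMinibatchLoop(const std::string & root) { m_built.insert(root); }
        void Evaluate(const std::string & root)
        {
            if (m_built.find(root) == m_built.end())    // mirrors BuiltAndValidatedSubNetwork()
                throw std::logic_error("Evaluate: StartEvaluateMinibatchLoop() not called for " + root);
            printf("evaluating %s\n", root.c_str());
        }
    };

    int main()
    {
        Net net;
        net.StartEvaluateMinibatchLoop("crossEntropy");     // once, before the loop
        for (int mb = 0; mb < 3; mb++)                      // stands in for the reader loop
            net.Evaluate("crossEntropy");                   // legal on every minibatch
    }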