Merge branch 'master' into pkranenBoost

.
This commit is contained in:
pkranen 2015-11-05 13:01:56 +01:00
Родитель 75e9a89834 f8659f531e
Коммит b964be5ac2
14 изменённых файлов: 32048 добавлений и 7660 удалений

Просмотреть файл

@ -354,10 +354,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
for (wstring & path : filelist)
{
#ifdef WIN32 // sorry for windows users, we have to pay some cost here
std::replace(path.begin(), path.end(), L'\\', L'/');
if (path.find_first_of(L'=') != wstring::npos)
{
vector<wstring> strarr = msra::strfun::split(path, L"=");
#ifdef WIN32
replace(strarr[1].begin(), strarr[1].end(), L'\\', L'/');
#endif
path = rootpath + L"/" + path;
path = strarr[0] + L"=" + rootpath + L"/" + strarr[1];
}
else
{
#ifdef WIN32
replace(path.begin(), path.end(), L'\\', L'/');
#endif
path = rootpath + L"/" + path;
}
}
}
}
@ -998,6 +1010,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
m_pMBLayout->SetAsNoInput(i, m_numValidFrames[i], m_mbNumTimeSteps);
}
// TODO: Also blast the gaps in the features and labels matrices with NaNs to prevent them from being read
}
typename std::map<std::wstring, Matrix<ElemType>*>::iterator iter;
@ -1180,54 +1194,68 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
m_processedFrame[i] += (endFr-startFr);
m_switchFrame[i] = actualmbsize[i];
if (actualmbsize[i] < m_mbNumTimeSteps)
m_pMBLayout->Set(i, actualmbsize[i], MinibatchPackingFlags::SequenceStart); // NOTE: this ORs, while original code overwrote in matrix but ORed into vector
if (actualmbsize[i] == m_mbNumTimeSteps)
if (actualmbsize[i] != 0)
m_pMBLayout->Set(i, actualmbsize[i] - 1, MinibatchPackingFlags::SequenceEnd); // NOTE: this ORs, while original code overwrote in matrix but ORed into vector
startFr = m_switchFrame[i];
endFr = m_mbNumTimeSteps;
bool reNewSucc = ReNewBufferForMultiIO(i);
for (iter = matrices.begin();iter!=matrices.end(); iter++)
{
// dereference matrix that corresponds to key (input/output name) and
// populate based on whether its a feature or a label
//Matrix<ElemType>& data = *matrices[iter->first]; // can be features or labels
if (m_nameToTypeMap[iter->first] == InputOutputTypes::real)
// TODO: We should fill in a loop until we fill the minibatch for the case where just one ReNew is not sufficient
// to fill up the remaining slots in the minibatch
bool reNewSucc = ReNewBufferForMultiIO(i);
if (actualmbsize[i] < m_mbNumTimeSteps)
{
if (reNewSucc)
{
id = m_featureNameToIdMap[iter->first];
dim = m_featureNameToDimMap[iter->first];
if (sizeof(ElemType) == sizeof(float))
m_pMBLayout->Set(i, actualmbsize[i], MinibatchPackingFlags::SequenceStart); // NOTE: this ORs, while original code overwrote in matrix but ORed into vector
startFr = m_switchFrame[i];
endFr = m_mbNumTimeSteps;
for (iter = matrices.begin(); iter != matrices.end(); iter++)
{
for (size_t j = startFr,k = 0; j < endFr; j++,k++) // column major, so iterate columns
// dereference matrix that corresponds to key (input/output name) and
// populate based on whether its a feature or a label
//Matrix<ElemType>& data = *matrices[iter->first]; // can be features or labels
if (m_nameToTypeMap[iter->first] == InputOutputTypes::real)
{
// copy over the entire column at once, need to do this because SSEMatrix may have gaps at the end of the columns
memcpy_s(&m_featuresBufferMultiIO[id].get()[(j * m_numSeqsPerMB + i) * dim], sizeof(ElemType) * dim, &m_featuresBufferMultiUtt[i].get()[k * dim + m_featuresStartIndexMultiUtt[id + i * numOfFea]], sizeof(ElemType) * dim);
}
}
else
{
for (size_t j=startFr,k=0; j < endFr; j++,k++) // column major, so iterate columns in outside loop
{
for (int d = 0; d < dim; d++)
m_featuresBufferMultiIO[id].get()[(j * m_numSeqsPerMB + i) * dim + d] = m_featuresBufferMultiUtt[i].get()[k * dim + d + m_featuresStartIndexMultiUtt[id + i * numOfFea]];
id = m_featureNameToIdMap[iter->first];
dim = m_featureNameToDimMap[iter->first];
if (sizeof(ElemType) == sizeof(float))
{
for (size_t j = startFr, k = 0; j < endFr; j++, k++) // column major, so iterate columns
{
// copy over the entire column at once, need to do this because SSEMatrix may have gaps at the end of the columns
memcpy_s(&m_featuresBufferMultiIO[id].get()[(j * m_numSeqsPerMB + i) * dim], sizeof(ElemType) * dim, &m_featuresBufferMultiUtt[i].get()[k * dim + m_featuresStartIndexMultiUtt[id + i * numOfFea]], sizeof(ElemType) * dim);
}
}
else
{
for (size_t j = startFr, k = 0; j < endFr; j++, k++) // column major, so iterate columns in outside loop
{
for (int d = 0; d < dim; d++)
m_featuresBufferMultiIO[id].get()[(j * m_numSeqsPerMB + i) * dim + d] = m_featuresBufferMultiUtt[i].get()[k * dim + d + m_featuresStartIndexMultiUtt[id + i * numOfFea]];
}
}
}
else if (m_nameToTypeMap[iter->first] == InputOutputTypes::category)
{
id = m_labelNameToIdMap[iter->first];
dim = m_labelNameToDimMap[iter->first];
for (size_t j = startFr, k = 0; j < endFr; j++, k++)
{
for (int d = 0; d < dim; d++)
m_labelsBufferMultiIO[id].get()[(j * m_numSeqsPerMB + i) * dim + d] = m_labelsBufferMultiUtt[i].get()[k * dim + d + m_labelsStartIndexMultiUtt[id + i * numOfLabel]];
}
}
}
m_processedFrame[i] += (endFr - startFr);
}
else if (m_nameToTypeMap[iter->first] == InputOutputTypes::category)
else
{
id = m_labelNameToIdMap[iter->first];
dim = m_labelNameToDimMap[iter->first];
for (size_t j = startFr,k=0; j < endFr; j++,k++)
{
for (int d = 0; d < dim; d++)
m_labelsBufferMultiIO[id].get()[(j * m_numSeqsPerMB + i) * dim + d] = m_labelsBufferMultiUtt[i].get()[k * dim + d + m_labelsStartIndexMultiUtt[id + i * numOfLabel]];
}
// Mark gaps with NoInput
m_pMBLayout->SetAsNoInput(i, actualmbsize[i], m_mbNumTimeSteps);
// TODO: Also blast the gaps in the features and labels matrices with NaNs to prevent them from being read
}
}
if (reNewSucc) m_processedFrame[i] += (endFr-startFr);
}
}
for (auto iter = matrices.begin();iter!=matrices.end(); iter++)

Просмотреть файл

@ -48,6 +48,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// construction
// -----------------------------------------------------------------------
// TODO: why is this needed? Why is this not just construction?
void ComputationNetwork::ClearNet()
{
for (auto groupIter : GetAllNodeGroups())
@ -59,6 +60,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_cacheEvalOrders.clear();
m_cacheGradientCalcOrders.clear();
m_cachedOuterLoopNodes.clear();
m_inputs.clear();
m_learnableParameters.clear();
@ -567,14 +569,18 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
// this is called from ClearCache() only, which in turn is called by model editing operations, such as DeleteNode(), and by RebuildNetwork()
// Basically, it invalidates all post-processing, reducing the network to the graph.
// Invalidates all cached network-traversal state, reducing the network back to the raw graph.
// Called from ClearCache() only (model editing operations such as DeleteNode(), and RebuildNetwork()).
void ComputationNetwork::ClearCalcOrderCaches()
{
    // Reset the per-node state that FormRecurrentLoops() left behind, for every node
    // in every cached evaluation order.
    // TODO: Why is this done? This looks like an error (this function was called ClearCache() before, so maybe someone threw this call in for good measure)
    for (auto & order : m_cacheEvalOrders)
        for (auto & node : order.second) // use the pair's mapped value directly; the original re-looked up the key (m_cacheEvalOrders[order.first]), a redundant map lookup per outer iteration
            node->PurgeStateForFormingRecurrentLoops();
    // clear the cached network traversal orders and the cached outer-loop sentinels
    m_cacheEvalOrders.clear();
    m_cacheGradientCalcOrders.clear();
    m_cachedOuterLoopNodes.clear();
}
// lazily reate the m_inputs[] and m_learnableParameters lists

Просмотреть файл

@ -64,6 +64,8 @@ protected:
// This structure stores that little sub-network.
class RecurrentFlowControlNode : public FlowControlNode
{
public: // m_nestedNodes needed public by ComputationNetwork::FindInRecurrentLoops(), which really should be part of RecurrentFlowControlNode
typedef FlowControlNode Base; using Base::m_nestedNodes;
public:
// next steps:
// - change m_recurrentInfo to use shared_ptrs to ComputationNodeBase
@ -76,11 +78,14 @@ protected:
virtual void ComputeInputPartial(const size_t inputIndex, const FrameRange &) override { NOT_IMPLEMENTED; } // ugh, call ComputeGradientForChildren() instead
virtual void OnComputeGradientEndIteration() override;
virtual void ComputeGradientForChildren(const FrameRange & frameRange, bool childrenInThisLoop, bool childrenInOuterLoop) override;
// TODO: should the following be virtualized, too?
const wstring & NodeName() const { return m_sourceNode->NodeName(); } // TODO: why not return a const wchar_t* again?
bool IsFuncValueOlderThanInputs() const;
virtual void RequestMatricesBeforeEval(MatrixPool& matrixPool);
virtual void ReleaseMatricesAfterEval(MatrixPool& matrixPool);
virtual void AllocateGradientMatricesForChildren(MatrixPool& matrixPool);
virtual void RequestMatricesBeforeGradientComp(MatrixPool& matrixPool);
virtual void ReleaseMatricesAfterGradientComp(MatrixPool& matrixPool);
virtual bool IsFuncValueOlderThanInputs() const override;
public:
std::vector<ComputationNodeBasePtr> m_recurrentNodes; // all nodes involved in this loop, in evaluation order
//std::vector<ComputationNodeBasePtr> m_nestedNodes; // all nodes involved in this loop, in evaluation order
ComputationNodeBasePtr m_sourceNode; // one of the nodes of the loop --TODO: What is the special meaning of this node? It seems to always be a delay node.
int m_loopId; // the loop id (index in m_recurrentInfo array)
bool m_completedGradient;
@ -93,6 +98,7 @@ protected:
m_completedGradient(false),
m_completedEvaluate(false)
{
SetNodeName(L"Loop_" + m_sourceNode->NodeName());
}
};
@ -100,6 +106,7 @@ protected:
// This is the outer loop over the network nodes in PAR mode.
class OuterLoopNode : public FlowControlNode
{
typedef FlowControlNode Base; using Base::m_nestedNodes;
public:
virtual const std::wstring OperationName() const override { return L"OuterLoopNode"; }
virtual void UpdateFunctionMBSize() override { NOT_IMPLEMENTED; }
@ -110,9 +117,14 @@ protected:
virtual void ComputeInputPartial(const size_t inputIndex, const FrameRange &) override { NOT_IMPLEMENTED; } // ugh, call ComputeGradientForChildren() instead
virtual void OnComputeGradientEndIteration() override { }
virtual void ComputeGradientForChildren(const FrameRange & frameRange, bool childrenInThisLoop, bool childrenInOuterLoop) override;
virtual void RequestMatricesBeforeEval(MatrixPool& matrixPool);
virtual void ReleaseMatricesAfterEval(MatrixPool& matrixPool);
virtual void AllocateGradientMatricesForChildren(MatrixPool& matrixPool);
virtual void RequestMatricesBeforeGradientComp(MatrixPool& matrixPool);
virtual void ReleaseMatricesAfterGradientComp(MatrixPool& matrixPool);
public:
OuterLoopNode(/*const*/ std::vector<shared_ptr<RecurrentFlowControlNode>> & recurrentInfo, const std::list<ComputationNodeBasePtr> & allNodes);
std::list<shared_ptr<IComputationNode>> m_outerNodes; // all top-level nodes, in evaluation order. Nested nodes are tucked inside FlowControlNodes.
// m_nestedNodes contains all top-level nodes, in evaluation order
};
public:
@ -640,6 +652,11 @@ public:
// and for a set of nodes
// Prepare the sub-network below 'rootNode' for minibatch evaluation.
// Currently this only builds and validates the sub-network; the disabled
// #if 0 block records that eager matrix allocation was considered here but
// deliberately left out (see the TODOs inside it).
void StartEvaluateMinibatchLoop(const ComputationNodeBasePtr & rootNode) // (ugly name; meant to be unique so we can rename if needed)
{
#if 0
// TODO: allocation does not belong here. This is called e.g. after loading. Memory should be allocated only when actually evaluating.
// TODO: move into StartEvaluateMinibatchLoop(), but that is called for output nodes individually--can the process handle that?
AllocateEvalMatrices(rootNode);
#endif
// TODO: do we need to reset time stamps?
BuildAndValidateSubNetwork(rootNode);
}
@ -798,39 +815,49 @@ public:
void ClearGradientForAllNodes(const ComputationNodeBasePtr& rootNode)
{
std::list<ComputationNodeBasePtr>& allNodes = GetGradientCalcOrder(rootNode);
std::list<ComputationNodeBasePtr>& allNodes = GetGradientCalcOrder(rootNode); // note: any order will do
for (auto &node : allNodes)
node->ClearGradientForChildren();
for (auto & recInfo : m_recurrentInfo)
for (auto & recInfo : m_recurrentInfo) // TODO: this will go away
recInfo->m_completedGradient = false;
}
// -----------------------------------------------------------------------
// evaluation: traversal
// These three functions create and cache traversal orders of the network.
// -----------------------------------------------------------------------
// determine the required order in which nodes must be computed in order to compute 'rootNode'
// recurrent == true is only used when called from FormRecurrentLoops()
std::list<ComputationNodeBasePtr>& GetEvalOrder(const ComputationNodeBasePtr& rootNode, bool setVisitedOrder)
// skipPairNetwork == true is only used when called from FormRecurrentLoops()
std::list<ComputationNodeBasePtr>& GetEvalOrder(const ComputationNodeBasePtr& rootNode, bool skipPairNetwork)
{
return GetCalcOrder(rootNode, m_cacheEvalOrders, true/*means for forward prop*/, setVisitedOrder);
return GetCalcOrder(rootNode, m_cacheEvalOrders, true/*means for forward prop*/, skipPairNetwork);
}
// determine the required order in which nodes must be computed in order to compute the gradient of 'rootNode'
// Basically returns the reverse of GetEvalOrder(), with some special consideration to loops.
std::list<ComputationNodeBasePtr>& GetGradientCalcOrder(const ComputationNodeBasePtr& rootNode)
{
return GetCalcOrder(rootNode, m_cacheGradientCalcOrders, false/*means for backprop*/, false/*setVisitedOrder*/);
return GetCalcOrder(rootNode, m_cacheGradientCalcOrders, false/*means for backprop*/, false/*skipPairNetwork*/);
}
// Lazily create, cache, and return the OuterLoopNode (nested-network sentinel)
// for 'rootNode'. The cache (m_cachedOuterLoopNodes) avoids rebuilding the
// nested network on every evaluation; it is emptied by ClearCalcOrderCaches().
ComputationNodeBasePtr GetOuterLoopNode(const ComputationNodeBasePtr& rootNode)
{
    // single find()+emplace() instead of the original find() followed by two
    // separate operator[] lookups on the same key
    auto iter = m_cachedOuterLoopNodes.find(rootNode);
    if (iter == m_cachedOuterLoopNodes.end())
        iter = m_cachedOuterLoopNodes.emplace(rootNode, make_shared<OuterLoopNode>(m_recurrentInfo, GetEvalOrder(rootNode, false))).first;
    return iter->second;
}
private:
static std::list<ComputationNodeBasePtr>& GetCalcOrder(const ComputationNodeBasePtr rootNode,
static std::list<ComputationNodeBasePtr>& GetCalcOrder(const ComputationNodeBasePtr & rootNode,
std::map<const ComputationNodeBasePtr, std::list<ComputationNodeBasePtr>>& orderMap,
const bool forwardCompute, bool setVisitedOrder)
const bool forwardCompute, bool skipPairNetwork)
{
if (!rootNode)
LogicError("rootNode is NULL.");
if (orderMap.find(rootNode) == orderMap.end())
orderMap[rootNode] = rootNode->EnumerateNodes(forwardCompute, setVisitedOrder);
orderMap[rootNode] = rootNode->EnumerateNodes(forwardCompute, skipPairNetwork);
return orderMap[rootNode];
}
@ -908,8 +935,10 @@ private: // TODO: make all private that can be made private
// cache for evaluation ordering:
std::unordered_set<ComputationNodeBasePtr> m_built; // [node] flag: BuildAndValidateSubNetwork() has been called
// cached network Iterations
std::map<const ComputationNodeBasePtr, std::list<ComputationNodeBasePtr>> m_cacheEvalOrders;
std::map<const ComputationNodeBasePtr, std::list<ComputationNodeBasePtr>> m_cacheGradientCalcOrders;
std::map<const ComputationNodeBasePtr, ComputationNodeBasePtr> m_cachedOuterLoopNodes;
std::map<const ComputationNodeBasePtr, std::list<ComputationNodeBasePtr>> m_inputs; // [out node] -> all input nodes feeding into out node
std::map<const ComputationNodeBasePtr, std::list<ComputationNodeBasePtr>> m_learnableParameters; // [out node] -> all parameter nodes feeding into out node

Просмотреть файл

@ -42,7 +42,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// determine the strongly connected cliques -> m_recurrentInfo[]
DetermineSCCs(rootNode);
list<ComputationNodeBasePtr>& nodes = GetEvalOrder(rootNode, true/*set m_visitedOrder*/);
list<ComputationNodeBasePtr>& nodes = GetEvalOrder(rootNode, true/*skipPairNetwork*/);
// recover m_visitedOrder
size_t i = 1; // BUGBUG: why not 0? (left-over of refactoring)
for (auto & node : nodes)
node->m_visitedOrder = i++;
// purge identical loops (i.e. loops that have the same source node)
// TODO: Is this for the case that we call this function multiple times, or do the nodes of a loop generate multiple entries? Comment this.
@ -57,24 +61,30 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
size_t max_visitedOrderInLoop = 0;
// TODO: I am sure there is an STL algorithm for this.
for (auto itr : iter->m_recurrentNodes)
for (auto itr : iter->m_nestedNodes)
if (max_visitedOrderInLoop < itr->m_visitedOrder)
max_visitedOrderInLoop = itr->m_visitedOrder;
for (auto itr : iter->m_recurrentNodes)
for (auto itr : iter->m_nestedNodes)
itr->m_visitedOrder = max_visitedOrderInLoop;
}
// implant m_loopId in all nodes in all loops
for (auto & iter : m_recurrentInfo)
{
#if 1 // instead of the redundant sort() below, we just verify
for (auto & node : iter->m_nestedNodes)
if (node->m_visitedOrder != iter->m_nestedNodes.front()->m_visitedOrder)
LogicError("FormRecurrentLoops: m_visitedOrder was set to a constant, but actually... wasn't?");
#else
// sort the recurrent nodes in their ascending name, which is the same as visiting nodes in G^R
// it is done in the mergerecurrentloops function, but just keep the code --TODO: why?? Why not rather verify the order?
// BUGBUG: This sort() seems to do nothing, since the above loop sets all m_visitedOrder to the same value??
sort(iter->m_recurrentNodes.begin(),
iter->m_recurrentNodes.end(),
iter->m_recurrentNodes[0]->ByVisitedOrder);
for (auto & node : iter->m_recurrentNodes)
sort(iter->m_nestedNodes.begin(),
iter->m_nestedNodes.end(),
iter->m_nestedNodes[0]->ByVisitedOrder);
#endif
for (auto & node : iter->m_nestedNodes)
{
node->m_isPartOfLoop = true; // this is the only flag in ComputationNode that escapes FormRecurrentLoops()!
// TODO: ^^ We should instead remember a pointer to our loop sentinel
@ -91,9 +101,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// set m_indexInLoop for all nodes except Past/FutureValueNodes in all loops
// This value is only used in the block right after this.
for (size_t j = 0; j < iter->m_recurrentNodes.size(); j++)
// This is very mysterious. It is certainly no index in loop. More like a parent count, and excluding delay nodes.
for (size_t j = 0; j < iter->m_nestedNodes.size(); j++)
{
ComputationNodeBasePtr node = iter->m_recurrentNodes[j];
ComputationNodeBasePtr node = iter->m_nestedNodes[j];
for (size_t i = 0; i < node->ChildrenSize(); i++)
{
if (node->Inputs(i)->m_loopId == node->m_loopId &&
@ -101,33 +112,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {
node->OperationName() != OperationNameOf(FutureValueNode)) // TODO: test for type RecurrentNode instead?
{
//assert(node->Inputs(i)->m_indexInLoop == 0); // No. It seems this variable really counts the number of parents.
node->Inputs(i)->m_indexInLoop = node->Inputs(i)->m_indexInLoop + 1; // BUGBUG: this is bumping up the m_indexInLoop, but I don't think it is initialized anywhere. i-1?
node->Inputs(i)->m_indexInLoop++; // BUGBUG: this is bumping up the m_indexInLoop, but I don't think it is initialized anywhere other than PurgeStateForFormingRecurrentLoops(). i-1?
}
}
}
for (size_t i = 0; i < iter->m_recurrentNodes.size(); i++)
for (size_t i = 0; i < iter->m_nestedNodes.size(); i++)
{
ComputationNodeBasePtr node = iter->m_recurrentNodes[i];
ComputationNodeBasePtr node = iter->m_nestedNodes[i];
if (visited.find(node) == visited.end() && node->m_indexInLoop == 0)
DetermineLoopForwardOrder(visited, recStack, result, node);
}
#if 1
// update m_recurrentNodes with 'result'
iter->m_recurrentNodes.assign(result.begin(), result.end());
#else
// TODO: this loop seems to just copy the list
// m_recurrentNodes = reverse(result)
iter->m_recurrentNodes.clear();
for (size_t i = 0; i < iter->m_recurrentNodesxx.size(); i++) // BUGBUG: is the size of m_recurrentNodes (before clear) the same as result? Guaranteed?
{
iter->m_recurrentNodes.push_back(result.front());
result.pop_front();
}
iter->m_recurrentNodes = iter->m_recurrentNodes; // TODO: are they ever different?
#endif
// update m_nestedNodes with 'result'
iter->m_nestedNodes.assign(result.begin(), result.end());
}
if (m_recurrentInfo.size() > 0)
@ -167,9 +165,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// log the loops
for (auto & iter : m_recurrentInfo)
{
fprintf(stderr, "\nLoop[%d] --> %ls -> %d nodes\n", (int)iter->m_loopId, iter->m_sourceNode->NodeName().c_str(), (int)iter->m_recurrentNodes.size());
fprintf(stderr, "\nLoop[%d] --> %ls -> %d nodes\n", (int)iter->m_loopId, iter->NodeName().c_str(), (int)iter->m_nestedNodes.size());
size_t n = 0;
for (auto itr = iter->m_recurrentNodes.begin(); itr != iter->m_recurrentNodes.end(); itr++)
for (auto itr = iter->m_nestedNodes.begin(); itr != iter->m_nestedNodes.end(); itr++)
{
if (n++ % 3 == 0)
fprintf(stderr, "\n");
@ -177,6 +175,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
fprintf(stderr, "\n");
}
// now turn this into a nested network, ready for evaluation
GetOuterLoopNode(rootNode);
}
// get the strongly connected components from the graph
@ -227,26 +228,42 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// if we closed a loop then create an entry in m_recurrentInfo
if (cur->m_lowLink == cur->m_index) // m_lowLink is still equal to m_index, as we set it at the start of this function: we closed a loop
{
// TODO: build array first in a local array. Only if succeeds, then construct the node off it.
RecurrentFlowControlNode rInfo(loopId, cur);
for (;;)
{
ComputationNodeBasePtr w = sccStack.back();
sccStack.pop_back();
w->m_inStack = false;
rInfo.m_recurrentNodes.push_back(w);
rInfo.m_nestedNodes.push_back(w);
if (w == cur) // hit our starting point: done
break;
}
if (rInfo.m_recurrentNodes.size() > 1) // non-looped nodes are detected here as loops of size 1 --skip those
if (rInfo.m_nestedNodes.size() > 1) // non-looped nodes are detected here as loops of size 1 --skip those
{
loopId++;
m_recurrentInfo.push_back(make_shared<RecurrentFlowControlNode>(move(rInfo)));
// only add to the array if the loop is not already there
// Since FormRecurrentLoops() is called multiple times, for multiple output nodes, we end up producing the same loop multiple times.
bool bFound = false; // find a dup --TODO: check whether there is an STL algorithm for this
for (const auto & iter2 : m_recurrentInfo)
{
if (iter2->m_sourceNode == cur)
{
bFound = true;
break;
}
}
if (!bFound)
{
// TODO: construct rInfo down here
m_recurrentInfo.push_back(make_shared<RecurrentFlowControlNode>(move(rInfo)));
loopId++; // and count it
}
}
}
}
// purge identical loops (i.e. loops that have the same source node)
// TODO: Why not do this where we push a loop into m_recurrentInfo?
// TODO: Delete this function once we find it never triggers.
void ComputationNetwork::UniqRecurrentLoops()
{
if (m_recurrentInfo.size() <= 1)
@ -262,7 +279,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if ((*iter2).m_sourceNode == iter->m_sourceNode)
{
bFound = true;
break;
LogicError("UniqRecurrentLoops: Duplicate loops should no longer occur."); // ...since tested when creating in the first place.
//break;
}
}
if (!bFound)
@ -348,7 +366,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
int iId = recInfo->m_loopId;
if (!accessed[iId])
{
newList.insert(newList.end(), recInfo->m_recurrentNodes.begin(), recInfo->m_recurrentNodes.end());
newList.insert(newList.end(), recInfo->m_nestedNodes.begin(), recInfo->m_nestedNodes.end());
accessed[iId] = true;
}
}
@ -378,12 +396,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
for (auto & rInfo : m_recurrentInfo)
{
assert(rInfo->m_recurrentNodes.size() > 0); // (this check was left over after refactoring; it should not be necessary)
assert(rInfo->m_nestedNodes.size() > 0); // (this check was left over after refactoring; it should not be necessary)
bool hasPastValueNode = false;
bool hasFutureValueNode = false;
for (auto & node : rInfo->m_recurrentNodes)
for (auto & node : rInfo->m_nestedNodes)
{
if (node->OperationName() == OperationNameOf(PastValueNode))
hasPastValueNode = true;

Просмотреть файл

@ -29,11 +29,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// TODO: pass a set of nodes instead of only one
// TODO: rename to ForwardProp()? To make it very clear?
// This calls EvaluateThisNode() on all nodes in order of data flow through the network.
// By default, the network is applied concurrently on all frames in a minibatch in parallel (a "map" operation)
// By default, the network is applied concurrently on all frames in a minibatch in parallel (PAR mode, a "map" operation)
// Recurrent loops deviate:
// - a recurrent loop is the loop of nodes that make up computation for one time step (e.g. Times -> Plus -> Sigmoid -> Delay)
// - these must be executed frame by frame rather than as a map
// - such a loop is treated as if they were a little nested network; this is done inside here
// - such a loop is treated as if they were a little nested network; this is done inside RecurrentFlowControlNodes
// - these little nested networks are defined in m_recurrentInfo[]
void ComputationNetwork::Evaluate(const ComputationNodeBasePtr & rootNode)
{
@ -43,14 +43,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
LogicError("Evaluate for node %ls %ls: BuildAndValidateSubNetwork() has not been called on this node.");
// TODO: change this to a time stamp to make it consistent with PAR mode
// TODO: No, this is no longer needed with OuterLoopNode. Keep it for now to verify this through runtime checks.
for (auto & recInfo : m_recurrentInfo)
recInfo->m_completedEvaluate = false;
// traverse all nodes in the pre-determined evaluation order
#define USE_OUTER_LOOP_NODE // once this is working then get rid of this #define
#ifdef USE_OUTER_LOOP_NODE
OuterLoopNode outerLoopNode(m_recurrentInfo, GetEvalOrder(rootNode, false));
outerLoopNode.EvaluateThisNode(FrameRange(nullptr));
GetOuterLoopNode(rootNode)->EvaluateThisNode(FrameRange(nullptr));
#else
// determines order of evaluation, such that children get evaluated before their parent nodes
std::list<ComputationNodeBasePtr>& allNodes = GetEvalOrder(rootNode, false);
@ -63,7 +63,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
shared_ptr<RecurrentFlowControlNode> recInfo = FindInRecurrentLoops(m_recurrentInfo, node); // check if this node participates in a recurrent loop
if (recInfo && IsFuncValueOlderThanInputs(recInfo->m_recurrentNodes) && !recInfo->m_completedEvaluate)
if (recInfo && IsFuncValueOlderThanInputs(recInfo->m_nestedNodes) && !recInfo->m_completedEvaluate)
{
#if 1
recInfo->UpdateFunctionMBSize();
@ -72,7 +72,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
recInfo->OnEvaluateEndIteration();
#else
// node participates in a recurrent loop: process the loop frame by frame
const auto & recurrentNodes = recInfo->m_recurrentNodes;
const auto & recurrentNodes = recInfo->m_nestedNodes;
// get layout associated with this loop
auto pMBLayout = recurrentNodes[0]->GetMBLayout();
@ -148,7 +148,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// MAIN ENTRY POINT for evaluation followed by gradient computation (forward prop then back prop)
// TODO: pass a set of nodes instead of only one?
// TODO: remove Evaluate() from here, instead call it at call site, and in here merely check whether everything is computed already
// BUGBUG: The decision to loop (SEQ execution) is made by parent, but some children can be executer PAR. It should be possible to detect this.
template<class ElemType>
void ComputationNetwork::ComputeGradient(const ComputationNodeBasePtr rootNode, // training criterion to compute the gradients for
bool bResetToOne, // true if reset the gradient of rootnode to 1.0 --This is the default.
@ -181,17 +180,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
dynamic_pointer_cast<ComputationNode<ElemType>>(rootNode)->GradientValues().SetValue(*rootGradientInitValue);
#ifdef USE_OUTER_LOOP_NODE
#if 1
// sanity check --TODO: remove this once this has been found to not trigger for a while (it should be--EnumerateNodes() just reverses its result when called by GetGradientCalcOrder(). Which makes a lot of sense.)
auto evalOrder = GetEvalOrder(rootNode, false);
auto gradOrder = GetGradientCalcOrder(rootNode);
evalOrder.reverse();
if (evalOrder != gradOrder)
LogicError("ComputeGradient: Gradient computation order must be reverse of evaluation order.");
#endif
OuterLoopNode outerLoopNode(m_recurrentInfo, GetEvalOrder(rootNode, false));
outerLoopNode.ComputeGradientForChildren(FrameRange(nullptr), true, true);
GetOuterLoopNode(rootNode)->ComputeGradientForChildren(FrameRange(nullptr), true, true);
#else
// run backprop pass
std::list<ComputationNodeBasePtr>& allNodes = GetGradientCalcOrder(rootNode);
@ -214,7 +203,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
recInfo->ComputeGradientForChildren(FrameRange(node->GetMBLayout()), true, true);
recInfo->OnComputeGradientEndIteration();
#else
const auto & recurrentNodes = recInfo->m_recurrentNodes;
const auto & recurrentNodes = recInfo->m_nestedNodes;
for (auto & node2 : recurrentNodes)
node2->OnComputeGradientBeginIteration();
auto pMBLayout = recurrentNodes[0]->GetMBLayout();
@ -267,6 +256,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template void ComputationNetwork::ComputeGradient<double>(const ComputationNodeBasePtr rootNode, bool bResetToOne, const Matrix<double>* rootGradientInitValue, bool bClearGradient, bool resetTimeStampAfterComputation);
#ifdef USE_OUTER_LOOP_NODE
// -----------------------------------------------------------------------
// OuterLoopNode methods -- implements PAR traversal
// -----------------------------------------------------------------------
// implementation of OuterLoopNode (implements outer loop over non-recurrent nodes)
ComputationNetwork::OuterLoopNode::OuterLoopNode(/*const*/ std::vector<shared_ptr<RecurrentFlowControlNode>> & recurrentInfo, const std::list<ComputationNodeBasePtr> & allNodes/*must be in eval order*/)
{
@ -278,7 +271,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (recInfo) // node is part of a SEQ loop: gather all of them. The nodes must be consecutive in 'allNodes'
{
// instead of the node itself, include the sentinel RecurrentFlowControlNode in our list
m_outerNodes.push_back(recInfo);
m_nestedNodes.push_back(recInfo);
// and verify that we only encountered the loop once (all nodes should have been consecutive)
if (!loopsSeen.insert(recInfo).second)
LogicError("OuterLoopNode: members of loop %ls are not consecutive in node list.", recInfo->NodeName().c_str());
@ -288,24 +281,52 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
else // regular top-level node (non-looping, PAR)
{
m_outerNodes.push_back(*nodeIter);
m_nestedNodes.push_back(*nodeIter);
nodeIter++; // and consume this node
}
}
}
/*virtual*/ void ComputationNetwork::OuterLoopNode::EvaluateThisNode(const FrameRange & frameRange) /*override*/
{
for (auto & pnode : m_outerNodes)
for (auto & node : m_nestedNodes)
{
auto recInfo = dynamic_pointer_cast<RecurrentFlowControlNode>(pnode);
auto node = dynamic_pointer_cast<ComputationNodeBase>(pnode);
// TODO: This ^^ is not nice.
// We are close but not finished with unifying. Eventually, there must be no if statement below.
#if 1
#if 1
if (node->IsFuncValueOlderThanInputs())
#else
bool isFuncValueOlderThanInputs =
(recInfo && recInfo->IsFuncValueOlderThanInputs()) || // TODO: abstract this out into a virtual function
(node && node->IsFuncValueOlderThanInputs());
if (isFuncValueOlderThanInputs)
#endif
{
auto recInfo = dynamic_pointer_cast<RecurrentFlowControlNode>(node);
if (recInfo)
assert(recInfo->m_sourceNode->GetMBLayout() == node->GetMBLayout());
if (recInfo)
assert(!recInfo->m_completedEvaluate); // TODO: not needed anymore, I think
node->UpdateFunctionMBSize();
// BUGBUG: IsLeaf() for RecurrentFlowControlNode returns false because that node has no children. So we get lucky here. Otherwise it would fail in Validate(). Fix this by getting rid of the Validate() call here.
if (node && !node->IsLeaf() && !node->RequiresPreCompute())
node->Validate(true); // BUGBUG: Validate() should not be called during evaluation. This is meant to update m_functionValues' size in case of sharing.
node->OnEvaluateBeginIteration();
node->EvaluateThisNode(frameRange.WithLayout(node->GetMBLayout()));
node->OnEvaluateEndIteration();
if (recInfo)
recInfo->m_completedEvaluate = true;
node->UpdateEvalTimeStamp(); // TODO: abstract this out to a virtual function
}
#else
// --- if this node is part of a recurrence, evaluate all nodes that participate in this loop
if (recInfo && recInfo->IsFuncValueOlderThanInputs() && !recInfo->m_completedEvaluate)
if (recInfo && recInfo->IsFuncValueOlderThanInputs() /*&& !recInfo->m_completedEvaluate*/)
{
assert(!recInfo->m_completedEvaluate);
pnode->UpdateFunctionMBSize();
pnode->OnEvaluateBeginIteration();
pnode->EvaluateThisNode(frameRange.WithLayout(recInfo->m_sourceNode->GetMBLayout()));
@ -330,6 +351,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
pnode->OnEvaluateEndIteration();
node->UpdateEvalTimeStamp();
}
#endif
#ifdef _DEBUG
else if (node)
node->OnEvaluateEndIteration(); // HACK: performs NaN check, but does nothing else
@ -340,14 +362,25 @@ namespace Microsoft { namespace MSR { namespace CNTK {
/*virtual*/ void ComputationNetwork::OuterLoopNode::ComputeGradientForChildren(const FrameRange & frameRange, bool childrenInThisLoop, bool childrenInOuterLoop) /*override*/
{
childrenInThisLoop, childrenInOuterLoop; // TODO: think through what these mean when coming from PAR mode
// TODO: finish this
// process nodes in pre-determined order
for (auto inode = m_outerNodes.rbegin(); inode != m_outerNodes.rend(); inode++) // iterate backwards over evaluation order
for (auto pnode = m_nestedNodes.rbegin(); pnode != m_nestedNodes.rend(); pnode++) // iterate backwards over evaluation order
{
auto pnode = *inode;
auto recInfo = dynamic_pointer_cast<RecurrentFlowControlNode>(pnode);
auto node = dynamic_pointer_cast<ComputationNodeBase>(pnode);
auto & node = *pnode;
#if 1
auto recInfo = dynamic_pointer_cast<RecurrentFlowControlNode>(node);
if (recInfo)
assert(recInfo->m_sourceNode->GetMBLayout() == node->GetMBLayout());
if (recInfo)
assert(!recInfo->m_completedGradient); // TODO: not needed anymore, I think
node->OnComputeGradientBeginIteration();
node->ComputeGradientForChildren(frameRange.WithLayout(node->GetMBLayout()), true, true);
node->OnComputeGradientEndIteration();
if (recInfo)
recInfo->m_completedGradient = true;
#else
// --- first, perform recurrent loops if this node participates in one
if (recInfo)
@ -377,34 +410,45 @@ namespace Microsoft { namespace MSR { namespace CNTK {
pnode->ComputeGradientForChildren(frameRange.WithLayout(node->GetMBLayout()), true, true);
pnode->OnComputeGradientEndIteration();
}
#endif
}
}
// Memory-pool hooks are deliberately no-ops at the OuterLoopNode (top) level.
// NOTE(review): presumably matrix sharing is driven per node by the network's
// AllocateEvalMatrices()/AllocateGradientMatrices() walks instead -- confirm.
/*virtual*/ void ComputationNetwork::OuterLoopNode::RequestMatricesBeforeEval(MatrixPool& matrixPool) /*override*/ { }
/*virtual*/ void ComputationNetwork::OuterLoopNode::ReleaseMatricesAfterEval(MatrixPool& matrixPool) /*override*/ { }
/*virtual*/ void ComputationNetwork::OuterLoopNode::AllocateGradientMatricesForChildren(MatrixPool& matrixPool) /*override*/ { }
/*virtual*/ void ComputationNetwork::OuterLoopNode::RequestMatricesBeforeGradientComp(MatrixPool& matrixPool) /*override*/ { }
/*virtual*/ void ComputationNetwork::OuterLoopNode::ReleaseMatricesAfterGradientComp(MatrixPool& matrixPool) /*override*/ { }
#endif
// -----------------------------------------------------------------------
// RecurrentFlowControlNode methods -- implements SEQ traversal
// -----------------------------------------------------------------------
// implementations of RecurrentFlowControlNode (loop unrolling)
/*virtual*/ void ComputationNetwork::RecurrentFlowControlNode::UpdateFunctionMBSize() /*override*/
{
for (auto & node2 : m_recurrentNodes)
for (auto & node2 : m_nestedNodes)
node2->UpdateFunctionMBSize(); // TODO: for sequence-to-sequence models we will need to be able to grow this step by step since size is unknown upfront
}
/*virtual*/ void ComputationNetwork::RecurrentFlowControlNode::OnEvaluateBeginIteration() /*override*/
{
// get layout associated with this loop
auto pMBLayout = m_recurrentNodes[0]->GetMBLayout();
// take the opportunity to check that layout is shared by all nodes in the loop
// TODO: we should do this in a constructor.
for (auto & node2 : m_nestedNodes)
{
if (node2->GetMBLayout() != GetMBLayout())
LogicError("Evaluate: all nodes inside a recurrent loop must have a layout that is identical; mismatch found for nodes '%ls' vs. '%ls'",
node2->NodeName().c_str(), m_nestedNodes[0]->NodeName().c_str());
}
// tell all that loop is about to commence
for (auto & node2 : m_recurrentNodes)
{
if (!pMBLayout || node2->GetMBLayout() != pMBLayout) // take the opportunity to check that layout is shared by all nodes in the loop
LogicError("Evaluate: all nodes inside a recurrent loop must have a layout that is identical; mismatch found for nodes '%ls' vs. '%ls'",
node2->NodeName().c_str(), m_recurrentNodes[0]->NodeName().c_str());
for (auto & node2 : m_nestedNodes)
node2->OnEvaluateBeginIteration();
}
// since we share memory we need to resize function value matrices correctly
// TODO: No, Validate() should only run as a prep stage. This will go away once we separate dimension inference and actual resizing.
for (auto & node2 : m_recurrentNodes)
for (auto & node2 : m_nestedNodes)
node2->Validate(true);
}
@ -416,13 +460,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
// get layout associated with this loop
// All nodes share the same layout.
auto pMBLayout = m_recurrentNodes[0]->GetMBLayout();
assert(GetMBLayout() == m_nestedNodes[0]->GetMBLayout());
// for every time step run through all nodes in this particular loop (treat the loop like a little ComputationNetwork)
FrameRangeIteration range(pMBLayout, m_steppingDirection);
FrameRangeIteration range(GetMBLayout(), m_steppingDirection);
for (auto t = range.begin(); t != range.end(); t++)
{
for (auto & node2 : m_recurrentNodes)
for (auto & node2 : m_nestedNodes)
{
//fprintf(stderr, "EvaluateThisNode %d %ls %ls\n", (int)t.timeIdxInSeq, node2->NodeName().c_str(), node2->OperationName().c_str());
node2->EvaluateThisNode(t);
@ -437,20 +481,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {
/*virtual*/ void ComputationNetwork::RecurrentFlowControlNode::OnEvaluateEndIteration() /*override*/
{
// tell all that loop is done --e.g. PastValueNode will capture its state for BPTT processing
for (auto & node2 : m_recurrentNodes)
for (auto & node2 : m_nestedNodes)
node2->OnEvaluateEndIteration();
}
// called before first iteration step of ComputeGradient()
/*virtual*/ void ComputationNetwork::RecurrentFlowControlNode::OnComputeGradientBeginIteration() /*override*/
{
for (auto & node2 : m_recurrentNodes)
for (auto & node2 : m_nestedNodes)
node2->OnComputeGradientBeginIteration();
}
/*virtual*/ void ComputationNetwork::RecurrentFlowControlNode::ComputeGradientForChildren(const FrameRange &, bool childrenInThisLoop, bool childrenInOuterLoop) /*override*/
{
childrenInThisLoop, childrenInOuterLoop; // TODO: think through what these mean when coming from PAR mode
const auto & recurrentNodes = m_recurrentNodes; // BUGBUG: -ForForward?? Does this mean we can remove non-ForForward?
const auto & recurrentNodes = m_nestedNodes; // BUGBUG: -ForForward?? Does this mean we can remove non-ForForward?
auto pMBLayout = recurrentNodes[0]->GetMBLayout();
FrameRangeIteration range(pMBLayout, m_steppingDirection);
for (auto t = range.rbegin(); t != range.rend(); t++) // note: reverse iteration
@ -476,7 +520,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
/*virtual*/ void ComputationNetwork::RecurrentFlowControlNode::OnComputeGradientEndIteration() /*override*/
{
#ifdef OPT_OUTER_GRADIENT
for (auto nodeIter2 = m_recurrentNodes.rbegin(); nodeIter2 != m_recurrentNodes.rend(); ++nodeIter2)
for (auto nodeIter2 = m_nestedNodes.rbegin(); nodeIter2 != m_nestedNodes.rend(); ++nodeIter2)
{
auto & node2 = *nodeIter2;
// BUGBUG: The following can no longer be done after this code was moved into RecurrentFlowControlNode
@ -484,13 +528,37 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//if (IsNodeReqMultiSeqHandling(node2))
// node2->MaskMissingGradientColumnsToZero(t);
// TODO: exclude children that are not part of the recurrent loop, and do thise below, separately.
node2->ComputeGradientForChildren(FrameRange(m_recurrentNodes[0]->GetMBLayout()), false/*childrenInThisLoop*/, true/*childrenInOuterLoop*/);
node2->ComputeGradientForChildren(FrameRange(m_nestedNodes[0]->GetMBLayout()), false/*childrenInThisLoop*/, true/*childrenInOuterLoop*/);
}
#endif
for (auto & node2 : m_recurrentNodes)
for (auto & node2 : m_nestedNodes)
node2->OnComputeGradientEndIteration();
}
/*virtual*/ void ComputationNetwork::RecurrentFlowControlNode::RequestMatricesBeforeEval(MatrixPool& matrixPool) /*override*/
{
    // Forward the matrix-pool request to every node nested inside this SEQ loop,
    // so each can grab the function-value matrices it needs before evaluation.
    for (auto iter = m_nestedNodes.begin(); iter != m_nestedNodes.end(); ++iter)
        (*iter)->RequestMatricesBeforeEval(matrixPool);
}
/*virtual*/ void ComputationNetwork::RecurrentFlowControlNode::ReleaseMatricesAfterEval(MatrixPool& matrixPool) /*override*/ { } // no-op: function values of loop members may still be needed for the gradient pass
/*virtual*/ void ComputationNetwork::RecurrentFlowControlNode::AllocateGradientMatricesForChildren(MatrixPool& matrixPool) /*override*/
{
    // Walk the nested nodes in reverse evaluation order (the order in which
    // gradients will be computed) and let each allocate its children's gradients.
    // TODO: should we deallocate in opposite order?
    for (auto iter = m_nestedNodes.rbegin(); iter != m_nestedNodes.rend(); ++iter)
    {
        auto & child = *iter;
        child->AllocateGradientMatricesForChildren(matrixPool);
    }
}
/*virtual*/ void ComputationNetwork::RecurrentFlowControlNode::RequestMatricesBeforeGradientComp(MatrixPool& matrixPool) /*override*/ { } // no-op: gradient matrices are requested via AllocateGradientMatricesForChildren() instead
/*virtual*/ void ComputationNetwork::RecurrentFlowControlNode::ReleaseMatricesAfterGradientComp(MatrixPool& matrixPool) /*override*/
{
    // Release gradient matrices of all nested nodes back to the pool, walking
    // in reverse evaluation order. Nodes that need no gradient never requested
    // matrices, so they are skipped.
    for (auto iter = m_nestedNodes.rbegin(); iter != m_nestedNodes.rend(); ++iter)
    {
        auto & child = *iter;
        if (child->NeedGradient())
            child->ReleaseMatricesAfterGradientComp(matrixPool);
    }
}
// find if node is part of a recurrent loop; and return the loop id
// If found then return a pointer to the list of nodes of this loop.
/*static*/ shared_ptr<ComputationNetwork::RecurrentFlowControlNode> ComputationNetwork::FindInRecurrentLoops(/*const*/ std::vector<std::shared_ptr<RecurrentFlowControlNode>> & recurrentInfo, const ComputationNodeBasePtr& node)
@ -498,14 +566,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// look in all recurrent loops of the network
// TODO: Check for IsPartOfLoop(). Also why not store the loop id in the node for direct lookup?
for (auto & iter : recurrentInfo)
if (std::find(iter->m_recurrentNodes.begin(), iter->m_recurrentNodes.end(), node) != iter->m_recurrentNodes.end()) // TODO: should this loop need to be a method of RecurrentFlowControlNode?
if (std::find(iter->m_nestedNodes.begin(), iter->m_nestedNodes.end(), node) != iter->m_nestedNodes.end()) // TODO: should this loop need to be a method of RecurrentFlowControlNode?
return iter;
return nullptr; // not part of a recurrent loop
}
// check if any of the nodes in the recurrence IsFuncValueOlderThanInputs(), with exception of delay nodes for which this check would fail and can be skipped
// TODO: Would it be sufficient to check against our own time stamp, so that we can use a unified time-stamping mechanism? Then we'd not need this special check for delayed nodes; just check all inputs against our own time stamp.
// TODO: move this function up to its peers
bool ComputationNetwork::RecurrentFlowControlNode::IsFuncValueOlderThanInputs() const
{
for (auto & ptr : m_recurrentNodes)
for (auto & ptr : m_nestedNodes)
{
if (ptr->IsFuncValueOlderThanInputs() &&
ptr->OperationName() != OperationNameOf(PastValueNode) &&
@ -517,6 +588,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return false;
}
#ifndef USE_OUTER_LOOP_NODE
// TODO: this will move into RecurrentFlowControlNode
bool ComputationNetwork::IsFuncValueOlderThanInputs(const vector<ComputationNodeBasePtr>& recurrentNodes)
{
@ -531,7 +603,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
return false;
}
#endif
// TODO: do this on OuterLoopNode
void ComputationNetwork::ResetEvalTimeStamp()
{
for (auto nodeIter = m_nameToNodeMap.begin(); nodeIter != m_nameToNodeMap.end(); nodeIter++)
@ -587,8 +661,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (FeatureNodes().size() == 0 && !allowFragment)
RuntimeError("No Feature nodes specified");
#if 1 // If it is not done here, it will cause a crash. But it really only belongs into StartEvaluationMinibatchLoop()
// TODO: allocation does not belong here. This is called e.g. after loading. Memory should be allocated only when actually evaluating.
// TODO: move into StartEvaluateMinibatchLoop(), but that is called for output nodes individually--can the process handle that?
AllocateAllEvalMatrices(EvaluationNodes(), OutputNodes(), FinalCriterionNodes());
#endif
// first give criteria nodes as root node
if (FinalCriterionNodes().size() > 0)
{
@ -686,6 +763,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (toValidate != 0)
LogicError("ValidateSubNetwork: ValidateNodes(true) unexpectedly returned with work left to do.");
// propagate some info to RecurrentFlowControlNode
// TODO: In the future we should validate not on the flat list but the OuterLoopNode structure. Then this will be unnecessary.
for (auto & recInfo : m_recurrentInfo)
{
auto & node = recInfo->m_sourceNode;
recInfo->m_needsGradient = node->m_needsGradient;
recInfo->LinkToMBLayout(node->GetMBLayout());
}
for (auto & node : nodes)
{
#if 0 // not possible once we have inconsistent layouts
@ -787,7 +873,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// BUGBUG? Lazy triggers on the root node. I.e. for two different root nodes (training, eval), it validates twice.
void ComputationNetwork::BuildAndValidateSubNetwork(const ComputationNodeBasePtr rootNode)
{
const auto inserted = m_built.insert(rootNode).second; // remember we built it
bool inserted = m_built.insert(rootNode).second; // remember we built it
if (!inserted)
return; // already done
@ -802,6 +888,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ValidateSubNetwork(rootNode);
}
// tests whether BuildAndValidateSubNetwork() was called
bool ComputationNetwork::BuiltAndValidatedSubNetwork(const ComputationNodeBasePtr & rootNode)
{
return m_built.find(rootNode) != m_built.end();
@ -810,11 +897,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
// memory allocation
// -----------------------------------------------------------------------
//this function will need to be called before actual validation and execution to
//predetermine how to share matrices to reduce memory usage.
//TODO: find a simple topological order and allocateEvalMatrices on that order directly
//without passing in eval, out, and train nodes.
#if 1
// this function will need to be called before actual validation and execution to
// predetermine how to share matrices to reduce memory usage.
// TODO: find a simple topological order and allocateEvalMatrices on that order directly
// without passing in eval, out, and train nodes.
void ComputationNetwork::AllocateAllEvalMatrices(std::vector<ComputationNodeBasePtr>& evalRootNodes,
std::vector<ComputationNodeBasePtr>& outValueRootNodes,
std::vector<ComputationNodeBasePtr>& trainRootNodes)
@ -829,6 +916,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
AllocateEvalMatrices(trainRootNodes[i]);
}
#endif
// TODO: use the same loop mechanism as Evaluate()
void ComputationNetwork::AllocateEvalMatrices(ComputationNodeBasePtr rootNode)
@ -859,15 +947,18 @@ namespace Microsoft { namespace MSR { namespace CNTK {
assert(recInfo != nullptr);
if (!recInfo->m_completedEvaluate)
{
const auto & recurrentNodes = recInfo->m_recurrentNodes;
for (auto &nodeLoopIter : recurrentNodes)
#if 1
recInfo->RequestMatricesBeforeEval(m_matrixPool);
#else
for (auto &nodeLoopIter : recInfo->m_nestedNodes)
{
nodeLoopIter->RequestMatricesBeforeEval(m_matrixPool);
}
#endif
recInfo->m_completedEvaluate = true;
for (auto &nodeLoopIter : recurrentNodes)
for (auto &nodeLoopIter : recInfo->m_nestedNodes)
{
ReleaseMatricesAfterEvalForChildren(nodeLoopIter, parentCount);
}
@ -898,16 +989,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
FormRecurrentLoops(rootNode);
//PopulateParents(rootNode);
std::list<ComputationNodeBasePtr>& allNodes = GetGradientCalcOrder(rootNode);
//determine children size
//std::map<ComputationNodeBasePtr, int> childrenCount;
//for (auto &nodeIter : allNodes)
//{
// childrenCount[nodeIter] = nodeIter->ChildrenSize();
//}
//now, simulate the gradient computation order to determine how to allocate matrices
for (auto & recInfo : m_recurrentInfo)
recInfo->m_completedGradient = false;
@ -923,11 +1006,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
shared_ptr<RecurrentFlowControlNode> recInfo = FindInRecurrentLoops(m_recurrentInfo, n);
if (recInfo && recInfo->m_completedGradient == false)
{
const auto & recurrentNodes = recInfo->m_recurrentNodes;
// SEQ mode: allocate all in loop first, then deallocate again
#if 1 // TODO: next step: use OuterLoopNode::AllocateGradientMatricesForChildren() and ReleaseMatricesAfterGradientComp()...
// BUGBUG: naw, ^^ would not work! Wrong order! Need to rethink this. Need to make AllocateEvalMatrices() and AllocateGradientMatrices() the virtual functions.
recInfo->AllocateGradientMatricesForChildren(m_matrixPool);
//loops are computed sample by sample so we have to allocate them all
recInfo->m_completedGradient = true;
recInfo->ReleaseMatricesAfterGradientComp(m_matrixPool);
#else
const auto & recurrentNodes = recInfo->m_nestedNodes;
//loops are computed sample by sample so we have to allocate them all
for (auto nodeIter = recurrentNodes.rbegin(); nodeIter != recurrentNodes.rend(); ++nodeIter)
{
AllocateGradientMatricesForChildren(*nodeIter);
(*nodeIter)->AllocateGradientMatricesForChildren(m_matrixPool);
}
recInfo->m_completedGradient = true;
for (auto nodeIter = recurrentNodes.rbegin(); nodeIter != recurrentNodes.rend(); ++nodeIter)
@ -937,29 +1028,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {
(*nodeIter)->ReleaseMatricesAfterGradientComp(m_matrixPool);
}
}
#endif
}
}
else
{
AllocateGradientMatricesForChildren(n);
if ((n != rootNode) && n->NeedGradient()) //root node's informatioin will be used and should not be shared with others, also it's small (1x1)
// PAR mode: we can allocate and immediately deallocate one by one
n->AllocateGradientMatricesForChildren(m_matrixPool);
if ((n != rootNode) && n->NeedGradient()) //root node's information will be used and should not be shared with others, also it's small (1x1)
n->ReleaseMatricesAfterGradientComp(m_matrixPool);
}
}
}
//void ReleaseMatricesAfterGradientCompForParents(ComputationNodeBasePtr n, std::map<ComputationNodeBasePtr, int>& childrenCount)
//{
// for (int i = 0; i < n->ParentSize(); i++)
// {
// ComputationNodeBasePtr pNode = n->Parent(i);
// childrenCount[pNode] --;
// if (childrenCount[pNode] == 0)
// pNode->ReleaseMatricesAfterGradientComp(m_matrixPool);
// }
//}
#if 0
void ComputationNetwork::AllocateGradientMatricesForChildren(ComputationNodeBasePtr parentNode)
{
std::vector<ComputationNodeBasePtr> children = parentNode->GetChildren();
@ -969,5 +1051,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
children[i]->RequestMatricesBeforeGradientComp(m_matrixPool);
}
}
#endif
}}}

Просмотреть файл

@ -173,7 +173,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
typedef Matrix<float> FloatMatrix;
typedef Matrix<double> DoubleMatrix;
atomic_ullong ComputationNetworkOwnedNodeState::s_timeStampCounter = ATOMIC_VAR_INIT(0);
atomic_ullong TimeStamp::s_timeStampCounter = ATOMIC_VAR_INIT(0);
template<> std::map<size_t, std::map<size_t, FloatMatrix*>> ComputationNode<float>::s_constOnes{};
template<> std::map<size_t, std::map<size_t, DoubleMatrix*>> ComputationNode<double>::s_constOnes{};

Просмотреть файл

@ -100,6 +100,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
virtual void OnComputeGradientEndIteration() = 0; // called after last iteration step of ComputeGradient()
// TODO: this one does not quite fit here
// functions that are called from Network, but not necessarily overridden by the node implementations themselves
virtual void ComputeGradientForChildren(const FrameRange & frameRange, bool childrenInThisLoop, bool childrenInOuterLoop) = 0;
// --- optional overrides that add functionality
@ -107,6 +108,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Any override must call Base version as well.
// Default implementations are in ComputationNodeBase or ComputationNode<ElemType>.
virtual void RequestMatricesBeforeEval(MatrixPool& matrixPool) = 0; //request matrices needed to do node function value evaluation
virtual void ReleaseMatricesAfterEval(MatrixPool& matrixPool) = 0; //release temp matrices that are only used by forward computation. Don't release matrices that need to be used in the gradient computation
virtual void AllocateGradientMatricesForChildren(MatrixPool& matrixPool) = 0;
virtual void RequestMatricesBeforeGradientComp(MatrixPool& matrixPool) = 0; //request matrices that are needed for gradient computation
virtual void ReleaseMatricesAfterGradientComp(MatrixPool& matrixPool) = 0; //release gradient and temp matrices that no longer needed after all the children's gradients are computed.
virtual void Validate(bool isFinalValidationPass) = 0; // main base validation function
virtual void InferImageDimsFromInputs() = 0;
virtual void SaveToFile(File& fstream) const = 0;
@ -144,35 +151,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
PurgeStateForFormingRecurrentLoops();
m_isPartOfLoop = false;
ResetEvalTimeStamp(); // bring it into defined state
}
void CopyTo(ComputationNetworkOwnedNodeState & other) const
{
// TODO: is that really all we copy? (this is a result of refactoring, so it seems yes indeed). Should we at least ClearCache()?
other.m_evalTimeStamp = m_evalTimeStamp;
other.m_isPartOfLoop = m_isPartOfLoop;
other.m_needsGradient = m_needsGradient;
}
int64_t UpdateEvalTimeStamp()
{
m_evalTimeStamp = atomic_fetch_add(&s_timeStampCounter, (unsigned long long int) 1); // TODO: does this really need to be atomic? We are not multi-threaded
return m_evalTimeStamp;
}
void ResetEvalTimeStamp()
{
m_evalTimeStamp = s_timeStampCounter;
}
int64_t GetEvalTimeStamp() const { return m_evalTimeStamp; }
int64_t CreateUniqId() const
{
return atomic_fetch_add(&s_timeStampCounter, (unsigned long long int) 1);
}
static bool ByVisitedOrder(const ComputationNetworkOwnedNodeState * lhs, const ComputationNetworkOwnedNodeState * rhs) // sorting predicate
{
return lhs->m_visitedOrder < rhs->m_visitedOrder;
@ -182,9 +169,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
private:
static atomic_ullong s_timeStampCounter;
int64_t m_evalTimeStamp; //this is used to reduce unnecessary recomputation when a different node in the model is reevaluated
bool m_isPartOfLoop; // true if this loop is part of a recurrent loop
protected: // TODO: should be fully encapsulated here
@ -213,6 +197,38 @@ namespace Microsoft { namespace MSR { namespace CNTK {
bool m_inStack;
};
// =======================================================================
// TimeStamp -- helper class to manage a time stamp
// =======================================================================
class TimeStamp
{
    // global monotonically increasing counter from which unique ids are drawn
    // (the definition lives in a .cpp file)
    static atomic_ullong s_timeStampCounter;
    // this object's stamp; used to reduce unnecessary recomputation when a
    // different node in the model is reevaluated
    int64_t m_evalTimeStamp;
public:
    TimeStamp()
    {
        ResetEvalTimeStamp();
    }
    // copy only the stamp itself; the counter is shared by all instances
    void CopyTo(TimeStamp & other) const
    {
        other.m_evalTimeStamp = m_evalTimeStamp;
    }
    // re-sync this object's stamp with the current global counter value
    void ResetEvalTimeStamp()
    {
        m_evalTimeStamp = s_timeStampCounter;
    }
    int64_t GetEvalTimeStamp() const
    {
        return m_evalTimeStamp;
    }
    // stamp this object with a brand-new unique id
    void UpdateEvalTimeStamp()
    {
        m_evalTimeStamp = CreateUniqId();
    }
    // The difference is taken to take into account numeric overflow (which really should never happen for a 64-bit integer... but hey, it's free!).
    bool IsOlderThan(const TimeStamp & other) const
    {
        // BUGBUG: For some reason, we must test equality as well, although that does not indicate being older.
        return GetEvalTimeStamp() - other.GetEvalTimeStamp() /*<*/ <= 0;
    }
    // draw a fresh unique id from the shared counter
    int64_t CreateUniqId() const
    {
        return /*1 +*/ atomic_fetch_add(&s_timeStampCounter, (unsigned long long int) 1);
    }
};
// =======================================================================
// ComputationNodeBase -- abstract base class for all computation nodes
// TODO: decide the name. This does contain actual members such as the node name, so it's not really a pure interface.
@ -220,7 +236,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
class ComputationNodeBase :
public IComputationNode,
public/*protected*/ ComputationNetworkOwnedNodeState, // TODO: figure this out, somehow the 'friend' thing does not work
public/*protected*/ ComputationNetworkOwnedNodeState, // TODO: figure this out, somehow the 'friend' thing does not work
public TimeStamp, // for time-stamp management
public ScriptableObjects::ComputationNodeObject,
public ScriptableObjects::WithTag, public ScriptableObjects::HasName, public ScriptableObjects::HasToString,
public std::enable_shared_from_this<ComputationNodeBase>
@ -255,6 +272,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
node->m_outputImageLayout = m_outputImageLayout;
ComputationNetworkOwnedNodeState::CopyTo(*node);
TimeStamp::CopyTo(*node);
}
}
@ -568,17 +586,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// This creates a list such that children are evaluated before their parents.
// If !forForwardProp then the order will be reversed, suitable for backprop.
// The 'recurrent' version is only called from FormRecurrentLoops().
// Side-effects (unbeknownst to the name of the function):
// - m_needsGradient flags, are propagated up from children --BUGBUG! This should only be computed in ValidateSubNetwork().
// - ComputationNetworkOwnedNodeState::m_visitedOrder (only if 'recurrent' flag is set; otherwise leave untouched), as needed by FormRecurrentNodes()
// TODO: This should be a method of ComputationNetwork, not ComputationNode.
std::list<ComputationNodeBasePtr> EnumerateNodes(bool forForwardProp/*else get order for backprop*/, bool setVisitedOrder)
std::list<ComputationNodeBasePtr> EnumerateNodes(bool forForwardProp/*else get order for backprop*/, bool skipPairNetwork)
{
std::list<ComputationNodeBasePtr> nodes;
std::unordered_set<ComputationNodeBasePtr> visited;
// get forward computation order
EnumerateNodesR(visited, nodes, setVisitedOrder); // call into the recursive portion of this function below
EnumerateNodesR(visited, nodes, skipPairNetwork); // call into the recursive portion of this function below
// if caller wants order for backprop then reverse it
if (!forForwardProp)
@ -588,19 +603,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
private:
// Recursive part of EnumerateNodes().
void EnumerateNodesR(std::unordered_set<ComputationNodeBasePtr>& visited, std::list<ComputationNodeBasePtr>& result, bool setVisitedOrder)
void EnumerateNodesR(std::unordered_set<ComputationNodeBasePtr>& visited, std::list<ComputationNodeBasePtr>& result, bool skipPairNetwork)
{
if (visited.find(shared_from_this()) == visited.end()) // do not include a node twice
{
visited.insert(shared_from_this()); // have visited tagged here to avoid infinite loop over children, children's children, etc
// children first for function evaluation
if (OperationName() != L"PairNetwork" || !setVisitedOrder) // (don't step through network-pair boundary if called from FormRecurrentLoops())
if (OperationName() != L"PairNetwork" || !skipPairNetwork) // (don't step through network-pair boundary if called from FormRecurrentLoops())
{
for (int i = 0; i < m_children.size(); i++)
{
if (m_children[i])
m_children[i]->EnumerateNodesR(visited, result, setVisitedOrder);
m_children[i]->EnumerateNodesR(visited, result, skipPairNetwork);
}
}
@ -614,8 +629,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// now that all children are in list before us, put ourselves
result.push_back(shared_from_this());
#if 0 // this does not work, since m_visitedOrder gets cleared out, while the list survives in a cache
if (setVisitedOrder) // FormRecurrentNodes() would like this variable to be set as well
m_visitedOrder = result.size();
#endif
}
}
public:
@ -636,13 +653,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// check whether a node is up-to-date w.r.t. its children, for lazy evaluation
// If this returns false, node must be evaluated to update m_functionValues.
bool IsFuncValueOlderThanInputs() const
// BUGBUG: The function name is incorrect. It also returns 'true' if a child has the same time stamp (not older).
// This is virtual because it is overridden by traversal nodes.
virtual bool IsFuncValueOlderThanInputs() const
{
for (size_t i = 0; i<ChildrenSize(); i++)
{
#if 1
if (IsOlderThan(*m_children[i]))
return true;
#else
//the second condition is used when the time stamp change from positive to negative
if (m_children[i]->GetEvalTimeStamp() >= GetEvalTimeStamp() || m_children[i]->GetEvalTimeStamp() + 1e10 < GetEvalTimeStamp())
return true;
#endif
}
return false;
@ -709,19 +733,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return name;
}
//request matrices needed to do node function value evaluation
virtual void RequestMatricesBeforeEval(MatrixPool& matrixPool) = 0;
//release temp matrices that are only used by forward computation
//don't release matrices that need to be used in the gradient computation
virtual void ReleaseMatricesAfterEval(MatrixPool& matrixPool) = 0;
//request matrices that are needed for gradient computation
virtual void RequestMatricesBeforeGradientComp(MatrixPool& matrixPool) = 0;
//release gradient and temp matrices that no longer needed after all the children's gradients are computed.
virtual void ReleaseMatricesAfterGradientComp(MatrixPool& matrixPool) = 0;
protected:
// data members
//std::vector<ComputationNodeBasePtr> m_parents; //m_parents are dynamically determined based on the root node you want to compute
@ -857,6 +868,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
}
virtual void AllocateGradientMatricesForChildren(MatrixPool& matrixPool) override
{
for (int i = 0; i < m_children.size(); i++)
{
if (m_children[i]->NeedGradient())
m_children[i]->RequestMatricesBeforeGradientComp(matrixPool);
}
}
//request matrices that are needed for gradient computation
virtual void RequestMatricesBeforeGradientComp(MatrixPool& matrixPool)
{
@ -1367,23 +1387,44 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// FlowControlNode -- special wrapper node for use by ComputationNetwork only
// =======================================================================
class FlowControlNode : public IComputationNode
class FlowControlNode : public ComputationNodeBase
{
typedef ComputationNodeBase Base;
public:
FlowControlNode() : ComputationNodeBase(DEVICEID_NOTYETDETERMINED/*we don't own matrices*/, L""/*name: we don't care*/) { }
#pragma warning (disable: 4100)
// these should never be called on flow-control nodes
virtual ComputationNodeBase * NewThis(DEVICEID_TYPE deviceId, const wstring & name) { NOT_IMPLEMENTED; }
virtual void Validate(bool isFinalValidationPass) { NOT_IMPLEMENTED; } // main base validation function
virtual void InferImageDimsFromInputs() { NOT_IMPLEMENTED; }
virtual void SaveToFile(File& fstream) const { NOT_IMPLEMENTED; }
virtual void LoadFromFile(File& /*fstream*/, size_t /*modelVersion*/) { NOT_IMPLEMENTED; }
virtual void CopyTo(ComputationNodeBasePtr node, const std::wstring& newName, const CopyNodeFlags flags) const { NOT_IMPLEMENTED; }
// these are meant to be implemented by ComputationNode<ElemType> but should never be called on traversal nodes
// TODO: There are too many of these. This indicates improper class hierarchies.
virtual ComputationNodeBase * NewThis(DEVICEID_TYPE deviceId, const wstring & name) override { NOT_IMPLEMENTED; }
virtual void Validate(bool isFinalValidationPass) override { NOT_IMPLEMENTED; } // main base validation function
virtual void InferImageDimsFromInputs() override { NOT_IMPLEMENTED; }
virtual void SaveToFile(File& fstream) const override { NOT_IMPLEMENTED; }
virtual void LoadFromFile(File& /*fstream*/, size_t /*modelVersion*/) override { NOT_IMPLEMENTED; }
virtual void CopyTo(ComputationNodeBasePtr node, const std::wstring& newName, const CopyNodeFlags flags) const override { NOT_IMPLEMENTED; }
virtual ComputationNodeBasePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) override { NOT_IMPLEMENTED; }
virtual size_t GetNumRows() const override { NOT_IMPLEMENTED; }
virtual size_t GetNumCols() const override { NOT_IMPLEMENTED; }
virtual void Resize(size_t rows, size_t cols) override { NOT_IMPLEMENTED; }
virtual double Get00Element() const override { NOT_IMPLEMENTED; }
virtual void AttachInputs(const std::vector<ComputationNodeBasePtr>& inputs) override { NOT_IMPLEMENTED; }
virtual void PrintSelf(bool) const override { NOT_IMPLEMENTED; }
virtual void ValidateInferChildDims(size_t,size_t,size_t) override { NOT_IMPLEMENTED; }
virtual void SetInput(const size_t,const Microsoft::MSR::CNTK::ComputationNodeBase::ComputationNodeBasePtr &) override { NOT_IMPLEMENTED; }
virtual void ClearGradientForChildren(void) override { NOT_IMPLEMENTED; }
virtual void MaskMissingValuesColumnsToZero(const Microsoft::MSR::CNTK::FrameRange &) override { NOT_IMPLEMENTED; }
virtual void MaskMissingGradientColumnsToZero(const Microsoft::MSR::CNTK::FrameRange &) override { NOT_IMPLEMENTED; }
virtual void InvalidateMissingValuesColumns(const Microsoft::MSR::CNTK::FrameRange &) override { NOT_IMPLEMENTED; }
virtual void InvalidateMissingGradientColumns(const Microsoft::MSR::CNTK::FrameRange &) override { NOT_IMPLEMENTED; }
virtual std::wstring ToString(void) const override { NOT_IMPLEMENTED; }
// these are meant to be called during computation, so provide dummy implementations
virtual bool RequiresPreCompute() const { return false; } // return true if the node's value should be computed before the normal training. e.g., mean and invStd of input features.
virtual bool NodeDoesItsOwnCustomizedMissingColumnsMasking() { return true; }
virtual void PrintSelfBeforeValidation() const { }
virtual void DumpNodeInfo(const bool /*printValues*/, File& fstream) const { }
virtual bool RequiresPreCompute() const override { return false; } // return true if the node's value should be computed before the normal training. e.g., mean and invStd of input features.
virtual bool NodeDoesItsOwnCustomizedMissingColumnsMasking() override { return true; }
virtual void PrintSelfBeforeValidation() const override { }
virtual void DumpNodeInfo(const bool /*printValues*/, File& fstream) const override { }
protected:
public: // needed in ComputationNetwork::FindInRecurrentLoops(), which really should be part of RecurrentFlowControlNode
std::vector<ComputationNodeBasePtr> m_nestedNodes; // nodes tucked away in this node, in evaluation order
};
// =======================================================================

Просмотреть файл

@ -1824,11 +1824,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (useDistributedMBReading)
{
trainSetDataReader->StartDistributedMinibatchLoop(tunedMBSize, epochNumber, g_mpi->CurrentNodeRank(),
g_mpi->NumNodesInUse(), m_epochSize);
g_mpi->NumNodesInUse(), epochSize);
}
else
{
trainSetDataReader->StartMinibatchLoop(tunedMBSize, epochNumber, m_epochSize);
trainSetDataReader->StartMinibatchLoop(tunedMBSize, epochNumber, epochSize);
}
net.StartEvaluateMinibatchLoop(evaluationNodes);
@ -2160,9 +2160,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
totalEpochSamples += aggregateNumSamplesWithLabel;
totalSamplesSeen += aggregateNumSamplesWithLabel;
if (totalEpochSamples >= epochSize)
break;
// call DataEnd function
// This signals something from SGD to the reader.
// DataEnd does reader specific process if sentence ending is reached

Просмотреть файл

@ -371,6 +371,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_matrixName=NULL;
m_format = matrixFormatDense;
m_externalBuffer = false;
m_workspace = nullptr;
}
template<class ElemType>
@ -503,6 +504,24 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return m_computeDevice;
}
template<class ElemType>
std::unique_ptr<GPUMatrix<ElemType>> GPUMatrix<ElemType>::GetOrCreateWorkspace() const
{
// REVIEW alexeyk: not thread-safe, fine for now.
if (m_workspace == nullptr)
m_workspace = new conc_stack<std::unique_ptr<GPUMatrix<ElemType>>>();
assert(m_workspace != nullptr);
auto deviceId = m_computeDevice;
return m_workspace->pop_or_create([deviceId]() { return std::make_unique<GPUMatrix<ElemType>>(deviceId); });
}
template<class ElemType>
void GPUMatrix<ElemType>::ReleaseWorkspace(std::unique_ptr<GPUMatrix<ElemType>> src) const
{
assert(m_workspace != nullptr);
m_workspace->push(std::move(src));
}
#pragma region Basic Operators
template<class ElemType>
GPUMatrix<ElemType> GPUMatrix<ElemType>::ColumnSlice(size_t startColumn, size_t numCols) const
@ -3052,10 +3071,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
cbtemp = ctemp * sizeof(ElemType);
// ElemType count needed to store indices, accounting for natural alignment for uint64_t type.
size_t cidx = ((celt + 1) * sizeof(uint64_t) - 1 + sizeof(ElemType) - 1) / sizeof(ElemType);
// Prepare temp workspace.
auto deviceId = m_computeDevice;
assert(m_workspace != nullptr);
auto workspace = m_workspace->pop_or_create([deviceId]() { return std::make_unique<GPUMatrix<ElemType>>(deviceId); });
// Get temp workspace.
auto workspace = GetOrCreateWorkspace();
// Resize to store: output values for the 1st and 2nd passes, input indices, output indices, and temp storage.
workspace->Resize(m, 2 * n + (2 * cidx + ctemp + m - 1) / m);
outVal1 = workspace->m_pArray;
@ -3081,7 +3098,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
cblock = (topK * n + ThreadsPerBlock - 1) / ThreadsPerBlock;
_copyTopKResults<<<cblock, ThreadsPerBlock, 0, t_stream>>>(inIdx, outVal2, maxIndexes.m_pArray, maxValues.m_pArray, m, n, topK);
m_workspace->push(std::move(workspace));
ReleaseWorkspace(std::move(workspace));
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));

Просмотреть файл

@ -92,9 +92,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
static cublasHandle_t s_cuHandle[MaxGpus];
static void *s_curandGenerator;
// Have to use naked pointer to avoid issues with __declspec(dllexport) on Windows.
// REVIEW alexeyk: can be allocated lazily but the current footprint is small anyway.
mutable conc_stack<std::unique_ptr<GPUMatrix<ElemType>>>* m_workspace = new conc_stack<std::unique_ptr<GPUMatrix<ElemType>>>;
// Have to use naked pointer to avoid issues with __declspec(dllexport) on Windows (C4251).
// Cannot use atomic for the same reason either.
mutable conc_stack<std::unique_ptr<GPUMatrix<ElemType>>>* m_workspace;
private:
void performInplaceFunction(int kind);
@ -102,6 +102,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t LocateColumn (const size_t j) const;
void Clear();
void ZeroInit(int deviceId);
std::unique_ptr<GPUMatrix<ElemType>> GetOrCreateWorkspace() const;
void ReleaseWorkspace(std::unique_ptr<GPUMatrix<ElemType>> src) const;
public:
GPUMatrix(int deviceId);

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Разница между файлами не показана из-за своего большого размера Загрузить разницу