Merge branch 'master' into pkranenBoost

.
This commit is contained in:
pkranen 2015-11-05 13:01:56 +01:00
Родитель 75e9a89834 f8659f531e
Коммит b964be5ac2
14 изменённых файлов: 32048 добавлений и 7660 удалений

Просмотреть файл

@ -354,10 +354,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
for (wstring & path : filelist)
{
#ifdef WIN32 // sorry for windows users, we have to pay some cost here
std::replace(path.begin(), path.end(), L'\\', L'/');
if (path.find_first_of(L'=') != wstring::npos)
{
vector<wstring> strarr = msra::strfun::split(path, L"=");
#ifdef WIN32
replace(strarr[1].begin(), strarr[1].end(), L'\\', L'/');
#endif
path = rootpath + L"/" + path;
path = strarr[0] + L"=" + rootpath + L"/" + strarr[1];
}
else
{
#ifdef WIN32
replace(path.begin(), path.end(), L'\\', L'/');
#endif
path = rootpath + L"/" + path;
}
}
}
}
@ -998,6 +1010,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
m_pMBLayout->SetAsNoInput(i, m_numValidFrames[i], m_mbNumTimeSteps);
}
// TODO: Also blast the gaps in the features and labels matrices with NaNs to prevent them from being read
}
typename std::map<std::wstring, Matrix<ElemType>*>::iterator iter;
@ -1180,54 +1194,68 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
m_processedFrame[i] += (endFr-startFr);
m_switchFrame[i] = actualmbsize[i];
if (actualmbsize[i] < m_mbNumTimeSteps)
m_pMBLayout->Set(i, actualmbsize[i], MinibatchPackingFlags::SequenceStart); // NOTE: this ORs, while original code overwrote in matrix but ORed into vector
if (actualmbsize[i] == m_mbNumTimeSteps)
if (actualmbsize[i] != 0)
m_pMBLayout->Set(i, actualmbsize[i] - 1, MinibatchPackingFlags::SequenceEnd); // NOTE: this ORs, while original code overwrote in matrix but ORed into vector
startFr = m_switchFrame[i];
endFr = m_mbNumTimeSteps;
bool reNewSucc = ReNewBufferForMultiIO(i);
for (iter = matrices.begin();iter!=matrices.end(); iter++)
{
// dereference matrix that corresponds to key (input/output name) and
// populate based on whether its a feature or a label
//Matrix<ElemType>& data = *matrices[iter->first]; // can be features or labels
if (m_nameToTypeMap[iter->first] == InputOutputTypes::real)
// TODO: We should fill in a loop until we fill the minibatch for the case where just one ReNew is not sufficient
// to fill up the remaining slots in the minibatch
bool reNewSucc = ReNewBufferForMultiIO(i);
if (actualmbsize[i] < m_mbNumTimeSteps)
{
if (reNewSucc)
{
id = m_featureNameToIdMap[iter->first];
dim = m_featureNameToDimMap[iter->first];
if (sizeof(ElemType) == sizeof(float))
m_pMBLayout->Set(i, actualmbsize[i], MinibatchPackingFlags::SequenceStart); // NOTE: this ORs, while original code overwrote in matrix but ORed into vector
startFr = m_switchFrame[i];
endFr = m_mbNumTimeSteps;
for (iter = matrices.begin(); iter != matrices.end(); iter++)
{
for (size_t j = startFr,k = 0; j < endFr; j++,k++) // column major, so iterate columns
// dereference matrix that corresponds to key (input/output name) and
// populate based on whether its a feature or a label
//Matrix<ElemType>& data = *matrices[iter->first]; // can be features or labels
if (m_nameToTypeMap[iter->first] == InputOutputTypes::real)
{
// copy over the entire column at once, need to do this because SSEMatrix may have gaps at the end of the columns
memcpy_s(&m_featuresBufferMultiIO[id].get()[(j * m_numSeqsPerMB + i) * dim], sizeof(ElemType) * dim, &m_featuresBufferMultiUtt[i].get()[k * dim + m_featuresStartIndexMultiUtt[id + i * numOfFea]], sizeof(ElemType) * dim);
}
}
else
{
for (size_t j=startFr,k=0; j < endFr; j++,k++) // column major, so iterate columns in outside loop
{
for (int d = 0; d < dim; d++)
m_featuresBufferMultiIO[id].get()[(j * m_numSeqsPerMB + i) * dim + d] = m_featuresBufferMultiUtt[i].get()[k * dim + d + m_featuresStartIndexMultiUtt[id + i * numOfFea]];
id = m_featureNameToIdMap[iter->first];
dim = m_featureNameToDimMap[iter->first];
if (sizeof(ElemType) == sizeof(float))
{
for (size_t j = startFr, k = 0; j < endFr; j++, k++) // column major, so iterate columns
{
// copy over the entire column at once, need to do this because SSEMatrix may have gaps at the end of the columns
memcpy_s(&m_featuresBufferMultiIO[id].get()[(j * m_numSeqsPerMB + i) * dim], sizeof(ElemType) * dim, &m_featuresBufferMultiUtt[i].get()[k * dim + m_featuresStartIndexMultiUtt[id + i * numOfFea]], sizeof(ElemType) * dim);
}
}
else
{
for (size_t j = startFr, k = 0; j < endFr; j++, k++) // column major, so iterate columns in outside loop
{
for (int d = 0; d < dim; d++)
m_featuresBufferMultiIO[id].get()[(j * m_numSeqsPerMB + i) * dim + d] = m_featuresBufferMultiUtt[i].get()[k * dim + d + m_featuresStartIndexMultiUtt[id + i * numOfFea]];
}
}
}
else if (m_nameToTypeMap[iter->first] == InputOutputTypes::category)
{
id = m_labelNameToIdMap[iter->first];
dim = m_labelNameToDimMap[iter->first];
for (size_t j = startFr, k = 0; j < endFr; j++, k++)
{
for (int d = 0; d < dim; d++)
m_labelsBufferMultiIO[id].get()[(j * m_numSeqsPerMB + i) * dim + d] = m_labelsBufferMultiUtt[i].get()[k * dim + d + m_labelsStartIndexMultiUtt[id + i * numOfLabel]];
}
}
}
m_processedFrame[i] += (endFr - startFr);
}
else if (m_nameToTypeMap[iter->first] == InputOutputTypes::category)
else
{
id = m_labelNameToIdMap[iter->first];
dim = m_labelNameToDimMap[iter->first];
for (size_t j = startFr,k=0; j < endFr; j++,k++)
{
for (int d = 0; d < dim; d++)
m_labelsBufferMultiIO[id].get()[(j * m_numSeqsPerMB + i) * dim + d] = m_labelsBufferMultiUtt[i].get()[k * dim + d + m_labelsStartIndexMultiUtt[id + i * numOfLabel]];
}
// Mark gaps with NoInput
m_pMBLayout->SetAsNoInput(i, actualmbsize[i], m_mbNumTimeSteps);
// TODO: Also blast the gaps in the features and labels matrices with NaNs to prevent them from being read
}
}
if (reNewSucc) m_processedFrame[i] += (endFr-startFr);
}
}
for (auto iter = matrices.begin();iter!=matrices.end(); iter++)

Просмотреть файл

@ -48,6 +48,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// construction
// -----------------------------------------------------------------------
// TODO: why is this needed? Why is this not just construction?
void ComputationNetwork::ClearNet()
{
for (auto groupIter : GetAllNodeGroups())
@ -59,6 +60,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_cacheEvalOrders.clear();
m_cacheGradientCalcOrders.clear();
m_cachedOuterLoopNodes.clear();
m_inputs.clear();
m_learnableParameters.clear();
@ -567,14 +569,18 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
// this is called from ClearCache() only, which in turn is called by model editing operations, such as DeleteNode(), and by RebuildNetwork()
// Basically, it invalidates all post-processing, reducing the network to the graph.
// Invalidates all cached network-traversal state, reducing the network back to the raw graph.
// Called from ClearCache() only (model editing operations such as DeleteNode(), and RebuildNetwork()).
void ComputationNetwork::ClearCalcOrderCaches()
{
    // Reset the per-node state that FormRecurrentLoops() left behind, for every node
    // in every cached evaluation order.
    // TODO: Why is this done? This looks like an error (this function was called ClearCache() before, so maybe someone threw this call in for good measure)
    for (auto & order : m_cacheEvalOrders)
        for (auto & node : order.second) // use the pair's mapped value directly; the original re-looked up the key (m_cacheEvalOrders[order.first]), a redundant map lookup per outer iteration
            node->PurgeStateForFormingRecurrentLoops();
    // clear the cached network traversal orders and the cached outer-loop sentinels
    m_cacheEvalOrders.clear();
    m_cacheGradientCalcOrders.clear();
    m_cachedOuterLoopNodes.clear();
}
// lazily reate the m_inputs[] and m_learnableParameters lists

Просмотреть файл

@ -64,6 +64,8 @@ protected:
// This structure stores that little sub-network.
class RecurrentFlowControlNode : public FlowControlNode
{
public: // m_nestedNodes needed public by ComputationNetwork::FindInRecurrentLoops(), which really should be part of RecurrentFlowControlNode
typedef FlowControlNode Base; using Base::m_nestedNodes;
public:
// next steps:
// - change m_recurrentInfo to use shared_ptrs to ComputationNodeBase
@ -76,11 +78,14 @@ protected:
virtual void ComputeInputPartial(const size_t inputIndex, const FrameRange &) override { NOT_IMPLEMENTED; } // ugh, call ComputeGradientForChildren() instead
virtual void OnComputeGradientEndIteration() override;
virtual void ComputeGradientForChildren(const FrameRange & frameRange, bool childrenInThisLoop, bool childrenInOuterLoop) override;
// TODO: should the following be virtualized, too?
const wstring & NodeName() const { return m_sourceNode->NodeName(); } // TODO: why not return a const wchar_t* again?
bool IsFuncValueOlderThanInputs() const;
virtual void RequestMatricesBeforeEval(MatrixPool& matrixPool);
virtual void ReleaseMatricesAfterEval(MatrixPool& matrixPool);
virtual void AllocateGradientMatricesForChildren(MatrixPool& matrixPool);
virtual void RequestMatricesBeforeGradientComp(MatrixPool& matrixPool);
virtual void ReleaseMatricesAfterGradientComp(MatrixPool& matrixPool);
virtual bool IsFuncValueOlderThanInputs() const override;
public:
std::vector<ComputationNodeBasePtr> m_recurrentNodes; // all nodes involved in this loop, in evaluation order
//std::vector<ComputationNodeBasePtr> m_nestedNodes; // all nodes involved in this loop, in evaluation order
ComputationNodeBasePtr m_sourceNode; // one of the nodes of the loop --TODO: What is the special meaning of this node? It seems to always be a delay node.
int m_loopId; // the loop id (index in m_recurrentInfo array)
bool m_completedGradient;
@ -93,6 +98,7 @@ protected:
m_completedGradient(false),
m_completedEvaluate(false)
{
SetNodeName(L"Loop_" + m_sourceNode->NodeName());
}
};
@ -100,6 +106,7 @@ protected:
// This is the outer loop over the network nodes in PAR mode.
class OuterLoopNode : public FlowControlNode
{
typedef FlowControlNode Base; using Base::m_nestedNodes;
public:
virtual const std::wstring OperationName() const override { return L"OuterLoopNode"; }
virtual void UpdateFunctionMBSize() override { NOT_IMPLEMENTED; }
@ -110,9 +117,14 @@ protected:
virtual void ComputeInputPartial(const size_t inputIndex, const FrameRange &) override { NOT_IMPLEMENTED; } // ugh, call ComputeGradientForChildren() instead
virtual void OnComputeGradientEndIteration() override { }
virtual void ComputeGradientForChildren(const FrameRange & frameRange, bool childrenInThisLoop, bool childrenInOuterLoop) override;
virtual void RequestMatricesBeforeEval(MatrixPool& matrixPool);
virtual void ReleaseMatricesAfterEval(MatrixPool& matrixPool);
virtual void AllocateGradientMatricesForChildren(MatrixPool& matrixPool);
virtual void RequestMatricesBeforeGradientComp(MatrixPool& matrixPool);
virtual void ReleaseMatricesAfterGradientComp(MatrixPool& matrixPool);
public:
OuterLoopNode(/*const*/ std::vector<shared_ptr<RecurrentFlowControlNode>> & recurrentInfo, const std::list<ComputationNodeBasePtr> & allNodes);
std::list<shared_ptr<IComputationNode>> m_outerNodes; // all top-level nodes, in evaluation order. Nested nodes are tucked inside FlowControlNodes.
// m_nestedNodes contains all top-level nodes, in evaluation order
};
public:
@ -640,6 +652,11 @@ public:
// and for a set of nodes
// Prepare the sub-network below 'rootNode' for minibatch evaluation.
// Currently this only builds and validates the sub-network; the disabled
// #if 0 block records that eager matrix allocation was considered here but
// deliberately left out (see the TODOs inside it).
void StartEvaluateMinibatchLoop(const ComputationNodeBasePtr & rootNode) // (ugly name; meant to be unique so we can rename if needed)
{
#if 0
// TODO: allocation does not belong here. This is called e.g. after loading. Memory should be allocated only when actually evaluating.
// TODO: move into StartEvaluateMinibatchLoop(), but that is called for output nodes individually--can the process handle that?
AllocateEvalMatrices(rootNode);
#endif
// TODO: do we need to reset time stamps?
BuildAndValidateSubNetwork(rootNode);
}
@ -798,39 +815,49 @@ public:
void ClearGradientForAllNodes(const ComputationNodeBasePtr& rootNode)
{
std::list<ComputationNodeBasePtr>& allNodes = GetGradientCalcOrder(rootNode);
std::list<ComputationNodeBasePtr>& allNodes = GetGradientCalcOrder(rootNode); // note: any order will do
for (auto &node : allNodes)
node->ClearGradientForChildren();
for (auto & recInfo : m_recurrentInfo)
for (auto & recInfo : m_recurrentInfo) // TODO: this will go away
recInfo->m_completedGradient = false;
}
// -----------------------------------------------------------------------
// evaluation: traversal
// These three functions create and cache traversal orders of the network.
// -----------------------------------------------------------------------
// determine the required order in which nodes must be computed in order to compute 'rootNode'
// recurrent == true is only used when called from FormRecurrentLoops()
std::list<ComputationNodeBasePtr>& GetEvalOrder(const ComputationNodeBasePtr& rootNode, bool setVisitedOrder)
// skipPairNetwork == true is only used when called from FormRecurrentLoops()
std::list<ComputationNodeBasePtr>& GetEvalOrder(const ComputationNodeBasePtr& rootNode, bool skipPairNetwork)
{
return GetCalcOrder(rootNode, m_cacheEvalOrders, true/*means for forward prop*/, setVisitedOrder);
return GetCalcOrder(rootNode, m_cacheEvalOrders, true/*means for forward prop*/, skipPairNetwork);
}
// determine the required order in which nodes must be computed in order to compute the gradient of 'rootNode'
// Basically returns the reverse of GetEvalOrder(), with some special consideration to loops.
std::list<ComputationNodeBasePtr>& GetGradientCalcOrder(const ComputationNodeBasePtr& rootNode)
{
return GetCalcOrder(rootNode, m_cacheGradientCalcOrders, false/*means for backprop*/, false/*setVisitedOrder*/);
return GetCalcOrder(rootNode, m_cacheGradientCalcOrders, false/*means for backprop*/, false/*skipPairNetwork*/);
}
// Lazily create, cache, and return the OuterLoopNode (nested-network sentinel)
// for 'rootNode'. The cache (m_cachedOuterLoopNodes) avoids rebuilding the
// nested network on every evaluation; it is emptied by ClearCalcOrderCaches().
ComputationNodeBasePtr GetOuterLoopNode(const ComputationNodeBasePtr& rootNode)
{
    // single find()+emplace() instead of the original find() followed by two
    // separate operator[] lookups on the same key
    auto iter = m_cachedOuterLoopNodes.find(rootNode);
    if (iter == m_cachedOuterLoopNodes.end())
        iter = m_cachedOuterLoopNodes.emplace(rootNode, make_shared<OuterLoopNode>(m_recurrentInfo, GetEvalOrder(rootNode, false))).first;
    return iter->second;
}
private:
static std::list<ComputationNodeBasePtr>& GetCalcOrder(const ComputationNodeBasePtr rootNode,
static std::list<ComputationNodeBasePtr>& GetCalcOrder(const ComputationNodeBasePtr & rootNode,
std::map<const ComputationNodeBasePtr, std::list<ComputationNodeBasePtr>>& orderMap,
const bool forwardCompute, bool setVisitedOrder)
const bool forwardCompute, bool skipPairNetwork)
{
if (!rootNode)
LogicError("rootNode is NULL.");
if (orderMap.find(rootNode) == orderMap.end())
orderMap[rootNode] = rootNode->EnumerateNodes(forwardCompute, setVisitedOrder);
orderMap[rootNode] = rootNode->EnumerateNodes(forwardCompute, skipPairNetwork);
return orderMap[rootNode];
}
@ -908,8 +935,10 @@ private: // TODO: make all private that can be made private
// cache for evaluation ordering:
std::unordered_set<ComputationNodeBasePtr> m_built; // [node] flag: BuildAndValidateSubNetwork() has been called
// cached network Iterations
std::map<const ComputationNodeBasePtr, std::list<ComputationNodeBasePtr>> m_cacheEvalOrders;
std::map<const ComputationNodeBasePtr, std::list<ComputationNodeBasePtr>> m_cacheGradientCalcOrders;
std::map<const ComputationNodeBasePtr, ComputationNodeBasePtr> m_cachedOuterLoopNodes;
std::map<const ComputationNodeBasePtr, std::list<ComputationNodeBasePtr>> m_inputs; // [out node] -> all input nodes feeding into out node
std::map<const ComputationNodeBasePtr, std::list<ComputationNodeBasePtr>> m_learnableParameters; // [out node] -> all parameter nodes feeding into out node

Просмотреть файл

@ -42,7 +42,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// determine the strongly connected cliques -> m_recurrentInfo[]
DetermineSCCs(rootNode);
list<ComputationNodeBasePtr>& nodes = GetEvalOrder(rootNode, true/*set m_visitedOrder*/);
list<ComputationNodeBasePtr>& nodes = GetEvalOrder(rootNode, true/*skipPairNetwork*/);
// recover m_visitedOrder
size_t i = 1; // BUGBUG: why not 0? (left-over of refactoring)
for (auto & node : nodes)
node->m_visitedOrder = i++;
// purge identical loops (i.e. loops that have the same source node)
// TODO: Is this for the case that we call this function multiple times, or do the nodes of a loop generate multiple entries? Comment this.
@ -57,24 +61,30 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
size_t max_visitedOrderInLoop = 0;
// TODO: I am sure there is an STL algorithm for this.
for (auto itr : iter->m_recurrentNodes)
for (auto itr : iter->m_nestedNodes)
if (max_visitedOrderInLoop < itr->m_visitedOrder)
max_visitedOrderInLoop = itr->m_visitedOrder;
for (auto itr : iter->m_recurrentNodes)
for (auto itr : iter->m_nestedNodes)
itr->m_visitedOrder = max_visitedOrderInLoop;
}
// implant m_loopId in all nodes in all loops
for (auto & iter : m_recurrentInfo)
{
#if 1 // instead of the redundant sort() below, we just verify
for (auto & node : iter->m_nestedNodes)
if (node->m_visitedOrder != iter->m_nestedNodes.front()->m_visitedOrder)
LogicError("FormRecurrentLoops: m_visitedOrder was set to a constant, but actually... wasn't?");
#else
// sort the recurrent nodes in their ascending name, which is the same as visiting nodes in G^R
// it is done in the mergerecurrentloops function, but just keep the code --TODO: why?? Why not rather verify the order?
// BUGBUG: This sort() seems to do nothing, since the above loop sets all m_visitedOrder to the same value??
sort(iter->m_recurrentNodes.begin(),
iter->m_recurrentNodes.end(),
iter->m_recurrentNodes[0]->ByVisitedOrder);
for (auto & node : iter->m_recurrentNodes)
sort(iter->m_nestedNodes.begin(),
iter->m_nestedNodes.end(),
iter->m_nestedNodes[0]->ByVisitedOrder);
#endif
for (auto & node : iter->m_nestedNodes)
{
node->m_isPartOfLoop = true; // this is the only flag in ComputationNode that escapes FormRecurrentLoops()!
// TODO: ^^ We should instead remember a pointer to our loop sentinel
@ -91,9 +101,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// set m_indexInLoop for all nodes except Past/FutureValueNodes in all loops
// This value is only used in the block right after this.
for (size_t j = 0; j < iter->m_recurrentNodes.size(); j++)
// This is very mysterious. It is certainly no index in loop. More like a parent count, and excluding delay nodes.
for (size_t j = 0; j < iter->m_nestedNodes.size(); j++)
{
ComputationNodeBasePtr node = iter->m_recurrentNodes[j];
ComputationNodeBasePtr node = iter->m_nestedNodes[j];
for (size_t i = 0; i < node->ChildrenSize(); i++)
{
if (node->Inputs(i)->m_loopId == node->m_loopId &&
@ -101,33 +112,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {
node->OperationName() != OperationNameOf(FutureValueNode)) // TODO: test for type RecurrentNode instead?
{
//assert(node->Inputs(i)->m_indexInLoop == 0); // No. It seems this variable really counts the number of parents.
node->Inputs(i)->m_indexInLoop = node->Inputs(i)->m_indexInLoop + 1; // BUGBUG: this is bumping up the m_indexInLoop, but I don't think it is initialized anywhere. i-1?
node->Inputs(i)->m_indexInLoop++; // BUGBUG: this is bumping up the m_indexInLoop, but I don't think it is initialized anywhere other than PurgeStateForFormingRecurrentLoops(). i-1?
}
}
}
for (size_t i = 0; i < iter->m_recurrentNodes.size(); i++)
for (size_t i = 0; i < iter->m_nestedNodes.size(); i++)
{
ComputationNodeBasePtr node = iter->m_recurrentNodes[i];
ComputationNodeBasePtr node = iter->m_nestedNodes[i];
if (visited.find(node) == visited.end() && node->m_indexInLoop == 0)
DetermineLoopForwardOrder(visited, recStack, result, node);
}
#if 1
// update m_recurrentNodes with 'result'
iter->m_recurrentNodes.assign(result.begin(), result.end());
#else
// TODO: this loop seems to just copy the list
// m_recurrentNodes = reverse(result)
iter->m_recurrentNodes.clear();
for (size_t i = 0; i < iter->m_recurrentNodesxx.size(); i++) // BUGBUG: is the size of m_recurrentNodes (before clear) the same as result? Guaranteed?
{
iter->m_recurrentNodes.push_back(result.front());
result.pop_front();
}
iter->m_recurrentNodes = iter->m_recurrentNodes; // TODO: are they ever different?
#endif
// update m_nestedNodes with 'result'
iter->m_nestedNodes.assign(result.begin(), result.end());
}
if (m_recurrentInfo.size() > 0)
@ -167,9 +165,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// log the loops
for (auto & iter : m_recurrentInfo)
{
fprintf(stderr, "\nLoop[%d] --> %ls -> %d nodes\n", (int)iter->m_loopId, iter->m_sourceNode->NodeName().c_str(), (int)iter->m_recurrentNodes.size());
fprintf(stderr, "\nLoop[%d] --> %ls -> %d nodes\n", (int)iter->m_loopId, iter->NodeName().c_str(), (int)iter->m_nestedNodes.size());
size_t n = 0;
for (auto itr = iter->m_recurrentNodes.begin(); itr != iter->m_recurrentNodes.end(); itr++)
for (auto itr = iter->m_nestedNodes.begin(); itr != iter->m_nestedNodes.end(); itr++)
{
if (n++ % 3 == 0)
fprintf(stderr, "\n");
@ -177,6 +175,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
fprintf(stderr, "\n");
}
// now turn this into a nested network, ready for evaluation
GetOuterLoopNode(rootNode);
}
// get the strongly connected components from the graph
@ -227,26 +228,42 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// if we closed a loop then create an entry in m_recurrentInfo
if (cur->m_lowLink == cur->m_index) // m_lowLink is still equal to m_index, as we set it at the start of this function: we closed a loop
{
// TODO: build array first in a local array. Only if succeeds, then construct the node off it.
RecurrentFlowControlNode rInfo(loopId, cur);
for (;;)
{
ComputationNodeBasePtr w = sccStack.back();
sccStack.pop_back();
w->m_inStack = false;
rInfo.m_recurrentNodes.push_back(w);
rInfo.m_nestedNodes.push_back(w);
if (w == cur) // hit our starting point: done
break;
}
if (rInfo.m_recurrentNodes.size() > 1) // non-looped nodes are detected here as loops of size 1 --skip those
if (rInfo.m_nestedNodes.size() > 1) // non-looped nodes are detected here as loops of size 1 --skip those
{
loopId++;
m_recurrentInfo.push_back(make_shared<RecurrentFlowControlNode>(move(rInfo)));
// only add to the array if the loop is not already there
// Since FormRecurrentLoops() is called multiple times, for multiple output nodes, we end up producing the same loop multiple times.
bool bFound = false; // find a dup --TODO: check whether there is an STL algorithm for this
for (const auto & iter2 : m_recurrentInfo)
{
if (iter2->m_sourceNode == cur)
{
bFound = true;
break;
}
}
if (!bFound)
{
// TODO: construct rInfo down here
m_recurrentInfo.push_back(make_shared<RecurrentFlowControlNode>(move(rInfo)));
loopId++; // and count it
}
}
}
}
// purge identical loops (i.e. loops that have the same source node)
// TODO: Why not do this where we push a loop into m_recurrentInfo?
// TODO: Delete this function once we find it never triggers.
void ComputationNetwork::UniqRecurrentLoops()
{
if (m_recurrentInfo.size() <= 1)
@ -262,7 +279,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if ((*iter2).m_sourceNode == iter->m_sourceNode)
{
bFound = true;
break;
LogicError("UniqRecurrentLoops: Duplicate loops should no longer occur."); // ...since tested when creating in the first place.
//break;
}
}
if (!bFound)
@ -348,7 +366,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
int iId = recInfo->m_loopId;
if (!accessed[iId])
{
newList.insert(newList.end(), recInfo->m_recurrentNodes.begin(), recInfo->m_recurrentNodes.end());
newList.insert(newList.end(), recInfo->m_nestedNodes.begin(), recInfo->m_nestedNodes.end());
accessed[iId] = true;
}
}
@ -378,12 +396,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
for (auto & rInfo : m_recurrentInfo)
{
assert(rInfo->m_recurrentNodes.size() > 0); // (this check was left over after refactoring; it should not be necessary)
assert(rInfo->m_nestedNodes.size() > 0); // (this check was left over after refactoring; it should not be necessary)
bool hasPastValueNode = false;
bool hasFutureValueNode = false;
for (auto & node : rInfo->m_recurrentNodes)
for (auto & node : rInfo->m_nestedNodes)
{
if (node->OperationName() == OperationNameOf(PastValueNode))
hasPastValueNode = true;

Просмотреть файл

@ -29,11 +29,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// TODO: pass a set of nodes instead of only one
// TODO: rename to ForwardProp()? To make it very clear?
// This calls EvaluateThisNode() on all nodes in order of data flow through the network.
// By default, the network is applied concurrently on all frames in a minibatch in parallel (a "map" operation)
// By default, the network is applied concurrently on all frames in a minibatch in parallel (PAR mode, a "map" operation)
// Recurrent loops deviate:
// - a recurrent loop is the loop of nodes that make up computation for one time step (e.g. Times -> Plus -> Sigmoid -> Delay)
// - these must be executed frame by frame rather than as a map
// - such a loop is treated as if they were a little nested network; this is done inside here
// - such a loop is treated as if they were a little nested network; this is done inside RecurrentFlowControlNodes
// - these little nested networks are defined in m_recurrentInfo[]
void ComputationNetwork::Evaluate(const ComputationNodeBasePtr & rootNode)
{
@ -43,14 +43,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
LogicError("Evaluate for node %ls %ls: BuildAndValidateSubNetwork() has not been called on this node.");
// TODO: change this to a time stamp to make it consistent with PAR mode
// TODO: No, this is no longer needed with OuterLoopNode. Keep it for now to verify this through runtime checks.
for (auto & recInfo : m_recurrentInfo)
recInfo->m_completedEvaluate = false;
// traverse all nodes in the pre-determined evaluation order
#define USE_OUTER_LOOP_NODE // once this is working then get rid of this #define
#ifdef USE_OUTER_LOOP_NODE
OuterLoopNode outerLoopNode(m_recurrentInfo, GetEvalOrder(rootNode, false));
outerLoopNode.EvaluateThisNode(FrameRange(nullptr));
GetOuterLoopNode(rootNode)->EvaluateThisNode(FrameRange(nullptr));
#else
// determines order of evaluation, such that children get evaluated before their parent nodes
std::list<ComputationNodeBasePtr>& allNodes = GetEvalOrder(rootNode, false);
@ -63,7 +63,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
shared_ptr<RecurrentFlowControlNode> recInfo = FindInRecurrentLoops(m_recurrentInfo, node); // check if this node participates in a recurrent loop
if (recInfo && IsFuncValueOlderThanInputs(recInfo->m_recurrentNodes) && !recInfo->m_completedEvaluate)
if (recInfo && IsFuncValueOlderThanInputs(recInfo->m_nestedNodes) && !recInfo->m_completedEvaluate)
{
#if 1
recInfo->UpdateFunctionMBSize();
@ -72,7 +72,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
recInfo->OnEvaluateEndIteration();
#else
// node participates in a recurrent loop: process the loop frame by frame
const auto & recurrentNodes = recInfo->m_recurrentNodes;
const auto & recurrentNodes = recInfo->m_nestedNodes;
// get layout associated with this loop
auto pMBLayout = recurrentNodes[0]->GetMBLayout();
@ -148,7 +148,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// MAIN ENTRY POINT for evaluation followed by gradient computation (forward prop then back prop)
// TODO: pass a set of nodes instead of only one?
// TODO: remove Evaluate() from here, instead call it at call site, and in here merely check whether everything is computed already
// BUGBUG: The decision to loop (SEQ execution) is made by parent, but some children can be executer PAR. It should be possible to detect this.
template<class ElemType>
void ComputationNetwork::ComputeGradient(const ComputationNodeBasePtr rootNode, // training criterion to compute the gradients for
bool bResetToOne, // true if reset the gradient of rootnode to 1.0 --This is the default.
@ -181,17 +180,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
dynamic_pointer_cast<ComputationNode<ElemType>>(rootNode)->GradientValues().SetValue(*rootGradientInitValue);
#ifdef USE_OUTER_LOOP_NODE
#if 1
// sanity check --TODO: remove this once this has been found to not trigger for a while (it should be--EnumerateNodes() just reverses its result when called by GetGradientCalcOrder(). Which makes a lot of sense.)
auto evalOrder = GetEvalOrder(rootNode, false);
auto gradOrder = GetGradientCalcOrder(rootNode);
evalOrder.reverse();
if (evalOrder != gradOrder)
LogicError("ComputeGradient: Gradient computation order must be reverse of evaluation order.");
#endif
OuterLoopNode outerLoopNode(m_recurrentInfo, GetEvalOrder(rootNode, false));
outerLoopNode.ComputeGradientForChildren(FrameRange(nullptr), true, true);
GetOuterLoopNode(rootNode)->ComputeGradientForChildren(FrameRange(nullptr), true, true);
#else
// run backprop pass
std::list<ComputationNodeBasePtr>& allNodes = GetGradientCalcOrder(rootNode);
@ -214,7 +203,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
recInfo->ComputeGradientForChildren(FrameRange(node->GetMBLayout()), true, true);
recInfo->OnComputeGradientEndIteration();
#else
const auto & recurrentNodes = recInfo->m_recurrentNodes;
const auto & recurrentNodes = recInfo->m_nestedNodes;
for (auto & node2 : recurrentNodes)
node2->OnComputeGradientBeginIteration();
auto pMBLayout = recurrentNodes[0]->GetMBLayout();
@ -267,6 +256,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template void ComputationNetwork::ComputeGradient<double>(const ComputationNodeBasePtr rootNode, bool bResetToOne, const Matrix<double>* rootGradientInitValue, bool bClearGradient, bool resetTimeStampAfterComputation);
#ifdef USE_OUTER_LOOP_NODE
// -----------------------------------------------------------------------
// OuterLoopNode methods -- implements PAR traversal
// -----------------------------------------------------------------------
// implementation of OuterLoopNode (implements outer loop over non-recurrent nodes)
ComputationNetwork::OuterLoopNode::OuterLoopNode(/*const*/ std::vector<shared_ptr<RecurrentFlowControlNode>> & recurrentInfo, const std::list<ComputationNodeBasePtr> & allNodes/*must be in eval order*/)
{
@ -278,7 +271,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (recInfo) // node is part of a SEQ loop: gather all of them. The nodes must be consecutive in 'allNodes'
{
// instead of the node itself, include the sentinel RecurrentFlowControlNode in our list
m_outerNodes.push_back(recInfo);
m_nestedNodes.push_back(recInfo);
// and verify that we only encountered the loop once (all nodes should have been consecutive)
if (!loopsSeen.insert(recInfo).second)
LogicError("OuterLoopNode: members of loop %ls are not consecutive in node list.", recInfo->NodeName().c_str());
@ -288,24 +281,52 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
else // regular top-level node (non-looping, PAR)
{
m_outerNodes.push_back(*nodeIter);
m_nestedNodes.push_back(*nodeIter);
nodeIter++; // and consume this node
}
}
}
/*virtual*/ void ComputationNetwork::OuterLoopNode::EvaluateThisNode(const FrameRange & frameRange) /*override*/
{
for (auto & pnode : m_outerNodes)
for (auto & node : m_nestedNodes)
{
auto recInfo = dynamic_pointer_cast<RecurrentFlowControlNode>(pnode);
auto node = dynamic_pointer_cast<ComputationNodeBase>(pnode);
// TODO: This ^^ is not nice.
// We are close but not finished with unifying. Eventually, there must be no if statement below.
#if 1
#if 1
if (node->IsFuncValueOlderThanInputs())
#else
bool isFuncValueOlderThanInputs =
(recInfo && recInfo->IsFuncValueOlderThanInputs()) || // TODO: abstract this out into a virtual function
(node && node->IsFuncValueOlderThanInputs());
if (isFuncValueOlderThanInputs)
#endif
{
auto recInfo = dynamic_pointer_cast<RecurrentFlowControlNode>(node);
if (recInfo)
assert(recInfo->m_sourceNode->GetMBLayout() == node->GetMBLayout());
if (recInfo)
assert(!recInfo->m_completedEvaluate); // TODO: not needed anymore, I think
node->UpdateFunctionMBSize();
// BUGBUG: IsLeaf() for RecurrentFlowControlNode returns false because that node has no children. So we get lucky here. Otherwise it would fail in Validate(). Fix this by getting rid of the Validate() call here.
if (node && !node->IsLeaf() && !node->RequiresPreCompute())
node->Validate(true); // BUGBUG: Validate() should not be called during evaluation. This is meant to update m_functionValues' size in case of sharing.
node->OnEvaluateBeginIteration();
node->EvaluateThisNode(frameRange.WithLayout(node->GetMBLayout()));
node->OnEvaluateEndIteration();
if (recInfo)
recInfo->m_completedEvaluate = true;
node->UpdateEvalTimeStamp(); // TODO: abstract this out to a virtual function
}
#else
// --- if this node is part of a recurrence, evaluate all nodes that participate in this loop
if (recInfo && recInfo->IsFuncValueOlderThanInputs() && !recInfo->m_completedEvaluate)
if (recInfo && recInfo->IsFuncValueOlderThanInputs() /*&& !recInfo->m_completedEvaluate*/)
{
assert(!recInfo->m_completedEvaluate);
pnode->UpdateFunctionMBSize();
pnode->OnEvaluateBeginIteration();
pnode->EvaluateThisNode(frameRange.WithLayout(recInfo->m_sourceNode->GetMBLayout()));
@ -330,6 +351,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
pnode->OnEvaluateEndIteration();
node->UpdateEvalTimeStamp();
}
#endif
#ifdef _DEBUG
else if (node)
node->OnEvaluateEndIteration(); // HACK: performs NaN check, but does nothing else
@ -340,14 +362,25 @@ namespace Microsoft { namespace MSR { namespace CNTK {
/*virtual*/ void ComputationNetwork::OuterLoopNode::ComputeGradientForChildren(const FrameRange & frameRange, bool childrenInThisLoop, bool childrenInOuterLoop) /*override*/
{
childrenInThisLoop, childrenInOuterLoop; // TODO: think through what these mean when coming from PAR mode
// TODO: finish this
// process nodes in pre-determined order
for (auto inode = m_outerNodes.rbegin(); inode != m_outerNodes.rend(); inode++) // iterate backwards over evaluation order
for (auto pnode = m_nestedNodes.rbegin(); pnode != m_nestedNodes.rend(); pnode++) // iterate backwards over evaluation order
{
auto pnode = *inode;
auto recInfo = dynamic_pointer_cast<RecurrentFlowControlNode>(pnode);
auto node = dynamic_pointer_cast<ComputationNodeBase>(pnode);
auto & node = *pnode;
#if 1
auto recInfo = dynamic_pointer_cast<RecurrentFlowControlNode>(node);
if (recInfo)
assert(recInfo->m_sourceNode->GetMBLayout() == node->GetMBLayout());
if (recInfo)
assert(!recInfo->m_completedGradient); // TODO: not needed anymore, I think
node->OnComputeGradientBeginIteration();
node->ComputeGradientForChildren(frameRange.WithLayout(node->GetMBLayout()), true, true);
node->OnComputeGradientEndIteration();
if (recInfo)
recInfo->m_completedGradient = true;
#else
// --- first, perform recurrent loops if this node participates in one
if (recInfo)
@ -377,34 +410,45 @@ namespace Microsoft { namespace MSR { namespace CNTK {
pnode->ComputeGradientForChildren(frameRange.WithLayout(node->GetMBLayout()), true, true);
pnode->OnComputeGradientEndIteration();
}
#endif
}
}
// Memory-pool hooks are deliberately no-ops at the OuterLoopNode (top) level.
// NOTE(review): presumably matrix sharing is driven per node by the network's
// AllocateEvalMatrices()/AllocateGradientMatrices() walks instead -- confirm.
/*virtual*/ void ComputationNetwork::OuterLoopNode::RequestMatricesBeforeEval(MatrixPool& matrixPool) /*override*/ { }
/*virtual*/ void ComputationNetwork::OuterLoopNode::ReleaseMatricesAfterEval(MatrixPool& matrixPool) /*override*/ { }
/*virtual*/ void ComputationNetwork::OuterLoopNode::AllocateGradientMatricesForChildren(MatrixPool& matrixPool) /*override*/ { }
/*virtual*/ void ComputationNetwork::OuterLoopNode::RequestMatricesBeforeGradientComp(MatrixPool& matrixPool) /*override*/ { }
/*virtual*/ void ComputationNetwork::OuterLoopNode::ReleaseMatricesAfterGradientComp(MatrixPool& matrixPool) /*override*/ { }
#endif
// -----------------------------------------------------------------------
// RecurrentFlowControlNode methods -- implements SEQ traversal
// -----------------------------------------------------------------------
// implementations of RecurrentFlowControlNode (loop unrolling)
/*virtual*/ void ComputationNetwork::RecurrentFlowControlNode::UpdateFunctionMBSize() /*override*/
{
for (auto & node2 : m_recurrentNodes)
for (auto & node2 : m_nestedNodes)
node2->UpdateFunctionMBSize(); // TODO: for sequence-to-sequence models we will need to be able to grow this step by step since size is unknown upfront
}
/*virtual*/ void ComputationNetwork::RecurrentFlowControlNode::OnEvaluateBeginIteration() /*override*/
{
// get layout associated with this loop
auto pMBLayout = m_recurrentNodes[0]->GetMBLayout();
// take the opportunity to check that layout is shared by all nodes in the loop
// TODO: we should do this in a constructor.
for (auto & node2 : m_nestedNodes)
{
if (node2->GetMBLayout() != GetMBLayout())
LogicError("Evaluate: all nodes inside a recurrent loop must have a layout that is identical; mismatch found for nodes '%ls' vs. '%ls'",
node2->NodeName().c_str(), m_nestedNodes[0]->NodeName().c_str());
}
// tell all that loop is about to commence
for (auto & node2 : m_recurrentNodes)
{
if (!pMBLayout || node2->GetMBLayout() != pMBLayout) // take the opportunity to check that layout is shared by all nodes in the loop
LogicError("Evaluate: all nodes inside a recurrent loop must have a layout that is identical; mismatch found for nodes '%ls' vs. '%ls'",
node2->NodeName().c_str(), m_recurrentNodes[0]->NodeName().c_str());
for (auto & node2 : m_nestedNodes)
node2->OnEvaluateBeginIteration();
}
// since we share memory we need to resize function value matrices correctly
// TODO: No, Validate() should only run as a prep stage. This will go away once we separate dimension inference and actual resizing.
for (auto & node2 : m_recurrentNodes)
for (auto & node2 : m_nestedNodes)
node2->Validate(true);
}
@ -416,13 +460,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
// get layout associated with this loop
// All nodes share the same layout.
auto pMBLayout = m_recurrentNodes[0]->GetMBLayout();
assert(GetMBLayout() == m_nestedNodes[0]->GetMBLayout());
// for every time step run through all nodes in this particular loop (treat the loop like a little ComputationNetwork)
FrameRangeIteration range(pMBLayout, m_steppingDirection);
FrameRangeIteration range(GetMBLayout(), m_steppingDirection);
for (auto t = range.begin(); t != range.end(); t++)
{
for (auto & node2 : m_recurrentNodes)
for (auto & node2 : m_nestedNodes)
{
//fprintf(stderr, "EvaluateThisNode %d %ls %ls\n", (int)t.timeIdxInSeq, node2->NodeName().c_str(), node2->OperationName().c_str());
node2->EvaluateThisNode(t);
@ -437,20 +481,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {
/*virtual*/ void ComputationNetwork::RecurrentFlowControlNode::OnEvaluateEndIteration() /*override*/
{
// tell all that loop is done --e.g. PastValueNode will capture its state for BPTT processing
for (auto & node2 : m_recurrentNodes)
for (auto & node2 : m_nestedNodes)
node2->OnEvaluateEndIteration();
}
// called before first iteration step of ComputeGradient()
/*virtual*/ void ComputationNetwork::RecurrentFlowControlNode::OnComputeGradientBeginIteration() /*override*/
{
for (auto & node2 : m_recurrentNodes)
for (auto & node2 : m_nestedNodes)
node2->OnComputeGradientBeginIteration();
}
/*virtual*/ void ComputationNetwork::RecurrentFlowControlNode::ComputeGradientForChildren(const FrameRange &, bool childrenInThisLoop, bool childrenInOuterLoop) /*override*/
{
childrenInThisLoop, childrenInOuterLoop; // TODO: think through what these mean when coming from PAR mode
const auto & recurrentNodes = m_recurrentNodes; // BUGBUG: -ForForward?? Does this mean we can remove non-ForForward?
const auto & recurrentNodes = m_nestedNodes; // BUGBUG: -ForForward?? Does this mean we can remove non-ForForward?
auto pMBLayout = recurrentNodes[0]->GetMBLayout();
FrameRangeIteration range(pMBLayout, m_steppingDirection);
for (auto t = range.rbegin(); t != range.rend(); t++) // note: reverse iteration
@ -476,7 +520,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
/*virtual*/ void ComputationNetwork::RecurrentFlowControlNode::OnComputeGradientEndIteration() /*override*/
{
#ifdef OPT_OUTER_GRADIENT
for (auto nodeIter2 = m_recurrentNodes.rbegin(); nodeIter2 != m_recurrentNodes.rend(); ++nodeIter2)
for (auto nodeIter2 = m_nestedNodes.rbegin(); nodeIter2 != m_nestedNodes.rend(); ++nodeIter2)
{
auto & node2 = *nodeIter2;
// BUGBUG: The following can no longer be done after this code was moved into RecurrentFlowControlNode
@ -484,13 +528,37 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//if (IsNodeReqMultiSeqHandling(node2))
// node2->MaskMissingGradientColumnsToZero(t);
// TODO: exclude children that are not part of the recurrent loop, and do thise below, separately.
node2->ComputeGradientForChildren(FrameRange(m_recurrentNodes[0]->GetMBLayout()), false/*childrenInThisLoop*/, true/*childrenInOuterLoop*/);
node2->ComputeGradientForChildren(FrameRange(m_nestedNodes[0]->GetMBLayout()), false/*childrenInThisLoop*/, true/*childrenInOuterLoop*/);
}
#endif
for (auto & node2 : m_recurrentNodes)
for (auto & node2 : m_nestedNodes)
node2->OnComputeGradientEndIteration();
}
/*virtual*/ void ComputationNetwork::RecurrentFlowControlNode::RequestMatricesBeforeEval(MatrixPool& matrixPool) /*override*/
{
    // Forward the matrix-pool request to every node nested inside this SEQ loop,
    // so each can grab the function-value matrices it needs before evaluation.
    for (auto iter = m_nestedNodes.begin(); iter != m_nestedNodes.end(); ++iter)
        (*iter)->RequestMatricesBeforeEval(matrixPool);
}
/*virtual*/ void ComputationNetwork::RecurrentFlowControlNode::ReleaseMatricesAfterEval(MatrixPool& matrixPool) /*override*/ { } // no-op: function values of loop members may still be needed for the gradient pass
/*virtual*/ void ComputationNetwork::RecurrentFlowControlNode::AllocateGradientMatricesForChildren(MatrixPool& matrixPool) /*override*/
{
    // Walk the nested nodes in reverse evaluation order (the order in which
    // gradients will be computed) and let each allocate its children's gradients.
    // TODO: should we deallocate in opposite order?
    for (auto iter = m_nestedNodes.rbegin(); iter != m_nestedNodes.rend(); ++iter)
    {
        auto & child = *iter;
        child->AllocateGradientMatricesForChildren(matrixPool);
    }
}
/*virtual*/ void ComputationNetwork::RecurrentFlowControlNode::RequestMatricesBeforeGradientComp(MatrixPool& matrixPool) /*override*/ { } // no-op: gradient matrices are requested via AllocateGradientMatricesForChildren() instead
/*virtual*/ void ComputationNetwork::RecurrentFlowControlNode::ReleaseMatricesAfterGradientComp(MatrixPool& matrixPool) /*override*/
{
    // Release gradient matrices of all nested nodes back to the pool, walking
    // in reverse evaluation order. Nodes that need no gradient never requested
    // matrices, so they are skipped.
    for (auto iter = m_nestedNodes.rbegin(); iter != m_nestedNodes.rend(); ++iter)
    {
        auto & child = *iter;
        if (child->NeedGradient())
            child->ReleaseMatricesAfterGradientComp(matrixPool);
    }
}
// find if node is part of a recurrent loop; and return the loop id
// If found then return a pointer to the list of nodes of this loop.
/*static*/ shared_ptr<ComputationNetwork::RecurrentFlowControlNode> ComputationNetwork::FindInRecurrentLoops(/*const*/ std::vector<std::shared_ptr<RecurrentFlowControlNode>> & recurrentInfo, const ComputationNodeBasePtr& node)
@ -498,14 +566,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// look in all recurrent loops of the network
// TODO: Check for IsPartOfLoop(). Also why not store the loop id in the node for direct lookup?
for (auto & iter : recurrentInfo)
if (std::find(iter->m_recurrentNodes.begin(), iter->m_recurrentNodes.end(), node) != iter->m_recurrentNodes.end()) // TODO: should this loop need to be a method of RecurrentFlowControlNode?
if (std::find(iter->m_nestedNodes.begin(), iter->m_nestedNodes.end(), node) != iter->m_nestedNodes.end()) // TODO: should this loop need to be a method of RecurrentFlowControlNode?
return iter;
return nullptr; // not part of a recurrent loop
}
// check if any of the nodes in the recurrence IsFuncValueOlderThanInputs(), with exception of delay nodes for which this check would fail and can be skipped
// TODO: Would it be sufficient to check against our own time stamp, so that we can use a unified time-stamping mechanism? Then we'd not need this special check for delayed nodes; just check all inputs against our own time stamp.
// TODO: move this function up to its peers
bool ComputationNetwork::RecurrentFlowControlNode::IsFuncValueOlderThanInputs() const
{
for (auto & ptr : m_recurrentNodes)
for (auto & ptr : m_nestedNodes)
{
if (ptr->IsFuncValueOlderThanInputs() &&
ptr->OperationName() != OperationNameOf(PastValueNode) &&
@ -517,6 +588,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return false;
}
#ifndef USE_OUTER_LOOP_NODE
// TODO: this will move into RecurrentFlowControlNode
bool ComputationNetwork::IsFuncValueOlderThanInputs(const vector<ComputationNodeBasePtr>& recurrentNodes)
{
@ -531,7 +603,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
return false;
}
#endif
// TODO: do this on OuterLoopNode
void ComputationNetwork::ResetEvalTimeStamp()
{
for (auto nodeIter = m_nameToNodeMap.begin(); nodeIter != m_nameToNodeMap.end(); nodeIter++)
@ -587,8 +661,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (FeatureNodes().size() == 0 && !allowFragment)
RuntimeError("No Feature nodes specified");
#if 1 // If it is not done here, it will cause a crash. But it really only belongs into StartEvaluationMinibatchLoop()
// TODO: allocation does not belong here. This is called e.g. after loading. Memory should be allocated only when actually evaluating.
// TODO: move into StartEvaluateMinibatchLoop(), but that is called for output nodes individually--can the process handle that?
AllocateAllEvalMatrices(EvaluationNodes(), OutputNodes(), FinalCriterionNodes());
#endif
// first give criteria nodes as root node
if (FinalCriterionNodes().size() > 0)
{
@ -686,6 +763,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (toValidate != 0)
LogicError("ValidateSubNetwork: ValidateNodes(true) unexpectedly returned with work left to do.");
// propagate some info to RecurrentFlowControlNode
// TODO: In the future we should validate not on the flat list but the OuterLoopNode structure. Then this will be unnecessary.
for (auto & recInfo : m_recurrentInfo)
{
auto & node = recInfo->m_sourceNode;
recInfo->m_needsGradient = node->m_needsGradient;
recInfo->LinkToMBLayout(node->GetMBLayout());
}
for (auto & node : nodes)
{
#if 0 // not possible once we have inconsistent layouts
@ -787,7 +873,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// BUGBUG? Lazy triggers on the root node. I.e. for two different root nodes (training, eval), it validates twice.
void ComputationNetwork::BuildAndValidateSubNetwork(const ComputationNodeBasePtr rootNode)
{
const auto inserted = m_built.insert(rootNode).second; // remember we built it
bool inserted = m_built.insert(rootNode).second; // remember we built it
if (!inserted)
return; // already done
@ -802,6 +888,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ValidateSubNetwork(rootNode);
}
// tests whether BuildAndValidateSubNetwork() was called
bool ComputationNetwork::BuiltAndValidatedSubNetwork(const ComputationNodeBasePtr & rootNode)
{
return m_built.find(rootNode) != m_built.end();
@ -810,11 +897,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
// memory allocation
// -----------------------------------------------------------------------
//this function will need to be called before actual validation and execution to
//predetermine how to share matrices to reduce memory usage.
//TODO: find a simple topological order and allocateEvalMatrices on that order directly
//without passing in eval, out, and train nodes.
#if 1
// this function will need to be called before actual validation and execution to
// predetermine how to share matrices to reduce memory usage.
// TODO: find a simple topological order and allocateEvalMatrices on that order directly
// without passing in eval, out, and train nodes.
void ComputationNetwork::AllocateAllEvalMatrices(std::vector<ComputationNodeBasePtr>& evalRootNodes,
std::vector<ComputationNodeBasePtr>& outValueRootNodes,
std::vector<ComputationNodeBasePtr>& trainRootNodes)
@ -829,6 +916,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
AllocateEvalMatrices(trainRootNodes[i]);
}
#endif
// TODO: use the same loop mechanism as Evaluate()
void ComputationNetwork::AllocateEvalMatrices(ComputationNodeBasePtr rootNode)
@ -859,15 +947,18 @@ namespace Microsoft { namespace MSR { namespace CNTK {
assert(recInfo != nullptr);
if (!recInfo->m_completedEvaluate)
{
const auto & recurrentNodes = recInfo->m_recurrentNodes;
for (auto &nodeLoopIter : recurrentNodes)
#if 1
recInfo->RequestMatricesBeforeEval(m_matrixPool);
#else
for (auto &nodeLoopIter : recInfo->m_nestedNodes)
{
nodeLoopIter->RequestMatricesBeforeEval(m_matrixPool);
}
#endif
recInfo->m_completedEvaluate = true;
for (auto &nodeLoopIter : recurrentNodes)
for (auto &nodeLoopIter : recInfo->m_nestedNodes)
{
ReleaseMatricesAfterEvalForChildren(nodeLoopIter, parentCount);
}
@ -898,16 +989,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
FormRecurrentLoops(rootNode);
//PopulateParents(rootNode);
std::list<ComputationNodeBasePtr>& allNodes = GetGradientCalcOrder(rootNode);
//determine children size
//std::map<ComputationNodeBasePtr, int> childrenCount;
//for (auto &nodeIter : allNodes)
//{
// childrenCount[nodeIter] = nodeIter->ChildrenSize();
//}
//now, simulate the gradient computation order to determine how to allocate matrices
for (auto & recInfo : m_recurrentInfo)
recInfo->m_completedGradient = false;
@ -923,11 +1006,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
shared_ptr<RecurrentFlowControlNode> recInfo = FindInRecurrentLoops(m_recurrentInfo, n);
if (recInfo && recInfo->m_completedGradient == false)
{
const auto & recurrentNodes = recInfo->m_recurrentNodes;
// SEQ mode: allocate all in loop first, then deallocate again
#if 1 // TODO: next step: use OuterLoopNode::AllocateGradientMatricesForChildren() and ReleaseMatricesAfterGradientComp()...
// BUGBUG: naw, ^^ would not work! Wrong order! Need to rethink this. Need to make AllocateEvalMatrices() and AllocateGradientMatrices() the virtual functions.
recInfo->AllocateGradientMatricesForChildren(m_matrixPool);
//loops are computed sample by sample so we have to allocate them all
recInfo->m_completedGradient = true;
recInfo->ReleaseMatricesAfterGradientComp(m_matrixPool);
#else
const auto & recurrentNodes = recInfo->m_nestedNodes;
//loops are computed sample by sample so we have to allocate them all
for (auto nodeIter = recurrentNodes.rbegin(); nodeIter != recurrentNodes.rend(); ++nodeIter)
{
AllocateGradientMatricesForChildren(*nodeIter);
(*nodeIter)->AllocateGradientMatricesForChildren(m_matrixPool);
}
recInfo->m_completedGradient = true;
for (auto nodeIter = recurrentNodes.rbegin(); nodeIter != recurrentNodes.rend(); ++nodeIter)
@ -937,29 +1028,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {
(*nodeIter)->ReleaseMatricesAfterGradientComp(m_matrixPool);
}
}
#endif
}
}
else
{
AllocateGradientMatricesForChildren(n);
if ((n != rootNode) && n->NeedGradient()) //root node's informatioin will be used and should not be shared with others, also it's small (1x1)
// PAR mode: we can allocate and immediately deallocate one by one
n->AllocateGradientMatricesForChildren(m_matrixPool);
if ((n != rootNode) && n->NeedGradient()) //root node's information will be used and should not be shared with others, also it's small (1x1)
n->ReleaseMatricesAfterGradientComp(m_matrixPool);
}
}
}
//void ReleaseMatricesAfterGradientCompForParents(ComputationNodeBasePtr n, std::map<ComputationNodeBasePtr, int>& childrenCount)
//{
// for (int i = 0; i < n->ParentSize(); i++)
// {
// ComputationNodeBasePtr pNode = n->Parent(i);
// childrenCount[pNode] --;
// if (childrenCount[pNode] == 0)
// pNode->ReleaseMatricesAfterGradientComp(m_matrixPool);
// }
//}
#if 0
void ComputationNetwork::AllocateGradientMatricesForChildren(ComputationNodeBasePtr parentNode)
{
std::vector<ComputationNodeBasePtr> children = parentNode->GetChildren();
@ -969,5 +1051,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
children[i]->RequestMatricesBeforeGradientComp(m_matrixPool);
}
}
#endif
}}}

Просмотреть файл

@ -173,7 +173,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
typedef Matrix<float> FloatMatrix;
typedef Matrix<double> DoubleMatrix;
atomic_ullong ComputationNetworkOwnedNodeState::s_timeStampCounter = ATOMIC_VAR_INIT(0);
atomic_ullong TimeStamp::s_timeStampCounter = ATOMIC_VAR_INIT(0);
template<> std::map<size_t, std::map<size_t, FloatMatrix*>> ComputationNode<float>::s_constOnes{};
template<> std::map<size_t, std::map<size_t, DoubleMatrix*>> ComputationNode<double>::s_constOnes{};

Просмотреть файл

@ -100,6 +100,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
virtual void OnComputeGradientEndIteration() = 0; // called after last iteration step of ComputeGradient()
// TODO: this one does not quite fit here
// functions that are called from Network, but not necessarily overridden by the node implementations themselves
virtual void ComputeGradientForChildren(const FrameRange & frameRange, bool childrenInThisLoop, bool childrenInOuterLoop) = 0;
// --- optional overrides that add functionality
@ -107,6 +108,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Any override must call Base version as well.
// Default implementations are in ComputationNodeBase or ComputationNode<ElemType>.
virtual void RequestMatricesBeforeEval(MatrixPool& matrixPool) = 0; //request matrices needed to do node function value evaluation
virtual void ReleaseMatricesAfterEval(MatrixPool& matrixPool) = 0; //release temp matrices that are only used by forward computation. Don't release matrices that need to be used in the gradient computation
virtual void AllocateGradientMatricesForChildren(MatrixPool& matrixPool) = 0;
virtual void RequestMatricesBeforeGradientComp(MatrixPool& matrixPool) = 0; //request matrices that are needed for gradient computation
virtual void ReleaseMatricesAfterGradientComp(MatrixPool& matrixPool) = 0; //release gradient and temp matrices that no longer needed after all the children's gradients are computed.
virtual void Validate(bool isFinalValidationPass) = 0; // main base validation function
virtual void InferImageDimsFromInputs() = 0;
virtual void SaveToFile(File& fstream) const = 0;
@ -144,35 +151,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
PurgeStateForFormingRecurrentLoops();
m_isPartOfLoop = false;
ResetEvalTimeStamp(); // bring it into defined state
}
void CopyTo(ComputationNetworkOwnedNodeState & other) const
{
// TODO: is that really all we copy? (this is a result of refactoring, so it seems yes indeed). Should we at least ClearCache()?
other.m_evalTimeStamp = m_evalTimeStamp;
other.m_isPartOfLoop = m_isPartOfLoop;
other.m_needsGradient = m_needsGradient;
}
int64_t UpdateEvalTimeStamp()
{
m_evalTimeStamp = atomic_fetch_add(&s_timeStampCounter, (unsigned long long int) 1); // TODO: does this really need to be atomic? We are not multi-threaded
return m_evalTimeStamp;
}
void ResetEvalTimeStamp()
{
m_evalTimeStamp = s_timeStampCounter;
}
int64_t GetEvalTimeStamp() const { return m_evalTimeStamp; }
int64_t CreateUniqId() const
{
return atomic_fetch_add(&s_timeStampCounter, (unsigned long long int) 1);
}
static bool ByVisitedOrder(const ComputationNetworkOwnedNodeState * lhs, const ComputationNetworkOwnedNodeState * rhs) // sorting predicate
{
return lhs->m_visitedOrder < rhs->m_visitedOrder;
@ -182,9 +169,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
private:
static atomic_ullong s_timeStampCounter;
int64_t m_evalTimeStamp; //this is used to reduce unnecessary recomputation when a different node in the model is reevaluated
bool m_isPartOfLoop; // true if this loop is part of a recurrent loop
protected: // TODO: should be fully encapsulated here
@ -213,6 +197,38 @@ namespace Microsoft { namespace MSR { namespace CNTK {
bool m_inStack;
};
// =======================================================================
// TimeStamp -- helper class to manage a time stamp
// =======================================================================
class TimeStamp
{
    // global monotonically increasing counter from which unique ids are drawn
    // (the definition lives in a .cpp file)
    static atomic_ullong s_timeStampCounter;
    // this object's stamp; used to reduce unnecessary recomputation when a
    // different node in the model is reevaluated
    int64_t m_evalTimeStamp;
public:
    TimeStamp()
    {
        ResetEvalTimeStamp();
    }
    // copy only the stamp itself; the counter is shared by all instances
    void CopyTo(TimeStamp & other) const
    {
        other.m_evalTimeStamp = m_evalTimeStamp;
    }
    // re-sync this object's stamp with the current global counter value
    void ResetEvalTimeStamp()
    {
        m_evalTimeStamp = s_timeStampCounter;
    }
    int64_t GetEvalTimeStamp() const
    {
        return m_evalTimeStamp;
    }
    // stamp this object with a brand-new unique id
    void UpdateEvalTimeStamp()
    {
        m_evalTimeStamp = CreateUniqId();
    }
    // The difference is taken to take into account numeric overflow (which really should never happen for a 64-bit integer... but hey, it's free!).
    bool IsOlderThan(const TimeStamp & other) const
    {
        // BUGBUG: For some reason, we must test equality as well, although that does not indicate being older.
        return GetEvalTimeStamp() - other.GetEvalTimeStamp() /*<*/ <= 0;
    }
    // draw a fresh unique id from the shared counter
    int64_t CreateUniqId() const
    {
        return /*1 +*/ atomic_fetch_add(&s_timeStampCounter, (unsigned long long int) 1);
    }
};
// =======================================================================
// ComputationNodeBase -- abstract base class for all computation nodes
// TODO: decide the name. This does contain actual members such as the node name, so it's not really a pure interface.
@ -220,7 +236,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
class ComputationNodeBase :
public IComputationNode,
public/*protected*/ ComputationNetworkOwnedNodeState, // TODO: figure this out, somehow the 'friend' thing does not work
public/*protected*/ ComputationNetworkOwnedNodeState, // TODO: figure this out, somehow the 'friend' thing does not work
public TimeStamp, // for time-stamp management
public ScriptableObjects::ComputationNodeObject,
public ScriptableObjects::WithTag, public ScriptableObjects::HasName, public ScriptableObjects::HasToString,
public std::enable_shared_from_this<ComputationNodeBase>
@ -255,6 +272,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
node->m_outputImageLayout = m_outputImageLayout;
ComputationNetworkOwnedNodeState::CopyTo(*node);
TimeStamp::CopyTo(*node);
}
}
@ -568,17 +586,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// This creates a list such that children are evaluated before their parents.
// If !forForwardProp then the order will be reversed, suitable for backprop.
// The 'recurrent' version is only called from FormRecurrentLoops().
// Side-effects (unbeknownst to the name of the function):
// - m_needsGradient flags, are propagated up from children --BUGBUG! This should only be computed in ValidateSubNetwork().
// - ComputationNetworkOwnedNodeState::m_visitedOrder (only if 'recurrent' flag is set; otherwise leave untouched), as needed by FormRecurrentNodes()
// TODO: This should be a method of ComputationNetwork, not ComputationNode.
std::list<ComputationNodeBasePtr> EnumerateNodes(bool forForwardProp/*else get order for backprop*/, bool setVisitedOrder)
std::list<ComputationNodeBasePtr> EnumerateNodes(bool forForwardProp/*else get order for backprop*/, bool skipPairNetwork)
{
std::list<ComputationNodeBasePtr> nodes;
std::unordered_set<ComputationNodeBasePtr> visited;
// get forward computation order
EnumerateNodesR(visited, nodes, setVisitedOrder); // call into the recursive portion of this function below
EnumerateNodesR(visited, nodes, skipPairNetwork); // call into the recursive portion of this function below
// if caller wants order for backprop then reverse it
if (!forForwardProp)
@ -588,19 +603,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
private:
// Recursive part of EnumerateNodes().
void EnumerateNodesR(std::unordered_set<ComputationNodeBasePtr>& visited, std::list<ComputationNodeBasePtr>& result, bool setVisitedOrder)
void EnumerateNodesR(std::unordered_set<ComputationNodeBasePtr>& visited, std::list<ComputationNodeBasePtr>& result, bool skipPairNetwork)
{
if (visited.find(shared_from_this()) == visited.end()) // do not include a node twice
{
visited.insert(shared_from_this()); // have visited tagged here to avoid infinite loop over children, children's children, etc
// children first for function evaluation
if (OperationName() != L"PairNetwork" || !setVisitedOrder) // (don't step through network-pair boundary if called from FormRecurrentLoops())
if (OperationName() != L"PairNetwork" || !skipPairNetwork) // (don't step through network-pair boundary if called from FormRecurrentLoops())
{
for (int i = 0; i < m_children.size(); i++)
{
if (m_children[i])
m_children[i]->EnumerateNodesR(visited, result, setVisitedOrder);
m_children[i]->EnumerateNodesR(visited, result, skipPairNetwork);
}
}
@ -614,8 +629,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// now that all children are in list before us, put ourselves
result.push_back(shared_from_this());
#if 0 // this does not work, since m_visitedOrder gets cleared out, while the list survives in a cache
if (setVisitedOrder) // FormRecurrentNodes() would like this variable to be set as well
m_visitedOrder = result.size();
#endif
}
}
public:
@ -636,13 +653,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// check whether a node is up-to-date w.r.t. its children, for lazy evaluation
// If this returns false, node must be evaluated to update m_functionValues.
bool IsFuncValueOlderThanInputs() const
// BUGBUG: The function name is incorrect. It also returns 'true' if a child has the same time stamp (not older).
// This is virtual because it is overridden by traversal nodes.
virtual bool IsFuncValueOlderThanInputs() const
{
for (size_t i = 0; i<ChildrenSize(); i++)
{
#if 1
if (IsOlderThan(*m_children[i]))
return true;
#else
//the second condition is used when the time stamp change from positive to negative
if (m_children[i]->GetEvalTimeStamp() >= GetEvalTimeStamp() || m_children[i]->GetEvalTimeStamp() + 1e10 < GetEvalTimeStamp())
return true;
#endif
}
return false;
@ -709,19 +733,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return name;
}
//request matrices needed to do node function value evaluation
virtual void RequestMatricesBeforeEval(MatrixPool& matrixPool) = 0;
//release temp matrices that are only used by forward computation
//don't release matrices that need to be used in the gradient computation
virtual void ReleaseMatricesAfterEval(MatrixPool& matrixPool) = 0;
//request matrices that are needed for gradient computation
virtual void RequestMatricesBeforeGradientComp(MatrixPool& matrixPool) = 0;
//release gradient and temp matrices that no longer needed after all the children's gradients are computed.
virtual void ReleaseMatricesAfterGradientComp(MatrixPool& matrixPool) = 0;
protected:
// data members
//std::vector<ComputationNodeBasePtr> m_parents; //m_parents are dynamically determined based on the root node you want to compute
@ -857,6 +868,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
}
virtual void AllocateGradientMatricesForChildren(MatrixPool& matrixPool) override
{
for (int i = 0; i < m_children.size(); i++)
{
if (m_children[i]->NeedGradient())
m_children[i]->RequestMatricesBeforeGradientComp(matrixPool);
}
}
//request matrices that are needed for gradient computation
virtual void RequestMatricesBeforeGradientComp(MatrixPool& matrixPool)
{
@ -1367,23 +1387,44 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// FlowControlNode -- special wrapper node for use by ComputationNetwork only
// =======================================================================
class FlowControlNode : public IComputationNode
class FlowControlNode : public ComputationNodeBase
{
typedef ComputationNodeBase Base;
public:
FlowControlNode() : ComputationNodeBase(DEVICEID_NOTYETDETERMINED/*we don't own matrices*/, L""/*name: we don't care*/) { }
#pragma warning (disable: 4100)
// these should never be called on flow-control nodes
virtual ComputationNodeBase * NewThis(DEVICEID_TYPE deviceId, const wstring & name) { NOT_IMPLEMENTED; }
virtual void Validate(bool isFinalValidationPass) { NOT_IMPLEMENTED; } // main base validation function
virtual void InferImageDimsFromInputs() { NOT_IMPLEMENTED; }
virtual void SaveToFile(File& fstream) const { NOT_IMPLEMENTED; }
virtual void LoadFromFile(File& /*fstream*/, size_t /*modelVersion*/) { NOT_IMPLEMENTED; }
virtual void CopyTo(ComputationNodeBasePtr node, const std::wstring& newName, const CopyNodeFlags flags) const { NOT_IMPLEMENTED; }
// these are meant to be implemented by ComputationNode<ElemType> but should never be called on traversal nodes
// TODO: There are too many of these. This indicates improper class hierarchies.
virtual ComputationNodeBase * NewThis(DEVICEID_TYPE deviceId, const wstring & name) override { NOT_IMPLEMENTED; }
virtual void Validate(bool isFinalValidationPass) override { NOT_IMPLEMENTED; } // main base validation function
virtual void InferImageDimsFromInputs() override { NOT_IMPLEMENTED; }
virtual void SaveToFile(File& fstream) const override { NOT_IMPLEMENTED; }
virtual void LoadFromFile(File& /*fstream*/, size_t /*modelVersion*/) override { NOT_IMPLEMENTED; }
virtual void CopyTo(ComputationNodeBasePtr node, const std::wstring& newName, const CopyNodeFlags flags) const override { NOT_IMPLEMENTED; }
virtual ComputationNodeBasePtr Duplicate(const std::wstring& newName, const CopyNodeFlags flags) override { NOT_IMPLEMENTED; }
virtual size_t GetNumRows() const override { NOT_IMPLEMENTED; }
virtual size_t GetNumCols() const override { NOT_IMPLEMENTED; }
virtual void Resize(size_t rows, size_t cols) override { NOT_IMPLEMENTED; }
virtual double Get00Element() const override { NOT_IMPLEMENTED; }
virtual void AttachInputs(const std::vector<ComputationNodeBasePtr>& inputs) override { NOT_IMPLEMENTED; }
virtual void PrintSelf(bool) const override { NOT_IMPLEMENTED; }
virtual void ValidateInferChildDims(size_t,size_t,size_t) override { NOT_IMPLEMENTED; }
virtual void SetInput(const size_t,const Microsoft::MSR::CNTK::ComputationNodeBase::ComputationNodeBasePtr &) override { NOT_IMPLEMENTED; }
virtual void ClearGradientForChildren(void) override { NOT_IMPLEMENTED; }
virtual void MaskMissingValuesColumnsToZero(const Microsoft::MSR::CNTK::FrameRange &) override { NOT_IMPLEMENTED; }
virtual void MaskMissingGradientColumnsToZero(const Microsoft::MSR::CNTK::FrameRange &) override { NOT_IMPLEMENTED; }
virtual void InvalidateMissingValuesColumns(const Microsoft::MSR::CNTK::FrameRange &) override { NOT_IMPLEMENTED; }
virtual void InvalidateMissingGradientColumns(const Microsoft::MSR::CNTK::FrameRange &) override { NOT_IMPLEMENTED; }
virtual std::wstring ToString(void) const override { NOT_IMPLEMENTED; }
// these are meant to be called during computation, so provide dummy implementations
virtual bool RequiresPreCompute() const { return false; } // return true if the node's value should be computed before the normal training. e.g., mean and invStd of input features.
virtual bool NodeDoesItsOwnCustomizedMissingColumnsMasking() { return true; }
virtual void PrintSelfBeforeValidation() const { }
virtual void DumpNodeInfo(const bool /*printValues*/, File& fstream) const { }
virtual bool RequiresPreCompute() const override { return false; } // return true if the node's value should be computed before the normal training. e.g., mean and invStd of input features.
virtual bool NodeDoesItsOwnCustomizedMissingColumnsMasking() override { return true; }
virtual void PrintSelfBeforeValidation() const override { }
virtual void DumpNodeInfo(const bool /*printValues*/, File& fstream) const override { }
protected:
public: // needed in ComputationNetwork::FindInRecurrentLoops(), which really should be part of RecurrentFlowControlNode
std::vector<ComputationNodeBasePtr> m_nestedNodes; // nodes tucked away in this node, in evaluation order
};
// =======================================================================

Просмотреть файл

@ -1824,11 +1824,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (useDistributedMBReading)
{
trainSetDataReader->StartDistributedMinibatchLoop(tunedMBSize, epochNumber, g_mpi->CurrentNodeRank(),
g_mpi->NumNodesInUse(), m_epochSize);
g_mpi->NumNodesInUse(), epochSize);
}
else
{
trainSetDataReader->StartMinibatchLoop(tunedMBSize, epochNumber, m_epochSize);
trainSetDataReader->StartMinibatchLoop(tunedMBSize, epochNumber, epochSize);
}
net.StartEvaluateMinibatchLoop(evaluationNodes);
@ -2160,9 +2160,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
totalEpochSamples += aggregateNumSamplesWithLabel;
totalSamplesSeen += aggregateNumSamplesWithLabel;
if (totalEpochSamples >= epochSize)
break;
// call DataEnd function
// This signals something from SGD to the reader.
// DataEnd does reader specific process if sentence ending is reached

Просмотреть файл

@ -371,6 +371,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_matrixName=NULL;
m_format = matrixFormatDense;
m_externalBuffer = false;
m_workspace = nullptr;
}
template<class ElemType>
@ -503,6 +504,24 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return m_computeDevice;
}
template<class ElemType>
std::unique_ptr<GPUMatrix<ElemType>> GPUMatrix<ElemType>::GetOrCreateWorkspace() const
{
// REVIEW alexeyk: not thread-safe, fine for now.
if (m_workspace == nullptr)
m_workspace = new conc_stack<std::unique_ptr<GPUMatrix<ElemType>>>();
assert(m_workspace != nullptr);
auto deviceId = m_computeDevice;
return m_workspace->pop_or_create([deviceId]() { return std::make_unique<GPUMatrix<ElemType>>(deviceId); });
}
template<class ElemType>
void GPUMatrix<ElemType>::ReleaseWorkspace(std::unique_ptr<GPUMatrix<ElemType>> src) const
{
assert(m_workspace != nullptr);
m_workspace->push(std::move(src));
}
#pragma region Basic Operators
template<class ElemType>
GPUMatrix<ElemType> GPUMatrix<ElemType>::ColumnSlice(size_t startColumn, size_t numCols) const
@ -3052,10 +3071,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
cbtemp = ctemp * sizeof(ElemType);
// ElemType count needed to store indices, accounting for natural alignment for uint64_t type.
size_t cidx = ((celt + 1) * sizeof(uint64_t) - 1 + sizeof(ElemType) - 1) / sizeof(ElemType);
// Prepare temp workspace.
auto deviceId = m_computeDevice;
assert(m_workspace != nullptr);
auto workspace = m_workspace->pop_or_create([deviceId]() { return std::make_unique<GPUMatrix<ElemType>>(deviceId); });
// Get temp workspace.
auto workspace = GetOrCreateWorkspace();
// Resize to store: output values for the 1st and 2nd passes, input indices, output indices, and temp storage.
workspace->Resize(m, 2 * n + (2 * cidx + ctemp + m - 1) / m);
outVal1 = workspace->m_pArray;
@ -3081,7 +3098,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
cblock = (topK * n + ThreadsPerBlock - 1) / ThreadsPerBlock;
_copyTopKResults<<<cblock, ThreadsPerBlock, 0, t_stream>>>(inIdx, outVal2, maxIndexes.m_pArray, maxValues.m_pArray, m, n, topK);
m_workspace->push(std::move(workspace));
ReleaseWorkspace(std::move(workspace));
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));

Просмотреть файл

@ -92,9 +92,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
static cublasHandle_t s_cuHandle[MaxGpus];
static void *s_curandGenerator;
// Have to use naked pointer to avoid issues with __declspec(dllexport) on Windows.
// REVIEW alexeyk: can be allocated lazily but the current footprint is small anyway.
mutable conc_stack<std::unique_ptr<GPUMatrix<ElemType>>>* m_workspace = new conc_stack<std::unique_ptr<GPUMatrix<ElemType>>>;
// Have to use naked pointer to avoid issues with __declspec(dllexport) on Windows (C4251).
// Cannot use atomic for the same reason either.
mutable conc_stack<std::unique_ptr<GPUMatrix<ElemType>>>* m_workspace;
private:
void performInplaceFunction(int kind);
@ -102,6 +102,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t LocateColumn (const size_t j) const;
void Clear();
void ZeroInit(int deviceId);
std::unique_ptr<GPUMatrix<ElemType>> GetOrCreateWorkspace() const;
void ReleaseWorkspace(std::unique_ptr<GPUMatrix<ElemType>> src) const;
public:
GPUMatrix(int deviceId);

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Разница между файлами не показана из-за своего большого размера Загрузить разницу