cleaned up state control of MeanNode and InvStdDevNode. They no longer fail the NaN check on m_functionValues. Also factored the shared state handling out into a common base class MeanInvStdDevNodeBase.

changed the pre-compute protocol: callers must now call MarkComputed(false) upfront to signal initialization of the accumulators; bug fix in GetNodesRequiringX(): it must use a dynamic cast, not a static cast; BuildAndValidateSubNetwork() is no longer called on demand by Evaluate(), but must have been called beforehand, which is done through a new method StartEvaluateMinibatchLoop(); some (hopefully all) places that call Evaluate() now call StartEvaluateMinibatchLoop() beforehand, e.g. SGD and SimpleEvaluator; new method Evaluate(set of nodes);
2015-10-06 13:29:49 -07:00 · 2015-10-06 13:29:49 -07:00 · 8dae07f0ac
--- a/Common/Eval.cpp
+++ b/Common/Eval.cpp
@ -122,4 +122,4 @@ void Eval<ElemType>::ResetState()
 template class Eval<double>; 
 template class Eval<float>;

-}}}
+}}}
--- a/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h
+++ b/MachineLearning/CNTKComputationNetworkLib/CompositeComputationNodes.h
@ -204,15 +204,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        typedef ComputationNodeNonLooping<ElemType> Base; UsingComputationNodeMembers;
    public:
        //virtual ComputationNodeBase * NewThis(DEVICEID_TYPE deviceId, const wstring & name) = 0;
-        PreComputedNode(DEVICEID_TYPE deviceId, const wstring & name) : Base(deviceId, name)
-        {
-            // further initializations
-            m_hasComputed = false;
-        }
+        PreComputedNode(DEVICEID_TYPE deviceId, const wstring & name) :
+            Base(deviceId, name),
+            m_hasComputed(false)
+        { }

        // interface through which this node is operated on are these two functions
+
+        // check whether node has already undergone precomputation
        virtual bool HasComputed() const { return m_hasComputed; }
-        virtual void MarkComputed(const bool hasComputed)       // override this for further finalizing operation
+
+        // call this with 'false' at start and with 'true' at end
+        // This is used for resetting and updating from accumulators.
+        virtual void MarkComputed(const bool hasComputed)
        {
            m_hasComputed = hasComputed;
        }
@ -223,7 +227,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        {
            Base::SaveToFile(fstream);
            fstream << m_hasComputed;
-            fstream << m_functionValues;
+            fstream << m_functionValues;    // TODO: why serialize if not yet computed?
        }

        virtual void LoadFromFile(File& fstream, size_t modelVersion) override
@ -271,34 +275,46 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        bool m_hasComputed;
    };

-#define UsingPreComputedNodeMembers UsingComputationNodeMembersBoilerplate; using Base::m_hasComputed
+#define UsingPreComputedNodeMembers UsingComputationNodeMembers; using Base::m_hasComputed

    // -----------------------------------------------------------------------
-    // MeanNode (features)
+    // MeanInvStdDevNodeBase (features)  -- common base class for Mean and InvStdDev
    // -----------------------------------------------------------------------

    template<class ElemType>
-    class MeanNode : public PreComputedNode<ElemType>, public NumInputs<1>
+    class MeanInvStdDevNodeBase : public PreComputedNode<ElemType>, public NumInputs<1>
    {
        typedef PreComputedNode<ElemType> Base; UsingPreComputedNodeMembers;
-        static const std::wstring TypeName() { return L"Mean"; }
+        //static const std::wstring TypeName() { return L"MeanInvStdDev (base)"; }
    public:
-        MeanNode(DEVICEID_TYPE deviceId, const wstring & name) :
+        MeanInvStdDevNodeBase(DEVICEID_TYPE deviceId, const wstring & name) :
            PreComputedNode<ElemType>(deviceId, name),
-            m_numSamples(0)
+            m_numSamples(SIZE_MAX)
        { }

        virtual void LoadFromFile(File& fstream, size_t modelVersion) override
        {
            Base::LoadFromFile(fstream, modelVersion);
-            m_numSamples = 0;   // TODO: intended? Not loaded from file?
+            m_numSamples = SIZE_MAX;
        }

        virtual void /*PreComputedNode::*/MarkComputed(const bool hasComputed)
        {
            Base::MarkComputed(hasComputed);
-            if (m_hasComputed)
+            if (!m_hasComputed)     // initialize
+            {
+                if (IsAccumulating())
+                    LogicError("%ls %ls operation: MarkComputed(false) has been called while accumulating.", NodeName().c_str(), OperationName().c_str());
                m_numSamples = 0;
+            }
+            else                    // finalize
+            {
+                if (!IsAccumulating())
+                    LogicError("%ls %ls operation: MarkComputed(true) has been called without MarkComputed(false) first.", NodeName().c_str(), OperationName().c_str());
+                if (m_numSamples == 0)
+                    LogicError("%ls %ls operation: No data accumulated during precomputation.", NodeName().c_str(), OperationName().c_str());
+                m_numSamples = SIZE_MAX;
+            }
        }

        virtual void ComputeInputPartial(const size_t /*inputIndex*/)
@ -306,51 +322,68 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            LogicError("Mean operation should not be involved in the gradient calculation.");
        }

-        virtual void /*ComputationNodeNonLooping::*/EvaluateThisNodeNonLooping() override
-        {
-            if (!m_hasComputed)
-            {
-                Matrix<ElemType> &samples = Inputs(0)->FunctionValues();
-                Matrix<ElemType> &avg = FunctionValues();
-#if 1//NANCHECK
-                samples.HasNan("Mean-Samples");
-#endif
-
-                size_t numNewSamples = samples.GetNumCols();
-                Matrix<ElemType>::MultiplyAndWeightedAdd(1.0f / (m_numSamples + samples.GetNumCols()), samples, false,
-                                                         ConstOnes(numNewSamples, 1, samples.GetDeviceId()),
-                                                         false, (ElemType)m_numSamples / (m_numSamples + numNewSamples), avg);
-
-#if 1//NANCHECK
-                avg.HasNan("Mean-avg");
-                //ones.HasNan("Mean-ones");
-#endif
-
-                m_numSamples += numNewSamples;
-            }
-        }
-
-        virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
-        {
-            Base::Validate(isFinalValidationPass);
-            if (!m_hasComputed)
-            {
-                FunctionValues().SetValue(0);    // reset accumulator
-                fprintf(stderr, "Mean: SetValue(0)\n");
-            }
-        }
-
        virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const
        {
            Base::CopyTo(nodeP, newName, flags);
            if (flags & CopyNodeFlags::copyNodeValue)
            {
-                auto node = dynamic_pointer_cast<MeanNode<ElemType>>(nodeP);
-                node->m_numSamples = m_numSamples;
+                if (m_numSamples != SIZE_MAX)
+                    LogicError("%ls %ls operation: CopyTo() called while accumulating.", NodeName().c_str(), OperationName().c_str());
+                auto node = dynamic_pointer_cast<MeanInvStdDevNodeBase<ElemType>>(nodeP);
+                node->m_numSamples = SIZE_MAX;
            }
        }
-    private:
-        size_t m_numSamples;    // TODO: move to base class?
+    protected:
+        size_t m_numSamples;    // (SIZE_MAX while outside accumulation state)
+        bool IsAccumulating() const { return m_numSamples != SIZE_MAX; }
+    };
+
+    // -----------------------------------------------------------------------
+    // MeanNode (features)
+    // -----------------------------------------------------------------------
+
+    template<class ElemType>
+    class MeanNode : public MeanInvStdDevNodeBase<ElemType>
+    {
+        typedef MeanInvStdDevNodeBase<ElemType> Base; ComputationNodeBoilerplate; UsingPreComputedNodeMembers;
+        static const std::wstring TypeName() { return L"Mean"; }
+    public:
+        MeanNode(DEVICEID_TYPE deviceId, const wstring & name) :
+            Base(deviceId, name)
+        { }
+
+        virtual void /*PreComputedNode::*/MarkComputed(const bool hasComputed)
+        {
+            Base::MarkComputed(hasComputed);
+            if (!m_hasComputed)     // initialize accumulation
+                FunctionValues().SetValue(0);
+            // no else branch because EvaluateThisNodeNonLooping() already leaves a valid mean in m_functionValues
+        }
+
+        virtual void /*ComputationNodeNonLooping::*/EvaluateThisNodeNonLooping() override
+        {
+            if (m_hasComputed)
+                return;     // not accumulating
+
+            if (!IsAccumulating())
+                LogicError("%ls %ls operation: MarkComputed(false) has not been called.", NodeName().c_str(), OperationName().c_str());
+
+            Matrix<ElemType> &samples = Inputs(0)->FunctionValues();
+            Matrix<ElemType> &avg = FunctionValues();
+
+#if 1//NANCHECK
+            samples.HasNan("Mean-Samples");
+#endif
+            size_t numNewSamples = samples.GetNumCols();
+            Matrix<ElemType>::MultiplyAndWeightedAdd(1.0f / (m_numSamples + samples.GetNumCols()), samples, false,
+                                                        ConstOnes(numNewSamples, 1, samples.GetDeviceId()),
+                                                        false, (ElemType)m_numSamples / (m_numSamples + numNewSamples), avg);
+#if 1//NANCHECK
+            avg.HasNan("Mean-avg");
+#endif
+
+            m_numSamples += numNewSamples;
+        }
    };

    template class MeanNode<float>;
@ -362,32 +395,34 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    // -----------------------------------------------------------------------

    template<class ElemType>
-    class InvStdDevNode : public PreComputedNode<ElemType>, public NumInputs<1>
+    class InvStdDevNode : public MeanInvStdDevNodeBase<ElemType>
    {
-        typedef PreComputedNode<ElemType> Base; UsingPreComputedNodeMembers;
+        typedef MeanInvStdDevNodeBase<ElemType> Base; ComputationNodeBoilerplate; UsingPreComputedNodeMembers;
        static const std::wstring TypeName() { return L"InvStdDev"; }
    public:
        InvStdDevNode(DEVICEID_TYPE deviceId, const wstring & name) :
-            PreComputedNode<ElemType>(deviceId, name),
-            m_mean(deviceId), m_var(deviceId), m_temp(deviceId),
-            m_numSamples(0)
+            Base(deviceId, name),
+            m_mean(deviceId), m_var(deviceId), m_temp(deviceId)
        { }

-        virtual void LoadFromFile(File& fstream, size_t modelVersion) override
-        {
-            Base::LoadFromFile(fstream, modelVersion);
-            m_numSamples = 0;   // TODO: intended? not loading from file?
-        }
-
        virtual void /*PreComputedNode::*/MarkComputed(const bool hasComputed) override
        {
            Base::MarkComputed(hasComputed);

-            if (m_hasComputed && m_numSamples > 0)  //m_numSamples>0 means it's not called from model loading
+            if (!m_hasComputed) // initialize
+            {
+                // reset accumulators
+                size_t inputDim = Inputs(0)->GetNumRows();
+                m_mean.Resize(inputDim, 1);
+                m_var.Resize(inputDim, 1);
+                m_mean.SetValue(0);
+                m_var.SetValue(0);
+                FunctionValues().SetValue(0);   // also set this because not doing it may flag during debugging; avoids special-casing this
+            }
+            else                // finalize
            {
                ElemType sqrtFloor = 1e-10f;
-
-                m_var.InplaceTruncateBottom(sqrtFloor); //prevent too small variance (and negative square roots)
+                m_var.InplaceTruncateBottom(sqrtFloor);     // prevent too small variance (and negative square roots due to numeric inaccuracy)
 #if 1//NANCHECK
                m_var.HasNan("MarkComputed-InplaceTruncateBottom");
 #endif
@ -402,63 +437,43 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                m_var.HasNan("MarkComputed-ElementInverse()");
 #endif
                FunctionValues().SetValue(m_var);
-
-                m_numSamples = 0;
            }
        }

-        virtual void ComputeInputPartial(const size_t /*inputIndex*/) override
-        {
-            LogicError("InvStdDev operation should not be involved in the gradient calculation.");
-        }
-
        virtual void /*ComputationNodeNonLooping::*/EvaluateThisNodeNonLooping() override
        {
-            if (!m_hasComputed)
-            {
-                Matrix<ElemType> &samples = Inputs(0)->FunctionValues();
+            if (m_hasComputed)
+                return;     // not accumulating
+
+            if (!IsAccumulating())
+                LogicError("%ls %ls operation: MarkComputed(false) has not been called.", NodeName().c_str(), OperationName().c_str());
+
+            Matrix<ElemType> &samples = Inputs(0)->FunctionValues();
 #if 1//NANCHECK
-                samples.HasNan("InvStdDev-Samples");
+            samples.HasNan("InvStdDev-Samples");
 #endif
-                m_temp.SetValue(m_mean);
-                size_t numNewSample = samples.GetNumCols();
-                Matrix<ElemType>::MultiplyAndWeightedAdd(1.0f / (m_numSamples + numNewSample), samples, false,
-                                                         ConstOnes(numNewSample, 1, samples.GetDeviceId()),
-                                                         false, (ElemType)m_numSamples / (m_numSamples + numNewSample), m_mean);
+            m_temp.SetValue(m_mean);
+            size_t numNewSample = samples.GetNumCols();
+            Matrix<ElemType>::MultiplyAndWeightedAdd(1.0f / (m_numSamples + numNewSample), samples, false,
+                                                        ConstOnes(numNewSample, 1, samples.GetDeviceId()),
+                                                        false, (ElemType)m_numSamples / (m_numSamples + numNewSample), m_mean);

-                m_temp -= m_mean;
-                m_temp.AssignElementPowerOf(m_temp, 2);
-                m_var += m_temp;
+            m_temp -= m_mean;
+            m_temp.AssignElementPowerOf(m_temp, 2);
+            m_var += m_temp;

-                m_temp.AssignDifferenceOf(samples, m_mean);
-                m_temp.AssignElementPowerOf(m_temp, 2);
+            m_temp.AssignDifferenceOf(samples, m_mean);
+            m_temp.AssignElementPowerOf(m_temp, 2);

-                Matrix<ElemType>::MultiplyAndWeightedAdd(1.0f / (m_numSamples + numNewSample), m_temp, false,
-                                                         ConstOnes(numNewSample, 1, samples.GetDeviceId()),
-                                                         false, (ElemType)m_numSamples / (m_numSamples + numNewSample), m_var);
+            Matrix<ElemType>::MultiplyAndWeightedAdd(1.0f / (m_numSamples + numNewSample), m_temp, false,
+                                                        ConstOnes(numNewSample, 1, samples.GetDeviceId()),
+                                                        false, (ElemType)m_numSamples / (m_numSamples + numNewSample), m_var);

 #if 1//NANCHECK
-                m_var.HasNan("InvStdDev-m_var");
+            m_var.HasNan("InvStdDev-m_var");
 #endif

-                m_numSamples += samples.GetNumCols();
-            }
-        }
-
-        virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
-        {
-            Base::Validate(isFinalValidationPass);
-
-            if (!m_hasComputed)
-            {
-                size_t inputDim = Inputs(0)->GetNumRows();
-                m_mean.Resize(inputDim, 1);
-                m_var.Resize(inputDim, 1);
-                // reset accumulators
-                m_mean.SetValue(0);
-                m_var.SetValue(0);
-                fprintf(stderr, "InvStdDev: SetValue(0)\n");
-            }
+            m_numSamples += samples.GetNumCols();
        }

        virtual void MoveMatricesToDevice(const DEVICEID_TYPE deviceId) override
@ -475,15 +490,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            if (flags & CopyNodeFlags::copyNodeValue)
            {
                auto node = dynamic_pointer_cast<InvStdDevNode<ElemType>>(nodeP);
-                node->m_numSamples = m_numSamples;
-
                node->m_mean = m_mean;
                node->m_var = m_var;
-                node-> m_temp =  m_temp;
+                node->m_temp =  m_temp;
            }
        }
    private:
-        size_t m_numSamples;
        Matrix<ElemType> m_mean;
        Matrix<ElemType> m_var;
        Matrix<ElemType>  m_temp;
--- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp
+++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.cpp
@ -430,7 +430,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        }
    }

-    // prepares the network for computation
+    bool ComputationNetwork::BuiltAndValidatedSubNetwork(const ComputationNodeBasePtr & rootNode)
+    {
+        return m_built.find(rootNode) != m_built.end();
+    }
+
+    // prepare to compute with the subnetwork that this rootNode depends on, including
+    //  - auto-detecting recurrent loops
+    //  - collect input and learnable nodes
+    //  - calling Validate() on all nodes lazily, which sizes all matrices (column dimensions get updated to MB size)
    // Done lazily, called for every minibatch's invocation of EvaluateNode(), but memoizing which nodes were done already.
    // BUGBUG? Lazy triggers on the root node. I.e. for two different root nodes (training, eval), it validates twice.
    void ComputationNetwork::BuildAndValidateSubNetwork(const ComputationNodeBasePtr rootNode)
@ -558,53 +566,53 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    }
 #endif

-    template<class N> void ComputationNetwork::GetNodesRequiringX(std::list<ComputationNodeBasePtr> & nodesRequirePreComputation, const ComputationNodeBasePtr rootNode, bool checkComputed)
+    template<class N> void ComputationNetwork::GetNodesRequiringX(std::list<ComputationNodeBasePtr> & nodesRequiringX, const ComputationNodeBasePtr rootNode, bool checkComputed)
    {
-        if (rootNode == nullptr)        // find nodes from all available nodes
+        if (!rootNode)              // find nodes from all available nodes
        {
-            for (auto nodeIter = m_nameToNodeMap.begin(); nodeIter != m_nameToNodeMap.end(); nodeIter++)
+            for (const auto & nodep : m_nameToNodeMap)
            {
-                ComputationNodeBasePtr node = nodeIter->second;
-                if (node->RequiresPreCompute()) // TODO: why not check directly for the type with a dynamic_cast?
+                auto node = dynamic_pointer_cast<N>(nodep.second);
+                if (node)
                {
-                    auto preComputedNode = static_pointer_cast<N>(node);
-                    if (!checkComputed || !preComputedNode->HasComputed())
-                        nodesRequirePreComputation.push_back(node);
+                    assert(node->RequiresPreCompute());
+                    if (!checkComputed || !node->HasComputed())
+                        nodesRequiringX.push_back(node);
                }
            }
        }
        else                            // or for calculating a specific node
        {
-            const auto & nodes = GetEvalOrder(rootNode, false);
-            for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++)
+            for (const auto & nodei : GetEvalOrder(rootNode, false))
            {
-                ComputationNodeBasePtr node = *nodeIter;
-                if (node->RequiresPreCompute()) // TODO: why not check directly for the type with a dynamic_cast?
+                auto node = dynamic_pointer_cast<N>(nodei);
+                if (node)
                {
-                    auto preComputedNode = static_pointer_cast<N>(node);
-                    if (!checkComputed || !preComputedNode->HasComputed())
-                        nodesRequirePreComputation.push_back(node);
+                    assert(node->RequiresPreCompute());
+                    if (!checkComputed || !node->HasComputed())
+                        nodesRequiringX.push_back(node);
                }
            }
        }
+        nodesRequiringX.unique();
    }

    //return list of nodes that require precomputation and not precomputed yet.
    std::list<ComputationNodeBasePtr> ComputationNetwork::GetNodesRequiringPreComputation(const ComputationNodeBasePtr rootNode, bool checkComputed)
    {
-        std::list<ComputationNodeBasePtr> nodesRequirePreComputation;
-        GetNodesRequiringX<PreComputedNode<float>>(nodesRequirePreComputation, rootNode, checkComputed);
-        GetNodesRequiringX<PreComputedNode<double>>(nodesRequirePreComputation, rootNode, checkComputed);
-        return nodesRequirePreComputation;
+        std::list<ComputationNodeBasePtr> nodesRequiringX;
+        GetNodesRequiringX<PreComputedNode<float>>(nodesRequiringX, rootNode, checkComputed);
+        GetNodesRequiringX<PreComputedNode<double>>(nodesRequiringX, rootNode, checkComputed);
+        return nodesRequiringX;
    }

    //return list of nodes that require batch mode and not precomputed yet.
    std::list<ComputationNodeBasePtr> ComputationNetwork::GetNodesRequiringBatchMode(const ComputationNodeBasePtr rootNode, bool checkComputed)
    {
-        std::list<ComputationNodeBasePtr> nodesRequirePreComputation;
-        GetNodesRequiringX<BatchModeNode<float>>(nodesRequirePreComputation, rootNode, checkComputed);
-        GetNodesRequiringX<BatchModeNode<double>>(nodesRequirePreComputation, rootNode, checkComputed);
-        return nodesRequirePreComputation;
+        std::list<ComputationNodeBasePtr> nodesRequiringX;
+        GetNodesRequiringX<BatchModeNode<float>>(nodesRequiringX, rootNode, checkComputed);
+        GetNodesRequiringX<BatchModeNode<double>>(nodesRequiringX, rootNode, checkComputed);
+        return nodesRequiringX;
    }

    // The methods below determine evaluation order, which is tricky in presence of recurrent loops.
--- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h
+++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetwork.h
@ -565,13 +565,12 @@ public:
    //  - these must be executed frame by frame rather than as a map
    //  - such a loop is treated as if they were a little nested network; this is done inside here
    //  - these little nested networks are defined in m_recurrentInfo[]
-    void Evaluate(const ComputationNodeBasePtr rootNode)
+    void Evaluate(const ComputationNodeBasePtr & rootNode)
    {
-        // prepare to compute with the subnetwork that this rootNode depends on, including
-        //  - auto-detecting recurrent loops
-        //  - collect input and learnable nodes
-        //  - calling Validate() on all nodes lazily, which sizes all matrices (column dimensions get updated to MB size)
-        BuildAndValidateSubNetwork(rootNode);
+        // Precondition: the caller must already have called BuildAndValidateSubNetwork() for rootNode (e.g. via StartEvaluateMinibatchLoop()).
+        // TODO: Some places are hard to fix, e.g. encoder-decoder best-path functions. Those may be broken; this message will tell you.
+        if (!BuiltAndValidatedSubNetwork(rootNode))
+            LogicError("Evaluate for node %ls %ls: BuildAndValidateSubNetwork() has not been called on this node.", rootNode->NodeName().c_str(), rootNode->OperationName().c_str());

        // determines order of evaluation, such that children get evaluated before their parent nodes
        std::list<ComputationNodeBasePtr>& allNodes = GetEvalOrder(rootNode, false);
@ -692,6 +691,12 @@ public:
            }
        }
    }
+    template<class NODESET>
+    void Evaluate(const NODESET & nodes)
+    {
+        for (auto & node : nodes)
+            Evaluate(node);
+    }

    // propagate the features' MB size to all nodes of the network
    // TODO: This function should go. Resizing is now part of Validate() and EvaluateThisNode().
@ -1233,6 +1238,24 @@ private:
 public:
    // prepares the network for computation
    void BuildAndValidateSubNetwork(const ComputationNodeBasePtr rootNode);
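+    // same for a single root node, to be called before a minibatch loop; the overloads below do this for one or two sets of nodes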
+    void StartEvaluateMinibatchLoop(const ComputationNodeBasePtr & rootNode)  // (ugly name; meant to be unique so we can rename if needed)
+    {
+        BuildAndValidateSubNetwork(rootNode);
+    }
+    template<class NODESET>
+    void StartEvaluateMinibatchLoop(const NODESET & nodes)  // (ugly name; meant to be unique so we can rename if needed)
+    {
+        for (auto & node : nodes)
+            StartEvaluateMinibatchLoop(node);
+    }
+    template<class NODESET>
+    void StartEvaluateMinibatchLoop(const NODESET & nodes1, const NODESET & nodes2) // often needed for two sets (training & evaluation criteria)
+    {
+        StartEvaluateMinibatchLoop(nodes1);
+        StartEvaluateMinibatchLoop(nodes2);
+    }
+    bool BuiltAndValidatedSubNetwork(const ComputationNodeBasePtr & rootNode);

    //this function will need to be called before actual validation and execution to 
    //predetermine how to share matrices to reduce memory usage.
--- a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h
+++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h
@ -503,9 +503,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {

        virtual void /*IComputationNode::*/OnEvaluateBeginIteration()             // called before first iteration step of EvaluateThisNode()
        {
-            fprintf(stderr, "Trace: %ls %ls operation\n", NodeName().c_str(), OperationName().c_str());
+            fprintf(stderr, "OnEvaluateBeginIteration: %ls %ls operation\n", NodeName().c_str(), OperationName().c_str());
+        }
+        virtual void /*IComputationNode::*/OnEvaluateEndIteration()               // called after last iteration step of EvaluateThisNode()
+        {
+            fprintf(stderr, "OnEvaluateEndIteration: %ls %ls operation\n", NodeName().c_str(), OperationName().c_str());
        }
-        virtual void /*IComputationNode::*/OnEvaluateEndIteration() { }               // called after last iteration step of EvaluateThisNode()

    protected:

@ -823,6 +826,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    template<class ElemType>
    class ComputationNode : public ComputationNodeBase // abstract class that cannot be instantiated
    {
+        typedef ComputationNodeBase Base;
    protected:
        //std containers such as list and map does not support class reference so we need to use pointer
        typedef shared_ptr<ComputationNode<ElemType>> ComputationNodePtr;
@ -1249,6 +1253,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 #ifdef _DEBUG
        virtual void /*IComputationNode::*/OnEvaluateEndIteration()               // called after last iteration step of EvaluateThisNode()
        {
+            Base::OnEvaluateEndIteration();
            MaskMissingValuesColumnsToZero();
            if (m_functionValues.HasNan("OnEvaluateEndIteration"))
                LogicError("%ls %ls operation unexpectedly produced NaN values.", NodeName().c_str(), OperationName().c_str());
@ -1487,11 +1492,13 @@ public: \
    using Base::SaveToFile; using Base::UpdateFunctionAndGradientMBSize; using Base::SetInput; \
    using Base::Validate; using Base::ValidateUnaryMap; using Base::ValidateBinaryZip; using Base::ValidateUnaryReduce; using Base::ValidateBinaryReduce; using Base::ValidateInferBinaryChildren; using Base::ValidateInferInputSize

-#define UsingComputationNodeMembersBoilerplate \
+#define ComputationNodeBoilerplate \
 protected:    /* some boilerplate goes here */ \
    virtual const std::wstring OperationName() const override { return TypeName(); } \
-    virtual ComputationNodeBase * NewThis(DEVICEID_TYPE deviceId, const wstring & name) override { return new typename std::remove_reference<decltype(*this)>::type(deviceId, name); } \
-    UsingComputationNodeMembers
+    virtual ComputationNodeBase * NewThis(DEVICEID_TYPE deviceId, const wstring & name) override { return new typename std::remove_reference<decltype(*this)>::type(deviceId, name); }
+
+#define UsingComputationNodeMembersBoilerplate \
+    ComputationNodeBoilerplate; UsingComputationNodeMembers

 #pragma endregion base computation class

--- a/MachineLearning/CNTKEval/CNTKEvalTest/CNTKEvalTest.cpp
+++ b/MachineLearning/CNTKEval/CNTKEvalTest/CNTKEvalTest.cpp
@ -49,6 +49,7 @@ void DoCommand(const ConfigParameters& configRoot)
    DataReader<ElemType>* dataReader = new DataReader<ElemType>(readerConfig);
    eval.LoadModel(modelPath);
    dataReader->StartMinibatchLoop(mbSize, 0, epochSize);
+    eval.StartEvaluateMinibatchLoop(outputName);
    while (dataReader->GetMinibatch(inputMatrices))
    {
        void* data = (void*)arr->data();
--- a/MachineLearning/CNTKSGDLib/SGD.cpp
+++ b/MachineLearning/CNTKSGDLib/SGD.cpp
@ -1145,7 +1145,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                    std::vector<ComputationNodeBasePtr> & labelNodes,
                    std::map<std::wstring, Matrix<ElemType>*>* inputMatrices)
    {
-        std::list<ComputationNodeBasePtr> nodes = net.GetNodesRequiringPreComputation();
+        std::list<ComputationNodeBasePtr> nodes = net.GetNodesRequiringPreComputation();    // this tests all HasComputed() flags

        if (nodes.size() == 0)
        {
@ -1169,39 +1169,28 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0);
        else                                    // using only one epoch
            trainSetDataReader->StartMinibatchLoop(m_mbSize[0], 0, m_epochSize);
-#if 1
-        size_t actualMBSize;
-        while (DataReaderHelpers::GetMinibatchIntoNetwork(*trainSetDataReader, net, nullptr, false, false, *inputMatrices, actualMBSize))
+        net.StartEvaluateMinibatchLoop(nodes);
+
+        // initialize
+        for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++)
+        {
+            auto node = static_pointer_cast<PreComputedNode<ElemType>>(*nodeIter);
+            node->MarkComputed(false/*begin accumulating*/);
+        }
+        size_t actualMBSizeDummy;
+        while (DataReaderHelpers::GetMinibatchIntoNetwork(*trainSetDataReader, net, nullptr, false, false, *inputMatrices, actualMBSizeDummy))
        {
            // TODO: move these into GetMinibatchIntoNetwork()  --but those are passed around; necessary? Can't we get them from 'net'?
            ComputationNetwork::UpdateEvalTimeStamps(featureNodes);
            ComputationNetwork::UpdateEvalTimeStamps(labelNodes);

-            for (auto & node : nodes)   // this loops over all pertinent PreComputeNodes
-                net.Evaluate(node);
+            net.Evaluate(nodes);
        }
-#else
-        while (trainSetDataReader->GetMinibatch(*inputMatrices))
-        {
-            // TODO: use GetMinibatchIntoNetwork(), should be easy
-            ComputationNetwork::UpdateEvalTimeStamps(featureNodes);
-            ComputationNetwork::UpdateEvalTimeStamps(labelNodes);
-
-            net.SetActualMiniBatchSizeFromFeatures();
-            trainSetDataReader->CopyMBLayoutTo(net.GetMBLayoutPtr());
-            net.VerifyActualNumParallelSequences(trainSetDataReader->GetNumParallelSequences());
-
-            // TODO: Exactly this loop should be INSIDE ComputationNetwork--pass the nodes array instead!
-            for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++)
-                net.Evaluate(*nodeIter);
-        }
-#endif
-
-        // mark done
+        // finalize
        for (auto nodeIter = nodes.begin(); nodeIter != nodes.end(); nodeIter++)
        {
            auto node = static_pointer_cast<PreComputedNode<ElemType>>(*nodeIter);
-            node->MarkComputed(true);
+            node->MarkComputed(true/*done accumulating*/);
        }

        return true;
@ -1714,13 +1703,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {

        int numMBsRun = 0;

-        size_t numEvalNodes = epochEvalErrors.size();
-
        // NOTE: the following two local matrices are not used in distGradAgg path
        // assume only one training criterion node for each epoch.
        // The criterion values are accumulated here over the minibatches (without having to pull them off the GPU).
        Matrix<ElemType> localEpochCriterion(1, 1, net.GetDeviceId());
-        Matrix<ElemType> localEpochEvalErrors(1, numEvalNodes, net.GetDeviceId());
+        Matrix<ElemType> localEpochEvalErrors(1, epochEvalErrors.size(), net.GetDeviceId());

        localEpochCriterion.SetValue(0);
        localEpochEvalErrors.SetValue(0);
@ -1740,7 +1727,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        if (useGradientAggregation)
        {
            epochCriterion = double(0.0);
-            epochEvalErrors.assign(numEvalNodes, double(0.0));
+            epochEvalErrors.assign(epochEvalErrors.size(), double(0.0));
        }

        Profiler profiler(m_numMBsToCUDAProfile);
@ -1752,13 +1739,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                                       m_enableDistributedMBReading &&
                                       trainSetDataReader->SupportsDistributedMBRead();
        if (useDistributedMBReading)
-        {
            trainSetDataReader->StartDistributedMinibatchLoop(tunedMBSize, epochNumber, g_mpi->CurrentNodeRank(), g_mpi->NumNodesInUse(), m_epochSize);
-        }
        else
-        {
            trainSetDataReader->StartMinibatchLoop(tunedMBSize, epochNumber, m_epochSize);
-        }
+        net.StartEvaluateMinibatchLoop(evaluationNodes);
+        net.StartEvaluateMinibatchLoop(criterionNodes);
+        if (m_needAdaptRegularization && m_adaptationRegType == AdaptationRegType::KL && refNode)
+            refNet.StartEvaluateMinibatchLoop(refNode);

        // TODO: what is this??
        AttemptUtteranceDerivativeFeatures(net, trainSetDataReader, featureNodes, inputMatrices);
@ -1829,10 +1816,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {

                //compute eval node first since when gradient is computed the forward function values
                //may be changed and need to be recomputed when gradient and function value share the same matrix
-                for (size_t i = 0; i < numEvalNodes; i++)
-                {
-                    net.Evaluate(evaluationNodes[i]);
-                }
+                net.Evaluate(evaluationNodes);

                // only compute gradient when learning rate is large enough
                if (learnRatePerSample > m_minLearnRate * 0.01)
@ -1872,7 +1856,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                    // criteria are in FunctionValues()(0,0), we accumulate into another 1x1 Matrix (to avoid having to pull the values off the GPU)
                    Matrix<ElemType>::AddElementToElement(dynamic_pointer_cast<ComputationNode<ElemType>>(criterionNodes[0])->FunctionValues(),
                                                          0, 0, localEpochCriterion, 0, 0);
-                    for (size_t i = 0; i < numEvalNodes; i++)
+                    for (size_t i = 0; i < evaluationNodes.size(); i++)
                    {
                        Matrix<ElemType>::AddElementToElement(dynamic_pointer_cast<ComputationNode<ElemType>>(evaluationNodes[i])->FunctionValues(),
                                                              0, 0, localEpochEvalErrors, 0, i);
@ -1882,14 +1866,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            else
            {
                //distributed gradient aggregation
-                LazyInitDistGradAgg(learnableNodes, numEvalNodes, m_traceLevel);
+                LazyInitDistGradAgg(learnableNodes, evaluationNodes.size(), m_traceLevel);

                //prepare the header
-                m_gradHeader->numEvalNode = numEvalNodes;
+                m_gradHeader->numEvalNode = evaluationNodes.size();
                m_gradHeader->numSamples = actualMBSize;
                m_gradHeader->numSamplesWithLabel = numSamplesWithLabel;
                m_gradHeader->criterion = actualMBSize > 0 ? criterionNodes[0]->Get00Element() : 0.0;
-                for (size_t i = 0; i < numEvalNodes; i++)
+                for (size_t i = 0; i < evaluationNodes.size(); i++)
                    m_gradHeader->evalErrors[i] = actualMBSize > 0 ? evaluationNodes[i]->Get00Element() : 0.0;

                m_distGradAgg->AggregateGradients(m_gradHeader, epochNumber);
@ -1897,7 +1881,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                aggregateNumSamples = m_gradHeader->numSamples;
                aggregateNumSamplesWithLabel = m_gradHeader->numSamplesWithLabel;
                epochCriterion += m_gradHeader->criterion;
-                for (size_t i = 0; i<numEvalNodes; i++)
+                for (size_t i = 0; i<epochEvalErrors.size(); i++)
                    epochEvalErrors[i] += m_gradHeader->evalErrors[i];
            }

@ -1963,7 +1947,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                    {
                        timer.Restart();
                        epochCriterion = localEpochCriterion.Get00Element();
-                        for (size_t i = 0; i < numEvalNodes; i++)
+                        for (size_t i = 0; i < epochEvalErrors.size(); i++)
                            epochEvalErrors[i] = localEpochEvalErrors(0, i);
                        timer.Stop();

@ -1991,7 +1975,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                        m_maxComputedEpochSize = numMBsRun * numSamplesLastMBs / m_numMBsToShowResult;
                    }

-                    for (size_t i = 0; i < numEvalNodes; i++)
+                    for (size_t i = 0; i < epochEvalErrors.size(); i++)
                    {
                        double evalError = (epochEvalErrors[i] - epochEvalErrorsLastMBs[i]) / numSamplesLastMBs;
                        string formatString = "EvalErr[%lu]PerSample = " + GeneratePaddedFloatOrExpFormat(0, 8, evalError) + "; ";
@ -2012,7 +1996,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                    numSamplesLastMBs = 0;

                    epochCriterionLastMBs = epochCriterion;
-                    for (size_t i = 0; i < numEvalNodes; i++)
+                    for (size_t i = 0; i < epochEvalErrorsLastMBs.size(); i++)
                        epochEvalErrorsLastMBs[i] = epochEvalErrors[i];

                    if (std::isnan(epochCriterion))
@ -2057,7 +2041,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        {
            // with parallelization, we have them in regular variables
            epochCriterion /= float(totalEpochSamples);
-            for (size_t i = 0; i< numEvalNodes; i++)
+            for (size_t i = 0; i< epochEvalErrors.size(); i++)
                epochEvalErrors[i] /= totalEpochSamples;
        }
        else
@ -2067,7 +2051,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            localEpochEvalErrors /= float(totalEpochSamples);

            epochCriterion = localEpochCriterion.Get00Element();
-            for (size_t i = 0; i < numEvalNodes; i++)
+            for (size_t i = 0; i < epochEvalErrors.size(); i++)
                epochEvalErrors[i] = localEpochEvalErrors(0, i);
        }

@ -2495,6 +2479,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {

 #define EPSILON 1e-5

+    // this probes the automatic gradient computation with random inputs
    template<class ElemType>
    bool SGD<ElemType>::GradientCheck(ComputationNetwork& net,
                       const std::vector<ComputationNodeBasePtr> & criterionNodes,
@ -2503,6 +2488,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    {
        vector<string> errMsgs;

+        net.StartEvaluateMinibatchLoop(criterionNodes[npos]);
+
        // gradient checking
        for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++)
        {
@ -2524,7 +2511,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {

                node->UpdateEvalTimeStamp();

-                // use only the first criterion. Is
                net.ComputeGradient<ElemType>(criterionNodes[npos]);

                if (node->GradientValues().GetMatrixType() == MatrixType::SPARSE)
--- a/MachineLearning/CNTKSGDLib/SimpleEvaluator.h
+++ b/MachineLearning/CNTKSGDLib/SimpleEvaluator.h
@ -122,6 +122,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                evalResultsLastMBs.push_back((ElemType)0);

            dataReader->StartMinibatchLoop(mbSize, 0, testSize);
+            m_net.StartEvaluateMinibatchLoop(evalNodes);

            while (DataReaderHelpers::GetMinibatchIntoNetwork(*dataReader, m_net, nullptr, false, false, inputMatrices, actualMBSize))
            {
@ -191,6 +192,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        }

        //returns error rate
+        // TODO: What does this function do?
        double EvaluateUnroll(IDataReader<ElemType>* dataReader, const size_t mbSize, double &evalSetCrossEntropy, const wchar_t* output = nullptr, const size_t testSize = requestDataSize)
        {
            std::vector<ComputationNodeBasePtr> & featureNodes = m_net.FeatureNodes();
@ -211,6 +213,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            inputMatrices[L"numberobs"] = new Matrix<ElemType>(1, 1, m_net.GetDeviceId());

            dataReader->StartMinibatchLoop(mbSize, 0, testSize);
+            m_net.StartEvaluateMinibatchLoop(criterionNodes, evaluationNodes);

            double epochEvalError = 0;
            double epochCrossEntropy = 0;
@ -415,9 +418,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            double evalResultsLastMBs = (double)0;

            for (auto ptr = dataReaders.begin(); ptr != dataReaders.end(); ptr++)
-            {
                (*ptr)->StartMinibatchLoop(mbSize, 0, testSize);
-            }
+            // BUGBUG: Code below will fail because we now must call StartEvaluateMinibatchLoop() on each network before Evaluate(), but I can't tell from below which nodes to call it for.
+            //for (auto & ptr : nets)
+            //    ptr->StartEvaluateMinibatchLoop(xxx);

            bool bContinueDecoding = true;
            while (bContinueDecoding)
@ -743,7 +747,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            }
        }

-        //return true if precomputation is executed.
+        // (only called by FindBestPath...())
        void ResetPreCompute()
        {
            //mark false
@ -767,6 +771,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {

            ComputationNetwork::UpdateEvalTimeStamps(featureNodes);

+            net.StartEvaluateMinibatchLoop(batchComputeNodes);  // TODO: Is this correct? There is no StartMinibatchLoop() for a reader.
+
            net.SetActualMiniBatchSizeFromFeatures();
            for (auto nodeIter = batchComputeNodes.begin(); nodeIter != batchComputeNodes.end(); nodeIter++)
                net.Evaluate(*nodeIter);
--- a/MachineLearning/CNTKSGDLib/SimpleOutputWriter.h
+++ b/MachineLearning/CNTKSGDLib/SimpleOutputWriter.h
@ -61,10 +61,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            //Matrix<ElemType> endOfFile =  Matrix<ElemType>((size_t)1,(size_t)1);
            //endOfFile(0,0)=0;

-            //evaluate with minibatches
+            // evaluate with minibatches
            dataReader.StartMinibatchLoop(mbSize, 0, numOutputSamples);
            dataReader.SetNumParallelSequences(1);

+            m_net.StartEvaluateMinibatchLoop(outputNodes);
+
            size_t totalEpochSamples = 0;
            std::map<std::wstring, void *, nocase_compare> outputMatrices;

@ -107,7 +109,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            //clean up
            
        }
-        

        void WriteOutput(IDataReader<ElemType>& dataReader, size_t mbSize, std::wstring outputPath, const std::vector<std::wstring>& outputNodeNames, size_t numOutputSamples=requestDataSize)
        {
@ -142,10 +143,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            std::map<std::wstring, Matrix<ElemType>*> inputMatrices;
            for (size_t i=0; i<featureNodes.size(); i++)
                inputMatrices[featureNodes[i]->NodeName()] = &dynamic_pointer_cast<ComputationNode<ElemType>>(featureNodes[i])->FunctionValues();
-                        
-            //evaluate with minibatches
+
+            // evaluate with minibatches
            dataReader.StartMinibatchLoop(mbSize, 0, numOutputSamples);

+            m_net.StartEvaluateMinibatchLoop(outputNodes);
+
            size_t totalEpochSamples = 0;
            size_t numMBsRun = 0;
            size_t tempArraySize = 0;