From f221f1c73a38f61b1a1a51f617121bd1ace57313 Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Sun, 1 Nov 2015 17:43:33 -0800
Subject: [PATCH] added a warning for inefficient gradient propagation from
 inside a loop to a child that is outside the loop. Gradient could be done
 in PAR mode

---
 Common/Include/Sequences.h                        | 15 ++++++++++-----
 .../ComputationNetworkEvaluation.cpp              |  3 +++
 .../CNTKComputationNetworkLib/ComputationNode.h   | 17 +++++++++++------
 3 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/Common/Include/Sequences.h b/Common/Include/Sequences.h
index 1f2f40c74..316256bdb 100644
--- a/Common/Include/Sequences.h
+++ b/Common/Include/Sequences.h
@@ -195,6 +195,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         }
 
         // test function for those pieces of the code that cannot handle gaps
+        // TODO: Not efficient (linear scan). Use a global OR of all values.
         bool HasGaps() const
         {
             if (!IsAllNone())
@@ -416,7 +417,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         // TODO: remove these ^^ two in favor of these vv
         size_t StartColumn(const shared_ptr<MBLayout> & pMBLayout) const { EnsureNotAllFrames(); return timeIdxInSeq * pMBLayout->GetNumParallelSequences(); }
         size_t NumCols(const shared_ptr<MBLayout> & pMBLayout) const { EnsureNotAllFrames(); return pMBLayout->GetNumParallelSequences(); }
-        bool IsAllFrames() const { return timeIdxInSeq == SIZE_MAX; } // if true then above functions may not be called; caller must use entire batch instead
+        bool IsAllFrames() const { return timeIdxInSeq == SIZE_MAX; } // if true then above functions may not be called; caller must use entire batch instead (PAR mode)
 
         const FrameRange & Check(size_t expectedStartColumn, size_t expectedNumCols, const shared_ptr<MBLayout> & pMBLayout) const
         {
@@ -444,18 +445,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
     inline shared_ptr<Matrix<char>> MBLayout::GetColumnsValidityMask(const FrameRange& frameRange, DEVICEID_TYPE deviceId) const
     {
+        // lazily compute the validity mask
         if (m_columnsValidityMask == nullptr)
         {
             Lock();
             m_columnsValidityMask.reset(new Matrix<char>(deviceId));
 
             // Determine indices of all invalid columns in the specified frameRange
-            if (!IsAllNone())
+            if (!IsAllNone())   // TODO: use HasGaps() (but currently that would mean a second linear scan, which is not efficient)
             {
                 size_t nT = GetNumTimeSteps();
                 size_t nS = GetNumParallelSequences();
 
-                std::vector<char> columnsValidityMask(nT * nS, 1);
+                std::vector<char> columnsValidityMask(nT * nS, 1);  // form the mask in a CPU-side STL vector first
                 bool foundInvalidColumn = false;
                 for (size_t t = 0; t < nT; t++)
                 {
@@ -471,14 +473,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                     }
                 }
 
-                if (foundInvalidColumn)
+                if (foundInvalidColumn)     // if any then blast it over to the GPU side
                     m_columnsValidityMask->SetValue(1, columnsValidityMask.size(), deviceId, columnsValidityMask.data());
             }
         }
 
-        if (m_columnsValidityMask->IsEmpty())
+        if (m_columnsValidityMask->IsEmpty())   // mask matrix was kept empty, which means no gaps detected
             return nullptr;
 
+        // we have a validity mask: decide what to return
         if (frameRange.IsAllFrames())
             return m_columnsValidityMask;
 
@@ -496,9 +499,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         if (!foundInvalidColumnsInRange)
             return nullptr;
 
+        // we get here if there is an actual validity mask and there are invalid frames in its range
         size_t startColumn = (frameRange.t() * GetNumParallelSequences()) + ((frameRange.seqIndex == SIZE_MAX) ? 0 : frameRange.seqIndex);
         size_t numColumns = (frameRange.seqIndex == SIZE_MAX) ?
                             GetNumParallelSequences() : 1;
 
+        // TODO: why use ColumnSlice() and not DataSlice()?
         return make_shared<Matrix<char>>(m_columnsValidityMask->ColumnSlice(startColumn, numColumns));
     }
diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNetworkEvaluation.cpp b/MachineLearning/CNTKComputationNetworkLib/ComputationNetworkEvaluation.cpp
index be8040c09..3a2012b1b 100644
--- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetworkEvaluation.cpp
+++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetworkEvaluation.cpp
@@ -135,6 +135,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     // MAIN ENTRY POINT for evaluation followed by gradient computation (forward prop then back prop)
     // TODO: pass a set of nodes instead of only one?
    // TODO: remove Evaluate() from here, instead call it at call site, and in here merely check whether everything is computed already
+    // BUGBUG: The decision to loop (SEQ execution) is made by the parent, but some children can be executed in PAR mode. It should be possible to detect this.
     template<class ElemType>
     void ComputationNetwork::ComputeGradient(const ComputationNodeBasePtr rootNode,
         bool bResetToOne,  // true if reset the gradient of rootnode to 1.0
@@ -195,6 +196,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                         node2->VerifyNumParallelSequences(GetNumParallelSequences());
                         if (IsNodeReqMultiSeqHandling(node2))
                             node2->MaskMissingGradientColumnsToZero(t);
+                        // TODO: exclude children that are not part of the recurrent loop, and do those below, separately.
                         node2->ComputeGradientForChildren(t);
                     }
                 }
@@ -237,6 +239,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     ComputationNetwork::RecurrentInfo * ComputationNetwork::FindInRecurrentLoops(const ComputationNodeBasePtr& node)
     {
         // look in all recurrent loops of the network
+        // TODO: Check for IsPartOfLoop(). Also, why not store the loop id in the node for direct lookup?
         for (auto & iter : m_recurrentInfo)
             if (std::find(iter.m_recurrentNodes.begin(), iter.m_recurrentNodes.end(), node) != iter.m_recurrentNodes.end())
                 return &iter;
diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h
index a02327772..352c78637 100644
--- a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h
+++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h
@@ -1158,13 +1158,18 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 #if DUMPOUTPUT
                     fprintf(stderr, "Backprop%d_%ls\n", i, NodeName().c_str());
 #endif
-                    child->LazyZeroGradient();  // set gradient to 0 if this is the first time
+                    child->LazyZeroGradient();          // set gradient to 0 if this is the first time
+
+                    // TODO: There is an inefficiency here which we should fix.
+                    if (IsPartOfLoop() && !child->IsPartOfLoop())
+                    {
+                        assert(!frameRange.IsAllFrames());
+                        static int warnings = 0;
+                        if (warnings++ < 20)
+                            fprintf(stderr, "ComputeGradientForChildren: Inefficiency: %ls %ls operation in loop propagates gradient to non-loop %ls %ls\n",
+                                    NodeName().c_str(), OperationName().c_str(), child->NodeName().c_str(), child->OperationName().c_str());
+                    }
 
-#if 0
-                    if (frameRange.IsAllFrames())   // TODO: remove this
-                        ComputeInputPartial(i);
-                    else
-#endif
                     ComputeInputPartial(i, frameRange);  // this computes partial wrt to the child and sums the gradient value in the child
                 }
 #ifdef DISPLAY_DEBUG
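
Illustrative sketch (not part of the patch, and not CNTK's actual classes): the warning added in ComputationNode.h fires when a node that executes inside a recurrent loop (SEQ, frame by frame) back-propagates into a child that is not part of the loop, so the child's gradient is accumulated in many small per-frame updates instead of one PAR-mode update over the whole minibatch. The standalone C++ sketch below shows the same check with hypothetical names (Node, isPartOfLoop, BackpropTo).

// Hypothetical, self-contained example of the loop/non-loop backprop check; not CNTK code.
#include <cstdio>
#include <memory>
#include <string>
#include <vector>

struct Node
{
    std::string name;                            // node name, for diagnostics
    bool isPartOfLoop = false;                   // true if the node sits inside a recurrent loop (SEQ execution)
    std::vector<std::shared_ptr<Node>> children; // inputs of this node

    // Back-propagate this node's gradient into child i, either for one frame (SEQ) or all frames (PAR).
    void BackpropTo(size_t i, bool allFrames)
    {
        auto& child = children[i];

        // The inefficiency the patch warns about: a loop node runs frame by frame,
        // so a non-loop child receives its gradient in many small per-frame updates
        // instead of one batched (PAR-mode) update over the whole minibatch.
        if (isPartOfLoop && !child->isPartOfLoop && !allFrames)
        {
            static int warnings = 0;             // rate-limit the diagnostic, as in the patch
            if (warnings++ < 20)
                fprintf(stderr,
                        "BackpropTo: inefficiency: loop node '%s' propagates per-frame gradient to non-loop child '%s'\n",
                        name.c_str(), child->name.c_str());
        }

        // ... actual gradient accumulation would go here ...
    }
};

int main()
{
    auto delay = std::make_shared<Node>();       // stands in for a node inside a recurrent loop
    delay->name = "PastValue";
    delay->isPartOfLoop = true;

    auto weight = std::make_shared<Node>();      // a learnable parameter outside the loop
    weight->name = "W";

    delay->children.push_back(weight);
    delay->BackpropTo(0, /*allFrames=*/false);   // per-frame call triggers the warning
    return 0;
}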