Added a warning for inefficient gradient propagation from a node inside a loop to a child that is outside the loop; the gradient for such a child could be computed in PAR mode instead.

2015-11-01 17:43:33 -08:00 · 2015-11-01 17:43:33 -08:00 · f221f1c73a
--- a/Common/Include/Sequences.h
+++ b/Common/Include/Sequences.h
@ -195,6 +195,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        }

        // test function for those pieces of the code that cannot handle gaps
+        // TODO: Not efficient (linear scan). Use a global OR of all values.
        bool HasGaps() const
        {
            if (!IsAllNone())
@ -416,7 +417,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        // TODO: remove these ^^ two in favor of these vv
        size_t StartColumn(const shared_ptr<MBLayout> & pMBLayout) const { EnsureNotAllFrames(); return timeIdxInSeq * pMBLayout->GetNumParallelSequences(); }
        size_t NumCols(const shared_ptr<MBLayout> & pMBLayout) const { EnsureNotAllFrames(); return pMBLayout->GetNumParallelSequences(); }
-        bool IsAllFrames() const { return timeIdxInSeq == SIZE_MAX; } // if true then above functions may not be called; caller must use entire batch instead
+        bool IsAllFrames() const { return timeIdxInSeq == SIZE_MAX; } // if true then above functions may not be called; caller must use entire batch instead (PAR mode)

        const FrameRange & Check(size_t expectedStartColumn, size_t expectedNumCols, const shared_ptr<MBLayout> & pMBLayout) const
        {
@ -444,18 +445,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {

    inline shared_ptr<Matrix<char>> MBLayout::GetColumnsValidityMask(const FrameRange& frameRange, DEVICEID_TYPE deviceId) const
    {
+        // lazily compute the validity mask
        if (m_columnsValidityMask == nullptr)
        {
            Lock();
            m_columnsValidityMask.reset(new Matrix<char>(deviceId));

            // Determine indices of all invalid columns in the specified frameRange
-            if (!IsAllNone())
+            if (!IsAllNone())       // TODO: use HasGaps() (but currently that would mean a second linear scan, which is not efficient)
            {
                size_t nT = GetNumTimeSteps();
                size_t nS = GetNumParallelSequences();

-                std::vector<char> columnsValidityMask(nT * nS, 1);
+                std::vector<char> columnsValidityMask(nT * nS, 1);  // form the mask in a CPU-side STL vector first
                bool foundInvalidColumn = false;
                for (size_t t = 0; t < nT; t++)
                {
@ -471,14 +473,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                    }
                }

-                if (foundInvalidColumn)
+                if (foundInvalidColumn)                     // if any then blast it over to the GPU side
                    m_columnsValidityMask->SetValue(1, columnsValidityMask.size(), deviceId, columnsValidityMask.data());
            }
        }

-        if (m_columnsValidityMask->IsEmpty())
+        if (m_columnsValidityMask->IsEmpty())               // mask matrix was kept empty, which means no gaps detected
            return nullptr;

+        // we have a validity mask: decide what to return
        if (frameRange.IsAllFrames())
            return m_columnsValidityMask;

@ -496,9 +499,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        if (!foundInvalidColumnsInRange)
            return nullptr;

+        // we get here if there is an actual validity mask and there are invalid frames in its range
        size_t startColumn = (frameRange.t() * GetNumParallelSequences()) + ((frameRange.seqIndex == SIZE_MAX) ? 0 : frameRange.seqIndex);
        size_t numColumns = (frameRange.seqIndex == SIZE_MAX) ? GetNumParallelSequences() : 1;

+        // TODO: why use ColumnSlice() and not DataSlice()?
        return make_shared<Matrix<char>>(m_columnsValidityMask->ColumnSlice(startColumn, numColumns));
    }

--- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetworkEvaluation.cpp
+++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetworkEvaluation.cpp
@ -135,6 +135,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    // MAIN ENTRY POINT for evaluation followed by gradient computation (forward prop then back prop)
    // TODO: pass a set of nodes instead of only one?
    // TODO: remove Evaluate() from here, instead call it at call site, and in here merely check whether everything is computed already
+    // BUGBUG: The decision to loop (SEQ execution) is made by parent, but some children can be executed in PAR mode. It should be possible to detect this.
    template<class ElemType>
    void ComputationNetwork::ComputeGradient(const ComputationNodeBasePtr rootNode, 
                                             bool bResetToOne,                              // true if reset the gradient of rootnode to 1.0
@ -195,6 +196,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                            node2->VerifyNumParallelSequences(GetNumParallelSequences());
                            if (IsNodeReqMultiSeqHandling(node2))
                                node2->MaskMissingGradientColumnsToZero(t);
+                            // TODO: exclude children that are not part of the recurrent loop, and do those below, separately.
                            node2->ComputeGradientForChildren(t);
                        }
                    }
@ -237,6 +239,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    ComputationNetwork::RecurrentInfo * ComputationNetwork::FindInRecurrentLoops(const ComputationNodeBasePtr& node)
    {
        // look in all recurrent loops of the network
+        // TODO: Check for IsPartOfLoop(). Also why not store the loop id in the node for direct lookup?
        for (auto & iter : m_recurrentInfo)
            if (std::find(iter.m_recurrentNodes.begin(), iter.m_recurrentNodes.end(), node) != iter.m_recurrentNodes.end())
                return &iter;
--- a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h
+++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h
@ -1158,13 +1158,18 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 #if DUMPOUTPUT
                    fprintf(stderr, "Backprop%d_%ls\n", i, NodeName().c_str());
 #endif
-                    child->LazyZeroGradient();          // set gradient to 0 if this is the first time
+                    child->LazyZeroGradient();              // set gradient to 0 if this is the first time
+
+                    // TODO: There is an inefficiency here which we should fix.
+                    if (IsPartOfLoop() && !child->IsPartOfLoop())
+                    {
+                        assert(!frameRange.IsAllFrames());
+                        static int warnings = 0;
+                        if (warnings++ < 20)
+                            fprintf (stderr, "ComputeGradientForChildren: Inefficiency: %ls %ls operation in loop propagates gradient to non-loop %ls %ls\n",
+                            NodeName().c_str(), OperationName().c_str(), child->NodeName().c_str(), child->OperationName().c_str());
+                    }

-#if 0
-                    if (frameRange.IsAllFrames())       // TODO: remove this
-                        ComputeInputPartial(i);
-                    else
-#endif
                    ComputeInputPartial(i, frameRange);     // this computes partial wrt to the child and sums the gradient value in the child
                }
 #ifdef DISPLAY_DEBUG