From f221f1c73a38f61b1a1a51f617121bd1ace57313 Mon Sep 17 00:00:00 2001
From: Frank Seide
Date: Sun, 1 Nov 2015 17:43:33 -0800
Subject: [PATCH] added a warning for inefficient gradient propagation from
 inside a loop to a child that is outside the loop. Gradient could be done
 in PAR mode

---
 Common/Include/Sequences.h                        | 15 ++++++++++-----
 .../ComputationNetworkEvaluation.cpp              |  3 +++
 .../CNTKComputationNetworkLib/ComputationNode.h   | 17 +++++++++++------
 3 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/Common/Include/Sequences.h b/Common/Include/Sequences.h
index 1f2f40c74..316256bdb 100644
--- a/Common/Include/Sequences.h
+++ b/Common/Include/Sequences.h
@@ -195,6 +195,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         }
 
         // test function for those pieces of the code that cannot handle gaps
+        // TODO: Not efficient (linear scan). Use a global OR of all values.
         bool HasGaps() const
         {
             if (!IsAllNone())
@@ -416,7 +417,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         // TODO: remove these ^^ two in favor of these vv
         size_t StartColumn(const shared_ptr<MBLayout> & pMBLayout) const { EnsureNotAllFrames(); return timeIdxInSeq * pMBLayout->GetNumParallelSequences(); }
         size_t NumCols(const shared_ptr<MBLayout> & pMBLayout) const { EnsureNotAllFrames(); return pMBLayout->GetNumParallelSequences(); }
-        bool IsAllFrames() const { return timeIdxInSeq == SIZE_MAX; } // if true then above functions may not be called; caller must use entire batch instead
+        bool IsAllFrames() const { return timeIdxInSeq == SIZE_MAX; } // if true then above functions may not be called; caller must use entire batch instead (PAR mode)
 
         const FrameRange & Check(size_t expectedStartColumn, size_t expectedNumCols, const shared_ptr<MBLayout> & pMBLayout) const
         {
@@ -444,18 +445,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
     inline shared_ptr<Matrix<char>> MBLayout::GetColumnsValidityMask(const FrameRange& frameRange, DEVICEID_TYPE deviceId) const
     {
+        // lazily compute the validity mask
         if (m_columnsValidityMask == nullptr)
         {
             Lock();
             m_columnsValidityMask.reset(new Matrix<char>(deviceId));
 
             // Determine indices of all invalid columns in the specified frameRange
-            if (!IsAllNone())
+            if (!IsAllNone())   // TODO: use HasGaps() (but currently that would mean a second linear scan, which is not efficient)
             {
                 size_t nT = GetNumTimeSteps();
                 size_t nS = GetNumParallelSequences();
 
-                std::vector<char> columnsValidityMask(nT * nS, 1);
+                std::vector<char> columnsValidityMask(nT * nS, 1);  // form the mask in a CPU-side STL vector first
                 bool foundInvalidColumn = false;
                 for (size_t t = 0; t < nT; t++)
                 {
@@ -471,14 +473,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                     }
                 }
 
-                if (foundInvalidColumn)
+                if (foundInvalidColumn)     // if any then blast it over to the GPU side
                     m_columnsValidityMask->SetValue(1, columnsValidityMask.size(), deviceId, columnsValidityMask.data());
             }
         }
 
-        if (m_columnsValidityMask->IsEmpty())
+        if (m_columnsValidityMask->IsEmpty())   // mask matrix was kept empty, which means no gaps detected
             return nullptr;
 
+        // we have a validity mask: decide what to return
         if (frameRange.IsAllFrames())
             return m_columnsValidityMask;
 
@@ -496,9 +499,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         if (!foundInvalidColumnsInRange)
             return nullptr;
 
+        // we get here if there is an actual validity mask and there are invalid frames in its range
         size_t startColumn = (frameRange.t() * GetNumParallelSequences()) + ((frameRange.seqIndex == SIZE_MAX) ? 0 : frameRange.seqIndex);
         size_t numColumns = (frameRange.seqIndex == SIZE_MAX) ?
                             GetNumParallelSequences() : 1;
 
+        // TODO: why use ColumnSlice() and not DataSlice()?
         return make_shared<Matrix<char>>(m_columnsValidityMask->ColumnSlice(startColumn, numColumns));
     }
diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNetworkEvaluation.cpp b/MachineLearning/CNTKComputationNetworkLib/ComputationNetworkEvaluation.cpp
index be8040c09..3a2012b1b 100644
--- a/MachineLearning/CNTKComputationNetworkLib/ComputationNetworkEvaluation.cpp
+++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNetworkEvaluation.cpp
@@ -135,6 +135,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     // MAIN ENTRY POINT for evaluation followed by gradient computation (forward prop then back prop)
     // TODO: pass a set of nodes instead of only one?
    // TODO: remove Evaluate() from here, instead call it at call site, and in here merely check whether everything is computed already
+    // BUGBUG: The decision to loop (SEQ execution) is made by the parent, but some children can be executed in PAR mode. It should be possible to detect this.
     template<class ElemType>
     void ComputationNetwork::ComputeGradient(const ComputationNodeBasePtr rootNode,
         bool bResetToOne,  // true if reset the gradient of rootnode to 1.0
@@ -195,6 +196,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                         node2->VerifyNumParallelSequences(GetNumParallelSequences());
                         if (IsNodeReqMultiSeqHandling(node2))
                             node2->MaskMissingGradientColumnsToZero(t);
+                        // TODO: exclude children that are not part of the recurrent loop, and do those below, separately.
                         node2->ComputeGradientForChildren(t);
                     }
                 }
@@ -237,6 +239,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     ComputationNetwork::RecurrentInfo * ComputationNetwork::FindInRecurrentLoops(const ComputationNodeBasePtr& node)
     {
         // look in all recurrent loops of the network
+        // TODO: Check for IsPartOfLoop(). Also, why not store the loop id in the node for direct lookup?
         for (auto & iter : m_recurrentInfo)
             if (std::find(iter.m_recurrentNodes.begin(), iter.m_recurrentNodes.end(), node) != iter.m_recurrentNodes.end())
                 return &iter;
diff --git a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h
index a02327772..352c78637 100644
--- a/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h
+++ b/MachineLearning/CNTKComputationNetworkLib/ComputationNode.h
@@ -1158,13 +1158,18 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 #if DUMPOUTPUT
                     fprintf(stderr, "Backprop%d_%ls\n", i, NodeName().c_str());
 #endif
-                    child->LazyZeroGradient();  // set gradient to 0 if this is the first time
+                    child->LazyZeroGradient();          // set gradient to 0 if this is the first time
+
+                    // TODO: There is an inefficiency here which we should fix.
+                    if (IsPartOfLoop() && !child->IsPartOfLoop())
+                    {
+                        assert(!frameRange.IsAllFrames());
+                        static int warnings = 0;
+                        if (warnings++ < 20)
+                            fprintf(stderr, "ComputeGradientForChildren: Inefficiency: %ls %ls operation in loop propagates gradient to non-loop %ls %ls\n",
+                                    NodeName().c_str(), OperationName().c_str(), child->NodeName().c_str(), child->OperationName().c_str());
+                    }
 
-#if 0
-                    if (frameRange.IsAllFrames())   // TODO: remove this
-                        ComputeInputPartial(i);
-                    else
-#endif
                     ComputeInputPartial(i, frameRange);  // this computes partial wrt to the child and sums the gradient value in the child
                 }
 #ifdef DISPLAY_DEBUG
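
Illustrative sketch (not part of the patch, and not CNTK's actual classes): the warning added in ComputationNode.h fires when a node that executes inside a recurrent loop (SEQ, frame by frame) back-propagates into a child that is not part of the loop, so the child's gradient is accumulated in many small per-frame updates instead of one PAR-mode update over the whole minibatch. The standalone C++ sketch below shows the same check with hypothetical names (Node, isPartOfLoop, BackpropTo).

// Hypothetical, self-contained example of the loop/non-loop backprop check; not CNTK code.
#include <cstdio>
#include <memory>
#include <string>
#include <vector>

struct Node
{
    std::string name;                            // node name, for diagnostics
    bool isPartOfLoop = false;                   // true if the node sits inside a recurrent loop (SEQ execution)
    std::vector<std::shared_ptr<Node>> children; // inputs of this node

    // Back-propagate this node's gradient into child i, either for one frame (SEQ) or all frames (PAR).
    void BackpropTo(size_t i, bool allFrames)
    {
        auto& child = children[i];

        // The inefficiency the patch warns about: a loop node runs frame by frame,
        // so a non-loop child receives its gradient in many small per-frame updates
        // instead of one batched (PAR-mode) update over the whole minibatch.
        if (isPartOfLoop && !child->isPartOfLoop && !allFrames)
        {
            static int warnings = 0;             // rate-limit the diagnostic, as in the patch
            if (warnings++ < 20)
                fprintf(stderr,
                        "BackpropTo: inefficiency: loop node '%s' propagates per-frame gradient to non-loop child '%s'\n",
                        name.c_str(), child->name.c_str());
        }

        // ... actual gradient accumulation would go here ...
    }
};

int main()
{
    auto delay = std::make_shared<Node>();       // stands in for a node inside a recurrent loop
    delay->name = "PastValue";
    delay->isPartOfLoop = true;

    auto weight = std::make_shared<Node>();      // a learnable parameter outside the loop
    weight->name = "W";

    delay->children.push_back(weight);
    delay->BackpropTo(0, /*allFrames=*/false);   // per-frame call triggers the warning
    return 0;
}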