added a warning for inefficient propagating from inside loop to a child that is outside the loop. Gradient could be done in PAR mode

This commit is contained in:
Frank Seide 2015-11-01 17:43:33 -08:00
Родитель 34bbf53e0e
Коммит f221f1c73a
3 изменённых файлов: 24 добавлений и 11 удалений

Просмотреть файл

@ -195,6 +195,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
// test function for those pieces of the code that cannot handle gaps
// TODO: Not efficient (linear scan). Use a global OR of all values.
bool HasGaps() const
{
if (!IsAllNone())
@ -416,7 +417,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// TODO: remove these ^^ two in favor of these vv
size_t StartColumn(const shared_ptr<MBLayout> & pMBLayout) const { EnsureNotAllFrames(); return timeIdxInSeq * pMBLayout->GetNumParallelSequences(); }
size_t NumCols(const shared_ptr<MBLayout> & pMBLayout) const { EnsureNotAllFrames(); return pMBLayout->GetNumParallelSequences(); }
bool IsAllFrames() const { return timeIdxInSeq == SIZE_MAX; } // if true then above functions may not be called; caller must use entire batch instead
bool IsAllFrames() const { return timeIdxInSeq == SIZE_MAX; } // if true then above functions may not be called; caller must use entire batch instead (PAR mode)
const FrameRange & Check(size_t expectedStartColumn, size_t expectedNumCols, const shared_ptr<MBLayout> & pMBLayout) const
{
@ -444,18 +445,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
inline shared_ptr<Matrix<char>> MBLayout::GetColumnsValidityMask(const FrameRange& frameRange, DEVICEID_TYPE deviceId) const
{
// lazily compute the validity mask
if (m_columnsValidityMask == nullptr)
{
Lock();
m_columnsValidityMask.reset(new Matrix<char>(deviceId));
// Determine indices of all invalid columns in the specified frameRange
if (!IsAllNone())
if (!IsAllNone()) // TODO: use HasGaps() (but currently that would mean a second linear scan, which is not efficient)
{
size_t nT = GetNumTimeSteps();
size_t nS = GetNumParallelSequences();
std::vector<char> columnsValidityMask(nT * nS, 1);
std::vector<char> columnsValidityMask(nT * nS, 1); // form the mask in a CPU-side STL vector first
bool foundInvalidColumn = false;
for (size_t t = 0; t < nT; t++)
{
@ -471,14 +473,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}
if (foundInvalidColumn)
if (foundInvalidColumn) // if any then blast it over to the GPU side
m_columnsValidityMask->SetValue(1, columnsValidityMask.size(), deviceId, columnsValidityMask.data());
}
}
if (m_columnsValidityMask->IsEmpty())
if (m_columnsValidityMask->IsEmpty()) // mask matrix was kept empty, which means no gaps detected
return nullptr;
// we have a validity mask: decide what to return
if (frameRange.IsAllFrames())
return m_columnsValidityMask;
@ -496,9 +499,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (!foundInvalidColumnsInRange)
return nullptr;
// we get here if there is an actual validity mask and there are invalid frames in its range
size_t startColumn = (frameRange.t() * GetNumParallelSequences()) + ((frameRange.seqIndex == SIZE_MAX) ? 0 : frameRange.seqIndex);
size_t numColumns = (frameRange.seqIndex == SIZE_MAX) ? GetNumParallelSequences() : 1;
// TODO: why use ColumnSlice() and not DataSlice()?
return make_shared<Matrix<char>>(m_columnsValidityMask->ColumnSlice(startColumn, numColumns));
}

Просмотреть файл

@ -135,6 +135,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// MAIN ENTRY POINT for evaluation followed by gradient computation (forward prop then back prop)
// TODO: pass a set of nodes instead of only one?
// TODO: remove Evaluate() from here, instead call it at call site, and in here merely check whether everything is computed already
// BUGBUG: The decision to loop (SEQ execution) is made by parent, but some children can be executer PAR. It should be possible to detect this.
template<class ElemType>
void ComputationNetwork::ComputeGradient(const ComputationNodeBasePtr rootNode,
bool bResetToOne, // true if reset the gradient of rootnode to 1.0
@ -195,6 +196,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
node2->VerifyNumParallelSequences(GetNumParallelSequences());
if (IsNodeReqMultiSeqHandling(node2))
node2->MaskMissingGradientColumnsToZero(t);
// TODO: exclude children that are not part of the recurrent loop, and do thise below, separately.
node2->ComputeGradientForChildren(t);
}
}
@ -237,6 +239,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ComputationNetwork::RecurrentInfo * ComputationNetwork::FindInRecurrentLoops(const ComputationNodeBasePtr& node)
{
// look in all recurrent loops of the network
// TODO: Check for IsPartOfLoop(). Also why not store the loop id in the node for direct lookup?
for (auto & iter : m_recurrentInfo)
if (std::find(iter.m_recurrentNodes.begin(), iter.m_recurrentNodes.end(), node) != iter.m_recurrentNodes.end())
return &iter;

Просмотреть файл

@ -1158,13 +1158,18 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#if DUMPOUTPUT
fprintf(stderr, "Backprop%d_%ls\n", i, NodeName().c_str());
#endif
child->LazyZeroGradient(); // set gradient to 0 if this is the first time
child->LazyZeroGradient(); // set gradient to 0 if this is the first time
// TODO: There is an inefficiency here which we should fix.
if (IsPartOfLoop() && !child->IsPartOfLoop())
{
assert(!frameRange.IsAllFrames());
static int warnings = 0;
if (warnings++ < 20)
fprintf (stderr, "ComputeGradientForChildren: Inefficiency: %ls %ls operation in loop propagates gradient to non-loop %ls %ls\n",
NodeName().c_str(), OperationName().c_str(), child->NodeName().c_str(), child->OperationName().c_str());
}
#if 0
if (frameRange.IsAllFrames()) // TODO: remove this
ComputeInputPartial(i);
else
#endif
ComputeInputPartial(i, frameRange); // this computes partial wrt to the child and sums the gradient value in the child
}
#ifdef DISPLAY_DEBUG