Added a warning for inefficient gradient propagation from a node inside a loop to a child that is outside the loop; that gradient could be computed in PAR mode
This commit is contained in:
Родитель
34bbf53e0e
Коммит
f221f1c73a
|
@ -195,6 +195,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
}
|
||||
|
||||
// test function for those pieces of the code that cannot handle gaps
|
||||
// TODO: Not efficient (linear scan). Use a global OR of all values.
|
||||
bool HasGaps() const
|
||||
{
|
||||
if (!IsAllNone())
|
||||
|
@ -416,7 +417,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
// TODO: remove these ^^ two in favor of these vv
|
||||
size_t StartColumn(const shared_ptr<MBLayout> & pMBLayout) const { EnsureNotAllFrames(); return timeIdxInSeq * pMBLayout->GetNumParallelSequences(); }
|
||||
size_t NumCols(const shared_ptr<MBLayout> & pMBLayout) const { EnsureNotAllFrames(); return pMBLayout->GetNumParallelSequences(); }
|
||||
bool IsAllFrames() const { return timeIdxInSeq == SIZE_MAX; } // if true then above functions may not be called; caller must use entire batch instead
|
||||
bool IsAllFrames() const { return timeIdxInSeq == SIZE_MAX; } // if true then above functions may not be called; caller must use entire batch instead (PAR mode)
|
||||
|
||||
const FrameRange & Check(size_t expectedStartColumn, size_t expectedNumCols, const shared_ptr<MBLayout> & pMBLayout) const
|
||||
{
|
||||
|
@ -444,18 +445,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
|
||||
inline shared_ptr<Matrix<char>> MBLayout::GetColumnsValidityMask(const FrameRange& frameRange, DEVICEID_TYPE deviceId) const
|
||||
{
|
||||
// lazily compute the validity mask
|
||||
if (m_columnsValidityMask == nullptr)
|
||||
{
|
||||
Lock();
|
||||
m_columnsValidityMask.reset(new Matrix<char>(deviceId));
|
||||
|
||||
// Determine indices of all invalid columns in the specified frameRange
|
||||
if (!IsAllNone())
|
||||
if (!IsAllNone()) // TODO: use HasGaps() (but currently that would mean a second linear scan, which is not efficient)
|
||||
{
|
||||
size_t nT = GetNumTimeSteps();
|
||||
size_t nS = GetNumParallelSequences();
|
||||
|
||||
std::vector<char> columnsValidityMask(nT * nS, 1);
|
||||
std::vector<char> columnsValidityMask(nT * nS, 1); // form the mask in a CPU-side STL vector first
|
||||
bool foundInvalidColumn = false;
|
||||
for (size_t t = 0; t < nT; t++)
|
||||
{
|
||||
|
@ -471,14 +473,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
}
|
||||
}
|
||||
|
||||
if (foundInvalidColumn)
|
||||
if (foundInvalidColumn) // if any then blast it over to the GPU side
|
||||
m_columnsValidityMask->SetValue(1, columnsValidityMask.size(), deviceId, columnsValidityMask.data());
|
||||
}
|
||||
}
|
||||
|
||||
if (m_columnsValidityMask->IsEmpty())
|
||||
if (m_columnsValidityMask->IsEmpty()) // mask matrix was kept empty, which means no gaps detected
|
||||
return nullptr;
|
||||
|
||||
// we have a validity mask: decide what to return
|
||||
if (frameRange.IsAllFrames())
|
||||
return m_columnsValidityMask;
|
||||
|
||||
|
@ -496,9 +499,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
if (!foundInvalidColumnsInRange)
|
||||
return nullptr;
|
||||
|
||||
// we get here if there is an actual validity mask and there are invalid frames in its range
|
||||
size_t startColumn = (frameRange.t() * GetNumParallelSequences()) + ((frameRange.seqIndex == SIZE_MAX) ? 0 : frameRange.seqIndex);
|
||||
size_t numColumns = (frameRange.seqIndex == SIZE_MAX) ? GetNumParallelSequences() : 1;
|
||||
|
||||
// TODO: why use ColumnSlice() and not DataSlice()?
|
||||
return make_shared<Matrix<char>>(m_columnsValidityMask->ColumnSlice(startColumn, numColumns));
|
||||
}
|
||||
|
||||
|
|
|
@ -135,6 +135,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
// MAIN ENTRY POINT for evaluation followed by gradient computation (forward prop then back prop)
|
||||
// TODO: pass a set of nodes instead of only one?
|
||||
// TODO: remove Evaluate() from here, instead call it at call site, and in here merely check whether everything is computed already
|
||||
// BUGBUG: The decision to loop (SEQ execution) is made by the parent, but some children can be executed in PAR mode. It should be possible to detect this.
|
||||
template<class ElemType>
|
||||
void ComputationNetwork::ComputeGradient(const ComputationNodeBasePtr rootNode,
|
||||
bool bResetToOne, // true if reset the gradient of rootnode to 1.0
|
||||
|
@ -195,6 +196,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
node2->VerifyNumParallelSequences(GetNumParallelSequences());
|
||||
if (IsNodeReqMultiSeqHandling(node2))
|
||||
node2->MaskMissingGradientColumnsToZero(t);
|
||||
// TODO: exclude children that are not part of the recurrent loop, and do those below, separately.
|
||||
node2->ComputeGradientForChildren(t);
|
||||
}
|
||||
}
|
||||
|
@ -237,6 +239,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
ComputationNetwork::RecurrentInfo * ComputationNetwork::FindInRecurrentLoops(const ComputationNodeBasePtr& node)
|
||||
{
|
||||
// look in all recurrent loops of the network
|
||||
// TODO: Check for IsPartOfLoop(). Also why not store the loop id in the node for direct lookup?
|
||||
for (auto & iter : m_recurrentInfo)
|
||||
if (std::find(iter.m_recurrentNodes.begin(), iter.m_recurrentNodes.end(), node) != iter.m_recurrentNodes.end())
|
||||
return &iter;
|
||||
|
|
|
@ -1158,13 +1158,18 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
|||
#if DUMPOUTPUT
|
||||
fprintf(stderr, "Backprop%d_%ls\n", i, NodeName().c_str());
|
||||
#endif
|
||||
child->LazyZeroGradient(); // set gradient to 0 if this is the first time
|
||||
child->LazyZeroGradient(); // set gradient to 0 if this is the first time
|
||||
|
||||
// TODO: There is an inefficiency here which we should fix.
|
||||
if (IsPartOfLoop() && !child->IsPartOfLoop())
|
||||
{
|
||||
assert(!frameRange.IsAllFrames());
|
||||
static int warnings = 0;
|
||||
if (warnings++ < 20)
|
||||
fprintf (stderr, "ComputeGradientForChildren: Inefficiency: %ls %ls operation in loop propagates gradient to non-loop %ls %ls\n",
|
||||
NodeName().c_str(), OperationName().c_str(), child->NodeName().c_str(), child->OperationName().c_str());
|
||||
}
|
||||
|
||||
#if 0
|
||||
if (frameRange.IsAllFrames()) // TODO: remove this
|
||||
ComputeInputPartial(i);
|
||||
else
|
||||
#endif
|
||||
ComputeInputPartial(i, frameRange); // this computes partial wrt to the child and sums the gradient value in the child
|
||||
}
|
||||
#ifdef DISPLAY_DEBUG
|
||||
|
|
Загрузка…
Ссылка в новой задаче