bug fix: LMSequenceReader randomization must be deterministic (seed = epoch)
Parent: ba2238d215
Commit: 4209d9df10
@@ -268,8 +268,8 @@ Parameters =
 Stabilize (x, enabled=true) =
     if enabled
     then [
-        beta = Exp(ScalarParam())
-        result = Scale(beta, x)
+        beta = Exp (ScalarParam())
+        result = Scale (beta, x)
     ].result
     else x
 ]
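
The change above is whitespace only (BrainScript allows a space before the argument list), but the function itself deserves a note: Stabilize() is a scalar self-stabilizer that multiplies its input by a learned positive factor. A minimal C++ sketch of the computation it expresses (illustrative, not CNTK code):

    #include <cmath>

    // Scale x by exp(beta), where beta is a learned scalar parameter.
    // Going through Exp keeps the scale positive, and beta = 0 yields the
    // neutral scale 1, a convenient starting point for training.
    float Stabilize(float x, float beta /* learned ScalarParam */)
    {
        return std::exp(beta) * x; // Scale (Exp (beta), x)
    }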
@@ -18,14 +18,14 @@ using namespace std;
 namespace Microsoft { namespace MSR { namespace CNTK {

 #define FUNCTIONOPEN "("
-#define OPENBRACES "[{(\""
-#define CLOSINGBRACES "]})\""
+#define OPENBRACES "[{(\""    // all opening braces
+#define CLOSINGBRACES "]})\"" // and matching closing ones

 static const std::string::size_type npos = (std::string::size_type) -1;

 // These are the constants associated with the "ResolveVariables" method.
-static const char* openBraceVar = "$";
-static const char* closingBraceVar = "$";
+static const char* openBraceVar = "$";    // beginning of a var
+static const char* closingBraceVar = "$"; // end of a var
 static const char* forbiddenCharactersInVarName = ",/<>?;':\"[]{}\\|!@#%^&*()+=~` \t\n";
 static const char* forbiddenCharactersInVarNameEscapeWhitespace = ",/<>?;':\"[]{}\\|!@#%^&*()+=~` \\t\\n";
 static const std::size_t openBraceVarSize = strlen(openBraceVar);
@@ -357,23 +357,19 @@ public:
 // str - string to search
 // tokenStart - start location in the string to search
 // returns: character position of matching closing brace, string::npos if no brace present at start position
 // BUGBUG: This seems to only work for one kind of braces at a time. Nested other braces are not
 // understood. Also, braces in strings are not protected. [fseide]
-static std::string::size_type FindBraces(const std::string& str, std::string::size_type tokenStart)
+static size_t FindBraces(const std::string& str, const size_t tokenStart)
 {
     const auto len = str.length();
     // start is outside (or rather, at end of string): no brace here
     if (tokenStart >= len)
     {
         return npos;
     }

-    // open braces and quote
-    static const std::string openBraces = OPENBRACES;
-    // close braces and quote
+    static const std::string openBraces = OPENBRACES; // currently "[{(\""
     static const std::string closingBraces = CLOSINGBRACES;

-    const auto charsToLookFor = closingBraces + openBraces; // all chars we match for
+    static const auto charsToLookFor = closingBraces + openBraces; // all chars we match for

     // get brace index for first character of input string
     const auto braceFound = openBraces.find(str[tokenStart]);
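
FindBraces() returns the position of the closing brace that matches the opening brace at tokenStart. As the BUGBUG note records, it only handles one kind of brace at a time and does not protect braces inside strings. The same scan, reduced to a self-contained sketch (a hypothetical helper, not the class member above):

    #include <string>

    // Scan forward from an opening brace, counting the nesting depth of that
    // same brace kind, and return the index of the matching closing brace.
    static size_t FindMatchingBrace(const std::string& str, size_t tokenStart)
    {
        static const std::string open = "[{(";
        static const std::string close = "]})";
        if (tokenStart >= str.length())
            return std::string::npos;     // start is past the end: no brace here
        const auto kind = open.find(str[tokenStart]);
        if (kind == std::string::npos)
            return std::string::npos;     // no opening brace at this position
        size_t depth = 0;
        for (size_t i = tokenStart; i < str.length(); i++)
        {
            if (str[i] == open[kind])
                depth++;                  // another brace of the same kind opens
            else if (str[i] == close[kind] && --depth == 0)
                return i;                 // found the matching close
        }
        return std::string::npos;         // unbalanced input
    }

Like the original, this ignores other brace kinds and quoted strings, which is exactly the limitation the BUGBUG comment describes.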
@@ -218,7 +218,17 @@ void ComputationNetwork::ReadPersistableParameters(File& fstream, bool create)
     if (create) // loaded from scratch
         AddNodeToNet(node);
     else // reloaded existing
-        node->Validate(true); // nothing that propagates should have changed --TODO: have a more rigid mechanism to prevent resizing; this should only reload the model parameters
+    {
+        let old = node->GetSampleLayout();
+        let changed = ValidateNode(node, /*isFinalValidationPass=*/true);
+        if (changed)
+        {
+            let upd = node->GetSampleLayout();
+            fprintf(stderr, "ValidateSubNetwork: %ls %ls operation changed, from [%s] to [%s].", node->NodeName().c_str(), node->OperationName().c_str(),
+                    string(old).c_str(), string(upd).c_str());
+            //LogicError("ValidateSubNetwork: %ls %ls operation changed during reload or re-validation.", node->NodeName().c_str(), node->OperationName().c_str());
+        }
+    }
 }

 fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ENodeList");
@@ -165,6 +165,7 @@ public:
 private:
     void ValidateNetwork();
     void ValidateNodes(list<ComputationNodeBasePtr> nodes, bool isFinalValidationPass, size_t& todo);
+    bool ValidateNode(ComputationNodeBasePtr node, bool isFinalValidationPass) const;
     void MarkValueNonSharableNodes();

 private:
@@ -579,6 +579,36 @@ static pair<TensorShape, bool> GetDims(const ComputationNodeBasePtr& node)
     return make_pair(node->GetSampleLayout(), node->HasMBLayout());
 }

+bool ComputationNetwork::ValidateNode(ComputationNodeBasePtr node, bool isFinalValidationPass) const
+{
+    const auto& children = node->GetInputs();
+
+    // keep state
+    MBLayoutPtr oldMBLayoutPtr = node->GetMBLayout();
+    auto dim = GetDims(node);
+    vector<pair<TensorShape, bool>> childDims;
+    for (auto& child : children)
+        childDims.push_back(GetDims(child));
+    auto sampleLayout = node->GetSampleLayout();
+
+    // We do call validate(final) as many times as needed, since stuff may have changed underneath.
+    node->Validate(isFinalValidationPass /*final*/); // all nodes have been visited: do verification instead of just inference
+
+    // also take the opportunity to propagate m_needsGradient
+    auto needsGradient = node->m_needsGradient;
+    for (auto& child : children) // TODO: do we need a check that this is stable if isFinalValidationPass?
+        node->m_needsGradient |= child->m_needsGradient;
+
+    // check state --node will be valid if all nodes have been visited and node has not been updated
+    bool unchanged = true;
+    unchanged &= (oldMBLayoutPtr == node->GetMBLayout());
+    unchanged &= (dim == GetDims(node));
+    vector<pair<TensorShape, bool>> newChildDims;
+    for (auto& child : children)
+        newChildDims.push_back(GetDims(child));
+    unchanged &= (childDims == newChildDims);
+    unchanged &= (sampleLayout == node->GetSampleLayout());
+    unchanged &= (needsGradient == node->m_needsGradient);
+    return !unchanged;
+}
+
 void ComputationNetwork::ValidateNodes(list<ComputationNodeBasePtr> nodes, bool isFinalValidationPass, size_t& todo)
 {
     todo = 0; // returns how many nodes are to be redone
@@ -596,35 +626,15 @@ void ComputationNetwork::ValidateNodes(list<ComputationNodeBasePtr> nodes, bool
         }
         // if there is not at least one visited child
         bool valid = false;
-        if (hasVisitedChild || isLeaf)
+        if (hasVisitedChild || isLeaf) // got at least one child: it makes sense to call Validate()
         {
-            // got at least one child: it makes sense to call Validate()
-            // keep state
-            MBLayoutPtr oldMBLayoutPtr = node->GetMBLayout();
-            auto dim = GetDims(node);
-            vector<pair<TensorShape, bool>> childDims;
-            for (auto& child : children)
-                childDims.push_back(GetDims(child));
-            auto sampleLayout = node->GetSampleLayout();
-            // We do call validate(final) as many times as needed, since stuff may have changed underneath.
+            // TODO: PrintSelfBeforeValidation() into a function returning a string, and print all in a single line (also when it throws; print & rethrow).
             node->PrintSelfBeforeValidation();
-            node->Validate(isFinalValidationPass /*final*/); // all nodes have been visited: do verification instead of just inference
-            // also take the opportunity to propagate m_needsGradient
-            auto needsGradient = node->m_needsGradient;
-            for (auto& child : children) // TODO: do we need a check that this is stable if isFinalValidationPass?
-                node->m_needsGradient |= child->m_needsGradient;
-            // check state --node will be valid if all nodes have been visited and node has not been updated
-            bool unchanged = true;
-            unchanged &= (oldMBLayoutPtr == node->GetMBLayout());
-            unchanged &= (dim == GetDims(node));
-            vector<pair<TensorShape, bool>> newChildDims;
-            for (auto& child : children)
-                newChildDims.push_back(GetDims(child));
-            unchanged &= (childDims == newChildDims);
-            unchanged &= (sampleLayout == node->GetSampleLayout());
-            unchanged &= (needsGradient == node->m_needsGradient);
-            fprintf(stderr, "[%s%s]", string(node->GetSampleLayout()).c_str(), node->HasMBLayout() ? " x *" : "");
-            // print the new type
+            bool unchanged = !ValidateNode(node, isFinalValidationPass);
+            node->m_visited = true;
+            fprintf(stderr, " -> [%s%s]", string(node->GetSampleLayout()).c_str(), node->HasMBLayout() ? " x *" : "");
             // sanity checks
             if (isFinalValidationPass && !unchanged)
                 LogicError("ValidateSubNetwork: %ls %ls operation changed during final validation.", node->NodeName().c_str(), node->OperationName().c_str());
             if (isFinalValidationPass && !allChildrenVisited)
@@ -307,7 +307,7 @@ void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, size_t onl
     fprintfOrDie(f, "%s", sampleSeparator.c_str());
     if (j == jstop)
    {
-        fprintf(f, "..."); // 'nuff said
+        fprintf(f, "... (%d more)", (int)(jend - jstop)); // 'nuff said
         break;
     }
     for (size_t i = 0; i < iend; i++)
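
A small usability tweak: instead of a bare "...", the truncation message now reports how many samples were cut off. The pattern, reduced to a standalone sketch (hypothetical function, same message format):

    #include <cstdio>

    // Print at most `limit` values, then say how many were omitted.
    static void PrintTruncated(const float* values, size_t count, size_t limit)
    {
        const size_t shown = count < limit ? count : limit;
        for (size_t j = 0; j < shown; j++)
            printf("%13.10f ", values[j]);
        if (count > shown)
            printf("... (%d more)", (int)(count - shown));
        printf("\n");
    }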
@@ -167,8 +167,8 @@ struct ComputationNetworkOwnedNodeState
     // These are public since you are meant to set these flags manually in the debugger or temporarily poke into them from code as needed.
     bool m_traceNodeValue = false;
     bool m_traceNodeValueAsCategoryLabel = false;
-    size_t m_traceNodeValueUpToDim = 5;
-    size_t m_traceNodeValueUpToT = 5;
+    size_t m_traceNodeValueUpToDim = 3; // 3 should be enough to see simple patterns such as all values are identical or out of range
+    size_t m_traceNodeValueUpToT = 8;   // 8 time steps fit comfortably into a normal-sized console
     void EnableNodeTracing(bool isCategoryLabel) { m_traceNodeValue = true; m_traceNodeValueAsCategoryLabel = isCategoryLabel; }

 protected: // TODO: should be fully encapsulated here
@@ -1513,8 +1513,9 @@ public:
     {
         if (m_traceNodeValue)
         {
-            fprintf(stderr, "Trace --> %ls = %ls -> [%s%s]\n", NodeName().c_str(), OperationName().c_str(), string(GetSampleLayout()).c_str(), HasMBLayout() ? " x *" : "");
-            WriteMinibatchWithFormatting(stderr, m_traceNodeValueUpToDim, m_traceNodeValueUpToT, true/*transpose*/, m_traceNodeValueAsCategoryLabel, std::vector<std::string>(),
+            const auto shape = GetTensorShape(DetermineElementwiseTensorRank());
+            fprintf(stderr, "Trace --> %ls = %ls -> [%s]\n", NodeName().c_str(), OperationName().c_str(), string(shape).c_str());
+            WriteMinibatchWithFormatting(stderr, m_traceNodeValueUpToDim, m_traceNodeValueUpToT, false/*transpose*/, m_traceNodeValueAsCategoryLabel, std::vector<std::string>(),
                 ""/*sequenceSeparator*/, " "/*sequencePrologue*/, "\n"/*sequenceEpilogue*/, " "/*elementSeparator*/, "\n "/*sampleSeparator*/,
                 "%13.10f"/*valueFormatString*/);
         }
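
Taken together with the defaults above (m_traceNodeValueUpToDim = 3, m_traceNodeValueUpToT = 8), node tracing now prints the full tensor shape and an untransposed value dump. Per the comment in the header, the flags are meant to be set from the debugger or poked temporarily from code, roughly like this (assuming `node` is a ComputationNode pointer you already hold):

    node->EnableNodeTracing(/*isCategoryLabel=*/false); // dump raw values, not argmax labels
    node->m_traceNodeValueUpToDim = 3; // show the first 3 dimensions of each sample
    node->m_traceNodeValueUpToT = 8;   // and the first 8 time steps of each sequence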
@@ -167,7 +167,8 @@ public:

     // BUGBUG: I got an error when reloading persistent parameters for a model that had a dimension specified as 0, which did not get re-inferred correctly.
     // We should either simply not write this parameter out at all (since it can always be inferred), or write the tensor shape.
-    SetDims(TensorShape(rows), HasMBLayout() /*may be true on reload (roll-back)*/); // tensor shape will be overwritten in Validate() --TODO: We should serialize it here.
+    if (GetSampleLayout().GetNumElements() != rows) // legacy format: if #rows matches then assume current tensor shape is up to date
+        SetDims(TensorShape(rows), HasMBLayout() /*may be true on reload (roll-back)*/); // tensor shape will be overwritten in Validate() --TODO: We should serialize it here.
     m_delayedValue.Resize(rows, 0); // Note: If we try to access history in first minibatch, we shall crash. It would be a consequence of a missing sentence-begin flag

     if (modelVersion >= CNTK_MODEL_VERSION_2)
@@ -1578,7 +1578,7 @@ void BatchSequenceReader<ElemType>::Reset()
 {
     mProcessed.clear();
     mToProcess.clear();
-    mLastProcssedSentenceId = 0;
+    mLastProcessedSentenceId = 0;
     mPosInSentence = 0;
     mLastPosInSentence = 0;
     mNumRead = 0;
@@ -1651,6 +1651,7 @@ void BatchSequenceReader<ElemType>::StartMinibatchLoop(size_t mbSize, size_t epo
     // we use epochSize, which might not be set yet, so use a default value for allocations if not yet set
     size_t epochSize = m_epochSize == requestDataSize ? 1000 : m_epochSize;
     m_epoch = epoch;
+    m_randomSeed = (unsigned int)m_epoch;
     m_mbStartSample = epoch * m_epochSize;
     m_epochSamplesReturned = 0; // counter to know when we returned one epoch

@@ -1700,7 +1701,7 @@ size_t BatchSequenceReader<ElemType>::DetermineSequencesToProcess()
     int mp = (int) mToProcess[s];
     if (mProcessed[mp])
     {
-        mLastProcssedSentenceId = mp;
+        mLastProcessedSentenceId = mp;
         mLastPosInSentence = 0;
         allDone = true;
         break;
@@ -1722,7 +1723,7 @@ size_t BatchSequenceReader<ElemType>::DetermineSequencesToProcess()
     size_t maxToProcess = mRequestedNumParallelSequences > 0 ? mRequestedNumParallelSequences : SIZE_MAX; // if mRequestedNumParallelSequences is 0 then we go by MB size
     size_t maxTokens = mRequestedNumParallelSequences > 0 ? SIZE_MAX : m_mbSize;
     size_t numTokens = 0; // token counter
-    for (size_t seq = mLastProcssedSentenceId;
+    for (size_t seq = mLastProcessedSentenceId;
          seq < mNumRead &&                 // hit end of buffer
          mToProcess.size() < maxToProcess; // hit parallel-sequence limit
          seq++)
@@ -1791,14 +1792,14 @@ bool BatchSequenceReader<ElemType>::GetMinibatchData(size_t& /*out*/ firstPosInS
 #ifdef _MSC_VER // make some old configurations reproducible (m_cacheBlockSize used to be a constant) --TODO: remove in a few months
     if (m_cacheBlockSize == 50000)
     {
+        srand(++m_randomSeed); // TODO: older code did not have that; so no idea what random seed was used
         std::random_shuffle(m_parser.mSentenceIndex2SentenceInfo.begin(), m_parser.mSentenceIndex2SentenceInfo.end());
         // Note: random_shuffle is deprecated since C++14.
     }
     else // new configs use a wider randomization
 #endif
     {
-        std::random_device rd;
-        std::mt19937 g(rd());
+        std::mt19937 g(++m_randomSeed); // random seed is initialized to epoch, but gets incremented for intermediate reshuffles
         std::shuffle(m_parser.mSentenceIndex2SentenceInfo.begin(), m_parser.mSentenceIndex2SentenceInfo.end(), g);
     }
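
This is the heart of the fix: seeding from std::random_device made every run shuffle differently, so epoch results were not reproducible. Seeding from the epoch (set in StartMinibatchLoop() above) makes the shuffle order a pure function of the epoch number, and bumping the seed per reshuffle keeps successive cache blocks within an epoch distinct. A standalone sketch of the scheme (illustrative names, not the reader's members):

    #include <algorithm>
    #include <cstdio>
    #include <random>
    #include <vector>

    int main()
    {
        std::vector<int> sentenceIds = {0, 1, 2, 3, 4, 5, 6, 7};
        const unsigned int epoch = 3;
        unsigned int randomSeed = epoch;  // seed = epoch: rerunning epoch 3 shuffles identically
        for (int block = 0; block < 2; block++)
        {
            std::mt19937 g(++randomSeed); // incremented per reshuffle, as in the patch
            std::shuffle(sentenceIds.begin(), sentenceIds.end(), g);
            for (int id : sentenceIds)
                printf("%d ", id);
            printf("\n");              // two blocks, two different but repeatable orders
        }
        return 0;
    }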
@@ -354,7 +354,9 @@ public:
     using Base::mRequestedNumParallelSequences; // IDataReader<ElemType>

 private:
-    size_t mLastProcssedSentenceId;
+    unsigned int m_randomSeed = 0; // deterministic random seed
+
+    size_t mLastProcessedSentenceId;

     size_t mNumRead; // number of sentences in current cache block
     vector<bool> mProcessed; // [mNumRead] true if sequence has already been returned in this cache block
@@ -379,7 +381,7 @@ public:
     BatchSequenceReader()
         : m_pMBLayout(make_shared<MBLayout>())
     {
-        mLastProcssedSentenceId = 0;
+        mLastProcessedSentenceId = 0;
         mRequestedNumParallelSequences = 1;
         mLastPosInSentence = 0;
         mNumRead = 0;