This commit is contained in:
U-FAREAST\fseide 2016-03-22 09:00:59 -07:00
Parent d276fcf655 7c0143715d
Commit f188493589
72 changed files: 17813 additions and 1329 deletions

View file

@ -226,6 +226,8 @@ READER_SRC =\
$(SOURCEDIR)/Readers/ReaderLib/Bundler.cpp \
$(SOURCEDIR)/Readers/ReaderLib/NoRandomizer.cpp \
$(SOURCEDIR)/Readers/ReaderLib/ReaderShim.cpp \
$(SOURCEDIR)/Readers/ReaderLib/ChunkRandomizer.cpp \
$(SOURCEDIR)/Readers/ReaderLib/SequenceRandomizer.cpp \
$(SOURCEDIR)/Readers/ReaderLib/SampleModePacker.cpp \
COMMON_SRC =\

View file

@ -192,6 +192,7 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable)
else if (EqualInsensitive(nodeType, OperationNameOf(PerDimMeanVarDeNormalizationNode), L"PerDimMVDeNorm")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(PerDimMeanVarNormalizationNode), L"PerDimMVNorm")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(PlusNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(ReciprocalNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(RectifiedLinearNode), L"ReLU")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(ReshapeNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(RowRepeatNode))) ret = true;

View file

@ -521,8 +521,8 @@ void DoTopologyPlot(const ConfigParameters& config)
renderCmd = regex_replace(renderCmd, inputPlaceHolder, L"$1" + outputDotFile + L"$3");
renderCmd = regex_replace(renderCmd, outputPlaceHolder, L"$1" + outputFile + L"$3");
#endif
msra::strfun::ReplaceAll(renderCmd, wstring(L"<IN>"), outputDotFile);
msra::strfun::ReplaceAll(renderCmd, wstring(L"<OUT>"), outputFile);
renderCmd = msra::strfun::ReplaceAll(renderCmd, wstring(L"<IN>"), outputDotFile);
renderCmd = msra::strfun::ReplaceAll(renderCmd, wstring(L"<OUT>"), outputFile);
}

View file

@ -77,6 +77,7 @@ DiagTimes(diagonalMatrixAsColumnVector, matrix, tag='') = new ComputationNode [
// TODO: DiagTimes = ElementTimes
Dropout(activationVectorSequence, tag='') = new ComputationNode [ operation = 'Dropout' ; inputs = activationVectorSequence /*plus the function args*/ ]
ElementTimes(aMatrix, anotherMatrix, tag='') = new ComputationNode [ operation = 'ElementTimes' ; inputs = (aMatrix : anotherMatrix) /*plus the function args*/ ]
ElementDivide(aMatrix, anotherMatrix, tag='') = ElementTimes(aMatrix, Reciprocal(anotherMatrix))
ErrorPrediction(labelVectorSequence, outVectorSequence, tag='') = new ComputationNode [ operation = 'ErrorPrediction' ; inputs = (labelVectorSequence : outVectorSequence) /*plus the function args*/ ]
Exp(x, tag='') = new ComputationNode [ operation = 'Exp' ; inputs = x /*plus the function args*/ ]
GatherPacked(indexSequence, sourceData, tag='') = new ComputationNode [ operation = 'GatherPacked' ; inputs = (indexSequence : sourceData) /*plus the function args*/ ]
@ -94,6 +95,7 @@ PackedIndex(targetObject, indexSequence, tag='') = new ComputationNode [ operati
PerDimMeanVarDeNormalization(dataVectorSequence, meanVector, invStdDevVector, tag='') = new ComputationNode [ operation = 'PerDimMeanVarDeNormalization' ; inputs = (dataVectorSequence : meanVector : invStdDevVector) /*plus the function args*/ ]
PerDimMeanVarNormalization(dataVectorSequence, meanVector, invStdDevVector, tag='') = new ComputationNode [ operation = 'PerDimMeanVarNormalization' ; inputs = (dataVectorSequence : meanVector : invStdDevVector) /*plus the function args*/ ]
Plus(leftMatrix, rightMatrix, tag='') = new ComputationNode [ operation = 'Plus' ; inputs = (leftMatrix : rightMatrix) /*plus the function args*/ ]
Reciprocal(z, tag='') = new ComputationNode [ operation = 'Reciprocal' ; inputs = z /*plus the function args*/ ]
RectifiedLinear(z, tag='') = new ComputationNode [ operation = 'RectifiedLinear' ; inputs = z /*plus the function args*/ ]
Scale(scalarScalingFactor, matrix, tag='') = new ComputationNode [ operation = 'Scale' ; inputs = (scalarScalingFactor : matrix) /*plus the function args*/ ]
// TODO: Scale = ElementTimes

View file

@ -50,7 +50,8 @@ public:
ComputationNetwork() :
m_randomSeedOffset(0),
m_isCompiled(false),
m_areMatricesAllocated(false),
m_pMBLayout(make_shared<MBLayout>()),
m_environment(make_shared<ComputationEnvironment>())
{
@ -180,6 +181,7 @@ private:
void CollectInputAndLearnableParameters(const ComputationNodeBasePtr& rootNode);
void CollectInputAndLearnableParametersRec(const ComputationNodeBasePtr& node, set<ComputationNodeBasePtr>& visited, list<ComputationNodeBasePtr>& inputs, list<ComputationNodeBasePtr>& learnableParameters);
bool IsCompiled() const { return m_isCompiled; }
bool AreMatricesAllocated() const { return m_areMatricesAllocated; }
void VerifyIsCompiled(const char* where) const;
public:
void AllocateAllMatrices(const std::vector<ComputationNodeBasePtr>& evalRootNodes, const std::vector<ComputationNodeBasePtr>& outValueRootNodes, ComputationNodeBasePtr trainRootNode);
@ -418,7 +420,6 @@ public:
const double& wp = 0.0f,
const double& bMMIfactor = 0.0f,
const bool& sMBR = false);
static void SetMaxTempMemSizeForCNN(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const size_t maxTempMemSizeInSamples);
// -----------------------------------------------------------------------
@ -518,6 +519,7 @@ public:
return m_nameToNodeMap.size();
}
std::vector<ComputationNodeBasePtr> GetAllNodes() const
{
std::vector<ComputationNodeBasePtr> nodes;
@ -668,7 +670,6 @@ public:
m_nameToNodeMap.erase(node->NodeName());
return node;
}
public:
// -----------------------------------------------------------------------
// evaluation
@ -950,7 +951,6 @@ private:
// environment information that nodes may want to inquire, e.g. to know whether we are training
ComputationEnvironmentPtr m_environment;
private:
// -----------------------------------------------------------------------
// the following members are all result of post-processing by CompileNetwork()
@ -964,6 +964,7 @@ private:
// cache for evaluation ordering:
bool m_isCompiled; // CompileNetwork has been called
bool m_areMatricesAllocated; // AllocateAllMatrices has been called
// cached network iterations
std::map<const ComputationNodeBasePtr, std::list<ComputationNodeBasePtr>> m_evalOrders; // [out node] flat depth-first traversal starting from out node
@ -993,4 +994,4 @@ template class Matrix<double>;
// - automatic inference of time window w.r.t. delay nodes (and related nodes such as a temporal pooling)
// - have overrides of RuntimeError etc. in ComputationNode, which prepend the error string with the node name and operation
}}}
} } }

View file

@ -77,6 +77,7 @@ static shared_ptr<ComputationNode<ElemType>> CreateStandardNode(const std::wstri
else if (nodeType == OperationNameOf(PassNode)) return New<PassNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(PlusNode)) return New<PlusNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ReconcileMBLayoutNode)) return New<ReconcileMBLayoutNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ReciprocalNode)) return New<ReciprocalNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(RectifiedLinearNode)) return New<RectifiedLinearNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ReshapeNode)) return New<ReshapeNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(RowRepeatNode)) return New<RowRepeatNode<ElemType>>(forward<_Types>(_Args)...);
@ -472,6 +473,12 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::LogSo
return net.AddNodeToNetAndAttachInputs(New<LogSoftmaxNode<ElemType>>(net.GetDeviceId(), nodeName), { a });
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Reciprocal(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ReciprocalNode<ElemType>>(net.GetDeviceId(), nodeName), a);
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Sqrt(const ComputationNodePtr a, const std::wstring nodeName)
{

View file

@ -112,6 +112,7 @@ public:
ComputationNodePtr PerDimMeanVarDeNormalization(const ComputationNodePtr feature, const ComputationNodePtr mean, const ComputationNodePtr InvStdDev, const std::wstring nodeName = L"");
ComputationNodePtr PerDimMeanVarNormalization(const ComputationNodePtr feature, const ComputationNodePtr mean, const ComputationNodePtr InvStdDev, const std::wstring nodeName = L"");
ComputationNodePtr Plus(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr Reciprocal(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr RectifiedLinear(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr Reshape(const ComputationNodePtr a, const TensorShape& imageLayout, const std::wstring nodeName = L"");
ComputationNodePtr RowRepeat(const ComputationNodePtr a, const size_t num_repeat, const std::wstring nodeName = L"");

View file

@ -671,50 +671,60 @@ size_t ComputationNetwork::ValidateNodes(list<ComputationNodeBasePtr> nodes, boo
void ComputationNetwork::MarkValueNonSharableNodes()
{
const auto& nodes = GetEvalOrder(nullptr);
std::map<wstring, bool> allLeafDescendentsAreParameters;
std::map<wstring, bool> allLeafDescendentsAreParametersOrPreComputeNodes;
std::list<ComputationNodeBasePtr> allLearnableParameters = GetNodesWithType(OperationNameOf(LearnableParameter));
// note: we cannot use m_learnableParameters because we need all parameter nodes, regardless of whether they require updates or not
std::list<ComputationNodeBasePtr> allPreComputeNodes;
for (const auto& node : nodes)
{
auto pcnode = dynamic_pointer_cast<IPreComputeNode>(node);
if (pcnode)
allPreComputeNodes.push_back(node);
}
for (auto& node : nodes)
{
auto children = node->GetInputs();
wstring myname = node->NodeName();
bool allParameters = true;
bool allParametersOrPreComputeNodes = true;
if (children.size()) // we don't do the check for leaf nodes, because all the possible leaf nodes (input/parameter/precompute nodes) are marked as non-sharable already
{
for (auto child : children)
if (std::find(allPreComputeNodes.begin(), allPreComputeNodes.end(), node) == allPreComputeNodes.end())
{
wstring ChildName = child->NodeName();
if (allLeafDescendentsAreParameters.find(ChildName) == allLeafDescendentsAreParameters.end())
for (auto child : children)
{
// not found, means it is a leaf node (we are at eval order )
assert(child->IsLeaf() || child->IsPartOfLoop());
if (std::find(allLearnableParameters.begin(), allLearnableParameters.end(), child) != allLearnableParameters.end())
wstring ChildName = child->NodeName();
if (allLeafDescendentsAreParametersOrPreComputeNodes.find(ChildName) == allLeafDescendentsAreParametersOrPreComputeNodes.end())
{
allLeafDescendentsAreParameters[ChildName] = true;
// not found, means it is a leaf node (we are at eval order )
assert(child->IsLeaf() || child->IsPartOfLoop());
if (std::find(allLearnableParameters.begin(), allLearnableParameters.end(), child) != allLearnableParameters.end())
{
allLeafDescendentsAreParametersOrPreComputeNodes[ChildName] = true;
}
else
{
allParametersOrPreComputeNodes = false;
allLeafDescendentsAreParametersOrPreComputeNodes[ChildName] = false;
break;
}
}
else
{
allParameters = false;
allLeafDescendentsAreParameters[ChildName] = false;
break;
}
}
else
{
if (allLeafDescendentsAreParameters[ChildName] == false)
{
allParameters = false;
break;
if (allLeafDescendentsAreParametersOrPreComputeNodes[ChildName] == false)
{
allParametersOrPreComputeNodes = false;
break;
}
}
}
}
allLeafDescendentsAreParameters[myname] = allParameters;
if (allParameters)
{
allLeafDescendentsAreParametersOrPreComputeNodes[myname] = allParametersOrPreComputeNodes;
if (allParametersOrPreComputeNodes)
node->MarkValueNonSharable();
}
}
}
}
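The pass above renames and extends the old allLeafDescendentsAreParameters map: walking nodes in evaluation order, a node's value is marked non-sharable when every leaf it transitively depends on is a learnable parameter or a precompute node. A rough standalone sketch of that bottom-up propagation (toy Node type, not CNTK's ComputationNode; illustrative only):

#include <cstdio>
#include <initializer_list>
#include <map>
#include <string>
#include <vector>

// Toy graph node: leaves are either parameter/precompute leaves or ordinary data inputs.
struct Node { std::string name; bool isParameterOrPreCompute; std::vector<const Node*> inputs; };

int main()
{
    Node W{"W", true, {}}, X{"X", false, {}};   // W: parameter leaf, X: data input leaf
    Node S{"S", false, {&W}};                   // depends only on the parameter
    Node Z{"Z", false, {&S, &X}};               // also depends on the data input
    std::map<std::string, bool> allLeafDescendantsAreParams;
    for (const Node* n : {&W, &X, &S, &Z})      // evaluation order: inputs before consumers
    {
        bool all = true;
        if (n->inputs.empty())
            all = n->isParameterOrPreCompute;   // leaves: taken from their own kind
        else
            for (const Node* in : n->inputs)    // interior nodes: AND over already-visited children
                all = all && allLeafDescendantsAreParams[in->name];
        allLeafDescendantsAreParams[n->name] = all;
        if (all && !n->inputs.empty())
            printf("%s: value can be marked non-sharable\n", n->name.c_str()); // prints only S
    }
    return 0;
}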
@ -727,6 +737,9 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
const std::vector<ComputationNodeBasePtr>& outValueRootNodes,
ComputationNodeBasePtr trainRootNode)
{
if (AreMatricesAllocated())
return;
// Allocate memory for forward/backward computation
fprintf(stderr, "\n\nAllocating matrices for forward and/or backward propagation.\n");
@ -859,6 +872,8 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
}
}
}
m_areMatricesAllocated = true;
}
void ComputationNetwork::ReleaseMatricesAfterEvalForChildren(ComputationNodeBasePtr n, std::unordered_map<ComputationNodeBasePtr, int>& parentCount)

View file

@ -70,6 +70,57 @@ public:
template class PlusNode<float>;
template class PlusNode<double>;
// -----------------------------------------------------------------------
// LogPlusNode (summand1, summand2)
// -----------------------------------------------------------------------
template <class ElemType>
class LogPlusNode : public BinaryElementWiseNode<ElemType>
{
typedef BinaryElementWiseNode<ElemType> Base;
UsingBinaryElementwiseNodeBaseMembers;
static const std::wstring TypeName()
{
return L"LogPlus";
}
public:
DeclareConstructorFromConfigWithNumInputs(LogPlusNode);
LogPlusNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name)
{
}
virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
{
size_t rank = DetermineElementwiseTensorRank();
auto result = ValueTensorFor(rank, fr);
auto input0 = Input(0)->ValueTensorFor(rank, fr.AllowBroadcast());
auto input1 = Input(1)->ValueTensorFor(rank, fr.AllowBroadcast());
result.AssignLogSumOf(input0, input1);
}
virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override
{
size_t rank = DetermineElementwiseTensorRank();
auto gradient = GradientTensorFor(rank, fr);
auto inputGradient = Input(inputIndex)->GradientTensorFor(rank, fr.AllowBroadcast());
auto input0 = Input(0)->ValueTensorFor(rank, fr.AllowBroadcast());
auto input1 = Input(1)->ValueTensorFor(rank, fr.AllowBroadcast());
// if reduction then mask the respective input(s) (zero out the gaps)
if (Input(inputIndex)->ReducesInTimeWrt(shared_from_this()))
MaskMissingGradientColumnsToZero(fr);
if (Input(inputIndex)->ReducesInTimeWrt(Input(1 - inputIndex)))
Input(1 - inputIndex)->MaskMissingValueColumnsToZero(fr);
inputGradient.AddElementwiseProductWithLogSumDerivativeOf(gradient, input0, input1);
}
};
template class LogPlusNode<float>;
template class LogPlusNode<double>;
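LogPlusNode computes an elementwise log-sum: the forward pass is z = log(exp(x) + exp(y)) (AssignLogSumOf), and the gradient factor dz/dx = exp(x)/(exp(x)+exp(y)) reduces to a sigmoid of the difference of the inputs, which is what the new ElementwiseProductWithLogSumDerivative op added further down in this commit encodes. A quick standalone scalar check of that identity (illustrative only, not CNTK code):

#include <cassert>
#include <cmath>

static double LogAdd(double x, double y) { return std::log(std::exp(x) + std::exp(y)); } // forward op
static double Sigmoid(double x)          { return 1.0 / (1.0 + std::exp(-x)); }

int main()
{
    double x = 0.3, y = -1.2, eps = 1e-6;
    // d/dx log(exp(x) + exp(y)) == sigmoid(x - y); by symmetry, d/dy == sigmoid(y - x).
    double numeric = (LogAdd(x + eps, y) - LogAdd(x - eps, y)) / (2 * eps);
    assert(std::fabs(numeric - Sigmoid(x - y)) < 1e-5);
    return 0;
}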
// -----------------------------------------------------------------------
// MinusNode (minuend, subtrahend)
// -----------------------------------------------------------------------

View file

@ -145,6 +145,7 @@ DeclareUnaryElementWiseWithOpCodeNode(Cosine, Cosine, Elementw
DeclareUnaryElementWiseWithOpCodeNode(Abs, Abs, ElementwiseProductWithAbsDerivative, BinaryWithInputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Negate, Negate, Negate, UnaryGradient);
DeclareUnaryElementWiseWithOpCodeNode(Sqrt, Sqrt, ElementwiseProductWithSqrtDerivative, BinaryWithOutputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Reciprocal, Reciprocal, ElementwiseProductWithReciprocalDerivative, BinaryWithOutputGradient);
#pragma pop_macro("DeclareUnaryElementWiseWithOpCodeNode")

View file

@ -76,6 +76,7 @@ enum ElementWiseOperator
opNegate,
opNot,
opAbs,
opReciprocal,
opSigmoid,
opTanh,
opSqr,
@ -113,12 +114,14 @@ enum ElementWiseOperator
opElementwiseProductWithCosDerivative,
opElementwiseProductWithAbsDerivative,
opElementwiseProductWithSqrtDerivative,
opElementwiseProductWithReciprocalDerivative,
opSqrOfDifference,
// binary ops for indexing
// opIndex,
// ternary
opCond /*a ? b : c*/,
opClip /*clip a within interval b..c*/
opClip, /*clip a within interval b..c*/
opElementwiseProductWithLogSumDerivative
// Note: not all that's implemented in CNTK ComputationNodes has an opcode yet.
};
@ -131,6 +134,7 @@ enum ElementWiseOperator
Macro(Negate); \
Macro(Not); \
Macro(Abs); \
Macro(Reciprocal); \
Macro(Sigmoid); \
Macro(Tanh); \
Macro(Sqr); \
@ -164,13 +168,15 @@ enum ElementWiseOperator
Macro(ElementwiseProductWithLogDerivativeFromOutput); \
Macro(ElementwiseProductWithCosDerivative); \
Macro(ElementwiseProductWithAbsDerivative); \
Macro(ElementwiseProductWithReciprocalDerivative); \
Macro(ElementwiseProductWithSqrtDerivative); \
Macro(SqrOfDifference); \
//Macro(Index);
#define ForAllTernaryOps(Macro) \
Macro(Cond); \
Macro(Clip);
#define ForAllTernaryOps(Macro) \
Macro(Cond); \
Macro(Clip); \
Macro(ElementwiseProductWithLogSumDerivative);
// -----------------------------------------------------------------------
// various enums to describe

View file

@ -202,6 +202,7 @@ DefUnaryOp(Exp, exp_(a));
DefUnaryOp(Log, ClippedLog(a));
DefUnaryOp(LinearRectifier, a > 0 ? a : 0);
DefUnaryOp(Cosine, cos_(a));
DefUnaryOp(Reciprocal, a == 0 ? 0 : 1 / a);
#pragma pop_macro("DefUnaryOp")
#pragma push_macro("DefBinaryOp")
@ -236,6 +237,7 @@ DefBinaryOp(ElementwiseProductWithLinearRectifierDerivativeFromOutput, b > 0 ? a
DefBinaryOp(ElementwiseProductWithLogDerivativeFromOutput, a* exp_(-b));
DefBinaryOp(ElementwiseProductWithCosDerivative, a * -sin_(b)); // note: b = input for cos()
DefBinaryOp(ElementwiseProductWithAbsDerivative, a * Sgn(b)); // note: b = input for abs()
DefBinaryOp(ElementwiseProductWithReciprocalDerivative, a * -Sqr(b)); // b = output
DefBinaryOp(ElementwiseProductWithSqrtDerivative, a / (2 * b)); // b = output; d/dx sqrt(x) = 1/(2 * sqrt(x)) --> note this is the same as ElementwiseQuotient w a constant; if more show up like this we should add more template params
DefBinaryOp(SqrOfDifference, Sqr(a - b));
//DefBinaryOp(Index, IndexElement(a, b, i)); // note: this one uses the third argument
@ -252,9 +254,9 @@ DefBinaryOp(SqrOfDifference, Sqr(a - b));
DefTernaryOp(Cond, a ? b : c);
DefTernaryOp(Clip, a < b ? b : (a > c ? c : a));
DefTernaryOp(ElementwiseProductWithLogSumDerivative, a * Sigmoid(c - b));
#pragma pop_macro("DefTernaryOp")
}
}
}
}}}
#pragma pop_macro("DECL")
#pragma pop_macro("TENSOR_OPS_DECL")

View file

@ -11,9 +11,6 @@
namespace Microsoft { namespace MSR { namespace CNTK {
typedef size_t SequenceId;
typedef size_t TimelineOffset;
// Stream (input) metadata. This text-reader specific descriptor adds two
// additional fields: stream alias (name prefix in each sample) and expected
// sample dimension.
@ -29,14 +26,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// locate and retrieve a sequence from file, given a sequence descriptor.
struct SequenceDescriptor : SequenceDescription
{
SequenceDescriptor()
SequenceDescriptor() : SequenceDescription({}), m_fileOffsetBytes(0),
m_byteSize(0)
{
m_id = 0;
m_numberOfSamples = 0;
m_chunkId = 0;
m_isValid = false;
m_fileOffsetBytes = 0;
m_byteSize = 0;
}
// size_t m_numberOfSamples -- number of samples in the sequence (largest count among all inputs)
// in case of text data this value == number of rows this sequence spans over.
@ -47,31 +39,23 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Chunk metadata, similar to the sequence descriptor above,
// but used to facilitate indexing and retrieval of blobs of input data of
// some user-specified size.
struct ChunkDescriptor
{
size_t m_id;
struct ChunkDescriptor : ChunkDescription
{
ChunkDescriptor() : ChunkDescription({}), m_byteSize(0) {}
// TODO: if we don't want to keep the whole index
// (metadata for all sequences in memory), we should not
// leave this empty when building a chunk index, and only
// fill it out when the chunk needs to be loaded
// (the indexer will have to do a second pass for this chunk).
std::vector<SequenceDescriptor> m_sequences;
size_t m_byteSize; // size in bytes
size_t m_numSequences; // number of sequences in this chunk
TimelineOffset m_timelineOffset; // offset into the timeline -- timeline index of
// the very first sequence from this chunk.
};
// The index comprises two timelines with different granularities. One
// is a collection of sequences, the other -- of chunks.
// TODO: needs to be refactored to support partial timeline.
struct Index
{
Index(bool hasSequenceIds,
std::vector<SequenceDescriptor> timeline,
std::vector<ChunkDescriptor> chunks)
: m_hasSequenceIds(hasSequenceIds), m_timeline(timeline), m_chunks(chunks)
{
}
typedef shared_ptr<ChunkDescriptor> ChunkDescriptorPtr;
bool m_hasSequenceIds; // true when input contains sequence id column
std::vector<SequenceDescriptor> m_timeline;
std::vector<ChunkDescriptor> m_chunks;
};
typedef std::shared_ptr<Index> IndexPtr;
}}}
// A collection of chunk descriptors, each containing
// a collection of sequence descriptors for the corresponding
// chunk of the input data.
typedef std::vector<ChunkDescriptor> Index;
}}}

View file

@ -21,14 +21,13 @@ Indexer::Indexer(FILE* file, bool skipSequenceIds, size_t chunkSize) :
m_bufferEnd(nullptr),
m_pos(nullptr),
m_done(false),
m_skipSequenceIds(skipSequenceIds),
m_hasSequenceIds(!skipSequenceIds),
m_maxChunkSize(chunkSize)
{
if (m_file == nullptr)
{
RuntimeError("Input file not open for reading");
}
m_chunks.push_back({});
}
void Indexer::RefillBuffer()
@ -53,7 +52,7 @@ void Indexer::RefillBuffer()
}
}
void Indexer::UpdateTimeline(SequenceDescriptor& sd)
void Indexer::AddSequence(SequenceDescriptor& sd)
{
assert(!m_chunks.empty());
ChunkDescriptor* chunk = &m_chunks.back();
@ -62,19 +61,18 @@ void Indexer::UpdateTimeline(SequenceDescriptor& sd)
m_chunks.push_back({});
chunk = &m_chunks.back();
chunk->m_id = m_chunks.size() - 1;
chunk->m_timelineOffset = m_timeline.size();
}
chunk->m_byteSize += sd.m_byteSize;
chunk->m_numSequences++;
chunk->m_numberOfSequences++;
chunk->m_numberOfSamples += sd.m_numberOfSamples;
sd.m_chunkId = chunk->m_id;
m_timeline.push_back(sd);
chunk->m_sequences.push_back(sd);
}
IndexPtr Indexer::BuildFromLines()
void Indexer::BuildFromLines()
{
assert(m_pos == m_bufferStart);
m_skipSequenceIds = true;
m_hasSequenceIds = false;
size_t lines = 0;
int64_t offset = GetFileOffset();
while (!m_done)
@ -90,7 +88,7 @@ IndexPtr Indexer::BuildFromLines()
offset = GetFileOffset() + 1;
sd.m_byteSize = offset - sd.m_fileOffsetBytes;
// TODO: ignore empty lines.
UpdateTimeline(sd);
AddSequence(sd);
++m_pos;
++lines;
}
@ -99,16 +97,23 @@ IndexPtr Indexer::BuildFromLines()
RefillBuffer();
}
}
return make_shared<Index>(
!m_skipSequenceIds,
std::move(m_timeline),
std::move(m_chunks));
}
IndexPtr Indexer::Build()
void Indexer::Build()
{
if (!m_chunks.empty())
{
return;
}
if (m_maxChunkSize > 0)
{
auto fileSize = filesize(m_file);
m_chunks.reserve((fileSize + m_maxChunkSize - 1) / m_maxChunkSize);
}
m_chunks.push_back({});
RefillBuffer(); // read the first block of data
if (m_done)
{
@ -125,10 +130,11 @@ IndexPtr Indexer::Build()
}
// check the first byte and decide what to do next
if (m_skipSequenceIds || m_bufferStart[0] == NAME_PREFIX)
if (!m_hasSequenceIds || m_bufferStart[0] == NAME_PREFIX)
{
// skip sequence id parsing, treat lines as individual sequences
return BuildFromLines();
BuildFromLines();
return;
}
size_t id = 0;
@ -154,7 +160,7 @@ IndexPtr Indexer::Build()
{
// found a new sequence, which starts at the [offset] bytes into the file
sd.m_byteSize = offset - sd.m_fileOffsetBytes;
UpdateTimeline(sd);
AddSequence(sd);
sd = {};
sd.m_id = id;
sd.m_fileOffsetBytes = offset;
@ -164,12 +170,7 @@ IndexPtr Indexer::Build()
// calculate the byte size for the last sequence
sd.m_byteSize = m_fileOffsetEnd - sd.m_fileOffsetBytes;
UpdateTimeline(sd);
return make_shared<Index>(
!m_skipSequenceIds,
std::move(m_timeline), // TODO: shrink_to_fit?
std::move(m_chunks));
AddSequence(sd);
}
@ -236,4 +237,4 @@ bool Indexer::GetNextSequenceId(size_t& id)
" at the offset = %" PRIi64 "\n", GetFileOffset());
}
}}}

View file

@ -19,10 +19,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
class Indexer
{
public:
Indexer(FILE* file, bool skipSequenceIds, size_t chunkSize = 32 * 1024 * 1024);
Indexer(FILE* file, bool skipSequenceIds = false, size_t chunkSize = 32 * 1024 * 1024);
// Reads the input file building an index of sequence metadata.
IndexPtr Build();
// Reads the input file, building an index of chunks and corresponding
// sequences.
void Build();
// Returns input data index (chunk and sequence metadata)
const Index& GetIndex() const { return m_chunks; }
// True, when input does not have the sequence id column
// or when sequence id column was ignored during indexing
// (by passing skipSequenceIds = true to the constructor).
bool HasSequenceIds() const { return m_hasSequenceIds; }
private:
FILE* m_file;
@ -37,18 +46,18 @@ private:
bool m_done; // true, when all input was processed
bool m_skipSequenceIds; // true, when input contains one sequence per line
// and sequence id column can be skipped.
bool m_hasSequenceIds; // true, when input contains one sequence per line
// or when sequence id column was ignored during indexing.
const size_t m_maxChunkSize; // maximum permitted chunk size;
std::vector<SequenceDescriptor> m_timeline; // a collection of sequence descriptors
std::vector<ChunkDescriptor> m_chunks; // a collection of chunk descriptors
// Assigns an appropriate chunk id to the sequence descriptor,
// Adds sequence (metadata) to the index. Additionally, it
// assigns an appropriate chunk id to the sequence descriptor,
// ensures that chunks do not exceed the maximum allowed size
// (except when a sequence size is greater than the maximum chunk size)
void UpdateTimeline(SequenceDescriptor& sd);
void AddSequence(SequenceDescriptor& sd);
// fills up the buffer with data from file, all previously buffered data
// will be overwritten.
@ -64,9 +73,10 @@ private:
// Otherwise, writes sequence id value to the provided reference, returns true.
bool GetNextSequenceId(size_t& id);
// Builds timeline, treating each line as an individual sequence.
// Does not do any sequence parsing, instead uses line number as the corresponding sequence id.
IndexPtr BuildFromLines();
// Build a chunk/sequence index, treating each line as an individual sequence.
// Does not do any sequence parsing, instead uses line number as
// the corresponding sequence id.
void BuildFromLines();
// Returns current offset in the input file (in bytes).
int64_t GetFileOffset() const { return m_fileOffsetStart + (m_pos - m_bufferStart); }
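Taken together, the reworked Indexer no longer hands back a shared Index object; it owns a vector of ChunkDescriptors (each holding its SequenceDescriptors) and exposes it through GetIndex() and HasSequenceIds(). A hedged usage sketch against the API as it appears in this diff; the file handling and printing are illustrative and assume the ReaderLib headers are available:

#include <cstdio>
#include "Descriptors.h"
#include "Indexer.h"

using namespace Microsoft::MSR::CNTK;

void DumpIndex(FILE* input)
{
    Indexer indexer(input);                   // skipSequenceIds defaults to false, chunk size to 32 MB
    indexer.Build();                          // reads the file once, building chunk/sequence descriptors
    const Index& index = indexer.GetIndex();
    printf("sequence ids present: %s, chunks: %u\n",
           indexer.HasSequenceIds() ? "yes" : "no", (unsigned)index.size());
    for (const ChunkDescriptor& chunk : index)
        printf("chunk %u: %u sequences, %u samples, %u bytes\n",
               (unsigned)chunk.m_id, (unsigned)chunk.m_numberOfSequences,
               (unsigned)chunk.m_numberOfSamples, (unsigned)chunk.m_byteSize);
}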

View file

@ -33,9 +33,10 @@ public:
explicit TextDataChunk(const ChunkDescriptor& descriptor);
// Gets sequences by id.
std::vector<SequenceDataPtr> GetSequence(size_t sequenceId) override;
void GetSequence(size_t sequenceId, std::vector<SequenceDataPtr>& result) override;
std::map<size_t, std::vector<SequenceDataPtr>> m_sequencePtrMap;
// Buffer to store the actual data.
std::vector<SequenceBuffer> m_sequences;
@ -72,6 +73,9 @@ TextParser<ElemType>::TextParser(const std::wstring& filename, const vector<Stre
m_filename(filename),
m_file(nullptr),
m_streamInfos(streams.size()),
m_indexer(nullptr),
m_fileOffsetStart(0),
m_fileOffsetEnd(0),
m_buffer(new char[BUFFER_SIZE + 1]),
m_bufferStart(nullptr),
m_bufferEnd(nullptr),
@ -84,13 +88,6 @@ TextParser<ElemType>::TextParser(const std::wstring& filename, const vector<Stre
{
assert(streams.size() > 0);
m_file = fopenOrDie(m_filename, L"rbS");
if (funicode(m_file))
{
RuntimeError("Found a UTF-16 BOM at the beginning of the input file %ls. "
"UTF-16 encoding is currently not supported.", m_filename.c_str());
}
m_maxAliasLength = 0;
for (size_t i = 0; i < streams.size(); ++i)
@ -127,17 +124,33 @@ TextParser<ElemType>::~TextParser()
template <class ElemType>
void TextParser<ElemType>::Initialize()
{
if (m_index)
if (m_indexer != nullptr)
{
return;
}
m_index = Indexer(m_file, m_skipSequenceIds).Build();
attempt(5, [this]()
{
m_file = fopenOrDie(m_filename, L"rbS");
});
if (funicode(m_file))
{
RuntimeError("Found a UTF-16 BOM at the beginning of the input file %ls. "
"UTF-16 encoding is currently not supported.", m_filename.c_str());
}
m_indexer = make_unique<Indexer>(m_file, m_skipSequenceIds, m_chunkSizeBytes);
attempt(5, [this]()
{
m_indexer->Build();
});
// it's still possible that the actual input data does not have sequence id column.
m_skipSequenceIds = !m_index->m_hasSequenceIds;
m_skipSequenceIds = !m_indexer->HasSequenceIds();
assert(m_index);
assert(m_indexer != nullptr);
int64_t position = _ftelli64(m_file);
if (position == -1L)
@ -150,48 +163,65 @@ void TextParser<ElemType>::Initialize()
}
template <class ElemType>
vector<StreamDescriptionPtr> TextParser<ElemType>::GetStreamDescriptions() const
ChunkDescriptions TextParser<ElemType>::GetChunkDescriptions()
{
return m_streams;
}
assert(m_indexer != nullptr);
template <class ElemType>
size_t TextParser<ElemType>::GetTotalNumberOfChunks()
{
return m_index->m_chunks.size();
}
const auto& index = m_indexer->GetIndex();
template <class ElemType>
void TextParser<ElemType>::FillSequenceDescriptions(SequenceDescriptions& timeline) const
{
timeline.resize(m_index->m_timeline.size());
std::transform(
m_index->m_timeline.begin(),
m_index->m_timeline.end(),
timeline.begin(),
[](const SequenceDescription& desc)
ChunkDescriptions result;
result.reserve(index.size());
for (auto const& chunk : index)
{
return &desc;
});
result.push_back(shared_ptr<ChunkDescription>(
new ChunkDescription {
chunk.m_id,
chunk.m_numberOfSamples,
chunk.m_numberOfSequences
}));
}
return result;
}
template <class ElemType>
TextParser<ElemType>::TextDataChunk::TextDataChunk(const ChunkDescriptor& descriptor) :
m_sequences(descriptor.m_numSequences)
void TextParser<ElemType>::GetSequencesForChunk(size_t chunkId, std::vector<SequenceDescription>& result)
{
const auto& index = m_indexer->GetIndex();
const auto& chunk = index[chunkId];
result.reserve(chunk.m_sequences.size());
for (auto const& s : chunk.m_sequences)
{
result.push_back(
{
s.m_id,
s.m_numberOfSamples,
s.m_chunkId,
s.m_isValid,
s.m_key
});
}
}
template <class ElemType>
TextParser<ElemType>::TextDataChunk::TextDataChunk(const ChunkDescriptor& descriptor)
{
m_id = descriptor.m_id;
m_sequenceRequestCount = 0;
m_sequences.reserve(descriptor.m_numberOfSequences);
}
template <class ElemType>
vector<SequenceDataPtr> TextParser<ElemType>::TextDataChunk::GetSequence(size_t sequenceId)
void TextParser<ElemType>::TextDataChunk::GetSequence(size_t sequenceId, std::vector<SequenceDataPtr>& result)
{
auto it = m_sequencePtrMap.find(sequenceId);
assert(it != m_sequencePtrMap.end());
//TODO: Remove pragma once new randomizer is in master.
#pragma omp atomic
++m_sequenceRequestCount;
return it->second;
result.reserve(it->second.size());
copy(it->second.begin(), it->second.end(), back_inserter(result));
}
template <class ElemType>
@ -208,7 +238,7 @@ ChunkPtr TextParser<ElemType>::GetChunk(size_t chunkId)
}
else
{
const auto& chunkDescriptor = m_index->m_chunks[chunkId];
const auto& chunkDescriptor = m_indexer->GetIndex()[chunkId];
auto textChunk = make_shared<TextDataChunk>(chunkDescriptor);
attempt(5, [this, &textChunk, &chunkDescriptor]()
@ -251,13 +281,10 @@ ChunkPtr TextParser<ElemType>::GetChunk(size_t chunkId)
template <class ElemType>
void TextParser<ElemType>::LoadChunk(TextChunkPtr& chunk, const ChunkDescriptor& descriptor)
{
vector<SequenceBuffer> sequences(descriptor.m_numSequences);
for (size_t i = 0; i < descriptor.m_numSequences; ++i)
for (const auto& sequenceDescriptor : descriptor.m_sequences)
{
size_t offset = descriptor.m_timelineOffset + i;
const auto& sequenceDescriptor = m_index->m_timeline[offset];
chunk->m_sequences[i] = move(LoadSequence(!m_skipSequenceIds, sequenceDescriptor));
const auto& sequenceData = chunk->m_sequences[i];
chunk->m_sequences.push_back(LoadSequence(!m_skipSequenceIds, sequenceDescriptor));
const auto& sequenceData = chunk->m_sequences.back();
vector<SequenceDataPtr> sequencePtrs(m_streamInfos.size());
for (size_t j = 0; j < m_streamInfos.size(); ++j)
{
@ -270,8 +297,7 @@ void TextParser<ElemType>::LoadChunk(TextChunkPtr& chunk, const ChunkDescriptor&
data->m_sampleLayout = m_streams[j]->m_sampleLayout;
data->m_numberOfSamples = input->m_numberOfSamples;
data->m_chunk = chunk;
// TODO: add m_id to the sequence data
//data->m_id = sequenceDescriptor.m_id;
data->m_id = sequenceDescriptor.m_id;
sequencePtrs[j] = data;
}
else
@ -292,8 +318,7 @@ void TextParser<ElemType>::LoadChunk(TextChunkPtr& chunk, const ChunkDescriptor&
}
data->m_chunk = chunk;
// TODO: add m_id to the sequence data
//data->m_id = sequenceDescriptor.m_id;
data->m_id = sequenceDescriptor.m_id;
sequencePtrs[j] = data;
}
}

View file

@ -8,9 +8,11 @@
#include "DataDeserializerBase.h"
#include "Descriptors.h"
#include "TextConfigHelper.h"
#include "Indexer.h"
namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
class CNTKTextFormatReaderTestRunner;
// TODO: more details when tracing warnings
@ -18,30 +20,28 @@ class CNTKTextFormatReaderTestRunner;
template <class ElemType>
class TextParser : public DataDeserializerBase {
public:
TextParser(const TextConfigHelper& helper);
explicit TextParser(const TextConfigHelper& helper);
~TextParser();
// Builds an index of the input data.
void Initialize();
// Description of streams that this data deserializer provides.
std::vector<StreamDescriptionPtr> GetStreamDescriptions() const override;
// Retrieves a chunk of data.
ChunkPtr GetChunk(size_t chunkId) override;
// Retrieves total number of chunks this deserializer can produce.
size_t GetTotalNumberOfChunks() override;
protected:
void FillSequenceDescriptions(SequenceDescriptions& timeline) const override;
// Get information about chunks.
ChunkDescriptions GetChunkDescriptions() override;
// Get information about a particular chunk.
void GetSequencesForChunk(size_t chunkId, std::vector<SequenceDescription>& result) override;
private:
// A buffer to keep data for all samples in a (variable length) sequence
// from a single input stream.
struct InputStreamBuffer
{
virtual ~InputStreamBuffer() {};
virtual ~InputStreamBuffer() { };
size_t m_numberOfSamples = 0;
std::vector<ElemType> m_buffer;
@ -91,7 +91,7 @@ private:
size_t m_maxAliasLength;
std::map<std::string, size_t> m_aliasToIdMap;
IndexPtr m_index;
std::unique_ptr<Indexer> m_indexer;
int64_t m_fileOffsetStart;
int64_t m_fileOffsetEnd;
@ -109,9 +109,6 @@ private:
unsigned int m_traceLevel;
unsigned int m_numAllowedErrors;
bool m_skipSequenceIds;
// All streams this reader provides.
std::vector<StreamDescriptionPtr> m_streams;
// A map of currently loaded chunks
// TODO: remove caching once partial randomization is in master.
@ -167,7 +164,7 @@ private:
void SetChunkCacheSize(unsigned int size);
friend class CNTKTextFormatReaderTestRunner;
friend class CNTKTextFormatReaderTestRunner<ElemType>;
DISABLE_COPY_AND_MOVE(TextParser);
};

View file

@ -10,7 +10,9 @@
namespace Microsoft { namespace MSR { namespace CNTK {
std::pair<size_t, size_t> ConfigHelper::GetContextWindow()
using namespace std;
pair<size_t, size_t> ConfigHelper::GetContextWindow()
{
size_t left = 0, right = 0;
intargvector contextWindow = m_config(L"contextWindow", ConfigParameters::Array(intargvector(vector<int>{1})));
@ -39,12 +41,12 @@ std::pair<size_t, size_t> ConfigHelper::GetContextWindow()
InvalidArgument("contextWindow must have 1 or 2 values specified, found %d.", (int)contextWindow.size());
}
return std::make_pair(left, right);
return make_pair(left, right);
}
void ConfigHelper::CheckFeatureType()
{
std::wstring type = m_config(L"type", L"real");
wstring type = m_config(L"type", L"real");
if (_wcsicmp(type.c_str(), L"real"))
{
InvalidArgument("Feature type must be of type 'real'.");
@ -53,16 +55,16 @@ void ConfigHelper::CheckFeatureType()
void ConfigHelper::CheckLabelType()
{
std::wstring type;
wstring type;
if (m_config.Exists(L"labelType"))
{
// TODO: let's deprecate this eventually and just use "type"...
type = static_cast<const std::wstring&>(m_config(L"labelType"));
type = static_cast<const wstring&>(m_config(L"labelType"));
}
else
{
// outputs should default to category
type = static_cast<const std::wstring&>(m_config(L"type", L"category"));
type = static_cast<const wstring&>(m_config(L"type", L"category"));
}
if (_wcsicmp(type.c_str(), L"category"))
@ -75,10 +77,10 @@ void ConfigHelper::CheckLabelType()
// features - [in,out] a vector of feature name strings
// labels - [in,out] a vector of label name strings
void ConfigHelper::GetDataNamesFromConfig(
std::vector<std::wstring>& features,
std::vector<std::wstring>& labels,
std::vector<std::wstring>& hmms,
std::vector<std::wstring>& lattices)
vector<wstring>& features,
vector<wstring>& labels,
vector<wstring>& hmms,
vector<wstring>& lattices)
{
for (const auto& id : m_config.GetMemberIds())
{
@ -146,9 +148,9 @@ size_t ConfigHelper::GetLabelDimension()
InvalidArgument("Labels must specify dimension: 'dim/labelDim' property is missing.");
}
std::vector<std::wstring> ConfigHelper::GetMlfPaths()
vector<wstring> ConfigHelper::GetMlfPaths()
{
std::vector<std::wstring> result;
vector<wstring> result;
if (m_config.ExistsCurrent(L"mlfFile"))
{
result.push_back(m_config(L"mlfFile"));
@ -194,10 +196,10 @@ size_t ConfigHelper::GetRandomizationWindow()
return result;
}
std::wstring ConfigHelper::GetRandomizer()
wstring ConfigHelper::GetRandomizer()
{
// get the read method, defaults to "blockRandomize"
std::wstring randomizer(m_config(L"readMethod", L"blockRandomize"));
wstring randomizer(m_config(L"readMethod", L"blockRandomize"));
if (randomizer == L"blockRandomize" && GetRandomizationWindow() == randomizeNone)
{
@ -207,22 +209,24 @@ std::wstring ConfigHelper::GetRandomizer()
return randomizer;
}
std::vector<std::wstring> ConfigHelper::GetFeaturePaths()
vector<wstring> ConfigHelper::GetSequencePaths()
{
std::wstring scriptPath = m_config(L"scpFile");
std::wstring rootPath = m_config(L"prefixPathInSCP", L"");
wstring scriptPath = m_config(L"scpFile");
wstring rootPath = m_config(L"prefixPathInSCP", L"");
vector<wstring> filelist;
fprintf(stderr, "Reading script file %ls ...", scriptPath.c_str());
size_t n = 0;
for (msra::files::textreader reader(scriptPath); reader;)
// TODO: possibly change to class File, we should be able to read data from pipelines, e.g.
// scriptPath = "gzip -c -d FILE.txt |", or do a popen with C++ streams, so that we can have a generic open function that returns an ifstream.
ifstream scp(msra::strfun::utf8(scriptPath).c_str());
string line;
while (getline(scp, line))
{
filelist.push_back(reader.wgetline());
n++;
filelist.push_back(msra::strfun::utf16(line));
}
fprintf(stderr, " %d entries\n", static_cast<int>(n));
fprintf(stderr, " %d entries\n", static_cast<int>(filelist.size()));
// post-processing the file list:
// - if the user specified PrefixPath, add the prefix to each path in the filelist
@ -230,11 +234,11 @@ std::vector<std::wstring> ConfigHelper::GetFeaturePaths()
if (!rootPath.empty()) // user has specified a path prefix for this feature
{
// first make slashes consistent (sorry, Linux users: this is not necessary for you)
std::replace(rootPath.begin(), rootPath.end(), L'\\', L'/');
replace(rootPath.begin(), rootPath.end(), L'\\', L'/');
// second, remove trailing slash if there is any
std::wregex trailer(L"/+$");
rootPath = std::regex_replace(rootPath, trailer, wstring());
wregex trailer(L"/+$");
rootPath = regex_replace(rootPath, trailer, wstring());
// third, join the rootPath with each entry in filelist
if (!rootPath.empty())
@ -243,7 +247,7 @@ std::vector<std::wstring> ConfigHelper::GetFeaturePaths()
{
if (path.find_first_of(L'=') != wstring::npos)
{
std::vector<std::wstring> strarr = msra::strfun::split(path, L"=");
vector<wstring> strarr = msra::strfun::split(path, L"=");
#ifdef WIN32
replace(strarr[1].begin(), strarr[1].end(), L'\\', L'/');
#endif
@ -276,7 +280,7 @@ std::vector<std::wstring> ConfigHelper::GetFeaturePaths()
This works well if you store the scp file with the features but
do not want different scp files every time you move or create new features
*/
std::wstring scpDirCached;
wstring scpDirCached;
for (auto& entry : filelist)
{
ExpandDotDotDot(entry, scriptPath, scpDirCached);
@ -300,7 +304,7 @@ intargvector ConfigHelper::GetNumberOfUtterancesPerMinibatchForAllEppochs()
return numberOfUtterances;
}
void ConfigHelper::ExpandDotDotDot(std::wstring& featPath, const std::wstring& scpPath, std::wstring& scpDirCached)
void ConfigHelper::ExpandDotDotDot(wstring& featPath, const wstring& scpPath, wstring& scpDirCached)
{
wstring delim = L"/\\";

View file

@ -49,8 +49,8 @@ public:
// Gets mlf file paths from the configuration.
std::vector<std::wstring> GetMlfPaths();
// Gets feature file paths from the configuration.
std::vector<std::wstring> GetFeaturePaths();
// Gets utterance paths from the configuration.
std::vector<std::wstring> GetSequencePaths();
// Gets randomization window.
size_t GetRandomizationWindow();

View file

@ -5,22 +5,20 @@
#pragma once
#include <string>
#include <memory>
#include "StringToIdMap.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// Represents a full corpus.
// Defines which sequences should participate in the reading.
// TODO: Currently it is only a skeleton class.
// TODO: For HtkMlf it will be based on the set of sequences from the SCP file.
// TODO: For HtkMlf it can be based on the set of sequences from the SCP file.
// TODO: Extract an interface.
class CorpusDescriptor
{
public:
CorpusDescriptor(std::vector<std::wstring>&& sequences) : m_sequences(sequences)
{
}
CorpusDescriptor()
{}
// Checks if the specified sequence should be used for reading.
bool IsIncluded(const std::wstring& sequenceKey)
@ -29,8 +27,16 @@ public:
return true;
}
// Gets string registry
WStringToIdMap& GetStringRegistry()
{
return m_stringRegistry;
}
private:
std::vector<std::wstring> m_sequences;
DISABLE_COPY_AND_MOVE(CorpusDescriptor);
WStringToIdMap m_stringRegistry;
};
typedef std::shared_ptr<CorpusDescriptor> CorpusDescriptorPtr;

View file

@ -88,7 +88,7 @@
<ClInclude Include="..\..\Common\Include\ssematrix.h" />
<ClInclude Include="..\..\Common\Include\fileutil.h" />
<ClInclude Include="..\..\Common\Include\ExceptionWithCallStack.h" />
<ClInclude Include="ChunkDescription.h" />
<ClInclude Include="HTKChunkDescription.h" />
<ClInclude Include="ConfigHelper.h" />
<ClInclude Include="CorpusDescriptor.h" />
<ClInclude Include="HTKDataDeserializer.h" />

View file

@ -48,10 +48,10 @@
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="UtteranceDescription.h" />
<ClInclude Include="ChunkDescription.h" />
<ClInclude Include="..\..\Common\Include\ExceptionWithCallStack.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="HTKChunkDescription.h" />
</ItemGroup>
<ItemGroup>
<Filter Include="Common">

View file

@ -3,8 +3,11 @@
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "DataDeserializer.h"
#include "../HTKMLFReader/htkfeatio.h"
#include "UtteranceDescription.h"
#include "ssematrix.h"
namespace Microsoft { namespace MSR { namespace CNTK {
@ -12,10 +15,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Class represents a description of an HTK chunk.
// It is only used internally by the HTK deserializer.
// Can exist without associated data and provides methods for requiring/releasing chunk data.
class ChunkDescription
// TODO: We should consider splitting data loading from the description in future versions.
class HTKChunkDescription
{
// All utterances in the chunk.
std::vector<UtteranceDescription*> m_utteranceSet;
std::vector<UtteranceDescription> m_utterances;
// Stores all frames of the chunk consecutively (mutable since this is a cache).
mutable msra::dbn::matrix m_frames;
@ -28,18 +32,18 @@ class ChunkDescription
size_t m_totalFrames;
public:
ChunkDescription() : m_totalFrames(0)
HTKChunkDescription() : m_totalFrames(0)
{
}
// Gets number of utterances in the chunk.
size_t GetNumberOfUtterances() const
{
return m_utteranceSet.size();
return m_utterances.size();
}
// Adds an utterance to the chunk.
void Add(UtteranceDescription* utterance)
void Add(UtteranceDescription&& utterance)
{
if (IsInRam())
{
@ -47,8 +51,8 @@ public:
}
m_firstFrames.push_back(m_totalFrames);
m_totalFrames += utterance->GetNumberOfFrames();
m_utteranceSet.push_back(utterance);
m_totalFrames += utterance.GetNumberOfFrames();
m_utterances.push_back(std::move(utterance));
}
// Gets total number of frames in the chunk.
@ -57,13 +61,25 @@ public:
return m_totalFrames;
}
// Get number of frames in a sequences identified by the index.
size_t GetUtteranceNumberOfFrames(size_t index) const
// Get utterance description by its index.
const UtteranceDescription* GetUtterance(size_t index) const
{
return m_utteranceSet[index]->GetNumberOfFrames();
return &m_utterances[index];
}
// Returns frames of a given utterance.
// Get utterance by the absolute frame index in chunk.
// Uses the upper bound to do the binary search among sequences of the chunk.
size_t GetUtteranceForChunkFrameIndex(size_t frameIndex) const
{
auto result = std::upper_bound(
m_utterances.begin(),
m_utterances.end(),
frameIndex,
[](size_t fi, const UtteranceDescription& a) { return fi < a.GetStartFrameIndexInsideChunk(); });
return result - 1 - m_utterances.begin();
}
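GetUtteranceForChunkFrameIndex relies on utterances being laid out consecutively within the chunk: upper_bound over their start-frame offsets finds the first utterance that starts after the frame, and stepping back one yields the utterance containing it. A standalone sketch of the same lookup on plain offsets (illustrative only, not CNTK code):

#include <algorithm>
#include <cassert>
#include <vector>

// Map a chunk-global frame index to the utterance containing it, given each
// utterance's start-frame offset inside the chunk (offsets are sorted ascending).
static size_t UtteranceForFrame(const std::vector<size_t>& startFrames, size_t frameIndex)
{
    auto it = std::upper_bound(startFrames.begin(), startFrames.end(), frameIndex);
    return (it - startFrames.begin()) - 1; // step back to the utterance starting at or before frameIndex
}

int main()
{
    std::vector<size_t> starts = {0, 10, 15}; // utterances of 10, 5 and N frames
    assert(UtteranceForFrame(starts, 9) == 0);
    assert(UtteranceForFrame(starts, 10) == 1);
    assert(UtteranceForFrame(starts, 15) == 2);
    return 0;
}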
// Returns all frames of a given utterance.
msra::dbn::matrixstripe GetUtteranceFrames(size_t index) const
{
if (!IsInRam())
@ -72,7 +88,7 @@ public:
}
const size_t ts = m_firstFrames[index];
const size_t n = GetUtteranceNumberOfFrames(index);
const size_t n = GetUtterance(index)->GetNumberOfFrames();
return msra::dbn::matrixstripe(m_frames, ts, n);
}
@ -99,16 +115,16 @@ public:
// read all utterances; if they are in the same archive, htkfeatreader will be efficient in not closing the file
m_frames.resize(featureDimension, m_totalFrames);
foreach_index(i, m_utteranceSet)
foreach_index(i, m_utterances)
{
// read features for this file
auto framesWrapper = GetUtteranceFrames(i);
reader.read(m_utteranceSet[i]->GetPath(), featureKind, samplePeriod, framesWrapper);
reader.read(m_utterances[i].GetPath(), featureKind, samplePeriod, framesWrapper);
}
if (verbosity)
{
fprintf(stderr, "RequireData: %d utterances read\n", (int)m_utteranceSet.size());
fprintf(stderr, "RequireData: %d utterances read\n", (int)m_utterances.size());
}
}
catch (...)

View file

@ -20,14 +20,20 @@ std::vector<std::wstring> htkfeatreader::parsedpath::archivePathStringVector;
namespace Microsoft { namespace MSR { namespace CNTK {
using namespace std;
HTKDataDeserializer::HTKDataDeserializer(
CorpusDescriptorPtr corpus,
const ConfigParameters& feature,
const std::wstring& featureName)
const wstring& featureName)
: m_ioFeatureDimension(0),
m_samplePeriod(0),
m_verbosity(0)
m_verbosity(0),
m_corpus(corpus),
m_totalNumberOfFrames(0)
{
// Currently we only support frame mode.
// TODO: Support of full sequences.
bool frameMode = feature.Find("frameMode", "true");
if (!frameMode)
{
@ -35,149 +41,165 @@ HTKDataDeserializer::HTKDataDeserializer(
}
ConfigHelper config(feature);
config.CheckFeatureType();
std::vector<std::wstring> featureFiles = config.GetFeaturePaths();
auto context = config.GetContextWindow();
m_elementType = config.GetElementType();
m_dimension = config.GetFeatureDimension();
m_dimension = m_dimension * (1 + context.first + context.second);
size_t numSequences = featureFiles.size();
m_augmentationWindow = config.GetContextWindow();
m_utterances.reserve(numSequences);
size_t totalFrames = 0;
foreach_index (i, featureFiles)
{
UtteranceDescription description(std::move(msra::asr::htkfeatreader::parsedpath(featureFiles[i])));
size_t numberOfFrames = description.GetNumberOfFrames();
description.m_id = i;
InitializeChunkDescriptions(config);
InitializeStreams(featureName);
InitializeFeatureInformation();
}
// we need at least 2 frames for boundary markers to work
// Initializes chunks based on the configuration and utterance descriptions.
void HTKDataDeserializer::InitializeChunkDescriptions(ConfigHelper& config)
{
// Read utterance descriptions.
vector<wstring> paths = config.GetSequencePaths();
vector<UtteranceDescription> utterances;
utterances.reserve(paths.size());
auto& stringRegistry = m_corpus->GetStringRegistry();
for (const auto& u : paths)
{
UtteranceDescription description(move(msra::asr::htkfeatreader::parsedpath(u)));
size_t numberOfFrames = description.GetNumberOfFrames();
// TODO: we need at least 2 frames for boundary markers to work
// TODO: this should be removed when MLF deserializer is rewritten.
if (numberOfFrames < 2)
{
fprintf(stderr, "HTKDataDeserializer::HTKDataDeserializer: skipping utterance with %d frames because it has less than 2 frames: %ls\n",
(int)numberOfFrames, description.GetKey().c_str());
description.m_isValid = false;
description.m_numberOfSamples = 0;
}
else
{
description.m_isValid = true;
description.m_numberOfSamples = numberOfFrames;
continue;
}
m_utterances.push_back(description);
totalFrames += description.m_numberOfSamples;
size_t id = stringRegistry.AddValue(description.GetKey());
description.SetId(id);
utterances.push_back(description);
m_totalNumberOfFrames += numberOfFrames;
}
size_t totalSize = std::accumulate(
m_utterances.begin(),
m_utterances.end(),
static_cast<size_t>(0),
[](size_t sum, const UtteranceDescription& s)
{
return s.m_numberOfSamples + sum;
});
const size_t MaxUtterancesPerChunk = 65535;
// distribute them over chunks
// We simply count off frames until we reach the chunk size.
// Note that we first randomize the chunks, i.e. when used, chunks are non-consecutive and thus cause the disk head to seek for each chunk.
const size_t framespersec = 100; // we just assume this; our efficiency calculation is based on this
const size_t chunkframes = 15 * 60 * framespersec; // number of frames to target for each chunk
// We have 100 frames per second.
const size_t FramesPerSec = 100;
// A chunk constitutes 15 minutes
const size_t ChunkFrames = 15 * 60 * FramesPerSec; // number of frames to target for each chunk
// Loading an initial 24-hour range will involve 96 disk seeks, acceptable.
// When paging chunk by chunk, chunk size ~14 MB.
m_chunks.resize(0);
m_chunks.reserve(totalSize / chunkframes);
m_chunks.reserve(m_totalNumberOfFrames / ChunkFrames);
int chunkId = -1;
foreach_index(i, m_utterances)
size_t startFrameInsideChunk = 0;
foreach_index(i, utterances)
{
// if exceeding current entry--create a new one
// I.e. our chunks are a little larger than wanted (on av. half the av. utterance length).
if (m_chunks.empty() || m_chunks.back().GetTotalFrames() > chunkframes || m_chunks.back().GetNumberOfUtterances() >= MaxUtterancesPerChunk)
if (m_chunks.empty() || m_chunks.back().GetTotalFrames() > ChunkFrames || m_chunks.back().GetNumberOfUtterances() >= MaxUtterancesPerChunk)
{
m_chunks.push_back(ChunkDescription());
m_chunks.push_back(HTKChunkDescription());
chunkId++;
startFrameInsideChunk = 0;
}
// append utterance to last chunk
ChunkDescription& currentchunk = m_chunks.back();
m_utterances[i].SetIndexInsideChunk(currentchunk.GetNumberOfUtterances());
currentchunk.Add(&m_utterances[i]); // move it out from our temp array into the chunk
m_utterances[i].m_chunkId = chunkId;
HTKChunkDescription& currentChunk = m_chunks.back();
utterances[i].AssignToChunk(chunkId, currentChunk.GetNumberOfUtterances(), startFrameInsideChunk);
startFrameInsideChunk += utterances[i].GetNumberOfFrames();
currentChunk.Add(move(utterances[i]));
}
// Creating a table of weak pointers to chunks,
// so that if the randomizer asks for the same chunk twice
// we do not need to recreate the chunk if it is already loaded in memory.
m_weakChunks.resize(m_chunks.size());
fprintf(stderr,
"HTKDataDeserializer::HTKDataDeserializer: %d utterances grouped into %d chunks, av. chunk size: %.1f utterances, %.1f frames\n",
(int)m_utterances.size(),
(int)utterances.size(),
(int)m_chunks.size(),
m_utterances.size() / (double)m_chunks.size(),
totalSize / (double)m_chunks.size());
utterances.size() / (double)m_chunks.size(),
m_totalNumberOfFrames / (double)m_chunks.size());
}
// TODO: Currently we have global sequence id.
// After changing the timeline interface they must never referred to by a sequential id, only by chunk/within-chunk index
// because they are asked on the chunk anyway.
m_frames.reserve(totalFrames);
foreach_index(i, m_utterances)
{
if (!m_utterances[i].m_isValid)
{
continue;
}
std::wstring key = m_utterances[i].GetKey();
for (size_t k = 0; k < m_utterances[i].m_numberOfSamples; ++k)
{
Frame f(&m_utterances[i]);
f.m_key.major = key;
f.m_key.minor = k;
f.m_id = m_frames.size();
f.m_chunkId = m_utterances[i].m_chunkId;
f.m_numberOfSamples = 1;
f.m_frameIndex = k;
f.m_isValid = true;
m_frames.push_back(f);
m_sequences.push_back(&m_frames[f.m_id]);
}
}
m_weakChunks.resize(m_chunks.size());
StreamDescriptionPtr stream = std::make_shared<StreamDescription>();
// Describes exposed stream - a single stream of htk features.
void HTKDataDeserializer::InitializeStreams(const wstring& featureName)
{
StreamDescriptionPtr stream = make_shared<StreamDescription>();
stream->m_id = 0;
stream->m_name = featureName;
stream->m_sampleLayout = std::make_shared<TensorShape>(m_dimension);
stream->m_sampleLayout = make_shared<TensorShape>(m_dimension);
stream->m_elementType = m_elementType;
stream->m_storageType = StorageType::dense;
m_streams.push_back(stream);
}
// Reading information about the features from the first file.
// This information is used later to check that all features among all files have the same properties.
void HTKDataDeserializer::InitializeFeatureInformation()
{
msra::util::attempt(5, [&]()
{
msra::asr::htkfeatreader reader;
reader.getinfo(m_utterances[0].GetPath(), m_featureKind, m_ioFeatureDimension, m_samplePeriod);
reader.getinfo(m_chunks.front().GetUtterance(0)->GetPath(), m_featureKind, m_ioFeatureDimension, m_samplePeriod);
fprintf(stderr, "HTKDataDeserializer::HTKDataDeserializer: determined feature kind as %d-dimensional '%s' with frame shift %.1f ms\n",
(int)m_dimension, m_featureKind.c_str(), m_samplePeriod / 1e4);
});
}
const SequenceDescriptions& HTKDataDeserializer::GetSequenceDescriptions() const
// Gets information about available chunks.
ChunkDescriptions HTKDataDeserializer::GetChunkDescriptions()
{
return m_sequences;
ChunkDescriptions chunks;
chunks.reserve(m_chunks.size());
for (size_t i = 0; i < m_chunks.size(); ++i)
{
auto cd = make_shared<ChunkDescription>();
cd->m_id = i;
cd->m_numberOfSamples = m_chunks[i].GetTotalFrames();
cd->m_numberOfSequences = m_chunks[i].GetTotalFrames();
chunks.push_back(cd);
}
return chunks;
}
std::vector<StreamDescriptionPtr> HTKDataDeserializer::GetStreamDescriptions() const
// Gets sequences for a particular chunk.
// This information is used by the randomizer to fill in current windows of sequences.
void HTKDataDeserializer::GetSequencesForChunk(size_t chunkId, vector<SequenceDescription>& result)
{
return m_streams;
const HTKChunkDescription& chunk = m_chunks[chunkId];
result.reserve(chunk.GetTotalFrames());
size_t offsetInChunk = 0;
for (size_t i = 0; i < chunk.GetNumberOfUtterances(); ++i)
{
auto utterance = chunk.GetUtterance(i);
size_t major = utterance->GetId();
// Because this is frame mode, create a sequence description for each frame.
for (size_t k = 0; k < utterance->GetNumberOfFrames(); ++k)
{
SequenceDescription f;
f.m_chunkId = chunkId;
f.m_key.m_major = major;
f.m_key.m_minor = k;
f.m_id = offsetInChunk++;
f.m_isValid = true;
f.m_numberOfSamples = 1;
result.push_back(f);
}
}
}
// A wrapper around a matrix that views it as a vector of column vectors.
@ -210,8 +232,6 @@ private:
// It is up to the randomizer to decide when to release a particular chunk.
class HTKDataDeserializer::HTKChunk : public Chunk
{
HTKDataDeserializer* m_parent;
size_t m_chunkId;
public:
HTKChunk(HTKDataDeserializer* parent, size_t chunkId) : m_parent(parent), m_chunkId(chunkId)
{
@ -225,18 +245,26 @@ public:
});
}
virtual std::vector<SequenceDataPtr> GetSequence(size_t sequenceId) override
// Gets data for the sequence.
virtual void GetSequence(size_t sequenceId, vector<SequenceDataPtr>& result) override
{
return m_parent->GetSequenceById(sequenceId);
m_parent->GetSequenceById(m_chunkId, sequenceId, result);
}
// Unloads the data from memory.
~HTKChunk()
{
auto& chunkDescription = m_parent->m_chunks[m_chunkId];
chunkDescription.ReleaseData();
}
private:
DISABLE_COPY_AND_MOVE(HTKChunk);
HTKDataDeserializer* m_parent;
size_t m_chunkId;
};
// Gets a data chunk with the specified chunk id.
ChunkPtr HTKDataDeserializer::GetChunk(size_t chunkId)
{
if (!m_weakChunks[chunkId].expired())
@ -244,11 +272,14 @@ ChunkPtr HTKDataDeserializer::GetChunk(size_t chunkId)
return m_weakChunks[chunkId].lock();
}
auto chunk = std::make_shared<HTKChunk>(this, chunkId);
auto chunk = make_shared<HTKChunk>(this, chunkId);
m_weakChunks[chunkId] = chunk;
return chunk;
};
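// The weak-pointer lookup above lets the deserializer hand out the same chunk object while any consumer still
// holds it, without keeping the data alive itself. Below is a minimal, self-contained sketch of that caching
// pattern; ChunkData and ChunkCache are hypothetical stand-ins for illustration, not the CNTK reader classes.
#include <cstddef>
#include <map>
#include <memory>
#include <vector>

struct ChunkData
{
    std::vector<float> m_frames;
};

class ChunkCache
{
    // One weak slot per chunk id; an expired entry triggers a reload.
    std::map<std::size_t, std::weak_ptr<ChunkData>> m_weakChunks;

    // Stands in for the expensive disk read done by the real deserializer.
    std::shared_ptr<ChunkData> Load(std::size_t chunkId)
    {
        auto chunk = std::make_shared<ChunkData>();
        chunk->m_frames.assign(1024, static_cast<float>(chunkId));
        return chunk;
    }

public:
    std::shared_ptr<ChunkData> GetChunk(std::size_t chunkId)
    {
        auto& slot = m_weakChunks[chunkId];
        if (!slot.expired())
            return slot.lock();      // still held by a consumer, reuse it
        auto chunk = Load(chunkId);  // otherwise reload and remember a weak reference
        slot = chunk;
        return chunk;
    }
};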
// This class stores sequence data for HTK,
// - for floats: a simple pointer to the chunk data
// - for doubles: allocated array of doubles which is freed when the sequence is no longer used.
struct HTKSequenceData : DenseSequenceData
{
msra::dbn::matrix m_buffer;
@ -256,6 +287,8 @@ struct HTKSequenceData : DenseSequenceData
~HTKSequenceData()
{
msra::dbn::matrixstripe frame(m_buffer, 0, m_buffer.cols());
// Check whether m_data is just a pointer into m_buffer (the float case); if not, it owns an allocated double array that must be freed.
if (m_data != &frame(0, 0))
{
delete[] reinterpret_cast<double*>(m_data);
@ -264,15 +297,16 @@ struct HTKSequenceData : DenseSequenceData
}
};
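// For clarity, the ownership rule described above (float: alias into the chunk buffer; double: owned copy freed on
// destruction) can be reduced to the following self-contained sketch; SampleData is a hypothetical type that only
// mirrors the pattern, it is not the reader implementation.
#include <cstddef>
#include <vector>

struct SampleData
{
    std::vector<float> m_buffer;  // the underlying float frame
    void* m_data = nullptr;       // either aliases m_buffer or owns a double[] copy

    void ExposeAsFloat()
    {
        m_data = m_buffer.data();
    }

    void ExposeAsDouble()
    {
        double* copy = new double[m_buffer.size()];
        for (std::size_t i = 0; i < m_buffer.size(); ++i)
            copy[i] = m_buffer[i];
        m_data = copy;
    }

    ~SampleData()
    {
        // Free only when m_data does not point into the float buffer.
        if (m_data != static_cast<void*>(m_buffer.data()))
            delete[] reinterpret_cast<double*>(m_data);
    }
};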
typedef std::shared_ptr<HTKSequenceData> HTKSequenceDataPtr;
typedef shared_ptr<HTKSequenceData> HTKSequenceDataPtr;
std::vector<SequenceDataPtr> HTKDataDeserializer::GetSequenceById(size_t id)
// Gets a sequence by its chunk id and its id inside the chunk.
void HTKDataDeserializer::GetSequenceById(size_t chunkId, size_t id, vector<SequenceDataPtr>& r)
{
const auto& frame = m_frames[id];
UtteranceDescription* utterance = frame.m_utterence;
const auto& chunkDescription = m_chunks[utterance->m_chunkId];
auto utteranceFrames = chunkDescription.GetUtteranceFrames(utterance->GetIndexInsideChunk());
const auto& chunkDescription = m_chunks[chunkId];
size_t utteranceIndex = chunkDescription.GetUtteranceForChunkFrameIndex(id);
const UtteranceDescription* utterance = chunkDescription.GetUtterance(utteranceIndex);
auto utteranceFrames = chunkDescription.GetUtteranceFrames(utteranceIndex);
size_t frameIndex = id - utterance->GetStartFrameIndexInsideChunk();
// wrapper that allows m[j].size() and m[j][i] as required by augmentneighbors()
MatrixAsVectorOfVectors utteranceFramesWrapper(utteranceFrames);
@ -286,15 +320,13 @@ std::vector<SequenceDataPtr> HTKDataDeserializer::GetSequenceById(size_t id)
leftExtent = rightExtent = msra::dbn::augmentationextent(utteranceFramesWrapper[0].size(), m_dimension);
}
HTKSequenceDataPtr result = std::make_shared<HTKSequenceData>();
HTKSequenceDataPtr result = make_shared<HTKSequenceData>();
result->m_buffer.resize(m_dimension, 1);
const std::vector<char> noBoundaryFlags; // dummy
msra::dbn::augmentneighbors(utteranceFramesWrapper, noBoundaryFlags, frame.m_frameIndex, leftExtent, rightExtent, result->m_buffer, 0);
const vector<char> noBoundaryFlags; // TODO: dummy, currently no boundary flags are supported.
msra::dbn::augmentneighbors(utteranceFramesWrapper, noBoundaryFlags, frameIndex, leftExtent, rightExtent, result->m_buffer, 0);
result->m_numberOfSamples = frame.m_numberOfSamples;
result->m_numberOfSamples = 1;
msra::dbn::matrixstripe stripe(result->m_buffer, 0, result->m_buffer.cols());
const size_t dimensions = stripe.rows();
if (m_elementType == ElementType::tfloat)
{
result->m_data = &stripe(0, 0);
@ -302,6 +334,7 @@ std::vector<SequenceDataPtr> HTKDataDeserializer::GetSequenceById(size_t id)
else
{
assert(m_elementType == ElementType::tdouble);
const size_t dimensions = stripe.rows();
double *doubleBuffer = new double[dimensions];
const float *floatBuffer = &stripe(0, 0);
@ -313,17 +346,7 @@ std::vector<SequenceDataPtr> HTKDataDeserializer::GetSequenceById(size_t id)
result->m_data = doubleBuffer;
}
return std::vector<SequenceDataPtr>(1, result);
r.push_back(result);
}
const SequenceDescription* HTKDataDeserializer::GetSequenceDescriptionByKey(const KeyType&)
{
LogicError("HTKDataDeserializer::GetSequenceDescriptionByKey: currently not implemented. Supported only as a primary deserializer.");
}
size_t HTKDataDeserializer::GetTotalNumberOfChunks()
{
return m_chunks.size();
}
} } }
}}}

View File

@ -5,84 +5,67 @@
#pragma once
#include "DataDeserializer.h"
#include "DataDeserializerBase.h"
#include "Config.h"
#include "CorpusDescriptor.h"
#include "UtteranceDescription.h"
#include "ChunkDescription.h"
#include "HTKChunkDescription.h"
#include "ConfigHelper.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// Class represents an HTK deserializer.
// Provides a set of chunks/sequences to the upper layers.
class HTKDataDeserializer : public IDataDeserializer
class HTKDataDeserializer : public DataDeserializerBase
{
public:
HTKDataDeserializer(CorpusDescriptorPtr corpus, const ConfigParameters& featureConfig, const std::wstring& featureName);
// Describes streams this data deserializer can produce. Streams correspond to network inputs.
// Produces a single stream of HTK features.
virtual std::vector<StreamDescriptionPtr> GetStreamDescriptions() const override;
// Get information about chunks.
virtual ChunkDescriptions GetChunkDescriptions() override;
// Retrieves description of all sequences this data deserializer can produce, together with associated chunks.
// TODO: For a huge corpus, the memory footprint is too big. We will adapt this interface to request the timeline in chunks.
virtual const SequenceDescriptions& GetSequenceDescriptions() const override;
// Gets sequence descriptions for a particular chunk.
virtual void GetSequencesForChunk(size_t chunkId, std::vector<SequenceDescription>& result) override;
// Retrieves sequence description by its key. Used for deserializers that are not in "primary"/"driving" mode.
virtual const SequenceDescription* GetSequenceDescriptionByKey(const KeyType& key) override;
// Retrieves total number of chunks this deserializer can produce.
virtual size_t GetTotalNumberOfChunks() override;
// Retrieves a chunk with data.
// Retrieves data for a chunk.
virtual ChunkPtr GetChunk(size_t chunkId) override;
private:
class HTKChunk;
DISABLE_COPY_AND_MOVE(HTKDataDeserializer);
// Represents a frame.
// TODO: Change the structure to decrease the memory footprint.
// TODO: SequenceDescription should become an interface and be requested only for current chunks.
struct Frame : SequenceDescription
{
Frame(UtteranceDescription* u) : m_utterence(u), m_frameIndex(0)
{
}
// Initialization functions.
void InitializeChunkDescriptions(ConfigHelper& config);
void InitializeStreams(const std::wstring& featureName);
void InitializeFeatureInformation();
UtteranceDescription* m_utterence;
size_t m_frameIndex;
};
class HTKChunk;
std::vector<SequenceDataPtr> GetSequenceById(size_t id);
// Gets sequence by its chunk id and id inside the chunk.
void GetSequenceById(size_t chunkId, size_t id, std::vector<SequenceDataPtr>&);
// Dimension of features.
size_t m_dimension;
// All utterance descriptions.
std::vector<UtteranceDescription> m_utterances;
// All frame descriptions.
// TODO: This will be changed when the timeline is asked in chunks.
std::vector<Frame> m_frames;
SequenceDescriptions m_sequences;
// Type of the features.
ElementType m_elementType;
// Chunk descriptions.
std::vector<ChunkDescription> m_chunks;
std::vector<HTKChunkDescription> m_chunks;
// Weak pointers on existing chunks.
// If the randomizer asks for the same chunk twice, we do not need to recreate
// the chunk if it is already loaded in memory.
std::vector<std::weak_ptr<Chunk>> m_weakChunks;
// Augmentation window.
std::pair<size_t, size_t> m_augmentationWindow;
// Streams exposed by this deserializer.
std::vector<StreamDescriptionPtr> m_streams;
CorpusDescriptorPtr m_corpus;
int m_verbosity;
// Total number of frames.
size_t m_totalNumberOfFrames;
// Auxiliary data for checking against the data in the feature file.
unsigned int m_samplePeriod;
size_t m_ioFeatureDimension;

View File

@ -27,8 +27,7 @@ std::vector<IDataDeserializerPtr> CreateDeserializers(const ConfigParameters& re
InvalidArgument("Network needs at least 1 feature and 1 label specified.");
}
std::vector<std::wstring> sequences = ConfigHelper(readerConfig(featureNames.front())).GetFeaturePaths();
CorpusDescriptorPtr corpus = std::make_shared<CorpusDescriptor>(std::move(sequences));
CorpusDescriptorPtr corpus = std::make_shared<CorpusDescriptor>();
std::vector<IDataDeserializerPtr> featureDeserializers;
std::vector<IDataDeserializerPtr> labelDeserializers;
@ -69,7 +68,7 @@ HTKMLFReader::HTKMLFReader(MemoryProviderPtr provider,
auto deserializers = CreateDeserializers(readerConfig);
assert(deserializers.size() == 2);
auto bundler = std::make_shared<Bundler>(readerConfig, deserializers[0], deserializers);
auto bundler = std::make_shared<Bundler>(readerConfig, deserializers[0], deserializers, false);
std::wstring readMethod = config.GetRandomizer();
if (!AreEqualIgnoreCase(readMethod, std::wstring(L"blockRandomize")))
@ -78,7 +77,7 @@ HTKMLFReader::HTKMLFReader(MemoryProviderPtr provider,
}
int verbosity = readerConfig(L"verbosity", 2);
m_randomizer = std::make_shared<BlockRandomizer>(verbosity, window, bundler, BlockRandomizer::DistributionMode::chunk_modulus, true /* useLegacyRandomization */);
m_randomizer = std::make_shared<BlockRandomizer>(verbosity, window, bundler, BlockRandomizer::DecimationMode::chunk, true /* useLegacyRandomization */);
m_randomizer->Initialize(nullptr, readerConfig);
// Create output stream descriptions (all dense)

View File

@ -25,9 +25,9 @@ public:
MLFChunk(MLFDataDeserializer* parent) : m_parent(parent)
{}
virtual std::vector<SequenceDataPtr> GetSequence(size_t sequenceId) override
virtual void GetSequence(size_t sequenceId, std::vector<SequenceDataPtr>& result) override
{
return m_parent->GetSequenceById(sequenceId);
m_parent->GetSequenceById(sequenceId, result);
}
};
@ -76,9 +76,16 @@ MLFDataDeserializer::MLFDataDeserializer(CorpusDescriptorPtr corpus, const Confi
description.m_isValid = true;
size_t totalFrames = 0;
auto& stringRegistry = corpus->GetStringRegistry();
for (const auto& l : labels)
{
description.m_key.major = l.first;
// Currently the string registry contains only utterances described in scp.
// So here we skip all others.
if (!stringRegistry.Contains(l.first))
continue;
description.m_key.m_major = stringRegistry[l.first];
const auto& utterance = l.second;
description.m_sequenceStart = m_classIds.size();
description.m_isValid = true;
@ -113,32 +120,29 @@ MLFDataDeserializer::MLFDataDeserializer(CorpusDescriptorPtr corpus, const Confi
description.m_numberOfSamples = numberOfFrames;
totalFrames += numberOfFrames;
m_utteranceIndex.push_back(m_frames.size());
m_keyToSequence[description.m_key.major] = m_utteranceIndex.size() - 1;
m_keyToSequence[description.m_key.m_major] = m_utteranceIndex.size() - 1;
// TODO: Should be created by chunks only.
MLFFrame f;
f.m_chunkId = 0;
f.m_numberOfSamples = 1;
f.m_key.major = description.m_key.major;
f.m_key.m_major = description.m_key.m_major;
f.m_isValid = description.m_isValid;
for (size_t k = 0; k < description.m_numberOfSamples; ++k)
{
f.m_id = m_frames.size();
f.m_key.minor = k;
f.m_key.m_minor = k;
f.m_index = description.m_sequenceStart + k;
m_frames.push_back(f);
m_sequences.push_back(&m_frames[f.m_id]);
}
}
m_sequences.reserve(m_frames.size());
for (int i = 0; i < m_frames.size(); ++i)
{
m_sequences.push_back(&m_frames[i]);
}
m_totalNumberOfFrames = totalFrames;
fprintf(stderr, "MLFDataDeserializer::MLFDataDeserializer: read %d sequences\n", (int)m_sequences.size());
fprintf(stderr, "MLFDataDeserializer::MLFDataDeserializer: read %d sequences\n", (int)m_frames.size());
fprintf(stderr, "MLFDataDeserializer::MLFDataDeserializer: read %d utterances\n", (int)m_keyToSequence.size());
// Initializing stream description - a single stream of MLF data.
StreamDescriptionPtr stream = std::make_shared<StreamDescription>();
stream->m_id = 0;
stream->m_name = name;
@ -146,22 +150,53 @@ MLFDataDeserializer::MLFDataDeserializer(CorpusDescriptorPtr corpus, const Confi
stream->m_storageType = StorageType::sparse_csc;
stream->m_elementType = m_elementType;
m_streams.push_back(stream);
// Initializing array of labels.
m_categories.reserve(dimension);
for (size_t i = 0; i < dimension; ++i)
{
SparseSequenceDataPtr category = std::make_shared<SparseSequenceData>();
category->m_indices.resize(1);
category->m_indices[0] = std::vector<size_t>{ m_categories.size() };
if (m_elementType == ElementType::tfloat)
{
category->m_data = &s_oneFloat;
}
else
{
assert(m_elementType == ElementType::tdouble);
category->m_data = &s_oneDouble;
}
m_categories.push_back(category);
}
}
const SequenceDescriptions& MLFDataDeserializer::GetSequenceDescriptions() const
// Currently MLF has a single chunk.
// TODO: This will be changed when the deserializer properly supports chunking.
ChunkDescriptions MLFDataDeserializer::GetChunkDescriptions()
{
return m_sequences;
auto cd = std::make_shared<ChunkDescription>();
cd->m_id = 0;
cd->m_numberOfSequences = m_frames.size();
cd->m_numberOfSamples = m_frames.size();
return ChunkDescriptions{cd};
}
std::vector<StreamDescriptionPtr> MLFDataDeserializer::GetStreamDescriptions() const
// Gets sequences for a particular chunk.
void MLFDataDeserializer::GetSequencesForChunk(size_t, std::vector<SequenceDescription>& result)
{
return m_streams;
}
size_t MLFDataDeserializer::GetTotalNumberOfChunks()
{
// Currently all mlf data is in memory.
return 1;
result.reserve(m_frames.size());
for (size_t i = 0; i < m_frames.size(); ++i)
{
SequenceDescription f;
f.m_key.m_major = m_frames[i].m_key.m_major;
f.m_key.m_minor = m_frames[i].m_key.m_minor;
f.m_id = m_frames[i].m_id;
f.m_chunkId = m_frames[i].m_chunkId;
f.m_numberOfSamples = 1;
f.m_isValid = true;
result.push_back(f);
}
}
ChunkPtr MLFDataDeserializer::GetChunk(size_t chunkId)
@ -171,38 +206,26 @@ ChunkPtr MLFDataDeserializer::GetChunk(size_t chunkId)
return std::make_shared<MLFChunk>(this);
}
std::vector<SequenceDataPtr> MLFDataDeserializer::GetSequenceById(size_t sequenceId)
void MLFDataDeserializer::GetSequenceById(size_t sequenceId, std::vector<SequenceDataPtr>& result)
{
size_t label = m_classIds[m_frames[sequenceId].m_index];
SparseSequenceDataPtr r = std::make_shared<SparseSequenceData>();
r->m_indices.resize(1);
r->m_indices[0] = std::vector<size_t>{ label };
if (m_elementType == ElementType::tfloat)
{
r->m_data = &s_oneFloat;
}
else
{
assert(m_elementType == ElementType::tdouble);
r->m_data = &s_oneDouble;
}
return std::vector<SequenceDataPtr> { r };
assert(label < m_categories.size());
result.push_back(m_categories[label]);
}
static SequenceDescription s_InvalidSequence { 0, 0, 0, false };
const SequenceDescription* MLFDataDeserializer::GetSequenceDescriptionByKey(const KeyType& key)
void MLFDataDeserializer::GetSequenceDescriptionByKey(const KeyType& key, SequenceDescription& result)
{
auto sequenceId = m_keyToSequence.find(key.major);
auto sequenceId = m_keyToSequence.find(key.m_major);
if (sequenceId == m_keyToSequence.end())
{
return &s_InvalidSequence;
result = s_InvalidSequence;
return;
}
size_t index = m_utteranceIndex[sequenceId->second] + key.minor;
return m_sequences[index];
size_t index = m_utteranceIndex[sequenceId->second] + key.m_minor;
result = m_frames[index];
}
}}}

View File

@ -14,24 +14,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Class represents an MLF deserializer.
// Provides a set of chunks/sequences to the upper layers.
class MLFDataDeserializer : public IDataDeserializer
class MLFDataDeserializer : public DataDeserializerBase
{
public:
MLFDataDeserializer(CorpusDescriptorPtr corpus, const ConfigParameters& config, const std::wstring& streamName);
// Describes streams this data deserializer can produce. Streams correspond to network inputs.
// Produces a single stream of MLF labels.
virtual std::vector<StreamDescriptionPtr> GetStreamDescriptions() const override;
// Retrieves description of all sequences this data deserializer can produce, together with associated chunks.
// TODO: For a huge corpus, the memory footprint is too big. We will adapt this interface to request the timeline in chunks.
virtual const SequenceDescriptions& GetSequenceDescriptions() const override;
// Retrieves sequence description by its key. Used for deserializers that are not in "primary"/"driving" mode.
const SequenceDescription* GetSequenceDescriptionByKey(const KeyType& key) override;
void GetSequenceDescriptionByKey(const KeyType& key, SequenceDescription& s) override;
// Retrieves total number of chunks this deserializer can produce.
virtual size_t GetTotalNumberOfChunks() override;
// Gets description of all chunks.
virtual ChunkDescriptions GetChunkDescriptions() override;
// Get sequence descriptions of a particular chunk.
virtual void GetSequencesForChunk(size_t chunkId, std::vector<SequenceDescription>& s) override;
// Retrieves a chunk with data.
// TODO: Currently it is a single chunk => all labels are loaded into memory.
@ -39,6 +34,7 @@ public:
virtual ChunkPtr GetChunk(size_t) override;
private:
class MLFChunk;
DISABLE_COPY_AND_MOVE(MLFDataDeserializer);
// Inner class for a frame.
@ -48,28 +44,30 @@ private:
size_t m_index;
};
class MLFChunk;
std::vector<SequenceDataPtr> GetSequenceById(size_t sequenceId);
void GetSequenceById(size_t sequenceId, std::vector<SequenceDataPtr>& result);
// Key to sequence map.
std::map<wstring, size_t> m_keyToSequence;
std::map<size_t, size_t> m_keyToSequence;
// Array of all labels.
msra::dbn::biggrowablevector<msra::dbn::CLASSIDTYPE> m_classIds;
// Index of utterances in the m_classIds.
msra::dbn::biggrowablevector<size_t> m_utteranceIndex;
// TODO: All sequences (currently frames) this deserializer provides.
// This interface has to change when the randomizer asks for the timeline in chunks.
msra::dbn::biggrowablevector<MLFFrame> m_frames;
SequenceDescriptions m_sequences;
// Type of the data this deserializer provides.
ElementType m_elementType;
// Streams, this deserializer provides. A single mlf stream.
std::vector<StreamDescriptionPtr> m_streams;
// Total number of frames.
size_t m_totalNumberOfFrames;
// Array of available categories.
// We do not allocate data for every input sequence; we only return a pointer to an existing category.
std::vector<SparseSequenceDataPtr> m_categories;
};
}}}

View File

@ -3,6 +3,8 @@
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "DataDeserializer.h"
#include "../HTKMLFReader/htkfeatio.h"
@ -10,17 +12,23 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// This class represents a descriptor for a single utterance.
// It is only used internally by the HTK deserializer.
class UtteranceDescription : public SequenceDescription
class UtteranceDescription
{
// Archive filename and frame range in that file.
msra::asr::htkfeatreader::parsedpath m_path;
// Index of the utterance inside the chunk.
size_t m_indexInsideChunk;
// Position of the first sample of the utterance inside the chunk.
size_t m_startFrameIndexInsideChunk;
// Chunk id.
size_t m_chunkId;
// Utterance id.
size_t m_id;
public:
UtteranceDescription(msra::asr::htkfeatreader::parsedpath&& path)
: m_path(std::move(path)), m_indexInsideChunk(0)
: m_path(std::move(path)), m_indexInsideChunk(0), m_startFrameIndexInsideChunk(0), m_chunkId(SIZE_MAX)
{
}
@ -40,14 +48,23 @@ public:
return filename.substr(0, filename.find_last_of(L"."));
}
size_t GetIndexInsideChunk() const
void AssignToChunk(size_t chunkId, size_t indexInsideChunk, size_t frameInsideChunk)
{
return m_indexInsideChunk;
m_chunkId = chunkId;
m_indexInsideChunk = indexInsideChunk;
m_startFrameIndexInsideChunk = frameInsideChunk;
}
void SetIndexInsideChunk(size_t indexInsideChunk)
size_t GetId() const { return m_id; }
void SetId(size_t id) { m_id = id; }
size_t GetChunkId() const { return m_chunkId; }
size_t GetIndexInsideChunk() const { return m_indexInsideChunk;}
size_t GetStartFrameIndexInsideChunk() const { return m_startFrameIndexInsideChunk; }
void SetStartFrameInsideChunk(size_t startFrameIndexInsideChunk)
{
m_indexInsideChunk = indexInsideChunk;
m_startFrameIndexInsideChunk = startFrameIndexInsideChunk;
}
};

View File

@ -59,7 +59,7 @@ public:
{
}
virtual std::vector<SequenceDataPtr> GetSequence(size_t sequenceId) override
virtual void GetSequence(size_t sequenceId, std::vector<SequenceDataPtr>& result) override
{
assert(sequenceId == m_description.m_id);
UNUSED(sequenceId);
@ -92,11 +92,12 @@ public:
image->m_sampleLayout = std::make_shared<TensorShape>(dimensions.AsTensorShape(HWC));
image->m_numberOfSamples = 1;
image->m_chunk = shared_from_this();
result.push_back(image);
SparseSequenceDataPtr label = std::make_shared<SparseSequenceData>();
label->m_chunk = shared_from_this();
m_parent.m_labelGenerator->CreateLabelFor(imageSequence.m_classId, *label);
return std::vector<SequenceDataPtr> { image, label };
result.push_back(label);
}
};
@ -134,6 +135,29 @@ ImageDataDeserializer::ImageDataDeserializer(const ConfigParameters& config)
CreateSequenceDescriptions(configHelper.GetMapPath(), labelDimension);
}
// Descriptions of chunks exposed by the image reader.
ChunkDescriptions ImageDataDeserializer::GetChunkDescriptions()
{
ChunkDescriptions result;
result.reserve(m_imageSequences.size());
for (auto const& s : m_imageSequences)
{
auto chunk = std::make_shared<ChunkDescription>();
chunk->m_id = s.m_chunkId;
chunk->m_numberOfSamples = 1;
chunk->m_numberOfSequences = 1;
result.push_back(chunk);
}
return result;
}
void ImageDataDeserializer::GetSequencesForChunk(size_t chunkId, std::vector<SequenceDescription>& result)
{
// Currently a single sequence per chunk.
result.push_back(m_imageSequences[chunkId]);
}
void ImageDataDeserializer::CreateSequenceDescriptions(std::string mapPath, size_t labelDimension)
{
UNUSED(labelDimension);
@ -165,6 +189,8 @@ void ImageDataDeserializer::CreateSequenceDescriptions(std::string mapPath, size
description.m_chunkId = lineIndex;
description.m_path = imagePath;
description.m_classId = std::stoi(classId);
description.m_key.m_major = description.m_id;
description.m_key.m_minor = 0;
if (description.m_classId >= labelDimension)
{
@ -179,30 +205,6 @@ void ImageDataDeserializer::CreateSequenceDescriptions(std::string mapPath, size
}
}
size_t ImageDataDeserializer::GetTotalNumberOfChunks()
{
// Currently we use one chunk per image.
return m_imageSequences.size();
}
std::vector<StreamDescriptionPtr> ImageDataDeserializer::GetStreamDescriptions() const
{
return m_streams;
}
void ImageDataDeserializer::FillSequenceDescriptions(SequenceDescriptions& timeline) const
{
timeline.resize(m_imageSequences.size());
std::transform(
m_imageSequences.begin(),
m_imageSequences.end(),
timeline.begin(),
[](const ImageSequenceDescription& desc)
{
return &desc;
});
}
ChunkPtr ImageDataDeserializer::GetChunk(size_t chunkId)
{
auto sequenceDescription = m_imageSequences[chunkId];

View File

@ -22,15 +22,14 @@ class ImageDataDeserializer : public DataDeserializerBase
public:
explicit ImageDataDeserializer(const ConfigParameters& config);
// Description of streams that this data deserializer provides.
std::vector<StreamDescriptionPtr> GetStreamDescriptions() const override;
virtual size_t GetTotalNumberOfChunks() override;
// Get sequences by specified ids. Order of returned sequences corresponds to the order of provided ids.
// Gets sequences by specified ids. Order of returned sequences corresponds to the order of provided ids.
virtual ChunkPtr GetChunk(size_t chunkId) override;
protected:
void FillSequenceDescriptions(SequenceDescriptions& timeline) const override;
// Gets chunk descriptions.
virtual ChunkDescriptions GetChunkDescriptions() override;
// Gets sequence descriptions for the chunk.
virtual void GetSequencesForChunk(size_t, std::vector<SequenceDescription>&) override;
private:
// Creates a set of sequence descriptions.

View File

@ -39,7 +39,7 @@ ImageReader::ImageReader(MemoryProviderPtr provider,
TransformerPtr randomizer;
if (configHelper.ShouldRandomize())
{
randomizer = std::make_shared<BlockRandomizer>(0, 1, deserializer);
randomizer = std::make_shared<BlockRandomizer>(0, 1, deserializer, BlockRandomizer::DecimationMode::sequence, false);
}
else
{

View File

@ -8,484 +8,123 @@
#include "BlockRandomizer.h"
#include <algorithm>
#include <utility>
#include <iostream>
#include <deque>
#include "DataReader.h"
#include <random>
#include <set>
namespace Microsoft { namespace MSR { namespace CNTK {
// TODO: This is old code, used for legacy randomization to preserve the same behavior for the tests.
static inline size_t rand(const size_t begin, const size_t end)
{
// still only covers 32-bit range
const size_t randomNumber = ::rand() * RAND_MAX + ::rand();
return begin + randomNumber % (end - begin);
}
// TODO: This is old code, used for legacy randomization to preserve the same behavior for the tests.
// TODO: Will be removed after more testing of the new functionality is done, currently the set of tests is limited.
// Shuffle a vector into random order by randomly swapping elements.
template <typename TVector>
void RandomShuffle(TVector& v, size_t randomSeed)
{
if (v.size() > RAND_MAX * static_cast<size_t>(RAND_MAX))
{
RuntimeError("RandomShuffle: too large set: need to change to different random generator!");
}
srand((unsigned int)randomSeed);
foreach_index (currentLocation, v)
{
// Pick a random location and swap with the current one
const size_t randomLocation = rand(0, v.size());
std::swap(v[currentLocation], v[randomLocation]);
}
}
bool BlockRandomizer::TimelineIsValidForRandomization(const SequenceDescriptions& timeline) const
{
SequenceDescription previous = { SIZE_MAX, 0, 0, true };
auto it = std::find_if_not(timeline.begin(), timeline.end(),
[&](const SequenceDescription* current)
{
bool result = current->m_isValid
&& previous.m_id + 1 == current->m_id
&& previous.m_chunkId <= current->m_chunkId
&& current->m_chunkId <= previous.m_chunkId + 1
&& 0 < current->m_numberOfSamples;
previous = *current;
return result;
});
return it == timeline.end();
}
void BlockRandomizer::RandomizeChunks()
{
// Create vector of chunk indices and shuffle them using current sweep as seed
std::vector<size_t> randomizedChunkIndices;
randomizedChunkIndices.reserve(m_numChunks);
for (size_t i = 0; i < m_numChunks; i++)
{
randomizedChunkIndices.push_back(i);
}
if (m_useLegacyRandomization)
{
RandomShuffle(randomizedChunkIndices, m_sweep);
}
else
{
std::mt19937 m_rng((int)m_sweep);
std::shuffle(randomizedChunkIndices.begin(), randomizedChunkIndices.end(), m_rng);
}
// Place randomized chunks on global time line
m_randomizedChunks.clear();
m_randomizedChunks.reserve(m_numChunks + 1);
size_t chunkId, samplePosition, sequencePosition;
for (chunkId = 0, samplePosition = m_sweepStartInSamples, sequencePosition = 0; chunkId < m_numChunks; chunkId++)
{
const size_t originalChunkIndex = randomizedChunkIndices[chunkId];
const size_t numSequences =
m_chunkInformation[originalChunkIndex + 1].m_sequencePositionStart -
m_chunkInformation[originalChunkIndex].m_sequencePositionStart;
const size_t numSamples =
m_chunkInformation[originalChunkIndex + 1].m_samplePositionStart -
m_chunkInformation[originalChunkIndex].m_samplePositionStart;
m_randomizedChunks.push_back(RandomizedChunk{ sequencePosition, samplePosition, originalChunkIndex });
samplePosition += numSamples;
sequencePosition += numSequences;
}
// Add sentinel
m_randomizedChunks.push_back(RandomizedChunk{ sequencePosition, samplePosition, SIZE_MAX });
// For each chunk, compute the randomization range (w.r.t. the randomized chunk sequence)
size_t halfWindowRange = m_randomizationRangeInSamples / 2;
for (size_t chunkId = 0; chunkId < m_numChunks; chunkId++)
{
auto& chunk = m_randomizedChunks[chunkId];
// start with the range of left neighbor
if (chunkId == 0)
{
chunk.m_windowBegin = 0;
chunk.m_windowEnd = 1;
}
else
{
chunk.m_windowBegin = m_randomizedChunks[chunkId - 1].m_windowBegin; // might be too early
chunk.m_windowEnd = m_randomizedChunks[chunkId - 1].m_windowEnd; // might have more space
}
while (chunk.m_info.m_samplePositionStart - m_randomizedChunks[chunk.m_windowBegin].m_info.m_samplePositionStart > halfWindowRange)
chunk.m_windowBegin++; // too early
// TODO m_randomizedChunks[chunk.windowend + 1].info.samplePositionStart - m_randomizedChunks[chunk.windowbegin].info.samplePositionStart < m_randomizationRangeInSamples
chunk.m_windowEnd = std::max(chunk.m_windowEnd, chunk.m_windowBegin + 1);
while (chunk.m_windowEnd < m_numChunks &&
m_randomizedChunks[chunk.m_windowEnd + 1].m_info.m_samplePositionStart - chunk.m_info.m_samplePositionStart < halfWindowRange)
chunk.m_windowEnd++; // got more space
}
}
// TODO: Profile and eliminate PositionConverter, better convert sequencePosition to RandomizedChunk
// once.
size_t BlockRandomizer::GetChunkIndexForSequencePosition(size_t sequencePosition) const
{
assert(sequencePosition <= m_numSamples);
struct PositionConverter
{
size_t m_position;
PositionConverter(const RandomizedChunk & chunk) : m_position(chunk.m_info.m_sequencePositionStart) {};
PositionConverter(size_t sequencePosition) : m_position(sequencePosition) {};
};
auto result = std::lower_bound(m_randomizedChunks.begin(), m_randomizedChunks.end(), sequencePosition,
[](const PositionConverter& a, const PositionConverter& b)
{
return a.m_position <= b.m_position;
});
return result - m_randomizedChunks.begin() - 1;
}
bool BlockRandomizer::IsValidForPosition(size_t targetPosition, const SequenceDescription& seqDesc) const
{
const auto& chunk = m_randomizedChunks[GetChunkIndexForSequencePosition(targetPosition)];
return chunk.m_windowBegin <= seqDesc.m_chunkId && seqDesc.m_chunkId < chunk.m_windowEnd;
}
void BlockRandomizer::Randomize()
{
const auto& timeline = m_deserializer->GetSequenceDescriptions();
RandomizeChunks();
// Set up m_randomTimeline, shuffled by chunks.
m_randomTimeline.clear();
m_randomTimeline.reserve(m_numSequences);
for (size_t chunkId = 0; chunkId < m_numChunks; chunkId++)
{
auto originalChunkIndex = m_randomizedChunks[chunkId].m_originalChunkIndex;
for (size_t sequencePosition = m_chunkInformation[originalChunkIndex].m_sequencePositionStart;
sequencePosition < m_chunkInformation[originalChunkIndex + 1].m_sequencePositionStart;
sequencePosition++)
{
SequenceDescription randomizedSeqDesc = *timeline[sequencePosition];
randomizedSeqDesc.m_chunkId = chunkId;
m_randomTimeline.push_back(randomizedSeqDesc);
}
}
assert(m_randomTimeline.size() == m_numSequences);
// Check we got those setup right
foreach_index (i, m_randomTimeline)
{
assert(IsValidForPosition(i, m_randomTimeline[i]));
}
// Now randomly shuffle m_randomTimeline, while considering the
// constraints of what chunk range needs to be in memory.
srand((unsigned int)(m_sweep + 1));
foreach_index (i, m_randomTimeline)
{
// Get valid randomization range, expressed in chunks
const size_t chunkId = GetChunkIndexForSequencePosition(i);
const size_t windowBegin = m_randomizedChunks[chunkId].m_windowBegin;
const size_t windowEnd = m_randomizedChunks[chunkId].m_windowEnd;
// Get valid randomization range, expressed in sequence positions.
size_t posBegin = m_randomizedChunks[windowBegin].m_info.m_sequencePositionStart;
size_t posEnd = m_randomizedChunks[windowEnd].m_info.m_sequencePositionStart;
for (;;)
{
// Pick a sequence position from [posBegin, posEnd)
const size_t j = rand(posBegin, posEnd);
// Try again if the sequence currently at j cannot be placed at position i.
if (!IsValidForPosition(i, m_randomTimeline[j]))
continue;
// Try again if the sequence currently at i cannot be placed at position j.
if (!IsValidForPosition(j, m_randomTimeline[i]))
continue;
// Swap and break out.
std::swap(m_randomTimeline[i], m_randomTimeline[j]); // TODO old swap was perhaps more efficient
break;
}
}
// Verify that we got it right
foreach_index (i, m_randomTimeline)
{
// TODO assert only
if (!IsValidForPosition(i, m_randomTimeline[i]))
LogicError("BlockRandomizer::Randomize: randomization logic mangled!");
}
}
// Randomizes if a new sweep of the data is needed.
// Returns true when randomization happened, and false if the end of the current
// sweep has not yet been reached (no randomization took place).
bool BlockRandomizer::RandomizeIfNewSweepIsEntered()
{
// Check that StartEpoch() was called
assert(m_sequencePositionInSweep != SIZE_MAX);
if (m_sequencePositionInSweep >= m_numSequences)
{
if (m_verbosity > 0)
std::cerr << __FUNCTION__ << ": re-randomizing for sweep " << m_sweep
<< " in " << (m_frameMode ? "frame" : "utterance") << " mode" << endl;
m_sweep++;
m_sweepStartInSamples += m_numSamples;
Randomize();
m_sequencePositionInSweep -= m_numSequences;
assert(m_sequencePositionInSweep < m_numSequences); // cannot jump ahead more than a sweep
return true;
};
return false;
}
void BlockRandomizer::RandomizeForGlobalSamplePosition(const size_t samplePosition)
{
size_t sweep = samplePosition / m_numSamples;
if (m_sweep != sweep)
{
m_sweep = sweep;
m_sweepStartInSamples = sweep * m_numSamples;
Randomize();
}
m_sequencePositionInSweep = samplePosition % m_numSamples; // TODO only for m_frameMode
};
//
// Public methods
//
BlockRandomizer::BlockRandomizer(int verbosity,
size_t randomizationRangeInSamples,
IDataDeserializerPtr deserializer,
DistributionMode distributionMode,
bool useLegacyRandomization) :
m_verbosity(verbosity),
m_randomizationRangeInSamples(randomizationRangeInSamples),
m_deserializer(deserializer),
m_distributionMode(distributionMode),
m_useLegacyRandomization(useLegacyRandomization),
m_sweep(SIZE_MAX),
m_sequencePositionInSweep(SIZE_MAX),
m_samplePositionInEpoch(SIZE_MAX),
m_epochSize(SIZE_MAX)
BlockRandomizer::BlockRandomizer(
int verbosity,
size_t randomizationRangeInSamples,
IDataDeserializerPtr deserializer,
DecimationMode decimationMode,
bool useLegacyRandomization)
: m_verbosity(verbosity),
m_deserializer(deserializer),
m_decimationMode(decimationMode),
m_sweep(SIZE_MAX),
m_epochSize(SIZE_MAX),
m_globalSamplePosition(SIZE_MAX),
m_sweepTotalNumberOfSamples(0),
m_lastSeenChunkId(SIZE_MAX),
m_chunkRandomizer(std::make_shared<ChunkRandomizer>(deserializer, randomizationRangeInSamples, useLegacyRandomization))
{
assert(deserializer != nullptr);
const SequenceDescriptions& timeline = m_deserializer->GetSequenceDescriptions();
assert(TimelineIsValidForRandomization(timeline));
if (timeline.size() == 0)
{
m_numSequences = 0;
m_numChunks = 0;
}
else
{
// TODO let timeline keep this info?
m_numSequences = timeline.back()->m_id + 1;
m_numChunks = timeline.back()->m_chunkId + 1;
}
// Generate additional information about physical chunks
assert(m_chunkInformation.size() == 0);
m_chunkInformation.reserve(m_numChunks + 1);
m_chunkInformation.insert(m_chunkInformation.begin(),
m_numChunks + 1,
ChunkInformation{ SIZE_MAX, SIZE_MAX });
size_t maxNumberOfSamples = 0;
m_numSamples = 0;
for (const auto& seqDesc : timeline)
{
// TODO let timeline keep this info?
auto& chunkInformation = m_chunkInformation[seqDesc->m_chunkId];
chunkInformation.m_sequencePositionStart =
min(chunkInformation.m_sequencePositionStart, seqDesc->m_id);
chunkInformation.m_samplePositionStart =
min(chunkInformation.m_samplePositionStart, m_numSamples);
maxNumberOfSamples = max(maxNumberOfSamples, seqDesc->m_numberOfSamples);
m_numSamples += seqDesc->m_numberOfSamples;
}
// Add sentinel
m_chunkInformation[m_numChunks] = { m_numSequences, m_numSamples };
// Frame mode to the randomizer just means there are only single-sample sequences
m_frameMode = (maxNumberOfSamples == 1);
m_streams = m_deserializer->GetStreamDescriptions();
m_sequenceRandomizer = std::make_shared<SequenceRandomizer>(m_deserializer, m_chunkRandomizer);
// Calculate total number of samples.
m_sweepTotalNumberOfSamples = 0;
for (auto const & chunk : m_deserializer->GetChunkDescriptions())
{
m_sweepTotalNumberOfSamples += chunk->m_numberOfSamples;
}
}
void BlockRandomizer::Initialize(TransformerPtr next, const ConfigParameters& readerConfig)
{
// Not used for the block randomizer.
UNUSED(next);
UNUSED(readerConfig);
}
// Start a new epoch.
void BlockRandomizer::StartEpoch(const EpochConfiguration& config)
{
m_workerRank = config.m_workerRank;
m_numberOfWorkers = config.m_numberOfWorkers;
// eldak: check partial minibatches.
m_config = config;
if (config.m_totalEpochSizeInSamples == requestDataSize)
{
m_epochSize = m_numSamples;
m_epochSize = m_sweepTotalNumberOfSamples;
}
else
{
m_epochSize = config.m_totalEpochSizeInSamples;
}
// TODO add some asserts on EpochConfiguration
m_samplePositionInEpoch = 0;
size_t timeframe = m_epochSize * config.m_epochIndex;
assert(m_frameMode); // TODO !m_frameMode needs fixes
assert(timeframe != SIZE_MAX); // used as special value for init
RandomizeForGlobalSamplePosition(timeframe);
};
// Calculates global sample position.
m_globalSamplePosition = m_epochSize * config.m_epochIndex;
PrepareNewSweepIfNeeded(m_globalSamplePosition);
bool BlockRandomizer::GetNextSequenceIds(size_t sampleCount, std::vector<size_t>& originalIds, std::unordered_set<size_t>& originalChunks)
{
assert(m_frameMode); // TODO !m_frameMode not implemented yet
assert(originalIds.size() == 0);
assert(originalChunks.size() == 0);
assert(sampleCount <= m_numSamples);
if (m_samplePositionInEpoch < m_epochSize)
{
if (m_distributionMode == DistributionMode::chunk_modulus)
{
size_t distributedSampleCount = 0;
while ((m_samplePositionInEpoch < m_epochSize) &&
(distributedSampleCount < sampleCount))
{
if (RandomizeIfNewSweepIsEntered() && 0 < distributedSampleCount)
{
// Minibatch ends on sweep boundary.
// TODO matches old behavior, consider changing; make configurable
break;
}
const auto& seqDesc = m_randomTimeline[m_sequencePositionInSweep];
if ((seqDesc.m_chunkId % m_numberOfWorkers) == m_workerRank)
{
// Got one, collect it (and its window of chunks)
originalIds.push_back(seqDesc.m_id);
const auto & currentChunk = m_randomizedChunks[GetChunkIndexForSequencePosition(seqDesc.m_id)];
const size_t windowBegin = currentChunk.m_windowBegin;
const size_t windowEnd = currentChunk.m_windowEnd;
for (size_t chunk = windowBegin; chunk < windowEnd; chunk++)
{
if ((chunk % m_numberOfWorkers) == m_workerRank)
{
originalChunks.insert(m_randomizedChunks[chunk].m_originalChunkIndex);
}
}
}
m_samplePositionInEpoch += seqDesc.m_numberOfSamples;
m_sequencePositionInSweep++;
distributedSampleCount++;
}
}
else
{
assert(m_distributionMode == DistributionMode::sequences_strides);
size_t nextSamplePositionInEpoch = std::min(m_epochSize, m_samplePositionInEpoch + sampleCount);
size_t distributedSampleCount = nextSamplePositionInEpoch - m_samplePositionInEpoch;
size_t strideBegin = distributedSampleCount * m_workerRank / m_numberOfWorkers;
size_t strideEnd = distributedSampleCount * (m_workerRank + 1) / m_numberOfWorkers;
for (size_t i = 0; i < distributedSampleCount; ++i, ++m_samplePositionInEpoch, ++m_sequencePositionInSweep)
{
RandomizeIfNewSweepIsEntered(); // TODO return value ignored here?
if (strideBegin <= i && i < strideEnd)
{
const auto& seqDesc = m_randomTimeline[m_sequencePositionInSweep];
originalIds.push_back(seqDesc.m_id);
const auto & currentChunk = m_randomizedChunks[GetChunkIndexForSequencePosition(m_sequencePositionInSweep)];
const size_t windowBegin = currentChunk.m_windowBegin;
const size_t windowEnd = currentChunk.m_windowEnd;
for (size_t chunk = windowBegin; chunk < windowEnd; chunk++)
{
originalChunks.insert(m_randomizedChunks[chunk].m_originalChunkIndex);
}
}
}
assert(m_samplePositionInEpoch == nextSamplePositionInEpoch);
}
}
return m_epochSize <= m_samplePositionInEpoch;
// Sets sequence cursor to the sequence that corresponds to the global sample position.
// If last epoch ended in the middle of a sequence, the cursor is moved to the next sequence in the sweep.
m_sequenceRandomizer->SetSequencePositionTo(m_globalSamplePosition % m_sweepTotalNumberOfSamples, m_sweep);
}
// Prepares a new sweep if needed.
void BlockRandomizer::PrepareNewSweepIfNeeded(size_t samplePosition)
{
size_t sweep = samplePosition / m_sweepTotalNumberOfSamples;
if (m_sweep != sweep)
{
m_sweep = sweep;
m_sweepStartInSamples = sweep * m_sweepTotalNumberOfSamples;
// Rerandomizing the chunks.
m_chunkRandomizer->Randomize((unsigned int)m_sweep);
// Resetting the sequence randomizer.
m_sequenceRandomizer->Reset(m_sweep + 1);
// Unloading all chunk data from memory.
m_chunks.clear();
m_lastSeenChunkId = SIZE_MAX;
}
}
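// As a worked illustration of the bookkeeping above (the numbers are assumed, not taken from a real configuration):
// with a sweep of 1000 samples and an epoch size of 300 samples, epoch 4 starts at global sample position 1200,
// which lies in sweep 1 at offset 200. The standalone sketch below performs the same arithmetic as
// StartEpoch/PrepareNewSweepIfNeeded.
#include <cstddef>
#include <cstdio>

int main()
{
    // Assumed sizes for illustration only.
    const std::size_t sweepTotalNumberOfSamples = 1000; // samples per sweep
    const std::size_t epochSize = 300;                  // samples per epoch
    const std::size_t epochIndex = 4;

    std::size_t globalSamplePosition = epochSize * epochIndex;                      // 1200
    std::size_t sweep = globalSamplePosition / sweepTotalNumberOfSamples;           // 1
    std::size_t sweepStartInSamples = sweep * sweepTotalNumberOfSamples;            // 1000
    std::size_t positionInSweep = globalSamplePosition % sweepTotalNumberOfSamples; // 200

    std::printf("sweep=%zu, sweepStart=%zu, offsetInSweep=%zu\n",
                sweep, sweepStartInSamples, positionInSweep);
    return 0;
}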
// Gets next sequences not exceeding sampleCount.
Sequences BlockRandomizer::GetNextSequences(size_t sampleCount)
{
assert(m_frameMode); // TODO sequence mode not implemented yet
assert(m_samplePositionInEpoch != SIZE_MAX); // SetEpochConfiguration() must be called first
std::vector<size_t> originalIds;
std::unordered_set<size_t> originalChunks;
// Get next sequence descriptions.
Sequences result;
result.m_endOfEpoch = GetNextSequenceIds(sampleCount, originalIds, originalChunks);
if (originalIds.size() == 0)
std::vector<RandomizedSequenceDescription> sequences;
result.m_endOfEpoch = GetNextSequenceDescriptions(sampleCount, sequences);
if (sequences.size() == 0)
{
return result;
}
// Require and release chunks from the data deserializer
for (size_t originalChunkIndex = 0; originalChunkIndex < m_numChunks; originalChunkIndex++)
// Decimate.
std::vector<RandomizedSequenceDescription> decimated;
decimated.reserve(sequences.size());
Decimate(sequences, decimated);
if (decimated.size() == 0)
{
if (originalChunks.find(originalChunkIndex) != originalChunks.end())
{
if (m_chunks.find(originalChunkIndex) == m_chunks.end())
{
m_chunks[originalChunkIndex] = m_deserializer->GetChunk(originalChunkIndex);
}
}
else
{
m_chunks.erase(originalChunkIndex);
}
return result;
}
const auto& originalTimeline = m_deserializer->GetSequenceDescriptions();
result.m_data.resize(m_streams.size(), std::vector<SequenceDataPtr>(originalIds.size()));
result.m_data.resize(m_streams.size(), std::vector<SequenceDataPtr>(decimated.size()));
// TODO: This will be changed when we move transformers under the randomizer.
// TODO: The randomizer should not deal with multithreading.
#pragma omp parallel for ordered schedule(dynamic)
for (int i = 0; i < originalIds.size(); ++i)
#pragma omp parallel for ordered schedule(dynamic)
for (int i = 0; i < decimated.size(); ++i)
{
const auto& sequenceDescription = originalTimeline[originalIds[i]];
auto sequence = m_chunks[sequenceDescription->m_chunkId]->GetSequence(originalIds[i]);
const auto& description = decimated[i];
std::vector<SequenceDataPtr> sequence;
auto it = m_chunks.find(description.m_chunk->m_chunkId);
if (it == m_chunks.end())
{
LogicError("Invalid chunk requested.");
}
it->second->GetSequence(description.m_id, sequence);
for (int j = 0; j < m_streams.size(); ++j)
{
result.m_data[j][i] = sequence[j];
@ -493,6 +132,111 @@ Sequences BlockRandomizer::GetNextSequences(size_t sampleCount)
}
return result;
};
}
// Get next sequence descriptions that do not exceed sample count.
// Returns true if epoch end is reached.
bool BlockRandomizer::GetNextSequenceDescriptions(size_t sampleCount, std::vector<RandomizedSequenceDescription>& result)
{
PrepareNewSweepIfNeeded(m_globalSamplePosition);
// Check epoch.
size_t epochStart = m_config.m_epochIndex * m_epochSize;
if (m_globalSamplePosition - epochStart + sampleCount >= m_epochSize)
{
sampleCount = epochStart + m_epochSize - m_globalSamplePosition;
}
if (sampleCount <= 0)
{
return true;
}
// Check that we do not go over the sweep.
size_t sweepPosition = m_globalSamplePosition % m_sweepTotalNumberOfSamples;
if (sweepPosition + sampleCount >= m_sweepTotalNumberOfSamples)
{
sampleCount = m_sweepTotalNumberOfSamples - sweepPosition;
}
assert(sampleCount != 0);
// Randomizing sequences
result = m_sequenceRandomizer->GetNextSequenceDescriptions(sampleCount);
return false;
}
// Decimates sequences and loads/unloads chunks using information from the SequenceRandomizer.
void BlockRandomizer::Decimate(const std::vector<RandomizedSequenceDescription>& all, std::vector<RandomizedSequenceDescription>& decimated)
{
// Swap out stale chunks and add new ones.
// Make sure the data for all required chunks is loaded.
RetrieveDataChunks();
// Moving the cursor to the end of read sequences.
for (const auto& sequence : all)
{
m_globalSamplePosition += sequence.m_numberOfSamples;
}
decimated.reserve(all.size());
if (m_decimationMode == DecimationMode::chunk)
{
for (const auto& sequence : all)
{
if (sequence.m_chunk->m_chunkId % m_config.m_numberOfWorkers == m_config.m_workerRank)
{
decimated.push_back(sequence);
}
}
}
else if (m_decimationMode == DecimationMode::sequence)
{
size_t strideBegin = all.size() * m_config.m_workerRank / m_config.m_numberOfWorkers;
size_t strideEnd = all.size() * (m_config.m_workerRank + 1) / m_config.m_numberOfWorkers;
decimated.assign(all.begin() + strideBegin, all.begin() + strideEnd);
}
else
{
LogicError("Unsupported decimation mode.");
}
}
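// A small, self-contained illustration of the sequence-mode stride split above, with assumed counts
// (10 sequences in the minibatch, 4 workers); this is not reader code, just the same integer arithmetic
// as DecimationMode::sequence in Decimate().
#include <cstddef>
#include <cstdio>

int main()
{
    const std::size_t totalSequences = 10;  // sequences in this minibatch (assumed)
    const std::size_t numberOfWorkers = 4;  // workers (assumed)

    for (std::size_t workerRank = 0; workerRank < numberOfWorkers; ++workerRank)
    {
        std::size_t strideBegin = totalSequences * workerRank / numberOfWorkers;
        std::size_t strideEnd = totalSequences * (workerRank + 1) / numberOfWorkers;
        // Prints: worker 0 -> [0, 2), worker 1 -> [2, 5), worker 2 -> [5, 7), worker 3 -> [7, 10)
        std::printf("worker %zu takes sequences [%zu, %zu)\n", workerRank, strideBegin, strideEnd);
    }
    return 0;
}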
// Retrieves chunk data based on the window information provided by the SequenceRandomizer.
void BlockRandomizer::RetrieveDataChunks()
{
const auto& window = m_sequenceRandomizer->GetChunkWindow();
if (window.back().m_chunkId == m_lastSeenChunkId)
{
return; // nothing to retrieve.
}
m_lastSeenChunkId = window.back().m_chunkId;
// In the loop below we build a new map of currently loaded chunks:
// we iterate through all chunks in the window, and any chunk that is not yet in the m_chunks map
// is requested from the deserializer.
// Chunks in m_chunks that are no longer required are dropped when the new map is swapped in.
std::map<size_t, ChunkPtr> chunks;
for (auto const& chunk : window)
{
if (m_decimationMode == DecimationMode::chunk && chunk.m_chunkId % m_config.m_numberOfWorkers != m_config.m_workerRank)
{
continue;
}
auto it = m_chunks.find(chunk.m_original->m_id);
if (it != m_chunks.end())
{
chunks[chunk.m_chunkId] = it->second;
}
else
{
chunks[chunk.m_chunkId] = m_deserializer->GetChunk(chunk.m_original->m_id);
}
}
// Swap the new map into m_chunks, dropping all stale chunks and keeping the newly loaded ones.
m_chunks.swap(chunks);
}
}}}

View File

@ -6,115 +6,117 @@
#pragma once
#include <vector>
#include <unordered_set>
#include "Transformer.h"
#include "DataDeserializer.h"
#include "ChunkRandomizer.h"
#include "SequenceRandomizer.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// The class represents a randomizer that does randomization based on chunks/sequences inside a set of chunks.
// TODO: currently this code is moved from the old block randomizer.
// TODO: The class will be further refactored and a common base will be extracted with NoRandomizer.
// TODO: Currently works only for frame mode (numberOfSample in sequence == 1)
// TODO: This layering will be changed, when we move transformers under the randomizer, it won't be a transformer anymore.
// A randomizer that first randomizes chunks and then sequences inside a rolling window of chunks.
// Uses ChunkRandomizer to randomize chunk descriptions and SequenceRandomizer to randomize sequence descriptions inside a window of chunks.
// It requires only a window of sequence descriptions and corresponding chunk data.
// The code is based on the old block randomizer and preserves the same behavior so that all available tests pass.
// The high-level algorithm is:
// When the next sequences are requested (limited by sampleCount), the following steps are performed (see the usage sketch below):
// 1) if a new sweep is entered, randomize chunk descriptions using ChunkRandomizer, also precalculate randomization windows for all
// chunk descriptions
// 2) if a new chunk is entered, use SequenceRandomizer to identify a window of chunks and request their sequence descriptions from the deserializer.
// 3) randomize sequence descriptions inside the window
// 4) return sequence descriptions not exceeding the sampleCount/minibatch limit
// 5) decimate sequence descriptions based on the worker rank
// 6) request chunks of data based on the decimated sequences and return sequence data
//
// This class is responsible for decimation and for loading the data chunks into memory.
// Actual randomization happens in ChunkRandomizer and SequenceRandomizer.
// TODO: The behavior can be simplified by only randomizing sequences forward.
// TODO: The layering will be changed, when we move transformers under the randomizer, it won't be a transformer anymore.
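// Usage sketch (not part of the reader, kept as a comment): how a consumer such as HTKMLFReader might drive this
// class, assuming a deserializer (e.g. the Bundler above) and a readerConfig are already available; the concrete
// numbers are illustrative only.
//
//     auto randomizer = std::make_shared<BlockRandomizer>(
//         /*verbosity*/ 2,
//         /*randomizationRangeInSamples*/ 30000,
//         deserializer,                            // assumed to exist
//         BlockRandomizer::DecimationMode::chunk,
//         /*useLegacyRandomization*/ true);
//     randomizer->Initialize(nullptr, readerConfig);
//
//     EpochConfiguration config;
//     config.m_numberOfWorkers = 1;
//     config.m_workerRank = 0;
//     config.m_epochIndex = 0;
//     config.m_totalEpochSizeInSamples = 100000;
//     randomizer->StartEpoch(config);
//
//     for (;;)
//     {
//         Sequences minibatch = randomizer->GetNextSequences(/*sampleCount*/ 256);
//         // minibatch.m_data[stream][sequence] holds the SequenceDataPtr for each exposed stream.
//         if (minibatch.m_endOfEpoch)
//             break;
//     }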
class BlockRandomizer : public Transformer
{
public:
enum class DistributionMode {
chunk_modulus,
sequences_strides
// Currently, decimation based on sequences or chunks is supported.
enum class DecimationMode
{
chunk,
sequence
};
BlockRandomizer(int verbosity,
size_t randomizationRangeInSamples,
IDataDeserializerPtr deserializer,
DistributionMode distributionMode = DistributionMode::sequences_strides,
bool useLegacyRandomization = false);
BlockRandomizer(
int verbosity,
size_t randomizationRangeInSamples,
IDataDeserializerPtr deserializer,
DecimationMode decimationMode = DecimationMode::chunk,
bool useLegacyRandomization = false);
virtual ~BlockRandomizer()
{
}
virtual void Initialize(TransformerPtr, const ConfigParameters&) override {};
virtual void Initialize(TransformerPtr next, const ConfigParameters& readerConfig) override;
// Starts a new epoch.
virtual void StartEpoch(const EpochConfiguration& config) override;
// Gets next sequences.
virtual Sequences GetNextSequences(size_t sampleCount) override;
// Gets stream descriptions.
virtual std::vector<StreamDescriptionPtr> GetStreamDescriptions() const override
{
return m_deserializer->GetStreamDescriptions();
}
private:
// Structure for per-chunk information
struct ChunkInformation
{
size_t m_sequencePositionStart;
size_t m_samplePositionStart;
};
// Retrieve data for chunks.
void RetrieveDataChunks();
// Structure that will be maintained for each randomized chunk
struct RandomizedChunk
{
struct ChunkInformation m_info; // sample positions are global // TODO could drop 'global' requirement?
// Get next sequence descriptions that do not exceed sample count.
// Returns true if epoch end is reached.
bool GetNextSequenceDescriptions(size_t sampleCount, std::vector<RandomizedSequenceDescription>& result);
size_t m_originalChunkIndex;
// Decimates sequence descriptions and loads chunks of data.
void Decimate(const std::vector<RandomizedSequenceDescription>& all, std::vector<RandomizedSequenceDescription>& decimated);
// Randomization range (in randomized chunk positions; right-side open)
size_t m_windowBegin;
size_t m_windowEnd;
};
// Prepares a new sweep if needed.
void PrepareNewSweepIfNeeded(size_t samplePosition);
// General configuration
bool m_useLegacyRandomization;
int m_verbosity;
size_t m_randomizationRangeInSamples; // full window
DistributionMode m_distributionMode;
// Global sample position on the timeline.
size_t m_globalSamplePosition;
// Deserializer and information on the original timeline
IDataDeserializerPtr m_deserializer;
size_t m_numSequences;
size_t m_numChunks;
size_t m_numSamples;
bool m_frameMode; // true iff only single-sample sequences
std::vector<ChunkInformation> m_chunkInformation; // (includes a sentinel)
// Configuration of the epoch.
EpochConfiguration m_config;
// Per-epoch configuration
size_t m_workerRank;
size_t m_numberOfWorkers;
// Epoch size.
size_t m_epochSize;
size_t m_samplePositionInEpoch;
// Per-randomization-sweep information
// Current sweep.
size_t m_sweep;
size_t m_sweepStartInSamples; // TODO do we need it?
size_t m_sequencePositionInSweep;
std::vector<RandomizedChunk> m_randomizedChunks; // (includes a sentinel)
// TODO optimize footprint:
// (do not require full timeline, i.e., Amit's change in original HTKMLFReader)
// (instead of SequenceDescription, use something smaller)
std::vector<SequenceDescription> m_randomTimeline;
// Global position of the current sweep in samples.
size_t m_sweepStartInSamples;
// Total number of samples in a sweep.
size_t m_sweepTotalNumberOfSamples;
IDataDeserializerPtr m_deserializer;
// Chunk randomizer.
ChunkRandomizerPtr m_chunkRandomizer;
// Sequence randomizer.
SequenceRandomizerPtr m_sequenceRandomizer;
// Exposed streams.
std::vector<StreamDescriptionPtr> m_streams;
// Chunks that we currently hold a pointer to
std::map<size_t, ChunkPtr> m_chunks; // TODO vector? or unordered_map
// A map of data chunks.
std::map<size_t, ChunkPtr> m_chunks;
// Check that timeline has only valid sequences of non-zero length
// with incrementing IDs and non-decreasing chunk identifiers.
bool TimelineIsValidForRandomization(const SequenceDescriptions& timeline) const;
// Last seen data chunk id.
size_t m_lastSeenChunkId;
void RandomizeChunks();
// Decimation mode.
DecimationMode m_decimationMode;
size_t GetChunkIndexForSequencePosition(size_t sequencePosition) const;
bool IsValidForPosition(size_t targetPosition, const SequenceDescription& seqDesc) const;
void Randomize();
void RandomizeForGlobalSamplePosition(const size_t samplePosition);
bool RandomizeIfNewSweepIsEntered();
bool GetNextSequenceIds(size_t sampleCount, std::vector<size_t>& originalIds, std::unordered_set<size_t>& originalChunks);
// General configuration
int m_verbosity;
};
}}}

View File

@ -8,110 +8,159 @@
namespace Microsoft { namespace MSR { namespace CNTK {
// Represents a bundled chunk description with possibly cleansed data.
struct Bundler::BundlerChunkDescription : public ChunkDescription
{
ChunkDescriptionPtr m_original;
// Sequences that are invalid in at least one deserializer.
std::set<size_t> m_invalid;
};
Bundler::Bundler(
const ConfigParameters& readerConfig,
IDataDeserializerPtr driver,
std::vector<IDataDeserializerPtr> deserializers)
std::vector<IDataDeserializerPtr> deserializers,
bool cleanse)
: m_deserializers(deserializers), m_driver(driver)
{
UNUSED(readerConfig);
std::vector<StreamDescriptionPtr> streams;
// Combines streams of underlying deserializers.
for (auto d : deserializers)
{
for (auto i : d->GetStreamDescriptions())
{
StreamDescriptionPtr stream = std::make_shared<StreamDescription>(*i);
stream->m_id = streams.size();
streams.push_back(stream);
stream->m_id = m_streams.size();
m_streams.push_back(stream);
}
}
m_streams = streams;
CreateSequenceDescriptions();
m_cleanse = cleanse;
CreateChunkDescriptions();
}
// Creates additional structures for fast indexing between chunks/deserializers.
// TODO: This must be changed when we introduce chunking of the timeline.
void Bundler::CreateSequenceDescriptions()
// Creates chunk descriptions based on chunks of underlying deserializers.
void Bundler::CreateChunkDescriptions()
{
m_sequenceToSequence.resize(m_deserializers.size());
m_sequenceToChunk.resize(m_deserializers.size());
m_sequenceDescriptions.reserve(m_driver->GetSequenceDescriptions().size());
m_chunkOffsets.reserve(m_driver->GetTotalNumberOfChunks() + 1);
size_t maxNumberOfSequences = m_driver->GetSequenceDescriptions().size();
for (int i = 0; i < m_deserializers.size(); ++i)
auto chunks = m_driver->GetChunkDescriptions();
if (chunks.size() < 1)
{
// TODO use reserve(), .push_back() ? also elsewhere
m_sequenceToSequence[i].resize(maxNumberOfSequences);
m_sequenceToChunk[i].resize(maxNumberOfSequences);
RuntimeError("Driving deserializer should provide at least one chunk.");
}
fprintf(stderr, "Bundler::CreateSequenceDescriptions: auxiliary mapping data for bundler has been allocated.\n");
m_chunks.reserve(chunks.size());
size_t previousChunk = SIZE_MAX;
size_t currentMapping = 0;
for (int i = 0; i < m_driver->GetSequenceDescriptions().size(); ++i)
// If no cleansing is required, simply build chunks based on the chunk descriptions of the primary deserializer.
if (!m_cleanse)
{
const auto* sequenceDescription = m_driver->GetSequenceDescriptions()[i];
bool isValid = true;
for (int j = 1; j < m_deserializers.size(); ++j)
for (const auto& c : chunks)
{
auto description = m_deserializers[j]->GetSequenceDescriptionByKey(sequenceDescription->m_key);
if (!description->m_isValid)
auto cd = std::make_shared<BundlerChunkDescription>();
cd->m_numberOfSamples = c->m_numberOfSamples;
cd->m_numberOfSequences = c->m_numberOfSequences;
cd->m_id = m_chunks.size();
cd->m_original = c;
m_chunks.push_back(cd);
}
return;
}
// Otherwise build bundling chunks using underlying deserializers.
std::vector<SequenceDescription> sequenceDescriptions;
sequenceDescriptions.reserve(chunks.front()->m_numberOfSequences);
SequenceDescription s;
for (size_t chunkIndex = 0; chunkIndex < chunks.size(); ++chunkIndex)
{
size_t numberOfSamples = 0;
size_t numberOfSequences = 0;
sequenceDescriptions.clear();
// Iterate through all sequences and identify whether they are valid across all deserializers.
m_driver->GetSequencesForChunk(chunks[chunkIndex]->m_id, sequenceDescriptions);
std::set<size_t> invalid;
for (size_t sequenceIndex = 0; sequenceIndex < sequenceDescriptions.size(); ++sequenceIndex)
{
auto sequence = sequenceDescriptions[sequenceIndex];
bool isValid = true;
for (size_t deserializerIndex = 1; deserializerIndex < m_deserializers.size(); ++deserializerIndex)
{
isValid = false;
break;
m_deserializers[deserializerIndex]->GetSequenceDescriptionByKey(sequenceDescriptions[sequenceIndex].m_key, s);
if (!s.m_isValid)
{
isValid = false;
invalid.insert(sequenceIndex);
break;
}
}
m_sequenceToChunk[j][currentMapping] = description->m_chunkId;
m_sequenceToSequence[j][currentMapping] = description->m_id;
}
m_sequenceToChunk[0][currentMapping] = sequenceDescription->m_chunkId;
m_sequenceToSequence[0][currentMapping] = sequenceDescription->m_id;
if (isValid)
{
if (sequenceDescription->m_chunkId != previousChunk)
if (isValid)
{
m_chunkOffsets.push_back(m_sequenceDescriptions.size());
previousChunk = sequenceDescription->m_chunkId;
numberOfSamples += sequence.m_numberOfSamples;
numberOfSequences++;
}
m_sequenceDescriptions.push_back(*sequenceDescription);
m_sequenceDescriptions.back().m_id = m_sequenceDescriptions.size() - 1;
m_sequenceToSequence[0][currentMapping] = sequenceDescription->m_id;
currentMapping++;
}
}
fprintf(stderr, "Bundler::CreateSequenceDescriptions: dropped %d sequences\n", (int)(m_driver->GetSequenceDescriptions().size() - m_sequenceDescriptions.size()));
fprintf(stderr, "Bundler::CreateSequenceDescriptions: total number of sequences is %d\n", (int)m_sequenceDescriptions.size());
for (int i = 0; i < m_deserializers.size(); ++i)
{
m_sequenceToSequence[i].resize(currentMapping);
}
// Last
m_chunkOffsets.push_back(m_sequenceDescriptions.size());
m_sequences.resize(m_sequenceDescriptions.size());
for (int k = 0; k < m_sequenceDescriptions.size(); ++k)
{
m_sequences[k] = &m_sequenceDescriptions[k];
// Build a chunk for valid sequences.
if (numberOfSamples > 0)
{
auto cd = std::make_shared<BundlerChunkDescription>();
cd->m_numberOfSamples = numberOfSamples;
cd->m_numberOfSequences = numberOfSequences;
cd->m_id = m_chunks.size();
cd->m_original = chunks[chunkIndex];
m_chunks.push_back(cd);
cd->m_invalid = std::move(invalid);
}
}
}
// Represents a chunk that has pointers to the underlying deserializer chunks.
class BundlingChunk : public Chunk
// Gets chunk descriptions.
ChunkDescriptions Bundler::GetChunkDescriptions()
{
return ChunkDescriptions(m_chunks.begin(), m_chunks.end());
}
// Gets sequence descriptions for a chunk.
void Bundler::GetSequencesForChunk(size_t chunkId, std::vector<SequenceDescription>& sequences)
{
BundlerChunkDescriptionPtr chunk = m_chunks[chunkId];
ChunkDescriptionPtr original = chunk->m_original;
m_driver->GetSequencesForChunk(original->m_id, sequences);
// Can return early because all sequences are clean.
if (chunk->m_invalid.empty())
{
return;
}
// Do cleansing.
std::vector<SequenceDescription> result;
result.reserve(sequences.size());
for (size_t sequenceIndex = 0; sequenceIndex < sequences.size(); ++sequenceIndex)
{
if (chunk->m_invalid.find(sequenceIndex) != chunk->m_invalid.end())
{
continue;
}
result.push_back(sequences[sequenceIndex]);
}
std::swap(sequences, result);
}
// Represents a chunk that has pointers to the underlying deserializer chunks.
class Bundler::BundlingChunk : public Chunk
{
size_t m_numberOfInputs;
Bundler* m_parent;
size_t m_chunkId;
std::vector<std::vector<ChunkPtr>> m_innerChunks;
// A mapping between exposed sequence id and inner chunk for each deserializer.
std::vector<ChunkPtr> m_innerChunks;
// A mapping between exposed sequence id and inner sequence id for each deserializer.
std::vector<size_t> m_sequenceToSequence;
DISABLE_COPY_AND_MOVE(BundlingChunk);
@ -119,62 +168,67 @@ public:
BundlingChunk(size_t numberOfInputs, Bundler* parent, size_t chunkId)
: m_numberOfInputs(numberOfInputs), m_parent(parent), m_chunkId(chunkId)
{
size_t numberOfSequences = m_parent->m_chunkOffsets[chunkId + 1] - m_parent->m_chunkOffsets[chunkId];
m_innerChunks.resize(numberOfSequences);
BundlerChunkDescriptionPtr chunk = m_parent->m_chunks[m_chunkId];
ChunkDescriptionPtr original = chunk->m_original;
int innerIndex = 0;
for (size_t sequenceId = m_parent->m_chunkOffsets[chunkId]; innerIndex < numberOfSequences; ++sequenceId, ++innerIndex)
auto& deserializers = m_parent->m_deserializers;
assert(numberOfInputs == deserializers.size());
std::vector<SequenceDescription> sequences;
sequences.reserve(original->m_numberOfSequences);
// Creating chunk mapping.
m_parent->m_driver->GetSequencesForChunk(original->m_id, sequences);
ChunkPtr drivingChunk = m_parent->m_driver->GetChunk(original->m_id);
m_sequenceToSequence.resize(m_numberOfInputs * sequences.size());
m_innerChunks.resize(m_numberOfInputs * sequences.size());
for (size_t sequenceIndex = 0; sequenceIndex < sequences.size(); ++sequenceIndex)
{
m_innerChunks[innerIndex].resize(m_parent->m_deserializers.size());
for (size_t i = 0; i < m_parent->m_deserializers.size(); ++i)
if (chunk->m_invalid.find(sequenceIndex) != chunk->m_invalid.end())
{
size_t innerChunkId = m_parent->m_sequenceToChunk[i][sequenceId];
m_innerChunks[innerIndex][i] = m_parent->m_deserializers[i]->GetChunk(innerChunkId);
continue;
}
size_t currentIndex = sequenceIndex * m_numberOfInputs;
m_sequenceToSequence[currentIndex] = sequences[sequenceIndex].m_id;
m_innerChunks[currentIndex] = drivingChunk;
}
// Creating the sequence mapping and acquiring the underlying chunks.
SequenceDescription s;
for (size_t deserializerIndex = 1; deserializerIndex < m_parent->m_deserializers.size(); ++deserializerIndex)
{
for (size_t sequenceIndex = 0; sequenceIndex < sequences.size(); ++sequenceIndex)
{
if (chunk->m_invalid.find(sequenceIndex) != chunk->m_invalid.end())
{
continue;
}
size_t currentIndex = sequenceIndex * m_numberOfInputs + deserializerIndex;
deserializers[deserializerIndex]->GetSequenceDescriptionByKey(sequences[sequenceIndex].m_key, s);
m_sequenceToSequence[currentIndex] = s.m_id;
m_innerChunks[currentIndex] = deserializers[deserializerIndex]->GetChunk(s.m_chunkId);
}
}
}
virtual std::vector<SequenceDataPtr> GetSequence(size_t sequenceId) override
// Gets sequence by its id.
virtual void GetSequence(size_t sequenceId, std::vector<SequenceDataPtr>& result) override
{
size_t index = sequenceId - m_parent->m_chunkOffsets[m_chunkId];
const auto& chunks = m_innerChunks[index];
std::vector<SequenceDataPtr> result;
result.reserve(m_numberOfInputs);
for (int i = 0; i < chunks.size(); ++i)
size_t currentIndex = sequenceId * m_numberOfInputs;
for (int i = 0; i < m_parent->m_deserializers.size(); ++i)
{
size_t originalSequenceId = m_parent->m_sequenceToSequence[i][sequenceId];
auto sequences = chunks[i]->GetSequence(originalSequenceId);
result.insert(result.end(), sequences.begin(), sequences.end());
size_t originalSequenceId = m_sequenceToSequence[currentIndex + i];
m_innerChunks[currentIndex + i]->GetSequence(originalSequenceId, result);
}
return result;
}
};
// Get chunk data by id.
ChunkPtr Bundler::GetChunk(size_t chunkId)
{
return std::make_shared<BundlingChunk>(m_streams.size(), this, chunkId);
}
const SequenceDescriptions& Bundler::GetSequenceDescriptions() const
{
return m_sequences;
}
std::vector<StreamDescriptionPtr> Bundler::GetStreamDescriptions() const
{
return m_streams;
}
const SequenceDescription* Bundler::GetSequenceDescriptionByKey(const KeyType&)
{
throw std::logic_error("Not implemented");
}
size_t Bundler::GetTotalNumberOfChunks()
{
return m_chunkOffsets.size();
}
}}}

View file

@ -6,62 +6,54 @@
#pragma once
#include "DataDeserializer.h"
#include "DataDeserializerBase.h"
#include "Config.h"
#include <set>
namespace Microsoft { namespace MSR { namespace CNTK {
// Represents a bundler of several deserializers.
// When only a single deserializer is used, the bundler can be omitted and
// no performance penalty is paid.
// TODO: The interface will change when the timeline supports chunking.
class Bundler : public IDataDeserializer
class Bundler : public DataDeserializerBase
{
public:
Bundler(const ConfigParameters& readerConfig, IDataDeserializerPtr driver, std::vector<IDataDeserializerPtr> deserializers);
Bundler(const ConfigParameters& readerConfig, IDataDeserializerPtr driver, std::vector<IDataDeserializerPtr> deserializers, bool cleanse);
// Retrieves description of all sequences this data deserializer can produce, together with associated chunks.
// TODO: For huge corpus, the memory footprint is too big. We adapt this interface to request timeline in chunks.
virtual const SequenceDescriptions& GetSequenceDescriptions() const override;
// Gets chunk descriptions.
virtual ChunkDescriptions GetChunkDescriptions() override;
// Retrieves description of a single sequence given its key.
virtual const SequenceDescription* GetSequenceDescriptionByKey(const KeyType& key) override;
// Gets sequence descriptions for a particular chunk.
virtual void GetSequencesForChunk(size_t chunkId, std::vector<SequenceDescription>& result) override;
// Describes bundled streams of the underlying data deserializers.
virtual std::vector<StreamDescriptionPtr> GetStreamDescriptions() const override;
// Retrieves a chunk with data.
virtual ChunkPtr GetChunk(size_t) override;
// Retrieves total number of chunks this deserializer can produce.
virtual size_t GetTotalNumberOfChunks() override;
// Gets a chunk with data.
virtual ChunkPtr GetChunk(size_t chunkId) override;
private:
DISABLE_COPY_AND_MOVE(Bundler);
void CreateSequenceDescriptions();
class BundlingChunk;
struct BundlerChunkDescription;
typedef std::shared_ptr<BundlerChunkDescription> BundlerChunkDescriptionPtr;
// Creates chunk descriptions based on chunks of underlying deserializers.
void CreateChunkDescriptions();
// Exposed bundled streams.
std::vector<StreamDescriptionPtr> m_streams;
// Underlying deserializers.
std::vector<IDataDeserializerPtr> m_deserializers;
// Driving deserializer that defines chunks.
IDataDeserializerPtr m_driver;
// Sequence descriptions.
std::vector<SequenceDescription> m_sequenceDescriptions;
SequenceDescriptions m_sequences;
// Chunk descriptions.
std::vector<BundlerChunkDescriptionPtr> m_chunks;
// Exposed sequence id to chunk mapping.
std::vector<std::vector<size_t>> m_sequenceToChunk;
// Exposed sequence id to internal sequence id mapping.
std::vector<std::vector<size_t>> m_sequenceToSequence;
// Chunk offsets - m_chunkOffsets[chunkId] stores the index of
// the sequence in m_sequenceDescription where the chunk starts.
std::vector<size_t> m_chunkOffsets;
friend class BundlingChunk;
// A flag that indicates whether there is a need to clean data between different deserializers.
// It is possible that a sequence is valid in one deserializer but invalid in another. Such sequences should be removed.
// At the same time this introduces unnecessary overhead when the data is already clean, because all chunks have to be checked in advance to expose
// the correct number of samples/sequences they contain.
// If this flag is set to false, no cleaning is done and this overhead is avoided.
bool m_cleanse;
};
}}}
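A minimal usage sketch (added for illustration, not part of this change) of how a Bundler might be assembled from a driving deserializer and one secondary deserializer; the deserializer and config variables are hypothetical placeholders, and cleanse=true enables the cross-deserializer validity check described by the m_cleanse flag above.
// Sketch only: featureDeserializer, labelDeserializer and readerConfig are hypothetical.
IDataDeserializerPtr driver = featureDeserializer; // defines the chunking
std::vector<IDataDeserializerPtr> deserializers = { featureDeserializer, labelDeserializer };
auto bundler = std::make_shared<Bundler>(readerConfig, driver, deserializers, /*cleanse=*/ true);
// The bundler exposes the union of streams and the chunks of the driving deserializer,
// with sequences that are invalid in any secondary deserializer filtered out.
auto streams = bundler->GetStreamDescriptions();
auto chunks = bundler->GetChunkDescriptions();
std::vector<SequenceDescription> sequences;
bundler->GetSequencesForChunk(chunks.front()->m_id, sequences);
ChunkPtr chunkData = bundler->GetChunk(chunks.front()->m_id);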

View file

@ -0,0 +1,128 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#define _CRT_SECURE_NO_WARNINGS
#include "ChunkRandomizer.h"
#include <random>
namespace Microsoft { namespace MSR { namespace CNTK {
// NOTE: This is old code, used for legacy randomization to make sure we preserve the same behavior for the tests.
// TODO: Deprecate when the new randomizer is in place.
static inline size_t rand(const size_t begin, const size_t end)
{
// still only covers 32-bit range
const size_t randomNumber = ::rand() * RAND_MAX + ::rand();
return begin + randomNumber % (end - begin);
}
// NOTE: This is old code, used for legacy randomization to make sure we preserve the same behavior for the tests.
// TODO: Deprecate when the new randomizer is in place.
template <typename TVector>
void RandomShuffle(TVector& v, size_t randomSeed)
{
if (v.size() > RAND_MAX * static_cast<size_t>(RAND_MAX))
{
RuntimeError("RandomShuffle: too large set: need to change to different random generator!");
}
srand(static_cast<unsigned int>(randomSeed));
foreach_index(currentLocation, v)
{
// Pick a random location and swap with the current one
const size_t randomLocation = rand(0, v.size());
std::swap(v[currentLocation], v[randomLocation]);
}
}
ChunkRandomizer::ChunkRandomizer(IDataDeserializerPtr deserializer, size_t randomizationRangeInSamples, bool legacy) :
m_deserializer(deserializer), m_legacy(legacy), m_randomizationRangeInSamples(randomizationRangeInSamples)
{
m_originalChunks = m_deserializer->GetChunkDescriptions();
}
// Gets randomized chunks.
const std::vector<RandomizedChunk>& ChunkRandomizer::GetRandomizedChunks() const
{
return m_randomizedChunks;
}
// Randomizes chunks depending on the mode (legacy or not) and calculates randomization windows.
void ChunkRandomizer::Randomize(unsigned int seed)
{
std::vector<size_t> randomizedChunkIndices;
randomizedChunkIndices.reserve(m_originalChunks.size());
for (size_t i = 0; i < m_originalChunks.size(); i++)
{
randomizedChunkIndices.push_back(i);
}
if (m_legacy)
{
RandomShuffle(randomizedChunkIndices, seed);
}
else
{
std::mt19937 m_rng(static_cast<int>(seed));
std::shuffle(randomizedChunkIndices.begin(), randomizedChunkIndices.end(), m_rng);
}
// Place randomized chunks on the timeline
m_randomizedChunks.clear();
m_randomizedChunks.reserve(m_originalChunks.size());
size_t samplePosition = 0;
size_t sequencePosition = 0;
for (size_t chunkIndex = 0; chunkIndex < m_originalChunks.size(); chunkIndex++)
{
const size_t originalChunkIndex = randomizedChunkIndices[chunkIndex];
const size_t numberOfSamples = m_originalChunks[originalChunkIndex]->m_numberOfSamples;
const size_t numberOfSequences = m_originalChunks[originalChunkIndex]->m_numberOfSequences;
RandomizedChunk randomizedChunk;
randomizedChunk.m_chunkId = chunkIndex;
randomizedChunk.m_original = m_originalChunks[originalChunkIndex].get();
randomizedChunk.m_samplePositionStart = samplePosition;
randomizedChunk.m_sequencePositionStart = sequencePosition;
m_randomizedChunks.push_back(randomizedChunk);
samplePosition += numberOfSamples;
sequencePosition += numberOfSequences;
}
// For each chunk, compute the randomization range (w.r.t. the randomized chunk sequence)
size_t halfWindowRange = m_randomizationRangeInSamples / 2;
for (size_t chunkId = 0; chunkId < m_originalChunks.size(); chunkId++)
{
auto& chunk = m_randomizedChunks[chunkId];
// start with the range of left neighbor
if (chunkId == 0)
{
chunk.m_randomizationWindow.m_begin = 0;
chunk.m_randomizationWindow.m_end = 1;
}
else
{
chunk.m_randomizationWindow.m_begin = m_randomizedChunks[chunkId - 1].m_randomizationWindow.m_begin; // might be too early
chunk.m_randomizationWindow.m_end = m_randomizedChunks[chunkId - 1].m_randomizationWindow.m_end; // might have more space
}
// Need to adapt now.
while (chunk.m_samplePositionStart - m_randomizedChunks[chunk.m_randomizationWindow.m_begin].m_samplePositionStart > halfWindowRange)
{
// too early, need to increase
chunk.m_randomizationWindow.m_begin++;
}
chunk.m_randomizationWindow.m_end = std::max(chunk.m_randomizationWindow.m_end, chunk.m_randomizationWindow.m_begin + 1);
while (chunk.m_randomizationWindow.m_end < m_originalChunks.size() &&
m_randomizedChunks[chunk.m_randomizationWindow.m_end].SampleEndPosition() - chunk.m_samplePositionStart < halfWindowRange)
{
// got more space, move window to the right.
chunk.m_randomizationWindow.m_end++;
}
}
}
}}}

View file

@ -0,0 +1,74 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include <vector>
#include "DataDeserializer.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// Represents an interval closed on the left and opened on the right.
struct ClosedOpenInterval
{
size_t m_begin;
size_t m_end;
};
// Information about randomized chunk.
struct RandomizedChunk
{
// Chunk id.
size_t m_chunkId;
// Pointer to the original chunk.
const ChunkDescription* m_original;
// Position of the first sample of the chunk in the input.
size_t m_samplePositionStart;
// Position of the first sequence of the chunk in the input.
size_t m_sequencePositionStart;
// Randomization window for this chunk.
ClosedOpenInterval m_randomizationWindow;
// Position of the last sample of the chunk in the input.
size_t SampleEndPosition() const
{
return m_original->m_numberOfSamples + m_samplePositionStart;
}
// Position of the last sequence of the chunk in the input.
size_t SequenceEndPosition() const
{
return m_original->m_numberOfSequences + m_sequencePositionStart;
}
};
// Randomizes a set of chunks and calculates their possible randomization windows.
// TODO: Currently, we have to preserve the same behavior for randomization in order to make all tests pass.
// TODO: Randomization can be made simpler if we randomize only forwards.
class ChunkRandomizer
{
public:
ChunkRandomizer(IDataDeserializerPtr deserializer, size_t randomizationRangeInSamples, bool legacy = false);
// Gets randomized chunks.
const std::vector<RandomizedChunk>& GetRandomizedChunks() const;
// Randomizes chunks based on the seed.
void Randomize(unsigned int seed);
private:
IDataDeserializerPtr m_deserializer;
// Randomized chunks.
std::vector<RandomizedChunk> m_randomizedChunks;
// Original chunks.
std::vector<ChunkDescriptionPtr> m_originalChunks;
// Whether to use legacy mode for randomization.
bool m_legacy;
// Randomization range in samples.
size_t m_randomizationRangeInSamples;
};
typedef std::shared_ptr<ChunkRandomizer> ChunkRandomizerPtr;
}}}
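A hedged usage sketch (illustration only, not from this commit): the ChunkRandomizer is re-seeded per sweep, and every resulting RandomizedChunk carries a closed-open window of randomized chunk indices from which sequences may be drawn for positions inside that chunk; 'deserializer' below is a hypothetical IDataDeserializerPtr.
// Sketch only.
auto chunkRandomizer = std::make_shared<ChunkRandomizer>(deserializer,
                                                         /*randomizationRangeInSamples=*/ 30720);
chunkRandomizer->Randomize(/*seed=*/ 0);
for (const auto& chunk : chunkRandomizer->GetRandomizedChunks())
{
    // [m_begin, m_end) bounds the randomized chunk indices whose sequences
    // can be swapped into sequence positions belonging to this chunk.
    fprintf(stderr, "chunk %d: window [%d, %d)\n", (int)chunk.m_chunkId,
            (int)chunk.m_randomizationWindow.m_begin, (int)chunk.m_randomizationWindow.m_end);
}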

View file

@ -10,12 +10,11 @@
namespace Microsoft { namespace MSR { namespace CNTK {
// Sequence key, used for correlations between sequences of different deserializers.
// Both strings and integers are supported.
// Sequence key, used to correlate sequences between different deserializers.
struct KeyType
{
std::wstring major;
size_t minor;
size_t m_major;
size_t m_minor;
};
class Chunk;
@ -34,7 +33,8 @@ struct SequenceDescription
bool m_isValid; // Indicates whether the sequence is valid.
KeyType m_key; // Sequence key, used for correlations between sequences of different deserializers.
};
typedef std::vector<const SequenceDescription*> SequenceDescriptions;
typedef std::shared_ptr<SequenceDescription> SequenceDescriptionPtr;
// Defines sequence data and its layout.
// Currently CNTK supports dense and sparse sequences (csc).
@ -45,6 +45,9 @@ struct SequenceDataBase
SequenceDataBase() : m_data(nullptr) { }
virtual ~SequenceDataBase() = default;
// Sequence id.
size_t m_id;
ChunkPtr m_chunk;
// A non-owned pointer. The actual size is provided for particular sequences,
// i.e. see DenseSequenceData, or SparseSequenceData.
@ -95,7 +98,7 @@ public:
// Gets a sequence per input by its identifier.
// The sequence has a reference to the corresponding chunk. The chunk is not
// deallocated till all its sequences are released.
virtual std::vector<SequenceDataPtr> GetSequence(size_t sequenceId) = 0;
virtual void GetSequence(size_t sequenceId, std::vector<SequenceDataPtr>& result) = 0;
virtual ~Chunk() {};
@ -106,6 +109,20 @@ private:
DISABLE_COPY_AND_MOVE(Chunk);
};
// Represents a chunk description.
struct ChunkDescription
{
// Chunk id.
size_t m_id;
// Number of samples in the chunk.
size_t m_numberOfSamples;
// Number of sequences in the chunk.
size_t m_numberOfSequences;
};
typedef std::shared_ptr<ChunkDescription> ChunkDescriptionPtr;
typedef std::vector<ChunkDescriptionPtr> ChunkDescriptions;
//////////////////////////////////////////////////////////////////////////////////////////////////
// Interface all data deserializers should implement.
// Data deserializers are intimately familiar with a particular input formats and responsible for bringing
@ -117,25 +134,26 @@ private:
class IDataDeserializer
{
public:
// Describes streams this data deserializer can produce. Streams correspond to network inputs.
// TODO: Introduce the interface to reduce the size of the sequences available at any point in time (chunks/sequences).
// Gets stream descriptions for all streams this deserializer exposes.
virtual std::vector<StreamDescriptionPtr> GetStreamDescriptions() const = 0;
// Retrieves description of all sequences this data deserializer can produce.
// TODO for huge corpuses, footprint will be too big; need interface to request timeline in chunks
virtual const SequenceDescriptions& GetSequenceDescriptions() const = 0;
// Gets chunk descriptions this deserializer exposes.
virtual ChunkDescriptions GetChunkDescriptions() = 0;
// Retrieves description of a single sequence given its key.
virtual const SequenceDescription* GetSequenceDescriptionByKey(const KeyType& key) = 0;
// Gets sequence descriptions for a given chunk.
virtual void GetSequencesForChunk(size_t chunkId, std::vector<SequenceDescription>& descriptions) = 0;
// Retrieves total number of chunks this deserializer can produce.
virtual size_t GetTotalNumberOfChunks() = 0;
// Gets sequence description by its key.
// Used by deserializers not in driving/primary mode.
// TODO: Possibly move this out into a separate interface.
virtual void GetSequenceDescriptionByKey(const KeyType& key, SequenceDescription& description) = 0;
// Retrieves a chunk with data.
// Gets chunk data given its id.
virtual ChunkPtr GetChunk(size_t chunkId) = 0;
virtual ~IDataDeserializer() {};
};
typedef std::shared_ptr<IDataDeserializer> IDataDeserializerPtr;
} } }
}}}
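To make the reshaped chunk-based contract concrete, here is a hedged sketch (illustration only; all names other than the interface members are invented) of a toy deserializer exposing fixed-size chunks of single-sample sequences; stream setup and actual data delivery are elided.
// Sketch only: a toy deserializer with 'numChunks' chunks of 'sequencesPerChunk'
// one-sample sequences each.
class ToyDeserializer : public DataDeserializerBase
{
public:
    ToyDeserializer(size_t numChunks, size_t sequencesPerChunk)
        : m_numChunks(numChunks), m_sequencesPerChunk(sequencesPerChunk)
    {}

    ChunkDescriptions GetChunkDescriptions() override
    {
        ChunkDescriptions result;
        for (size_t i = 0; i < m_numChunks; ++i)
        {
            auto cd = std::make_shared<ChunkDescription>();
            cd->m_id = i;
            cd->m_numberOfSequences = m_sequencesPerChunk;
            cd->m_numberOfSamples = m_sequencesPerChunk; // one sample per sequence
            result.push_back(cd);
        }
        return result;
    }

    void GetSequencesForChunk(size_t chunkId, std::vector<SequenceDescription>& result) override
    {
        for (size_t i = 0; i < m_sequencesPerChunk; ++i)
        {
            SequenceDescription s;
            s.m_id = chunkId * m_sequencesPerChunk + i;
            s.m_numberOfSamples = 1;
            s.m_chunkId = chunkId;
            s.m_isValid = true;
            s.m_key.m_major = s.m_id;
            s.m_key.m_minor = 0;
            result.push_back(s);
        }
    }

    ChunkPtr GetChunk(size_t /*chunkId*/) override
    {
        // A real deserializer returns a Chunk whose GetSequence() fills SequenceDataPtr entries.
        return nullptr;
    }

private:
    size_t m_numChunks;
    size_t m_sequencesPerChunk;
};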

View file

@ -14,39 +14,25 @@ namespace Microsoft { namespace MSR { namespace CNTK {
class DataDeserializerBase : public IDataDeserializer
{
public:
DataDeserializerBase() : m_sequencesInitialized(false)
DataDeserializerBase()
{}
// Provides description of all sequences the deserializer can produce.
const SequenceDescriptions& GetSequenceDescriptions() const override
{
if (!m_sequencesInitialized)
{
FillSequenceDescriptions(m_sequences);
m_sequencesInitialized = true;
}
return m_sequences;
}
virtual const SequenceDescription* GetSequenceDescriptionByKey(const KeyType&) override
virtual void GetSequenceDescriptionByKey(const KeyType&, SequenceDescription&) override
{
NOT_IMPLEMENTED;
}
protected:
// Fills the timeline with sequence descriptions.
// Inherited classes should provide the complete Sequence descriptions for all input data.
virtual void FillSequenceDescriptions(SequenceDescriptions& timeline) const = 0;
virtual std::vector<StreamDescriptionPtr> GetStreamDescriptions() const override
{
return m_streams;
}
protected:
// Streams this data deserializer can produce.
std::vector<StreamDescriptionPtr> m_streams;
private:
DataDeserializerBase(const DataDeserializerBase&) = delete;
DataDeserializerBase& operator=(const DataDeserializerBase&) = delete;
mutable SequenceDescriptions m_sequences;
mutable bool m_sequencesInitialized;
DISABLE_COPY_AND_MOVE(DataDeserializerBase);
};
}}}

View file

@ -14,41 +14,132 @@ namespace Microsoft { namespace MSR { namespace CNTK {
NoRandomizer::NoRandomizer(IDataDeserializerPtr deserializer)
: m_deserializer(deserializer),
m_samplePositionInEpoch(0),
m_sequencePosition(0)
m_currentChunkPosition(SIZE_MAX),
m_globalSamplePosition(0),
m_totalNumberOfSamples(0),
m_currentSequencePositionInChunk(0)
{
assert(deserializer != nullptr);
m_streams = m_deserializer->GetStreamDescriptions();
m_chunkDescriptions = m_deserializer->GetChunkDescriptions();
m_timeline = m_deserializer->GetSequenceDescriptions();
for (const auto& sequence : m_timeline)
size_t sampleCount = 0;
for (const auto& chunk : m_chunkDescriptions)
{
if (sequence->m_numberOfSamples != 1)
{
RuntimeError("Currently, no randomizer supports only frame mode. Received a sequence with %d number of samples.",
static_cast<int>(sequence->m_numberOfSamples));
}
// Check that position corresponds to chunk id.
assert(m_chunkSampleOffset.size() == chunk->m_id);
m_chunkSampleOffset.push_back(sampleCount);
sampleCount += chunk->m_numberOfSamples;
}
m_streams = m_deserializer->GetStreamDescriptions();
m_totalNumberOfSamples = sampleCount;
}
void NoRandomizer::Initialize(TransformerPtr, const ConfigParameters&)
{
}
size_t NoRandomizer::GetChunkIndexOf(size_t samplePosition)
{
auto result = std::upper_bound(m_chunkSampleOffset.begin(), m_chunkSampleOffset.end(), samplePosition);
return result - 1 - m_chunkSampleOffset.begin();
}
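// Added illustration (not part of the original change): with three chunks of
// 100, 150 and 200 samples, m_chunkSampleOffset is {0, 100, 250}. For a sweep
// sample position of 120 the lookup above behaves like:
//     std::vector<size_t> offsets = { 0, 100, 250 };
//     auto it = std::upper_bound(offsets.begin(), offsets.end(), (size_t)120); // points at 250
//     size_t chunkIndex = it - 1 - offsets.begin();                            // == 1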
void NoRandomizer::StartEpoch(const EpochConfiguration& config)
{
m_config = config;
if (m_config.m_totalEpochSizeInSamples == requestDataSize)
{
m_config.m_totalEpochSizeInSamples = m_timeline.size();
m_config.m_totalEpochSizeInSamples = m_totalNumberOfSamples;
}
m_samplePositionInEpoch = 0;
size_t globalSamplePosition = m_config.m_totalEpochSizeInSamples * config.m_epochIndex;
m_sequencePosition = globalSamplePosition % m_timeline.size();
m_globalSamplePosition = m_config.m_totalEpochSizeInSamples * config.m_epochIndex;
size_t sweepSamplePosition = m_globalSamplePosition % m_totalNumberOfSamples;
size_t chunkIndex = GetChunkIndexOf(sweepSamplePosition);
if (chunkIndex != m_currentChunkPosition)
{
// unloading everything.
m_currentChunkId = SIZE_MAX;
m_currentChunk = nullptr;
// Need to load descriptions for the new current chunk.
m_currentChunkPosition = chunkIndex;
m_currentSequencePositionInChunk = 0;
m_sequenceWindow.clear();
m_deserializer->GetSequencesForChunk(m_currentChunkPosition, m_sequenceWindow);
}
// Moving current sequence inside the chunk to match the sample offset.
size_t sampleOffsetInsideChunk = sweepSamplePosition - m_chunkSampleOffset[m_currentChunkPosition];
size_t numberOfSamples = 0;
size_t sequenceId = 0;
// Currently linear, happens only at the border of epochs.
for (size_t i = 0; i < m_sequenceWindow.size(); ++i)
{
size_t sequenceSize = m_sequenceWindow[i].m_numberOfSamples;
if (sequenceSize + numberOfSamples > sampleOffsetInsideChunk)
{
// We have found our sequence.
break;
}
numberOfSamples += sequenceSize;
sequenceId++;
}
m_currentSequencePositionInChunk = sequenceId;
assert(m_chunkDescriptions[m_currentChunkPosition]->m_numberOfSequences > m_currentSequencePositionInChunk);
};
// Moving the cursor to the next sequence. Possibly updating the chunk information if needed.
void NoRandomizer::MoveToNextSequence()
{
SequenceDescription& sequence = m_sequenceWindow[m_currentSequencePositionInChunk];
m_samplePositionInEpoch += sequence.m_numberOfSamples;
m_globalSamplePosition += sequence.m_numberOfSamples;
if (m_currentSequencePositionInChunk + 1 >= m_chunkDescriptions[m_currentChunkPosition]->m_numberOfSequences)
{
// Moving to the next chunk.
m_currentChunkPosition = (m_currentChunkPosition + 1) % m_chunkDescriptions.size();
m_currentSequencePositionInChunk = 0;
m_sequenceWindow.clear();
m_deserializer->GetSequencesForChunk(m_currentChunkPosition, m_sequenceWindow);
}
else
{
m_currentSequencePositionInChunk++;
}
}
// Gets next sequence descriptions with total size less than sampleCount.
std::vector<SequenceDescription> NoRandomizer::GetNextSequenceDescriptions(size_t sampleCount)
{
assert(m_sequenceWindow.size() != 0);
assert(m_chunkDescriptions[m_currentChunkPosition]->m_numberOfSequences > m_currentSequencePositionInChunk);
int samples = (int)sampleCount;
std::vector<SequenceDescription> result;
do
{
const SequenceDescription& sequence = m_sequenceWindow[m_currentSequencePositionInChunk];
result.push_back(sequence);
samples -= (int)sequence.m_numberOfSamples;
MoveToNextSequence();
}
// Check whether the next sequence fits into the sample count, if not, exit.
while (samples - (int)m_sequenceWindow[m_currentSequencePositionInChunk].m_numberOfSamples >= 0);
return result;
}
Sequences NoRandomizer::GetNextSequences(size_t sampleCount)
{
Sequences result;
@ -58,68 +149,41 @@ Sequences NoRandomizer::GetNextSequences(size_t sampleCount)
return result;
}
size_t maxSampleCount = std::min(sampleCount, m_config.m_totalEpochSizeInSamples - m_samplePositionInEpoch);
size_t start = maxSampleCount * m_config.m_workerRank / m_config.m_numberOfWorkers;
size_t end = maxSampleCount * (m_config.m_workerRank + 1) / m_config.m_numberOfWorkers;
// Check that we do not go over the sweep.
// TODO: This preserves the old behavior. Could be done differently in the future.
size_t sweepPosition = m_globalSamplePosition % m_totalNumberOfSamples;
sampleCount = std::min(sampleCount, m_totalNumberOfSamples - sweepPosition);
assert(sampleCount != 0);
std::vector<SequenceDescription> descriptions = GetNextSequenceDescriptions(sampleCount);
// Retrieve only sequences that are required by this worker.
size_t start = descriptions.size() * m_config.m_workerRank / m_config.m_numberOfWorkers;
size_t end = descriptions.size() * (m_config.m_workerRank + 1) / m_config.m_numberOfWorkers;
size_t subsetSize = end - start;
std::vector<size_t> chunkIds;
SequenceDescriptions sequences;
sequences.reserve(subsetSize);
size_t previousChunk = SIZE_MAX;
for (size_t i = start; i < end; ++i)
{
const auto& sequence = m_timeline[(m_sequencePosition + i) % m_timeline.size()];
assert(sequence->m_numberOfSamples == 1);
sequences.push_back(sequence);
if (previousChunk != sequence->m_chunkId)
{
chunkIds.push_back(sequence->m_chunkId);
previousChunk = sequence->m_chunkId;
}
}
m_samplePositionInEpoch += maxSampleCount;
m_sequencePosition = (m_sequencePosition + maxSampleCount) % m_timeline.size();
if (sequences.size() == 0)
if (subsetSize == 0)
{
return result;
}
// TODO: Currently we preserve chunks not for the complete window, only for minibatch
// Should be changed
std::map<size_t, ChunkPtr> chunks;
for (size_t id : chunkIds)
result.m_data.resize(m_streams.size(), std::vector<SequenceDataPtr>(subsetSize));
for (int i = 0; i < subsetSize; ++i)
{
auto chunk = m_chunks.find(id);
if (chunk == m_chunks.end())
std::vector<SequenceDataPtr> sequence;
const auto& sequenceDescription = descriptions[start + i];
if (sequenceDescription.m_chunkId != m_currentChunkId)
{
chunks[id] = m_deserializer->GetChunk(id);
m_currentChunk = m_deserializer->GetChunk(sequenceDescription.m_chunkId);
m_currentChunkId = sequenceDescription.m_chunkId;
}
else
{
chunks[id] = chunk->second;
}
}
m_chunks.swap(chunks);
// TODO: Not clear whether batching will make sense for this.
// We have to re-assemble the exposed result from sequences from different chunks.
result.m_data.resize(m_streams.size(), std::vector<SequenceDataPtr>(sequences.size()));
#pragma omp parallel for ordered schedule(dynamic)
for (int i = 0; i < sequences.size(); ++i)
{
auto sequence = m_chunks[sequences[i]->m_chunkId]->GetSequence(sequences[i]->m_id);
m_currentChunk->GetSequence(sequenceDescription.m_id, sequence);
for (int j = 0; j < m_streams.size(); ++j)
{
result.m_data[j][i] = sequence[j];
}
}
return result;
}
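A worked example of the worker decimation above (added for illustration): the start/end split distributes the returned sequence descriptions across MPI workers, e.g. 10 descriptions over 3 workers yield the index ranges [0,3), [3,6) and [6,10).
// Sketch of the subset computation for descriptions.size() == 10 and 3 workers.
size_t total = 10, workers = 3;
for (size_t rank = 0; rank < workers; ++rank)
{
    size_t start = total * rank / workers;       // 0, 3, 6
    size_t end   = total * (rank + 1) / workers; // 3, 6, 10
    // Worker 'rank' is responsible for descriptions[start, end).
}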

View file

@ -9,14 +9,14 @@
#include <map>
#include "Transformer.h"
#include "DataDeserializer.h"
#include "SequenceRandomizer.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// The class represents a randomizer that does not randomize input (identity function over the original timeline).
// This class is used for inference and for training where the training data has already been pre-randomized.
// Used for training where the training data has already been pre-randomized.
// TODO: Currently this code is moved from the old block randomizer.
// TODO: The class will be further refactored and a common base will be extracted with BlockRandomizer.
// TODO: Currently works only for frame mode (numberOfSample in sequence == 1) and without chunking
// TODO: This layering will be changed when we move transformers under the randomizer; it won't be a transformer anymore.
class NoRandomizer : public Transformer
{
@ -32,21 +32,54 @@ public:
}
private:
// Deserializer and information on the original timeline
IDataDeserializerPtr m_deserializer;
// Gets next sequence descriptions with total size less than sampleCount.
std::vector<SequenceDescription> GetNextSequenceDescriptions(size_t sampleCount);
// Initial timeline.
SequenceDescriptions m_timeline;
// Get chunk index for the sample offset from the beginning of the sweep.
size_t GetChunkIndexOf(size_t samplePosition);
// Moves the cursor to the sequence possibly updating the chunk.
void MoveToNextSequence();
IDataDeserializerPtr m_deserializer;
// Stream descriptions
std::vector<StreamDescriptionPtr> m_streams;
// Epoch configuration
EpochConfiguration m_config;
size_t m_samplePositionInEpoch;
size_t m_sequencePosition;
std::map<size_t, ChunkPtr> m_chunks;
// Chunk descriptions.
ChunkDescriptions m_chunkDescriptions;
// m_chunkDescriptions defines the complete sweep of samples: [0 .. N]
// m_chunkSampleOffset for each chunk contains the sample offset in the sweep where the chunk begins.
std::vector<size_t> m_chunkSampleOffset;
// Current chunk data.
ChunkPtr m_currentChunk;
// Current chunk data id.
size_t m_currentChunkId;
// Current window of sequence descriptions.
std::vector<SequenceDescription> m_sequenceWindow;
// Current sequence position the randomizer works with.
size_t m_currentSequencePositionInChunk;
// Current chunk position that the randomizer works with.
// An index inside the m_chunkDescriptions.
size_t m_currentChunkPosition;
// Global sample position on the timeline.
// TODO: possibly recalculate it based on samplePositionInEpoch.
size_t m_globalSamplePosition;
// Current sample position in the epoch.
size_t m_samplePositionInEpoch;
// Total number of samples in the sweep.
size_t m_totalNumberOfSamples;
};
}}}
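A hedged usage sketch (not part of this change) of the reworked NoRandomizer; the deserializer variable and the epoch parameters are placeholders.
// Sketch only.
auto randomizer = std::make_shared<NoRandomizer>(deserializer);

EpochConfiguration config;
config.m_epochIndex = 0;
config.m_totalEpochSizeInSamples = requestDataSize; // i.e. one full sweep
config.m_workerRank = 0;
config.m_numberOfWorkers = 1;
randomizer->StartEpoch(config);

// Each call returns up to the requested number of samples for this worker.
Sequences minibatch = randomizer->GetNextSequences(256);
// minibatch.m_data[stream][sequence] holds the SequenceDataPtr for every exposed stream.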

View file

@ -78,9 +78,12 @@
</ItemDefinitionGroup>
<ItemGroup>
<ClInclude Include="Bundler.h" />
<ClInclude Include="ChunkRandomizer.h" />
<ClInclude Include="DataDeserializerBase.h" />
<ClInclude Include="TransformerBase.h" />
<ClInclude Include="BlockRandomizer.h" />
<ClInclude Include="SequenceRandomizer.h" />
<ClInclude Include="StringToIdMap.h" />
<ClInclude Include="TransformerBase.h" />
<ClInclude Include="NoRandomizer.h" />
<ClInclude Include="CudaMemoryProvider.h" />
<ClInclude Include="DataDeserializer.h" />
@ -94,12 +97,14 @@
</ItemGroup>
<ItemGroup>
<ClCompile Include="Bundler.cpp" />
<ClCompile Include="BlockRandomizer.cpp" />
<ClCompile Include="ChunkRandomizer.cpp" />
<ClCompile Include="NoRandomizer.cpp" />
<ClCompile Include="BlockRandomizer.cpp" />
<ClCompile Include="SampleModePacker.cpp" />
<ClCompile Include="ReaderShim.cpp" />
<ClCompile Include="SequenceRandomizer.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>
</Project>

View file

@ -1,9 +1,6 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<ClInclude Include="BlockRandomizer.h">
<Filter>Randomizers</Filter>
</ClInclude>
<ClInclude Include="NoRandomizer.h">
<Filter>Randomizers</Filter>
</ClInclude>
@ -43,11 +40,20 @@
<ClInclude Include="Bundler.h">
<Filter>Deserializers</Filter>
</ClInclude>
<ClInclude Include="ChunkRandomizer.h">
<Filter>Randomizers</Filter>
</ClInclude>
<ClInclude Include="SequenceRandomizer.h">
<Filter>Randomizers</Filter>
</ClInclude>
<ClInclude Include="BlockRandomizer.h">
<Filter>Randomizers</Filter>
</ClInclude>
<ClInclude Include="StringToIdMap.h">
<Filter>Utils</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="BlockRandomizer.cpp">
<Filter>Randomizers</Filter>
</ClCompile>
<ClCompile Include="NoRandomizer.cpp">
<Filter>Randomizers</Filter>
</ClCompile>
@ -60,6 +66,15 @@
<ClCompile Include="Bundler.cpp">
<Filter>Deserializers</Filter>
</ClCompile>
<ClCompile Include="ChunkRandomizer.cpp">
<Filter>Randomizers</Filter>
</ClCompile>
<ClCompile Include="SequenceRandomizer.cpp">
<Filter>Randomizers</Filter>
</ClCompile>
<ClCompile Include="BlockRandomizer.cpp">
<Filter>Randomizers</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<Filter Include="Interfaces">
@ -84,4 +99,4 @@
<UniqueIdentifier>{90d4b51b-73ae-47f5-9a9e-97ef287dcead}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>
</Project>

View file

@ -104,16 +104,15 @@ void SampleModePacker::CopySequenceToBuffer(SequenceDataPtr sample, size_t strea
if (stream->m_storageType == StorageType::dense)
{
auto data = reinterpret_cast<DenseSequenceData&>(*sample);
// Expect single sample.
assert(data.m_numberOfSamples == 1);
assert(reinterpret_cast<DenseSequenceData&>(*sample).m_numberOfSamples == 1);
// Copying the sequence to its position in the buffer. Effectively the buffer contains a concatenation of samples for a stream.
std::copy(sampleData, sampleData + sampleSize, buffer + sampleIndex * sampleSize);
}
else if (stream->m_storageType == StorageType::sparse_csc)
{
auto data = reinterpret_cast<SparseSequenceData&>(*sample);
const auto& data = reinterpret_cast<SparseSequenceData&>(*sample);
// Expect single sample.
assert(data.m_indices.size() == 1);

View file

@ -0,0 +1,359 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#define _CRT_SECURE_NO_WARNINGS
#include "SequenceRandomizer.h"
#include <algorithm>
#include <utility>
#include <deque>
#include "DataReader.h"
#include <random>
#include <set>
namespace Microsoft { namespace MSR { namespace CNTK {
// NOTE: This is old code, used for legacy randomization to make sure we preserve the same behavior for the tests.
// TODO: Deprecate when the new randomizer is in place.
static inline size_t rand(const size_t begin, const size_t end)
{
// still only covers 32-bit range
const size_t randomNumber = ::rand() * RAND_MAX + ::rand();
return begin + randomNumber % (end - begin);
}
SequenceRandomizer::SequenceRandomizer(
IDataDeserializerPtr deserializer,
ChunkRandomizerPtr chunkRandomizer)
: m_randomizedChunks(chunkRandomizer->GetRandomizedChunks()),
m_currentRangeBeginChunkIndex(0),
m_currentRangeEndChunkIndex(0),
m_nextSamplePositionNotYetRandomized(0),
m_nextSequencePositionNotYetRandomized(0),
m_currentSequencePosition(0),
m_currentChunkPosition(0),
m_currentSamplePosition(0),
m_deserializer(deserializer)
{
size_t max = 0;
for (const auto& c : m_randomizedChunks)
{
if (max < c.m_original->m_numberOfSequences)
{
max = c.m_original->m_numberOfSequences;
}
}
m_bufferOriginalSequences.reserve(max);
}
// Gets next randomized sequence descriptions not exceeding the count.
std::vector<RandomizedSequenceDescription> SequenceRandomizer::GetNextSequenceDescriptions(size_t sampleCount)
{
RandomizeNextSequenceDescriptions(sampleCount);
int samples = (int)sampleCount;
std::vector<RandomizedSequenceDescription> result;
result.reserve(sampleCount);
assert(IsChunkInWindow(m_currentChunkPosition));
size_t sequenceOffsetInsideChunk = m_currentSequencePosition - m_randomizedChunks[m_currentChunkPosition].m_sequencePositionStart;
RandomizedSequenceDescription* sequence = &m_sequenceWindow[m_currentChunkPosition - m_currentRangeBeginChunkIndex][sequenceOffsetInsideChunk];
result.push_back(*sequence);
samples -= (int)sequence->m_numberOfSamples;
m_currentSequencePosition++;
m_currentSamplePosition += sequence->m_numberOfSamples;
if (sequenceOffsetInsideChunk + 1 >= m_randomizedChunks[m_currentChunkPosition].m_original->m_numberOfSequences)
{
// Moving to the next chunk.
m_currentChunkPosition++;
}
while (samples > 0 && m_currentChunkPosition < m_randomizedChunks.size())
{
sequenceOffsetInsideChunk = m_currentSequencePosition - m_randomizedChunks[m_currentChunkPosition].m_sequencePositionStart;
sequence = &m_sequenceWindow[m_currentChunkPosition - m_currentRangeBeginChunkIndex][sequenceOffsetInsideChunk];
if (samples - sequence->m_numberOfSamples >= 0)
{
result.push_back(*sequence);
m_currentSequencePosition++;
samples -= (int)sequence->m_numberOfSamples;
m_currentSamplePosition += sequence->m_numberOfSamples;
if (sequenceOffsetInsideChunk + 1 >= m_randomizedChunks[m_currentChunkPosition].m_original->m_numberOfSequences)
{
// Moving to the next chunk.
m_currentChunkPosition++;
}
}
else
{
break;
}
}
return result;
}
void SequenceRandomizer::RandomizeNextSequenceDescriptions(size_t sampleCount)
{
assert(m_currentSamplePosition <= m_nextSamplePositionNotYetRandomized);
if (m_nextSamplePositionNotYetRandomized == m_randomizedChunks.back().SampleEndPosition())
{
return;
}
if (m_currentSamplePosition + sampleCount < m_nextSamplePositionNotYetRandomized)
{
return;
}
if (m_nextSequencePositionNotYetRandomized == m_randomizedChunks.back().SequenceEndPosition())
{
assert(false);
return;
}
assert(m_nextSamplePositionNotYetRandomized >= m_randomizedChunks[0].m_samplePositionStart);
size_t firstSamplePositionToRandomize = m_nextSamplePositionNotYetRandomized;
size_t firstSequencePositionToRandomize = m_nextSequencePositionNotYetRandomized;
// Find the smallest chunk index whose window begin exceeds the chunk index
// of the sample position we have to randomize (current + sampleCount).
// We will randomize up to this chunk, as the final position of the window end is guaranteed to have been determined
// once all sequences up to that chunk have been randomized.
size_t lastSamplePositionChunkIdx = GetChunkIndexOf(m_currentSamplePosition + sampleCount - 1);
size_t endChunkIdxToRandomize = lastSamplePositionChunkIdx;
while (endChunkIdxToRandomize < m_randomizedChunks.size() &&
m_randomizedChunks[endChunkIdxToRandomize].m_randomizationWindow.m_begin <= lastSamplePositionChunkIdx)
{
endChunkIdxToRandomize++;
}
size_t endFramePosToRandomize = m_randomizedChunks[endChunkIdxToRandomize - 1].SampleEndPosition();
size_t endSequencePosToRandomize = m_randomizedChunks[endChunkIdxToRandomize - 1].SequenceEndPosition();
// Determine the range of chunks that need to be in m_sequenceWindows for us
// to perform the necessary randomization
size_t startChunkIdx = std::min(GetChunkIndexOf(m_currentSamplePosition), m_randomizedChunks[GetChunkIndexOf(firstSamplePositionToRandomize)].m_randomizationWindow.m_begin);
size_t endChunkIdx = m_randomizedChunks[GetChunkIndexOf(endFramePosToRandomize - 1)].m_randomizationWindow.m_end;
// Let's drop everything that is outside the new range [startChunkIdx, endChunkIdx)
for (size_t i = m_currentRangeBeginChunkIndex; i < startChunkIdx; ++i)
{
m_sequenceWindow.pop_front();
m_chunkWindow.pop_front();
m_currentRangeBeginChunkIndex++;
}
// Let's page in everything from m_currentRangeEndChunkIndex to endChunkIdx
for (size_t i = m_currentRangeEndChunkIndex; i < endChunkIdx; ++i)
{
AddRandomizedSequencesForChunk(i);
}
for (size_t t = firstSequencePositionToRandomize; t < endSequencePosToRandomize; ++t)
{
// Get valid randomization range, expressed in chunks
const size_t currentChunkIdx = GetChunkIndexForSequencePosition(t);
size_t chunkWindowBegin = m_randomizedChunks[currentChunkIdx].m_randomizationWindow.m_begin;
size_t chunkWindowEnd = m_randomizedChunks[currentChunkIdx].m_randomizationWindow.m_end;
// Get valid randomization range, expressed in sequence positions.
size_t posBegin = m_randomizedChunks[chunkWindowBegin].m_sequencePositionStart;
size_t posEnd = m_randomizedChunks[chunkWindowEnd - 1].SequenceEndPosition();
for (;;)
{
// Pick a sequence position from [posBegin, posEnd)
const size_t j = rand(posBegin, posEnd);
// Try again if the sequence currently at j cannot be placed at position t.
if (!IsValidForPosition(t, GetRandomizedSequenceDescriptionBySequenceId(j)))
continue;
// Try again if the sequence currently at t cannot be placed at position j.
if (!IsValidForPosition(j, GetRandomizedSequenceDescriptionBySequenceId(t)))
continue;
// Swap and break out.
std::swap(GetRandomizedSequenceDescriptionBySequenceId(t), GetRandomizedSequenceDescriptionBySequenceId(j)); // TODO old swap was perhaps more efficient
break;
}
}
// Verify that we got it right
for (size_t t = firstSequencePositionToRandomize; t < endSequencePosToRandomize; ++t)
{
// TODO assert only
if (!IsValidForPosition(t, GetRandomizedSequenceDescriptionBySequenceId(t)))
{
LogicError("SequenceRandomizer::RandomizeNextSequenceDescriptions: randomization logic mangled!");
}
}
m_nextSamplePositionNotYetRandomized = endFramePosToRandomize;
m_nextSequencePositionNotYetRandomized = endSequencePosToRandomize;
}
// Resets the current sweep according to the randomization seed provided.
void SequenceRandomizer::Reset(size_t randSeed)
{
srand((unsigned int)randSeed);
size_t sweepts = m_randomizedChunks[0].m_samplePositionStart;
m_sequenceWindow.clear();
m_chunkWindow.clear();
m_currentRangeBeginChunkIndex = m_randomizedChunks[0].m_randomizationWindow.m_begin;
m_currentRangeEndChunkIndex = m_currentRangeBeginChunkIndex;
m_nextSamplePositionNotYetRandomized = sweepts;
m_nextSequencePositionNotYetRandomized = 0;
m_currentSequencePosition = 0;
m_currentChunkPosition = 0;
m_currentSamplePosition = 0;
}
// Sets current sequence position to the sample offset.
// If offset is in the middle of the sequence, the next sequence is picked up.
void SequenceRandomizer::SetSequencePositionTo(size_t offset, size_t sweep)
{
size_t chunkIdx = GetChunkIndexOf(offset);
if (!this->IsChunkInWindow(chunkIdx))
{
Reset(sweep + 1);
size_t count = offset;
if (count == 0)
{
count++;
}
RandomizeNextSequenceDescriptions(count);
}
assert(chunkIdx >= m_currentRangeBeginChunkIndex);
assert(chunkIdx < m_currentRangeEndChunkIndex);
size_t sampleOffsetInsideChunk = offset - m_randomizedChunks[chunkIdx].m_samplePositionStart;
auto& sequences = m_sequenceWindow[chunkIdx - m_currentRangeBeginChunkIndex];
size_t numberOfSamples = 0;
size_t sequenceId = 0;
for (size_t i = 0; i < sequences.size(); ++i)
{
size_t sequenceSize = sequences[i].m_numberOfSamples;
if (sequenceSize + numberOfSamples > sampleOffsetInsideChunk)
{
break;
}
numberOfSamples += sequenceSize;
sequenceId++;
}
m_currentSequencePosition = sequenceId + m_randomizedChunks[chunkIdx].m_sequencePositionStart;
}
// Checks if the randomized sequence is valid for a target position using its chunk randomization window.
bool SequenceRandomizer::IsValidForPosition(size_t targetPosition, const RandomizedSequenceDescription& seqDesc) const
{
const auto& chunk = m_randomizedChunks[GetChunkIndexForSequencePosition(targetPosition)];
return chunk.m_randomizationWindow.m_begin <= seqDesc.m_chunk->m_chunkId && seqDesc.m_chunk->m_chunkId < chunk.m_randomizationWindow.m_end;
}
// Gets chunk index using a sequence position in the sweep.
// TODO: upper bound should be used instead.
size_t SequenceRandomizer::GetChunkIndexForSequencePosition(size_t sequencePosition) const
{
struct PositionConverter
{
size_t m_position;
PositionConverter(const RandomizedChunk & chunk) : m_position(chunk.m_sequencePositionStart) {};
PositionConverter(size_t sequencePosition) : m_position(sequencePosition) {};
};
auto result = std::lower_bound(m_randomizedChunks.begin(), m_randomizedChunks.end(), sequencePosition,
[](const PositionConverter& a, const PositionConverter& b)
{
return a.m_position <= b.m_position;
});
return result - 1 - m_randomizedChunks.begin();
}
// Gets chunk index using a sample position in the sweep.
// TODO: upper bound should be used instead.
size_t SequenceRandomizer::GetChunkIndexOf(size_t sampleOffsetInSweep)
{
size_t low = 0; // TODO: m_currentRangeBeginChunkIdx; can this be done more efficiently?
size_t high = m_randomizedChunks.size() - 1;
while (high > low)
{
size_t mid = (high + low) / 2;
if (sampleOffsetInSweep >= m_randomizedChunks[mid].SampleEndPosition())
{
low = mid + 1;
}
else if (sampleOffsetInSweep < m_randomizedChunks[mid].m_samplePositionStart)
{
assert(mid > 0);
high = mid - 1;
}
else
{
return mid;
}
}
assert((high == low) && ((sampleOffsetInSweep >= m_randomizedChunks[low].m_samplePositionStart) && (sampleOffsetInSweep < m_randomizedChunks[low].SampleEndPosition())));
return low;
}
// Checks if chunk index is in the current window.
bool SequenceRandomizer::IsChunkInWindow(size_t chunkIdx) const
{
return chunkIdx >= m_currentRangeBeginChunkIndex && chunkIdx < m_currentRangeEndChunkIndex;
}
// Adds randomized sequences for the chunk with a given index.
void SequenceRandomizer::AddRandomizedSequencesForChunk(size_t chunkIdx)
{
assert(chunkIdx == m_currentRangeEndChunkIndex);
const RandomizedChunk& chunk = m_randomizedChunks[chunkIdx];
std::vector<RandomizedSequenceDescription> chunkSequences;
m_bufferOriginalSequences.clear();
m_deserializer->GetSequencesForChunk(chunk.m_original->m_id, m_bufferOriginalSequences);
chunkSequences.reserve(m_bufferOriginalSequences.size());
for (size_t k = 0; k < m_bufferOriginalSequences.size(); k++)
{
RandomizedSequenceDescription s;
s.m_id = m_bufferOriginalSequences[k].m_id;
s.m_numberOfSamples = m_bufferOriginalSequences[k].m_numberOfSamples;
s.m_chunk = &chunk;
chunkSequences.push_back(s);
}
m_sequenceWindow.push_back(std::move(chunkSequences));
m_chunkWindow.push_back(chunk);
m_currentRangeEndChunkIndex++;
}
// Gets randomized sequence by the sequence id.
RandomizedSequenceDescription& SequenceRandomizer::GetRandomizedSequenceDescriptionBySequenceId(size_t sequenceId)
{
size_t globalChunkIdx = GetChunkIndexForSequencePosition(sequenceId);
size_t sequenceOffsetInsideChunk = sequenceId - m_randomizedChunks[globalChunkIdx].m_sequencePositionStart;
return m_sequenceWindow[globalChunkIdx - m_currentRangeBeginChunkIndex][sequenceOffsetInsideChunk];
}
}}}

View file

@ -0,0 +1,115 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include <vector>
#include "Transformer.h"
#include "DataDeserializer.h"
#include "ChunkRandomizer.h"
#include <deque>
namespace Microsoft { namespace MSR { namespace CNTK {
// Randomized sequence description.
struct RandomizedSequenceDescription
{
// Sequence id.
size_t m_id;
// Number of samples in sequence.
size_t m_numberOfSamples;
// Randomized chunk this sequence belongs to.
const RandomizedChunk* m_chunk;
};
// Class that, given randomized chunks, randomizes sequence descriptions in a window of chunks.
// TODO: This code is still based on the old behavior, so that all current tests pass.
// TODO: Can be simplified if we only randomized sequences forward.
class SequenceRandomizer
{
public:
SequenceRandomizer(
IDataDeserializerPtr deserializer,
ChunkRandomizerPtr chunkRandomizer);
// Resets current sequence sweep according to the seed.
void Reset(size_t seed);
// Sets current sequence position to the sample offset.
// If offset is in the middle of the sequence, the next sequence is picked up.
void SetSequencePositionTo(size_t sweepSampleOffset, size_t sweep);
// Gets next sequence descriptions.
std::vector<RandomizedSequenceDescription> GetNextSequenceDescriptions(size_t sampleCount);
// Gets current randomized chunk window.
const std::deque<RandomizedChunk>& GetChunkWindow() const
{
return m_chunkWindow;
}
private:
DISABLE_COPY_AND_MOVE(SequenceRandomizer);
// Randomizes next sequence descriptions not exceeding sample count.
void RandomizeNextSequenceDescriptions(size_t sampleCount);
// Validates if sequence description is valid for the current position.
bool IsValidForPosition(size_t targetPosition, const RandomizedSequenceDescription& seqDesc) const;
// Gets randomized chunk index by the sequence position inside the sweep.
size_t GetChunkIndexForSequencePosition(size_t sequencePosition) const;
// Gets randomized sequence description by the sequence position in the sweep.
RandomizedSequenceDescription& GetRandomizedSequenceDescriptionBySequenceId(size_t sequenceId);
// Gets chunk index given a sample offset in the sweep.
size_t GetChunkIndexOf(size_t sampleOffsetInSweep);
// Checks if chunk index is in the randomized window.
bool IsChunkInWindow(size_t chunkIndex) const;
// Adds randomized sequences to the window.
void AddRandomizedSequencesForChunk(size_t chunkIndex);
private:
// Randomized chunks.
const std::vector<RandomizedChunk>& m_randomizedChunks;
// A rolling window of randomized chunks.
// Which chunk to load is decided by the BlockRandomizer (i.e. decimation based on chunk).
std::deque<RandomizedChunk> m_chunkWindow;
// A rolling window of randomized sequences for the chunks.
std::deque<std::vector<RandomizedSequenceDescription>> m_sequenceWindow;
// Index (< m_randomizedChunks.size()) of the first chunk in the window (m_chunkWindow).
size_t m_currentRangeBeginChunkIndex;
// Index (< m_randomizedChunks.size()) of the last chunk in the window (m_chunkWindow).
size_t m_currentRangeEndChunkIndex;
// Next sample position not yet randomized.
size_t m_nextSamplePositionNotYetRandomized;
// Next sequence position not yet randomized.
size_t m_nextSequencePositionNotYetRandomized;
IDataDeserializerPtr m_deserializer;
// Current sequence position.
size_t m_currentSequencePosition;
// Current chunk position.
size_t m_currentChunkPosition;
// Current sample position.
size_t m_currentSamplePosition;
// Used only as a buffer to get sequence descriptions without memory reallocation.
std::vector<SequenceDescription> m_bufferOriginalSequences;
};
typedef std::shared_ptr<SequenceRandomizer> SequenceRandomizerPtr;
}}}
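A hedged sketch (illustration only) of how a block randomizer might drive the SequenceRandomizer for one sweep; 'deserializer' and 'chunkRandomizer' are placeholders, and the chunk randomizer is assumed to have been Randomize()-d for this sweep already.
// Sketch only.
size_t sweep = 0;
auto sequenceRandomizer = std::make_shared<SequenceRandomizer>(deserializer, chunkRandomizer);
sequenceRandomizer->Reset(sweep + 1);
sequenceRandomizer->SetSequencePositionTo(/*sweepSampleOffset=*/ 0, sweep);

// Draw randomized sequence descriptions in minibatch-sized bites.
std::vector<RandomizedSequenceDescription> descriptions =
    sequenceRandomizer->GetNextSequenceDescriptions(/*sampleCount=*/ 256);
for (const auto& s : descriptions)
{
    // s.m_chunk->m_original->m_id identifies the deserializer chunk to load,
    // s.m_id the sequence inside it.
}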

View file

@ -0,0 +1,68 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include <string>
#include <memory>
#include <vector>
#include <map>
#include "Basics.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// This class represents a string registry pattern to share strings between different deserializers if needed.
// It associates a unique key with a given string.
// Currently it is implemented in-memory, but can be unloaded to external disk if needed.
// TODO: Move this class to Basics.h when it is required by more than one reader.
template<class TString>
class TStringToIdMap
{
public:
TStringToIdMap()
{}
// Adds string value to the registry.
size_t AddValue(const TString& value)
{
assert(!Contains(value));
auto iter = m_values.insert(std::make_pair(value, m_indexedValues.size()));
m_indexedValues.push_back(&((iter.first)->first));
return m_indexedValues.size() - 1;
}
// Get integer id for the string value.
size_t operator[](const TString& value) const
{
const auto& it = m_values.find(value);
assert(it != m_values.end());
return it->second;
}
// Get string value by its integer id.
const TString& operator[](size_t id) const
{
assert(id < m_indexedValues.size());
return *m_indexedValues[id];
}
// Checks whether the value exists.
bool Contains(const TString& value) const
{
return m_values.find(value) != m_values.end();
}
private:
// TODO: Move NonCopyable as a separate class to Basics.h
DISABLE_COPY_AND_MOVE(TStringToIdMap);
std::map<TString, size_t> m_values;
std::vector<const TString*> m_indexedValues;
};
typedef TStringToIdMap<std::wstring> WStringToIdMap;
typedef TStringToIdMap<std::string> StringToIdMap;
}}}
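A small usage example of the string registry (added for illustration):
// Sketch only.
WStringToIdMap keys;
size_t id = keys.AddValue(L"utterance42");   // registers the string and returns its id
bool known = keys.Contains(L"utterance42");  // true
size_t sameId = keys[L"utterance42"];        // id lookup by string
const std::wstring& name = keys[id];         // string lookup by id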

View file

@ -8,7 +8,7 @@ Instances=2
NumCPUThreads=$(threadsPerInstance $Instances)
# cntkmpirun <MPI args> <CNTK config file name> <additional CNTK args>
cntkmpirun "-n $Instances" cntkcv.cntk "numCPUThreads=$NumCPUThreads"
cntkmpirun "-n $Instances" cntkcv.cntk "numCPUThreads=$NumCPUThreads shareNodeValueMatrices=true"
ExitCode=$?
sed 's/^/MPI Rank 0: /' $TEST_RUN_DIR/"$LogFileName"_speechTrain.logrank0
sed 's/^/MPI Rank 1: /' $TEST_RUN_DIR/"$LogFileName"_speechTrain.logrank1

File diff not shown because of its large size. Load diff

File diff not shown because of its large size. Load diff

File diff not shown because of its large size. Load diff

File diff not shown because of its large size. Load diff

View file

@ -0,0 +1,578 @@
=== Running /home/mluser/src/git_master/build/gpu/debug/bin/cntk configFile=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/DNN/WriteCommand/cntk.cntk currentDirectory=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data RunDir=/tmp/cntk-test-20160318001444.203634/Speech/DNN_WriteCommand@debug_cpu DataDir=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data ConfigDir=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/DNN/WriteCommand OutputDir=/tmp/cntk-test-20160318001444.203634/Speech/DNN_WriteCommand@debug_cpu DeviceId=-1 shareNodeValueMatrices=true
-------------------------------------------------------------------
Build info:
Built time: Mar 17 2016 23:39:20
Last modified date: Thu Mar 17 23:00:03 2016
Build type: debug
Build target: GPU
With 1bit-SGD: no
Math lib: acml
CUDA_PATH: /usr/local/cuda-7.0
CUB_PATH: /usr/local/cub-1.4.1
CUDNN_PATH: /usr/local/cudnn-4.0
Build Branch: amitaga/memshareFixes
Build SHA1: 186142470e6fb7e576f7280b0898eedb61e4097f (modified)
Built by mluser on Source/CNTK/buildinfo.h0
Build Path: Source/CNTK/buildinfo.h1
-------------------------------------------------------------------
Changed current directory to '/home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data'
-------------------------------------------------------------------
Build info:
Built time: Mar 17 2016 23:39:20
Last modified date: Thu Mar 17 23:00:03 2016
Build type: debug
Build target: GPU
With 1bit-SGD: no
Math lib: acml
CUDA_PATH: /usr/local/cuda-7.0
CUB_PATH: /usr/local/cub-1.4.1
CUDNN_PATH: /usr/local/cudnn-4.0
Build Branch: amitaga/memshareFixes
Build SHA1: 186142470e6fb7e576f7280b0898eedb61e4097f (modified)
Built by mluser on Source/CNTK/buildinfo.h0
Build Path: Source/CNTK/buildinfo.h1
-------------------------------------------------------------------
Running on localhost at 2016/03/18 00:14:44
Command line:
/home/mluser/src/git_master/build/gpu/debug/bin/cntk configFile=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/DNN/WriteCommand/cntk.cntk currentDirectory=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data RunDir=/tmp/cntk-test-20160318001444.203634/Speech/DNN_WriteCommand@debug_cpu DataDir=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data ConfigDir=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/DNN/WriteCommand OutputDir=/tmp/cntk-test-20160318001444.203634/Speech/DNN_WriteCommand@debug_cpu DeviceId=-1 shareNodeValueMatrices=true
>>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
precision = "float"
command = speechTrain:write
deviceId = $DeviceId$
parallelTrain = false
makeMode = false
speechTrain = [
action = "train"
modelPath = "$RunDir$/models/cntkSpeech.dnn"
deviceId = $DeviceId$
traceLevel = 1
SimpleNetworkBuilder = [
layerSizes = 363:512:512:132
trainingCriterion = "CrossEntropyWithSoftmax"
evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid"
applyMeanVarNorm = true
initValueScale = 1.0
uniformInit = true
needPrior = true
]
SGD = [
epochSize = 20480
minibatchSize = 64:256:1024
learningRatesPerMB = 1.0:0.5:0.1
numMBsToShowResult = 10
momentumPerMB = 0.9:0.656119
dropoutRate = 0.0
maxEpochs = 3
keepCheckPointFiles = true
AutoAdjust = [
reduceLearnRateIfImproveLessThan = 0
loadBestModel = true
increaseLearnRateIfImproveMoreThan = 1000000000
learnRateDecreaseFactor = 0.5
learnRateIncreaseFactor = 1.382
autoAdjustLR = "adjustAfterEpoch"
]
clippingThresholdPerSample = 1#INF
]
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "glob_0000.scp"
]
labels = [
mlfFile = "$DataDir$/glob_0000.mlf"
labelMappingFile = "$DataDir$/state.list"
labelDim = 132
labelType = "category"
]
]
]
write = [
action = write
modelPath = "$RunDir$/models/cntkSpeech.dnn"
outputNodeNames=ScaledLogLikelihood
deviceId = $DeviceId$
traceLevel = 1
useValidation=true
printValues=true
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "glob_0000.write.scp"
]
]
outputPath = "$RunDir$/Output"
]
currentDirectory=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data
RunDir=/tmp/cntk-test-20160318001444.203634/Speech/DNN_WriteCommand@debug_cpu
DataDir=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data
ConfigDir=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/DNN/WriteCommand
OutputDir=/tmp/cntk-test-20160318001444.203634/Speech/DNN_WriteCommand@debug_cpu
DeviceId=-1
shareNodeValueMatrices=true
<<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<
>>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
precision = "float"
command = speechTrain:write
deviceId = -1
parallelTrain = false
makeMode = false
speechTrain = [
action = "train"
modelPath = "/tmp/cntk-test-20160318001444.203634/Speech/DNN_WriteCommand@debug_cpu/models/cntkSpeech.dnn"
deviceId = -1
traceLevel = 1
SimpleNetworkBuilder = [
layerSizes = 363:512:512:132
trainingCriterion = "CrossEntropyWithSoftmax"
evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid"
applyMeanVarNorm = true
initValueScale = 1.0
uniformInit = true
needPrior = true
]
SGD = [
epochSize = 20480
minibatchSize = 64:256:1024
learningRatesPerMB = 1.0:0.5:0.1
numMBsToShowResult = 10
momentumPerMB = 0.9:0.656119
dropoutRate = 0.0
maxEpochs = 3
keepCheckPointFiles = true
AutoAdjust = [
reduceLearnRateIfImproveLessThan = 0
loadBestModel = true
increaseLearnRateIfImproveMoreThan = 1000000000
learnRateDecreaseFactor = 0.5
learnRateIncreaseFactor = 1.382
autoAdjustLR = "adjustAfterEpoch"
]
clippingThresholdPerSample = 1#INF
]
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "glob_0000.scp"
]
labels = [
mlfFile = "/home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data/glob_0000.mlf"
labelMappingFile = "/home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data/state.list"
labelDim = 132
labelType = "category"
]
]
]
write = [
action = write
modelPath = "/tmp/cntk-test-20160318001444.203634/Speech/DNN_WriteCommand@debug_cpu/models/cntkSpeech.dnn"
outputNodeNames=ScaledLogLikelihood
deviceId = -1
traceLevel = 1
useValidation=true
printValues=true
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "glob_0000.write.scp"
]
]
outputPath = "/tmp/cntk-test-20160318001444.203634/Speech/DNN_WriteCommand@debug_cpu/Output"
]
currentDirectory=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data
RunDir=/tmp/cntk-test-20160318001444.203634/Speech/DNN_WriteCommand@debug_cpu
DataDir=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data
ConfigDir=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/DNN/WriteCommand
OutputDir=/tmp/cntk-test-20160318001444.203634/Speech/DNN_WriteCommand@debug_cpu
DeviceId=-1
shareNodeValueMatrices=true
<<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
>>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
configparameters: cntk.cntk:command=speechTrain:write
configparameters: cntk.cntk:ConfigDir=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/DNN/WriteCommand
configparameters: cntk.cntk:currentDirectory=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data
configparameters: cntk.cntk:DataDir=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data
configparameters: cntk.cntk:deviceId=-1
configparameters: cntk.cntk:makeMode=false
configparameters: cntk.cntk:OutputDir=/tmp/cntk-test-20160318001444.203634/Speech/DNN_WriteCommand@debug_cpu
configparameters: cntk.cntk:parallelTrain=false
configparameters: cntk.cntk:precision=float
configparameters: cntk.cntk:RunDir=/tmp/cntk-test-20160318001444.203634/Speech/DNN_WriteCommand@debug_cpu
configparameters: cntk.cntk:shareNodeValueMatrices=true
configparameters: cntk.cntk:speechTrain=[
action = "train"
modelPath = "/tmp/cntk-test-20160318001444.203634/Speech/DNN_WriteCommand@debug_cpu/models/cntkSpeech.dnn"
deviceId = -1
traceLevel = 1
SimpleNetworkBuilder = [
layerSizes = 363:512:512:132
trainingCriterion = "CrossEntropyWithSoftmax"
evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid"
applyMeanVarNorm = true
initValueScale = 1.0
uniformInit = true
needPrior = true
]
SGD = [
epochSize = 20480
minibatchSize = 64:256:1024
learningRatesPerMB = 1.0:0.5:0.1
numMBsToShowResult = 10
momentumPerMB = 0.9:0.656119
dropoutRate = 0.0
maxEpochs = 3
keepCheckPointFiles = true
AutoAdjust = [
reduceLearnRateIfImproveLessThan = 0
loadBestModel = true
increaseLearnRateIfImproveMoreThan = 1000000000
learnRateDecreaseFactor = 0.5
learnRateIncreaseFactor = 1.382
autoAdjustLR = "adjustAfterEpoch"
]
clippingThresholdPerSample = 1#INF
]
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "glob_0000.scp"
]
labels = [
mlfFile = "/home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data/glob_0000.mlf"
labelMappingFile = "/home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data/state.list"
labelDim = 132
labelType = "category"
]
]
]
configparameters: cntk.cntk:write=[
action = write
modelPath = "/tmp/cntk-test-20160318001444.203634/Speech/DNN_WriteCommand@debug_cpu/models/cntkSpeech.dnn"
outputNodeNames=ScaledLogLikelihood
deviceId = -1
traceLevel = 1
useValidation=true
printValues=true
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "glob_0000.write.scp"
]
]
outputPath = "/tmp/cntk-test-20160318001444.203634/Speech/DNN_WriteCommand@debug_cpu/Output"
]
<<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
Commands: speechTrain write
Precision = "float"
CNTKModelPath: /tmp/cntk-test-20160318001444.203634/Speech/DNN_WriteCommand@debug_cpu/models/cntkSpeech.dnn
CNTKCommandTrainInfo: speechTrain : 3
CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3
##############################################################################
# #
# Action "train" #
# #
##############################################################################
CNTKCommandTrainBegin: speechTrain
SimpleNetworkBuilder Using CPU
reading script file glob_0000.scp ... 948 entries
total 132 state names in state list /home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data/state.list
htkmlfreader: reading MLF file /home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data/glob_0000.mlf ... total 948 entries
...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
label set 0: 129 classes
minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
Creating virgin network.
Post-processing network...
7 roots:
CrossEntropyWithSoftmax = CrossEntropyWithSoftmax
EvalErrorPrediction = ErrorPrediction
InvStdOfFeatures = InvStdDev
MeanOfFeatures = Mean
PosteriorProb = Softmax
Prior = Mean
ScaledLogLikelihood = Minus
FormNestedNetwork: WARNING: Was called twice for CrossEntropyWithSoftmax CrossEntropyWithSoftmax operation
FormNestedNetwork: WARNING: Was called twice for EvalErrorPrediction ErrorPrediction operation
FormNestedNetwork: WARNING: Was called twice for InvStdOfFeatures InvStdDev operation
FormNestedNetwork: WARNING: Was called twice for MeanOfFeatures Mean operation
FormNestedNetwork: WARNING: Was called twice for PosteriorProb Softmax operation
FormNestedNetwork: WARNING: Was called twice for Prior Mean operation
FormNestedNetwork: WARNING: Was called twice for ScaledLogLikelihood Minus operation
Validating network. 25 nodes to process in pass 1.
Validating --> labels = InputValue() : -> [132 {1} x *]
Validating --> W2 = LearnableParameter() : -> [132 x 512 {1,132}]
Validating --> W1 = LearnableParameter() : -> [512 x 512 {1,512}]
Validating --> W0 = LearnableParameter() : -> [512 x 363 {1,512}]
Validating --> features = InputValue() : -> [363 {1} x *]
Validating --> MeanOfFeatures = Mean (features) : [363 {1} x *] -> [363 {1}]
Validating --> InvStdOfFeatures = InvStdDev (features) : [363 {1} x *] -> [363 {1}]
Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization (features, MeanOfFeatures, InvStdOfFeatures) : [363 {1} x *], [363 {1}], [363 {1}] -> [363 {1} x *]
Validating --> W0*features = Times (W0, MVNormalizedFeatures) : [512 x 363 {1,512}], [363 {1} x *] -> [512 {1} x *]
Validating --> B0 = LearnableParameter() : -> [512 x 1 {1,512}]
Validating --> W0*features+B0 = Plus (W0*features, B0) : [512 {1} x *], [512 x 1 {1,512}] -> [512 x 1 {1,512} x *]
Validating --> H1 = Sigmoid (W0*features+B0) : [512 x 1 {1,512} x *] -> [512 x 1 {1,512} x *]
Validating --> W1*H1 = Times (W1, H1) : [512 x 512 {1,512}], [512 x 1 {1,512} x *] -> [512 x 1 {1,512} x *]
Validating --> B1 = LearnableParameter() : -> [512 x 1 {1,512}]
Validating --> W1*H1+B1 = Plus (W1*H1, B1) : [512 x 1 {1,512} x *], [512 x 1 {1,512}] -> [512 x 1 {1,512} x *]
Validating --> H2 = Sigmoid (W1*H1+B1) : [512 x 1 {1,512} x *] -> [512 x 1 {1,512} x *]
Validating --> W2*H1 = Times (W2, H2) : [132 x 512 {1,132}], [512 x 1 {1,512} x *] -> [132 x 1 {1,132} x *]
Validating --> B2 = LearnableParameter() : -> [132 x 1 {1,132}]
Validating --> HLast = Plus (W2*H1, B2) : [132 x 1 {1,132} x *], [132 x 1 {1,132}] -> [132 x 1 {1,132} x *]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [132 {1} x *], [132 x 1 {1,132} x *] -> [1 {1}]
Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [132 {1} x *], [132 x 1 {1,132} x *] -> [1 {1}]
Validating --> PosteriorProb = Softmax (HLast) : [132 x 1 {1,132} x *] -> [132 x 1 {1,132} x *]
Validating --> Prior = Mean (labels) : [132 {1} x *] -> [132 {1}]
Validating --> LogOfPrior = Log (Prior) : [132 {1}] -> [132 {1}]
Validating --> ScaledLogLikelihood = Minus (HLast, LogOfPrior) : [132 x 1 {1,132} x *], [132 {1}] -> [132 x 1 {1,132} x *]
Validating network. 17 nodes to process in pass 2.
Validating network, final pass.
12 out of 25 nodes do not share the minibatch layout with the input data.
Post-processing network complete.
Created model with 25 nodes on CPU.
Training criterion node(s):
CrossEntropyWithSoftmax = CrossEntropyWithSoftmax
Evaluation criterion node(s):
EvalErrorPrediction = ErrorPrediction
Allocating matrices for forward and/or backward propagation.
Precomputing --> 3 PreCompute nodes found.
NodeName: MeanOfFeatures
NodeName: InvStdOfFeatures
NodeName: Prior
minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
Precomputing --> Completed.
Starting Epoch 1: learning rate per sample = 0.015625 effective momentum = 0.900000 momentum as time constant = 607.4 samples
minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
Starting minibatch loop.
Epoch[ 1 of 3]-Minibatch[ 1- 10, 3.12%]: SamplesSeen = 640; TrainLossPerSample = 4.39181900; EvalErr[0]PerSample = 0.89843750; TotalTime = 0.8095s; SamplesPerSecond = 790.6
Epoch[ 1 of 3]-Minibatch[ 11- 20, 6.25%]: SamplesSeen = 640; TrainLossPerSample = 4.16675568; EvalErr[0]PerSample = 0.87187500; TotalTime = 0.7491s; SamplesPerSecond = 854.4
Epoch[ 1 of 3]-Minibatch[ 21- 30, 9.38%]: SamplesSeen = 640; TrainLossPerSample = 3.98684082; EvalErr[0]PerSample = 0.87812500; TotalTime = 0.7475s; SamplesPerSecond = 856.2
Epoch[ 1 of 3]-Minibatch[ 31- 40, 12.50%]: SamplesSeen = 640; TrainLossPerSample = 3.86595383; EvalErr[0]PerSample = 0.87812500; TotalTime = 0.7476s; SamplesPerSecond = 856.1
Epoch[ 1 of 3]-Minibatch[ 41- 50, 15.62%]: SamplesSeen = 640; TrainLossPerSample = 3.81007080; EvalErr[0]PerSample = 0.88593750; TotalTime = 0.7483s; SamplesPerSecond = 855.2
Epoch[ 1 of 3]-Minibatch[ 51- 60, 18.75%]: SamplesSeen = 640; TrainLossPerSample = 3.73428192; EvalErr[0]PerSample = 0.87656250; TotalTime = 0.7472s; SamplesPerSecond = 856.5
Epoch[ 1 of 3]-Minibatch[ 61- 70, 21.88%]: SamplesSeen = 640; TrainLossPerSample = 3.57475586; EvalErr[0]PerSample = 0.81875000; TotalTime = 0.7474s; SamplesPerSecond = 856.3
Epoch[ 1 of 3]-Minibatch[ 71- 80, 25.00%]: SamplesSeen = 640; TrainLossPerSample = 3.43591919; EvalErr[0]PerSample = 0.80781250; TotalTime = 0.7471s; SamplesPerSecond = 856.6
Epoch[ 1 of 3]-Minibatch[ 81- 90, 28.12%]: SamplesSeen = 640; TrainLossPerSample = 3.36042786; EvalErr[0]PerSample = 0.77343750; TotalTime = 0.7479s; SamplesPerSecond = 855.7
Epoch[ 1 of 3]-Minibatch[ 91- 100, 31.25%]: SamplesSeen = 640; TrainLossPerSample = 3.39383850; EvalErr[0]PerSample = 0.85156250; TotalTime = 0.7474s; SamplesPerSecond = 856.3
Epoch[ 1 of 3]-Minibatch[ 101- 110, 34.38%]: SamplesSeen = 640; TrainLossPerSample = 3.25078430; EvalErr[0]PerSample = 0.76406250; TotalTime = 0.7471s; SamplesPerSecond = 856.6
Epoch[ 1 of 3]-Minibatch[ 111- 120, 37.50%]: SamplesSeen = 640; TrainLossPerSample = 3.35325317; EvalErr[0]PerSample = 0.79375000; TotalTime = 0.7471s; SamplesPerSecond = 856.6
Epoch[ 1 of 3]-Minibatch[ 121- 130, 40.62%]: SamplesSeen = 640; TrainLossPerSample = 3.19606934; EvalErr[0]PerSample = 0.76875000; TotalTime = 0.7491s; SamplesPerSecond = 854.4
Epoch[ 1 of 3]-Minibatch[ 131- 140, 43.75%]: SamplesSeen = 640; TrainLossPerSample = 3.06110535; EvalErr[0]PerSample = 0.73125000; TotalTime = 0.7479s; SamplesPerSecond = 855.7
Epoch[ 1 of 3]-Minibatch[ 141- 150, 46.88%]: SamplesSeen = 640; TrainLossPerSample = 3.05118713; EvalErr[0]PerSample = 0.75625000; TotalTime = 0.7467s; SamplesPerSecond = 857.0
Epoch[ 1 of 3]-Minibatch[ 151- 160, 50.00%]: SamplesSeen = 640; TrainLossPerSample = 3.02474365; EvalErr[0]PerSample = 0.74062500; TotalTime = 0.7478s; SamplesPerSecond = 855.9
Epoch[ 1 of 3]-Minibatch[ 161- 170, 53.12%]: SamplesSeen = 640; TrainLossPerSample = 2.89902954; EvalErr[0]PerSample = 0.70781250; TotalTime = 0.7484s; SamplesPerSecond = 855.1
Epoch[ 1 of 3]-Minibatch[ 171- 180, 56.25%]: SamplesSeen = 640; TrainLossPerSample = 2.75173340; EvalErr[0]PerSample = 0.68125000; TotalTime = 0.7470s; SamplesPerSecond = 856.7
Epoch[ 1 of 3]-Minibatch[ 181- 190, 59.38%]: SamplesSeen = 640; TrainLossPerSample = 2.83969116; EvalErr[0]PerSample = 0.71875000; TotalTime = 0.7478s; SamplesPerSecond = 855.9
Epoch[ 1 of 3]-Minibatch[ 191- 200, 62.50%]: SamplesSeen = 640; TrainLossPerSample = 2.62870483; EvalErr[0]PerSample = 0.65468750; TotalTime = 0.7471s; SamplesPerSecond = 856.6
Epoch[ 1 of 3]-Minibatch[ 201- 210, 65.62%]: SamplesSeen = 640; TrainLossPerSample = 2.66655273; EvalErr[0]PerSample = 0.67187500; TotalTime = 0.7473s; SamplesPerSecond = 856.4
Epoch[ 1 of 3]-Minibatch[ 211- 220, 68.75%]: SamplesSeen = 640; TrainLossPerSample = 2.61327515; EvalErr[0]PerSample = 0.65937500; TotalTime = 0.7472s; SamplesPerSecond = 856.5
Epoch[ 1 of 3]-Minibatch[ 221- 230, 71.88%]: SamplesSeen = 640; TrainLossPerSample = 2.53099976; EvalErr[0]PerSample = 0.63750000; TotalTime = 0.7468s; SamplesPerSecond = 856.9
Epoch[ 1 of 3]-Minibatch[ 231- 240, 75.00%]: SamplesSeen = 640; TrainLossPerSample = 2.43747559; EvalErr[0]PerSample = 0.64375000; TotalTime = 0.7492s; SamplesPerSecond = 854.3
Epoch[ 1 of 3]-Minibatch[ 241- 250, 78.12%]: SamplesSeen = 640; TrainLossPerSample = 2.41107178; EvalErr[0]PerSample = 0.65312500; TotalTime = 0.7478s; SamplesPerSecond = 855.9
Epoch[ 1 of 3]-Minibatch[ 251- 260, 81.25%]: SamplesSeen = 640; TrainLossPerSample = 2.48898926; EvalErr[0]PerSample = 0.63750000; TotalTime = 0.7473s; SamplesPerSecond = 856.5
Epoch[ 1 of 3]-Minibatch[ 261- 270, 84.38%]: SamplesSeen = 640; TrainLossPerSample = 2.34965820; EvalErr[0]PerSample = 0.61093750; TotalTime = 0.7472s; SamplesPerSecond = 856.5
Epoch[ 1 of 3]-Minibatch[ 271- 280, 87.50%]: SamplesSeen = 640; TrainLossPerSample = 2.23708496; EvalErr[0]PerSample = 0.57812500; TotalTime = 0.7491s; SamplesPerSecond = 854.3
Epoch[ 1 of 3]-Minibatch[ 281- 290, 90.62%]: SamplesSeen = 640; TrainLossPerSample = 2.33135376; EvalErr[0]PerSample = 0.62031250; TotalTime = 0.7476s; SamplesPerSecond = 856.1
Epoch[ 1 of 3]-Minibatch[ 291- 300, 93.75%]: SamplesSeen = 640; TrainLossPerSample = 2.21607666; EvalErr[0]PerSample = 0.62812500; TotalTime = 0.7470s; SamplesPerSecond = 856.8
Epoch[ 1 of 3]-Minibatch[ 301- 310, 96.88%]: SamplesSeen = 640; TrainLossPerSample = 2.29110107; EvalErr[0]PerSample = 0.60625000; TotalTime = 0.7473s; SamplesPerSecond = 856.5
Epoch[ 1 of 3]-Minibatch[ 311- 320, 100.00%]: SamplesSeen = 640; TrainLossPerSample = 2.20535278; EvalErr[0]PerSample = 0.57500000; TotalTime = 0.7460s; SamplesPerSecond = 857.9
Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.0173738; TotalSamplesSeen = 20480; EvalErrPerSample = 0.73061526; AvgLearningRatePerSample = 0.015625; EpochTime=24.0016
SGD: Saving checkpoint model '/tmp/cntk-test-20160318001444.203634/Speech/DNN_WriteCommand@debug_cpu/models/cntkSpeech.dnn.1'
Starting Epoch 2: learning rate per sample = 0.001953 effective momentum = 0.656119 momentum as time constant = 607.5 samples
minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480), data subset 0 of 1, with 1 datapasses
Starting minibatch loop.
Epoch[ 2 of 3]-Minibatch[ 1- 10, 12.50%]: SamplesSeen = 2560; TrainLossPerSample = 2.05711155; EvalErr[0]PerSample = 0.55000000; TotalTime = 1.2019s; SamplesPerSecond = 2130.0
Epoch[ 2 of 3]-Minibatch[ 11- 20, 25.00%]: SamplesSeen = 2560; TrainLossPerSample = 2.02925358; EvalErr[0]PerSample = 0.54648438; TotalTime = 1.1996s; SamplesPerSecond = 2134.1
Epoch[ 2 of 3]-Minibatch[ 21- 30, 37.50%]: SamplesSeen = 2560; TrainLossPerSample = 2.02826576; EvalErr[0]PerSample = 0.54843750; TotalTime = 1.2002s; SamplesPerSecond = 2133.0
Epoch[ 2 of 3]-Minibatch[ 31- 40, 50.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.97095871; EvalErr[0]PerSample = 0.54140625; TotalTime = 1.1996s; SamplesPerSecond = 2134.1
Epoch[ 2 of 3]-Minibatch[ 41- 50, 62.50%]: SamplesSeen = 2560; TrainLossPerSample = 1.94550018; EvalErr[0]PerSample = 0.53867188; TotalTime = 1.2002s; SamplesPerSecond = 2133.0
Epoch[ 2 of 3]-Minibatch[ 51- 60, 75.00%]: SamplesSeen = 2560; TrainLossPerSample = 2.01561737; EvalErr[0]PerSample = 0.54414063; TotalTime = 1.1998s; SamplesPerSecond = 2133.7
Epoch[ 2 of 3]-Minibatch[ 61- 70, 87.50%]: SamplesSeen = 2560; TrainLossPerSample = 1.94069977; EvalErr[0]PerSample = 0.52500000; TotalTime = 1.2000s; SamplesPerSecond = 2133.4
Epoch[ 2 of 3]-Minibatch[ 71- 80, 100.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.94857330; EvalErr[0]PerSample = 0.54023438; TotalTime = 1.1906s; SamplesPerSecond = 2150.2
Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9919976; TotalSamplesSeen = 40960; EvalErrPerSample = 0.54179686; AvgLearningRatePerSample = 0.001953125; EpochTime=9.6075
SGD: Saving checkpoint model '/tmp/cntk-test-20160318001444.203634/Speech/DNN_WriteCommand@debug_cpu/models/cntkSpeech.dnn.2'
Starting Epoch 3: learning rate per sample = 0.000098 effective momentum = 0.656119 momentum as time constant = 2429.9 samples
minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960), data subset 0 of 1, with 1 datapasses
Starting minibatch loop.
Epoch[ 3 of 3]-Minibatch[ 1- 10, 50.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.91946163; EvalErr[0]PerSample = 0.52890625; TotalTime = 2.9811s; SamplesPerSecond = 3435.0
Epoch[ 3 of 3]-Minibatch[ 11- 20, 100.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.91066799; EvalErr[0]PerSample = 0.52783203; TotalTime = 2.9332s; SamplesPerSecond = 3491.1
Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.9150648; TotalSamplesSeen = 61440; EvalErrPerSample = 0.52836913; AvgLearningRatePerSample = 9.7656251e-05; EpochTime=5.96187
SGD: Saving checkpoint model '/tmp/cntk-test-20160318001444.203634/Speech/DNN_WriteCommand@debug_cpu/models/cntkSpeech.dnn'
CNTKCommandTrainEnd: speechTrain
Action "train" complete.
##############################################################################
# #
# Action "write" #
# #
##############################################################################
reading script file glob_0000.write.scp ... 10 entries
Post-processing network...
7 roots:
CrossEntropyWithSoftmax = CrossEntropyWithSoftmax
EvalErrorPrediction = ErrorPrediction
InvStdOfFeatures = InvStdDev
MeanOfFeatures = Mean
PosteriorProb = Softmax
Prior = Mean
ScaledLogLikelihood = Minus
FormNestedNetwork: WARNING: Was called twice for CrossEntropyWithSoftmax CrossEntropyWithSoftmax operation
FormNestedNetwork: WARNING: Was called twice for EvalErrorPrediction ErrorPrediction operation
FormNestedNetwork: WARNING: Was called twice for InvStdOfFeatures InvStdDev operation
FormNestedNetwork: WARNING: Was called twice for MeanOfFeatures Mean operation
FormNestedNetwork: WARNING: Was called twice for PosteriorProb Softmax operation
FormNestedNetwork: WARNING: Was called twice for Prior Mean operation
FormNestedNetwork: WARNING: Was called twice for ScaledLogLikelihood Minus operation
Validating network. 25 nodes to process in pass 1.
Validating --> labels = InputValue() : -> [132 {1} x *]
Validating --> W2 = LearnableParameter() : -> [132 x 512 {1,132}]
Validating --> W1 = LearnableParameter() : -> [512 x 512 {1,512}]
Validating --> W0 = LearnableParameter() : -> [512 x 363 {1,512}]
Validating --> features = InputValue() : -> [363 {1} x *]
Validating --> MeanOfFeatures = Mean (features) : [363 {1} x *] -> [363 {1}]
Validating --> InvStdOfFeatures = InvStdDev (features) : [363 {1} x *] -> [363 {1}]
Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization (features, MeanOfFeatures, InvStdOfFeatures) : [363 {1} x *], [363 {1}], [363 {1}] -> [363 {1} x *]
Validating --> W0*features = Times (W0, MVNormalizedFeatures) : [512 x 363 {1,512}], [363 {1} x *] -> [512 {1} x *]
Validating --> B0 = LearnableParameter() : -> [512 x 1 {1,512}]
Validating --> W0*features+B0 = Plus (W0*features, B0) : [512 {1} x *], [512 x 1 {1,512}] -> [512 x 1 {1,512} x *]
Validating --> H1 = Sigmoid (W0*features+B0) : [512 x 1 {1,512} x *] -> [512 x 1 {1,512} x *]
Validating --> W1*H1 = Times (W1, H1) : [512 x 512 {1,512}], [512 x 1 {1,512} x *] -> [512 x 1 {1,512} x *]
Validating --> B1 = LearnableParameter() : -> [512 x 1 {1,512}]
Validating --> W1*H1+B1 = Plus (W1*H1, B1) : [512 x 1 {1,512} x *], [512 x 1 {1,512}] -> [512 x 1 {1,512} x *]
Validating --> H2 = Sigmoid (W1*H1+B1) : [512 x 1 {1,512} x *] -> [512 x 1 {1,512} x *]
Validating --> W2*H1 = Times (W2, H2) : [132 x 512 {1,132}], [512 x 1 {1,512} x *] -> [132 x 1 {1,132} x *]
Validating --> B2 = LearnableParameter() : -> [132 x 1 {1,132}]
Validating --> HLast = Plus (W2*H1, B2) : [132 x 1 {1,132} x *], [132 x 1 {1,132}] -> [132 x 1 {1,132} x *]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [132 {1} x *], [132 x 1 {1,132} x *] -> [1 {1}]
Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [132 {1} x *], [132 x 1 {1,132} x *] -> [1 {1}]
Validating --> PosteriorProb = Softmax (HLast) : [132 x 1 {1,132} x *] -> [132 x 1 {1,132} x *]
Validating --> Prior = Mean (labels) : [132 {1} x *] -> [132 {1}]
Validating --> LogOfPrior = Log (Prior) : [132 {1}] -> [132 {1}]
Validating --> ScaledLogLikelihood = Minus (HLast, LogOfPrior) : [132 x 1 {1,132} x *], [132 {1}] -> [132 x 1 {1,132} x *]
Validating network. 17 nodes to process in pass 2.
Validating network, final pass.
12 out of 25 nodes do not share the minibatch layout with the input data.
Post-processing network complete.
Allocating matrices for forward and/or backward propagation.
evaluate: reading 368 frames of An4/71/71/cen5-fjam-b.mfc
Minibatch[1]: ActualMBSize = 368
evaluate: reading 438 frames of An4/213/213/cen4-fsaf2-b.mfc
Minibatch[2]: ActualMBSize = 438
evaluate: reading 368 frames of An4/513/513/cen7-mgah-b.mfc
Minibatch[3]: ActualMBSize = 368
evaluate: reading 248 frames of An4/614/614/cen7-mkdb-b.mfc
Minibatch[4]: ActualMBSize = 248
evaluate: reading 248 frames of An4/507/507/cen1-mgah-b.mfc
Minibatch[5]: ActualMBSize = 248
evaluate: reading 358 frames of An4/693/693/cen8-mmkw-b.mfc
Minibatch[6]: ActualMBSize = 358
evaluate: reading 308 frames of An4/918/918/cen4-mtos-b.mfc
Minibatch[7]: ActualMBSize = 308
evaluate: reading 608 frames of An4/477/477/an257-mewl-b.mfc
Minibatch[8]: ActualMBSize = 608
evaluate: reading 78 frames of An4/454/454/an70-meht-b.mfc
Minibatch[9]: ActualMBSize = 78
evaluate: reading 228 frames of An4/254/254/cen6-ftmj-b.mfc
Minibatch[10]: ActualMBSize = 228
Written to /tmp/cntk-test-20160318001444.203634/Speech/DNN_WriteCommand@debug_cpu/Output*
Total Samples Evaluated = 3250
Action "write" complete.
COMPLETED

View File

@ -0,0 +1,580 @@
=== Running /home/mluser/src/git_master/build/gpu/debug/bin/cntk configFile=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/DNN/WriteCommand/cntk.cntk currentDirectory=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data RunDir=/tmp/cntk-test-20160318001122.438616/Speech/DNN_WriteCommand@debug_gpu DataDir=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data ConfigDir=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/DNN/WriteCommand OutputDir=/tmp/cntk-test-20160318001122.438616/Speech/DNN_WriteCommand@debug_gpu DeviceId=0 shareNodeValueMatrices=true
-------------------------------------------------------------------
Build info:
Built time: Mar 17 2016 23:39:20
Last modified date: Thu Mar 17 23:00:03 2016
Build type: debug
Build target: GPU
With 1bit-SGD: no
Math lib: acml
CUDA_PATH: /usr/local/cuda-7.0
CUB_PATH: /usr/local/cub-1.4.1
CUDNN_PATH: /usr/local/cudnn-4.0
Build Branch: amitaga/memshareFixes
Build SHA1: 186142470e6fb7e576f7280b0898eedb61e4097f (modified)
Built by mluser on Source/CNTK/buildinfo.h0
Build Path: Source/CNTK/buildinfo.h1
-------------------------------------------------------------------
Changed current directory to '/home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data'
-------------------------------------------------------------------
Build info:
Built time: Mar 17 2016 23:39:20
Last modified date: Thu Mar 17 23:00:03 2016
Build type: debug
Build target: GPU
With 1bit-SGD: no
Math lib: acml
CUDA_PATH: /usr/local/cuda-7.0
CUB_PATH: /usr/local/cub-1.4.1
CUDNN_PATH: /usr/local/cudnn-4.0
Build Branch: amitaga/memshareFixes
Build SHA1: 186142470e6fb7e576f7280b0898eedb61e4097f (modified)
Built by mluser on Source/CNTK/buildinfo.h0
Build Path: Source/CNTK/buildinfo.h1
-------------------------------------------------------------------
Running on localhost at 2016/03/18 00:11:23
Command line:
/home/mluser/src/git_master/build/gpu/debug/bin/cntk configFile=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/DNN/WriteCommand/cntk.cntk currentDirectory=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data RunDir=/tmp/cntk-test-20160318001122.438616/Speech/DNN_WriteCommand@debug_gpu DataDir=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data ConfigDir=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/DNN/WriteCommand OutputDir=/tmp/cntk-test-20160318001122.438616/Speech/DNN_WriteCommand@debug_gpu DeviceId=0 shareNodeValueMatrices=true
>>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
precision = "float"
command = speechTrain:write
deviceId = $DeviceId$
parallelTrain = false
makeMode = false
speechTrain = [
action = "train"
modelPath = "$RunDir$/models/cntkSpeech.dnn"
deviceId = $DeviceId$
traceLevel = 1
SimpleNetworkBuilder = [
layerSizes = 363:512:512:132
trainingCriterion = "CrossEntropyWithSoftmax"
evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid"
applyMeanVarNorm = true
initValueScale = 1.0
uniformInit = true
needPrior = true
]
SGD = [
epochSize = 20480
minibatchSize = 64:256:1024
learningRatesPerMB = 1.0:0.5:0.1
numMBsToShowResult = 10
momentumPerMB = 0.9:0.656119
dropoutRate = 0.0
maxEpochs = 3
keepCheckPointFiles = true
AutoAdjust = [
reduceLearnRateIfImproveLessThan = 0
loadBestModel = true
increaseLearnRateIfImproveMoreThan = 1000000000
learnRateDecreaseFactor = 0.5
learnRateIncreaseFactor = 1.382
autoAdjustLR = "adjustAfterEpoch"
]
clippingThresholdPerSample = 1#INF
]
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "glob_0000.scp"
]
labels = [
mlfFile = "$DataDir$/glob_0000.mlf"
labelMappingFile = "$DataDir$/state.list"
labelDim = 132
labelType = "category"
]
]
]
write = [
action = write
modelPath = "$RunDir$/models/cntkSpeech.dnn"
outputNodeNames=ScaledLogLikelihood
deviceId = $DeviceId$
traceLevel = 1
useValidation=true
printValues=true
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "glob_0000.write.scp"
]
]
outputPath = "$RunDir$/Output"
]
currentDirectory=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data
RunDir=/tmp/cntk-test-20160318001122.438616/Speech/DNN_WriteCommand@debug_gpu
DataDir=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data
ConfigDir=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/DNN/WriteCommand
OutputDir=/tmp/cntk-test-20160318001122.438616/Speech/DNN_WriteCommand@debug_gpu
DeviceId=0
shareNodeValueMatrices=true
<<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<
>>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
precision = "float"
command = speechTrain:write
deviceId = 0
parallelTrain = false
makeMode = false
speechTrain = [
action = "train"
modelPath = "/tmp/cntk-test-20160318001122.438616/Speech/DNN_WriteCommand@debug_gpu/models/cntkSpeech.dnn"
deviceId = 0
traceLevel = 1
SimpleNetworkBuilder = [
layerSizes = 363:512:512:132
trainingCriterion = "CrossEntropyWithSoftmax"
evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid"
applyMeanVarNorm = true
initValueScale = 1.0
uniformInit = true
needPrior = true
]
SGD = [
epochSize = 20480
minibatchSize = 64:256:1024
learningRatesPerMB = 1.0:0.5:0.1
numMBsToShowResult = 10
momentumPerMB = 0.9:0.656119
dropoutRate = 0.0
maxEpochs = 3
keepCheckPointFiles = true
AutoAdjust = [
reduceLearnRateIfImproveLessThan = 0
loadBestModel = true
increaseLearnRateIfImproveMoreThan = 1000000000
learnRateDecreaseFactor = 0.5
learnRateIncreaseFactor = 1.382
autoAdjustLR = "adjustAfterEpoch"
]
clippingThresholdPerSample = 1#INF
]
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "glob_0000.scp"
]
labels = [
mlfFile = "/home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data/glob_0000.mlf"
labelMappingFile = "/home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data/state.list"
labelDim = 132
labelType = "category"
]
]
]
write = [
action = write
modelPath = "/tmp/cntk-test-20160318001122.438616/Speech/DNN_WriteCommand@debug_gpu/models/cntkSpeech.dnn"
outputNodeNames=ScaledLogLikelihood
deviceId = 0
traceLevel = 1
useValidation=true
printValues=true
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "glob_0000.write.scp"
]
]
outputPath = "/tmp/cntk-test-20160318001122.438616/Speech/DNN_WriteCommand@debug_gpu/Output"
]
currentDirectory=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data
RunDir=/tmp/cntk-test-20160318001122.438616/Speech/DNN_WriteCommand@debug_gpu
DataDir=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data
ConfigDir=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/DNN/WriteCommand
OutputDir=/tmp/cntk-test-20160318001122.438616/Speech/DNN_WriteCommand@debug_gpu
DeviceId=0
shareNodeValueMatrices=true
<<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
>>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
configparameters: cntk.cntk:command=speechTrain:write
configparameters: cntk.cntk:ConfigDir=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/DNN/WriteCommand
configparameters: cntk.cntk:currentDirectory=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data
configparameters: cntk.cntk:DataDir=/home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data
configparameters: cntk.cntk:deviceId=0
configparameters: cntk.cntk:makeMode=false
configparameters: cntk.cntk:OutputDir=/tmp/cntk-test-20160318001122.438616/Speech/DNN_WriteCommand@debug_gpu
configparameters: cntk.cntk:parallelTrain=false
configparameters: cntk.cntk:precision=float
configparameters: cntk.cntk:RunDir=/tmp/cntk-test-20160318001122.438616/Speech/DNN_WriteCommand@debug_gpu
configparameters: cntk.cntk:shareNodeValueMatrices=true
configparameters: cntk.cntk:speechTrain=[
action = "train"
modelPath = "/tmp/cntk-test-20160318001122.438616/Speech/DNN_WriteCommand@debug_gpu/models/cntkSpeech.dnn"
deviceId = 0
traceLevel = 1
SimpleNetworkBuilder = [
layerSizes = 363:512:512:132
trainingCriterion = "CrossEntropyWithSoftmax"
evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid"
applyMeanVarNorm = true
initValueScale = 1.0
uniformInit = true
needPrior = true
]
SGD = [
epochSize = 20480
minibatchSize = 64:256:1024
learningRatesPerMB = 1.0:0.5:0.1
numMBsToShowResult = 10
momentumPerMB = 0.9:0.656119
dropoutRate = 0.0
maxEpochs = 3
keepCheckPointFiles = true
AutoAdjust = [
reduceLearnRateIfImproveLessThan = 0
loadBestModel = true
increaseLearnRateIfImproveMoreThan = 1000000000
learnRateDecreaseFactor = 0.5
learnRateIncreaseFactor = 1.382
autoAdjustLR = "adjustAfterEpoch"
]
clippingThresholdPerSample = 1#INF
]
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "glob_0000.scp"
]
labels = [
mlfFile = "/home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data/glob_0000.mlf"
labelMappingFile = "/home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data/state.list"
labelDim = 132
labelType = "category"
]
]
]
configparameters: cntk.cntk:write=[
action = write
modelPath = "/tmp/cntk-test-20160318001122.438616/Speech/DNN_WriteCommand@debug_gpu/models/cntkSpeech.dnn"
outputNodeNames=ScaledLogLikelihood
deviceId = 0
traceLevel = 1
useValidation=true
printValues=true
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "glob_0000.write.scp"
]
]
outputPath = "/tmp/cntk-test-20160318001122.438616/Speech/DNN_WriteCommand@debug_gpu/Output"
]
<<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
Commands: speechTrain write
Precision = "float"
CNTKModelPath: /tmp/cntk-test-20160318001122.438616/Speech/DNN_WriteCommand@debug_gpu/models/cntkSpeech.dnn
CNTKCommandTrainInfo: speechTrain : 3
CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3
##############################################################################
# #
# Action "train" #
# #
##############################################################################
CNTKCommandTrainBegin: speechTrain
SimpleNetworkBuilder Using GPU 0
reading script file glob_0000.scp ... 948 entries
total 132 state names in state list /home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data/state.list
htkmlfreader: reading MLF file /home/mluser/src/git_master/Tests/EndToEndTests/Speech/Data/glob_0000.mlf ... total 948 entries
...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
label set 0: 129 classes
minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
Creating virgin network.
SetUniformRandomValue (GPU): creating curand object with seed 1, sizeof(ElemType)==4
Post-processing network...
7 roots:
CrossEntropyWithSoftmax = CrossEntropyWithSoftmax
EvalErrorPrediction = ErrorPrediction
InvStdOfFeatures = InvStdDev
MeanOfFeatures = Mean
PosteriorProb = Softmax
Prior = Mean
ScaledLogLikelihood = Minus
FormNestedNetwork: WARNING: Was called twice for CrossEntropyWithSoftmax CrossEntropyWithSoftmax operation
FormNestedNetwork: WARNING: Was called twice for EvalErrorPrediction ErrorPrediction operation
FormNestedNetwork: WARNING: Was called twice for InvStdOfFeatures InvStdDev operation
FormNestedNetwork: WARNING: Was called twice for MeanOfFeatures Mean operation
FormNestedNetwork: WARNING: Was called twice for PosteriorProb Softmax operation
FormNestedNetwork: WARNING: Was called twice for Prior Mean operation
FormNestedNetwork: WARNING: Was called twice for ScaledLogLikelihood Minus operation
Validating network. 25 nodes to process in pass 1.
Validating --> labels = InputValue() : -> [132 {1} x *]
Validating --> W2 = LearnableParameter() : -> [132 x 512 {1,132}]
Validating --> W1 = LearnableParameter() : -> [512 x 512 {1,512}]
Validating --> W0 = LearnableParameter() : -> [512 x 363 {1,512}]
Validating --> features = InputValue() : -> [363 {1} x *]
Validating --> MeanOfFeatures = Mean (features) : [363 {1} x *] -> [363 {1}]
Validating --> InvStdOfFeatures = InvStdDev (features) : [363 {1} x *] -> [363 {1}]
Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization (features, MeanOfFeatures, InvStdOfFeatures) : [363 {1} x *], [363 {1}], [363 {1}] -> [363 {1} x *]
Validating --> W0*features = Times (W0, MVNormalizedFeatures) : [512 x 363 {1,512}], [363 {1} x *] -> [512 {1} x *]
Validating --> B0 = LearnableParameter() : -> [512 x 1 {1,512}]
Validating --> W0*features+B0 = Plus (W0*features, B0) : [512 {1} x *], [512 x 1 {1,512}] -> [512 x 1 {1,512} x *]
Validating --> H1 = Sigmoid (W0*features+B0) : [512 x 1 {1,512} x *] -> [512 x 1 {1,512} x *]
Validating --> W1*H1 = Times (W1, H1) : [512 x 512 {1,512}], [512 x 1 {1,512} x *] -> [512 x 1 {1,512} x *]
Validating --> B1 = LearnableParameter() : -> [512 x 1 {1,512}]
Validating --> W1*H1+B1 = Plus (W1*H1, B1) : [512 x 1 {1,512} x *], [512 x 1 {1,512}] -> [512 x 1 {1,512} x *]
Validating --> H2 = Sigmoid (W1*H1+B1) : [512 x 1 {1,512} x *] -> [512 x 1 {1,512} x *]
Validating --> W2*H1 = Times (W2, H2) : [132 x 512 {1,132}], [512 x 1 {1,512} x *] -> [132 x 1 {1,132} x *]
Validating --> B2 = LearnableParameter() : -> [132 x 1 {1,132}]
Validating --> HLast = Plus (W2*H1, B2) : [132 x 1 {1,132} x *], [132 x 1 {1,132}] -> [132 x 1 {1,132} x *]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [132 {1} x *], [132 x 1 {1,132} x *] -> [1 {1}]
Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [132 {1} x *], [132 x 1 {1,132} x *] -> [1 {1}]
Validating --> PosteriorProb = Softmax (HLast) : [132 x 1 {1,132} x *] -> [132 x 1 {1,132} x *]
Validating --> Prior = Mean (labels) : [132 {1} x *] -> [132 {1}]
Validating --> LogOfPrior = Log (Prior) : [132 {1}] -> [132 {1}]
Validating --> ScaledLogLikelihood = Minus (HLast, LogOfPrior) : [132 x 1 {1,132} x *], [132 {1}] -> [132 x 1 {1,132} x *]
Validating network. 17 nodes to process in pass 2.
Validating network, final pass.
12 out of 25 nodes do not share the minibatch layout with the input data.
Post-processing network complete.
Created model with 25 nodes on GPU 0.
Training criterion node(s):
CrossEntropyWithSoftmax = CrossEntropyWithSoftmax
Evaluation criterion node(s):
EvalErrorPrediction = ErrorPrediction
Allocating matrices for forward and/or backward propagation.
Precomputing --> 3 PreCompute nodes found.
NodeName: MeanOfFeatures
NodeName: InvStdOfFeatures
NodeName: Prior
minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
Precomputing --> Completed.
Starting Epoch 1: learning rate per sample = 0.015625 effective momentum = 0.900000 momentum as time constant = 607.4 samples
minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
Starting minibatch loop.
Epoch[ 1 of 3]-Minibatch[ 1- 10, 3.12%]: SamplesSeen = 640; TrainLossPerSample = 4.32135315; EvalErr[0]PerSample = 0.90000000; TotalTime = 0.0621s; SamplesPerSecond = 10301.0
Epoch[ 1 of 3]-Minibatch[ 11- 20, 6.25%]: SamplesSeen = 640; TrainLossPerSample = 4.15070953; EvalErr[0]PerSample = 0.86718750; TotalTime = 0.0600s; SamplesPerSecond = 10670.9
Epoch[ 1 of 3]-Minibatch[ 21- 30, 9.38%]: SamplesSeen = 640; TrainLossPerSample = 3.99901123; EvalErr[0]PerSample = 0.87656250; TotalTime = 0.0600s; SamplesPerSecond = 10666.7
Epoch[ 1 of 3]-Minibatch[ 31- 40, 12.50%]: SamplesSeen = 640; TrainLossPerSample = 3.86945953; EvalErr[0]PerSample = 0.87656250; TotalTime = 0.0599s; SamplesPerSecond = 10675.7
Epoch[ 1 of 3]-Minibatch[ 41- 50, 15.62%]: SamplesSeen = 640; TrainLossPerSample = 3.80219574; EvalErr[0]PerSample = 0.87812500; TotalTime = 0.0572s; SamplesPerSecond = 11188.6
Epoch[ 1 of 3]-Minibatch[ 51- 60, 18.75%]: SamplesSeen = 640; TrainLossPerSample = 3.72890930; EvalErr[0]PerSample = 0.86875000; TotalTime = 0.0563s; SamplesPerSecond = 11364.2
Epoch[ 1 of 3]-Minibatch[ 61- 70, 21.88%]: SamplesSeen = 640; TrainLossPerSample = 3.56186981; EvalErr[0]PerSample = 0.82343750; TotalTime = 0.0563s; SamplesPerSecond = 11366.7
Epoch[ 1 of 3]-Minibatch[ 71- 80, 25.00%]: SamplesSeen = 640; TrainLossPerSample = 3.42790527; EvalErr[0]PerSample = 0.80781250; TotalTime = 0.0554s; SamplesPerSecond = 11549.0
Epoch[ 1 of 3]-Minibatch[ 81- 90, 28.12%]: SamplesSeen = 640; TrainLossPerSample = 3.33928528; EvalErr[0]PerSample = 0.77343750; TotalTime = 0.0552s; SamplesPerSecond = 11601.6
Epoch[ 1 of 3]-Minibatch[ 91- 100, 31.25%]: SamplesSeen = 640; TrainLossPerSample = 3.36398926; EvalErr[0]PerSample = 0.84375000; TotalTime = 0.0553s; SamplesPerSecond = 11583.5
WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times.
Epoch[ 1 of 3]-Minibatch[ 101- 110, 34.38%]: SamplesSeen = 640; TrainLossPerSample = 3.21223450; EvalErr[0]PerSample = 0.75312500; TotalTime = 0.0552s; SamplesPerSecond = 11601.3
Epoch[ 1 of 3]-Minibatch[ 111- 120, 37.50%]: SamplesSeen = 640; TrainLossPerSample = 3.31265564; EvalErr[0]PerSample = 0.78750000; TotalTime = 0.0552s; SamplesPerSecond = 11593.6
Epoch[ 1 of 3]-Minibatch[ 121- 130, 40.62%]: SamplesSeen = 640; TrainLossPerSample = 3.14082031; EvalErr[0]PerSample = 0.74687500; TotalTime = 0.0551s; SamplesPerSecond = 11605.6
Epoch[ 1 of 3]-Minibatch[ 131- 140, 43.75%]: SamplesSeen = 640; TrainLossPerSample = 3.00689697; EvalErr[0]PerSample = 0.69687500; TotalTime = 0.0552s; SamplesPerSecond = 11602.8
Epoch[ 1 of 3]-Minibatch[ 141- 150, 46.88%]: SamplesSeen = 640; TrainLossPerSample = 3.00495911; EvalErr[0]PerSample = 0.72343750; TotalTime = 0.0552s; SamplesPerSecond = 11598.6
Epoch[ 1 of 3]-Minibatch[ 151- 160, 50.00%]: SamplesSeen = 640; TrainLossPerSample = 2.97858887; EvalErr[0]PerSample = 0.73906250; TotalTime = 0.0552s; SamplesPerSecond = 11602.4
Epoch[ 1 of 3]-Minibatch[ 161- 170, 53.12%]: SamplesSeen = 640; TrainLossPerSample = 2.85686035; EvalErr[0]PerSample = 0.70781250; TotalTime = 0.0552s; SamplesPerSecond = 11598.2
Epoch[ 1 of 3]-Minibatch[ 171- 180, 56.25%]: SamplesSeen = 640; TrainLossPerSample = 2.69053345; EvalErr[0]PerSample = 0.67187500; TotalTime = 0.0551s; SamplesPerSecond = 11610.0
Epoch[ 1 of 3]-Minibatch[ 181- 190, 59.38%]: SamplesSeen = 640; TrainLossPerSample = 2.78653564; EvalErr[0]PerSample = 0.70468750; TotalTime = 0.0551s; SamplesPerSecond = 11606.8
Epoch[ 1 of 3]-Minibatch[ 191- 200, 62.50%]: SamplesSeen = 640; TrainLossPerSample = 2.57702026; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.0551s; SamplesPerSecond = 11609.6
Epoch[ 1 of 3]-Minibatch[ 201- 210, 65.62%]: SamplesSeen = 640; TrainLossPerSample = 2.61571655; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.0552s; SamplesPerSecond = 11599.5
Epoch[ 1 of 3]-Minibatch[ 211- 220, 68.75%]: SamplesSeen = 640; TrainLossPerSample = 2.55236206; EvalErr[0]PerSample = 0.65781250; TotalTime = 0.0552s; SamplesPerSecond = 11601.6
Epoch[ 1 of 3]-Minibatch[ 221- 230, 71.88%]: SamplesSeen = 640; TrainLossPerSample = 2.48211670; EvalErr[0]PerSample = 0.62500000; TotalTime = 0.0551s; SamplesPerSecond = 11604.9
Epoch[ 1 of 3]-Minibatch[ 231- 240, 75.00%]: SamplesSeen = 640; TrainLossPerSample = 2.38778687; EvalErr[0]PerSample = 0.62812500; TotalTime = 0.0552s; SamplesPerSecond = 11596.9
Epoch[ 1 of 3]-Minibatch[ 241- 250, 78.12%]: SamplesSeen = 640; TrainLossPerSample = 2.36900635; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.0551s; SamplesPerSecond = 11604.7
Epoch[ 1 of 3]-Minibatch[ 251- 260, 81.25%]: SamplesSeen = 640; TrainLossPerSample = 2.43967285; EvalErr[0]PerSample = 0.63281250; TotalTime = 0.0552s; SamplesPerSecond = 11602.2
Epoch[ 1 of 3]-Minibatch[ 261- 270, 84.38%]: SamplesSeen = 640; TrainLossPerSample = 2.30281982; EvalErr[0]PerSample = 0.61250000; TotalTime = 0.0552s; SamplesPerSecond = 11604.1
Epoch[ 1 of 3]-Minibatch[ 271- 280, 87.50%]: SamplesSeen = 640; TrainLossPerSample = 2.19668579; EvalErr[0]PerSample = 0.55937500; TotalTime = 0.0552s; SamplesPerSecond = 11592.7
Epoch[ 1 of 3]-Minibatch[ 281- 290, 90.62%]: SamplesSeen = 640; TrainLossPerSample = 2.28980103; EvalErr[0]PerSample = 0.60468750; TotalTime = 0.0552s; SamplesPerSecond = 11600.1
Epoch[ 1 of 3]-Minibatch[ 291- 300, 93.75%]: SamplesSeen = 640; TrainLossPerSample = 2.17750854; EvalErr[0]PerSample = 0.62187500; TotalTime = 0.0551s; SamplesPerSecond = 11608.7
Epoch[ 1 of 3]-Minibatch[ 301- 310, 96.88%]: SamplesSeen = 640; TrainLossPerSample = 2.26263428; EvalErr[0]PerSample = 0.59687500; TotalTime = 0.0552s; SamplesPerSecond = 11599.2
Epoch[ 1 of 3]-Minibatch[ 311- 320, 100.00%]: SamplesSeen = 640; TrainLossPerSample = 2.15072632; EvalErr[0]PerSample = 0.56250000; TotalTime = 0.0536s; SamplesPerSecond = 11938.7
Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 2.9799573; TotalSamplesSeen = 20480; EvalErrPerSample = 0.72216797; AvgLearningRatePerSample = 0.015625; EpochTime=1.82021
SGD: Saving checkpoint model '/tmp/cntk-test-20160318001122.438616/Speech/DNN_WriteCommand@debug_gpu/models/cntkSpeech.dnn.1'
Starting Epoch 2: learning rate per sample = 0.001953 effective momentum = 0.656119 momentum as time constant = 607.5 samples
minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480), data subset 0 of 1, with 1 datapasses
Starting minibatch loop.
Epoch[ 2 of 3]-Minibatch[ 1- 10, 12.50%]: SamplesSeen = 2560; TrainLossPerSample = 2.01598530; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.0890s; SamplesPerSecond = 28776.7
Epoch[ 2 of 3]-Minibatch[ 11- 20, 25.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.98818569; EvalErr[0]PerSample = 0.54296875; TotalTime = 0.0872s; SamplesPerSecond = 29344.0
Epoch[ 2 of 3]-Minibatch[ 21- 30, 37.50%]: SamplesSeen = 2560; TrainLossPerSample = 1.98698120; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.0874s; SamplesPerSecond = 29288.3
Epoch[ 2 of 3]-Minibatch[ 31- 40, 50.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.93126144; EvalErr[0]PerSample = 0.52773437; TotalTime = 0.0868s; SamplesPerSecond = 29506.0
Epoch[ 2 of 3]-Minibatch[ 41- 50, 62.50%]: SamplesSeen = 2560; TrainLossPerSample = 1.90067825; EvalErr[0]PerSample = 0.52656250; TotalTime = 0.0867s; SamplesPerSecond = 29521.7
Epoch[ 2 of 3]-Minibatch[ 51- 60, 75.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.97115860; EvalErr[0]PerSample = 0.54140625; TotalTime = 0.0866s; SamplesPerSecond = 29560.5
Epoch[ 2 of 3]-Minibatch[ 61- 70, 87.50%]: SamplesSeen = 2560; TrainLossPerSample = 1.89518127; EvalErr[0]PerSample = 0.52031250; TotalTime = 0.0871s; SamplesPerSecond = 29394.9
Epoch[ 2 of 3]-Minibatch[ 71- 80, 100.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.90450439; EvalErr[0]PerSample = 0.53164062; TotalTime = 0.0806s; SamplesPerSecond = 31764.9
Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.949242; TotalSamplesSeen = 40960; EvalErrPerSample = 0.53417969; AvgLearningRatePerSample = 0.001953125; EpochTime=0.700466
SGD: Saving checkpoint model '/tmp/cntk-test-20160318001122.438616/Speech/DNN_WriteCommand@debug_gpu/models/cntkSpeech.dnn.2'
Starting Epoch 3: learning rate per sample = 0.000098 effective momentum = 0.656119 momentum as time constant = 2429.9 samples
minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960), data subset 0 of 1, with 1 datapasses
Starting minibatch loop.
Epoch[ 3 of 3]-Minibatch[ 1- 10, 50.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.87359848; EvalErr[0]PerSample = 0.51933594; TotalTime = 0.2379s; SamplesPerSecond = 43045.1
Epoch[ 3 of 3]-Minibatch[ 11- 20, 100.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.86656265; EvalErr[0]PerSample = 0.51748047; TotalTime = 0.2068s; SamplesPerSecond = 49528.4
Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8700806; TotalSamplesSeen = 61440; EvalErrPerSample = 0.51840824; AvgLearningRatePerSample = 9.7656251e-05; EpochTime=0.476436
SGD: Saving checkpoint model '/tmp/cntk-test-20160318001122.438616/Speech/DNN_WriteCommand@debug_gpu/models/cntkSpeech.dnn'
CNTKCommandTrainEnd: speechTrain
Action "train" complete.
##############################################################################
# #
# Action "write" #
# #
##############################################################################
reading script file glob_0000.write.scp ... 10 entries
Post-processing network...
7 roots:
CrossEntropyWithSoftmax = CrossEntropyWithSoftmax
EvalErrorPrediction = ErrorPrediction
InvStdOfFeatures = InvStdDev
MeanOfFeatures = Mean
PosteriorProb = Softmax
Prior = Mean
ScaledLogLikelihood = Minus
FormNestedNetwork: WARNING: Was called twice for CrossEntropyWithSoftmax CrossEntropyWithSoftmax operation
FormNestedNetwork: WARNING: Was called twice for EvalErrorPrediction ErrorPrediction operation
FormNestedNetwork: WARNING: Was called twice for InvStdOfFeatures InvStdDev operation
FormNestedNetwork: WARNING: Was called twice for MeanOfFeatures Mean operation
FormNestedNetwork: WARNING: Was called twice for PosteriorProb Softmax operation
FormNestedNetwork: WARNING: Was called twice for Prior Mean operation
FormNestedNetwork: WARNING: Was called twice for ScaledLogLikelihood Minus operation
Validating network. 25 nodes to process in pass 1.
Validating --> labels = InputValue() : -> [132 {1} x *]
Validating --> W2 = LearnableParameter() : -> [132 x 512 {1,132}]
Validating --> W1 = LearnableParameter() : -> [512 x 512 {1,512}]
Validating --> W0 = LearnableParameter() : -> [512 x 363 {1,512}]
Validating --> features = InputValue() : -> [363 {1} x *]
Validating --> MeanOfFeatures = Mean (features) : [363 {1} x *] -> [363 {1}]
Validating --> InvStdOfFeatures = InvStdDev (features) : [363 {1} x *] -> [363 {1}]
Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization (features, MeanOfFeatures, InvStdOfFeatures) : [363 {1} x *], [363 {1}], [363 {1}] -> [363 {1} x *]
Validating --> W0*features = Times (W0, MVNormalizedFeatures) : [512 x 363 {1,512}], [363 {1} x *] -> [512 {1} x *]
Validating --> B0 = LearnableParameter() : -> [512 x 1 {1,512}]
Validating --> W0*features+B0 = Plus (W0*features, B0) : [512 {1} x *], [512 x 1 {1,512}] -> [512 x 1 {1,512} x *]
Validating --> H1 = Sigmoid (W0*features+B0) : [512 x 1 {1,512} x *] -> [512 x 1 {1,512} x *]
Validating --> W1*H1 = Times (W1, H1) : [512 x 512 {1,512}], [512 x 1 {1,512} x *] -> [512 x 1 {1,512} x *]
Validating --> B1 = LearnableParameter() : -> [512 x 1 {1,512}]
Validating --> W1*H1+B1 = Plus (W1*H1, B1) : [512 x 1 {1,512} x *], [512 x 1 {1,512}] -> [512 x 1 {1,512} x *]
Validating --> H2 = Sigmoid (W1*H1+B1) : [512 x 1 {1,512} x *] -> [512 x 1 {1,512} x *]
Validating --> W2*H1 = Times (W2, H2) : [132 x 512 {1,132}], [512 x 1 {1,512} x *] -> [132 x 1 {1,132} x *]
Validating --> B2 = LearnableParameter() : -> [132 x 1 {1,132}]
Validating --> HLast = Plus (W2*H1, B2) : [132 x 1 {1,132} x *], [132 x 1 {1,132}] -> [132 x 1 {1,132} x *]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax (labels, HLast) : [132 {1} x *], [132 x 1 {1,132} x *] -> [1 {1}]
Validating --> EvalErrorPrediction = ErrorPrediction (labels, HLast) : [132 {1} x *], [132 x 1 {1,132} x *] -> [1 {1}]
Validating --> PosteriorProb = Softmax (HLast) : [132 x 1 {1,132} x *] -> [132 x 1 {1,132} x *]
Validating --> Prior = Mean (labels) : [132 {1} x *] -> [132 {1}]
Validating --> LogOfPrior = Log (Prior) : [132 {1}] -> [132 {1}]
Validating --> ScaledLogLikelihood = Minus (HLast, LogOfPrior) : [132 x 1 {1,132} x *], [132 {1}] -> [132 x 1 {1,132} x *]
Validating network. 17 nodes to process in pass 2.
Validating network, final pass.
12 out of 25 nodes do not share the minibatch layout with the input data.
Post-processing network complete.
Allocating matrices for forward and/or backward propagation.
evaluate: reading 368 frames of An4/71/71/cen5-fjam-b.mfc
Minibatch[1]: ActualMBSize = 368
evaluate: reading 438 frames of An4/213/213/cen4-fsaf2-b.mfc
Minibatch[2]: ActualMBSize = 438
evaluate: reading 368 frames of An4/513/513/cen7-mgah-b.mfc
Minibatch[3]: ActualMBSize = 368
evaluate: reading 248 frames of An4/614/614/cen7-mkdb-b.mfc
Minibatch[4]: ActualMBSize = 248
evaluate: reading 248 frames of An4/507/507/cen1-mgah-b.mfc
Minibatch[5]: ActualMBSize = 248
evaluate: reading 358 frames of An4/693/693/cen8-mmkw-b.mfc
Minibatch[6]: ActualMBSize = 358
evaluate: reading 308 frames of An4/918/918/cen4-mtos-b.mfc
Minibatch[7]: ActualMBSize = 308
evaluate: reading 608 frames of An4/477/477/an257-mewl-b.mfc
Minibatch[8]: ActualMBSize = 608
evaluate: reading 78 frames of An4/454/454/an70-meht-b.mfc
Minibatch[9]: ActualMBSize = 78
evaluate: reading 228 frames of An4/254/254/cen6-ftmj-b.mfc
Minibatch[10]: ActualMBSize = 228
Written to /tmp/cntk-test-20160318001122.438616/Speech/DNN_WriteCommand@debug_gpu/Output*
Total Samples Evaluated = 3250
Action "write" complete.
COMPLETED

View File

@ -0,0 +1,666 @@
=== Running /cygdrive/e/NetScale/CNTK/git_repos/git_master/x64/debug/cntk.exe configFile=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\DNN\WriteCommand/cntk.cntk currentDirectory=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data RunDir=C:\cygwin64\tmp\cntk-test-20160317224207.69144\Speech\DNN_WriteCommand@debug_cpu DataDir=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data ConfigDir=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\DNN\WriteCommand OutputDir=C:\cygwin64\tmp\cntk-test-20160317224207.69144\Speech\DNN_WriteCommand@debug_cpu DeviceId=-1 shareNodeValueMatrices=true
-------------------------------------------------------------------
Build info:
Built time: Mar 17 2016 16:58:29
Last modified date: Thu Mar 17 16:54:52 2016
Build type: Debug
Build target: GPU
With 1bit-SGD: yes
CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0
CUB_PATH: C:\cub-1.4.1
CUDNN_PATH: C:\cudnn-4.0
Built by amitaga on Amitaga-Win-DT3
Build Path: E:\NetScale\CNTK\git_repos\git_master\Source\CNTK\
-------------------------------------------------------------------
Changed current directory to 'E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data'
-------------------------------------------------------------------
Build info:
Built time: Mar 17 2016 16:58:29
Last modified date: Thu Mar 17 16:54:52 2016
Build type: Debug
Build target: GPU
With 1bit-SGD: yes
CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0
CUB_PATH: C:\cub-1.4.1
CUDNN_PATH: C:\cudnn-4.0
Built by amitaga on Amitaga-Win-DT3
Build Path: E:\NetScale\CNTK\git_repos\git_master\Source\CNTK\
-------------------------------------------------------------------
running on Amitaga-Win-DT3 at 2016/03/18 06:42:08
command line:
E:\NetScale\CNTK\git_repos\git_master\x64\debug\cntk.exe configFile=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\DNN\WriteCommand/cntk.cntk currentDirectory=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data RunDir=C:\cygwin64\tmp\cntk-test-20160317224207.69144\Speech\DNN_WriteCommand@debug_cpu DataDir=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data ConfigDir=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\DNN\WriteCommand OutputDir=C:\cygwin64\tmp\cntk-test-20160317224207.69144\Speech\DNN_WriteCommand@debug_cpu DeviceId=-1 shareNodeValueMatrices=true
>>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
precision = "float"
command = speechTrain:write
deviceId = $DeviceId$
parallelTrain = false
makeMode = false
speechTrain = [
action = "train"
modelPath = "$RunDir$/models/cntkSpeech.dnn"
deviceId = $DeviceId$
traceLevel = 1
SimpleNetworkBuilder = [
layerSizes = 363:512:512:132
trainingCriterion = "CrossEntropyWithSoftmax"
evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid"
applyMeanVarNorm = true
initValueScale = 1.0
uniformInit = true
needPrior = true
]
SGD = [
epochSize = 20480
minibatchSize = 64:256:1024
learningRatesPerMB = 1.0:0.5:0.1
numMBsToShowResult = 10
momentumPerMB = 0.9:0.656119
dropoutRate = 0.0
maxEpochs = 3
keepCheckPointFiles = true
AutoAdjust = [
reduceLearnRateIfImproveLessThan = 0
loadBestModel = true
increaseLearnRateIfImproveMoreThan = 1000000000
learnRateDecreaseFactor = 0.5
learnRateIncreaseFactor = 1.382
autoAdjustLR = "adjustAfterEpoch"
]
clippingThresholdPerSample = 1#INF
]
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "glob_0000.scp"
]
labels = [
mlfFile = "$DataDir$/glob_0000.mlf"
labelMappingFile = "$DataDir$/state.list"
labelDim = 132
labelType = "category"
]
]
]
write = [
action = write
modelPath = "$RunDir$/models/cntkSpeech.dnn"
outputNodeNames=ScaledLogLikelihood
deviceId = $DeviceId$
traceLevel = 1
useValidation=true
printValues=true
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "glob_0000.write.scp"
]
]
outputPath = "$RunDir$/Output"
]
currentDirectory=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data
RunDir=C:\cygwin64\tmp\cntk-test-20160317224207.69144\Speech\DNN_WriteCommand@debug_cpu
DataDir=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data
ConfigDir=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\DNN\WriteCommand
OutputDir=C:\cygwin64\tmp\cntk-test-20160317224207.69144\Speech\DNN_WriteCommand@debug_cpu
DeviceId=-1
shareNodeValueMatrices=true
<<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<
>>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
precision = "float"
command = speechTrain:write
deviceId = -1
parallelTrain = false
makeMode = false
speechTrain = [
action = "train"
modelPath = "C:\cygwin64\tmp\cntk-test-20160317224207.69144\Speech\DNN_WriteCommand@debug_cpu/models/cntkSpeech.dnn"
deviceId = -1
traceLevel = 1
SimpleNetworkBuilder = [
layerSizes = 363:512:512:132
trainingCriterion = "CrossEntropyWithSoftmax"
evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid"
applyMeanVarNorm = true
initValueScale = 1.0
uniformInit = true
needPrior = true
]
SGD = [
epochSize = 20480
minibatchSize = 64:256:1024
learningRatesPerMB = 1.0:0.5:0.1
numMBsToShowResult = 10
momentumPerMB = 0.9:0.656119
dropoutRate = 0.0
maxEpochs = 3
keepCheckPointFiles = true
AutoAdjust = [
reduceLearnRateIfImproveLessThan = 0
loadBestModel = true
increaseLearnRateIfImproveMoreThan = 1000000000
learnRateDecreaseFactor = 0.5
learnRateIncreaseFactor = 1.382
autoAdjustLR = "adjustAfterEpoch"
]
clippingThresholdPerSample = 1#INF
]
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "glob_0000.scp"
]
labels = [
mlfFile = "E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data/glob_0000.mlf"
labelMappingFile = "E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data/state.list"
labelDim = 132
labelType = "category"
]
]
]
write = [
action = write
modelPath = "C:\cygwin64\tmp\cntk-test-20160317224207.69144\Speech\DNN_WriteCommand@debug_cpu/models/cntkSpeech.dnn"
outputNodeNames=ScaledLogLikelihood
deviceId = -1
traceLevel = 1
useValidation=true
printValues=true
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "glob_0000.write.scp"
]
]
outputPath = "C:\cygwin64\tmp\cntk-test-20160317224207.69144\Speech\DNN_WriteCommand@debug_cpu/Output"
]
currentDirectory=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data
RunDir=C:\cygwin64\tmp\cntk-test-20160317224207.69144\Speech\DNN_WriteCommand@debug_cpu
DataDir=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data
ConfigDir=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\DNN\WriteCommand
OutputDir=C:\cygwin64\tmp\cntk-test-20160317224207.69144\Speech\DNN_WriteCommand@debug_cpu
DeviceId=-1
shareNodeValueMatrices=true
<<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
>>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
configparameters: cntk.cntk:command=speechTrain:write
configparameters: cntk.cntk:ConfigDir=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\DNN\WriteCommand
configparameters: cntk.cntk:currentDirectory=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data
configparameters: cntk.cntk:DataDir=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data
configparameters: cntk.cntk:deviceId=-1
configparameters: cntk.cntk:makeMode=false
configparameters: cntk.cntk:OutputDir=C:\cygwin64\tmp\cntk-test-20160317224207.69144\Speech\DNN_WriteCommand@debug_cpu
configparameters: cntk.cntk:parallelTrain=false
configparameters: cntk.cntk:precision=float
configparameters: cntk.cntk:RunDir=C:\cygwin64\tmp\cntk-test-20160317224207.69144\Speech\DNN_WriteCommand@debug_cpu
configparameters: cntk.cntk:shareNodeValueMatrices=true
configparameters: cntk.cntk:speechTrain=[
action = "train"
modelPath = "C:\cygwin64\tmp\cntk-test-20160317224207.69144\Speech\DNN_WriteCommand@debug_cpu/models/cntkSpeech.dnn"
deviceId = -1
traceLevel = 1
SimpleNetworkBuilder = [
layerSizes = 363:512:512:132
trainingCriterion = "CrossEntropyWithSoftmax"
evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid"
applyMeanVarNorm = true
initValueScale = 1.0
uniformInit = true
needPrior = true
]
SGD = [
epochSize = 20480
minibatchSize = 64:256:1024
learningRatesPerMB = 1.0:0.5:0.1
numMBsToShowResult = 10
momentumPerMB = 0.9:0.656119
dropoutRate = 0.0
maxEpochs = 3
keepCheckPointFiles = true
AutoAdjust = [
reduceLearnRateIfImproveLessThan = 0
loadBestModel = true
increaseLearnRateIfImproveMoreThan = 1000000000
learnRateDecreaseFactor = 0.5
learnRateIncreaseFactor = 1.382
autoAdjustLR = "adjustAfterEpoch"
]
clippingThresholdPerSample = 1#INF
]
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "glob_0000.scp"
]
labels = [
mlfFile = "E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data/glob_0000.mlf"
labelMappingFile = "E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data/state.list"
labelDim = 132
labelType = "category"
]
]
]
configparameters: cntk.cntk:write=[
action = write
modelPath = "C:\cygwin64\tmp\cntk-test-20160317224207.69144\Speech\DNN_WriteCommand@debug_cpu/models/cntkSpeech.dnn"
outputNodeNames=ScaledLogLikelihood
deviceId = -1
traceLevel = 1
useValidation=true
printValues=true
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "glob_0000.write.scp"
]
]
outputPath = "C:\cygwin64\tmp\cntk-test-20160317224207.69144\Speech\DNN_WriteCommand@debug_cpu/Output"
]
<<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
Commands: speechTrain write
Precision = "float"
CNTKModelPath: C:\cygwin64\tmp\cntk-test-20160317224207.69144\Speech\DNN_WriteCommand@debug_cpu/models/cntkSpeech.dnn
CNTKCommandTrainInfo: speechTrain : 3
CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3
##############################################################################
# #
# Action "train" #
# #
##############################################################################
CNTKCommandTrainBegin: speechTrain
SimpleNetworkBuilder Using CPU
reading script file glob_0000.scp ... 948 entries
total 132 state names in state list E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data/state.list
htkmlfreader: reading MLF file E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data/glob_0000.mlf ... total 948 entries
...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
label set 0: 129 classes
minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
Post-processing network...
7 roots:
CrossEntropyWithSoftmax = CrossEntropyWithSoftmax
EvalErrorPrediction = ErrorPrediction
InvStdOfFeatures = InvStdDev
MeanOfFeatures = Mean
PosteriorProb = Softmax
Prior = Mean
ScaledLogLikelihood = Minus
FormNestedNetwork: WARNING: Was called twice for CrossEntropyWithSoftmax CrossEntropyWithSoftmax operation
FormNestedNetwork: WARNING: Was called twice for EvalErrorPrediction ErrorPrediction operation
FormNestedNetwork: WARNING: Was called twice for InvStdOfFeatures InvStdDev operation
FormNestedNetwork: WARNING: Was called twice for MeanOfFeatures Mean operation
FormNestedNetwork: WARNING: Was called twice for PosteriorProb Softmax operation
FormNestedNetwork: WARNING: Was called twice for Prior Mean operation
FormNestedNetwork: WARNING: Was called twice for ScaledLogLikelihood Minus operation
Validating network. 25 nodes to process in pass 1.
Validating --> labels = InputValue -> [132 {1} x *]
Validating --> W2 = LearnableParameter -> [132 x 512 {1,132}]
Validating --> W1 = LearnableParameter -> [512 x 512 {1,512}]
Validating --> W0 = LearnableParameter -> [512 x 363 {1,512}]
Validating --> features = InputValue -> [363 {1} x *]
Validating --> MeanOfFeatures = Mean(features[363 {1} x *]) -> [363 {1}]
Validating --> InvStdOfFeatures = InvStdDev(features[363 {1} x *]) -> [363 {1}]
Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 {1} x *], MeanOfFeatures[363 {1}], InvStdOfFeatures[363 {1}]) -> [363 {1} x *]
Validating --> W0*features = Times(W0[512 x 363 {1,512}], MVNormalizedFeatures[363 {1} x *]) -> [512 {1} x *]
Validating --> B0 = LearnableParameter -> [512 x 1 {1,512}]
Validating --> W0*features+B0 = Plus(W0*features[512 {1} x *], B0[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *]
Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> W1*H1 = Times(W1[512 x 512 {1,512}], H1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> B1 = LearnableParameter -> [512 x 1 {1,512}]
Validating --> W1*H1+B1 = Plus(W1*H1[512 x 1 {1,512} x *], B1[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *]
Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> W2*H1 = Times(W2[132 x 512 {1,132}], H2[512 x 1 {1,512} x *]) -> [132 x 1 {1,132} x *]
Validating --> B2 = LearnableParameter -> [132 x 1 {1,132}]
Validating --> HLast = Plus(W2*H1[132 x 1 {1,132} x *], B2[132 x 1 {1,132}]) -> [132 x 1 {1,132} x *]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}]
Validating --> EvalErrorPrediction = ErrorPrediction(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}]
Validating --> PosteriorProb = Softmax(HLast[132 x 1 {1,132} x *]) -> [132 x 1 {1,132} x *]
Validating --> Prior = Mean(labels[132 {1} x *]) -> [132 {1}]
Validating --> LogOfPrior = Log(Prior[132 {1}]) -> [132 {1}]
Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 {1,132} x *], LogOfPrior[132 {1}]) -> [132 x 1 {1,132} x *]
Validating network. 17 nodes to process in pass 2.
Validating --> labels = InputValue -> [132 {1} x *]
Validating --> W2 = LearnableParameter -> [132 x 512 {1,132}]
Validating --> W1 = LearnableParameter -> [512 x 512 {1,512}]
Validating --> W0 = LearnableParameter -> [512 x 363 {1,512}]
Validating --> features = InputValue -> [363 {1} x *]
Validating --> MeanOfFeatures = Mean(features[363 {1} x *]) -> [363 {1}]
Validating --> InvStdOfFeatures = InvStdDev(features[363 {1} x *]) -> [363 {1}]
Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 {1} x *], MeanOfFeatures[363 {1}], InvStdOfFeatures[363 {1}]) -> [363 {1} x *]
Validating --> W0*features = Times(W0[512 x 363 {1,512}], MVNormalizedFeatures[363 {1} x *]) -> [512 {1} x *]
Validating --> B0 = LearnableParameter -> [512 x 1 {1,512}]
Validating --> W0*features+B0 = Plus(W0*features[512 {1} x *], B0[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *]
Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> W1*H1 = Times(W1[512 x 512 {1,512}], H1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> B1 = LearnableParameter -> [512 x 1 {1,512}]
Validating --> W1*H1+B1 = Plus(W1*H1[512 x 1 {1,512} x *], B1[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *]
Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> W2*H1 = Times(W2[132 x 512 {1,132}], H2[512 x 1 {1,512} x *]) -> [132 x 1 {1,132} x *]
Validating --> B2 = LearnableParameter -> [132 x 1 {1,132}]
Validating --> HLast = Plus(W2*H1[132 x 1 {1,132} x *], B2[132 x 1 {1,132}]) -> [132 x 1 {1,132} x *]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}]
Validating --> EvalErrorPrediction = ErrorPrediction(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}]
Validating --> PosteriorProb = Softmax(HLast[132 x 1 {1,132} x *]) -> [132 x 1 {1,132} x *]
Validating --> Prior = Mean(labels[132 {1} x *]) -> [132 {1}]
Validating --> LogOfPrior = Log(Prior[132 {1}]) -> [132 {1}]
Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 {1,132} x *], LogOfPrior[132 {1}]) -> [132 x 1 {1,132} x *]
Validating network, final pass.
Validating --> labels = InputValue -> [132 {1} x *]
Validating --> W2 = LearnableParameter -> [132 x 512 {1,132}]
Validating --> W1 = LearnableParameter -> [512 x 512 {1,512}]
Validating --> W0 = LearnableParameter -> [512 x 363 {1,512}]
Validating --> features = InputValue -> [363 {1} x *]
Validating --> MeanOfFeatures = Mean(features[363 {1} x *]) -> [363 {1}]
Validating --> InvStdOfFeatures = InvStdDev(features[363 {1} x *]) -> [363 {1}]
Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 {1} x *], MeanOfFeatures[363 {1}], InvStdOfFeatures[363 {1}]) -> [363 {1} x *]
Validating --> W0*features = Times(W0[512 x 363 {1,512}], MVNormalizedFeatures[363 {1} x *]) -> [512 {1} x *]
Validating --> B0 = LearnableParameter -> [512 x 1 {1,512}]
Validating --> W0*features+B0 = Plus(W0*features[512 {1} x *], B0[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *]
Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> W1*H1 = Times(W1[512 x 512 {1,512}], H1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> B1 = LearnableParameter -> [512 x 1 {1,512}]
Validating --> W1*H1+B1 = Plus(W1*H1[512 x 1 {1,512} x *], B1[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *]
Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> W2*H1 = Times(W2[132 x 512 {1,132}], H2[512 x 1 {1,512} x *]) -> [132 x 1 {1,132} x *]
Validating --> B2 = LearnableParameter -> [132 x 1 {1,132}]
Validating --> HLast = Plus(W2*H1[132 x 1 {1,132} x *], B2[132 x 1 {1,132}]) -> [132 x 1 {1,132} x *]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}]
Validating --> EvalErrorPrediction = ErrorPrediction(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}]
Validating --> PosteriorProb = Softmax(HLast[132 x 1 {1,132} x *]) -> [132 x 1 {1,132} x *]
Validating --> Prior = Mean(labels[132 {1} x *]) -> [132 {1}]
Validating --> LogOfPrior = Log(Prior[132 {1}]) -> [132 {1}]
Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 {1,132} x *], LogOfPrior[132 {1}]) -> [132 x 1 {1,132} x *]
12 out of 25 nodes do not share the minibatch layout with the input data.
Post-processing network complete.
SGD using CPU.
Training criterion node(s):
CrossEntropyWithSoftmax = CrossEntropyWithSoftmax
Evaluation criterion node(s):
EvalErrorPrediction = ErrorPrediction
Allocating matrices for forward and/or backward propagation.
Precomputing --> 3 PreCompute nodes found.
NodeName: MeanOfFeatures
NodeName: InvStdOfFeatures
NodeName: Prior
minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
Precomputing --> Completed.
Starting Epoch 1: learning rate per sample = 0.015625 effective momentum = 0.900000 momentum as time constant = 607.4 samples
minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
Starting minibatch loop.
Epoch[ 1 of 3]-Minibatch[ 1- 10, 3.13%]: SamplesSeen = 640; TrainLossPerSample = 4.46944885; EvalErr[0]PerSample = 0.90781250; TotalTime = 8.0863s; SamplesPerSecond = 79.1
Epoch[ 1 of 3]-Minibatch[ 11- 20, 6.25%]: SamplesSeen = 640; TrainLossPerSample = 4.22300034; EvalErr[0]PerSample = 0.90156250; TotalTime = 8.3533s; SamplesPerSecond = 76.6
Epoch[ 1 of 3]-Minibatch[ 21- 30, 9.38%]: SamplesSeen = 640; TrainLossPerSample = 3.93971329; EvalErr[0]PerSample = 0.84687500; TotalTime = 7.7989s; SamplesPerSecond = 82.1
Epoch[ 1 of 3]-Minibatch[ 31- 40, 12.50%]: SamplesSeen = 640; TrainLossPerSample = 3.92341614; EvalErr[0]PerSample = 0.90468750; TotalTime = 7.4403s; SamplesPerSecond = 86.0
Epoch[ 1 of 3]-Minibatch[ 41- 50, 15.63%]: SamplesSeen = 640; TrainLossPerSample = 3.84074249; EvalErr[0]PerSample = 0.91093750; TotalTime = 8.4981s; SamplesPerSecond = 75.3
Epoch[ 1 of 3]-Minibatch[ 51- 60, 18.75%]: SamplesSeen = 640; TrainLossPerSample = 3.71251984; EvalErr[0]PerSample = 0.88437500; TotalTime = 8.1020s; SamplesPerSecond = 79.0
Epoch[ 1 of 3]-Minibatch[ 61- 70, 21.88%]: SamplesSeen = 640; TrainLossPerSample = 3.51563110; EvalErr[0]PerSample = 0.82500000; TotalTime = 7.5585s; SamplesPerSecond = 84.7
Epoch[ 1 of 3]-Minibatch[ 71- 80, 25.00%]: SamplesSeen = 640; TrainLossPerSample = 3.49348755; EvalErr[0]PerSample = 0.81093750; TotalTime = 7.4555s; SamplesPerSecond = 85.8
Epoch[ 1 of 3]-Minibatch[ 81- 90, 28.13%]: SamplesSeen = 640; TrainLossPerSample = 3.34739685; EvalErr[0]PerSample = 0.76562500; TotalTime = 7.8433s; SamplesPerSecond = 81.6
Epoch[ 1 of 3]-Minibatch[ 91- 100, 31.25%]: SamplesSeen = 640; TrainLossPerSample = 3.51961060; EvalErr[0]PerSample = 0.79843750; TotalTime = 8.2672s; SamplesPerSecond = 77.4
Epoch[ 1 of 3]-Minibatch[ 101- 110, 34.38%]: SamplesSeen = 640; TrainLossPerSample = 3.24656067; EvalErr[0]PerSample = 0.80312500; TotalTime = 7.4795s; SamplesPerSecond = 85.6
Epoch[ 1 of 3]-Minibatch[ 111- 120, 37.50%]: SamplesSeen = 640; TrainLossPerSample = 3.33397217; EvalErr[0]PerSample = 0.80000000; TotalTime = 7.7651s; SamplesPerSecond = 82.4
Epoch[ 1 of 3]-Minibatch[ 121- 130, 40.63%]: SamplesSeen = 640; TrainLossPerSample = 3.17780457; EvalErr[0]PerSample = 0.77031250; TotalTime = 7.8670s; SamplesPerSecond = 81.4
Epoch[ 1 of 3]-Minibatch[ 131- 140, 43.75%]: SamplesSeen = 640; TrainLossPerSample = 3.09845886; EvalErr[0]PerSample = 0.76875000; TotalTime = 7.8823s; SamplesPerSecond = 81.2
Epoch[ 1 of 3]-Minibatch[ 141- 150, 46.88%]: SamplesSeen = 640; TrainLossPerSample = 3.06457214; EvalErr[0]PerSample = 0.72968750; TotalTime = 8.7182s; SamplesPerSecond = 73.4
Epoch[ 1 of 3]-Minibatch[ 151- 160, 50.00%]: SamplesSeen = 640; TrainLossPerSample = 2.91632080; EvalErr[0]PerSample = 0.69531250; TotalTime = 7.7545s; SamplesPerSecond = 82.5
Epoch[ 1 of 3]-Minibatch[ 161- 170, 53.13%]: SamplesSeen = 640; TrainLossPerSample = 2.90608521; EvalErr[0]PerSample = 0.73281250; TotalTime = 7.9192s; SamplesPerSecond = 80.8
Epoch[ 1 of 3]-Minibatch[ 171- 180, 56.25%]: SamplesSeen = 640; TrainLossPerSample = 2.74095459; EvalErr[0]PerSample = 0.65937500; TotalTime = 7.6856s; SamplesPerSecond = 83.3
Epoch[ 1 of 3]-Minibatch[ 181- 190, 59.38%]: SamplesSeen = 640; TrainLossPerSample = 2.67088013; EvalErr[0]PerSample = 0.67343750; TotalTime = 7.9971s; SamplesPerSecond = 80.0
Epoch[ 1 of 3]-Minibatch[ 191- 200, 62.50%]: SamplesSeen = 640; TrainLossPerSample = 2.67608643; EvalErr[0]PerSample = 0.66406250; TotalTime = 8.2668s; SamplesPerSecond = 77.4
Epoch[ 1 of 3]-Minibatch[ 201- 210, 65.63%]: SamplesSeen = 640; TrainLossPerSample = 2.54733276; EvalErr[0]PerSample = 0.62968750; TotalTime = 7.6795s; SamplesPerSecond = 83.3
Epoch[ 1 of 3]-Minibatch[ 211- 220, 68.75%]: SamplesSeen = 640; TrainLossPerSample = 2.61925659; EvalErr[0]PerSample = 0.67343750; TotalTime = 8.1318s; SamplesPerSecond = 78.7
Epoch[ 1 of 3]-Minibatch[ 221- 230, 71.88%]: SamplesSeen = 640; TrainLossPerSample = 2.52387695; EvalErr[0]PerSample = 0.65781250; TotalTime = 8.0287s; SamplesPerSecond = 79.7
Epoch[ 1 of 3]-Minibatch[ 231- 240, 75.00%]: SamplesSeen = 640; TrainLossPerSample = 2.47543945; EvalErr[0]PerSample = 0.63437500; TotalTime = 8.1587s; SamplesPerSecond = 78.4
Epoch[ 1 of 3]-Minibatch[ 241- 250, 78.13%]: SamplesSeen = 640; TrainLossPerSample = 2.43265381; EvalErr[0]PerSample = 0.61406250; TotalTime = 7.5755s; SamplesPerSecond = 84.5
Epoch[ 1 of 3]-Minibatch[ 251- 260, 81.25%]: SamplesSeen = 640; TrainLossPerSample = 2.41727905; EvalErr[0]PerSample = 0.63125000; TotalTime = 7.7612s; SamplesPerSecond = 82.5
Epoch[ 1 of 3]-Minibatch[ 261- 270, 84.38%]: SamplesSeen = 640; TrainLossPerSample = 2.17673950; EvalErr[0]PerSample = 0.57812500; TotalTime = 7.9344s; SamplesPerSecond = 80.7
Epoch[ 1 of 3]-Minibatch[ 271- 280, 87.50%]: SamplesSeen = 640; TrainLossPerSample = 2.31020508; EvalErr[0]PerSample = 0.64062500; TotalTime = 7.7126s; SamplesPerSecond = 83.0
Epoch[ 1 of 3]-Minibatch[ 281- 290, 90.63%]: SamplesSeen = 640; TrainLossPerSample = 2.26400757; EvalErr[0]PerSample = 0.61093750; TotalTime = 7.5957s; SamplesPerSecond = 84.3
Epoch[ 1 of 3]-Minibatch[ 291- 300, 93.75%]: SamplesSeen = 640; TrainLossPerSample = 2.15885010; EvalErr[0]PerSample = 0.58281250; TotalTime = 7.7268s; SamplesPerSecond = 82.8
Epoch[ 1 of 3]-Minibatch[ 301- 310, 96.88%]: SamplesSeen = 640; TrainLossPerSample = 2.22711792; EvalErr[0]PerSample = 0.59218750; TotalTime = 8.4360s; SamplesPerSecond = 75.9
Epoch[ 1 of 3]-Minibatch[ 311- 320, 100.00%]: SamplesSeen = 640; TrainLossPerSample = 2.25604858; EvalErr[0]PerSample = 0.60625000; TotalTime = 8.2430s; SamplesPerSecond = 77.6
Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.007046; TotalSamplesSeen = 20480; EvalErrPerSample = 0.72827148; AvgLearningRatePerSample = 0.015625; EpochTime=253.747
SGD: Saving checkpoint model 'C:\cygwin64\tmp\cntk-test-20160317224207.69144\Speech\DNN_WriteCommand@debug_cpu/models/cntkSpeech.dnn.1'
Starting Epoch 2: learning rate per sample = 0.001953 effective momentum = 0.656119 momentum as time constant = 607.5 samples
minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480), data subset 0 of 1, with 1 datapasses
Starting minibatch loop.
Epoch[ 2 of 3]-Minibatch[ 1- 10, 12.50%]: SamplesSeen = 2560; TrainLossPerSample = 2.10257511; EvalErr[0]PerSample = 0.56484375; TotalTime = 14.7861s; SamplesPerSecond = 173.1
Epoch[ 2 of 3]-Minibatch[ 11- 20, 25.00%]: SamplesSeen = 2560; TrainLossPerSample = 2.00548573; EvalErr[0]PerSample = 0.54843750; TotalTime = 15.4207s; SamplesPerSecond = 166.0
Epoch[ 2 of 3]-Minibatch[ 21- 30, 37.50%]: SamplesSeen = 2560; TrainLossPerSample = 2.00766983; EvalErr[0]PerSample = 0.54960937; TotalTime = 14.6152s; SamplesPerSecond = 175.2
Epoch[ 2 of 3]-Minibatch[ 31- 40, 50.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.92049370; EvalErr[0]PerSample = 0.53281250; TotalTime = 16.0739s; SamplesPerSecond = 159.3
Epoch[ 2 of 3]-Minibatch[ 41- 50, 62.50%]: SamplesSeen = 2560; TrainLossPerSample = 1.90178452; EvalErr[0]PerSample = 0.52265625; TotalTime = 15.4519s; SamplesPerSecond = 165.7
Epoch[ 2 of 3]-Minibatch[ 51- 60, 75.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.91359482; EvalErr[0]PerSample = 0.53984375; TotalTime = 15.0020s; SamplesPerSecond = 170.6
Epoch[ 2 of 3]-Minibatch[ 61- 70, 87.50%]: SamplesSeen = 2560; TrainLossPerSample = 1.91765289; EvalErr[0]PerSample = 0.53125000; TotalTime = 14.9498s; SamplesPerSecond = 171.2
Epoch[ 2 of 3]-Minibatch[ 71- 80, 100.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.87682800; EvalErr[0]PerSample = 0.52890625; TotalTime = 14.7959s; SamplesPerSecond = 173.0
Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9557606; TotalSamplesSeen = 40960; EvalErrPerSample = 0.53979492; AvgLearningRatePerSample = 0.001953125; EpochTime=121.124
SGD: Saving checkpoint model 'C:\cygwin64\tmp\cntk-test-20160317224207.69144\Speech\DNN_WriteCommand@debug_cpu/models/cntkSpeech.dnn.2'
Starting Epoch 3: learning rate per sample = 0.000098 effective momentum = 0.656119 momentum as time constant = 2429.9 samples
minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960), data subset 0 of 1, with 1 datapasses
Starting minibatch loop.
Epoch[ 3 of 3]-Minibatch[ 1- 10, 50.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.88593941; EvalErr[0]PerSample = 0.52529297; TotalTime = 42.7797s; SamplesPerSecond = 239.4
Epoch[ 3 of 3]-Minibatch[ 11- 20, 100.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.89384575; EvalErr[0]PerSample = 0.51816406; TotalTime = 45.3675s; SamplesPerSecond = 225.7
Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8898926; TotalSamplesSeen = 61440; EvalErrPerSample = 0.52172852; AvgLearningRatePerSample = 9.7656251e-005; EpochTime=88.2535
SGD: Saving checkpoint model 'C:\cygwin64\tmp\cntk-test-20160317224207.69144\Speech\DNN_WriteCommand@debug_cpu/models/cntkSpeech.dnn'
CNTKCommandTrainEnd: speechTrain
Action "train" complete.
##############################################################################
# #
# Action "write" #
# #
##############################################################################
reading script file glob_0000.write.scp ... 10 entries
Post-processing network...
7 roots:
CrossEntropyWithSoftmax = CrossEntropyWithSoftmax
EvalErrorPrediction = ErrorPrediction
InvStdOfFeatures = InvStdDev
MeanOfFeatures = Mean
PosteriorProb = Softmax
Prior = Mean
ScaledLogLikelihood = Minus
FormNestedNetwork: WARNING: Was called twice for CrossEntropyWithSoftmax CrossEntropyWithSoftmax operation
FormNestedNetwork: WARNING: Was called twice for EvalErrorPrediction ErrorPrediction operation
FormNestedNetwork: WARNING: Was called twice for InvStdOfFeatures InvStdDev operation
FormNestedNetwork: WARNING: Was called twice for MeanOfFeatures Mean operation
FormNestedNetwork: WARNING: Was called twice for PosteriorProb Softmax operation
FormNestedNetwork: WARNING: Was called twice for Prior Mean operation
FormNestedNetwork: WARNING: Was called twice for ScaledLogLikelihood Minus operation
Validating network. 25 nodes to process in pass 1.
Validating --> labels = InputValue -> [132 {1} x *]
Validating --> W2 = LearnableParameter -> [132 x 512 {1,132}]
Validating --> W1 = LearnableParameter -> [512 x 512 {1,512}]
Validating --> W0 = LearnableParameter -> [512 x 363 {1,512}]
Validating --> features = InputValue -> [363 {1} x *]
Validating --> MeanOfFeatures = Mean(features[363 {1} x *]) -> [363 {1}]
Validating --> InvStdOfFeatures = InvStdDev(features[363 {1} x *]) -> [363 {1}]
Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 {1} x *], MeanOfFeatures[363 {1}], InvStdOfFeatures[363 {1}]) -> [363 {1} x *]
Validating --> W0*features = Times(W0[512 x 363 {1,512}], MVNormalizedFeatures[363 {1} x *]) -> [512 {1} x *]
Validating --> B0 = LearnableParameter -> [512 x 1 {1,512}]
Validating --> W0*features+B0 = Plus(W0*features[512 {1} x *], B0[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *]
Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> W1*H1 = Times(W1[512 x 512 {1,512}], H1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> B1 = LearnableParameter -> [512 x 1 {1,512}]
Validating --> W1*H1+B1 = Plus(W1*H1[512 x 1 {1,512} x *], B1[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *]
Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> W2*H1 = Times(W2[132 x 512 {1,132}], H2[512 x 1 {1,512} x *]) -> [132 x 1 {1,132} x *]
Validating --> B2 = LearnableParameter -> [132 x 1 {1,132}]
Validating --> HLast = Plus(W2*H1[132 x 1 {1,132} x *], B2[132 x 1 {1,132}]) -> [132 x 1 {1,132} x *]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}]
Validating --> EvalErrorPrediction = ErrorPrediction(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}]
Validating --> PosteriorProb = Softmax(HLast[132 x 1 {1,132} x *]) -> [132 x 1 {1,132} x *]
Validating --> Prior = Mean(labels[132 {1} x *]) -> [132 {1}]
Validating --> LogOfPrior = Log(Prior[132 {1}]) -> [132 {1}]
Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 {1,132} x *], LogOfPrior[132 {1}]) -> [132 x 1 {1,132} x *]
Validating network. 17 nodes to process in pass 2.
Validating --> labels = InputValue -> [132 {1} x *]
Validating --> W2 = LearnableParameter -> [132 x 512 {1,132}]
Validating --> W1 = LearnableParameter -> [512 x 512 {1,512}]
Validating --> W0 = LearnableParameter -> [512 x 363 {1,512}]
Validating --> features = InputValue -> [363 {1} x *]
Validating --> MeanOfFeatures = Mean(features[363 {1} x *]) -> [363 {1}]
Validating --> InvStdOfFeatures = InvStdDev(features[363 {1} x *]) -> [363 {1}]
Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 {1} x *], MeanOfFeatures[363 {1}], InvStdOfFeatures[363 {1}]) -> [363 {1} x *]
Validating --> W0*features = Times(W0[512 x 363 {1,512}], MVNormalizedFeatures[363 {1} x *]) -> [512 {1} x *]
Validating --> B0 = LearnableParameter -> [512 x 1 {1,512}]
Validating --> W0*features+B0 = Plus(W0*features[512 {1} x *], B0[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *]
Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> W1*H1 = Times(W1[512 x 512 {1,512}], H1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> B1 = LearnableParameter -> [512 x 1 {1,512}]
Validating --> W1*H1+B1 = Plus(W1*H1[512 x 1 {1,512} x *], B1[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *]
Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> W2*H1 = Times(W2[132 x 512 {1,132}], H2[512 x 1 {1,512} x *]) -> [132 x 1 {1,132} x *]
Validating --> B2 = LearnableParameter -> [132 x 1 {1,132}]
Validating --> HLast = Plus(W2*H1[132 x 1 {1,132} x *], B2[132 x 1 {1,132}]) -> [132 x 1 {1,132} x *]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}]
Validating --> EvalErrorPrediction = ErrorPrediction(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}]
Validating --> PosteriorProb = Softmax(HLast[132 x 1 {1,132} x *]) -> [132 x 1 {1,132} x *]
Validating --> Prior = Mean(labels[132 {1} x *]) -> [132 {1}]
Validating --> LogOfPrior = Log(Prior[132 {1}]) -> [132 {1}]
Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 {1,132} x *], LogOfPrior[132 {1}]) -> [132 x 1 {1,132} x *]
Validating network, final pass.
Validating --> labels = InputValue -> [132 {1} x *]
Validating --> W2 = LearnableParameter -> [132 x 512 {1,132}]
Validating --> W1 = LearnableParameter -> [512 x 512 {1,512}]
Validating --> W0 = LearnableParameter -> [512 x 363 {1,512}]
Validating --> features = InputValue -> [363 {1} x *]
Validating --> MeanOfFeatures = Mean(features[363 {1} x *]) -> [363 {1}]
Validating --> InvStdOfFeatures = InvStdDev(features[363 {1} x *]) -> [363 {1}]
Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 {1} x *], MeanOfFeatures[363 {1}], InvStdOfFeatures[363 {1}]) -> [363 {1} x *]
Validating --> W0*features = Times(W0[512 x 363 {1,512}], MVNormalizedFeatures[363 {1} x *]) -> [512 {1} x *]
Validating --> B0 = LearnableParameter -> [512 x 1 {1,512}]
Validating --> W0*features+B0 = Plus(W0*features[512 {1} x *], B0[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *]
Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> W1*H1 = Times(W1[512 x 512 {1,512}], H1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> B1 = LearnableParameter -> [512 x 1 {1,512}]
Validating --> W1*H1+B1 = Plus(W1*H1[512 x 1 {1,512} x *], B1[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *]
Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> W2*H1 = Times(W2[132 x 512 {1,132}], H2[512 x 1 {1,512} x *]) -> [132 x 1 {1,132} x *]
Validating --> B2 = LearnableParameter -> [132 x 1 {1,132}]
Validating --> HLast = Plus(W2*H1[132 x 1 {1,132} x *], B2[132 x 1 {1,132}]) -> [132 x 1 {1,132} x *]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}]
Validating --> EvalErrorPrediction = ErrorPrediction(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}]
Validating --> PosteriorProb = Softmax(HLast[132 x 1 {1,132} x *]) -> [132 x 1 {1,132} x *]
Validating --> Prior = Mean(labels[132 {1} x *]) -> [132 {1}]
Validating --> LogOfPrior = Log(Prior[132 {1}]) -> [132 {1}]
Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 {1,132} x *], LogOfPrior[132 {1}]) -> [132 x 1 {1,132} x *]
12 out of 25 nodes do not share the minibatch layout with the input data.
Post-processing network complete.
Allocating matrices for forward and/or backward propagation.
evaluate: reading 368 frames of An4/71/71/cen5-fjam-b.mfc
Minibatch[1]: ActualMBSize = 368
evaluate: reading 438 frames of An4/213/213/cen4-fsaf2-b.mfc
Minibatch[2]: ActualMBSize = 438
evaluate: reading 368 frames of An4/513/513/cen7-mgah-b.mfc
Minibatch[3]: ActualMBSize = 368
evaluate: reading 248 frames of An4/614/614/cen7-mkdb-b.mfc
Minibatch[4]: ActualMBSize = 248
evaluate: reading 248 frames of An4/507/507/cen1-mgah-b.mfc
Minibatch[5]: ActualMBSize = 248
evaluate: reading 358 frames of An4/693/693/cen8-mmkw-b.mfc
Minibatch[6]: ActualMBSize = 358
evaluate: reading 308 frames of An4/918/918/cen4-mtos-b.mfc
Minibatch[7]: ActualMBSize = 308
evaluate: reading 608 frames of An4/477/477/an257-mewl-b.mfc
Minibatch[8]: ActualMBSize = 608
evaluate: reading 78 frames of An4/454/454/an70-meht-b.mfc
Minibatch[9]: ActualMBSize = 78
evaluate: reading 228 frames of An4/254/254/cen6-ftmj-b.mfc
Minibatch[10]: ActualMBSize = 228
Written to C:\cygwin64\tmp\cntk-test-20160317224207.69144\Speech\DNN_WriteCommand@debug_cpu/Output*
Total Samples Evaluated = 3250
Action "write" complete.
COMPLETED
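(A note on the ScaledLogLikelihood node emitted by the write action above: per the validation trace it is computed as Minus(HLast, LogOfPrior), with Prior = Mean(labels) and LogOfPrior = Log(Prior). The usual hybrid DNN-HMM reading of that graph, offered here as an interpretation rather than something stated in the log, is

    ScaledLogLikelihood = HLast - log(Prior)
    log p(x|s)  ∝  log p(s|x) - log p(s)

that is, the class posterior is divided by the class prior to obtain a likelihood scaled by a per-frame constant. HLast differs from log p(s|x) only by the softmax normalizer, which is identical for every state within a frame and therefore does not affect decoding.)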

View file

@ -0,0 +1,668 @@
=== Running /cygdrive/e/NetScale/CNTK/git_repos/git_master/x64/debug/cntk.exe configFile=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\DNN\WriteCommand/cntk.cntk currentDirectory=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data RunDir=C:\cygwin64\tmp\cntk-test-20160317213428.977155\Speech\DNN_WriteCommand@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data ConfigDir=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\DNN\WriteCommand OutputDir=C:\cygwin64\tmp\cntk-test-20160317213428.977155\Speech\DNN_WriteCommand@debug_gpu DeviceId=0 shareNodeValueMatrices=true
-------------------------------------------------------------------
Build info:
Built time: Mar 17 2016 16:58:29
Last modified date: Thu Mar 17 16:54:52 2016
Build type: Debug
Build target: GPU
With 1bit-SGD: yes
CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0
CUB_PATH: C:\cub-1.4.1
CUDNN_PATH: C:\cudnn-4.0
Built by amitaga on Amitaga-Win-DT3
Build Path: E:\NetScale\CNTK\git_repos\git_master\Source\CNTK\
-------------------------------------------------------------------
Changed current directory to 'E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data'
-------------------------------------------------------------------
Build info:
Built time: Mar 17 2016 16:58:29
Last modified date: Thu Mar 17 16:54:52 2016
Build type: Debug
Build target: GPU
With 1bit-SGD: yes
CUDA_PATH: C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v7.0
CUB_PATH: C:\cub-1.4.1
CUDNN_PATH: C:\cudnn-4.0
Built by amitaga on Amitaga-Win-DT3
Build Path: E:\NetScale\CNTK\git_repos\git_master\Source\CNTK\
-------------------------------------------------------------------
running on Amitaga-Win-DT3 at 2016/03/18 05:34:30
command line:
E:\NetScale\CNTK\git_repos\git_master\x64\debug\cntk.exe configFile=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\DNN\WriteCommand/cntk.cntk currentDirectory=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data RunDir=C:\cygwin64\tmp\cntk-test-20160317213428.977155\Speech\DNN_WriteCommand@debug_gpu DataDir=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data ConfigDir=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\DNN\WriteCommand OutputDir=C:\cygwin64\tmp\cntk-test-20160317213428.977155\Speech\DNN_WriteCommand@debug_gpu DeviceId=0 shareNodeValueMatrices=true
>>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>
precision = "float"
command = speechTrain:write
deviceId = $DeviceId$
parallelTrain = false
makeMode = false
speechTrain = [
action = "train"
modelPath = "$RunDir$/models/cntkSpeech.dnn"
deviceId = $DeviceId$
traceLevel = 1
SimpleNetworkBuilder = [
layerSizes = 363:512:512:132
trainingCriterion = "CrossEntropyWithSoftmax"
evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid"
applyMeanVarNorm = true
initValueScale = 1.0
uniformInit = true
needPrior = true
]
SGD = [
epochSize = 20480
minibatchSize = 64:256:1024
learningRatesPerMB = 1.0:0.5:0.1
numMBsToShowResult = 10
momentumPerMB = 0.9:0.656119
dropoutRate = 0.0
maxEpochs = 3
keepCheckPointFiles = true
AutoAdjust = [
reduceLearnRateIfImproveLessThan = 0
loadBestModel = true
increaseLearnRateIfImproveMoreThan = 1000000000
learnRateDecreaseFactor = 0.5
learnRateIncreaseFactor = 1.382
autoAdjustLR = "adjustAfterEpoch"
]
clippingThresholdPerSample = 1#INF
]
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "glob_0000.scp"
]
labels = [
mlfFile = "$DataDir$/glob_0000.mlf"
labelMappingFile = "$DataDir$/state.list"
labelDim = 132
labelType = "category"
]
]
]
write = [
action = write
modelPath = "$RunDir$/models/cntkSpeech.dnn"
outputNodeNames=ScaledLogLikelihood
deviceId = $DeviceId$
traceLevel = 1
useValidation=true
printValues=true
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "glob_0000.write.scp"
]
]
outputPath = "$RunDir$/Output"
]
currentDirectory=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data
RunDir=C:\cygwin64\tmp\cntk-test-20160317213428.977155\Speech\DNN_WriteCommand@debug_gpu
DataDir=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data
ConfigDir=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\DNN\WriteCommand
OutputDir=C:\cygwin64\tmp\cntk-test-20160317213428.977155\Speech\DNN_WriteCommand@debug_gpu
DeviceId=0
shareNodeValueMatrices=true
<<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<
>>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
precision = "float"
command = speechTrain:write
deviceId = 0
parallelTrain = false
makeMode = false
speechTrain = [
action = "train"
modelPath = "C:\cygwin64\tmp\cntk-test-20160317213428.977155\Speech\DNN_WriteCommand@debug_gpu/models/cntkSpeech.dnn"
deviceId = 0
traceLevel = 1
SimpleNetworkBuilder = [
layerSizes = 363:512:512:132
trainingCriterion = "CrossEntropyWithSoftmax"
evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid"
applyMeanVarNorm = true
initValueScale = 1.0
uniformInit = true
needPrior = true
]
SGD = [
epochSize = 20480
minibatchSize = 64:256:1024
learningRatesPerMB = 1.0:0.5:0.1
numMBsToShowResult = 10
momentumPerMB = 0.9:0.656119
dropoutRate = 0.0
maxEpochs = 3
keepCheckPointFiles = true
AutoAdjust = [
reduceLearnRateIfImproveLessThan = 0
loadBestModel = true
increaseLearnRateIfImproveMoreThan = 1000000000
learnRateDecreaseFactor = 0.5
learnRateIncreaseFactor = 1.382
autoAdjustLR = "adjustAfterEpoch"
]
clippingThresholdPerSample = 1#INF
]
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "glob_0000.scp"
]
labels = [
mlfFile = "E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data/glob_0000.mlf"
labelMappingFile = "E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data/state.list"
labelDim = 132
labelType = "category"
]
]
]
write = [
action = write
modelPath = "C:\cygwin64\tmp\cntk-test-20160317213428.977155\Speech\DNN_WriteCommand@debug_gpu/models/cntkSpeech.dnn"
outputNodeNames=ScaledLogLikelihood
deviceId = 0
traceLevel = 1
useValidation=true
printValues=true
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "glob_0000.write.scp"
]
]
outputPath = "C:\cygwin64\tmp\cntk-test-20160317213428.977155\Speech\DNN_WriteCommand@debug_gpu/Output"
]
currentDirectory=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data
RunDir=C:\cygwin64\tmp\cntk-test-20160317213428.977155\Speech\DNN_WriteCommand@debug_gpu
DataDir=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data
ConfigDir=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\DNN\WriteCommand
OutputDir=C:\cygwin64\tmp\cntk-test-20160317213428.977155\Speech\DNN_WriteCommand@debug_gpu
DeviceId=0
shareNodeValueMatrices=true
<<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
>>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>
configparameters: cntk.cntk:command=speechTrain:write
configparameters: cntk.cntk:ConfigDir=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\DNN\WriteCommand
configparameters: cntk.cntk:currentDirectory=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data
configparameters: cntk.cntk:DataDir=E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data
configparameters: cntk.cntk:deviceId=0
configparameters: cntk.cntk:makeMode=false
configparameters: cntk.cntk:OutputDir=C:\cygwin64\tmp\cntk-test-20160317213428.977155\Speech\DNN_WriteCommand@debug_gpu
configparameters: cntk.cntk:parallelTrain=false
configparameters: cntk.cntk:precision=float
configparameters: cntk.cntk:RunDir=C:\cygwin64\tmp\cntk-test-20160317213428.977155\Speech\DNN_WriteCommand@debug_gpu
configparameters: cntk.cntk:shareNodeValueMatrices=true
configparameters: cntk.cntk:speechTrain=[
action = "train"
modelPath = "C:\cygwin64\tmp\cntk-test-20160317213428.977155\Speech\DNN_WriteCommand@debug_gpu/models/cntkSpeech.dnn"
deviceId = 0
traceLevel = 1
SimpleNetworkBuilder = [
layerSizes = 363:512:512:132
trainingCriterion = "CrossEntropyWithSoftmax"
evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid"
applyMeanVarNorm = true
initValueScale = 1.0
uniformInit = true
needPrior = true
]
SGD = [
epochSize = 20480
minibatchSize = 64:256:1024
learningRatesPerMB = 1.0:0.5:0.1
numMBsToShowResult = 10
momentumPerMB = 0.9:0.656119
dropoutRate = 0.0
maxEpochs = 3
keepCheckPointFiles = true
AutoAdjust = [
reduceLearnRateIfImproveLessThan = 0
loadBestModel = true
increaseLearnRateIfImproveMoreThan = 1000000000
learnRateDecreaseFactor = 0.5
learnRateIncreaseFactor = 1.382
autoAdjustLR = "adjustAfterEpoch"
]
clippingThresholdPerSample = 1#INF
]
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "glob_0000.scp"
]
labels = [
mlfFile = "E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data/glob_0000.mlf"
labelMappingFile = "E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data/state.list"
labelDim = 132
labelType = "category"
]
]
]
configparameters: cntk.cntk:write=[
action = write
modelPath = "C:\cygwin64\tmp\cntk-test-20160317213428.977155\Speech\DNN_WriteCommand@debug_gpu/models/cntkSpeech.dnn"
outputNodeNames=ScaledLogLikelihood
deviceId = 0
traceLevel = 1
useValidation=true
printValues=true
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "glob_0000.write.scp"
]
]
outputPath = "C:\cygwin64\tmp\cntk-test-20160317213428.977155\Speech\DNN_WriteCommand@debug_gpu/Output"
]
<<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<
Commands: speechTrain write
Precision = "float"
CNTKModelPath: C:\cygwin64\tmp\cntk-test-20160317213428.977155\Speech\DNN_WriteCommand@debug_gpu/models/cntkSpeech.dnn
CNTKCommandTrainInfo: speechTrain : 3
CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : 3
##############################################################################
# #
# Action "train" #
# #
##############################################################################
CNTKCommandTrainBegin: speechTrain
SimpleNetworkBuilder Using GPU 0
reading script file glob_0000.scp ... 948 entries
total 132 state names in state list E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data/state.list
htkmlfreader: reading MLF file E:\NetScale\CNTK\git_repos\git_master\Tests\EndToEndTests\Speech\Data/glob_0000.mlf ... total 948 entries
...............................................................................................feature set 0: 252734 frames in 948 out of 948 utterances
label set 0: 129 classes
minibatchutterancesource: 948 utterances grouped into 3 chunks, av. chunk size: 316.0 utterances, 84244.7 frames
Microsoft::MSR::CNTK::GPUMatrix<ElemType>::SetUniformRandomValue (GPU): creating curand object with seed 1, sizeof(ElemType)==4
Post-processing network...
7 roots:
CrossEntropyWithSoftmax = CrossEntropyWithSoftmax
EvalErrorPrediction = ErrorPrediction
InvStdOfFeatures = InvStdDev
MeanOfFeatures = Mean
PosteriorProb = Softmax
Prior = Mean
ScaledLogLikelihood = Minus
FormNestedNetwork: WARNING: Was called twice for CrossEntropyWithSoftmax CrossEntropyWithSoftmax operation
FormNestedNetwork: WARNING: Was called twice for EvalErrorPrediction ErrorPrediction operation
FormNestedNetwork: WARNING: Was called twice for InvStdOfFeatures InvStdDev operation
FormNestedNetwork: WARNING: Was called twice for MeanOfFeatures Mean operation
FormNestedNetwork: WARNING: Was called twice for PosteriorProb Softmax operation
FormNestedNetwork: WARNING: Was called twice for Prior Mean operation
FormNestedNetwork: WARNING: Was called twice for ScaledLogLikelihood Minus operation
Validating network. 25 nodes to process in pass 1.
Validating --> labels = InputValue -> [132 {1} x *]
Validating --> W2 = LearnableParameter -> [132 x 512 {1,132}]
Validating --> W1 = LearnableParameter -> [512 x 512 {1,512}]
Validating --> W0 = LearnableParameter -> [512 x 363 {1,512}]
Validating --> features = InputValue -> [363 {1} x *]
Validating --> MeanOfFeatures = Mean(features[363 {1} x *]) -> [363 {1}]
Validating --> InvStdOfFeatures = InvStdDev(features[363 {1} x *]) -> [363 {1}]
Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 {1} x *], MeanOfFeatures[363 {1}], InvStdOfFeatures[363 {1}]) -> [363 {1} x *]
Validating --> W0*features = Times(W0[512 x 363 {1,512}], MVNormalizedFeatures[363 {1} x *]) -> [512 {1} x *]
Validating --> B0 = LearnableParameter -> [512 x 1 {1,512}]
Validating --> W0*features+B0 = Plus(W0*features[512 {1} x *], B0[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *]
Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> W1*H1 = Times(W1[512 x 512 {1,512}], H1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> B1 = LearnableParameter -> [512 x 1 {1,512}]
Validating --> W1*H1+B1 = Plus(W1*H1[512 x 1 {1,512} x *], B1[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *]
Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> W2*H1 = Times(W2[132 x 512 {1,132}], H2[512 x 1 {1,512} x *]) -> [132 x 1 {1,132} x *]
Validating --> B2 = LearnableParameter -> [132 x 1 {1,132}]
Validating --> HLast = Plus(W2*H1[132 x 1 {1,132} x *], B2[132 x 1 {1,132}]) -> [132 x 1 {1,132} x *]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}]
Validating --> EvalErrorPrediction = ErrorPrediction(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}]
Validating --> PosteriorProb = Softmax(HLast[132 x 1 {1,132} x *]) -> [132 x 1 {1,132} x *]
Validating --> Prior = Mean(labels[132 {1} x *]) -> [132 {1}]
Validating --> LogOfPrior = Log(Prior[132 {1}]) -> [132 {1}]
Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 {1,132} x *], LogOfPrior[132 {1}]) -> [132 x 1 {1,132} x *]
Validating network. 17 nodes to process in pass 2.
Validating --> labels = InputValue -> [132 {1} x *]
Validating --> W2 = LearnableParameter -> [132 x 512 {1,132}]
Validating --> W1 = LearnableParameter -> [512 x 512 {1,512}]
Validating --> W0 = LearnableParameter -> [512 x 363 {1,512}]
Validating --> features = InputValue -> [363 {1} x *]
Validating --> MeanOfFeatures = Mean(features[363 {1} x *]) -> [363 {1}]
Validating --> InvStdOfFeatures = InvStdDev(features[363 {1} x *]) -> [363 {1}]
Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 {1} x *], MeanOfFeatures[363 {1}], InvStdOfFeatures[363 {1}]) -> [363 {1} x *]
Validating --> W0*features = Times(W0[512 x 363 {1,512}], MVNormalizedFeatures[363 {1} x *]) -> [512 {1} x *]
Validating --> B0 = LearnableParameter -> [512 x 1 {1,512}]
Validating --> W0*features+B0 = Plus(W0*features[512 {1} x *], B0[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *]
Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> W1*H1 = Times(W1[512 x 512 {1,512}], H1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> B1 = LearnableParameter -> [512 x 1 {1,512}]
Validating --> W1*H1+B1 = Plus(W1*H1[512 x 1 {1,512} x *], B1[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *]
Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> W2*H1 = Times(W2[132 x 512 {1,132}], H2[512 x 1 {1,512} x *]) -> [132 x 1 {1,132} x *]
Validating --> B2 = LearnableParameter -> [132 x 1 {1,132}]
Validating --> HLast = Plus(W2*H1[132 x 1 {1,132} x *], B2[132 x 1 {1,132}]) -> [132 x 1 {1,132} x *]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}]
Validating --> EvalErrorPrediction = ErrorPrediction(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}]
Validating --> PosteriorProb = Softmax(HLast[132 x 1 {1,132} x *]) -> [132 x 1 {1,132} x *]
Validating --> Prior = Mean(labels[132 {1} x *]) -> [132 {1}]
Validating --> LogOfPrior = Log(Prior[132 {1}]) -> [132 {1}]
Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 {1,132} x *], LogOfPrior[132 {1}]) -> [132 x 1 {1,132} x *]
Validating network, final pass.
Validating --> labels = InputValue -> [132 {1} x *]
Validating --> W2 = LearnableParameter -> [132 x 512 {1,132}]
Validating --> W1 = LearnableParameter -> [512 x 512 {1,512}]
Validating --> W0 = LearnableParameter -> [512 x 363 {1,512}]
Validating --> features = InputValue -> [363 {1} x *]
Validating --> MeanOfFeatures = Mean(features[363 {1} x *]) -> [363 {1}]
Validating --> InvStdOfFeatures = InvStdDev(features[363 {1} x *]) -> [363 {1}]
Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 {1} x *], MeanOfFeatures[363 {1}], InvStdOfFeatures[363 {1}]) -> [363 {1} x *]
Validating --> W0*features = Times(W0[512 x 363 {1,512}], MVNormalizedFeatures[363 {1} x *]) -> [512 {1} x *]
Validating --> B0 = LearnableParameter -> [512 x 1 {1,512}]
Validating --> W0*features+B0 = Plus(W0*features[512 {1} x *], B0[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *]
Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> W1*H1 = Times(W1[512 x 512 {1,512}], H1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> B1 = LearnableParameter -> [512 x 1 {1,512}]
Validating --> W1*H1+B1 = Plus(W1*H1[512 x 1 {1,512} x *], B1[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *]
Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> W2*H1 = Times(W2[132 x 512 {1,132}], H2[512 x 1 {1,512} x *]) -> [132 x 1 {1,132} x *]
Validating --> B2 = LearnableParameter -> [132 x 1 {1,132}]
Validating --> HLast = Plus(W2*H1[132 x 1 {1,132} x *], B2[132 x 1 {1,132}]) -> [132 x 1 {1,132} x *]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}]
Validating --> EvalErrorPrediction = ErrorPrediction(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}]
Validating --> PosteriorProb = Softmax(HLast[132 x 1 {1,132} x *]) -> [132 x 1 {1,132} x *]
Validating --> Prior = Mean(labels[132 {1} x *]) -> [132 {1}]
Validating --> LogOfPrior = Log(Prior[132 {1}]) -> [132 {1}]
Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 {1,132} x *], LogOfPrior[132 {1}]) -> [132 x 1 {1,132} x *]
12 out of 25 nodes do not share the minibatch layout with the input data.
Post-processing network complete.
SGD using GPU 0.
Training criterion node(s):
CrossEntropyWithSoftmax = CrossEntropyWithSoftmax
Evaluation criterion node(s):
EvalErrorPrediction = ErrorPrediction
Allocating matrices for forward and/or backward propagation.
Precomputing --> 3 PreCompute nodes found.
NodeName: MeanOfFeatures
NodeName: InvStdOfFeatures
NodeName: Prior
minibatchiterator: epoch 0: frames [0..252734] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
requiredata: determined feature kind as 33-dimensional 'USER' with frame shift 10.0 ms
Precomputing --> Completed.
Starting Epoch 1: learning rate per sample = 0.015625 effective momentum = 0.900000 momentum as time constant = 607.4 samples
minibatchiterator: epoch 0: frames [0..20480] (first utterance at frame 0), data subset 0 of 1, with 1 datapasses
Starting minibatch loop.
Epoch[ 1 of 3]-Minibatch[ 1- 10, 3.13%]: SamplesSeen = 640; TrainLossPerSample = 4.45645981; EvalErr[0]PerSample = 0.92500000; TotalTime = 0.2163s; SamplesPerSecond = 2959.5
Epoch[ 1 of 3]-Minibatch[ 11- 20, 6.25%]: SamplesSeen = 640; TrainLossPerSample = 4.22315750; EvalErr[0]PerSample = 0.90156250; TotalTime = 0.1989s; SamplesPerSecond = 3217.6
Epoch[ 1 of 3]-Minibatch[ 21- 30, 9.38%]: SamplesSeen = 640; TrainLossPerSample = 3.95180664; EvalErr[0]PerSample = 0.84687500; TotalTime = 0.1898s; SamplesPerSecond = 3372.6
Epoch[ 1 of 3]-Minibatch[ 31- 40, 12.50%]: SamplesSeen = 640; TrainLossPerSample = 3.94158020; EvalErr[0]PerSample = 0.89843750; TotalTime = 0.1855s; SamplesPerSecond = 3449.9
Epoch[ 1 of 3]-Minibatch[ 41- 50, 15.63%]: SamplesSeen = 640; TrainLossPerSample = 3.85668945; EvalErr[0]PerSample = 0.91093750; TotalTime = 0.1794s; SamplesPerSecond = 3566.9
Epoch[ 1 of 3]-Minibatch[ 51- 60, 18.75%]: SamplesSeen = 640; TrainLossPerSample = 3.72866364; EvalErr[0]PerSample = 0.89531250; TotalTime = 0.1738s; SamplesPerSecond = 3681.4
Epoch[ 1 of 3]-Minibatch[ 61- 70, 21.88%]: SamplesSeen = 640; TrainLossPerSample = 3.51809235; EvalErr[0]PerSample = 0.82968750; TotalTime = 0.1703s; SamplesPerSecond = 3759.0
Epoch[ 1 of 3]-Minibatch[ 71- 80, 25.00%]: SamplesSeen = 640; TrainLossPerSample = 3.48455200; EvalErr[0]PerSample = 0.80781250; TotalTime = 0.1645s; SamplesPerSecond = 3890.6
Epoch[ 1 of 3]-Minibatch[ 81- 90, 28.13%]: SamplesSeen = 640; TrainLossPerSample = 3.33829346; EvalErr[0]PerSample = 0.76875000; TotalTime = 0.1630s; SamplesPerSecond = 3927.2
Epoch[ 1 of 3]-Minibatch[ 91- 100, 31.25%]: SamplesSeen = 640; TrainLossPerSample = 3.50167236; EvalErr[0]PerSample = 0.79843750; TotalTime = 0.1567s; SamplesPerSecond = 4083.6
WARNING: The same matrix with dim [1, 1] has been transferred between different devices for 20 times.
Epoch[ 1 of 3]-Minibatch[ 101- 110, 34.38%]: SamplesSeen = 640; TrainLossPerSample = 3.22861633; EvalErr[0]PerSample = 0.80000000; TotalTime = 0.1568s; SamplesPerSecond = 4081.0
Epoch[ 1 of 3]-Minibatch[ 111- 120, 37.50%]: SamplesSeen = 640; TrainLossPerSample = 3.32616882; EvalErr[0]PerSample = 0.79062500; TotalTime = 0.1498s; SamplesPerSecond = 4271.7
Epoch[ 1 of 3]-Minibatch[ 121- 130, 40.63%]: SamplesSeen = 640; TrainLossPerSample = 3.16897583; EvalErr[0]PerSample = 0.77968750; TotalTime = 0.1498s; SamplesPerSecond = 4271.6
Epoch[ 1 of 3]-Minibatch[ 131- 140, 43.75%]: SamplesSeen = 640; TrainLossPerSample = 3.08891907; EvalErr[0]PerSample = 0.77656250; TotalTime = 0.1451s; SamplesPerSecond = 4409.3
Epoch[ 1 of 3]-Minibatch[ 141- 150, 46.88%]: SamplesSeen = 640; TrainLossPerSample = 3.06005249; EvalErr[0]PerSample = 0.72968750; TotalTime = 0.1446s; SamplesPerSecond = 4425.9
Epoch[ 1 of 3]-Minibatch[ 151- 160, 50.00%]: SamplesSeen = 640; TrainLossPerSample = 2.91128540; EvalErr[0]PerSample = 0.69531250; TotalTime = 0.1453s; SamplesPerSecond = 4406.1
Epoch[ 1 of 3]-Minibatch[ 161- 170, 53.13%]: SamplesSeen = 640; TrainLossPerSample = 2.90172119; EvalErr[0]PerSample = 0.72968750; TotalTime = 0.1452s; SamplesPerSecond = 4407.3
Epoch[ 1 of 3]-Minibatch[ 171- 180, 56.25%]: SamplesSeen = 640; TrainLossPerSample = 2.73261719; EvalErr[0]PerSample = 0.65312500; TotalTime = 0.1473s; SamplesPerSecond = 4346.1
Epoch[ 1 of 3]-Minibatch[ 181- 190, 59.38%]: SamplesSeen = 640; TrainLossPerSample = 2.66515503; EvalErr[0]PerSample = 0.68437500; TotalTime = 0.1443s; SamplesPerSecond = 4434.3
Epoch[ 1 of 3]-Minibatch[ 191- 200, 62.50%]: SamplesSeen = 640; TrainLossPerSample = 2.67383423; EvalErr[0]PerSample = 0.66406250; TotalTime = 0.1446s; SamplesPerSecond = 4425.1
Epoch[ 1 of 3]-Minibatch[ 201- 210, 65.63%]: SamplesSeen = 640; TrainLossPerSample = 2.52869263; EvalErr[0]PerSample = 0.63593750; TotalTime = 0.1467s; SamplesPerSecond = 4362.7
Epoch[ 1 of 3]-Minibatch[ 211- 220, 68.75%]: SamplesSeen = 640; TrainLossPerSample = 2.60032349; EvalErr[0]PerSample = 0.66718750; TotalTime = 0.1449s; SamplesPerSecond = 4416.1
Epoch[ 1 of 3]-Minibatch[ 221- 230, 71.88%]: SamplesSeen = 640; TrainLossPerSample = 2.51134033; EvalErr[0]PerSample = 0.64843750; TotalTime = 0.1447s; SamplesPerSecond = 4422.7
Epoch[ 1 of 3]-Minibatch[ 231- 240, 75.00%]: SamplesSeen = 640; TrainLossPerSample = 2.45362549; EvalErr[0]PerSample = 0.63750000; TotalTime = 0.1449s; SamplesPerSecond = 4417.7
Epoch[ 1 of 3]-Minibatch[ 241- 250, 78.13%]: SamplesSeen = 640; TrainLossPerSample = 2.41640015; EvalErr[0]PerSample = 0.61562500; TotalTime = 0.1471s; SamplesPerSecond = 4349.7
Epoch[ 1 of 3]-Minibatch[ 251- 260, 81.25%]: SamplesSeen = 640; TrainLossPerSample = 2.39745483; EvalErr[0]PerSample = 0.62812500; TotalTime = 0.1454s; SamplesPerSecond = 4400.9
Epoch[ 1 of 3]-Minibatch[ 261- 270, 84.38%]: SamplesSeen = 640; TrainLossPerSample = 2.16415405; EvalErr[0]PerSample = 0.56718750; TotalTime = 0.1470s; SamplesPerSecond = 4354.8
Epoch[ 1 of 3]-Minibatch[ 271- 280, 87.50%]: SamplesSeen = 640; TrainLossPerSample = 2.30347290; EvalErr[0]PerSample = 0.63593750; TotalTime = 0.1450s; SamplesPerSecond = 4412.5
Epoch[ 1 of 3]-Minibatch[ 281- 290, 90.63%]: SamplesSeen = 640; TrainLossPerSample = 2.24398804; EvalErr[0]PerSample = 0.60937500; TotalTime = 0.1680s; SamplesPerSecond = 3808.9
Epoch[ 1 of 3]-Minibatch[ 291- 300, 93.75%]: SamplesSeen = 640; TrainLossPerSample = 2.15322266; EvalErr[0]PerSample = 0.57968750; TotalTime = 0.1760s; SamplesPerSecond = 3637.3
Epoch[ 1 of 3]-Minibatch[ 301- 310, 96.88%]: SamplesSeen = 640; TrainLossPerSample = 2.21664429; EvalErr[0]PerSample = 0.59531250; TotalTime = 0.1661s; SamplesPerSecond = 3853.0
Epoch[ 1 of 3]-Minibatch[ 311- 320, 100.00%]: SamplesSeen = 640; TrainLossPerSample = 2.25246582; EvalErr[0]PerSample = 0.60156250; TotalTime = 0.1574s; SamplesPerSecond = 4065.5
Finished Epoch[ 1 of 3]: [Training Set] TrainLossPerSample = 3.0000031; TotalSamplesSeen = 20480; EvalErrPerSample = 0.72836918; AvgLearningRatePerSample = 0.015625; EpochTime=9.69797
SGD: Saving checkpoint model 'C:\cygwin64\tmp\cntk-test-20160317213428.977155\Speech\DNN_WriteCommand@debug_gpu/models/cntkSpeech.dnn.1'
Starting Epoch 2: learning rate per sample = 0.001953 effective momentum = 0.656119 momentum as time constant = 607.5 samples
minibatchiterator: epoch 1: frames [20480..40960] (first utterance at frame 20480), data subset 0 of 1, with 1 datapasses
Starting minibatch loop.
Epoch[ 2 of 3]-Minibatch[ 1- 10, 12.50%]: SamplesSeen = 2560; TrainLossPerSample = 2.08151951; EvalErr[0]PerSample = 0.55859375; TotalTime = 0.4082s; SamplesPerSecond = 6271.8
Epoch[ 2 of 3]-Minibatch[ 11- 20, 25.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.98395710; EvalErr[0]PerSample = 0.54257813; TotalTime = 0.3661s; SamplesPerSecond = 6992.5
Epoch[ 2 of 3]-Minibatch[ 21- 30, 37.50%]: SamplesSeen = 2560; TrainLossPerSample = 1.98575516; EvalErr[0]PerSample = 0.54492188; TotalTime = 0.3298s; SamplesPerSecond = 7761.3
Epoch[ 2 of 3]-Minibatch[ 31- 40, 50.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.90485115; EvalErr[0]PerSample = 0.53164062; TotalTime = 0.3054s; SamplesPerSecond = 8383.4
Epoch[ 2 of 3]-Minibatch[ 41- 50, 62.50%]: SamplesSeen = 2560; TrainLossPerSample = 1.88324280; EvalErr[0]PerSample = 0.52539063; TotalTime = 0.2806s; SamplesPerSecond = 9121.7
Epoch[ 2 of 3]-Minibatch[ 51- 60, 75.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.89109268; EvalErr[0]PerSample = 0.53359375; TotalTime = 0.2659s; SamplesPerSecond = 9627.8
Epoch[ 2 of 3]-Minibatch[ 61- 70, 87.50%]: SamplesSeen = 2560; TrainLossPerSample = 1.89496002; EvalErr[0]PerSample = 0.52890625; TotalTime = 0.2682s; SamplesPerSecond = 9546.9
Epoch[ 2 of 3]-Minibatch[ 71- 80, 100.00%]: SamplesSeen = 2560; TrainLossPerSample = 1.85944366; EvalErr[0]PerSample = 0.52265625; TotalTime = 0.2865s; SamplesPerSecond = 8936.7
Finished Epoch[ 2 of 3]: [Training Set] TrainLossPerSample = 1.9356028; TotalSamplesSeen = 40960; EvalErrPerSample = 0.53603518; AvgLearningRatePerSample = 0.001953125; EpochTime=2.56305
SGD: Saving checkpoint model 'C:\cygwin64\tmp\cntk-test-20160317213428.977155\Speech\DNN_WriteCommand@debug_gpu/models/cntkSpeech.dnn.2'
Starting Epoch 3: learning rate per sample = 0.000098 effective momentum = 0.656119 momentum as time constant = 2429.9 samples
minibatchiterator: epoch 2: frames [40960..61440] (first utterance at frame 40960), data subset 0 of 1, with 1 datapasses
Starting minibatch loop.
Epoch[ 3 of 3]-Minibatch[ 1- 10, 50.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.86752853; EvalErr[0]PerSample = 0.52177734; TotalTime = 1.0785s; SamplesPerSecond = 9494.9
Epoch[ 3 of 3]-Minibatch[ 11- 20, 100.00%]: SamplesSeen = 10240; TrainLossPerSample = 1.87358780; EvalErr[0]PerSample = 0.51542969; TotalTime = 0.9319s; SamplesPerSecond = 10988.0
Finished Epoch[ 3 of 3]: [Training Set] TrainLossPerSample = 1.8705581; TotalSamplesSeen = 61440; EvalErrPerSample = 0.5186035; AvgLearningRatePerSample = 9.7656251e-005; EpochTime=2.19741
SGD: Saving checkpoint model 'C:\cygwin64\tmp\cntk-test-20160317213428.977155\Speech\DNN_WriteCommand@debug_gpu/models/cntkSpeech.dnn'
CNTKCommandTrainEnd: speechTrain
Action "train" complete.
##############################################################################
# #
# Action "write" #
# #
##############################################################################
reading script file glob_0000.write.scp ... 10 entries
Post-processing network...
7 roots:
CrossEntropyWithSoftmax = CrossEntropyWithSoftmax
EvalErrorPrediction = ErrorPrediction
InvStdOfFeatures = InvStdDev
MeanOfFeatures = Mean
PosteriorProb = Softmax
Prior = Mean
ScaledLogLikelihood = Minus
FormNestedNetwork: WARNING: Was called twice for CrossEntropyWithSoftmax CrossEntropyWithSoftmax operation
FormNestedNetwork: WARNING: Was called twice for EvalErrorPrediction ErrorPrediction operation
FormNestedNetwork: WARNING: Was called twice for InvStdOfFeatures InvStdDev operation
FormNestedNetwork: WARNING: Was called twice for MeanOfFeatures Mean operation
FormNestedNetwork: WARNING: Was called twice for PosteriorProb Softmax operation
FormNestedNetwork: WARNING: Was called twice for Prior Mean operation
FormNestedNetwork: WARNING: Was called twice for ScaledLogLikelihood Minus operation
Validating network. 25 nodes to process in pass 1.
Validating --> labels = InputValue -> [132 {1} x *]
Validating --> W2 = LearnableParameter -> [132 x 512 {1,132}]
Validating --> W1 = LearnableParameter -> [512 x 512 {1,512}]
Validating --> W0 = LearnableParameter -> [512 x 363 {1,512}]
Validating --> features = InputValue -> [363 {1} x *]
Validating --> MeanOfFeatures = Mean(features[363 {1} x *]) -> [363 {1}]
Validating --> InvStdOfFeatures = InvStdDev(features[363 {1} x *]) -> [363 {1}]
Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 {1} x *], MeanOfFeatures[363 {1}], InvStdOfFeatures[363 {1}]) -> [363 {1} x *]
Validating --> W0*features = Times(W0[512 x 363 {1,512}], MVNormalizedFeatures[363 {1} x *]) -> [512 {1} x *]
Validating --> B0 = LearnableParameter -> [512 x 1 {1,512}]
Validating --> W0*features+B0 = Plus(W0*features[512 {1} x *], B0[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *]
Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> W1*H1 = Times(W1[512 x 512 {1,512}], H1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> B1 = LearnableParameter -> [512 x 1 {1,512}]
Validating --> W1*H1+B1 = Plus(W1*H1[512 x 1 {1,512} x *], B1[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *]
Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> W2*H1 = Times(W2[132 x 512 {1,132}], H2[512 x 1 {1,512} x *]) -> [132 x 1 {1,132} x *]
Validating --> B2 = LearnableParameter -> [132 x 1 {1,132}]
Validating --> HLast = Plus(W2*H1[132 x 1 {1,132} x *], B2[132 x 1 {1,132}]) -> [132 x 1 {1,132} x *]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}]
Validating --> EvalErrorPrediction = ErrorPrediction(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}]
Validating --> PosteriorProb = Softmax(HLast[132 x 1 {1,132} x *]) -> [132 x 1 {1,132} x *]
Validating --> Prior = Mean(labels[132 {1} x *]) -> [132 {1}]
Validating --> LogOfPrior = Log(Prior[132 {1}]) -> [132 {1}]
Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 {1,132} x *], LogOfPrior[132 {1}]) -> [132 x 1 {1,132} x *]
Validating network. 17 nodes to process in pass 2.
Validating --> labels = InputValue -> [132 {1} x *]
Validating --> W2 = LearnableParameter -> [132 x 512 {1,132}]
Validating --> W1 = LearnableParameter -> [512 x 512 {1,512}]
Validating --> W0 = LearnableParameter -> [512 x 363 {1,512}]
Validating --> features = InputValue -> [363 {1} x *]
Validating --> MeanOfFeatures = Mean(features[363 {1} x *]) -> [363 {1}]
Validating --> InvStdOfFeatures = InvStdDev(features[363 {1} x *]) -> [363 {1}]
Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 {1} x *], MeanOfFeatures[363 {1}], InvStdOfFeatures[363 {1}]) -> [363 {1} x *]
Validating --> W0*features = Times(W0[512 x 363 {1,512}], MVNormalizedFeatures[363 {1} x *]) -> [512 {1} x *]
Validating --> B0 = LearnableParameter -> [512 x 1 {1,512}]
Validating --> W0*features+B0 = Plus(W0*features[512 {1} x *], B0[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *]
Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> W1*H1 = Times(W1[512 x 512 {1,512}], H1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> B1 = LearnableParameter -> [512 x 1 {1,512}]
Validating --> W1*H1+B1 = Plus(W1*H1[512 x 1 {1,512} x *], B1[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *]
Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> W2*H1 = Times(W2[132 x 512 {1,132}], H2[512 x 1 {1,512} x *]) -> [132 x 1 {1,132} x *]
Validating --> B2 = LearnableParameter -> [132 x 1 {1,132}]
Validating --> HLast = Plus(W2*H1[132 x 1 {1,132} x *], B2[132 x 1 {1,132}]) -> [132 x 1 {1,132} x *]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}]
Validating --> EvalErrorPrediction = ErrorPrediction(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}]
Validating --> PosteriorProb = Softmax(HLast[132 x 1 {1,132} x *]) -> [132 x 1 {1,132} x *]
Validating --> Prior = Mean(labels[132 {1} x *]) -> [132 {1}]
Validating --> LogOfPrior = Log(Prior[132 {1}]) -> [132 {1}]
Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 {1,132} x *], LogOfPrior[132 {1}]) -> [132 x 1 {1,132} x *]
Validating network, final pass.
Validating --> labels = InputValue -> [132 {1} x *]
Validating --> W2 = LearnableParameter -> [132 x 512 {1,132}]
Validating --> W1 = LearnableParameter -> [512 x 512 {1,512}]
Validating --> W0 = LearnableParameter -> [512 x 363 {1,512}]
Validating --> features = InputValue -> [363 {1} x *]
Validating --> MeanOfFeatures = Mean(features[363 {1} x *]) -> [363 {1}]
Validating --> InvStdOfFeatures = InvStdDev(features[363 {1} x *]) -> [363 {1}]
Validating --> MVNormalizedFeatures = PerDimMeanVarNormalization(features[363 {1} x *], MeanOfFeatures[363 {1}], InvStdOfFeatures[363 {1}]) -> [363 {1} x *]
Validating --> W0*features = Times(W0[512 x 363 {1,512}], MVNormalizedFeatures[363 {1} x *]) -> [512 {1} x *]
Validating --> B0 = LearnableParameter -> [512 x 1 {1,512}]
Validating --> W0*features+B0 = Plus(W0*features[512 {1} x *], B0[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *]
Validating --> H1 = Sigmoid(W0*features+B0[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> W1*H1 = Times(W1[512 x 512 {1,512}], H1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> B1 = LearnableParameter -> [512 x 1 {1,512}]
Validating --> W1*H1+B1 = Plus(W1*H1[512 x 1 {1,512} x *], B1[512 x 1 {1,512}]) -> [512 x 1 {1,512} x *]
Validating --> H2 = Sigmoid(W1*H1+B1[512 x 1 {1,512} x *]) -> [512 x 1 {1,512} x *]
Validating --> W2*H1 = Times(W2[132 x 512 {1,132}], H2[512 x 1 {1,512} x *]) -> [132 x 1 {1,132} x *]
Validating --> B2 = LearnableParameter -> [132 x 1 {1,132}]
Validating --> HLast = Plus(W2*H1[132 x 1 {1,132} x *], B2[132 x 1 {1,132}]) -> [132 x 1 {1,132} x *]
Validating --> CrossEntropyWithSoftmax = CrossEntropyWithSoftmax(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}]
Validating --> EvalErrorPrediction = ErrorPrediction(labels[132 {1} x *], HLast[132 x 1 {1,132} x *]) -> [1 {1}]
Validating --> PosteriorProb = Softmax(HLast[132 x 1 {1,132} x *]) -> [132 x 1 {1,132} x *]
Validating --> Prior = Mean(labels[132 {1} x *]) -> [132 {1}]
Validating --> LogOfPrior = Log(Prior[132 {1}]) -> [132 {1}]
Validating --> ScaledLogLikelihood = Minus(HLast[132 x 1 {1,132} x *], LogOfPrior[132 {1}]) -> [132 x 1 {1,132} x *]
12 out of 25 nodes do not share the minibatch layout with the input data.
Post-processing network complete.
Allocating matrices for forward and/or backward propagation.
evaluate: reading 368 frames of An4/71/71/cen5-fjam-b.mfc
Minibatch[1]: ActualMBSize = 368
evaluate: reading 438 frames of An4/213/213/cen4-fsaf2-b.mfc
Minibatch[2]: ActualMBSize = 438
evaluate: reading 368 frames of An4/513/513/cen7-mgah-b.mfc
Minibatch[3]: ActualMBSize = 368
evaluate: reading 248 frames of An4/614/614/cen7-mkdb-b.mfc
Minibatch[4]: ActualMBSize = 248
evaluate: reading 248 frames of An4/507/507/cen1-mgah-b.mfc
Minibatch[5]: ActualMBSize = 248
evaluate: reading 358 frames of An4/693/693/cen8-mmkw-b.mfc
Minibatch[6]: ActualMBSize = 358
evaluate: reading 308 frames of An4/918/918/cen4-mtos-b.mfc
Minibatch[7]: ActualMBSize = 308
evaluate: reading 608 frames of An4/477/477/an257-mewl-b.mfc
Minibatch[8]: ActualMBSize = 608
evaluate: reading 78 frames of An4/454/454/an70-meht-b.mfc
Minibatch[9]: ActualMBSize = 78
evaluate: reading 228 frames of An4/254/254/cen6-ftmj-b.mfc
Minibatch[10]: ActualMBSize = 228
Written to C:\cygwin64\tmp\cntk-test-20160317213428.977155\Speech\DNN_WriteCommand@debug_gpu/Output*
Total Samples Evaluated = 3250
Action "write" complete.
COMPLETED

View file

@@ -0,0 +1,92 @@
precision = "float"
command = speechTrain:write
deviceId = $DeviceId$
parallelTrain = false
makeMode = false
speechTrain = [
action = "train"
modelPath = "$RunDir$/models/cntkSpeech.dnn"
deviceId = $DeviceId$
traceLevel = 1
SimpleNetworkBuilder = [
layerSizes = 363:512:512:132
trainingCriterion = "CrossEntropyWithSoftmax"
evalCriterion = "ErrorPrediction"
layerTypes = "Sigmoid"
applyMeanVarNorm = true
initValueScale = 1.0
uniformInit = true
needPrior = true
]
SGD = [
epochSize = 20480
minibatchSize = 64:256:1024
learningRatesPerMB = 1.0:0.5:0.1
numMBsToShowResult = 10
momentumPerMB = 0.9:0.656119
dropoutRate = 0.0
maxEpochs = 3
keepCheckPointFiles = true
AutoAdjust = [
reduceLearnRateIfImproveLessThan = 0
loadBestModel = true
increaseLearnRateIfImproveMoreThan = 1000000000
learnRateDecreaseFactor = 0.5
learnRateIncreaseFactor = 1.382
autoAdjustLR = "adjustAfterEpoch"
]
clippingThresholdPerSample = 1#INF
]
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "glob_0000.scp"
]
labels = [
mlfFile = "$DataDir$/glob_0000.mlf"
labelMappingFile = "$DataDir$/state.list"
labelDim = 132
labelType = "category"
]
]
]
write = [
action = write
modelPath = "$RunDir$/models/cntkSpeech.dnn"
outputNodeNames=ScaledLogLikelihood
deviceId = $DeviceId$
traceLevel = 1
useValidation=true
printValues=true
reader = [
readerType = "HTKMLFReader"
readMethod = "blockRandomize"
miniBatchMode = "partial"
randomize = "auto"
verbosity = 0
features = [
dim = 363
type = "real"
scpFile = "glob_0000.write.scp"
]
]
outputPath = "$RunDir$/Output"
]
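The per-sample learning rates and momentum time constants printed in the training log above follow from the per-minibatch values in the SGD block of this config. A minimal sketch of that conversion, assuming lrPerSample = lrPerMB / minibatchSize and timeConstant = -minibatchSize / ln(momentumPerMB); the names below are illustrative, not CNTK identifiers:

import math

# Sketch (not CNTK code): reproduce the per-sample values reported in the log
# from the per-minibatch settings above (learningRatesPerMB, momentumPerMB, minibatchSize).
def per_sample(lr_per_mb, momentum_per_mb, mb_size):
    lr_per_sample = lr_per_mb / mb_size
    time_constant = -mb_size / math.log(momentum_per_mb)
    return lr_per_sample, time_constant

print(per_sample(1.0, 0.9, 64))         # ~(0.015625, 607.4)   -> epoch 1
print(per_sample(0.5, 0.656119, 256))   # ~(0.001953, 607.5)   -> epoch 2
print(per_sample(0.1, 0.656119, 1024))  # ~(0.0000977, 2429.9) -> epoch 3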

View file

@@ -0,0 +1,42 @@
#!/bin/bash
. $TEST_ROOT_DIR/run-test-common
# cntkrun <CNTK config file name> <additional CNTK args>
cntkrun cntk.cntk 'shareNodeValueMatrices=true' || exit $?
OUTPUT_BASELINE=$TEST_DIR/Output.ScaledLogLikelihood
if [ "$OS" == "Windows_NT" ]; then
OUTPUT_BASELINE=$OUTPUT_BASELINE.windows
fi
if [ "$TEST_DEVICE" == "cpu" ]; then
OUTPUT_BASELINE=$OUTPUT_BASELINE.cpu
elif [ "$TEST_DEVICE" == "gpu" ]; then
OUTPUT_BASELINE=$OUTPUT_BASELINE.gpu
else
echo "Error: Unknown TEST_DEVICE specified!"
exit 3
fi
OUTPUT_CURRENT=$TEST_RUN_DIR/Output.ScaledLogLikelihood
if [ ! -e $OUTPUT_BASELINE ]; then
echo "Error: Cannot find write command's output baseline file $OUTPUT_BASELINE!"
exit 3
fi
if [ ! -e $OUTPUT_CURRENT ]; then
echo "Error: Cannot find write command's output file $OUTPUT_CURRENT!"
exit 3
fi
OUTPUT_DIFF=$TEST_RUN_DIR/Output.ScaledLogLikelihood.diff
awk '{FS=" "} function abs(x) {return ((x < 0.0) ? -x : x)} NR==FNR {for (i=1; i<=NF; i++) a[FNR][i]=$i;} NR!=FNR {for (i=1; i<=NF; i++) {if ((abs($i - a[FNR][i])/abs($i)) > 0.006) printf("Line %d, Field %d: Baseline = %f, Current = %f\n", NR, i, a[FNR][i], $i);}}' $OUTPUT_BASELINE $OUTPUT_CURRENT > $OUTPUT_DIFF
if [ -s $OUTPUT_DIFF ]; then
echo "Error: Output of write command does not match baseline output within specified tolerance. See $OUTPUT_DIFF"
exit 1
fi
exit 0
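The awk one-liner above compares each numeric field of the write command's output against the baseline and reports fields whose relative difference exceeds 0.006. A rough Python equivalent of that check, for illustration only (the test harness itself uses the awk script above):

# Sketch: element-wise relative-difference check between baseline and current
# output files, mirroring the awk comparison (tolerance 0.006, divided by the
# current value as in the script).
def compare_outputs(baseline_path, current_path, tol=0.006):
    mismatches = []
    with open(baseline_path) as baseline, open(current_path) as current:
        for line_no, (b_line, c_line) in enumerate(zip(baseline, current), start=1):
            for field_no, (b, c) in enumerate(zip(b_line.split(), c_line.split()), start=1):
                b, c = float(b), float(c)
                if c != 0.0 and abs(c - b) / abs(c) > tol:
                    mismatches.append((line_no, field_no, b, c))
    return mismatches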

View file

@@ -0,0 +1,31 @@
dataDir: ../../Data
tags:
  # running on every BVT job in 'S' (Speech) leg in Debug-GPU and Release-CPU configurations:
  - bvt-s (build_sku == 'gpu') and ((flavor=='debug') ^ (device=='cpu'))
  # running unconditionally on every Nightly job in 'S' leg
  - nightly-s (build_sku == 'gpu')

testCases:
  CNTK Run must be completed:
    patterns:
      - ^COMPLETED

  Must train epochs in exactly same order and parameters:
    patterns:
      - ^Starting Epoch {{integer}}
      - learning rate per sample = {{float}}
      - momentum = {{float}}

  Epochs must be finished with expected results:
    patterns:
      - ^Finished Epoch[{{integer}} of {{integer}}]
      - TrainLossPerSample = {{float,tolerance=.1%}}
      - EvalErrPerSample = {{float,tolerance=.1%}}
      - AvgLearningRatePerSample = {{float,tolerance=0.001%}}

  Per-minibatch training results must match:
    patterns:
      - ^ Epoch[{{integer}} of {{integer}}]-Minibatch[{{integer}}-{{integer}}
      - SamplesSeen = {{integer}}
      - TrainLossPerSample = {{float,tolerance=.1%}}
      - EvalErr[0]PerSample = {{float,tolerance=.1%}}
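The {{float,tolerance=.1%}} placeholders above tell the test driver to accept a logged value when it lies within the given relative tolerance of the baseline value. A simplified sketch of that idea follows; the regular expression and tolerance handling here are assumptions for illustration, not the driver's actual implementation:

import re

# Sketch: extract a float from a log line and accept it if it is within a
# relative tolerance (in percent) of the baseline value.
FLOAT_RE = re.compile(r"[-+]?\d*\.?\d+(?:[eE][-+]?\d+)?")

def within_tolerance(baseline, current, tolerance_pct):
    if baseline == 0.0:
        return current == 0.0
    return abs(current - baseline) / abs(baseline) <= tolerance_pct / 100.0

baseline_line = "TrainLossPerSample = 3.0000031"
current_line = "TrainLossPerSample = 3.0011000"
b = float(FLOAT_RE.search(baseline_line).group())
c = float(FLOAT_RE.search(current_line).group())
print(within_tolerance(b, c, 0.1))  # True: the difference is well under 0.1%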

View file

@@ -0,0 +1,10 @@
An4/71/71/cen5-fjam-b.mfc=Features/000000000.chunk[0,367]
An4/213/213/cen4-fsaf2-b.mfc=Features/000000000.chunk[368,805]
An4/513/513/cen7-mgah-b.mfc=Features/000000000.chunk[806,1173]
An4/614/614/cen7-mkdb-b.mfc=Features/000000000.chunk[1174,1421]
An4/507/507/cen1-mgah-b.mfc=Features/000000000.chunk[1422,1669]
An4/693/693/cen8-mmkw-b.mfc=Features/000000000.chunk[1670,2027]
An4/918/918/cen4-mtos-b.mfc=Features/000000000.chunk[2028,2335]
An4/477/477/an257-mewl-b.mfc=Features/000000000.chunk[2336,2943]
An4/454/454/an70-meht-b.mfc=Features/000000000.chunk[2944,3021]
An4/254/254/cen6-ftmj-b.mfc=Features/000000000.chunk[3022,3249]
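Each entry above maps an utterance to an inclusive [first,last] frame range inside Features/000000000.chunk; the resulting frame counts are the ActualMBSize values in the write log (368, 438, 368, 248, 248, 358, 308, 608, 78, 228), which sum to the 3250 samples reported. A small parsing sketch (a hypothetical helper, not part of the CNTK HTK reader):

# Sketch: parse "name=archive[first,last]" SCP entries and recover frame counts.
def parse_scp_line(line):
    name, rest = line.strip().split("=", 1)
    archive, rng = rest.split("[", 1)
    first, last = (int(x) for x in rng.rstrip("]").split(","))
    return name, archive, first, last

entries = [
    "An4/71/71/cen5-fjam-b.mfc=Features/000000000.chunk[0,367]",
    "An4/213/213/cen4-fsaf2-b.mfc=Features/000000000.chunk[368,805]",
]
total_frames = 0
for entry in entries:
    name, archive, first, last = parse_scp_line(entry)
    total_frames += last - first + 1  # ranges are inclusive
print(total_frames)  # 368 + 438 = 806 for these two entries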

View file

@@ -12,7 +12,7 @@ tags:
# - nightly-s (build_sku == 'gpu')
# running on every BVT job in 'S' (Speech) leg in Debug-GPU configuration:
- bvt-s (build_sku == 'gpu') and ((flavor=='debug') and (device=='gpu'))
# - bvt-s (build_sku == 'gpu') and ((flavor=='debug') and (device=='gpu'))
# running unconditionally on every Nightly job in 'S' leg
- nightly-s (build_sku == 'gpu') and (device=='gpu')

View file

@@ -1,7 +1,7 @@
dataDir: ../../../Data
tags:
# running for 1bitsgd build SKU on every BVT job in 'S' (Speech) leg in Debug-GPU and Release-CPU configurations:
- bvt-s (build_sku == '1bitsgd') and ((flavor=='debug') ^ (device=='cpu'))
# - bvt-s (build_sku == '1bitsgd') and ((flavor=='debug') ^ (device=='cpu'))
# running for 1bitsgd build SKU on every Nightly job in 'S' leg
- nightly-s (build_sku == '1bitsgd')

View file

@@ -1,7 +1,7 @@
dataDir: ../../../Data
tags:
# running for 1bitsgd build SKU on every BVT job in 'S' (Speech) leg in Debug-GPU and Release-CPU configurations:
- bvt-s (build_sku == '1bitsgd') and ((flavor=='debug') ^ (device=='cpu'))
# - bvt-s (build_sku == '1bitsgd') and ((flavor=='debug') ^ (device=='cpu'))
# running for 1bitsgd build SKU on every Nightly job in 'S' leg
- nightly-s (build_sku == '1bitsgd')

View file

@@ -1,7 +1,7 @@
dataDir: ../../../Data
tags:
# running on every BVT job in 'P' (Speech) leg in Debug-GPU and Release-CPU configurations:
- bvt-p ((build_sku == 'gpu') or (build_sku == '1bitsgd')) and ((flavor=='debug') ^ (device=='cpu'))
# - bvt-p ((build_sku == 'gpu') or (build_sku == '1bitsgd')) and ((flavor=='debug') ^ (device=='cpu'))
# running unconditionally on every Nightly job in 'P' leg
- nightly-p ((build_sku == 'gpu') or (build_sku == '1bitsgd'))

View file

@@ -1,7 +1,7 @@
dataDir: ../../../Data
tags:
# running on every BVT job in 'S' (Speech) leg in Debug-GPU and Release-CPU configurations:
- bvt-s ((build_sku == 'gpu') or (build_sku == '1bitsgd')) and ((flavor=='debug') ^ (device=='cpu'))
# - bvt-s ((build_sku == 'gpu') or (build_sku == '1bitsgd')) and ((flavor=='debug') ^ (device=='cpu'))
# running unconditionally on every Nightly job in 'S' leg
- nightly-s ((build_sku == 'gpu') or (build_sku == '1bitsgd'))

View file

@@ -1,7 +1,7 @@
dataDir: ../../Data
tags:
# running on every BVT job in 'S' (Speech) leg in Debug-GPU and Release-CPU configurations:
- bvt-s (build_sku == 'gpu') and ((flavor=='debug') ^ (device=='cpu'))
# - bvt-s (build_sku == 'gpu') and ((flavor=='debug') ^ (device=='cpu'))
# running unconditionally on every Nightly job in 'S' leg
- nightly-s (build_sku == 'gpu')

View file

@@ -1,7 +1,7 @@
dataDir: ../../Data
tags:
# running on every BVT job in 'S' (Speech) leg in Debug-GPU and Release-CPU configurations:
- bvt-s (build_sku == 'gpu') and ((flavor=='debug') ^ (device=='cpu'))
# - bvt-s (build_sku == 'gpu') and ((flavor=='debug') ^ (device=='cpu'))
# running unconditionally on every Nightly job in 'S' leg
- nightly-s (build_sku == 'gpu')

View file

@@ -95,7 +95,7 @@
# matching against all test-cases/pattern simultaneously
#
import sys, os, argparse, traceback, yaml, subprocess, random, re, time
import sys, os, argparse, traceback, yaml, subprocess, random, re, time, stat
try:
import six
@@ -265,6 +265,10 @@ class Test:
if args.verbose:
six.print_(self.fullName + ":>" + logFile)
with open(logFile, "w") as output:
if not windows:
testScript = self.testDir + "/run-test"
st = os.stat(testScript)
os.chmod(testScript, st.st_mode | stat.S_IEXEC | stat.S_IXOTH)
cmdLine = ["bash", "-c", self.testDir + "/run-test 2>&1"]
process = subprocess.Popen(cmdLine, stdout=subprocess.PIPE)
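The added lines above make run-test executable before it is launched through bash on non-Windows hosts. A standalone sketch of the same chmod pattern (the path is a placeholder):

import os
import stat

# Sketch: add execute permission for owner and others to a test script,
# mirroring the chmod logic added to the driver above.
test_script = "/path/to/run-test"  # placeholder
st = os.stat(test_script)
os.chmod(test_script, st.st_mode | stat.S_IEXEC | stat.S_IXOTH)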

View file

@@ -5,9 +5,9 @@
#include "stdafx.h"
#include "BlockRandomizer.h"
#include "NoRandomizer.h"
#include "DataDeserializer.h"
#include "BlockRandomizer.h"
using namespace Microsoft::MSR::CNTK;
@@ -34,7 +34,7 @@ public:
assert(chunkEnd <= data.size());
}
std::vector<SequenceDataPtr> GetSequence(size_t sequenceId) override
void GetSequence(size_t sequenceId, std::vector<SequenceDataPtr>& result) override
{
assert(m_chunkBegin <= sequenceId);
assert(sequenceId < m_chunkEnd);
@@ -43,8 +43,7 @@ public:
data->m_data = &m_data[sequenceId];
data->m_numberOfSamples = 1;
data->m_sampleLayout = m_sampleLayout;
return std::vector<SequenceDataPtr>{data};
result.push_back(data);
}
~MockChunk() override {};
@@ -58,9 +57,9 @@ private:
size_t m_numSequencesPerChunk;
std::vector<SequenceDescription> m_descriptions;
std::vector<float>& m_data;
SequenceDescriptions m_sequenceDescriptions;
std::vector<StreamDescriptionPtr> m_streams;
TensorShapePtr m_sampleLayout;
std::vector<ChunkDescriptionPtr> m_chunkDescriptions;
public:
MockDeserializer(size_t numChunks, size_t numSequencesPerChunks, std::vector<float>& data)
@@ -71,7 +70,6 @@ public:
{
size_t numSequences = numChunks * numSequencesPerChunks;
m_descriptions.reserve(numSequences);
m_sequenceDescriptions.reserve(numSequences);
assert(data.size() == numSequences);
for (size_t i = 0; i < numSequences; i++)
@@ -81,12 +79,18 @@ public:
1,
i / numSequencesPerChunks,
true,
{ std::wstring(L""), i }
{ 0, i }
});
m_sequenceDescriptions.push_back(&m_descriptions[i]);
}
std::vector<StreamDescriptionPtr> result;
for (size_t i = 0; i < numChunks; i++)
{
m_chunkDescriptions.push_back(std::make_shared<ChunkDescription>(ChunkDescription {
i,
numSequencesPerChunks,
numSequencesPerChunks
}));
}
m_streams.push_back(std::make_shared<StreamDescription>(StreamDescription{
L"input",
@@ -95,6 +99,8 @@ public:
ElementType::tfloat,
m_sampleLayout
}));
};
std::vector<StreamDescriptionPtr> GetStreamDescriptions() const override
@@ -102,11 +108,6 @@ public:
return m_streams;
}
const SequenceDescriptions& GetSequenceDescriptions() const override
{
return m_sequenceDescriptions;
}
virtual ChunkPtr GetChunk(size_t chunkId) override
{
assert(chunkId < m_numChunks);
@@ -114,17 +115,30 @@ public:
size_t chunkEnd = chunkBegin + m_numSequencesPerChunk;
std::shared_ptr<Chunk> chunk = std::make_shared<MockChunk>(chunkBegin, chunkEnd, m_data);
return chunk;
}
virtual const SequenceDescription* GetSequenceDescriptionByKey(const KeyType&) override
virtual void GetSequenceDescriptionByKey(const KeyType&, SequenceDescription&) override
{
throw std::logic_error("Not implemented");
}
virtual size_t GetTotalNumberOfChunks() override
virtual ChunkDescriptions GetChunkDescriptions() override
{
throw std::logic_error("Not implemented");
return m_chunkDescriptions;
}
virtual void GetSequencesForChunk(size_t chunkId, std::vector<SequenceDescription>& descriptions) override
{
for (size_t i = chunkId * m_numSequencesPerChunk; i < (chunkId + 1) * m_numSequencesPerChunk; i++)
{
descriptions.push_back(SequenceDescription{
i,
1,
chunkId,
true,
{ 0, i }
});
}
}
MockDeserializer(const MockDeserializer&) = delete;
@@ -136,7 +150,7 @@ BOOST_AUTO_TEST_CASE(BlockRandomizerInstantiate)
std::vector<float> data;
auto mockDeserializer = std::make_shared<MockDeserializer>(0, 0, data);
auto randomizer = std::make_shared<BlockRandomizer>(0, SIZE_MAX, mockDeserializer);
auto randomizer = std::make_shared<BlockRandomizer>(0, SIZE_MAX, mockDeserializer, BlockRandomizer::DecimationMode::chunk, false);
}
BOOST_AUTO_TEST_CASE(BlockRandomizerOneEpoch)
@@ -144,7 +158,7 @@ BOOST_AUTO_TEST_CASE(BlockRandomizerOneEpoch)
std::vector<float> data { 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0 };
auto mockDeserializer = std::make_shared<MockDeserializer>(5, 2, data);
auto randomizer = std::make_shared<BlockRandomizer>(0, SIZE_MAX, mockDeserializer);
auto randomizer = std::make_shared<BlockRandomizer>(0, SIZE_MAX, mockDeserializer, BlockRandomizer::DecimationMode::chunk, false);
EpochConfiguration epochConfiguration;
epochConfiguration.m_numberOfWorkers = 1;
@@ -165,42 +179,8 @@ BOOST_AUTO_TEST_CASE(BlockRandomizerOneEpoch)
auto data = reinterpret_cast<DenseSequenceData&>(*sequences.m_data[0][0]);
BOOST_CHECK_EQUAL(data.m_numberOfSamples, 1);
actual.push_back(*((float*)data.m_data));
}
BOOST_CHECK_EQUAL(sequences.m_endOfEpoch, (9 <= i));
}
BOOST_CHECK_EQUAL_COLLECTIONS(expected.begin(), expected.end(),
actual.begin(), actual.end());
}
BOOST_AUTO_TEST_CASE(BlockRandomizerOneEpochSmallWindow)
{
std::vector<float> data { 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0 };
auto mockDeserializer = std::make_shared<MockDeserializer>(5, 2, data);
auto randomizer = std::make_shared<BlockRandomizer>(0, 10, mockDeserializer);
EpochConfiguration epochConfiguration;
epochConfiguration.m_numberOfWorkers = 1;
epochConfiguration.m_workerRank = 0;
epochConfiguration.m_minibatchSizeInSamples = 0;
epochConfiguration.m_totalEpochSizeInSamples = 10;
epochConfiguration.m_epochIndex = 0;
randomizer->StartEpoch(epochConfiguration);
std::vector<float> expected { 9.0, 8.0, 3.0, 6.0, 2.0, 1.0, 4.0, 7.0, 5.0, 0.0 };
std::vector<float> actual;
for (int i = 0; i < 11; i++)
{
Sequences sequences = randomizer->GetNextSequences(1);
BOOST_CHECK_EQUAL(sequences.m_data.size(), 1 - (i / 10));
if (i < 10)
{
auto data = reinterpret_cast<DenseSequenceData&>(*sequences.m_data[0][0]);
BOOST_CHECK_EQUAL(data.m_numberOfSamples, 1);
actual.push_back(*((float*)data.m_data));
}
BOOST_CHECK_EQUAL(sequences.m_endOfEpoch, (9 <= i));
BOOST_CHECK_EQUAL(sequences.m_endOfEpoch, (10 <= i));
}
BOOST_CHECK_EQUAL_COLLECTIONS(expected.begin(), expected.end(),
actual.begin(), actual.end());
@@ -212,10 +192,10 @@ BOOST_AUTO_TEST_CASE(BlockRandomizerOneEpochLegacyRandomization)
auto mockDeserializer = std::make_shared<MockDeserializer>(5, 2, data);
auto randomizer = std::make_shared<BlockRandomizer>(0,
SIZE_MAX,
mockDeserializer,
BlockRandomizer::DistributionMode::sequences_strides,
true);
SIZE_MAX,
mockDeserializer,
BlockRandomizer::DecimationMode::sequence,
true);
EpochConfiguration epochConfiguration;
epochConfiguration.m_numberOfWorkers = 1;
@@ -238,7 +218,7 @@ BOOST_AUTO_TEST_CASE(BlockRandomizerOneEpochLegacyRandomization)
actual.push_back(*((float*)data.m_data));
}
BOOST_CHECK_EQUAL(sequences.m_endOfEpoch, (9 <= i));
BOOST_CHECK_EQUAL(sequences.m_endOfEpoch, (10 <= i));
}
BOOST_CHECK_EQUAL_COLLECTIONS(expected.begin(), expected.end(),
actual.begin(), actual.end());
@@ -271,10 +251,11 @@ BOOST_AUTO_TEST_CASE(NoRandomizerOneEpoch)
auto data = reinterpret_cast<DenseSequenceData&>(*sequences.m_data[0][0]);
BOOST_CHECK_EQUAL(data.m_numberOfSamples, 1);
actual.push_back(*((float*)data.m_data));
}
BOOST_CHECK_EQUAL(sequences.m_endOfEpoch, (10 <= i));
}
BOOST_CHECK_EQUAL_COLLECTIONS(data.begin(), data.end(),
actual.begin(), actual.end());
}
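The updated tests construct BlockRandomizer with an explicit DecimationMode (chunk or sequence). As a rough illustration of the distinction, the sketch below splits data across workers either by whole chunks or by individual sequences; this is purely illustrative and simplified relative to the actual BlockRandomizer logic:

# Sketch: chunk-level vs sequence-level decimation across workers (illustrative only).
def decimate(chunks, num_workers, rank, mode):
    if mode == "chunk":
        # keep every sequence of the chunks assigned to this worker
        return [seq for i, chunk in enumerate(chunks) if i % num_workers == rank
                for seq in chunk]
    # "sequence": keep every num_workers-th sequence of the flattened order
    flat = [seq for chunk in chunks for seq in chunk]
    return flat[rank::num_workers]

chunks = [[0.0, 1.0], [2.0, 3.0], [4.0, 5.0], [6.0, 7.0], [8.0, 9.0]]
print(decimate(chunks, 2, 0, "chunk"))     # [0.0, 1.0, 4.0, 5.0, 8.0, 9.0]
print(decimate(chunks, 2, 0, "sequence"))  # [0.0, 2.0, 4.0, 6.0, 8.0]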

Binary data
Tools/.make_binary_drop_linux.swp Normal file

Binary file not shown.

0
Tools/make_binary_drop_linux Normal file → Executable file
View file