Merge branch 'master' of https://git01.codeplex.com/cntk into amitaga/separate1bitDataParallelSGD

This commit is contained in:
Amit Agarwal 2016-01-13 22:43:27 -08:00
Родитель b66ea69666 40cbeac00b
Коммит 18528f15b4
54 изменённых файлов: 946 добавлений и 464 удалений

Просмотреть файл

@ -162,7 +162,7 @@ ifeq ("$(BUILDTYPE)","debug")
CXXFLAGS += -g
LDFLAGS += -rdynamic
CPPFLAGS += -D_DEBUG
CUFLAGS += -O0 -use_fast_math -lineinfo $(GENCODE_FLAGS)
CUFLAGS += -O0 -g -use_fast_math -lineinfo $(GENCODE_FLAGS)
endif
ifeq ("$(BUILDTYPE)","release")

Просмотреть файл

@ -47,7 +47,7 @@ using namespace std;
L"PastValue(dims, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'PastValue' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]\n"
L"FutureValue(dims, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'FutureValue' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]\n"
// TODO: ^^ DelayedValues no longer need to know their dimension. That is inferred in Validation.
L"Shift(input, fromOffset, boundaryValue, boundaryMode=-1/*context*/, dim=-1, numSteps=1, insertedDim=0, tag='') = new ComputationNode [ operation = 'Shift' ; inputs = (input : boundaryValue) /*plus the function args*/ ]\n"
L"Shift(input, fromOffset, boundaryValue, boundaryMode=-1/*context*/, dim=-1, tag='') = new ComputationNode [ operation = 'Shift' ; inputs = (input : boundaryValue) /*plus the function args*/ ]\n"
L"RowSlice(startIndex, numRows, input, needGradient = false, tag='') = new ComputationNode [ operation = 'RowSlice' ; inputs = input /*plus the function args*/ ]\n"
L"RowRepeat(input, numRepeats, needGradient = false, tag='') = new ComputationNode [ operation = 'RowRepeat' ; inputs = input /*plus the function args*/ ]\n"
L"RowStack(inputs, tag='') = new ComputationNode [ operation = 'RowStack' /*plus the function args*/ ]\n"

Просмотреть файл

@ -345,6 +345,9 @@ void PrintBuiltInfo()
#ifdef _CUB_PATH_
fprintf(stderr, "\t\tCUB_PATH: %s\n", _CUB_PATH_);
#endif
#ifdef _CUDNN_PATH_
fprintf(stderr, "\t\tCUDNN_PATH: %s\n", _CUDNN_PATH_);
#endif
#ifdef _GIT_EXIST
fprintf(stderr, "\t\tBuild Branch: %s\n", _BUILDBRANCH_);
fprintf(stderr, "\t\tBuild SHA1: %s\n", _BUILDSHA1_);
@ -568,7 +571,7 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which i
RedirectStdErr(logpath);
}
PrintBuiltInfo();
PrintBuiltInfo(); // this one goes to log file
std::string timestamp = TimeDateStamp();
//dump config info
@ -643,10 +646,11 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which i
// main wrapper that catches C++ exceptions and prints them
// ---------------------------------------------------------------------------
int wmain1(int argc, wchar_t* argv[]) // called from wmain which is a wrapper that catches & repots Win32 exceptions
int wmain1(int argc, wchar_t* argv[]) // called from wmain which is a wrapper that catches & reports Win32 exceptions
{
try
{
PrintBuiltInfo(); // print build info directly in case that user provides zero argument (convenient for checking build type)
if (argc <= 1)
InvalidArgument("No command-line argument given.");
// detect legacy CNTK configuration
@ -684,6 +688,8 @@ void terminate_this() { fprintf(stderr, "terminate_this: aborting\n"), fflush(st
int wmain(int argc, wchar_t* argv[]) // wmain wrapper that reports Win32 exceptions
{
set_terminate (terminate_this); // insert a termination handler to ensure stderr gets flushed before actually terminating
_set_error_mode(_OUT_TO_STDERR); // make sure there are no CRT prompts when CNTK is executing
// Note: this does not seem to work--processes with this seem to just hang instead of terminating
__try
{

Просмотреть файл

@ -100,7 +100,7 @@ template <typename ElemType>
void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigParamList& params)
{
std::string name = p_name;
if (EqualInsensitive(name, "CreateModel")) //create a blank model
if (EqualInsensitive(name, "CreateModel")) // create a blank model
{
size_t numFixedParams = 0, numOptionalParams = 0;
if (params.size() > numFixedParams + numOptionalParams || params.size() < numFixedParams)
@ -109,7 +109,7 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa
auto cn = make_shared<ComputationNetwork>(CPUDEVICE);
OverrideModelNameAndSetDefaultModel(cn);
}
if (EqualInsensitive(name, "CreateModelWithName")) //create a blank model
if (EqualInsensitive(name, "CreateModelWithName")) // create a blank model
{
size_t numFixedParams = 1, numOptionalParams = 0;
if (params.size() > numFixedParams + numOptionalParams || params.size() < numFixedParams)
@ -139,6 +139,16 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa
std::wstring modelFormat = GetOptionalModelFormat(params, numFixedParams);
auto cn = make_shared<ComputationNetwork>(CPUDEVICE);
#if 1 // support for a specific kind of legacy format, for the sole purpose of allowing users to convert (=load & save) them
if (modelFormat == L"cntk_legacy_no_tensorlib")
{
cn->Read<ElemType>(params[1]);
for (auto node : cn->FeatureNodes())
node->SetDims(TensorShape(node->GetNumRows()), 0); // pre-tensorlib InputValues had incorrect tensor dimensions
cn->CompileNetwork();
}
else
#endif
cn->Load<ElemType>(params[1]);
OverrideModelNameAndSetDefaultModel(cn, params[0]);
}
@ -189,8 +199,7 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa
// validate the network before we save it out
ProcessNDLScript(m_netNdlDefault, ndlPassAll, true);
cn->Save(fileName);
cn->SaveEdited(fileName);
}
else if (EqualInsensitive(name, "SaveModel"))
{
@ -209,7 +218,7 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa
// validate and finish the second pass through NDL if any in-line NDL was defined
ProcessNDLScript(netNdl, ndlPassAll, true);
netNdl->cn->Save(fileName);
netNdl->cn->SaveEdited(fileName);
}
else if (EqualInsensitive(name, "SetDefaultModel"))
{

Просмотреть файл

@ -443,6 +443,10 @@ public:
{
modelFormat = L"cntk";
}
else if (EqualInsensitive(value, "cntk_legacy_no_tensorlib")) // model of late 2015 which had a bug in setting InputValue's tensor dimensions
{
modelFormat = L"cntk_legacy_no_tensorlib";
}
else
{
RuntimeError("Invalid optional parameter value %s, valid values are: format=(cntk)", value.c_str());

Просмотреть файл

@ -2423,9 +2423,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Matrix<ElemType> priorVals = ReadMatrixFromDbnFile(fstream, std::string("Pu"));
assert(priorVals.GetNumCols() == 1 && priorVals.GetNumRows() == m_outputLayerSize);
w = builder.Mean(label, L"Prior");
static_pointer_cast<PreComputedNode<ElemType>>(w)->SideLoadFromMatrix(priorVals);
w->SetParameterUpdateRequired(false);
prior = builder.Mean(label, L"Prior");
static_pointer_cast<PreComputedNode<ElemType>>(prior)->SideLoadFromMatrix(priorVals);
prior->SetParameterUpdateRequired(false);
}
else // pretrained network - need to add output layer, initialize
{
@ -2465,7 +2465,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (layerType == "perceptron" || m_needPrior)
{
input = builder.Log(pcNodePtr, L"LogOfPrior");
input = builder.Log(prior, L"LogOfPrior");
//following two lines is needed only if true probability is needed
//output = builder.Softmax(output);

Просмотреть файл

@ -33,6 +33,16 @@ if "%cuda_path%" == "" (
echo #define _CUDA_PATH_ "%cuda_path:\=\\%" >> buildinfo.h$$
)
if not "%cudnn_path%" == "" (
echo #define _CUDNN_PATH_ "%cudnn_path:\=\\%" >> buildinfo.h$$
)
if not "%cub_path%" == "" (
echo #define _CUB_PATH_ "%cub_path:\=\\%" >> buildinfo.h$$
)
echo #endif >> buildinfo.h$$
::: update file only if it changed (otherwise CNTK.cpp will get rebuilt each time)

Просмотреть файл

@ -84,6 +84,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ptrdiff_t tBegin; // first time index in this minibatch. Note that this may be negative if the sequence started before this MB.
size_t tEnd; // end = first frame index after final frame. May be beyond the minibatch if the real sequence is longer than the MB.
bool operator==(const SequenceInfo & other) const { return seqId == other.seqId && s == other.s && tBegin == other.tBegin && tEnd == other.tEnd; }
size_t GetNumTimeSteps() const { return (size_t)(tEnd - tBegin); }
};
// -------------------------------------------------------------------
@ -270,6 +271,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// I'd love to start with all-gaps, but that would require setting flags upfront, and then clearing them.
void AddGap(size_t s, ptrdiff_t beginTime, size_t endTime) { if ((ptrdiff_t)endTime > beginTime) AddSequence(GAP_SEQUENCE_ID, s, beginTime, endTime); }
// find a sequence by its id
const SequenceInfo & FindSequence(UniqueSequenceId seqId) const
{
// linear scan over the sequences of this layout; fine since a minibatch holds only a few sequences
for (const auto & seqInfo : m_sequences)
if (seqInfo.seqId == seqId)
return seqInfo;
// not found: fail hard (LogicError presumably throws, so control never falls off the end --TODO confirm)
LogicError("FindSequence: Requested sequence (id %u) not found.", (unsigned int) seqId);
}
// -------------------------------------------------------------------
// inquire about gaps or boundaries
// -------------------------------------------------------------------
@ -427,6 +437,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
public: // TODO: make private (currently used from masking and DataFor) ; TODO: rename all members with m_ prefix
size_t timeIdxInSeq; // start frame; SIZE_MAX = all frames in MB
ptrdiff_t m_timeOffset; // this is added to timeIdxInSeq wherever it is used
size_t m_timeRange; // use this to describe a custom range > 1 frame
size_t seqIndex; // parallel-sequence index; SIZE_MAX = all sequences in MB (most common case) --TODO: Bad name, 'sequence' and 'parallel sequence' are two different things
MBLayoutPtr m_pMBLayout; // layout associated with this
bool m_broadcastAllowed; // frame range may be broadcast from outer layout (e.g. a matrix with NULL layout and 1 column is acceptable to this frame range)
@ -434,7 +445,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
public:
// can construct from a single size_t -> a single-frame range
FrameRange(MBLayoutPtr pMBLayout, size_t timeIdxInSeq) : timeIdxInSeq(timeIdxInSeq), m_timeOffset(0), seqIndex(SIZE_MAX), m_pMBLayout(pMBLayout), m_broadcastAllowed(false), parent(nullptr) {}
FrameRange(MBLayoutPtr pMBLayout, size_t timeIdxInSeq) : timeIdxInSeq(timeIdxInSeq), m_timeOffset(0), m_timeRange(1), seqIndex(SIZE_MAX), m_pMBLayout(pMBLayout), m_broadcastAllowed(false), parent(nullptr) {}
// or without arguments -> entire minibatch / no frame-range
FrameRange(MBLayoutPtr pMBLayout) : FrameRange(pMBLayout, SIZE_MAX) {}
@ -471,7 +482,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
// create a FrameRange with a time offset
// Note: This currently does not work in conjunction with IsAllFrames(). This would be a nice-to have, but tricky w.r.t. out-of-bounds accesses.
// If IsAllFrames() then this will cause out-of-bounds slices.
FrameRange WithTimeOffset(ptrdiff_t offset) const
{
FrameRange ret = *this;
@ -479,6 +490,24 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return ret;
}
// create a FrameRange with a time range > 1
// create a copy of this FrameRange that covers 'range' consecutive frames instead of a single one
FrameRange WithTimeRange(size_t range) const
{
    FrameRange result = *this;      // all other fields are carried over unchanged
    if (!result.IsAllFrames())
        result.m_timeRange = range; // an all-frames range already spans everything; leave it as is
    return result;
}
// dimension we are iterating over; -1 means time dimension; 0 means no layout
// dimension we are iterating over; -1 means time dimension; 0 means no layout
int GetIterationDimension() const
{
    // without a layout there is nothing to iterate over; with one, we currently always iterate over time (-1)
    return m_pMBLayout ? -1 : 0; // TODO: allow user to specify other dimensions
}
class IndexIteration // range for range-based for over sequences
{
size_t m_beginIndex, m_endIndex;
@ -753,7 +782,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (startColumn >= numCols)
LogicError("DataFor: FrameRange specifies a time index that is out of range.");
if (fr.seqIndex == SIZE_MAX)
return std::pair<size_t, size_t>(startColumn, numParallelSequences);
return std::pair<size_t, size_t>(startColumn, numParallelSequences * fr.m_timeRange);
else if (fr.m_timeRange != 1)
LogicError("DataFor: FrameRange only support per-sequence time ranges with tensor slices, not matrix slices.");
else
return std::pair<size_t, size_t>(startColumn + fr.seqIndex, 1);
}
@ -778,7 +809,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// TensorSliceWithMBLayoutFor() -- Return tensor slice for a FrameRange with specified number of columns with a given MBLayout
// This implements the logic of interpreting the FrameRange object.
// Unlike the matrix version above, this supports iteration indices other than time.
// TODO: This ^^. Still missing is a field to identify the index.
// TODO: This ^^. FrameRange still missing is a field to identify the index.
// This function happily returns tensor bounds that are out of bounds, assuming caller will do the right thing.
// -----------------------------------------------------------------------
template<class DimensionVector> // e.g. std::vector<size_t> or SmallVector<size_t>
@ -787,6 +819,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
const MBLayoutPtr & pMBLayout/*the MB layout of 'data'*/)
{
std::pair<DimensionVector, DimensionVector> result;
typedef decltype(result.first[0]) ElemType;
// this creates a slice for the entire matrix, which we will then narrow down
result.first.resize(shape.size(), 0);
@ -795,8 +828,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// get position of time and sequence index
// These are only valid if we have a layout.
// In the future, the 'timeDim' will be identified by the FrameRange.
int iterDimParam = fr.GetIterationDimension();
size_t iterDim = iterDimParam > 0 ? iterDimParam - 1/*regular dimensions are specified as 1-based*/ : shape.size() + iterDimParam/*-1 for time dimension*/;
size_t sequenceDim = shape.size() - 2; // TODO: In case of multiple time dims, this must be adjusted.
size_t timeDim = sequenceDim + 1; // TODO: Get this from the FrameRange object.
// MBLayout of data and of FrameRange must be identical pointers,
// or in case of broadcasting, respective parent pointers.
@ -819,28 +853,33 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// but as a reference (e.g. it cannot be resized)
else if (!pMBLayout || fr.IsAllFrames())
{
if (fr.m_timeOffset != 0) // entire minibatch with non-zero offset exceeds bounds on at least one side
LogicError("DataFor: Iteration offset must not be specified for FrameRanges that reference the entire minibatch.");
// TODO: Can we allow this? Semantics would be different, it would crop frames outside.
if (fr.m_timeOffset)
{
if (iterDim >= result.first.size())
LogicError("DataFor: Time offset cannot be applied to tensors that have no time dimension.");
result.first[iterDim] += (ElemType)fr.m_timeOffset; // Note: If we have an offset, this is guaranteed to yield a slice that is out of bounds.
result.second[iterDim] += (ElemType)fr.m_timeOffset;
if (result.first[iterDim] > result.second[iterDim])
LogicError("DataFor: Numeric wraparound. You used a size_t vector where an int vector would be needed.");
}
}
// FrameRange refers to a time slice -> return that
else if (result.second[timeDim] > 1) // (if time dim is broadcasting then always return that one independent of requested index)
else if (result.second[iterDim] > 1) // (if time dim is broadcasting then always return that one independent of requested index)
{
size_t t = fr.timeIdxInSeq + fr.m_timeOffset;
if (t >= result.second[timeDim])
LogicError("DataFor: FrameRange specifies an iteration index that is out of range.");
result.first[timeDim] = t;
result.second[timeDim] = t + 1;
size_t ts = fr.timeIdxInSeq + fr.m_timeOffset;
size_t te = ts + fr.m_timeRange;
result.first[iterDim] = (ElemType)ts;
result.second[iterDim] = (ElemType)te;
}
// sequence index
if (fr.seqIndex != SIZE_MAX/*sequence requested*/ && pMBLayout/*have sequences*/ && result.second[sequenceDim] > 1/*>1 sequence (not broadcasting)*/)
{
size_t s = fr.seqIndex;
if (s >= result.second[sequenceDim])
LogicError("DataFor: FrameRange specifies a paralllel-sequence index that is out of range.");
result.first[sequenceDim] = s;
result.second[sequenceDim] = s + 1;
result.first[sequenceDim] = (ElemType)s;
result.second[sequenceDim] = (ElemType)s + 1;
}
return result;

Просмотреть файл

@ -104,7 +104,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void resize(size_t sz, const T & val) { if (sz < m_size) m_size = sz; else while (m_size < sz) push_back(val); }
void assign(size_t sz, const T & val) { clear(); resize(sz, val); }
template<class ITER>
void append(ITER beg, const ITER & end) { while (beg != end) push_back(*beg++); }
void append(ITER beg, const ITER & end) { while (beg != end) push_back((T)*beg++); } // typecast allows signed/unsigned conversions
template<class ITER>
void assign(ITER beg, const ITER & end) { clear(); append(beg,end); }
void operator=(const SmallVector & other) { m_size = other.m_size; memcpy(m_data, other.m_data, other.m_size * sizeof(T)); }
@ -180,8 +180,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// boilerplate
bool operator==(const TensorShape & other) const { return m_dims == other.m_dims; }
void Invalidate() { m_dims.assign(3, SIZE_MAX); } // TODO: clean up the valid/invalid situation (this is currently done inconsistently). Also this object is immutable.
// verify that this refers to a dense matrix (no strides)
void VerifyIsDense() const
{
@ -374,7 +372,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (size() != bounds.first.size() || size() != bounds.second.size())
LogicError("NarrowedTo: Bounds parameter must have same rank as tensor.");
for (size_t k = 0; k < size(); k++)
if (bounds.second[k] <= bounds.first[k] || bounds.second[k] > m_dims[k])
if (bounds.second[k] <= bounds.first[k] || (size_t)bounds.second[k] > m_dims[k])
LogicError("NarrowedTo: Invalid bounds parameter, dimensions must be at least one.");
for (size_t k = 0; k < size(); k++)
{

Просмотреть файл

@ -51,6 +51,7 @@ enum mbrclassdefinition // used to identify definition of class in minimum b
// ===========================================================================
class lattice
{
mutable int verbosity;
struct header_v1_v2
{
size_t numnodes : 32;
@ -567,11 +568,13 @@ private:
std::vector<size_t> backptroffsets; // TODO: we could change this to 'unsigned int' to save some transfer time
std::vector<unsigned short> backptrstorage; // CPU-side versions use this as the traceback buffer; CUDA code has its CUDA-side buffer
size_t numofstates; // per sil hmm
int verbosity;
public:
backpointers (const lattice & L, const msra::asr::simplesenonehmm & hset) : numofstates(0)
backpointers (const lattice & L, const msra::asr::simplesenonehmm & hset, int verbosity=0) : numofstates(0)
{
size_t edgeswithsilence = 0; // (diagnostics only: number of edges with at least one /sil/)
size_t backptrbufsize = 0; // number of entries in buffer for silence backpointer array, used as cursor as we build it
backptroffsets.resize (L.edges.size() + 1); // +1, so that the final entry determines the overall size of the allocated buffer
const size_t silUnitId = hset.gethmmid ("sil");
numofstates = hset.gethmm (silUnitId).getnumstates();
@ -595,15 +598,18 @@ private:
#if 1 // multiple /sil/ -> log this (as we are not sure whether this is actually proper--probably it is)
if (numsilunits > 1)
{
fprintf (stderr, "backpointers: lattice '%S', edge %d has %d /sil/ phonemes\n", L.getkey(), j, (int)numsilunits);
fprintf (stderr, "alignments: :");
foreach_index (a, aligntokens)
if (verbosity)
{
const auto & unit = aligntokens[a];
const auto & hmm = hset.gethmm (unit.unit);
fprintf (stderr, "%s,%.2f:", hmm.getname(), unit.frames / 100.0f);
fprintf(stderr, "backpointers: lattice '%S', edge %d has %d /sil/ phonemes\n", L.getkey(), j, (int)numsilunits);
fprintf(stderr, "alignments: :");
foreach_index(a, aligntokens)
{
const auto & unit = aligntokens[a];
const auto & hmm = hset.gethmm(unit.unit);
fprintf(stderr, "%s,%.2f:", hmm.getname(), unit.frames / 100.0f);
}
fprintf(stderr, "\n");
}
fprintf (stderr, "\n");
}
#endif
if (numsilunits > 0)
@ -611,7 +617,8 @@ private:
backptrbufsize += maxsilframes * numofstates;
}
backptroffsets[L.edges.size()] = backptrbufsize; // (TODO: remove if not actually needed)
fprintf (stderr, "backpointers: %.1f%% edges have at least one /sil/ unit inside\n", 100.0f * ((float) edgeswithsilence / L.edges.size()));
if (verbosity)
fprintf (stderr, "backpointers: %.1f%% edges have at least one /sil/ unit inside\n", 100.0f * ((float) edgeswithsilence / L.edges.size()));
}
// CUDA support
const std::vector<size_t> & getbackptroffsets() const { return backptroffsets; }
@ -1002,6 +1009,10 @@ public:
std::wstring key; // (keep our own name (key) so we can identify ourselves for diagnostics messages)
const wchar_t * getkey() const { return key.c_str(); }
// set the diagnostics verbosity level; const because 'verbosity' is a mutable member
void setverbosity(int veb) const { verbosity = veb; }
};
// ===========================================================================
@ -1016,6 +1027,8 @@ class archive
// set of lattice archive files referenced
// Note that .toc files can be concatenated, i.e. one .toc file can reference multiple archive files.
std::vector<std::wstring> archivepaths; // [archiveindex] -> archive path
std::wstring prefixPathInToc; // prefix path in a toc; using this to avoid pushd some path before start training
mutable int verbosity;
size_t getarchiveindex (const std::wstring & path) // get index of a path in archivepaths[]; create new entry if needed
{
auto iter = std::find (archivepaths.begin(), archivepaths.end(), path);
@ -1042,7 +1055,8 @@ class archive
{ // need to read the map and establish the mapping
// get the symlist file
const std::wstring symlistpath = archivepaths[archiveindex] + L".symlist";
fprintf (stderr, "getcachedidmap: reading '%S'\n", symlistpath.c_str());
if (verbosity>0)
fprintf (stderr, "getcachedidmap: reading '%S'\n", symlistpath.c_str());
std::vector<char> textbuffer;
auto lines = msra::files::fgetfilelines (symlistpath, textbuffer);
// establish mapping of each entry to the corresponding id in 'symmap'; this should fail if the symbol is not found
@ -1092,19 +1106,25 @@ class archive
public:
// construct = open the archive
//archive() : currentarchiveindex (SIZE_MAX) {}
// set the diagnostics verbosity level; const because 'verbosity' is declared mutable
void setverbosity(int veb) const { verbosity = veb; }
// test if this object is loaded with anything (if not, an empty set of TOC paths was passed--meaning disable lattice mode)
bool empty() const { return archivepaths.empty(); }
// construct from a list of TOC files
archive (const std::vector<std::wstring> & tocpaths, const std::unordered_map<std::string,size_t> & modelsymmap) : currentarchiveindex (SIZE_MAX), modelsymmap (modelsymmap)
archive (const std::vector<std::wstring> & tocpaths, const std::unordered_map<std::string,size_t> & modelsymmap, const std::wstring prefixPath=L"")
: currentarchiveindex(SIZE_MAX), modelsymmap(modelsymmap), prefixPathInToc(prefixPath), verbosity(0)
{
if (tocpaths.empty()) // nothing to read--keep silent
return;
fprintf (stderr, "archive: opening %d lattice-archive TOC files ('%S' etc.)..", (int)tocpaths.size(), tocpaths[0].c_str());
size_t onepercentage = tocpaths.size() / 100 ? tocpaths.size()/100 : 1;
foreach_index (i, tocpaths)
{
fprintf (stderr, ".");
if ( (i % onepercentage) == 0)
fprintf (stderr, ".");
open (tocpaths[i]);
}
fprintf (stderr, " %d total lattices referenced in %d archive files\n", (int)toc.size(), (int)archivepaths.size());
@ -1135,7 +1155,11 @@ public:
RuntimeError("open: invalid TOC line (no [): %s", line);
if (q != p)
{
const std::wstring archivepath = msra::strfun::utf16 (std::string (p, q - p));
std::wstring archivepath = msra::strfun::utf16 (std::string (p, q - p));
if (!prefixPathInToc.empty())
{
archivepath = prefixPathInToc + L"/" + archivepath;
}
// TODO: should we allow paths relative to TOC file?
archiveindex = getarchiveindex (archivepath);
}
@ -1207,6 +1231,7 @@ public:
fsetpos (f, offset);
// get it
L.fread (f, idmap, spunit);
L.setverbosity(verbosity);
#ifdef HACK_IN_SILENCE // hack to simulate DEL in the lattice
const size_t silunit = getid (modelsymmap, "sil");
const bool addsp = true;

Просмотреть файл

@ -23,10 +23,11 @@ public:
class latticesource
{
const msra::lattices::archive numlattices, denlattices;
int verbosity;
public:
typedef msra::dbn::latticepair latticepair;
latticesource (std::pair<std::vector<std::wstring>,std::vector<std::wstring>> latticetocs, const std::unordered_map<std::string,size_t> & modelsymmap)
: numlattices (latticetocs.first, modelsymmap), denlattices (latticetocs.second, modelsymmap) {}
latticesource (std::pair<std::vector<std::wstring>,std::vector<std::wstring>> latticetocs, const std::unordered_map<std::string,size_t> & modelsymmap, std::wstring RootPathInToc)
: numlattices (latticetocs.first, modelsymmap, RootPathInToc), denlattices (latticetocs.second, modelsymmap, RootPathInToc), verbosity(0) {}
bool empty() const
{
@ -52,6 +53,12 @@ public:
denlattices.getlattice (key, LP->second, expectedframes); // this loads the lattice from disk, using the existing L.second object
L = LP;
}
// set the verbosity level on this source and propagate it to both lattice archives
void setverbosity(int veb)
{
    verbosity = veb;
    numlattices.setverbosity(veb); // both archives are const objects; their setverbosity() is a const member
    denlattices.setverbosity(veb);
}
};
}}

Просмотреть файл

@ -296,6 +296,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CreateMatrixIfNull(m_value);
m_value->SetValue(value);
m_hasComputed = true;
SetDims(TensorShape(value.GetNumRows()), value.GetNumCols());
}
public:
bool m_hasComputed;

Просмотреть файл

@ -62,6 +62,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// break cycles
// BUGBUG: This only works if nodes are not shared across networks.
// Once we allow that (BrainScript editing), we need proper cycle detectors. Luckily, we know our cycles, so it won't be too hard.
// Or just use weak ptrs.
for (auto & iter : m_nameToNodeMap)
iter.second->DetachInputs();
@ -74,8 +75,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// serialization
// -----------------------------------------------------------------------
// save a network after editing--the network is possibly not validated/compiled yet
// Save() requires a compiled network (it calls VerifyIsCompiled()), so compile on demand first.
void ComputationNetwork::SaveEdited(const wstring& fileName, const FileOptions fileFormat)
{
if (!IsCompiled())
CompileNetwork();
Save(fileName, fileFormat);
}
void ComputationNetwork::Save(const wstring& fileName, const FileOptions fileFormat) const
{
VerifyIsCompiled("Save");
// In case of parallel training only the main node should be saving the model to prevent
// the parallel training nodes from colliding to write the same file
// TODO: This does not belong here.
@ -182,7 +192,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// load the section of nodes that contain persistable parameters
// This is used for reloading a model without recreating it, e.g. during training.
// TODO: Why not just reload it? Because SGD::Train() holds pointers to the parameters directly? That should be fixed.
template<class ElemType> void ComputationNetwork::LoadPersistableParameters(File & fstream, bool create)
template<class ElemType> void ComputationNetwork::ReadPersistableParameters(File & fstream, bool create)
{
fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BCN");
@ -221,47 +231,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ENodeList");
}
template<class ElemType> void ComputationNetwork::Load(const wstring& fileName, const FileOptions fileFormat, const bool /*bAllowNoCriterionNode --unused*/, ComputationNetwork* anotherNetwork)
// deserialize the model
// This does not post-process the model (CompileNetwork()). Use Load() instead.
template<class ElemType> void ComputationNetwork::Read(const wstring& fileName, const FileOptions fileFormat, const bool /*bAllowNoCriterionNode --unused*/, ComputationNetwork* anotherNetwork)
{
ClearNetwork();
File fstream(fileName, fileFormat | FileOptions::fileOptionsRead);
#if 1
LoadPersistableParameters<ElemType>(fstream, true);
#else
fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BCN");
// model version
size_t modelVersion = CNTK_MODEL_VERSION_1; //if version info is not there it is version 1
if (fstream.TryGetMarker(FileMarker::fileMarkerBeginSection, L"BVersion"))
{
fstream >> modelVersion;
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EVersion");
}
size_t numNodes;
fstream >> numNodes;
// get all node info first
fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BNodeList");
for (size_t i = 0; i < numNodes; i++)
{
wstring opName, nodeName;
fstream >> opName >> nodeName;
auto newNode = ComputationNetworkBuilder<ElemType>::NewNode(opName, m_deviceId, nodeName);
if (!newNode)
{
fprintf(stderr, "Unknown ComputationNode type %ls (node name %ls)\n", opName.c_str(), nodeName.c_str());
InvalidArgument("Invalid node type.");
}
newNode->Load(fstream, modelVersion);
AddNodeToNet(newNode);
}
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ENodeList");
#endif
ReadPersistableParameters<ElemType>(fstream, true);
size_t numNodes = m_nameToNodeMap.size();
@ -277,9 +255,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
vector<wstring> childrenNames;
childrenNames.resize(numChildren);
for (size_t j = 0; j < numChildren; j++)
{
fstream >> childrenNames[j];
}
// TODO: how does the file distinguish float from double?
ComputationNodeBasePtr nodePtr = GetNodeFromName(nodeName);
@ -288,42 +264,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (int j = 0; j < numChildren; j++)
childrenNodes[j] = GetNodeFromName(childrenNames[j], anotherNetwork);
//if (nodePtr->OperationName() == OperationNameOf(RowStackNode))
//{
// allow for variable input nodes
nodePtr->AttachInputs(childrenNodes);
//}
//else
//{
// // fixed input nodes
// // TODO: Use the variable-length AttachInputs() as well. This is a refactoring left-over.
// switch (numChildren)
// {
// case 1:
// nodePtr->AttachInputs(childrenNodes[0]);
// break;
// case 2:
// nodePtr->AttachInputs(childrenNodes[0], childrenNodes[1]);
// break;
// case 3:
// nodePtr->AttachInputs(childrenNodes[0],childrenNodes[1], childrenNodes[2]);
// break;
// case 4:
// nodePtr->AttachInputs(childrenNodes[0], childrenNodes[1], childrenNodes[2], childrenNodes[3]);
// break;
// case 5:
// nodePtr->AttachInputs(childrenNodes[0], childrenNodes[1], childrenNodes[2], childrenNodes[3], childrenNodes[4]);
// break;
// case 6:
// nodePtr->AttachInputs(childrenNodes[0], childrenNodes[1], childrenNodes[2], childrenNodes[3], childrenNodes[4], childrenNodes[5]);
// break;
// default:
// LogicError("Invalid number of children.");
// }
//}
nodePtr->AttachInputs(childrenNodes);
}
}
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ERelation");
fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BRootNodes");
@ -340,7 +283,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
fstream >> nodeName;
m_features.push_back(GetNodeFromName(nodeName));
}
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EFeatureNodes");
}
@ -353,7 +295,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_labels.push_back(GetNodeFromName(nodeName));
}
}
// BUGBUG: Should this be inside the block?
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ELabelNodes");
if (fstream.TryGetMarker(FileMarker::fileMarkerBeginSection, L"BCriterionNodes") ||
@ -372,13 +314,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}
// TODO: this section is defunct
// TODO: this section is defunct, skip over
if (fstream.TryGetMarker(FileMarker::fileMarkerBeginSection, L"BNodesReqMultiSeqHandling"))
{
fprintf(stderr, "WARNING: Ignoring defunct 'BNodesReqMultiSeqHandling' section in input file.\n");
fstream >> num;
for (size_t i = 0; i < num; i++)
fstream >> nodeName;
fstream >> nodeName; // dummy
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ENodesReqMultiSeqHandling");
}
@ -415,13 +357,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EPairNodes");
}
}
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ERootNodes");
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ECN");
// perform all further post-processing, caching, etc.
CompileNetwork();
}
// -----------------------------------------------------------------------
@ -622,9 +560,21 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//set sequence training parameters, e.g. smoothing weight, frame drop threshhold
template<class ElemType>
void ComputationNetwork::SetSeqParam(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, double hsmoothingWeight, double frameDropThresh, const bool doreferencealign)
void ComputationNetwork::SetSeqParam(ComputationNetworkPtr net,
const ComputationNodeBasePtr criterionNode,
const double& hsmoothingWeight,
const double& frameDropThresh,
const bool& doreferencealign,
const double& amf /*= 14.0f*/,
const double& lmf /*= 14.0f*/,
const double& wp /*= 0.0f*/,
const double& bMMIfactor /*= 0.0f*/,
const bool& sMBR /*= false*/
)
{
fprintf(stderr, "Setting Hsmoothing weight to %.8g and frame-dropping threshhold to %.8g\n", hsmoothingWeight, frameDropThresh);
fprintf(stderr, "Setting SeqGammar-related parameters: amf=%.2f, lmf=%.2f, wp=%.2f, bMMIFactor=%.2f, usesMBR=%s\n",
amf, lmf, wp, bMMIfactor, sMBR ? "true" : "false");
list<ComputationNodeBasePtr> seqNodes = net->GetNodesWithType(OperationNameOf(SequenceWithSoftmaxNode), criterionNode);
if (seqNodes.size() == 0)
{
@ -638,6 +588,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
node->SetSmoothWeight(hsmoothingWeight);
node->SetFrameDropThresh(frameDropThresh);
node->SetReferenceAlign(doreferencealign);
node->SetGammarCalculationParam(amf, lmf, wp, bMMIfactor, sMBR);
}
}
}
@ -1114,18 +1065,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
template void ComputationNetwork::InitLearnableParameters<float>(const ComputationNodeBasePtr& node, const bool uniformInit, const unsigned long randomSeed, const float initValueScale, bool initOnCPUOnly);
template void ComputationNetwork::Load<float>(const wstring& fileName, const FileOptions fileFormat, const bool bAllowNoCriterionNode, ComputationNetwork* anotherNetwork);
template void ComputationNetwork::LoadPersistableParameters<float>(File & fstream, bool create);
template void ComputationNetwork::Read<float>(const wstring& fileName, const FileOptions fileFormat, const bool bAllowNoCriterionNode, ComputationNetwork* anotherNetwork);
template void ComputationNetwork::ReadPersistableParameters<float>(File & fstream, bool create);
template void ComputationNetwork::PerformSVDecomposition<float>(const map<wstring, float>& SVDConfig, size_t alignedsize);
template /*static*/void ComputationNetwork::SetDropoutRate<float>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double & prevDropoutRate, unsigned long & dropOutSeed);
template void ComputationNetwork::SetSeqParam<float>(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, double hsmoothingWeight, double frameDropThresh, const bool doreferencealign);
template void ComputationNetwork::SetSeqParam<float>(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, const double& hsmoothingWeight, const double& frameDropThresh, const bool& doreferencealign,
const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR);
template void ComputationNetwork::InitLearnableParameters<double>(const ComputationNodeBasePtr& node, const bool uniformInit, const unsigned long randomSeed, const double initValueScale, bool initOnCPUOnly);
template void ComputationNetwork::Load<double>(const wstring& fileName, const FileOptions fileFormat, const bool bAllowNoCriterionNode, ComputationNetwork* anotherNetwork);
template void ComputationNetwork::LoadPersistableParameters<double>(File & fstream, bool create);
template void ComputationNetwork::Read<double>(const wstring& fileName, const FileOptions fileFormat, const bool bAllowNoCriterionNode, ComputationNetwork* anotherNetwork);
template void ComputationNetwork::ReadPersistableParameters<double>(File & fstream, bool create);
template void ComputationNetwork::PerformSVDecomposition<double>(const map<wstring, float>& SVDConfig, size_t alignedsize);
template /*static*/void ComputationNetwork::SetDropoutRate<double>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double & prevDropoutRate, unsigned long & dropOutSeed);
template void ComputationNetwork::SetSeqParam<double>(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, double hsmoothingWeight, double frameDropThresh, const bool doreferencealign);
template void ComputationNetwork::SetSeqParam<double>(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, const double& hsmoothingWeight, const double& frameDropThresh, const bool& doreferencealign,
const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR);
// register ComputationNetwork with the ScriptableObject system
ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<ComputationNetwork> registerComputationNetwork(L"ComputationNetwork");

Просмотреть файл

@ -78,24 +78,33 @@ public:
// -----------------------------------------------------------------------
void Save(const std::wstring& fileName, const FileOptions fileFormat = FileOptions::fileOptionsBinary) const;
void SaveEdited(const std::wstring& fileName, const FileOptions fileFormat = FileOptions::fileOptionsBinary);
private:
void SaveToFileImpl(const std::wstring& fileName, const FileOptions fileFormat) const;
public:
template<class ElemType>
void LoadPersistableParameters(File & fstream, bool create);
void ReadPersistableParameters(File & fstream, bool create);
// reload node content only, e.g. used by SGD::Train() when going back to an older model that had better training objective
template<class ElemType>
void ReloadPersistableParameters(const std::wstring& fileName)
void RereadPersistableParameters(const std::wstring& fileName)
{
File fstream(fileName, FileOptions::fileOptionsBinary | FileOptions::fileOptionsRead);
LoadPersistableParameters<ElemType>(fstream, false);
ReadPersistableParameters<ElemType>(fstream, false);
}
// design BUGBUG: binary files do not know whether they are float or double.
// TODO: modify file format to know this; then eliminate the <ElemType> dependency (and in some future, allow nodes to be different)
template<class ElemType>
void Read(const std::wstring& fileName, const FileOptions fileFormat = FileOptions::fileOptionsBinary,
const bool bAllowNoCriterionNode = false, ComputationNetwork* anotherNetwork = nullptr);
template<class ElemType>
void Load(const std::wstring& fileName, const FileOptions fileFormat = FileOptions::fileOptionsBinary,
const bool bAllowNoCriterionNode = false, ComputationNetwork* anotherNetwork = nullptr);
const bool bAllowNoCriterionNode = false, ComputationNetwork* anotherNetwork = nullptr)
{
Read<ElemType>(fileName, fileFormat, bAllowNoCriterionNode, anotherNetwork);
// perform all further post-processing, caching, etc.
CompileNetwork();
}
// static helper to instantiate a network from a file
template<class ElemType>
@ -159,9 +168,11 @@ public:
private:
void ValidateNodes(list<ComputationNodeBasePtr> nodes, bool isFinalValidationPass, size_t & todo);
void ValidateSubNetwork(const ComputationNodeBasePtr& rootNode);
void MarkValueNonSharableNodes();
private:
void DetermineSetOfAllRoots();
void CollectInputAndLearnableParameters(const ComputationNodeBasePtr& rootNode);
bool IsCompiled() const { return m_isCompiled; }
void VerifyIsCompiled(const char * where) const;
//bool BuiltAndValidatedSubNetwork(const ComputationNodeBasePtr & rootNode);
public:
@ -411,8 +422,20 @@ public:
template<class ElemType>
static void SetDropoutRate(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double & prevDropoutRate, unsigned long & dropOutSeed);
template<class ElemType>
static void SetSeqParam(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, double hsmoothingWeight, double frameDropThresh, const bool doreferencealign);
static void SetSeqParam(ComputationNetworkPtr net,
const ComputationNodeBasePtr criterionNode,
const double& hsmoothingWeight,
const double& frameDropThresh,
const bool& doreferencealign,
const double& amf=14.0f,
const double& lmf=14.0f,
const double& wp=0.0f,
const double& bMMIfactor=0.0f,
const bool& sMBR=false);
static void SetMaxTempMemSizeForCNN(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const size_t maxTempMemSizeInSamples);
// -----------------------------------------------------------------------

Просмотреть файл

@ -30,6 +30,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
std::wstring toName,
const CopyNodeFlags flags)
{
InvalidateCompiledNetwork();
if (toName == L"")
toName = fromName;
@ -50,11 +52,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
else
{
//node already exists
// node already exists
pToNode = GetNodeFromName(toName);
//same node. no copy needed
// same node. no copy needed
if (pFromNode == pToNode)
LogicError("CopyNode: You are copying the node to the same network with same node name.");
else
@ -69,6 +70,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
const std::wstring fromName, std::wstring toNamePrefix,
const CopyNodeFlags flags)
{
InvalidateCompiledNetwork();
if (!(flags & CopyNodeFlags::copyNodeValue))
LogicError("CopySubTree: you cannot copy a tree without copying the node values.");
@ -103,7 +106,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// nodeNameNew - new node name
void ComputationNetwork::RenameNode(const std::wstring& nodeNameOrig, const std::wstring& nodeNameNew)
{
// so that renamed node will not be referenced
InvalidateCompiledNetwork();
ComputationNodeBasePtr nodeToRename = GetNodeFromName(nodeNameOrig);
@ -128,7 +130,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void ComputationNetwork::DeleteNode(const std::wstring & nodeName)
{
// so that deleted node will not be referenced
InvalidateCompiledNetwork();
ComputationNodeBasePtr nodeToDelete = GetNodeFromName(nodeName);
@ -172,6 +173,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// need to update all the mappings as well childrens
void ComputationNetwork::ChangeNode(wstring nodeName, ComputationNodeBasePtr newNode)
{
InvalidateCompiledNetwork();
ComputationNodeBasePtr oldNode = GetNodeFromName(nodeName);
if (oldNode->OperationName() != newNode->OperationName())
InvalidArgument("newNode must have the same type as the old node.");
@ -204,6 +207,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// need to update those nodes who use oldNode as their child
void ComputationNetwork::ReplaceLeafNode(wstring oldNodeName, ComputationNodeBasePtr newNode)
{
InvalidateCompiledNetwork();
ComputationNodeBasePtr oldNode = GetNodeFromName(oldNodeName);
// change the input of those nodes whose child is oldNode
@ -223,6 +228,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void ComputationNetwork::ReplaceFinalCriterionNode(wstring oldNodeName, ComputationNodeBasePtr newNode)
{
InvalidateCompiledNetwork();
// Checks if the node is a criterion node.
int index = -1;
for (int i = 0; i < m_finalCriteria.size(); ++i)
@ -251,6 +258,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void ComputationNetwork::AddFeatureNode(ComputationNodeBasePtr featureNode)
{
InvalidateCompiledNetwork();
wstring nodeName = featureNode->NodeName();
if (NodeNameExists(nodeName))
RuntimeError("AddFeatureNode: feature node already exists.");
@ -261,12 +270,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// We only remove the node, not delete it.
void ComputationNetwork::RemoveFeatureNode(ComputationNodeBasePtr featureNode)
{
InvalidateCompiledNetwork();
wstring nodeName = featureNode->NodeName();
if (!NodeNameExists(nodeName))
RuntimeError("RemoveFeatureNode: feature node does not exist.");
InvalidateCompiledNetwork();
// Removes links.
for (auto nodeIter = m_nameToNodeMap.begin(); nodeIter != m_nameToNodeMap.end(); ++nodeIter)
{

Просмотреть файл

@ -10,11 +10,13 @@
#include "ComputationNode.h"
#include "ComputationNetwork.h"
#include "RecurrentNodes.h"
#include "InputAndParamNodes.h"
#include <string>
#include <vector>
#include <list>
#include <set>
#include <algorithm>
#include <map>
using namespace std;
@ -365,7 +367,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// verify that network has undergone CompileNetwork()
void ComputationNetwork::VerifyIsCompiled(const char * where) const
{
if (!m_isCompiled)
if (!IsCompiled())
LogicError("%s: A compiled network was expected.", where);
}
@ -712,6 +714,63 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
// memory allocation
// -----------------------------------------------------------------------
// mark nodes that are purely induced by parameters as non-sharable and create space for value if null
void ComputationNetwork::MarkValueNonSharableNodes()
{
const auto & nodes = GetEvalOrder(nullptr);
std::map<wstring, bool> allLeafDescendentsAreParameters;
std::list<ComputationNodeBasePtr> allLearnableParameters = GetNodesWithType(OperationNameOf(LearnableParameter));
// note that: we cannot use m_learnableParameters because we need all parameters node, regardless whether it requires update or not
for (auto& node : nodes)
{
auto children = node->GetInputs();
wstring myname = node->NodeName();
bool allParameters = true;
if (children.size()) // we don't do the check for leaf node, cause all the possible leaf nodes (input/parameters/precompute node) are marked as non-sharable already
{
for (auto child : children)
{
wstring ChildName = child->NodeName();
if (allLeafDescendentsAreParameters.find(ChildName) == allLeafDescendentsAreParameters.end())
{
// not found, means it is a leaf node (we are at eval order )
assert(child->IsLeaf() || child->IsPartOfLoop());
if (std::find(allLearnableParameters.begin(), allLearnableParameters.end(), child)!= allLearnableParameters.end())
{
allLeafDescendentsAreParameters[ChildName] = true;
}
else
{
allParameters = false;
allLeafDescendentsAreParameters[ChildName] = false;
break;
}
}
else
{
if (allLeafDescendentsAreParameters[ChildName] == false)
{
allParameters = false;
break;
}
}
}
allLeafDescendentsAreParameters[myname] = allParameters;
if (allParameters)
{
node->MarkValueNonSharable();
}
else
{
node->MarkValueSharable();
}
}
}
}
// this function will need to be called before actual validation and execution to
// predetermine how to share matrices to reduce memory usage.
@ -726,9 +785,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
VerifyIsCompiled("AllocateAllMatrices");
// Due to special topology, if a node is solely induced by parameters, its function value should not be shared
MarkValueNonSharableNodes();
bool performingBackPropagation = (trainRootNode != nullptr);
// Create a composite Eval order with the specfied nodes as roots
// Create a composite Eval order with the specified nodes as roots
std::vector<ComputationNodeBasePtr> forwardPropRoots;
forwardPropRoots.insert(forwardPropRoots.end(), evalRootNodes.begin(), evalRootNodes.end());
forwardPropRoots.insert(forwardPropRoots.end(), outValueRootNodes.begin(), outValueRootNodes.end());

Просмотреть файл

@ -136,7 +136,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
typedef std::shared_ptr<INodeState> NodeStatePtr;
virtual NodeStatePtr ExportState() = 0;
virtual void ImportState(NodeStatePtr && state) = 0;
virtual void ImportState(const NodeStatePtr & state) = 0;
};
typedef IStatefulNode::NodeStatePtr NodeStatePtr;
@ -151,7 +151,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
friend class ComputationNetwork;
ComputationNetworkOwnedNodeState() :
m_needsGradient(false)
m_needsGradient(false), m_valueSharable(true)
{
PurgeStateForFormingRecurrentLoops();
m_isPartOfLoop = false;
@ -166,10 +166,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
bool IsPartOfLoop() const { return m_isPartOfLoop; }
virtual void MarkValueNonSharable(){ m_valueSharable = false; }
virtual void MarkValueSharable() { m_valueSharable = true; }
bool isValueSharable() const { return m_valueSharable; }
protected: // TODO: should be fully encapsulated here
bool m_needsGradient; // true if this node or any children need a gradient to be computed (for own consumption or propagation to somewhere in the child tree)
bool m_valueSharable; // a flag is needed for memory share.
// If it is false (e.g., learnableParameters/InputValue and those nodes are solely induced by learnableParameters),
// it will never be released to memory pool
private:
bool m_isPartOfLoop; // true if this loop is part of a recurrent loop
@ -250,7 +257,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_deviceId(deviceId), m_outputNeededDuringBackprop(true),
m_parameterUpdateRequired(false), m_gradientInitialized(false),
m_nodeName(name == L"" ? CreateUniqNodeName() : name),
m_numRows(0), m_numCols(0)
m_numRows(0), m_numCols(0)
{ }
virtual ~ComputationNodeBase(){}
@ -348,9 +355,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
const TensorShape & GetSampleLayout() const { return m_sampleLayout; }
bool HasSampleLayout() const { return m_sampleLayout.GetRank() != 1; } // meaning does it have a layout that is not just a vector
TensorShape GetTensorShape(size_t rank) const; // form the actual tensor that describes the full object
protected:
size_t DetermineElementwiseTensorRank() const; // determine tensor rank when considering all inputs with padding
TensorShape GetTensorShape(size_t rank) const; // form the actual tensor that describes the full object
TensorShape GetTensorSliceFor(size_t rank, const FrameRange & fr) const; // form tensor shape of the slice referenced by FrameRange
public:
// access to element(0,0) without having to type-cast
@ -455,6 +462,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
LogicError("VerifyNumParallelSequences: value inconsistent with MB layout");
}
protected:
public: // ...the following should be protected, but nodes inquire about their children, requiring public access
@ -537,7 +545,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void SetOutputNeededDuringBackprop(bool f) { m_outputNeededDuringBackprop = f; }
bool IsOutputNeededDuringBackprop() const
{
return !g_shareNodeValueMatrices || m_outputNeededDuringBackprop;
return !g_shareNodeValueMatrices || m_outputNeededDuringBackprop ;
}
const size_t GetNumInputs() const { return m_inputs.size(); }
@ -769,6 +777,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
bool m_parameterUpdateRequired; // update parameters? Only used for LearnableParameters. --TODO: Should we make this a member of LearnableParameters actually? And require a type cast? Currently it is read out for all leaves.
bool m_gradientInitialized; // indicates whether the gradient matrix has been resized and initialized to 0
bool m_outputNeededDuringBackprop; // indicates whether the output value of the node is needed during backprop
};
typedef ComputationNodeBase::ComputationNodeBasePtr ComputationNodeBasePtr;
@ -902,7 +911,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//don't release matrices that need to be used in the gradient computation
virtual void ReleaseMatricesAfterForwardProp(MatrixPool& matrixPool)
{
if (!IsOutputNeededDuringBackprop() && (m_value->GetMatrixType() != SPARSE))
if (!IsOutputNeededDuringBackprop() && (m_value->GetMatrixType() != SPARSE) && isValueSharable())
ReleaseMatrixToPool(m_value, matrixPool);
}
@ -931,7 +940,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Release the Value matrix only if the output value is needed during backprop
// since in the case it isn't used, we release it during forward prop itself
if (IsOutputNeededDuringBackprop() && m_value->GetMatrixType() != SPARSE)
if (IsOutputNeededDuringBackprop() && m_value->GetMatrixType() != SPARSE && isValueSharable())
ReleaseMatrixToPool(m_value, matrixPool);
}
}
@ -1317,6 +1326,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CreateMatrixIfNull(m_gradient);
}
void MarkValueNonSharable() override
{
m_valueSharable = false;
CreateMatrixIfNull(m_value);
}
protected:
// this function is used to create matrices for those needed before matrix pool is available
@ -1532,7 +1548,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#define UsingComputationNodeMembers /*without OperationName; needed to support inconsistent pattern of InputValue--TODO: This comment it out of date. */ \
protected: \
typedef shared_ptr<ComputationNode<ElemType>> ComputationNodePtr; \
using Base::m_deviceId; using Base::GetDeviceId; using Base::SetDims; using Base::SetDims1; using Base::SetNumCols; using Base::GetNumRows; using Base::GetNumCols; using Base::UpdateFunctionValuesSize; using Base::LoadValue; \
using Base::m_deviceId; using Base::shared_from_this; using Base::GetDeviceId; using Base::SetDims; using Base::SetDims1; using Base::SetNumCols; \
using Base::GetNumRows; using Base::GetNumCols; using Base::GetTensorShape; using Base::UpdateFunctionValuesSize; using Base::LoadValue; \
using Base::m_pMBLayout; using Base::GetNumTimeSteps; using Base::GetNumParallelSequences; \
using Base::MaskMissingColumnsToZero; using Base::MaskMissingValueColumnsToZero; using Base::MaskMissingGradientColumnsToZero; using Base::InvalidateMissingValueColumns; using Base::InvalidateMissingGradientColumns; \
using Base::DataFor; using Base::ValueFor; using Base::Gradient; using Base::GradientFor; \

Просмотреть файл

@ -813,9 +813,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Switch batch-normalization between evaluation and training mode.
void SetEvalMode(bool bnEvalMode)
{
    m_eval = bnEvalMode;
}
private:
struct VersionInfo
{

Просмотреть файл

@ -41,6 +41,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Base(deviceId, name)
{
m_parameterUpdateRequired = true;
this->m_valueSharable = false;
SetDims(TensorShape(), 0);
}
LearnableParameter(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & shape) :
@ -48,6 +49,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
m_parameterUpdateRequired = true;
CreateMatrixIfNull(m_value);
this->m_valueSharable = false;
// for now we split off the trailing dimension into the matrix column dimension
// TODO: This is for compat, but is is inconsistent. Decide what a sample layout means for a node without MBLayout w.r.t. non-tensor ops.
auto dims = shape.GetDims();
@ -197,6 +199,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
PrintNodeValuesToFile(printValues, fstream);
}
};
#if 0
@ -261,6 +264,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
SetDims(sampleLayout, 0);
UpdateFunctionValuesSize(); // we must allocate the matrix so that the readers get objects with valid row dimensions (some readers expect that)
m_parameterUpdateRequired = false;
this->m_valueSharable = false;
}
protected:
InputValueBase(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & sampleLayout, bool isSparse) :

Просмотреть файл

@ -44,7 +44,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override
{
static int c = 0; if (c++ == 0) { fprintf(stderr, "#NLop%d#\n", (int)opForward); }
//static int c = 0; if (c++ == 0) { fprintf(stderr, "#NLop%d#\n", (int)opForward); }
size_t rank = DetermineElementwiseTensorRank();
auto result = ValueTensorFor(rank, fr);

Просмотреть файл

@ -9,6 +9,7 @@
#include "Matrix.h"
#include "TensorShape.h"
#include "ComputationNode.h"
#include "Sequences.h"
#include <unordered_set>
#include <map>
@ -26,7 +27,7 @@
namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
// ShiftNode (input, fromOffset, boundaryValue, dim=-1, numSteps=1, insertDim=0) -- delay and rolling window
// ShiftNode (input, fromOffset, boundaryValue, dim=-1) -- delay and rolling window
//
// This shifts the input by (-fromOffset) steps. In other words, output(t) will be input(t+fromOffset).
// E.g. for fromOffset=-1, this gives the past value.
@ -34,36 +35,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//
// This node can be used in a recurrent loop. This requires special handling by the ComputationNetwork,
// for both execution (sequential execution) and creation (avoiding circular references).
// TODO: When outside a recurrent loop and used with frame randomization, this will communicate to the reader
// that additional frames are needed, which will then return a frame range. TODO: This will not match
// the labels, which are still 1 frame. Think through which dimension this should go in.
//
// Values shifted in from beyond sequence boundaries will be copied from boundaryValue.
// Normally, this is a scalar Constant(). However, it can be any node, which will be indexed from the end
// (e.g. for fromOffset=-1, the last frame of boundaryValue will be used). This can implement
// sequence-to-sequence models. Broadcasting is supported, so it can be e.g. a single output-dimension vector
// (e.g. for fromOffset=-1, the last frame of boundaryValue will be used). This can implement the basic
// sequence-to-sequence model. Broadcasting is supported, so it can be e.g. a single output-dimension vector
// applied to all sequences.
//
// To delay (past value), use negative fromOffset. To access future value, use positive fromOffset.
//
// To pull in multiple offsets, use offsetRange>1. This will pull in offsetRange consecutive offsets starting
// with fromOffset. This implements a rolling window. A new dimension will be inserted at multiOffsetDim
// (default 0 means after the last sample dimension). Special considerations:
// - If the boundaryValue is not wide enough, the sequence will be dropped (e.g. if you pull in 5 history frames,
// but the sequence in boundaryValue only has 4 samples).
// - If you feed back such an expanded output into this node in a loop, you get an inconsistency
// and will eventually fail. You must pull the dimensions apart.
// - If the current time step (offset 0) is included in the range (e.g. fromOffset=-1, offsetRange=3) then
// this node cannot participate in a recurrence.
//
// By default, this shifts over the time dimension, but you can choose to shift over any
// sample tensor dimension instead using 'dim' (-1 stands for time). This will only work, however,
// when all involved nodes are implemented using the tensor library. Nodes implemented using
// Matrix slices can only support iterating over time.
//
// If the boundaryValue has 0 elements, the sequence will be trimmed (frames reaching beyond the boundary
// are dropped). This will initially not be implemented for the time dimension (as it would require
// change of MBLayout).
// -----------------------------------------------------------------------
template<class ElemType>
@ -74,24 +58,24 @@ namespace Microsoft { namespace MSR { namespace CNTK {
public:
enum BoundaryMode : int // how to fill frames at boundaries
{
reachAcross = -1, // go across the boundary: use boundaryValue. This is for recurrence.
duplicate = 0, // duplicate frame at boundary, e.g. duplicate first frame. Non-recurrent mode only.
trim = 1 // drop frames. Non-recurrent mode only.
reachAcross = -1, // go across the boundary: use boundaryValue
duplicate = 0 // duplicate frame at boundary, e.g. duplicate first frame. Non-recurrent mode only.
};
// Main constructor. fromOffset: shift amount (negative = past value, positive = future value);
// boundaryMode: how frames beyond sequence boundaries are filled; shiftDimParam: dimension to
// shift over as given by the user (-1 = time). The actual shift dimension (m_shiftDim) is
// resolved later (SIZE_MAX = not yet determined).
ShiftNode(DEVICEID_TYPE deviceId, const wstring & name, int fromOffset, BoundaryMode boundaryMode, int shiftDimParam) :
    Base(deviceId, name), m_fromOffset(fromOffset),
    m_boundaryMode(boundaryMode),
    m_shiftDimParam(shiftDimParam),
    m_shiftDim(SIZE_MAX),
    m_state(deviceId)
{
    CreateMatrixIfNull(m_value);
    SetDims(TensorShape(), 0); // empty for now; real dims are inferred during validation
}
// Default-argument constructor: shift by one step across the boundary over the time dimension (dim = -1).
ShiftNode(DEVICEID_TYPE deviceId, const wstring & name) :
    ShiftNode(deviceId, name, 1, BoundaryMode::reachAcross, -1)
{ }
ShiftNode(const ScriptableObjects::IConfigRecordPtr configp) :
ShiftNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"fromOffset"), (BoundaryMode)(int)configp->Get(L"boundaryMode"), configp->Get(L"dim"), configp->Get(L"numSteps"), configp->Get(L"insertedDim"))
ShiftNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"fromOffset"), (BoundaryMode)(int)configp->Get(L"boundaryMode"), configp->Get(L"dim"))
{
// We do NOT attach the inputs, as we cannot resolve the main input without causing a circular reference.
// Instead, we capture them in a lambda, which will be called by ComputationNetwork during the build process through LateAttachInputs() below.
@ -111,19 +95,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Persist construction-time parameters only; runtime/truncated-BPTT state (m_state) is not saved.
void Save(File& fstream) const
{
    Base::Save(fstream);
    fstream << m_fromOffset << m_boundaryMode << m_shiftDimParam;
}
virtual void Load(File& fstream, size_t modelVersion) override
{
Base::Load(fstream, modelVersion);
fstream >> m_fromOffset >> m_numSteps >> m_boundaryMode >> m_shiftDimension >> m_insertedDimParam;
}
virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
{
assert(inputIndex == 0); inputIndex;
fr;
fstream >> m_fromOffset >> m_boundaryMode >> m_shiftDimParam;
}
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
@ -133,6 +111,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
Base::BeginForwardProp();
// TODO: If we have a truncated-BPTT state then verify that the sequence indices match with m_state->m_sequences, and the tensor dimensions.
// in case of trimming, narrow the layout
// We actually do not drop content, only reduce the range of sequences.
// This is meant to optimize for the case where we have multiple sequences concatenated while trimming a small amount only.
@ -142,34 +122,216 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
Base::EndForwardProp();
// In BPTT, we carry over left-to-right state across minibatches.
// In truncated BPTT, we carry over left-to-right state across minibatches.
// The necessary frames are stored in m_state->m_delayedValue.
// Only if layout has anything exceeding the MB.
if (GetMBLayout()->HasSequenceBeyondEnd()) // only if layout has any sequence that has ends beyond this minibatch
{
}
else
m_state.clear();
}
private:
typedef std::pair<SmallVector<int>, SmallVector<int>> SliceBounds; // slice bounds for dimension k are [first[k], second[k]) (think STL begin/end)
// Wrap 'data' in a TensorView restricted to the given slice of its original shape.
// 'shape' is taken by value on purpose: narrowing mutates a local copy only.
TensorView<ElemType> DataTensorFor(Matrix<ElemType> & data, TensorShape shape/*original shape of 'data'*/, SliceBounds slice)
{
    TensorShape narrowed = shape;
    narrowed.NarrowTo(slice);
    return TensorView<ElemType>(data, narrowed);
}
// This function assumes BeginForwardProp/EndForwardProp() to be called before/after the iteration loop.
// helper to shift dimension 'm_shiftDim' of SliceBounds by an offset (a common operation below)
// Return a copy of 'in' with both bounds of dimension m_shiftDim moved by 'shiftBy'.
SliceBounds ShiftDim(const SliceBounds & in, int shiftBy)
{
    auto shifted = in;
    shifted.first[m_shiftDim]  += shiftBy; // begin bound
    shifted.second[m_shiftDim] += shiftBy; // end bound
    return shifted;
}
// Convert a TensorShape's (unsigned) dimensions to signed ints.
// Signed bounds are required because shifting can move slice bounds into negative ranges.
static SmallVector<int> ToIntDims(const TensorShape & shape)
{
    const auto & dims = shape.GetDims();
    SmallVector<int> signedDims;
    signedDims.append(dims.begin(), dims.end());
    return signedDims;
}
// determine shapes and slices to move
// This is used for both forward and backprop.
// 'In' below refers to Input(0) where 'Out' refers to the output of *this.
void DetermineSlices(size_t rank, const FrameRange & fr,
TensorShape & inShape, TensorShape & outShape, // our MB's shape
SliceBounds & inSliceLogical, SliceBounds & outSliceLogical) // the logical ranges to shift
{
// get the slice bounds for the given FrameRange
outShape = GetTensorShape(rank); // describes the full tensor including sequence and time dimensions
inShape = Input(0)->GetTensorShape(rank);
// determine the logical in and out slices
// This may now have bounds that fall outside, which we need to split off next.
outSliceLogical = TensorSliceWithMBLayoutFor(ToIntDims(outShape), fr, GetMBLayout());
inSliceLogical = TensorSliceWithMBLayoutFor(ToIntDims(inShape), fr.WithTimeOffset(m_fromOffset), GetMBLayout()); // apply the offset
}
// determine stripes to move w.r.t. main storage and from/to state
// For efficiency:
// - this function assumes that the return values have been freshly constructed (it won't reset them)
// - it may return a slice with end < begin which indicates an empty slice
void PartitionSlices(const SliceBounds & inSliceLogical, const SliceBounds & outSliceLogical, // the move we want to make
int T, // our actual size
SliceBounds & inSliceMain, SliceBounds & outSliceMain, // the part that goes main-to-main
SliceBounds & inSliceState, SliceBounds & outSliceState) // the part that goes from/to state
{
inSliceMain = inSliceLogical;
outSliceMain = outSliceLogical;
if (inSliceMain.first[m_shiftDim] < 0)
{
assert(inSliceMain.second[m_shiftDim] < T);
if (!m_state.empty()) // truncated BPTT case
{
// determine range that lives in state
SliceBounds inSliceOutside = inSliceMain; // beginning falls to the left of the MB
if (inSliceOutside.second[m_shiftDim] > 0)
inSliceOutside.second[m_shiftDim] = 0; // trim end; e.g. [-2,97) -> [-2,0), but [-2,-1) remains
// now inSliceOutside represents only the region that falls outside
// map to dimensions of our saved state
SliceBounds inSliceState = ShiftDim(inSliceOutside, m_state.m_shape[m_shiftDim]);
// E.g. for offset = -4, m_state will be 4 elements, so [-2,0) -> [2,4), and [-2,-1) -> [2,3)
// map to target dimensions
SliceBounds outSliceState = ShiftDim(inSliceOutside, -m_fromOffset);
assert(inSliceState == outSliceState); // (when we fall out on the left, both must be the same)
}
// else: no truncated BPTT means we must have a proper boundary. So don't write those values here, they will be initialized with boundary values below.
// and trim main (if 'from' is entirely outside, such as in the common single-frame case, we get begin >= end)
outSliceMain.first[m_shiftDim] += -inSliceMain.first[m_shiftDim];
inSliceMain.first[m_shiftDim] += -inSliceMain.first[m_shiftDim];
assert(inSliceMain.first[m_shiftDim] == 0);
}
else if (inSliceMain.second[m_shiftDim] > T)
{
if (!m_state.empty())
{
// determine range to get from state
SliceBounds inSliceOutside = inSliceMain;
if (inSliceOutside.first[m_shiftDim] < T)
inSliceOutside.first[m_shiftDim] = T; // trim end; e.g. [2,102) -> [100,102), but [101,102) remains
// now inSliceOutside is where we should copy from, with indices completely out of bounds
// map to dimensions of our saved state
SliceBounds inSliceState = ShiftDim(inSliceOutside, -T);
// E.g. for offset = 4, m_state will be 4 elements, so [100,102) -> [0,2), and [101,102) -> [1,2)
// map to target dimensions
SliceBounds outSliceState = ShiftDim(inSliceOutside, T - m_fromOffset);
// E.g. [0,2) -> [96,98), and [1,2) -> [97,98)
}
// and trim main (if 'from' is entirely outside, such as in the common single-frame case, we get begin >= end)
outSliceMain.first[m_shiftDim] -= (inSliceMain.second[m_shiftDim] - T);
inSliceMain.second[m_shiftDim] -= (inSliceMain.second[m_shiftDim] - T);
assert(inSliceMain.second[m_shiftDim] == T);
}
}
public:
virtual void ForwardProp(const FrameRange & fr) override
{
if (fr.GetIterationDimension() != m_shiftDimParam)
LogicError("ShiftNode::ForwardProp(): FrameRange not iterating over user-specified dimension.");
// for debugging, invalidate the output region, so we will catch if we missed to update something
#ifdef _DEBUG
ValueFor(fr).Invalidate();
#endif
// STEP 1: whole-sale copy a shifted version of the input to the output
// - consider the saved parts from the last minibatch as part of the input at dimensions beyond the bounds
// - ignore boundary conditions for now
// - ignore boundary conditions at this point (will be fixed subsequently)
// This will copy a little too much in case of multiple concatenated sequences within a single parallel sequence.
// get the tensors without shift
// get the logical ranges we want to shift
TensorShape inShape, outShape; // expanded tensor shapes of input and output
SliceBounds inSliceLogical, outSliceLogical; // the logical ranges to shift
size_t rank = DetermineElementwiseTensorRank();
auto result = ValueTensorFor(rank, fr);
auto input = Input(0)->ValueTensorFor(rank, fr);
DetermineSlices(rank, fr, inShape, outShape, inSliceLogical, outSliceLogical);
// shift the dimension in the input
// now copy the two stripes--one that is main-to-main, and one that pulls in data from previous state (truncated BPTT only)
// This correctly handles if input is a tensor with strides. This is currently not the case, but may be if we support in-place.
SliceBounds inSliceMain, outSliceMain; // main-to-main
SliceBounds inSliceState, outSliceState; // from state
PartitionSlices(inSliceLogical, outSliceLogical, outShape[m_shiftDim], inSliceMain, outSliceMain, inSliceState, outSliceState);
if (!inSliceState.first.empty() && inSliceState.second[m_shiftDim] > inSliceState.first[m_shiftDim])
{
// Note: If all sequences begin at the start of the range, this would copy invalid values which would be overwrittten below.
// This is prevented in that m_state will be set to empty in the previous MB if all sequences ended, which will in turn return an empty slice.
auto from = DataTensorFor(m_state.m_delayedValue, m_state.m_shape, inSliceState);
auto to = DataTensorFor(Value(), outShape, outSliceState);
to.AssignCopyOf(from);
}
if (inSliceMain.second[m_shiftDim] > inSliceMain.first[m_shiftDim])
{
auto from = DataTensorFor(Input(0)->Value(), inShape, inSliceMain);
auto to = DataTensorFor( Value(), outShape, outSliceMain);
to.AssignCopyOf(from);
}
// We have now pulled anything from within the logical bounds.
// Any frame that pulls from outside contains invalid values (either not initialized or copied from incorrect source), which must be fixed next.
// STEP 2: fix up the boundary conditions
// - fill in xxx
// - fill in all frames that are too close to boundary and must be filled from context (recurrent) or by replication (non-recurrent only)
// turn selected frame and shifted frame into a tensor
if (fr.IsAllFrames() || GetMBLayout()->IsBeyondStartOrEnd(fr.WithTimeOffset(m_fromOffset))) // short-cut test whether there is anything to do
{
auto ts = outSliceLogical.first[m_shiftDim];
auto te = outSliceLogical.second[m_shiftDim];
//size_t sequenceDim = outShape.size() - 2; // TODO: In case of multiple time dims, this must be adjusted. Code dup from TensorSliceWithMBLayoutFor(). Encapsulate this.
// iterate over all sequences in this batch and handle all that overlap with the target region
for (const auto & seq : GetMBLayout()->GetAllSequences())
{
if (seq.tEnd <= ts || seq.tBegin >= te) // no overlap--skip
continue;
// copy all that's in range
// get tensor to fill in. This may be out of bounds, and may only partially overlap with [ts,te)
auto seqLen = abs(m_fromOffset);
auto seqBegin = m_fromOffset < 0 ? seq.tBegin : seq.tBegin + seq.GetNumTimeSteps() - seqLen; // e.g. m_fromOffset = -4 -> [0,4) , +4 -> [Len-4,Len)
auto outSliceFill = TensorSliceWithMBLayoutFor(ToIntDims(outShape), fr.WithTimeOffset(seqBegin).WithTimeRange(seqLen).Sequence(seq.s), GetMBLayout());
// fix up all that is not
// get tensor to fill from
// We fill either from the provided boundary node or from ourselves (BoundaryMode::duplicate = clamp).
bool clamp = m_boundaryMode == BoundaryMode::duplicate;
ComputationNodeBasePtr boundaryNode = clamp ? shared_from_this() : Input(0);
auto boundaryShape = boundaryNode->GetTensorShape(rank);
auto fromSeq = clamp ?
seq.s :
boundaryNode->HasMBLayout() ?
boundaryNode->GetMBLayout()->FindSequence(seq.seqId).seqId :
SIZE_MAX;
auto fromBegin = 0;
auto boundarySliceLogical = TensorSliceWithMBLayoutFor(ToIntDims(boundaryShape), fr.WithTimeOffset(fromBegin).WithTimeRange(seqLen).Sequence(fromSeq), GetMBLayout());
boundarySliceLogical;
//inSliceLogical = TensorSliceWithMBLayoutFor(ToIntDims(inShape), fr.WithTimeOffset(m_fromOffset), GetMBLayout()); // apply the offset
// clip against [ts,te)
// copy
sin(1);
}
}
}
virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
{
// To allow for bulk gradient computation, we will clear out any gradient that should not be propagated.
// We do that directly to our incoming output gradient. This is OK because we own this, and it is no longer used after this operation
// (it is invalid to call BackpropTo() multiple times since it adds to the outgoing Input() gradient).
assert(inputIndex == 0); inputIndex;
fr;
}
virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
@ -177,46 +339,29 @@ namespace Microsoft { namespace MSR { namespace CNTK {
assert(m_inputs.size() == 2);
ComputationNodeBase::Validate(isFinalValidationPass);
if (isFinalValidationPass)
sin(1.0f);
// MBLayout is just inherited
m_pMBLayout = Input(0)->GetMBLayout();
if (isFinalValidationPass && !m_pMBLayout)
InvalidArgument("%ls %ls operation must operate on data (must have an MB Layout).", NodeName().c_str(), OperationName().c_str());
// determine final sample layout
auto inputSampleLayout = Input(0)->GetSampleLayout();
auto inputDims = inputSampleLayout.GetDims();
if (m_insertedDimParam < 0)
InvalidArgument("%ls %ls operation: Specified insertion location %d refers to a time dimension, but this is not allowed.",
NodeName().c_str(), OperationName().c_str(), m_insertedDimParam);
m_insertExpandShapeAt = m_numSteps > 1 ? 0 : (m_insertedDimParam > 0 ? m_insertedDimParam - 1 : inputDims.size());
if (m_insertExpandShapeAt > inputDims.size())
if (isFinalValidationPass)
InvalidArgument("%ls %ls operation: Specified insertion location %d beyond end of input sample layout [%s].",
NodeName().c_str(), OperationName().c_str(), m_insertedDimParam, string(inputSampleLayout).c_str());
else
m_insertExpandShapeAt = inputDims.size(); // this may be an error, but we want to catch that only in the final pass
SmallVector<size_t> dims;
if (m_numSteps > 1 && inputDims.size() + 1 > dims.capacity())
InvalidArgument("%ls %ls operation: Too many dimensions. Did you feed back output of this node without stripping the extra dimensions?",
NodeName().c_str(), OperationName().c_str());
dims.append(inputDims.begin(), inputDims.begin() + m_insertExpandShapeAt);
if (m_numSteps > 1) // insert the new dimension if we expand into more than one step
dims.push_back(m_numSteps);
dims.append(inputDims.begin() + m_insertExpandShapeAt, inputDims.end());
auto sampleLayout = TensorShape(dims);
// as is the sample layout
SetDims(Input(0));
SetDims(sampleLayout, 0);
// determine the dimension that is to be shifted (convert user-specified as a zero-based index)
if (isFinalValidationPass)
{
size_t rank = DetermineElementwiseTensorRank();
auto valueShape = GetTensorShape(rank); // bounds of the Value()
m_shiftDim = m_shiftDimParam > 0 ? m_shiftDimParam - 1/*regular dimensions are specified as 1-based*/ : valueShape.size() + m_shiftDimParam/*-1 for time dimension*/;
}
}
// special interface for use by loop detection
virtual int /*IRecurrentNode::*/GetRecurrenceSteppingDirection() const override
{
if (m_boundaryMode != BoundaryMode::reachAcross)
if (m_boundaryMode != BoundaryMode::reachAcross) // duplicating boundary frames cannot be done with recurrence
return 0;
else if (m_fromOffset + (int)m_numSteps <= 0)
else if (m_fromOffset < 0)
return +1;
else if (m_fromOffset > 0)
return -1;
@ -231,48 +376,61 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
auto node = dynamic_pointer_cast<ShiftNode<ElemType>>(nodeP);
node->m_fromOffset = m_fromOffset;
node->m_numSteps = m_numSteps;
node->m_boundaryMode = m_boundaryMode;
node->m_shiftDimension = m_shiftDimension;
node->m_insertedDimParam = m_insertedDimParam;
node->m_insertExpandShapeAt = m_insertExpandShapeAt;
node->m_shiftDimParam = m_shiftDimParam;
node->m_shiftDim = m_shiftDim;
node->m_state = m_state;
}
}
class ShiftNodeState : public INodeState
{
Matrix<ElemType> m_delayedValue; // saves the activation of the previous step that this node points to
vector<MBLayout::SequenceInfo> m_delayedSequences; // and associated sequence info. This is only used for consistency checking (it must match).
public:
Matrix<ElemType> m_delayedValue; // saves the activation of the previous step that this node points to
TensorShape m_shape; // tensor shape that describes m_delayedValue
vector<MBLayout::SequenceInfo> m_delayedSequences; // and associated sequence info. This is only used for consistency checking (it must match).
ShiftNodeState(DEVICEID_TYPE deviceId) : m_delayedValue(deviceId) { }
bool empty() const { return m_delayedSequences.empty(); }
void clear() { m_delayedValue.Resize(0, 0); m_shape = TensorShape(); m_delayedSequences.clear(); }
};
typedef std::shared_ptr<ShiftNodeState> ShiftNodeStatePtr;
// state export/import
// This is done with a shared_ptr. The moment state is exported, the internal state is cleared; ownership is transferred to the exporting entity.
// This way, the next invocation does not overwrite the exported state, but is required to create a new one if needed.
// On the other hand, once imported, the state object is owned by the node and will be overwritten with the next state.
virtual NodeStatePtr ExportState() { return std::move(m_state); }
virtual void ImportState(NodeStatePtr && state) override
// This is done with a shared_ptr. The current state is exported, the internal state is cleared.
// Ownership of members is logically transferred to the exporting entity.
// Physically, however, since we often transfer between CPU and GPU, activation data is merely copied,
// and the GPU or CPU object resized to (0,0) without giving up the memory.
virtual NodeStatePtr ExportState() // TODO: can we instead pass the shared_ptr object in? So we don't need to create a new one all the time? Or should we still take ownership of the ptr?
{
m_state = dynamic_pointer_cast<ShiftNodeState>(state);
if (state && !m_state)
auto state = make_shared<ShiftNodeState>(CPUDEVICE);
state->m_delayedValue.SetValue(m_state.m_delayedValue); // note: this will transfer from GPU to CPU
m_state.m_delayedValue.Resize(0, 0);
state->m_shape = std::move(m_state.m_shape);
state->m_delayedSequences = std::move(m_state.m_delayedSequences);
return state;
}
virtual void ImportState(const NodeStatePtr & statep) override
{
ShiftNodeStatePtr state = dynamic_pointer_cast<ShiftNodeState>(statep);
if (!state)
LogicError("ImportState: Wrong state object passed (wrong type).");
m_state.m_delayedValue.SetValue(state->m_delayedValue); // note: this will transfer from CPU to GPU
state->m_delayedValue.Resize(0, 0);
m_state.m_shape = std::move(state->m_shape);
m_state.m_delayedSequences = std::move(state->m_delayedSequences);
}
protected:
// parameters remembered from construction
int m_fromOffset; // offset to pull from
int m_numSteps; // offset range
BoundaryMode m_boundaryMode; // how to fill at the boundary (reach across, duplicate, or trim)
int m_shiftDimension; // dimension to shift (default: time)
int m_insertedDimParam; // in case of multiple steps, this is where a new dimension will be inserted
int m_fromOffset; // offset to pull from
BoundaryMode m_boundaryMode; // how to fill at the boundary (reach across or duplicate)
int m_shiftDimParam; // dimension to shift (default: time)
// derived params set up in Validate()
size_t m_insertExpandShapeAt; // at which dimension to insert (internal 0-based index)
size_t m_shiftDim; // m_shiftDimParam matched to the real tensor index
ShiftNodeStatePtr m_state; // saves the activation of the previous step that this node points to
ShiftNodeState m_state; // state that is carried over across evaluations
// Note: The version held by this node lives in the GPU, whereas the versions being exported carry CPU-side copies
function<void()> m_attachInputsFn; // for late expansion of inputs (scripting)
function<void()> m_attachInputsFn; // for late expansion of inputs (scripting)
};
// -----------------------------------------------------------------------
@ -333,7 +491,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// - ranges of neighbor frames as a secondary tensor dimension (i.e. can be used to implement a rolling window)
// - full support/efficiency of non-recurrent use (in which case the range can be from negative to positive, e.g. a symmetric rolling window)
// - denoting which tensor dimension to loop over (this may not be completed, but I will plant a seed)
// - support for Yongqiangs sub-minibatching with BPTT (export/import state)
// - support for Yongqiangs sub-minibatching with truncated BPTT (export/import state)
// - more efficient storage of carried-over state (only store the needed frames, not a full copy of the previous MB as currently; which will on the other hand also allow windows that reach back beyond a minibatch)
// -----------------------------------------------------------------------
@ -486,7 +644,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
virtual void EndForwardProp() override // called after last iteration step of ForwardProp()
{
// In BPTT, we carry over left-to-right state across minibatches.
// In truncated BPTT, we carry over left-to-right state across minibatches.
// It is kept in m_delayedValue, m_delayedActivationMBLayout.
// This could be optimized as follows:
// - only keep the required number of frames (m_timeStep)
@ -620,27 +778,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
if (dir == -1) // we look into past
{
#if 0
bool allAtBoundary = true;
// if the current last frames are all sentence end or no feature , there is no need to carry on state info
if (m_pMBLayout->Is(FrameRange(nT-1), MinibatchPackingFlags::SequenceEnd | MinibatchPackingFlags::NoFeature))
{
for (size_t u = 0; u < nU; u++)
{
if (!m_pMBLayout->Is(FrameRange(nT - 1).Sequence(u), MinibatchPackingFlags::SequenceEnd | MinibatchPackingFlags::NoFeature))
{
allAtBoundary = false;
break;
}
}
}
else
{
allAtBoundary = false;
}
if (allAtBoundary)
#endif
if (!m_pMBLayout->HasSequenceBeyondEnd()) // only need to export state if anything crosses the MB boundary
{
auto pState = make_shared<DelayedValueNodeState<ElemType>>(m_deviceId);
@ -655,26 +792,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
pExportedState = pState;
}
}
if (dir == 1) // we look into future
else if (dir == 1) // we look into future
{
#if 0
// TODO: check whether all at boundary and don't carry state if it is the case
size_t nT = m_pMBLayout->GetNumTimeSteps();
size_t nU = m_pMBLayout->GetNumParallelSequences();
bool allAtBoundary = true;
if (m_pMBLayout->Is(FrameRange(nullptr, 0), MinibatchPackingFlags::NoFeature | MinibatchPackingFlags::SequenceStart))
{
for (size_t u = 0; u < nU; u++)
{
if (!m_pMBLayout->Is(FrameRange(nullptr, 0).Sequence(u), MinibatchPackingFlags::SequenceStart | MinibatchPackingFlags::NoFeature))
{
allAtBoundary = false;
break;
}
}
}
if (allAtBoundary)
#endif
if (!m_pMBLayout->HasSequenceBeyondBegin()) // only need to export state if anything crosses the MB boundary
{
auto pState = make_shared<DelayedValueNodeState<ElemType>>(m_deviceId);
@ -689,19 +808,19 @@ namespace Microsoft { namespace MSR { namespace CNTK {
pExportedState = pState;
}
}
if (dir != -1 && dir != 1)
else
{
RuntimeError("Unrecognized direction in DelayedValueNodeBase");
LogicError("Unrecognized direction in DelayedValueNodeBase");
}
return pExportedState;
}
virtual void /*IStatefulNode::*/ImportState(NodeStatePtr && pImportedState) override
virtual void /*IStatefulNode::*/ImportState(const NodeStatePtr & pImportedState) override
{
DelayedNodeStatePtr pState = dynamic_pointer_cast<DelayedValueNodeState<ElemType>> (pImportedState);
if (!pState)
RuntimeError("Expecting DelayValueNodeState after down casting");
LogicError("Expecting DelayValueNodeState after downcasting");
pState->ExportDelayedMBLayout(m_delayedActivationMBLayout); // pstate copy to m_delayedActivationMBLayout
if (pState->IsEmpty())
@ -715,18 +834,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
int dir = direction;
if (dir == -1) // looking backward
{
m_delayedValue.SetColumnSlice(delayedActivation, (nT - 1)*nU, nU);
}
if (dir == 1)
{
//m_delayedValue.CopyColumnsStrided(delayedActivation, nU, 1, nT);
else if (dir == 1)
m_delayedValue.SetColumnSlice(delayedActivation, 0, nU);
}
if (dir != -1 && dir == 1)
{// it is really a compile error ?
RuntimeError("Unrecognized direction in DelayedValueNodeBase");
}
else
LogicError("Unrecognized direction in DelayedValueNodeBase");
}
protected:

Просмотреть файл

@ -1234,8 +1234,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
else if (inputIndex == 1)
{
BackpropToRight(*m_softmaxOfRight, Input(0)->Value(), Input(inputIndex)->Gradient(),
Gradient(), *m_gammaFromLattice, m_fsSmoothingWeight, m_frameDropThreshold);
FrameRange fr(Input(0)->GetMBLayout());
BackpropToRight(*m_softmaxOfRight, Input(0)->Value(), Input(inputIndex)->Gradient(),
Gradient(), *m_gammaFromLattice, m_fsSmoothingWeight, m_frameDropThreshold);
MaskMissingColumnsToZero(Input(inputIndex)->Gradient(), Input(0)->GetMBLayout(), fr);
#ifdef _DEBUG
Input(inputIndex)->InvalidateMissingGradientColumns(FrameRange(Input(inputIndex)->GetMBLayout()));
#endif
@ -1368,14 +1371,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
RequestMatrixFromPool(m_gammaFromLattice, matrixPool);
}
// Release gradient and temp matrices that are no longer needed after all the children's gradients are computed.
virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
ReleaseMatrixToPool(m_logSoftmaxOfRight, matrixPool);
ReleaseMatrixToPool(m_softmaxOfRight, matrixPool);
ReleaseMatrixToPool(m_gammaFromLattice, matrixPool);
}
//request matrices needed to do node function value evaluation
virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
ReleaseMatrixToPool(m_logSoftmaxOfRight, matrixPool);
ReleaseMatrixToPool(m_softmaxOfRight, matrixPool);
ReleaseMatrixToPool(m_gammaFromLattice, matrixPool);
}
// TODO: method names should be CamelCase
std::vector<shared_ptr<const msra::dbn::latticepair>> * getLatticePtr()
@ -1415,6 +1418,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_doReferenceAlignment = doreferencealign;
}
void SetGammarCalculationParam(const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR)
{
msra::lattices::SeqGammarCalParam param;
param.amf = amf;
param.lmf = lmf;
param.wp = wp;
param.bMMIfactor = bMMIfactor;
param.sMBRmode = sMBR;
m_gammaCalculator.SetGammarCalculationParams(param);
}
void gettime(unsigned long long &gammatime, unsigned long long &partialtime)
{
gammatime = m_gammatime;
@ -1427,6 +1441,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
shared_ptr<Matrix<ElemType>> m_gammaFromLattice;
double m_frameDropThreshold;
double m_fsSmoothingWeight; // frame-sequence criterion interpolation weight --TODO: can this be done outside?
double m_seqGammarAMF;
double m_seqGammarLMF;
double m_seqGammarWP;
double m_seqGammarbMMIFactor;
double m_seqGammarUsesMBR;
bool m_doReferenceAlignment;
std::vector<shared_ptr<const msra::dbn::latticepair>> m_lattices;
msra::asr::simplesenonehmm m_hmm;

Просмотреть файл

@ -74,7 +74,7 @@
<AdditionalOptions>/bigobj %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>ComputationNetworkLib.lib; Math.lib; kernel32.lib; user32.lib; shell32.lib; SequenceTrainingLib.lib; %(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\; "c:\Program Files\NVIDIA Corporation\GDK\gdk_win7_amd64_release\nvml\lib"</AdditionalLibraryDirectories>
@ -102,7 +102,7 @@
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>

Просмотреть файл

@ -137,7 +137,7 @@ struct GridDim
std::vector<cudaDeviceProp> props(numDevices);
for (int i = 0; i < numDevices; i++)
CUDA_CALL(cudaGetDeviceProperties(&props[i], i));
#if 1 // on Linux, maxGridSize[0] gets reported as 0
#if 0 // on Linux, maxGridSize[0] gets reported as 0
for (int i = 0; i < numDevices; i++)
fprintf(stderr, "%d procs %d warps %d %d %d max grid on %s\n", (int)props[i].multiProcessorCount, (int)props[i].warpSize, (int)props[i].maxGridSize[0], (int)props[i].maxGridSize[1], (int)props[i].maxGridSize[2], props[i].name);
#endif

Просмотреть файл

@ -2246,7 +2246,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
slice.m_computeDevice = m_computeDevice;
slice.m_numRows = m_numRows;
slice.m_numCols = numCols;
slice.m_nz = SecondaryIndexValueAt(startColumn + numCols) - SecondaryIndexValueAt(startColumn);
slice.m_nz = ( numCols == m_numCols ) ? m_nz : SecondaryIndexValueAt(startColumn + numCols) - SecondaryIndexValueAt(startColumn);
slice.m_elemSizeAllocated = m_elemSizeAllocated;
slice.m_totalBufferSizeAllocated = m_totalBufferSizeAllocated;
slice.m_pArray = m_pArray;

Просмотреть файл

@ -87,9 +87,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return (MajorIndexLocation() + (m_format == matrixFormatSparseCSC ? SecondaryIndexValueAt(0) : 0));
}
// TODO: Comment these methods more thoroughly, e.g., why it uses numNZ instead of m_elemSizeAllocated.
size_t MajorIndexCount() const
{
return MajorIndexCount(m_numRows, m_numCols, m_elemSizeAllocated, m_format);
return MajorIndexCount(m_numRows, m_numCols, m_nz, m_format);
}
size_t MajorIndexCount(const size_t numRows, const size_t numCols, const size_t numNZ, const MatrixFormat format) const
{
@ -113,6 +114,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return MajorIndexLocation() + m_numRows;
else
return MajorIndexLocation() + m_elemSizeAllocated + m_sliceViewOffset;
//return MajorIndexLocation() + m_elemSizeAllocated + m_sliceViewOffset;
}
size_t SecondaryIndexCount(const size_t numRows, const size_t numCols, const size_t numNZReserved, const MatrixFormat format) const
{

Просмотреть файл

@ -79,7 +79,7 @@
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>libacml_mp_dll.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\</AdditionalLibraryDirectories>
@ -127,7 +127,7 @@
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>

Просмотреть файл

@ -91,7 +91,7 @@
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>cudart.lib;cublas.lib;cusparse.lib;curand.lib;libacml_mp_dll.lib;%(AdditionalDependencies)</AdditionalDependencies>
<Profile>true</Profile>

Просмотреть файл

@ -1383,17 +1383,62 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
template<class ElemType>
void Matrix<ElemType>::NormalGrad(Matrix<ElemType>& gradients, Matrix<ElemType>& functionValues, const ElemType learnRatePerSample, const ElemType momentum)
void Matrix<ElemType>::NormalGrad(Matrix<ElemType>& gradients,
Matrix<ElemType>& functionValues,
const ElemType learnRatePerSample,
const ElemType momentum,
const bool useNesterovMomentum
)
{
DecideAndMoveToRightDevice(*this, gradients, functionValues);
DISPATCH_MATRIX_ON_FLAG(&gradients,
if (!useNesterovMomentum)
{
DISPATCH_MATRIX_ON_FLAG(&gradients,
nullptr,
ScaleAndAdd((1-momentum) * learnRatePerSample, gradients, momentum, *this); functionValues -= *this,
ScaleAndAdd((1-momentum) * learnRatePerSample, gradients, momentum, *this); functionValues -= *this,
if (momentum != 0) gradients.m_CPUSparseMatrix->NormalGrad(*m_CPUMatrix, momentum); ScaleAndAdd(-learnRatePerSample, gradients, functionValues),
if (momentum != 0) gradients.m_GPUSparseMatrix->NormalGrad(*m_GPUMatrix, momentum); ScaleAndAdd(-learnRatePerSample, gradients, functionValues)
);
}
else
{
DISPATCH_MATRIX_ON_FLAG(&gradients,
nullptr,
{/* CPU dense */
ScaleAndAdd((1 - momentum) * learnRatePerSample, gradients, momentum, *this);
ScaleAndAdd(-momentum, *this, functionValues);
ScaleAndAdd(-(1 - momentum)*learnRatePerSample, gradients, functionValues);
// w_t = w_{t-1} - momentum * v_ {t-1} - (1-momentum)*learnRatePerSampele*gardient,
},
{/* GPU dense */
ScaleAndAdd((1 - momentum) * learnRatePerSample, gradients, momentum, *this);
ScaleAndAdd(-momentum, *this, functionValues);
ScaleAndAdd(-(1 - momentum)*learnRatePerSample, gradients, functionValues);
},
{ /* CPU sparse */
if (momentum != 0)
{
Matrix<ElemType> gradientCache(gradients.GetDeviceId());
gradientCache.SetValue(gradients);
gradients.m_CPUSparseMatrix->NormalGrad(*m_CPUMatrix, momentum);
ScaleAndAdd(-momentum, *this, functionValues);
ScaleAndAdd(-(1 - momentum)*learnRatePerSample, gradientCache, functionValues);
}
},
{ /* GPU sparse */
if (momentum != 0)
{
Matrix<ElemType> gradientCache(gradients.GetDeviceId());
gradientCache.SetValue(gradients);
gradients.m_GPUSparseMatrix->NormalGrad(*m_GPUMatrix, momentum);
ScaleAndAdd(-momentum, *this, functionValues);
ScaleAndAdd(-(1 - momentum)*learnRatePerSample, gradientCache, functionValues);
}
}
);
}
}
//both this and gradients will be changed

Просмотреть файл

@ -164,7 +164,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void ShiftBy(int numShift);
// TODO: all these scalars should be passed as doubles and cast down inside
void NormalGrad(Matrix<ElemType>& gradients, Matrix<ElemType>& functionValues, const ElemType learnRatePerSample, const ElemType momentum);
void NormalGrad(Matrix<ElemType>& gradients, Matrix<ElemType>& functionValues, const ElemType learnRatePerSample, const ElemType momentum, const bool useNAG);
ElemType Adagrad(Matrix<ElemType>& gradients, const bool needAveMultiplier);
void FSAdagrad(size_t mbSize, Matrix<ElemType>& gradients, Matrix<ElemType>& functionValues, const ElemType learnRatePerSample, const ElemType momentum);
ElemType RmsProp(Matrix<ElemType>& gradients, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN, const bool needAveMultiplier);

Просмотреть файл

@ -237,8 +237,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
void TensorView<ElemType>::DoUnaryOpOf(ElemType beta, const TensorView & a, ElemType alpha, ElementWiseOperator op)
{
static int cc = 0; if (cc++ == 0)
fprintf(stderr, "Tensor Op: Op %d: %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(GetShape()).c_str());
//static int cc = 0; if (cc++ == 0)
// fprintf(stderr, "Tensor Op: Op %d: %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(GetShape()).c_str());
// prepare all tensor descriptor information as needed for execution
array<size_t, 2> offsets;
@ -257,8 +257,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
void TensorView<ElemType>::DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, ElementWiseOperator op)
{
static int cc = 0; if (cc++ == 0)
fprintf(stderr, "Tensor Op: Op %d: %s op %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(b.GetShape()).c_str(), string(GetShape()).c_str());
//static int cc = 0; if (cc++ == 0)
// fprintf(stderr, "Tensor Op: Op %d: %s op %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(b.GetShape()).c_str(), string(GetShape()).c_str());
array<size_t, 3> offsets;
array<SmallVector<ptrdiff_t>, 3> regularStrides, reducingStrides;
@ -275,8 +275,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
void TensorView<ElemType>::DoTernaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, const TensorView & c, ElemType alpha, ElementWiseOperator op)
{
static int cc = 0; if (cc++ == 0)
fprintf(stderr, "Tensor Op: Op %d: %s, %s, %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(b.GetShape()).c_str(), string(c.GetShape()).c_str(), string(GetShape()).c_str());
//static int cc = 0; if (cc++ == 0)
// fprintf(stderr, "Tensor Op: Op %d: %s, %s, %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(b.GetShape()).c_str(), string(c.GetShape()).c_str(), string(GetShape()).c_str());
array<size_t, 4> offsets;
array<SmallVector<ptrdiff_t>, 4> regularStrides, reducingStrides;

Просмотреть файл

@ -356,26 +356,39 @@ struct latticefunctionskernels
const size_t te = ts + numframes; // end time of current unit
size_t state1step0to1 = te; // inflection point from state 0 to 1, record in state 1
//size_t state1stepm1to1 = te;
size_t state2step0to1 = te; // inflection point from state 0 to 1, record in state 2
//size_t state2stepm1to1 = te; // inflection point from state 0 to 1, record in state 2
size_t state2step1to2 = te; // inflection point from state 1 to 2, record in state 2
size_t state2step0to2 = te;
//now we only support transition from -1 to 0 or 2 for sil
float pathscore0 = fwscore ; // log pp in state 0
float pathscore1 = LOGZERO; // log pp in state 1
float pathscore2 = LOGZERO; // log pp in state 2
if(isSil)
pathscore2 = fwscore;
float pathscore0 = fwscore; // log pp in state 0
float pathscore1 = fwscore; // log pp in state 1
float pathscore2 = fwscore; // log pp in state 2
// first frame
if (ts != te) // for t = ts, initialization
{
if (isSil) //for sil, -1 to 2 and -1 to 0 is permitted
/* if (isSil) //for sil, -1 to 2 and -1 to 0 is permitted
{
pathscore0 += getlogtransp(transP,-1,0) + logLLs(senoneid0,ts);
pathscore2 += getlogtransp(transP,-1,2) + logLLs(senoneid2,ts);
}
else //for others, only -1 to 0 is permitted
pathscore0 += logLLs(senoneid0,ts); // Note: no need to incorporate LLs for state [1] and [2] because the path log LLs are LOGZERO anyway
else //for others, only -1 to 0 is permitted
{
pathscore0 += getlogtransp(transP, -1, 0) + logLLs(senoneid0, ts);
pathscore1 += getlogtransp(transP, -1, 1) + logLLs(senoneid1, ts);
}*/
pathscore2 += getlogtransp(transP, -1, 2) + logLLs(senoneid2, ts);
pathscore1 += getlogtransp(transP, -1, 1) + logLLs(senoneid1, ts);
//state1stepm1to1 = ts;
pathscore0 += getlogtransp(transP, -1, 0) + logLLs(senoneid0, ts);
}
@ -399,17 +412,22 @@ struct latticefunctionskernels
{
pathscore2 = pathscore12;
state2step0to1 = state1step0to1; // record the inflection point
//state2stepm1to1 = state1stepm1to1;
state2step1to2 = t; // record the inflection point
state2step0to2 = te;
if (isSil)
backptrmatrix (2, t-ts-1) = 1;
}
if (isSil) // only silence have path from 0 to 2
//if (isSil) // only silence have path from 0 to 2
{
const float pathscore02 = pathscore0 + getlogtransp(transP,0,2); // log pp from state 0 to 2
if (pathscore02 >= pathscore2) // if state 0->2
{
pathscore2 = pathscore02;
backptrmatrix (2, t-ts-1) = 0;
if (isSil)
backptrmatrix (2, t-ts-1) = 0;
state2step0to2 = t;
state2step1to2 = te;
}
}
@ -422,9 +440,11 @@ struct latticefunctionskernels
{
pathscore1 = pathscore01;
state1step0to1 = t; // record the inflection point
//state1stepm1to1 = te;
if (isSil)
backptrmatrix (1, t-ts-1) = 0;
}
if (isSil) // only silence have path from 2 to 1
{
const float pathscore21 = pathscore2last + getlogtransp(transP,2,1);
@ -495,19 +515,35 @@ struct latticefunctionskernels
if (!isSil)
{
state2step0to1 += alignindex - ts; // convert to align measure
state2step1to2 += alignindex - ts;
for (size_t t = alignindex; t < alignindex + numframes; t++) // set the final alignment
{
size_t senoneid;
if (t < state2step0to1) // in state 0
senoneid = senoneid0;
else if(t < state2step1to2) // in state 1
senoneid = senoneid1;
else // in state 2
senoneid = senoneid2;
alignresult[t] = (unsigned short) senoneid;
}
if (state2step0to2 < te) //from 0 to 2
{
state2step0to2 += alignindex - ts;
for (size_t t = alignindex; t < alignindex + numframes; t++) // set the final alignment
{
size_t senoneid;
if (t < state2step0to2) // in state 0
senoneid = senoneid0;
else // in state 2
senoneid = senoneid2;
alignresult[t] = (unsigned short)senoneid;
}
}
else //from 1 to 2
{
state2step0to1 += alignindex - ts; // convert to align measure
state2step1to2 += alignindex - ts;
for (size_t t = alignindex; t < alignindex + numframes; t++) // set the final alignment
{
size_t senoneid;
if (state2step0to1 <alignindex - ts + te && t < state2step0to1)
senoneid = senoneid0;
else if(t < state2step1to2) // in state 1
senoneid = senoneid1;
else // in state 2
senoneid = senoneid2;
alignresult[t] = (unsigned short) senoneid;
}
}
}
else // for silence
{

Просмотреть файл

@ -70,7 +70,7 @@
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration)</AdditionalLibraryDirectories>
@ -91,7 +91,7 @@
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>

Просмотреть файл

@ -72,7 +72,7 @@
<AdditionalOptions>/bigobj %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration)</AdditionalLibraryDirectories>
@ -93,7 +93,7 @@
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>

Просмотреть файл

@ -100,7 +100,7 @@
<UseFullPaths>true</UseFullPaths>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalLibraryDirectories>$(VCInstallDir)UnitTest\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
</Link>
@ -115,7 +115,7 @@
<UseFullPaths>true</UseFullPaths>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalLibraryDirectories>$(VCInstallDir)UnitTest\lib;%(AdditionalLibraryDirectories)</AdditionalLibraryDirectories>
<AdditionalDependencies>ucireader.lib;Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
@ -133,7 +133,7 @@
<UseFullPaths>true</UseFullPaths>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
@ -152,7 +152,7 @@
<UseFullPaths>true</UseFullPaths>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>

Просмотреть файл

@ -100,6 +100,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
vector<wstring> scriptpaths;
vector<wstring> RootPathInScripts;
wstring RootPathInLatticeTocs;
vector<wstring> mlfpaths;
vector<vector<wstring>>mlfpathsmulti;
size_t firstfilesonly = SIZE_MAX; // set to a lower value for testing
@ -263,7 +264,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
expand_wildcards(thisLattice(L"numLatTocFile"), paths);
latticetocs.first.insert(latticetocs.first.end(), paths.begin(), paths.end());
}
RootPathInLatticeTocs =(wstring) thisLattice(L"prefixPathInToc",L"");
}
//get HMM related file names
@ -448,7 +449,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (!_wcsicmp(readMethod.c_str(), L"blockRandomize"))
{
// construct all the parameters we don't need, but need to be passed to the constructor...
m_lattices.reset(new msra::dbn::latticesource(latticetocs, m_hset.getsymmap()));
m_lattices.reset(new msra::dbn::latticesource(latticetocs, m_hset.getsymmap(), RootPathInLatticeTocs));
m_lattices->setverbosity(m_verbosity);
// now get the frame source. This has better randomization and doesn't create temp files
m_frameSource.reset(new msra::dbn::minibatchutterancesourcemulti(infilesmulti, labelsmulti, m_featDims, m_labelDims, numContextLeft, numContextRight, randomize, *m_lattices, m_latticeMap, m_frameMode));
@ -941,6 +944,23 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
if (!skip)
{
// a stopgap
if (m_numFramesToProcess[i] > 0 && m_latticeBufferMultiUtt[i] && m_latticeBufferMultiUtt[i]->getnumframes() != m_numFramesToProcess[i])
{
// BUGBUG: we just found that (due to some bugs yet to be tracked down),
// the filled number of frames is inconsistent with the number frames in lattices (though it rarely occurs)
// This is just a stopgap, to be removed after the bugs are found and fixed
bool needRenew = true;
while (needRenew)
{
size_t framenum = m_numFramesToProcess[i];
fprintf(stderr, "WARNING: mismatched number of frames filled in the reader: %d in data vs %d in lattices. Ignoring this utterance %ls\n",
(int)framenum, (int)m_latticeBufferMultiUtt[i]->getnumframes(), m_latticeBufferMultiUtt[i]->getkey().c_str());
ReNewBufferForMultiIO(i);
needRenew = m_numFramesToProcess[i] > 0 && m_latticeBufferMultiUtt[i] && m_latticeBufferMultiUtt[i]->getnumframes() != m_numFramesToProcess[i];
}
}
m_numValidFrames[i] = m_numFramesToProcess[i];
if (m_numValidFrames[i] > 0)
{
@ -972,49 +992,50 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_extraNumSeqs = 0;
if (!m_frameMode)
{
// insert extra utterances to parallel sequences that have enough space left
// As long as there is a gap at the end of any parallel sequence that is large enough for another utterance, fill it in.
size_t nextMinibatchUttnum = 0;
bool inserted;
// The next utterances have already been prepared under parallel-sequence indices [i], in prep for the next MB.
// For each, we will go through all parallel sequences [j] to see whether the entry currently held for the next [i] fits into [j].
for (size_t i = 0; i < m_numSeqsPerMB; i++)
for (size_t src = 0; src < m_numSeqsPerMB; )
{
while (nextMinibatchUttnum <= i)
size_t framenum = m_numFramesToProcess[src];
if (framenum == 0)
{
size_t framenum = m_numFramesToProcess[i];
inserted = false;
if (framenum > 0) // non-empty entry: see were it fits
{
// greedily search for a parallel sequence with enough space at the end to insert this utterance
for (size_t j = 0; j < m_numSeqsPerMB; j++)
{
if (framenum + m_numValidFrames[j] < m_mbNumTimeSteps)
{
// enough space: insert it as parallel sequence [j] (instead of [i] in the next MB)
m_extraSeqsPerMB.push_back(j);
if (m_latticeBufferMultiUtt[i] != nullptr)
{
m_extraLatticeBufferMultiUtt.push_back(m_latticeBufferMultiUtt[i]);
m_extraLabelsIDBufferMultiUtt.push_back(m_labelsIDBufferMultiUtt[i]);
m_extraPhoneboundaryIDBufferMultiUtt.push_back(m_phoneboundaryIDBufferMultiUtt[i]);
}
fillOneUttDataforParallelmode(matrices, m_numValidFrames[j], framenum, j, i);
m_pMBLayout->AddSequence(NEW_SEQUENCE_ID, j, m_numValidFrames[j], m_numValidFrames[j] + framenum);
src++;
continue;
}
if (m_latticeBufferMultiUtt[src]!=nullptr && m_latticeBufferMultiUtt[src]->getnumframes()!=framenum)
{
// BUGBUG: we just found that (due to some bugs yet to be tracked down),
// the filled number of frames is inconsistent with the number frames in lattices (though it rarely occurs)
// This is just a stopgap, to be removed after the bugs are found and fixed
fprintf(stderr, "WARNING: mismatched number of frames filled in the reader: %d in data vs %d in lattices. Ignoring this utterance %ls\n",
(int)framenum, (int)m_latticeBufferMultiUtt[src]->getnumframes(), m_latticeBufferMultiUtt[src]->getkey().c_str());
src++;
continue;
}
// consume it
ReNewBufferForMultiIO(i); // replace current [i] with a new one; then try again with this new one at [i]
m_numValidFrames[j] += framenum;
m_extraNumSeqs++;
inserted = true;
break;
}
bool slotFound = false;
for (size_t des = 0; des < m_numSeqsPerMB; des++) // try to found a slot
{
if (framenum + m_numValidFrames[des] < m_mbNumTimeSteps)
{ // found !
m_extraSeqsPerMB.push_back(des);
if (m_latticeBufferMultiUtt[src] != nullptr)
{
m_extraLatticeBufferMultiUtt.push_back(m_latticeBufferMultiUtt[src]);
m_extraLabelsIDBufferMultiUtt.push_back(m_labelsIDBufferMultiUtt[src]);
m_extraPhoneboundaryIDBufferMultiUtt.push_back(m_phoneboundaryIDBufferMultiUtt[src]);
}
fillOneUttDataforParallelmode(matrices, m_numValidFrames[des], framenum, des, src);
m_pMBLayout->AddSequence(NEW_SEQUENCE_ID, des, m_numValidFrames[des], m_numValidFrames[des] + framenum);
ReNewBufferForMultiIO(src);
m_numValidFrames[des] += framenum;
m_extraNumSeqs++;
slotFound = true;
break;
}
if (!inserted)
{
nextMinibatchUttnum++; // didn't fit anywhere: done with entry [i]
}
}
if (!slotFound)
{
src++; // done with this source; try next source;
}
}

Просмотреть файл

@ -32,6 +32,9 @@ private:
intargvector m_numSeqsPerMBForAllEpochs;
size_t m_numSeqsPerMB; // requested number of parallel sequences
size_t m_mbNumTimeSteps; // number of time steps to fill/filled (note: for frame randomization, this the #frames, and not 1 as later reported)
size_t m_mbMaxNumTimeSteps; // max time steps we take in a MB layout; any setence longer than this max will be discarded (and a warning will be issued )
// this is used to prevent CUDA out-of memory errors
vector<size_t> m_numFramesToProcess; // [seq index] number of frames available (left to return) in each parallel sequence
vector<size_t> m_switchFrame; /// TODO: something like the position where a new sequence starts; still supported?
vector<size_t> m_numValidFrames; // [seq index] valid #frames in each parallel sequence. Frames (s, t) with t >= m_numValidFrames[s] are NoInput.

Просмотреть файл

@ -69,7 +69,7 @@
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
@ -87,7 +87,7 @@
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>

Просмотреть файл

@ -75,7 +75,7 @@
<OpenMPSupport>true</OpenMPSupport>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Math.lib;$(OpenCVLib);%(AdditionalDependencies)</AdditionalDependencies>
</Link>

Просмотреть файл

@ -71,7 +71,7 @@
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration)</AdditionalLibraryDirectories>
@ -92,7 +92,7 @@
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>

Просмотреть файл

@ -71,7 +71,7 @@
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration)</AdditionalLibraryDirectories>
@ -92,7 +92,7 @@
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>

Просмотреть файл

@ -72,7 +72,7 @@
<AdditionalOptions>/bigobj %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration)</AdditionalLibraryDirectories>
@ -93,7 +93,7 @@
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>

Просмотреть файл

@ -72,7 +72,7 @@
<AdditionalOptions>/bigobj %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration)</AdditionalLibraryDirectories>
@ -93,7 +93,7 @@
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>

Просмотреть файл

@ -70,7 +70,7 @@
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration)</AdditionalLibraryDirectories>
@ -91,7 +91,7 @@
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>

Просмотреть файл

@ -91,7 +91,7 @@
<SDLCheck>true</SDLCheck>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>..\..\Source\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration)</AdditionalLibraryDirectories>
@ -107,7 +107,7 @@
<AdditionalIncludeDirectories>..\..\common\include;..\..\Source\Math</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\;..\..\Source\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration)</AdditionalLibraryDirectories>
@ -124,7 +124,7 @@
<SDLCheck>true</SDLCheck>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
@ -144,7 +144,7 @@
<AdditionalIncludeDirectories>..\..\common\include;..\..\Source\Math</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>

Просмотреть файл

@ -63,6 +63,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
using SGDBase::m_L2RegWeight;
using SGDBase::m_L1RegWeight;
using SGDBase::m_needAveMultiplier;
using SGDBase::m_useNesterovMomentum;
using SGDBase::m_traceLevel;
using SGDBase::m_numMBsToShowResult;
using SGDBase::m_gradientCheckSigDigit;
@ -392,8 +393,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
if (m_loadBestModel)
{
encoderNet->ReloadPersistableParameters<ElemType>(GetEncoderModelNameForEpoch(i - 1));
decoderNet->ReloadPersistableParameters<ElemType>(GetDecoderModelNameForEpoch(i - 1));
encoderNet->RereadPersistableParameters<ElemType>(GetEncoderModelNameForEpoch(i - 1));
decoderNet->RereadPersistableParameters<ElemType>(GetDecoderModelNameForEpoch(i - 1));
size_t dummyMinibatchSize = 0;
this->LoadCheckPointInfo(i - 1,
@ -721,7 +722,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//persist model and check-point info
for (size_t k = 0; k < iNumNetworks; k++)
{
nets[k]->ReloadPersistableParameters<ElemType>(GetModelNameForEpoch(i, false, msra::strfun::wstrprintf(L".%d", k)));
nets[k]->RereadPersistableParameters<ElemType>(GetModelNameForEpoch(i, false, msra::strfun::wstrprintf(L".%d", k)));
nets[k]->ResetEvalTimeStamps();
}
@ -930,7 +931,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
Matrix<ElemType>& smoothedGradient = (*smoothedGradientIter);
UpdateWeights(node, smoothedGradient, learnRatePerSample, GetMomentumPerSample(epochNumber/*BUGBUG workaround:*/, dataReader[0]->GetNumParallelSequences()), actualMBSize, m_L2RegWeight, m_L1RegWeight, m_needAveMultiplier);
UpdateWeights(node, smoothedGradient, learnRatePerSample, GetMomentumPerSample(epochNumber/*BUGBUG workaround:*/, dataReader[0]->GetNumParallelSequences()), actualMBSize, m_L2RegWeight, m_L1RegWeight, m_needAveMultiplier, m_useNesterovMomentum);
}
}
}

Просмотреть файл

@ -310,7 +310,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// likewise for sequence training parameters
if (isSequenceTrainingCriterion)
{
ComputationNetwork::SetSeqParam<ElemType>(net, criterionNodes[0], m_hSmoothingWeight, m_frameDropThresh, m_doReferenceAlign);
ComputationNetwork::SetSeqParam<ElemType>(net, criterionNodes[0], m_hSmoothingWeight, m_frameDropThresh, m_doReferenceAlign,
m_seqGammarCalcAMF, m_seqGammarCalcLMF, m_seqGammarCalcWP, m_seqGammarCalcbMMIFactor, m_seqGammarCalcUsesMBR );
}
// --- MAIN EPOCH LOOP
@ -519,6 +520,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if ((m_parallelizationMethod == ParallelizationMethod::ModelAveragingSGD) && (g_mpi->NumNodesInUse() > 1))
{
g_mpi->Bcast(&epochCriterion, 1, g_mpi->MainNodeRank());
g_mpi->Bcast(&lrControlCriterion, 1, g_mpi->MainNodeRank());
}
bool loadedPrevModel = false;
@ -543,7 +545,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
auto bestModelPath = GetModelNameForEpoch(i - m_learnRateAdjustInterval);
fprintf(stderr, "Loading previous model with best training-criterion value: %ls.\n", bestModelPath.c_str());
net->ReloadPersistableParameters<ElemType>(bestModelPath);
net->RereadPersistableParameters<ElemType>(bestModelPath);
LoadCheckPointInfo(i - m_learnRateAdjustInterval,
/*out*/ totalSamplesSeen,
/*out*/ learnRatePerSample,
@ -771,13 +773,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Sub-minibatching is used if a single minibatch is too large to fit into GPU RAM.
DataReaderHelpers::SubminibatchDispatcher<ElemType> smbDispatcher;
size_t numSubminibatchesNeeded = 0;
if (m_maxSamplesInRAM < SIZE_MAX) // user-specified maximum number of samples that fit into GPU RAM; or 0 if not enabled
if (m_maxSamplesInRAM < SIZE_MAX || m_numSubminiBatches > 1) // user-specified maximum number of samples that fit into GPU RAM; or 0 if not enabled
{
// into how many pieces would we need to break the minibatch?
// TODO: The following calculation relies on the ill-devised definition of "minibatch" of the current truncated BPTT implementation. Adapt this once fixed.
size_t numParallelSequences = trainSetDataReader->GetNumParallelSequences();
size_t estimatedMBSize = tunedMBSize * numParallelSequences;
numSubminibatchesNeeded = (size_t)std::ceil((float)estimatedMBSize / m_maxSamplesInRAM);
if (m_maxSamplesInRAM < SIZE_MAX)
{
// into how many pieces would we need to break the minibatch?
// TODO: The following calculation relies on the ill-devised definition of "minibatch" of the current truncated BPTT implementation. Adapt this once fixed.
size_t numParallelSequences = trainSetDataReader->GetNumParallelSequences();
size_t estimatedMBSize = tunedMBSize * numParallelSequences;
numSubminibatchesNeeded = (size_t)std::ceil((float)estimatedMBSize / m_maxSamplesInRAM);
}
if (m_numSubminiBatches > 1)
{
numSubminibatchesNeeded = m_numSubminiBatches;
}
}
// this is non-trivial, we need a manager object to handle this
if (numSubminibatchesNeeded > 1)
@ -807,7 +816,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
if (numSubminibatchesNeeded > 1)
{
fprintf(stderr, ", with maximum %d samples in RAM", (int)m_maxSamplesInRAM);
if (m_maxSamplesInRAM < SIZE_MAX)
fprintf(stderr, ", with maximum %d samples in RAM", (int)m_maxSamplesInRAM);
else
fprintf(stderr, ", with %d subminibatch", (int)numSubminibatchesNeeded);
}
fprintf(stderr, ".\n");
@ -998,7 +1010,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
UpdateWeights(node, smoothedGradient, learnRatePerSample,
GetMomentumPerSample(epochNumber/*BUGBUG workaround:*/, net->GetMBLayoutPtr()->GetNumParallelSequences()), aggregateNumSamples,
m_L2RegWeight, m_L1RegWeight,
m_needAveMultiplier);
m_needAveMultiplier, m_useNesterovMomentum);
#ifdef _DEBUG
if (dynamic_pointer_cast<ComputationNode<ElemType>>(node)->Value().HasNan("TrainOneEpoch/UpdateWeights(): "))
LogicError("%ls %ls operation has NaNs in functionValues after parameter update.", node->NodeName().c_str(), node->OperationName().c_str());
@ -1438,7 +1450,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
int baseModelEpoch = epochNumber - 1;
net->ReloadPersistableParameters<ElemType>(GetModelNameForEpoch(baseModelEpoch));
net->RereadPersistableParameters<ElemType>(GetModelNameForEpoch(baseModelEpoch));
double learnRate = learnRatePerSample;
size_t dummyMinibatchSize = 0;
@ -1598,7 +1610,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
int baseModelEpoch = epochNumber - 1;
net->ReloadPersistableParameters<ElemType>(GetModelNameForEpoch(baseModelEpoch));
net->RereadPersistableParameters<ElemType>(GetModelNameForEpoch(baseModelEpoch));
double dummyLearnRate;
double dummtPrevCriterion;
@ -2029,7 +2041,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t actualMBSize,
const double L2RegWeight,
const double L1RegWeight,
const bool needAveMultiplier)
const bool needAveMultiplier,
const bool useNesterovMomentum
)
{
// we use simple linear (instead of log linear) scaling here
const double momentum = MomentumPerMB(momentumPerSample, actualMBSize);
@ -2070,7 +2084,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (adpType == GradientsUpdateType::None)
{
smoothedGradient.NormalGrad(gradientValues, functionValues,
(ElemType)learnRatePerSample, (ElemType)momentum);
(ElemType)learnRatePerSample, (ElemType)momentum, useNesterovMomentum);
}
else if (adpType == GradientsUpdateType::AdaGrad ||
(adpType == GradientsUpdateType::RmsProp && gradientValues.GetMatrixType() == MatrixType::SPARSE) ||
@ -2120,7 +2134,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
const double momentumPerSample,
const size_t actualMBSize,
const double L2RegWeight, const double L1RegWeight,
const bool needAveMultiplier) const
const bool needAveMultiplier,
const bool useNesterovMomentum
) const
{
#if DUMPOUTPUT
fprintf(stderr, "Update_%ls\n", node->NodeName().c_str());
@ -2131,7 +2147,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
UpdateWeightsS(this, dynamic_pointer_cast<ComputationNode<ElemType>>(node)->Value(), dynamic_pointer_cast<ComputationNode<ElemType>>(node)->Gradient(),
smoothedGradient, learnRatePerSample, momentumPerSample,
actualMBSize, L2RegWeight, L1RegWeight,
needAveMultiplier);
needAveMultiplier, m_useNesterovMomentum);
node->BumpEvalTimeStamp();
}
@ -2501,6 +2517,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_mbSize = configSGD(L"minibatchSize", ConfigRecordType::Array(intargvector(vector<int>{ 256 })));
m_truncated = configSGD(L"truncated", false);
m_maxSamplesInRAM = configSGD(L"maxSamplesInRAM", (size_t)SIZE_MAX);
m_numSubminiBatches = configSGD(L"numSubminibatches", (size_t)1);
// the number of samples in each epoch (0 means, use all the samples in each epoch).
m_epochSize = configSGD(L"epochSize", (size_t)0);
@ -2520,6 +2537,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
floatargvector momentumPerMB = configSGD(L"momentumPerMB", ConfigRecordType::Array(floatargvector()));
floatargvector momentumPerSample = configSGD(L"momentumPerSample", ConfigRecordType::Array(floatargvector()));
floatargvector momentumAsTimeConstant = configSGD(L"momentumAsTimeConstant", ConfigRecordType::Array(floatargvector()));
bool useNesterovMomentum = configSGD(L"useNAG", false);
m_maxTempMemSizeInSamplesForCNN = configSGD(L"maxTempMemSizeInSamplesForCNN", (size_t)0);
@ -2534,6 +2553,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_hSmoothingWeight = configSGD(L"hSmoothingWeight", 0.95);
m_frameDropThresh = configSGD(L"frameDropThresh", 1e-10);
m_doReferenceAlign = configSGD(L"doReferenceAlign", false);
m_seqGammarCalcUsesMBR = configSGD(L"seqGammarUsesMBR", false);
m_seqGammarCalcAMF = configSGD(L"seqGammarAMF", 14.0);
m_seqGammarCalcLMF = configSGD(L"seqGammarLMF", 14.0);
m_seqGammarCalcbMMIFactor = configSGD(L"seqGammarBMMIFactor", 0.0);
m_seqGammarCalcWP = configSGD(L"seqGammarWordPen", 0.0);
m_dropoutRates = configSGD(L"dropoutRate", ConfigRecordType::Array(floatargvector(vector<float>{ 0.0f })));
@ -2639,6 +2663,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_momentumParam = floatargvector(L"0.9");
m_momentumSpecifiedForMBSize = m_mbSize;
}
m_useNesterovMomentum = useNesterovMomentum;
for (int i = 0; i < m_momentumParam.size(); i++)
{
if (m_momentumParam[i] >= 1.0 || m_momentumParam[i] < 0.0)

Просмотреть файл

@ -111,6 +111,7 @@ protected:
intargvector m_learningRatesSpecifiedForMBSize; // 1 for per sample, m_mbSize[] for per MB
floatargvector m_momentumParam;
intargvector m_momentumSpecifiedForMBSize;
bool m_useNesterovMomentum;
// Determine the MB size used for mapping a given learning-rate or momentum parameter to a per-sample value.
// MB size is the number of samples across all time steps and parallel sequences.
@ -157,7 +158,11 @@ protected:
// To mitigate this issue, we adopt a sub-minibatch implementation, where
// each m_mbSize[epoch] is divided into a few sub-minibatches, each of which is no larger than m_maxSamplesInRAM;
// a forward-backward pass is performed for each sub-minibatch, and a model update is performed after each full minibatch
size_t m_numSubminiBatches;
// Alternative way to specify how to split minibatches into sub-minibatches.
// The default is 1, which means no sub-minibatching is used.
// If m_maxSamplesInRAM == SIZE_MAX (i.e. the user did not specify that option) and m_numSubminiBatches > 1,
// each minibatch is divided into m_numSubminiBatches sub-minibatches.
// the number of samples in each epoch (0 means, use all the samples in each epoch).
size_t m_epochSize;
@ -245,6 +250,11 @@ protected:
double m_hSmoothingWeight;
double m_frameDropThresh;
bool m_doReferenceAlign;
double m_seqGammarCalcAMF;
double m_seqGammarCalcLMF;
double m_seqGammarCalcWP;
double m_seqGammarCalcbMMIFactor;
bool m_seqGammarCalcUsesMBR;
};
template<class ElemType> class IDistGradAggregator;
@ -436,7 +446,9 @@ public:
size_t actualMBSize,
const double L2RegWeight,
const double L1RegWeight,
const bool needAveMultiplier);
const bool needAveMultiplier,
const bool useNesterovMomentum
);
protected:
// UpdateWeights - update the weights in
@ -446,7 +458,8 @@ protected:
const double momentumPerSample,
const size_t actualMBSize,
const double L2RegWeight, const double L1RegWeight,
const bool needAveMultiplier) const;
const bool needAveMultiplier,
const bool useNesterovMomentum) const;
void ClipGradient(Matrix<ElemType>& gradient, const size_t actualMBSize) const;

Просмотреть файл

@ -44,7 +44,7 @@
<SDLCheck>true</SDLCheck>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>

Просмотреть файл

@ -11,6 +11,23 @@
#pragma warning (disable: 4127) // conditional expression is constant
namespace msra { namespace lattices {
// Bundle of user-configurable parameters for sequence-training gamma
// (posterior) calculation. Defaults mirror the historical hard-coded values.
struct SeqGammarCalParam
{
    double amf = 14.0;        // acoustic-model scaling factor
    double lmf = 14.0;        // language-model scaling factor
    double wp = 0.0;          // word-insertion penalty
    double bMMIfactor = 0.0;  // boosted-MMI factor (0 disables boosting)
    bool sMBRmode = false;    // true selects sMBR-style criterion
};
template<class ElemType>
class GammaCalculation
{
@ -19,9 +36,9 @@ namespace msra { namespace lattices {
// Default constructor: cpumode disabled, calculator not yet initialized,
// and baseline LM/AM scaling factors set.
GammaCalculation() : cpumode(false)
{
    initialmark = false;
    // NOTE(review): lmf and amf are each assigned twice below (14.0f, then
    // 7.0f). This looks like merge/diff residue -- only the second assignment
    // of each pair takes effect. Confirm which default is intended.
    lmf = 14.0f; // Note that 9 was best for Fisher --these should best be configurable
    lmf = 7.0f; // Note that 9 was best for Fisher --these should best be configurable
    wp = 0.0f;   // word-insertion penalty off by default
    amf = 14.0f;
    amf = 7.0f;
    boostmmifactor = 0.0f; // boosted-MMI disabled by default
    seqsMBRmode = false;   // sMBR-style sequence training off by default
}
@ -30,6 +47,9 @@ namespace msra { namespace lattices {
}
//========================================
// Sec. 1 init functions
//========================================
void init(msra::asr::simplesenonehmm hset, int DeviceId)
{
m_deviceid = DeviceId;
@ -47,7 +67,21 @@ namespace msra { namespace lattices {
}
}
//========================================
// Sec. 2 set functions
//========================================
// Copy user-specified sequence-training settings from the config-carrying
// parameter struct into this calculator's state. The double-valued config
// fields are narrowed to the float members used during computation.
void SetGammarCalculationParams(const SeqGammarCalParam& gammarParam)
{
    amf = static_cast<float>(gammarParam.amf);
    lmf = static_cast<float>(gammarParam.lmf);
    wp = static_cast<float>(gammarParam.wp);
    boostmmifactor = static_cast<float>(gammarParam.bMMIfactor);
    seqsMBRmode = gammarParam.sMBRmode;
}
//========================================
// Sec. 3 calculation functions
//========================================
void calgammaformb( Microsoft::MSR::CNTK::Matrix<ElemType>& functionValues,
std::vector<shared_ptr<const msra::dbn::latticepair>> &lattices,
const Microsoft::MSR::CNTK::Matrix<ElemType>& loglikelihood,

Просмотреть файл

@ -442,6 +442,7 @@ template<typename FLOAT> static bool islogzero (FLOAT v) { return v < LOGZERO/2;
LogicError("invalid backpointer resulting in state index out of range");
int bp = (int) backpointers(j,t); // save the backpointer before overwriting it (gammas and backpointers are aliases of each other)
//thisedgealignmentsj[t] = (unsigned short)hmm.getsenoneid(j - js);
if (!returnsenoneids) // return binary gammas (for MMI; this mode is compatible with softalignmode)
for (size_t i = js; i < je; i++)
loggammas(i,t) = ((int) i == j) ? 0.0f : LOGZERO;

Просмотреть файл

@ -743,8 +743,8 @@ namespace msra { namespace lattices {
double totalfwscore = 0.0f;
if (!parallelstate->emulation)
{
fprintf(stderr, "parallelforwardbackwardlattice: %d launches for forward, %d launches for backward\n", (int)batchsizeforward.size(), (int)batchsizebackward.size());
if (verbosity>=2)
fprintf(stderr, "parallelforwardbackwardlattice: %d launches for forward, %d launches for backward\n", (int)batchsizeforward.size(), (int)batchsizebackward.size());
const bool allocateframescorrect = (returnEframescorrect || boostingfactor != 0.0f);
const bool copyuids = (returnEframescorrect || boostingfactor != 0.0f);

Просмотреть файл

@ -67,7 +67,7 @@ speechTrain = [
// LSTM cell
# TODO: This is temporary test code for the new ShiftNode (until we switch PastValue() itself over)
PastValueShift(dimDummy, input) = Shift(input, /*fromOffsets=*/-1, /*boundaryValue=*/Constant(0.1), dim=-1, numSteps=1, insertedDim=2)
PastValueShift(dimDummy, input) = Shift(input, /*fromOffsets=*/-1, /*boundaryValue=*/Constant(0.1), dim=-1)
PastValue1 = PastValue
#PastValue1 = PastValueShift
dh = PastValue1(outputDim, output); // hidden state(t-1)

Просмотреть файл

@ -56,6 +56,9 @@ makebuildinfo()
if [ ! -z "$CUB_PATH" ]; then
printf "#define _CUB_PATH_ \"%s\"\n" $CUB_PATH >> $target
fi
if [ ! -z "$CUDNN_PATH" ]; then
printf "#define _CUDNN_PATH_ \"%s\"\n" $CUDNN_PATH >> $target
fi
printf "#define _BUILDTYPE_ \"%s\"\n" $BUILDTYPE >> $target
printf "#endif\n" >> $target
}