diff --git a/Makefile b/Makefile index f1f95c9b3..a730f41dc 100644 --- a/Makefile +++ b/Makefile @@ -162,7 +162,7 @@ ifeq ("$(BUILDTYPE)","debug") CXXFLAGS += -g LDFLAGS += -rdynamic CPPFLAGS += -D_DEBUG - CUFLAGS += -O0 -use_fast_math -lineinfo $(GENCODE_FLAGS) + CUFLAGS += -O0 -g -use_fast_math -lineinfo $(GENCODE_FLAGS) endif ifeq ("$(BUILDTYPE)","release") diff --git a/Source/CNTK/BrainScript/ExperimentalNetworkBuilder.cpp b/Source/CNTK/BrainScript/ExperimentalNetworkBuilder.cpp index a0a3bf769..2a707b831 100644 --- a/Source/CNTK/BrainScript/ExperimentalNetworkBuilder.cpp +++ b/Source/CNTK/BrainScript/ExperimentalNetworkBuilder.cpp @@ -47,7 +47,7 @@ using namespace std; L"PastValue(dims, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'PastValue' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]\n" L"FutureValue(dims, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'FutureValue' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]\n" // TODO: ^^ DelayedValues no longer need to know their dimension. That is inferred in Validation. - L"Shift(input, fromOffset, boundaryValue, boundaryMode=-1/*context*/, dim=-1, numSteps=1, insertedDim=0, tag='') = new ComputationNode [ operation = 'Shift' ; inputs = (input : boundaryValue) /*plus the function args*/ ]\n" + L"Shift(input, fromOffset, boundaryValue, boundaryMode=-1/*context*/, dim=-1, tag='') = new ComputationNode [ operation = 'Shift' ; inputs = (input : boundaryValue) /*plus the function args*/ ]\n" L"RowSlice(startIndex, numRows, input, needGradient = false, tag='') = new ComputationNode [ operation = 'RowSlice' ; inputs = input /*plus the function args*/ ]\n" L"RowRepeat(input, numRepeats, needGradient = false, tag='') = new ComputationNode [ operation = 'RowRepeat' ; inputs = input /*plus the function args*/ ]\n" L"RowStack(inputs, tag='') = new ComputationNode [ operation = 'RowStack' /*plus the function args*/ ]\n" diff --git a/Source/CNTK/CNTK.cpp b/Source/CNTK/CNTK.cpp index e7753e2c7..600f0ae06 100644 --- a/Source/CNTK/CNTK.cpp +++ b/Source/CNTK/CNTK.cpp @@ -345,6 +345,9 @@ void PrintBuiltInfo() #ifdef _CUB_PATH_ fprintf(stderr, "\t\tCUB_PATH: %s\n", _CUB_PATH_); #endif +#ifdef _CUDNN_PATH_ + fprintf(stderr, "\t\tCUDNN_PATH: %s\n", _CUDNN_PATH_); +#endif #ifdef _GIT_EXIST fprintf(stderr, "\t\tBuild Branch: %s\n", _BUILDBRANCH_); fprintf(stderr, "\t\tBuild SHA1: %s\n", _BUILDSHA1_); @@ -568,7 +571,7 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which i RedirectStdErr(logpath); } - PrintBuiltInfo(); + PrintBuiltInfo(); // this one goes to log file std::string timestamp = TimeDateStamp(); // dump config info @@ -643,10 +646,11 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which i // main wrapper that catches C++ exceptions and prints them // --------------------------------------------------------------------------- -int wmain1(int argc, wchar_t* argv[]) // called from wmain which is a wrapper that catches & repots Win32 exceptions +int wmain1(int argc, wchar_t* argv[]) // called from wmain which is a wrapper that catches & reports Win32 exceptions { try { + PrintBuiltInfo(); // print build info directly in case the user provides no arguments (convenient for checking build type) if (argc <= 1) InvalidArgument("No command-line argument given."); // detect legacy CNTK configuration @@ -684,6 +688,8 @@ void 
terminate_this() { fprintf(stderr, "terminate_this: aborting\n"), fflush(stderr); abort(); } int wmain(int argc, wchar_t* argv[]) // wmain wrapper that reports Win32 exceptions { set_terminate (terminate_this); // insert a termination handler to ensure stderr gets flushed before actually terminating + _set_error_mode(_OUT_TO_STDERR); // make sure there are no CRT prompts when CNTK is executing + // Note: this does not seem to work--processes with this seem to just hang instead of terminating __try { diff --git a/Source/CNTK/ModelEditLanguage.cpp b/Source/CNTK/ModelEditLanguage.cpp index 9b1ec6fa1..612b96e63 100644 --- a/Source/CNTK/ModelEditLanguage.cpp +++ b/Source/CNTK/ModelEditLanguage.cpp @@ -100,7 +100,7 @@ template void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigParamList& params) { std::string name = p_name; - if (EqualInsensitive(name, "CreateModel")) //create a blank model + if (EqualInsensitive(name, "CreateModel")) // create a blank model { size_t numFixedParams = 0, numOptionalParams = 0; if (params.size() > numFixedParams + numOptionalParams || params.size() < numFixedParams) @@ -109,7 +109,7 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa auto cn = make_shared<ComputationNetwork>(CPUDEVICE); OverrideModelNameAndSetDefaultModel(cn); } - if (EqualInsensitive(name, "CreateModelWithName")) //create a blank model + if (EqualInsensitive(name, "CreateModelWithName")) // create a blank model { size_t numFixedParams = 1, numOptionalParams = 0; if (params.size() > numFixedParams + numOptionalParams || params.size() < numFixedParams) @@ -139,6 +139,16 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa std::wstring modelFormat = GetOptionalModelFormat(params, numFixedParams); auto cn = make_shared<ComputationNetwork>(CPUDEVICE); +#if 1 // support for a specific kind of legacy format, for the sole purpose of allowing users to convert (=load & save) them + if (modelFormat == L"cntk_legacy_no_tensorlib") + { + cn->Read<ElemType>(params[1]); + for (auto node : cn->FeatureNodes()) + node->SetDims(TensorShape(node->GetNumRows()), 0); // pre-tensorlib InputValues had incorrect tensor dimensions + cn->CompileNetwork(); + } + else +#endif cn->Load<ElemType>(params[1]); OverrideModelNameAndSetDefaultModel(cn, params[0]); } @@ -189,8 +199,7 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa // validate the network before we save it out ProcessNDLScript(m_netNdlDefault, ndlPassAll, true); - - cn->Save(fileName); + cn->SaveEdited(fileName); } else if (EqualInsensitive(name, "SaveModel")) { @@ -209,7 +218,7 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa // validate and finish the second pass through NDL if any in-line NDL was defined ProcessNDLScript(netNdl, ndlPassAll, true); - netNdl->cn->Save(fileName); + netNdl->cn->SaveEdited(fileName); } else if (EqualInsensitive(name, "SetDefaultModel")) { diff --git a/Source/CNTK/ModelEditLanguage.h b/Source/CNTK/ModelEditLanguage.h index 117470b80..538922692 100644 --- a/Source/CNTK/ModelEditLanguage.h +++ b/Source/CNTK/ModelEditLanguage.h @@ -443,6 +443,10 @@ public: { modelFormat = L"cntk"; } + else if (EqualInsensitive(value, "cntk_legacy_no_tensorlib")) // model of late 2015 which had a bug in setting InputValue's tensor dimensions + { + modelFormat = L"cntk_legacy_no_tensorlib"; + } else { RuntimeError("Invalid optional parameter value %s, valid values are: format=(cntk)", value.c_str()); diff --git a/Source/CNTK/SimpleNetworkBuilder.cpp b/Source/CNTK/SimpleNetworkBuilder.cpp index 6ab9a5204..bf76efded 100644 --- 
a/Source/CNTK/SimpleNetworkBuilder.cpp +++ b/Source/CNTK/SimpleNetworkBuilder.cpp @@ -2423,9 +2423,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { Matrix<ElemType> priorVals = ReadMatrixFromDbnFile(fstream, std::string("Pu")); assert(priorVals.GetNumCols() == 1 && priorVals.GetNumRows() == m_outputLayerSize); - w = builder.Mean(label, L"Prior"); - static_pointer_cast<PreComputedNode<ElemType>>(w)->SideLoadFromMatrix(priorVals); - w->SetParameterUpdateRequired(false); + prior = builder.Mean(label, L"Prior"); + static_pointer_cast<PreComputedNode<ElemType>>(prior)->SideLoadFromMatrix(priorVals); + prior->SetParameterUpdateRequired(false); } else // pretrained network - need to add output layer, initialize { @@ -2465,7 +2465,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (layerType == "perceptron" || m_needPrior) { - input = builder.Log(pcNodePtr, L"LogOfPrior"); + input = builder.Log(prior, L"LogOfPrior"); // the following two lines are needed only if the true probability is needed //output = builder.Softmax(output); diff --git a/Source/CNTK/prebuild.bat b/Source/CNTK/prebuild.bat index 9f841d104..12631cf52 100644 --- a/Source/CNTK/prebuild.bat +++ b/Source/CNTK/prebuild.bat @@ -33,6 +33,16 @@ if "%cuda_path%" == "" ( echo #define _CUDA_PATH_ "%cuda_path:\=\\%" >> buildinfo.h$$ ) +if not "%cudnn_path%" == "" ( + echo #define _CUDNN_PATH_ "%cudnn_path:\=\\%" >> buildinfo.h$$ + ) + +if not "%cub_path%" == "" ( + echo #define _CUB_PATH_ "%cub_path:\=\\%" >> buildinfo.h$$ + ) + + + echo #endif >> buildinfo.h$$ ::: update file only if it changed (otherwise CNTK.cpp will get rebuilt each time) diff --git a/Source/Common/Include/Sequences.h b/Source/Common/Include/Sequences.h index c70862369..f3af694fb 100644 --- a/Source/Common/Include/Sequences.h +++ b/Source/Common/Include/Sequences.h @@ -84,6 +84,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { ptrdiff_t tBegin; // first time index in this minibatch. Note that this may be negative if the sequence started before this MB. size_t tEnd; // end = first frame index after final frame. May be beyond the minibatch if the real sequence is longer than the MB. bool operator==(const SequenceInfo & other) const { return seqId == other.seqId && s == other.s && tBegin == other.tBegin && tEnd == other.tEnd; } + size_t GetNumTimeSteps() const { return (size_t)(tEnd - tBegin); } }; // ------------------------------------------------------------------- @@ -270,6 +271,15 @@ namespace Microsoft { namespace MSR { namespace CNTK { // I'd love to start with all-gaps, but that would require setting flags upfront, and then clearing them. 
void AddGap(size_t s, ptrdiff_t beginTime, size_t endTime) { if ((ptrdiff_t)endTime > beginTime) AddSequence(GAP_SEQUENCE_ID, s, beginTime, endTime); } + // find a sequence by its id + const SequenceInfo & FindSequence(UniqueSequenceId seqId) const + { + for (const auto & seqInfo : m_sequences) + if (seqInfo.seqId == seqId) + return seqInfo; + LogicError("FindSequence: Requested sequence (id %u) not found.", (unsigned int) seqId); + } + // ------------------------------------------------------------------- // inquire about gaps or boundaries // ------------------------------------------------------------------- @@ -427,6 +437,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { public: // TODO: make private (currently used from masking and DataFor) ; TODO: rename all members with m_ prefix size_t timeIdxInSeq; // start frame; SIZE_MAX = all frames in MB ptrdiff_t m_timeOffset; // this is added to timeIdxInSeq wherever it is used + size_t m_timeRange; // use this to describe a custom range > 1 frame size_t seqIndex; // parallel-sequence index; SIZE_MAX = all sequences in MB (most common case) --TODO: Bad name, 'sequence' and 'parallel sequence' are two different things MBLayoutPtr m_pMBLayout; // layout associated with this bool m_broadcastAllowed; // frame range may be broadcast from outer layout (e.g. a matrix with NULL layout and 1 column is acceptable to this frame range) @@ -434,7 +445,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { public: // can construct from a single size_t -> a single-frame range - FrameRange(MBLayoutPtr pMBLayout, size_t timeIdxInSeq) : timeIdxInSeq(timeIdxInSeq), m_timeOffset(0), seqIndex(SIZE_MAX), m_pMBLayout(pMBLayout), m_broadcastAllowed(false), parent(nullptr) {} + FrameRange(MBLayoutPtr pMBLayout, size_t timeIdxInSeq) : timeIdxInSeq(timeIdxInSeq), m_timeOffset(0), m_timeRange(1), seqIndex(SIZE_MAX), m_pMBLayout(pMBLayout), m_broadcastAllowed(false), parent(nullptr) {} // or without arguments -> entire minibatch / no frame-range FrameRange(MBLayoutPtr pMBLayout) : FrameRange(pMBLayout, SIZE_MAX) {} @@ -471,7 +482,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { } // create a FrameRange with a time offset - // Note: This currently does not work in conjunction with IsAllFrames(). This would be a nice-to have, but tricky w.r.t. out-of-bounds accesses. + // If IsAllFrames() then this will cause out-of-bounds slices. 
FrameRange WithTimeOffset(ptrdiff_t offset) const { FrameRange ret = *this; @@ -479,6 +490,24 @@ return ret; } + // create a FrameRange with a time range > 1 + FrameRange WithTimeRange(size_t range) const + { + FrameRange ret = *this; + if (!IsAllFrames()) + ret.m_timeRange = range; + return ret; + } + + // dimension we are iterating over; -1 means time dimension; 0 means no layout + int GetIterationDimension() const + { + if (!m_pMBLayout) + return 0; + else + return -1; // TODO: allow user to specify other dimensions + } + class IndexIteration // range for range-based for over sequences { size_t m_beginIndex, m_endIndex; @@ -753,7 +782,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (startColumn >= numCols) LogicError("DataFor: FrameRange specifies a time index that is out of range."); if (fr.seqIndex == SIZE_MAX) - return std::pair<size_t, size_t>(startColumn, numParallelSequences); + return std::pair<size_t, size_t>(startColumn, numParallelSequences * fr.m_timeRange); + else if (fr.m_timeRange != 1) + LogicError("DataFor: FrameRange only supports per-sequence time ranges with tensor slices, not matrix slices."); else return std::pair<size_t, size_t>(startColumn + fr.seqIndex, 1); } @@ -778,7 +809,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { // TensorSliceWithMBLayoutFor() -- Return tensor slice for a FrameRange with specified number of columns with a given MBLayout // This implements the logic of interpreting the FrameRange object. // Unlike the matrix version above, this supports iteration indices other than time. - // TODO: This ^^. Still missing is a field to identify the index. + // TODO: This ^^. FrameRange is still missing a field to identify the index. + // This function happily returns tensor bounds that are out of bounds, assuming the caller will do the right thing. // ----------------------------------------------------------------------- template <class DimensionVector> // e.g. std::vector<size_t> or SmallVector<ptrdiff_t> @@ -787,6 +819,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { const MBLayoutPtr & pMBLayout/*the MB layout of 'data'*/) { std::pair<DimensionVector, DimensionVector> result; + typedef decltype(result.first[0]) ElemType; // this creates a slice for the entire matrix, which we will then narrow down result.first.resize(shape.size(), 0); @@ -795,8 +828,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { // get position of time and sequence index // These are only valid if we have a layout. // In the future, the 'timeDim' will be identified by the FrameRange. + int iterDimParam = fr.GetIterationDimension(); + size_t iterDim = iterDimParam > 0 ? iterDimParam - 1/*regular dimensions are specified as 1-based*/ : shape.size() + iterDimParam/*-1 for time dimension*/; size_t sequenceDim = shape.size() - 2; // TODO: In case of multiple time dims, this must be adjusted. - size_t timeDim = sequenceDim + 1; // TODO: Get this from the FrameRange object. // MBLayout of data and of FrameRange must be identical pointers, // or in case of broadcasting, respective parent pointers. @@ -819,28 +853,33 @@ namespace Microsoft { namespace MSR { namespace CNTK { // but as a reference (e.g. it cannot be resized) else if (!pMBLayout || fr.IsAllFrames()) { - if (fr.m_timeOffset != 0) // entire minibatch with non-zero offset exceeds bounds on at least one side - LogicError("DataFor: Iteration offset must not be specified for FrameRanges that reference the entire minibatch."); - // TODO: Can we allow this? Semantics would be different, it would crop frames outside. 
+ if (fr.m_timeOffset) + { + if (iterDim >= result.first.size()) + LogicError("DataFor: Time offset cannot be applied to tensors that have no time dimension."); + result.first[iterDim] += (ElemType)fr.m_timeOffset; // Note: If we have an offset, this is guaranteed to yield a slice that is out of bounds. + result.second[iterDim] += (ElemType)fr.m_timeOffset; + if (result.first[iterDim] > result.second[iterDim]) + LogicError("DataFor: Numeric wraparound. You used a size_t vector where an int vector would be needed."); + } } // FrameRange refers to a time slice -> return that - else if (result.second[timeDim] > 1) // (if time dim is broadcasting then always return that one independent of requested index) + else if (result.second[iterDim] > 1) // (if time dim is broadcasting then always return that one independent of requested index) { - size_t t = fr.timeIdxInSeq + fr.m_timeOffset; - if (t >= result.second[timeDim]) - LogicError("DataFor: FrameRange specifies an iteration index that is out of range."); - result.first[timeDim] = t; - result.second[timeDim] = t + 1; + size_t ts = fr.timeIdxInSeq + fr.m_timeOffset; + size_t te = ts + fr.m_timeRange; + result.first[iterDim] = (ElemType)ts; + result.second[iterDim] = (ElemType)te; } - + // sequence index if (fr.seqIndex != SIZE_MAX/*sequence requested*/ && pMBLayout/*have sequences*/ && result.second[sequenceDim] > 1/*>1 sequence (not broadcasting)*/) { size_t s = fr.seqIndex; if (s >= result.second[sequenceDim]) LogicError("DataFor: FrameRange specifies a parallel-sequence index that is out of range."); - result.first[sequenceDim] = s; - result.second[sequenceDim] = s + 1; + result.first[sequenceDim] = (ElemType)s; + result.second[sequenceDim] = (ElemType)s + 1; } return result; diff --git a/Source/Common/Include/TensorShape.h b/Source/Common/Include/TensorShape.h index 49e563c82..bedecbce3 100644 --- a/Source/Common/Include/TensorShape.h +++ b/Source/Common/Include/TensorShape.h @@ -104,7 +104,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { void resize(size_t sz, const T & val) { if (sz < m_size) m_size = sz; else while (m_size < sz) push_back(val); } void assign(size_t sz, const T & val) { clear(); resize(sz, val); } template <typename ITER> - void append(ITER beg, const ITER & end) { while (beg != end) push_back(*beg++); } + void append(ITER beg, const ITER & end) { while (beg != end) push_back((T)*beg++); } // typecast allows signed/unsigned conversions template <typename ITER> void assign(ITER beg, const ITER & end) { clear(); append(beg,end); } void operator=(const SmallVector & other) { m_size = other.m_size; memcpy(m_data, other.m_data, other.m_size * sizeof(T)); } @@ -180,8 +180,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { // boilerplate bool operator==(const TensorShape & other) const { return m_dims == other.m_dims; } - void Invalidate() { m_dims.assign(3, SIZE_MAX); } // TODO: clean up the valid/invalid situation (this is currently done inconsistently). Also this object is immutable. 
- // verify that this refers to a dense matrix (no strides) void VerifyIsDense() const { @@ -374,7 +372,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (size() != bounds.first.size() || size() != bounds.second.size()) LogicError("NarrowedTo: Bounds parameter must have same rank as tensor."); for (size_t k = 0; k < size(); k++) - if (bounds.second[k] <= bounds.first[k] || bounds.second[k] > m_dims[k]) + if (bounds.second[k] <= bounds.first[k] || (size_t)bounds.second[k] > m_dims[k]) LogicError("NarrowedTo: Invalid bounds parameter, dimensions must be at least one."); for (size_t k = 0; k < size(); k++) { diff --git a/Source/Common/Include/latticearchive.h b/Source/Common/Include/latticearchive.h index d1411396d..ca489ad5c 100644 --- a/Source/Common/Include/latticearchive.h +++ b/Source/Common/Include/latticearchive.h @@ -51,6 +51,7 @@ enum mbrclassdefinition // used to identify definition of class in minimum b // =========================================================================== class lattice { + mutable int verbosity; struct header_v1_v2 { size_t numnodes : 32; @@ -567,11 +568,13 @@ private: std::vector backptroffsets; // TODO: we could change this to 'unsigned int' to save some transfer time std::vector backptrstorage; // CPU-side versions use this as the traceback buffer; CUDA code has its CUDA-side buffer size_t numofstates; // per sil hmm + int verbosity; public: - backpointers (const lattice & L, const msra::asr::simplesenonehmm & hset) : numofstates(0) + backpointers (const lattice & L, const msra::asr::simplesenonehmm & hset, int verbosity=0) : numofstates(0) { size_t edgeswithsilence = 0; // (diagnostics only: number of edges with at least one /sil/) size_t backptrbufsize = 0; // number of entries in buffer for silence backpointer array, used as cursor as we build it + backptroffsets.resize (L.edges.size() + 1); // +1, so that the final entry determines the overall size of the allocated buffer const size_t silUnitId = hset.gethmmid ("sil"); numofstates = hset.gethmm (silUnitId).getnumstates(); @@ -595,15 +598,18 @@ private: #if 1 // multiple /sil/ -> log this (as we are not sure whether this is actually proper--probably it is) if (numsilunits > 1) { - fprintf (stderr, "backpointers: lattice '%S', edge %d has %d /sil/ phonemes\n", L.getkey(), j, (int)numsilunits); - fprintf (stderr, "alignments: :"); - foreach_index (a, aligntokens) + if (verbosity) { - const auto & unit = aligntokens[a]; - const auto & hmm = hset.gethmm (unit.unit); - fprintf (stderr, "%s,%.2f:", hmm.getname(), unit.frames / 100.0f); + fprintf(stderr, "backpointers: lattice '%S', edge %d has %d /sil/ phonemes\n", L.getkey(), j, (int)numsilunits); + fprintf(stderr, "alignments: :"); + foreach_index(a, aligntokens) + { + const auto & unit = aligntokens[a]; + const auto & hmm = hset.gethmm(unit.unit); + fprintf(stderr, "%s,%.2f:", hmm.getname(), unit.frames / 100.0f); + } + fprintf(stderr, "\n"); } - fprintf (stderr, "\n"); } #endif if (numsilunits > 0) @@ -611,7 +617,8 @@ private: backptrbufsize += maxsilframes * numofstates; } backptroffsets[L.edges.size()] = backptrbufsize; // (TODO: remove if not actually needed) - fprintf (stderr, "backpointers: %.1f%% edges have at least one /sil/ unit inside\n", 100.0f * ((float) edgeswithsilence / L.edges.size())); + if (verbosity) + fprintf (stderr, "backpointers: %.1f%% edges have at least one /sil/ unit inside\n", 100.0f * ((float) edgeswithsilence / L.edges.size())); } // CUDA support const std::vector & getbackptroffsets() const { return 
backptroffsets; } @@ -1002,6 +1009,10 @@ public: std::wstring key; // (keep our own name (key) so we can identify ourselves for diagnostics messages) const wchar_t * getkey() const { return key.c_str(); } + + void setverbosity(int veb) const { + verbosity = veb; + } }; // =========================================================================== @@ -1016,6 +1027,8 @@ class archive // set of lattice archive files referenced // Note that .toc files can be concatenated, i.e. one .toc file can reference multiple archive files. std::vector<std::wstring> archivepaths; // [archiveindex] -> archive path + std::wstring prefixPathInToc; // prefix path in a TOC; used to avoid having to pushd to some path before starting training + mutable int verbosity; size_t getarchiveindex (const std::wstring & path) // get index of a path in archivepaths[]; create new entry if needed { auto iter = std::find (archivepaths.begin(), archivepaths.end(), path); @@ -1042,7 +1055,8 @@ class archive { // need to read the map and establish the mapping // get the symlist file const std::wstring symlistpath = archivepaths[archiveindex] + L".symlist"; - fprintf (stderr, "getcachedidmap: reading '%S'\n", symlistpath.c_str()); + if (verbosity > 0) + fprintf (stderr, "getcachedidmap: reading '%S'\n", symlistpath.c_str()); std::vector<char> textbuffer; auto lines = msra::files::fgetfilelines (symlistpath, textbuffer); // establish mapping of each entry to the corresponding id in 'symmap'; this should fail if the symbol is not found @@ -1092,19 +1106,25 @@ class archive public: // construct = open the archive //archive() : currentarchiveindex (SIZE_MAX) {} - + void setverbosity(int veb) const + { + verbosity = veb; + } // test if this object is loaded with anything (if not, an empty set of TOC paths was passed--meaning disable lattice mode) bool empty() const { return archivepaths.empty(); } // construct from a list of TOC files - archive (const std::vector<std::wstring> & tocpaths, const std::unordered_map<std::string, size_t> & modelsymmap) : currentarchiveindex (SIZE_MAX), modelsymmap (modelsymmap) + archive (const std::vector<std::wstring> & tocpaths, const std::unordered_map<std::string, size_t> & modelsymmap, const std::wstring prefixPath=L"") + : currentarchiveindex(SIZE_MAX), modelsymmap(modelsymmap), prefixPathInToc(prefixPath), verbosity(0) { if (tocpaths.empty()) // nothing to read--keep silent return; fprintf (stderr, "archive: opening %d lattice-archive TOC files ('%S' etc.)..", (int)tocpaths.size(), tocpaths[0].c_str()); + size_t onepercentage = tocpaths.size() / 100 ? tocpaths.size() / 100 : 1; foreach_index (i, tocpaths) { - fprintf (stderr, "."); + if ((i % onepercentage) == 0) + fprintf (stderr, "."); open (tocpaths[i]); } fprintf (stderr, " %d total lattices referenced in %d archive files\n", (int)toc.size(), (int)archivepaths.size()); @@ -1135,7 +1155,11 @@ public: RuntimeError("open: invalid TOC line (no [): %s", line); if (q != p) { - const std::wstring archivepath = msra::strfun::utf16 (std::string (p, q - p)); + std::wstring archivepath = msra::strfun::utf16 (std::string (p, q - p)); + if (!prefixPathInToc.empty()) + { + archivepath = prefixPathInToc + L"/" + archivepath; + } // TODO: should we allow paths relative to TOC file? 
archiveindex = getarchiveindex (archivepath); } @@ -1207,6 +1231,7 @@ public: fsetpos (f, offset); // get it L.fread (f, idmap, spunit); + L.setverbosity(verbosity); #ifdef HACK_IN_SILENCE // hack to simulate DEL in the lattice const size_t silunit = getid (modelsymmap, "sil"); const bool addsp = true; diff --git a/Source/Common/Include/latticesource.h b/Source/Common/Include/latticesource.h index fcf046b68..0ec12508e 100644 --- a/Source/Common/Include/latticesource.h +++ b/Source/Common/Include/latticesource.h @@ -23,10 +23,11 @@ public: class latticesource { const msra::lattices::archive numlattices, denlattices; + int verbosity; public: typedef msra::dbn::latticepair latticepair; - latticesource (std::pair<std::vector<std::wstring>, std::vector<std::wstring>> latticetocs, const std::unordered_map<std::string, size_t> & modelsymmap) - : numlattices (latticetocs.first, modelsymmap), denlattices (latticetocs.second, modelsymmap) {} + latticesource (std::pair<std::vector<std::wstring>, std::vector<std::wstring>> latticetocs, const std::unordered_map<std::string, size_t> & modelsymmap, std::wstring RootPathInToc) + : numlattices (latticetocs.first, modelsymmap, RootPathInToc), denlattices (latticetocs.second, modelsymmap, RootPathInToc), verbosity(0) {} bool empty() const { @@ -52,6 +53,12 @@ public: denlattices.getlattice (key, LP->second, expectedframes); // this loads the lattice from disk, using the existing L.second object L = LP; } + + void setverbosity(int veb) + { + verbosity = veb; + numlattices.setverbosity(veb); denlattices.setverbosity(veb); + } }; }} \ No newline at end of file diff --git a/Source/ComputationNetworkLib/CompositeComputationNodes.h b/Source/ComputationNetworkLib/CompositeComputationNodes.h index f8f79dc21..8400407b7 100644 --- a/Source/ComputationNetworkLib/CompositeComputationNodes.h +++ b/Source/ComputationNetworkLib/CompositeComputationNodes.h @@ -296,6 +296,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { CreateMatrixIfNull(m_value); m_value->SetValue(value); m_hasComputed = true; + SetDims(TensorShape(value.GetNumRows()), value.GetNumCols()); } public: bool m_hasComputed; diff --git a/Source/ComputationNetworkLib/ComputationNetwork.cpp b/Source/ComputationNetworkLib/ComputationNetwork.cpp index 8da7ba6c7..75c989bc5 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.cpp +++ b/Source/ComputationNetworkLib/ComputationNetwork.cpp @@ -62,6 +62,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // break cycles // BUGBUG: This only works if nodes are not shared across networks. // Once we allow that (BrainScript editing), we need proper cycle detectors. Luckily, we know our cycles, so it won't be too hard. + // Or just use weak ptrs. for (auto & iter : m_nameToNodeMap) iter.second->DetachInputs(); @@ -74,8 +75,17 @@ namespace Microsoft { namespace MSR { namespace CNTK { // serialization // ----------------------------------------------------------------------- + // for saving a model after editing--the network is possibly not validated/compiled + void ComputationNetwork::SaveEdited(const wstring& fileName, const FileOptions fileFormat) + { + if (!IsCompiled()) + CompileNetwork(); + Save(fileName, fileFormat); + } + void ComputationNetwork::Save(const wstring& fileName, const FileOptions fileFormat) const { + VerifyIsCompiled("Save"); // In case of parallel training only the main node should be saving the model to prevent // the parallel training nodes from colliding to write the same file // TODO: This does not belong here. 
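// ============================================================================
// [Editor's note] Illustrative sketch, not part of the patch. The Save()/
// SaveEdited() split above changes the calling contract: Save() now requires a
// compiled network (VerifyIsCompiled), while SaveEdited() compiles on demand.
// Under that reading, an edit-then-save sequence would look like this
// (hypothetical driver code; all calls shown are introduced or kept by this patch):
//
//     ComputationNetwork net(CPUDEVICE);
//     net.Read<float>(L"in.dnn");      // deserialize only; no CompileNetwork()
//     net.RenameNode(L"W", L"W_old");  // editing invalidates the compiled state
//     net.SaveEdited(L"out.dnn");      // recompiles if needed, then calls Save()
//
// Calling net.Save(L"out.dnn") directly after the edit would instead fail with
// LogicError("Save: A compiled network was expected.").
// ============================================================================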
@@ -182,7 +192,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // load the section of nodes that contain persistable parameters // This is used for reloading a model without recreating it, e.g. during training. // TODO: Why not just reload it? Because SGD::Train() holds pointers to the parameters directly? That should be fixed. - template void ComputationNetwork::LoadPersistableParameters(File & fstream, bool create) + template void ComputationNetwork::ReadPersistableParameters(File & fstream, bool create) { fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BCN"); @@ -221,47 +231,15 @@ namespace Microsoft { namespace MSR { namespace CNTK { fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ENodeList"); } - template void ComputationNetwork::Load(const wstring& fileName, const FileOptions fileFormat, const bool /*bAllowNoCriterionNode --unused*/, ComputationNetwork* anotherNetwork) + // deserialize the model + // This does not post-process the model (CompileNetwork()). Use Load() instead. + template void ComputationNetwork::Read(const wstring& fileName, const FileOptions fileFormat, const bool /*bAllowNoCriterionNode --unused*/, ComputationNetwork* anotherNetwork) { ClearNetwork(); File fstream(fileName, fileFormat | FileOptions::fileOptionsRead); -#if 1 - LoadPersistableParameters(fstream, true); -#else - fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BCN"); - - // model version - size_t modelVersion = CNTK_MODEL_VERSION_1; //if version info is not there it is version 1 - if (fstream.TryGetMarker(FileMarker::fileMarkerBeginSection, L"BVersion")) - { - fstream >> modelVersion; - fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EVersion"); - } - - size_t numNodes; - fstream >> numNodes; - - // get all node info first - fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BNodeList"); - for (size_t i = 0; i < numNodes; i++) - { - wstring opName, nodeName; - fstream >> opName >> nodeName; - - auto newNode = ComputationNetworkBuilder::NewNode(opName, m_deviceId, nodeName); - - if (!newNode) - { - fprintf(stderr, "Unknown ComputationNode type %ls (node name %ls)\n", opName.c_str(), nodeName.c_str()); - InvalidArgument("Invalid node type."); - } - newNode->Load(fstream, modelVersion); - AddNodeToNet(newNode); - } - fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ENodeList"); -#endif + ReadPersistableParameters(fstream, true); size_t numNodes = m_nameToNodeMap.size(); @@ -277,9 +255,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { vector childrenNames; childrenNames.resize(numChildren); for (size_t j = 0; j < numChildren; j++) - { fstream >> childrenNames[j]; - } // TODO: how does the file distinguish float from double? ComputationNodeBasePtr nodePtr = GetNodeFromName(nodeName); @@ -288,42 +264,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { for (int j = 0; j < numChildren; j++) childrenNodes[j] = GetNodeFromName(childrenNames[j], anotherNetwork); - //if (nodePtr->OperationName() == OperationNameOf(RowStackNode)) - //{ - // allow for variable input nodes - nodePtr->AttachInputs(childrenNodes); - //} - //else - //{ - // // fixed input nodes - // // TODO: Use the variable-length AttachInputs() as well. This is a refactoring left-over. 
- // switch (numChildren) - // { - // case 1: - // nodePtr->AttachInputs(childrenNodes[0]); - // break; - // case 2: - // nodePtr->AttachInputs(childrenNodes[0], childrenNodes[1]); - // break; - // case 3: - // nodePtr->AttachInputs(childrenNodes[0],childrenNodes[1], childrenNodes[2]); - // break; - // case 4: - // nodePtr->AttachInputs(childrenNodes[0], childrenNodes[1], childrenNodes[2], childrenNodes[3]); - // break; - // case 5: - // nodePtr->AttachInputs(childrenNodes[0], childrenNodes[1], childrenNodes[2], childrenNodes[3], childrenNodes[4]); - // break; - // case 6: - // nodePtr->AttachInputs(childrenNodes[0], childrenNodes[1], childrenNodes[2], childrenNodes[3], childrenNodes[4], childrenNodes[5]); - // break; - // default: - // LogicError("Invalid number of children."); - // } - //} + nodePtr->AttachInputs(childrenNodes); } } - fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ERelation"); fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BRootNodes"); @@ -340,7 +283,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { fstream >> nodeName; m_features.push_back(GetNodeFromName(nodeName)); } - fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EFeatureNodes"); } @@ -353,7 +295,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_labels.push_back(GetNodeFromName(nodeName)); } } - + // BUGBUG: Should this be inside the block? fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ELabelNodes"); if (fstream.TryGetMarker(FileMarker::fileMarkerBeginSection, L"BCriterionNodes") || @@ -372,13 +314,13 @@ } } - // TODO: this section is defunct + // TODO: this section is defunct, skip over if (fstream.TryGetMarker(FileMarker::fileMarkerBeginSection, L"BNodesReqMultiSeqHandling")) { fprintf(stderr, "WARNING: Ignoring defunct 'BNodesReqMultiSeqHandling' section in input file.\n"); fstream >> num; for (size_t i = 0; i < num; i++) - fstream >> nodeName; + fstream >> nodeName; // dummy fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ENodesReqMultiSeqHandling"); } @@ -415,13 +357,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EPairNodes"); } } - fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ERootNodes"); fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ECN"); - - // perform all further post-processing, caching, etc. - CompileNetwork(); } // ----------------------------------------------------------------------- @@ -622,9 +560,21 @@ // set sequence training parameters, e.g. smoothing weight, frame drop threshold template <class ElemType> - void ComputationNetwork::SetSeqParam(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, double hsmoothingWeight, double frameDropThresh, const bool doreferencealign) + void ComputationNetwork::SetSeqParam(ComputationNetworkPtr net, + const ComputationNodeBasePtr criterionNode, + const double& hsmoothingWeight, + const double& frameDropThresh, + const bool& doreferencealign, + const double& amf /*= 14.0f*/, + const double& lmf /*= 14.0f*/, + const double& wp /*= 0.0f*/, + const double& bMMIfactor /*= 0.0f*/, + const bool& sMBR /*= false*/ + ) { fprintf(stderr, "Setting Hsmoothing weight to %.8g and frame-dropping threshold to %.8g\n", hsmoothingWeight, frameDropThresh); + fprintf(stderr, "Setting SeqGammar-related parameters: amf=%.2f, lmf=%.2f, wp=%.2f, bMMIFactor=%.2f, usesMBR=%s\n", + amf, lmf, wp, bMMIfactor, sMBR ? 
"true" : "false"); list seqNodes = net->GetNodesWithType(OperationNameOf(SequenceWithSoftmaxNode), criterionNode); if (seqNodes.size() == 0) { @@ -638,6 +588,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { node->SetSmoothWeight(hsmoothingWeight); node->SetFrameDropThresh(frameDropThresh); node->SetReferenceAlign(doreferencealign); + node->SetGammarCalculationParam(amf, lmf, wp, bMMIfactor, sMBR); } } } @@ -1114,18 +1065,20 @@ namespace Microsoft { namespace MSR { namespace CNTK { } template void ComputationNetwork::InitLearnableParameters(const ComputationNodeBasePtr& node, const bool uniformInit, const unsigned long randomSeed, const float initValueScale, bool initOnCPUOnly); - template void ComputationNetwork::Load(const wstring& fileName, const FileOptions fileFormat, const bool bAllowNoCriterionNode, ComputationNetwork* anotherNetwork); - template void ComputationNetwork::LoadPersistableParameters(File & fstream, bool create); + template void ComputationNetwork::Read(const wstring& fileName, const FileOptions fileFormat, const bool bAllowNoCriterionNode, ComputationNetwork* anotherNetwork); + template void ComputationNetwork::ReadPersistableParameters(File & fstream, bool create); template void ComputationNetwork::PerformSVDecomposition(const map& SVDConfig, size_t alignedsize); template /*static*/void ComputationNetwork::SetDropoutRate(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double & prevDropoutRate, unsigned long & dropOutSeed); - template void ComputationNetwork::SetSeqParam(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, double hsmoothingWeight, double frameDropThresh, const bool doreferencealign); + template void ComputationNetwork::SetSeqParam(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, const double& hsmoothingWeight, const double& frameDropThresh, const bool& doreferencealign, + const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR); template void ComputationNetwork::InitLearnableParameters(const ComputationNodeBasePtr& node, const bool uniformInit, const unsigned long randomSeed, const double initValueScale, bool initOnCPUOnly); - template void ComputationNetwork::Load(const wstring& fileName, const FileOptions fileFormat, const bool bAllowNoCriterionNode, ComputationNetwork* anotherNetwork); - template void ComputationNetwork::LoadPersistableParameters(File & fstream, bool create); + template void ComputationNetwork::Read(const wstring& fileName, const FileOptions fileFormat, const bool bAllowNoCriterionNode, ComputationNetwork* anotherNetwork); + template void ComputationNetwork::ReadPersistableParameters(File & fstream, bool create); template void ComputationNetwork::PerformSVDecomposition(const map& SVDConfig, size_t alignedsize); template /*static*/void ComputationNetwork::SetDropoutRate(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double & prevDropoutRate, unsigned long & dropOutSeed); - template void ComputationNetwork::SetSeqParam(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, double hsmoothingWeight, double frameDropThresh, const bool doreferencealign); + template void ComputationNetwork::SetSeqParam(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, const double& hsmoothingWeight, const double& frameDropThresh, const bool& doreferencealign, + const double& amf, const double& lmf, const double& wp, const double& 
bMMIfactor, const bool& sMBR); // register ComputationNetwork with the ScriptableObject system ScriptableObjects::ConfigurableRuntimeTypeRegister::Add registerComputationNetwork(L"ComputationNetwork"); diff --git a/Source/ComputationNetworkLib/ComputationNetwork.h b/Source/ComputationNetworkLib/ComputationNetwork.h index 0a9b3bf8a..426dea48f 100644 --- a/Source/ComputationNetworkLib/ComputationNetwork.h +++ b/Source/ComputationNetworkLib/ComputationNetwork.h @@ -78,24 +78,33 @@ public: // ----------------------------------------------------------------------- void Save(const std::wstring& fileName, const FileOptions fileFormat = FileOptions::fileOptionsBinary) const; + void SaveEdited(const std::wstring& fileName, const FileOptions fileFormat = FileOptions::fileOptionsBinary); private: void SaveToFileImpl(const std::wstring& fileName, const FileOptions fileFormat) const; public: template - void LoadPersistableParameters(File & fstream, bool create); + void ReadPersistableParameters(File & fstream, bool create); // reload node content only, e.g. used by SGD::Train() when going back to an older model that had better training objective template - void ReloadPersistableParameters(const std::wstring& fileName) + void RereadPersistableParameters(const std::wstring& fileName) { File fstream(fileName, FileOptions::fileOptionsBinary | FileOptions::fileOptionsRead); - LoadPersistableParameters(fstream, false); + ReadPersistableParameters(fstream, false); } // design BUGBUG: binary files do not know whether they are float or double. // TODO: modify file format to know this; then eliminate the dependency (and in some future, allow nodes to be different) template + void Read(const std::wstring& fileName, const FileOptions fileFormat = FileOptions::fileOptionsBinary, + const bool bAllowNoCriterionNode = false, ComputationNetwork* anotherNetwork = nullptr); + template void Load(const std::wstring& fileName, const FileOptions fileFormat = FileOptions::fileOptionsBinary, - const bool bAllowNoCriterionNode = false, ComputationNetwork* anotherNetwork = nullptr); + const bool bAllowNoCriterionNode = false, ComputationNetwork* anotherNetwork = nullptr) + { + Read(fileName, fileFormat, bAllowNoCriterionNode, anotherNetwork); + // perform all further post-processing, caching, etc. 
+ CompileNetwork(); + } // static helper to instantiate a network from a file template @@ -159,9 +168,11 @@ public: private: void ValidateNodes(list nodes, bool isFinalValidationPass, size_t & todo); void ValidateSubNetwork(const ComputationNodeBasePtr& rootNode); + void MarkValueNonSharableNodes(); private: void DetermineSetOfAllRoots(); void CollectInputAndLearnableParameters(const ComputationNodeBasePtr& rootNode); + bool IsCompiled() const { return m_isCompiled; } void VerifyIsCompiled(const char * where) const; //bool BuiltAndValidatedSubNetwork(const ComputationNodeBasePtr & rootNode); public: @@ -411,8 +422,20 @@ public: template static void SetDropoutRate(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double & prevDropoutRate, unsigned long & dropOutSeed); + + + template - static void SetSeqParam(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, double hsmoothingWeight, double frameDropThresh, const bool doreferencealign); + static void SetSeqParam(ComputationNetworkPtr net, + const ComputationNodeBasePtr criterionNode, + const double& hsmoothingWeight, + const double& frameDropThresh, + const bool& doreferencealign, + const double& amf=14.0f, + const double& lmf=14.0f, + const double& wp=0.0f, + const double& bMMIfactor=0.0f, + const bool& sMBR=false); static void SetMaxTempMemSizeForCNN(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const size_t maxTempMemSizeInSamples); // ----------------------------------------------------------------------- diff --git a/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp b/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp index 6dddc73a3..2f408b797 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp @@ -30,6 +30,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { std::wstring toName, const CopyNodeFlags flags) { + InvalidateCompiledNetwork(); + if (toName == L"") toName = fromName; @@ -50,11 +52,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { } else { - //node already exists - + // node already exists pToNode = GetNodeFromName(toName); - //same node. no copy needed + // same node. 
no copy needed if (pFromNode == pToNode) LogicError("CopyNode: You are copying the node to the same network with same node name."); else @@ -69,6 +70,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { const std::wstring fromName, std::wstring toNamePrefix, const CopyNodeFlags flags) { + InvalidateCompiledNetwork(); + if (!(flags & CopyNodeFlags::copyNodeValue)) LogicError("CopySubTree: you cannot copy a tree without copying the node values."); @@ -103,7 +106,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { // nodeNameNew - new node name void ComputationNetwork::RenameNode(const std::wstring& nodeNameOrig, const std::wstring& nodeNameNew) { - // so that renamed node will not be referenced InvalidateCompiledNetwork(); ComputationNodeBasePtr nodeToRename = GetNodeFromName(nodeNameOrig); @@ -128,7 +130,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { void ComputationNetwork::DeleteNode(const std::wstring & nodeName) { - // so that deleted node will not be referenced InvalidateCompiledNetwork(); ComputationNodeBasePtr nodeToDelete = GetNodeFromName(nodeName); @@ -172,6 +173,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { // need to update all the mappings as well childrens void ComputationNetwork::ChangeNode(wstring nodeName, ComputationNodeBasePtr newNode) { + InvalidateCompiledNetwork(); + ComputationNodeBasePtr oldNode = GetNodeFromName(nodeName); if (oldNode->OperationName() != newNode->OperationName()) InvalidArgument("newNode must have the same type as the old node."); @@ -204,6 +207,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { // need to update those nodes who use oldNode as their child void ComputationNetwork::ReplaceLeafNode(wstring oldNodeName, ComputationNodeBasePtr newNode) { + InvalidateCompiledNetwork(); + ComputationNodeBasePtr oldNode = GetNodeFromName(oldNodeName); // change the input of those nodes whose child is oldNode @@ -223,6 +228,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { void ComputationNetwork::ReplaceFinalCriterionNode(wstring oldNodeName, ComputationNodeBasePtr newNode) { + InvalidateCompiledNetwork(); + // Checks if the node is a criterion node. int index = -1; for (int i = 0; i < m_finalCriteria.size(); ++i) @@ -251,6 +258,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { void ComputationNetwork::AddFeatureNode(ComputationNodeBasePtr featureNode) { + InvalidateCompiledNetwork(); + wstring nodeName = featureNode->NodeName(); if (NodeNameExists(nodeName)) RuntimeError("AddFeatureNode: feature node already exists."); @@ -261,12 +270,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { // We only remove the node, not delete it. void ComputationNetwork::RemoveFeatureNode(ComputationNodeBasePtr featureNode) { + InvalidateCompiledNetwork(); + wstring nodeName = featureNode->NodeName(); if (!NodeNameExists(nodeName)) RuntimeError("RemoveFeatureNode: feature node does not exist."); - InvalidateCompiledNetwork(); - // Removes links. 
for (auto nodeIter = m_nameToNodeMap.begin(); nodeIter != m_nameToNodeMap.end(); ++nodeIter) { diff --git a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp index 084fe9ce9..9a297d5c7 100644 --- a/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp +++ b/Source/ComputationNetworkLib/ComputationNetworkEvaluation.cpp @@ -10,11 +10,13 @@ #include "ComputationNode.h" #include "ComputationNetwork.h" #include "RecurrentNodes.h" +#include "InputAndParamNodes.h" #include #include #include #include #include +#include using namespace std; @@ -365,7 +367,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // verify that network has undergone CompileNetwork() void ComputationNetwork::VerifyIsCompiled(const char * where) const { - if (!m_isCompiled) + if (!IsCompiled()) LogicError("%s: A compiled network was expected.", where); } @@ -712,6 +714,63 @@ namespace Microsoft { namespace MSR { namespace CNTK { // ----------------------------------------------------------------------- // memory allocation // ----------------------------------------------------------------------- + // mark nodes that are purely induced by parameters as non-sharable and create space for value if null + void ComputationNetwork::MarkValueNonSharableNodes() + { + const auto & nodes = GetEvalOrder(nullptr); + std::map<wstring, bool> allLeafDescendentsAreParameters; + std::list<ComputationNodeBasePtr> allLearnableParameters = GetNodesWithType(OperationNameOf(LearnableParameter)); + // note: we cannot use m_learnableParameters because we need all parameter nodes, regardless of whether they require updates or not + + for (auto& node : nodes) + { + auto children = node->GetInputs(); + wstring myname = node->NodeName(); + bool allParameters = true; + + if (children.size()) // we don't do the check for leaf nodes, because all the possible leaf nodes (input/parameter/precompute nodes) are marked as non-sharable already + { + for (auto child : children) + { + wstring ChildName = child->NodeName(); + if (allLeafDescendentsAreParameters.find(ChildName) == allLeafDescendentsAreParameters.end()) + { + // not found, which means it is a leaf node (we are traversing in eval order) + assert(child->IsLeaf() || child->IsPartOfLoop()); + if (std::find(allLearnableParameters.begin(), allLearnableParameters.end(), child) != allLearnableParameters.end()) + { + allLeafDescendentsAreParameters[ChildName] = true; + } + else + { + allParameters = false; + allLeafDescendentsAreParameters[ChildName] = false; + break; + } + } + else + { + if (allLeafDescendentsAreParameters[ChildName] == false) + { + allParameters = false; + break; + } + } + } + allLeafDescendentsAreParameters[myname] = allParameters; + if (allParameters) + { + node->MarkValueNonSharable(); + } + else + { + node->MarkValueSharable(); + } + } + } + + } + // this function will need to be called before actual validation and execution to // predetermine how to share matrices to reduce memory usage. 
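// ============================================================================
// [Editor's note] Minimal self-contained sketch, not part of the patch: the
// rule MarkValueNonSharableNodes() implements above is "a node's value is
// non-sharable iff every leaf reachable from it is a learnable parameter".
// The toy types below are hypothetical; this restatement is recursive with
// memoization, whereas the code above iterates in evaluation order.

#include <cstdio>
#include <map>
#include <string>
#include <vector>

struct ToyNode
{
    std::string name;
    bool isParameter;             // leaf playing the role of LearnableParameter
    std::vector<ToyNode*> inputs; // empty => leaf (parameter or input/feature)
};

static bool AllLeavesAreParameters(ToyNode* n, std::map<ToyNode*, bool>& memo)
{
    auto it = memo.find(n);
    if (it != memo.end())
        return it->second;
    bool all = n->inputs.empty() ? n->isParameter : true;
    for (ToyNode* child : n->inputs)
        all = all && AllLeavesAreParameters(child, memo);
    return memo[n] = all;
}

int main()
{
    ToyNode W1{ "W1", true, {} }, W2{ "W2", true, {} }, X{ "X", false, {} };
    ToyNode t1{ "Times(W1,W2)", false, { &W1, &W2 } }; // induced purely by parameters
    ToyNode t2{ "Times(t1,X)", false, { &t1, &X } };   // depends on an input
    std::map<ToyNode*, bool> memo;
    for (ToyNode* n : { &t1, &t2 })
        printf("%-13s -> %s\n", n->name.c_str(),
               AllLeavesAreParameters(n, memo) ? "non-sharable" : "sharable");
    // Times(W1,W2) -> non-sharable (its value is kept out of the matrix pool)
    // Times(t1,X)  -> sharable     (its value may be released after forward prop)
}
// ============================================================================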
@@ -726,9 +785,12 @@ VerifyIsCompiled("AllocateAllMatrices"); + // Due to special topology, if a node is solely induced by parameters, its function value should not be shared + MarkValueNonSharableNodes(); + bool performingBackPropagation = (trainRootNode != nullptr); - // Create a composite Eval order with the specfied nodes as roots + // Create a composite Eval order with the specified nodes as roots std::vector<ComputationNodeBasePtr> forwardPropRoots; forwardPropRoots.insert(forwardPropRoots.end(), evalRootNodes.begin(), evalRootNodes.end()); forwardPropRoots.insert(forwardPropRoots.end(), outValueRootNodes.begin(), outValueRootNodes.end()); diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index b4e0725bd..af9851bd0 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -136,7 +136,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { typedef std::shared_ptr NodeStatePtr; virtual NodeStatePtr ExportState() = 0; - virtual void ImportState(NodeStatePtr && state) = 0; + virtual void ImportState(const NodeStatePtr & state) = 0; }; typedef IStatefulNode::NodeStatePtr NodeStatePtr; @@ -151,7 +151,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { friend class ComputationNetwork; ComputationNetworkOwnedNodeState() : - m_needsGradient(false) + m_needsGradient(false), m_valueSharable(true) { PurgeStateForFormingRecurrentLoops(); m_isPartOfLoop = false; @@ -166,10 +166,17 @@ namespace Microsoft { namespace MSR { namespace CNTK { bool IsPartOfLoop() const { return m_isPartOfLoop; } + virtual void MarkValueNonSharable() { m_valueSharable = false; } + virtual void MarkValueSharable() { m_valueSharable = true; } + bool isValueSharable() const { return m_valueSharable; } + protected: // TODO: should be fully encapsulated here bool m_needsGradient; // true if this node or any children need a gradient to be computed (for own consumption or propagation to somewhere in the child tree) + bool m_valueSharable; // flag needed for memory sharing. + // If it is false (e.g., LearnableParameters/InputValues and nodes solely induced by LearnableParameters), + // it will never be released to the memory pool private: bool m_isPartOfLoop; // true if this node is part of a recurrent loop @@ -250,7 +257,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_deviceId(deviceId), m_outputNeededDuringBackprop(true), m_parameterUpdateRequired(false), m_gradientInitialized(false), m_nodeName(name == L"" ? 
CreateUniqNodeName() : name), - m_numRows(0), m_numCols(0) + m_numRows(0), m_numCols(0) { } virtual ~ComputationNodeBase(){} @@ -348,9 +355,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { const TensorShape & GetSampleLayout() const { return m_sampleLayout; } bool HasSampleLayout() const { return m_sampleLayout.GetRank() != 1; } // meaning does it have a layout that is not just a vector + TensorShape GetTensorShape(size_t rank) const; // form the actual tensor that describes the full object protected: size_t DetermineElementwiseTensorRank() const; // determine tensor rank when considering all inputs with padding - TensorShape GetTensorShape(size_t rank) const; // form the actual tensor that describes the full object TensorShape GetTensorSliceFor(size_t rank, const FrameRange & fr) const; // form tensor shape of the slice referenced by FrameRange public: // access to element(0,0) without having to type-cast @@ -455,6 +462,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { LogicError("VerifyNumParallelSequences: value inconsistent with MB layout"); } + protected: public: // ...the following should be protected, but nodes inquire about their children, requiring public access @@ -537,7 +545,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { void SetOutputNeededDuringBackprop(bool f) { m_outputNeededDuringBackprop = f; } bool IsOutputNeededDuringBackprop() const { - return !g_shareNodeValueMatrices || m_outputNeededDuringBackprop; + return !g_shareNodeValueMatrices || m_outputNeededDuringBackprop ; } const size_t GetNumInputs() const { return m_inputs.size(); } @@ -769,6 +777,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { bool m_parameterUpdateRequired; // update parameters? Only used for LearnableParameters. --TODO: Should we make this a member of LearnableParameters actually? And require a type cast? Currently it is read out for all leaves. 
bool m_gradientInitialized; // indicates whether the gradient matrix has been resized and initialized to 0 bool m_outputNeededDuringBackprop; // indicates whether the output value of the node is needed during backprop + }; typedef ComputationNodeBase::ComputationNodeBasePtr ComputationNodeBasePtr; @@ -902,7 +911,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { //don't release matrices that need to be used in the gradient computation virtual void ReleaseMatricesAfterForwardProp(MatrixPool& matrixPool) { - if (!IsOutputNeededDuringBackprop() && (m_value->GetMatrixType() != SPARSE)) + if (!IsOutputNeededDuringBackprop() && (m_value->GetMatrixType() != SPARSE) && isValueSharable()) ReleaseMatrixToPool(m_value, matrixPool); } @@ -931,7 +940,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // Release the Value matrix only if the output value is needed during backprop // since in the case it isn't used, we release it during forward prop itself - if (IsOutputNeededDuringBackprop() && m_value->GetMatrixType() != SPARSE) + if (IsOutputNeededDuringBackprop() && m_value->GetMatrixType() != SPARSE && isValueSharable()) ReleaseMatrixToPool(m_value, matrixPool); } } @@ -1317,6 +1326,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { CreateMatrixIfNull(m_gradient); } + void MarkValueNonSharable() override + { + m_valueSharable = false; + CreateMatrixIfNull(m_value); + } + + protected: // this function is used to create matrices for those needed before matrix pool is available @@ -1532,7 +1548,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { #define UsingComputationNodeMembers /*without OperationName; needed to support inconsistent pattern of InputValue--TODO: This comment it out of date. */ \ protected: \ typedef shared_ptr> ComputationNodePtr; \ - using Base::m_deviceId; using Base::GetDeviceId; using Base::SetDims; using Base::SetDims1; using Base::SetNumCols; using Base::GetNumRows; using Base::GetNumCols; using Base::UpdateFunctionValuesSize; using Base::LoadValue; \ + using Base::m_deviceId; using Base::shared_from_this; using Base::GetDeviceId; using Base::SetDims; using Base::SetDims1; using Base::SetNumCols; \ + using Base::GetNumRows; using Base::GetNumCols; using Base::GetTensorShape; using Base::UpdateFunctionValuesSize; using Base::LoadValue; \ using Base::m_pMBLayout; using Base::GetNumTimeSteps; using Base::GetNumParallelSequences; \ using Base::MaskMissingColumnsToZero; using Base::MaskMissingValueColumnsToZero; using Base::MaskMissingGradientColumnsToZero; using Base::InvalidateMissingValueColumns; using Base::InvalidateMissingGradientColumns; \ using Base::DataFor; using Base::ValueFor; using Base::Gradient; using Base::GradientFor; \ diff --git a/Source/ComputationNetworkLib/ConvolutionalNodes.h b/Source/ComputationNetworkLib/ConvolutionalNodes.h index 3d2a7a343..6d36a33a9 100644 --- a/Source/ComputationNetworkLib/ConvolutionalNodes.h +++ b/Source/ComputationNetworkLib/ConvolutionalNodes.h @@ -813,9 +813,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { void SetEvalMode(bool bnEvalMode) { - m_eval = bnEvalMode; + m_eval = bnEvalMode; } - private: struct VersionInfo { diff --git a/Source/ComputationNetworkLib/InputAndParamNodes.h b/Source/ComputationNetworkLib/InputAndParamNodes.h index 898d35f9f..f53ef7a38 100644 --- a/Source/ComputationNetworkLib/InputAndParamNodes.h +++ b/Source/ComputationNetworkLib/InputAndParamNodes.h @@ -41,6 +41,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { Base(deviceId, name) { 
m_parameterUpdateRequired = true; + this->m_valueSharable = false; SetDims(TensorShape(), 0); } LearnableParameter(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & shape) : @@ -48,6 +49,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { m_parameterUpdateRequired = true; CreateMatrixIfNull(m_value); + this->m_valueSharable = false; // for now we split off the trailing dimension into the matrix column dimension // TODO: This is for compat, but it is inconsistent. Decide what a sample layout means for a node without MBLayout w.r.t. non-tensor ops. auto dims = shape.GetDims(); @@ -197,6 +199,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { PrintNodeValuesToFile(printValues, fstream); } + }; #if 0 @@ -261,6 +264,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { SetDims(sampleLayout, 0); UpdateFunctionValuesSize(); // we must allocate the matrix so that the readers get objects with valid row dimensions (some readers expect that) m_parameterUpdateRequired = false; + this->m_valueSharable = false; } protected: InputValueBase(DEVICEID_TYPE deviceId, const wstring & name, const TensorShape & sampleLayout, bool isSparse) : diff --git a/Source/ComputationNetworkLib/NonlinearityNodes.h b/Source/ComputationNetworkLib/NonlinearityNodes.h index 52dd27935..07ceba027 100644 --- a/Source/ComputationNetworkLib/NonlinearityNodes.h +++ b/Source/ComputationNetworkLib/NonlinearityNodes.h @@ -44,7 +44,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void /*ComputationNode::*/ForwardProp(const FrameRange & fr) override { - static int c = 0; if (c++ == 0) { fprintf(stderr, "#NLop%d#\n", (int)opForward); } + //static int c = 0; if (c++ == 0) { fprintf(stderr, "#NLop%d#\n", (int)opForward); } size_t rank = DetermineElementwiseTensorRank(); auto result = ValueTensorFor(rank, fr); diff --git a/Source/ComputationNetworkLib/RecurrentNodes.h b/Source/ComputationNetworkLib/RecurrentNodes.h index 080f43d44..997f7b209 100644 --- a/Source/ComputationNetworkLib/RecurrentNodes.h +++ b/Source/ComputationNetworkLib/RecurrentNodes.h @@ -9,6 +9,7 @@ #include "Matrix.h" #include "TensorShape.h" #include "ComputationNode.h" +#include "Sequences.h" #include #include @@ -26,7 +27,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // ----------------------------------------------------------------------- - // ShiftNode (input, fromOffset, boundaryValue, dim=-1, numSteps=1, insertDim=0) -- delay and rolling window + // ShiftNode (input, fromOffset, boundaryValue, dim=-1) -- delay and rolling window // // This shifts the input by (-fromOffset) steps. In other words, output(t) will be input(t+fromOffset). // E.g. for fromOffset=-1, this gives the past value. @@ -34,36 +35,19 @@ namespace Microsoft { namespace MSR { namespace CNTK { // // This node can be used in a recurrent loop. This requires special handling by the ComputationNetwork, // for both execution (sequential execution) and creation (avoiding circular references). - // TODO: When outside a recurrent loop and used with frame randomization, this will communicate to the reader - // that additional frames are needed, which will then return a frame range. TODO: This will not match - // the labels, which are still 1 frame. Think through which dimension this should go in. // // Values shifted in from beyond sequence boundaries will be copied from boundaryValue. // Normally, this is a scalar Constant(). However, it can be any node, which will be indexed from the end - // (e.g.
for fromOffset=-1, the last frame of boundaryValue will be used). This can implement - // sequence-to-sequence models. Broadcasting is supported, so it can be e.g. a single output-dimension vector + // (e.g. for fromOffset=-1, the last frame of boundaryValue will be used). This can implement the basic + // sequence-to-sequence model. Broadcasting is supported, so it can be e.g. a single output-dimension vector // applied to all sequences. // // To delay (past value), use negative fromOffset. To access future value, use positive fromOffset. // - // To pull in multiple offsets, use offsetRange>1. This will pull in offsetRange consecutive offsets starting - // with fromOffset. This implements a rolling window. A new dimension will be inserted at multiOffsetDim - // (default 0 means after the last sample dimension). Special considerations: - // - If the boundaryValue is not wide enough, the sequence will be dropped (e.g. if you pull in 5 history frames, - // but the sequence in boundaryValue only has 4 samples). - // - If you feed back such an expanded output into this node in a loop, you get an inconsistency - // and will eventually fail. You must pull the dimensions apart. - // - If the current time step (offset 0) is included in the range (e.g. fromOffset=-1, offsetRange=3) then - // this node cannot participate in a recurrence. - // // By default, this shifts over the time dimension, but you can choose to shift over any // sample tensor dimension instead using 'dim' (-1 stands for time). This will only work, however, // when all involved nodes are implemented using the tensor library. Nodes implemented using // Matrix slices can only support iterating over time. - // - // If the boundaryValue has 0 elements, the sequence will be trimmed (frames reaching beyond the boundary - // are dropped). This will initially not be implemented for the time dimension (as it would require - // change of MBLayout). // ----------------------------------------------------------------------- template @@ -74,24 +58,24 @@ namespace Microsoft { namespace MSR { namespace CNTK { public: enum BoundaryMode : int // how to fill frames at boundaries { - reachAcross = -1, // go across the boundary: use boundaryValue. This is for recurrence. - duplicate = 0, // duplicate frame at boundary, e.g. duplicate first frame. Non-recurrent mode only. - trim = 1 // drop frames. Non-recurrent mode only. + reachAcross = -1, // go across the boundary: use boundaryValue + duplicate = 0 // duplicate frame at boundary, e.g. duplicate first frame. Non-recurrent mode only. 
}; - ShiftNode(DEVICEID_TYPE deviceId, const wstring & name, int fromOffset, BoundaryMode boundaryMode, int shiftDimension, size_t numSteps, int insertedDimParam) : - Base(deviceId, name), m_fromOffset(fromOffset), m_numSteps(numSteps), + ShiftNode(DEVICEID_TYPE deviceId, const wstring & name, int fromOffset, BoundaryMode boundaryMode, int shiftDimParam) : + Base(deviceId, name), m_fromOffset(fromOffset), m_boundaryMode(boundaryMode), - m_shiftDimension(shiftDimension), m_insertedDimParam(insertedDimParam), - m_insertExpandShapeAt(SIZE_MAX/*uninitialized at this point*/) + m_shiftDimParam(shiftDimParam), + m_shiftDim(SIZE_MAX), + m_state(deviceId) { CreateMatrixIfNull(m_value); SetDims(TensorShape(), 0); // empty for now } ShiftNode(DEVICEID_TYPE deviceId, const wstring & name) : - ShiftNode(deviceId, name, 1, BoundaryMode::reachAcross, -1, 1, 0) + ShiftNode(deviceId, name, 1, BoundaryMode::reachAcross, -1) { } ShiftNode(const ScriptableObjects::IConfigRecordPtr configp) : - ShiftNode(configp->Get(L"deviceId"), L"", configp->Get(L"fromOffset"), (BoundaryMode)(int)configp->Get(L"boundaryMode"), configp->Get(L"dim"), configp->Get(L"numSteps"), configp->Get(L"insertedDim")) + ShiftNode(configp->Get(L"deviceId"), L"", configp->Get(L"fromOffset"), (BoundaryMode)(int)configp->Get(L"boundaryMode"), configp->Get(L"dim")) { // We do NOT attach the inputs, as we cannot resolve the main input without causing a circular reference. // Instead, we capture them in a lambda, which will be called by ComputationNetwork during the build process through LateAttachInputs() below. @@ -111,19 +95,13 @@ namespace Microsoft { namespace MSR { namespace CNTK { void Save(File& fstream) const { Base::Save(fstream); - fstream << m_fromOffset << m_numSteps << m_boundaryMode << m_shiftDimension << m_insertedDimParam; + fstream << m_fromOffset << m_boundaryMode << m_shiftDimParam; } virtual void Load(File& fstream, size_t modelVersion) override { Base::Load(fstream, modelVersion); - fstream >> m_fromOffset >> m_numSteps >> m_boundaryMode >> m_shiftDimension >> m_insertedDimParam; - } - - virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override - { - assert(inputIndex == 0); inputIndex; - fr; + fstream >> m_fromOffset >> m_boundaryMode >> m_shiftDimParam; } virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } @@ -133,6 +111,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { { Base::BeginForwardProp(); + // TODO: If we have a truncated-BPTT state then verify that the sequence indices match with m_state->m_sequences, and the tensor dimensions. + // in case of trimming, narrow the layout // We actually do not drop content, only reduce the range of sequences. // This is meant to optimize for the case where we have multiple sequences concatenated while trimming a small amount only. @@ -142,34 +122,216 @@ namespace Microsoft { namespace MSR { namespace CNTK { { Base::EndForwardProp(); - // In BPTT, we carry over left-to-right state across minibatches. + // In truncated BPTT, we carry over left-to-right state across minibatches. // The necessary frames are stored in m_state->m_delayedValue. - // Only if layout has anything exceeding the MB. 
+ if (GetMBLayout()->HasSequenceBeyondEnd()) // only if layout has any sequence that extends beyond this minibatch + { + } + else + m_state.clear(); + } + private: + typedef std::pair, SmallVector> SliceBounds; // slice bounds for dimension k are [first[k], second[k]) (think STL begin/end) + + TensorView DataTensorFor(Matrix & data, TensorShape shape/*original shape of 'data'*/, SliceBounds slice) + { + shape.NarrowTo(slice); + return TensorView(data, shape); } - // This function assumes BeginForwardProp/EndForwardProp() to be called before/after the iteration loop. + // helper to shift dimension 'm_shiftDim' of SliceBounds by an offset (a common operation below) + SliceBounds ShiftDim(const SliceBounds & in, int shiftBy) + { + SliceBounds result = in; + result.first [m_shiftDim] += shiftBy; + result.second[m_shiftDim] += shiftBy; + return result; + } + + static SmallVector ToIntDims(const TensorShape & shape) + { + SmallVector dimsSigned; + dimsSigned.append(shape.GetDims().begin(), shape.GetDims().end()); // we need the bounds as signed integers as they may shift into negative ranges + return dimsSigned; + } + + // determine shapes and slices to move + // This is used for both forward and backprop. + // 'In' below refers to Input(0), whereas 'Out' refers to the output of *this. + void DetermineSlices(size_t rank, const FrameRange & fr, + TensorShape & inShape, TensorShape & outShape, // our MB's shape + SliceBounds & inSliceLogical, SliceBounds & outSliceLogical) // the logical ranges to shift + { + // get the slice bounds for the given FrameRange + outShape = GetTensorShape(rank); // describes the full tensor including sequence and time dimensions + inShape = Input(0)->GetTensorShape(rank); + + // determine the logical in and out slices + // This may now have bounds that fall outside, which we need to split off next. + outSliceLogical = TensorSliceWithMBLayoutFor(ToIntDims(outShape), fr, GetMBLayout()); + inSliceLogical = TensorSliceWithMBLayoutFor(ToIntDims(inShape), fr.WithTimeOffset(m_fromOffset), GetMBLayout()); // apply the offset + } + + // determine stripes to move w.r.t. main storage and from/to state + // For efficiency: + // - this function assumes that the return values have been freshly constructed (it won't reset them) + // - it may return a slice with end < begin which indicates an empty slice + void PartitionSlices(const SliceBounds & inSliceLogical, const SliceBounds & outSliceLogical, // the move we want to make + int T, // our actual size + SliceBounds & inSliceMain, SliceBounds & outSliceMain, // the part that goes main-to-main + SliceBounds & inSliceState, SliceBounds & outSliceState) // the part that goes from/to state + { + inSliceMain = inSliceLogical; + outSliceMain = outSliceLogical; + if (inSliceMain.first[m_shiftDim] < 0) + { + assert(inSliceMain.second[m_shiftDim] < T); + if (!m_state.empty()) // truncated BPTT case + { + // determine range that lives in state + SliceBounds inSliceOutside = inSliceMain; // beginning falls to the left of the MB + if (inSliceOutside.second[m_shiftDim] > 0) + inSliceOutside.second[m_shiftDim] = 0; // trim end; e.g. [-2,97) -> [-2,0), but [-2,-1) remains + // now inSliceOutside represents only the region that falls outside + + // map to dimensions of our saved state + inSliceState = ShiftDim(inSliceOutside, m_state.m_shape[m_shiftDim]); // assign the output parameter (a local declaration here would shadow it) + // E.g.
for offset = -4, m_state will be 4 elements, so [-2,0) -> [2,4), and [-2,-1) -> [2,3) + + // map to target dimensions + outSliceState = ShiftDim(inSliceOutside, -m_fromOffset); + assert(inSliceState == outSliceState); // (when we fall out on the left, both must be the same) + } + // else: no truncated BPTT means we must have a proper boundary. So don't write those values here, they will be initialized with boundary values below. + + // and trim main (if 'from' is entirely outside, such as in the common single-frame case, we get begin >= end) + outSliceMain.first[m_shiftDim] += -inSliceMain.first[m_shiftDim]; + inSliceMain.first[m_shiftDim] += -inSliceMain.first[m_shiftDim]; + assert(inSliceMain.first[m_shiftDim] == 0); + } + else if (inSliceMain.second[m_shiftDim] > T) + { + if (!m_state.empty()) + { + // determine range to get from state + SliceBounds inSliceOutside = inSliceMain; + if (inSliceOutside.first[m_shiftDim] < T) + inSliceOutside.first[m_shiftDim] = T; // trim begin; e.g. [2,102) -> [100,102), but [101,102) remains + // now inSliceOutside is where we should copy from, with indices completely out of bounds + + // map to dimensions of our saved state + inSliceState = ShiftDim(inSliceOutside, -T); + // E.g. for offset = 4, m_state will be 4 elements, so [100,102) -> [0,2), and [101,102) -> [1,2) + + // map to target dimensions + outSliceState = ShiftDim(inSliceOutside, T - m_fromOffset); + // E.g. [0,2) -> [96,98), and [1,2) -> [97,98) + } + // and trim main (if 'from' is entirely outside, such as in the common single-frame case, we get begin >= end) + outSliceMain.first[m_shiftDim] -= (inSliceMain.second[m_shiftDim] - T); + inSliceMain.second[m_shiftDim] -= (inSliceMain.second[m_shiftDim] - T); + assert(inSliceMain.second[m_shiftDim] == T); + } + } + public: virtual void ForwardProp(const FrameRange & fr) override { + if (fr.GetIterationDimension() != m_shiftDimParam) + LogicError("ShiftNode::ForwardProp(): FrameRange not iterating over user-specified dimension."); + + // for debugging, invalidate the output region, so we will catch if we missed updating something +#ifdef _DEBUG + ValueFor(fr).Invalidate(); +#endif + // STEP 1: wholesale copy a shifted version of the input to the output + // - consider the saved parts from the last minibatch as part of the input at dimensions beyond the bounds - // - ignore boundary conditions for now + // - ignore boundary conditions at this point (will be fixed subsequently) + // This will copy a little too much in case of multiple concatenated sequences within a single parallel sequence. - // get the tensors without shift + // get the logical ranges we want to shift + TensorShape inShape, outShape; // expanded tensor shapes of input and output + SliceBounds inSliceLogical, outSliceLogical; // the logical ranges to shift size_t rank = DetermineElementwiseTensorRank(); - auto result = ValueTensorFor(rank, fr); - auto input = Input(0)->ValueTensorFor(rank, fr); + DetermineSlices(rank, fr, inShape, outShape, inSliceLogical, outSliceLogical); - // shift the dimension in the input + // now copy the two stripes--one that is main-to-main, and one that pulls in data from previous state (truncated BPTT only) + // This correctly handles the case where the input is a tensor with strides. This is currently not the case, but may be if we support in-place.
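// Worked example of the partitioning above (illustrative numbers, not part of the patch; it assumes
// m_state carries |m_fromOffset| saved frames): with T = 100 time steps and m_fromOffset = -2 under
// truncated BPTT, output(t) = input(t-2) decomposes into
//   logical input slice = [-2, 98)   // the output range [0,100) shifted by fromOffset
//   state-to-main:  state columns [0, 2) -> output [0, 2)    // from the previous minibatch
//   main-to-main:   input [0, 98)        -> output [2, 100)
// i.e. the first two output frames come from the saved activation, the rest from Input(0).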
+ + SliceBounds inSliceMain, outSliceMain; // main-to-main + SliceBounds inSliceState, outSliceState; // from state + PartitionSlices(inSliceLogical, outSliceLogical, outShape[m_shiftDim], inSliceMain, outSliceMain, inSliceState, outSliceState); + + if (!inSliceState.first.empty() && inSliceState.second[m_shiftDim] > inSliceState.first[m_shiftDim]) + { + // Note: If all sequences begin at the start of the range, this would copy invalid values which would be overwritten below. + // This is prevented in that m_state will be set to empty in the previous MB if all sequences ended, which will in turn return an empty slice. + auto from = DataTensorFor(m_state.m_delayedValue, m_state.m_shape, inSliceState); + auto to = DataTensorFor(Value(), outShape, outSliceState); + to.AssignCopyOf(from); + } + if (inSliceMain.second[m_shiftDim] > inSliceMain.first[m_shiftDim]) + { + auto from = DataTensorFor(Input(0)->Value(), inShape, inSliceMain); + auto to = DataTensorFor( Value(), outShape, outSliceMain); + to.AssignCopyOf(from); + } + // We have now pulled in everything that lies within the logical bounds. + // Any frame that pulls from outside contains invalid values (either not initialized or copied from an incorrect source), which must be fixed next. // STEP 2: fix up the boundary conditions - // - fill in xxx + // - fill in all frames that are too close to a boundary and must be filled from context (recurrent) or by replication (non-recurrent only) - // turn selected frame and shifted frame into a tensor + if (fr.IsAllFrames() || GetMBLayout()->IsBeyondStartOrEnd(fr.WithTimeOffset(m_fromOffset))) // short-cut test whether there is anything to do + { + auto ts = outSliceLogical.first[m_shiftDim]; + auto te = outSliceLogical.second[m_shiftDim]; + //size_t sequenceDim = outShape.size() - 2; // TODO: In case of multiple time dims, this must be adjusted. Code dup from TensorSliceWithMBLayoutFor(). Encapsulate this. + // iterate over all sequences in this batch and handle all that overlap with the target region + for (const auto & seq : GetMBLayout()->GetAllSequences()) + { + if (seq.tEnd <= ts || seq.tBegin >= te) // no overlap--skip + continue; - // copy all that's in range + // get tensor to fill in. This may be out of bounds, and may only partially overlap with [ts,te) + auto seqLen = abs(m_fromOffset); + auto seqBegin = m_fromOffset < 0 ? seq.tBegin : seq.tBegin + seq.GetNumTimeSteps() - seqLen; // e.g. m_fromOffset = -4 -> [0,4) , +4 -> [Len-4,Len) + auto outSliceFill = TensorSliceWithMBLayoutFor(ToIntDims(outShape), fr.WithTimeOffset(seqBegin).WithTimeRange(seqLen).Sequence(seq.s), GetMBLayout()); - // fix up all that is not + // get tensor to fill from + // We fill either from the provided boundary node or from ourselves (BoundaryMode::duplicate = clamp). + bool clamp = m_boundaryMode == BoundaryMode::duplicate; + ComputationNodeBasePtr boundaryNode = clamp ? shared_from_this() : Input(0); + auto boundaryShape = boundaryNode->GetTensorShape(rank); + auto fromSeq = clamp ? + seq.s : + boundaryNode->HasMBLayout() ?
+ boundaryNode->GetMBLayout()->FindSequence(seq.seqId).seqId : + SIZE_MAX; + auto fromBegin = 0; + auto boundarySliceLogical = TensorSliceWithMBLayoutFor(ToIntDims(boundaryShape), fr.WithTimeOffset(fromBegin).WithTimeRange(seqLen).Sequence(fromSeq), GetMBLayout()); + + boundarySliceLogical; + + //inSliceLogical = TensorSliceWithMBLayoutFor(ToIntDims(inShape), fr.WithTimeOffset(m_fromOffset), GetMBLayout()); // apply the offset + + + + // clip against [ts,te) + // copy + sin(1); + } + } + } + + virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override + { + // To allow for bulk gradient computation, we will clear out any gradient that should not be propagated. + // We do that directly to our incoming output gradient. This is OK because we own this, and it is no longer used after this operation + // (it is invalid to call BackpropTo() multiple times since it adds to the outgoing Input() gradient). + assert(inputIndex == 0); inputIndex; + fr; } virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override @@ -177,46 +339,29 @@ namespace Microsoft { namespace MSR { namespace CNTK { assert(m_inputs.size() == 2); ComputationNodeBase::Validate(isFinalValidationPass); - if (isFinalValidationPass) - sin(1.0f); - // MBLayout is just inherited m_pMBLayout = Input(0)->GetMBLayout(); if (isFinalValidationPass && !m_pMBLayout) InvalidArgument("%ls %ls operation must operate on data (must have an MB Layout).", NodeName().c_str(), OperationName().c_str()); - // determine final sample layout - auto inputSampleLayout = Input(0)->GetSampleLayout(); - auto inputDims = inputSampleLayout.GetDims(); - if (m_insertedDimParam < 0) - InvalidArgument("%ls %ls operation: Specified insertion location %d refers to a time dimension, but this is not allowed.", - NodeName().c_str(), OperationName().c_str(), m_insertedDimParam); - m_insertExpandShapeAt = m_numSteps > 1 ? 0 : (m_insertedDimParam > 0 ? m_insertedDimParam - 1 : inputDims.size()); - if (m_insertExpandShapeAt > inputDims.size()) - if (isFinalValidationPass) - InvalidArgument("%ls %ls operation: Specified insertion location %d beyond end of input sample layout [%s].", - NodeName().c_str(), OperationName().c_str(), m_insertedDimParam, string(inputSampleLayout).c_str()); - else - m_insertExpandShapeAt = inputDims.size(); // this may be an error, but we want to catch that only in the final pass - SmallVector dims; - if (m_numSteps > 1 && inputDims.size() + 1 > dims.capacity()) - InvalidArgument("%ls %ls operation: Too many dimensions. Did you feed back output of this node without stripping the extra dimensions?", - NodeName().c_str(), OperationName().c_str()); - dims.append(inputDims.begin(), inputDims.begin() + m_insertExpandShapeAt); - if (m_numSteps > 1) // insert the new dimension if we expand into more than one step - dims.push_back(m_numSteps); - dims.append(inputDims.begin() + m_insertExpandShapeAt, inputDims.end()); - auto sampleLayout = TensorShape(dims); + // as is the sample layout + SetDims(Input(0)); - SetDims(sampleLayout, 0); + // determine the dimension that is to be shifted (convert user-specified as a zero-based index) + if (isFinalValidationPass) + { + size_t rank = DetermineElementwiseTensorRank(); + auto valueShape = GetTensorShape(rank); // bounds of the Value() + m_shiftDim = m_shiftDimParam > 0 ? 
m_shiftDimParam - 1/*regular dimensions are specified as 1-based*/ : valueShape.size() + m_shiftDimParam/*-1 for time dimension*/; + } } // special interface for use by loop detection virtual int /*IRecurrentNode::*/GetRecurrenceSteppingDirection() const override { - if (m_boundaryMode != BoundaryMode::reachAcross) + if (m_boundaryMode != BoundaryMode::reachAcross) // duplicating boundary frames cannot be done with recurrence return 0; - else if (m_fromOffset + (int)m_numSteps <= 0) + else if (m_fromOffset < 0) return +1; else if (m_fromOffset > 0) return -1; @@ -231,48 +376,61 @@ namespace Microsoft { namespace MSR { namespace CNTK { { auto node = dynamic_pointer_cast>(nodeP); node->m_fromOffset = m_fromOffset; - node->m_numSteps = m_numSteps; node->m_boundaryMode = m_boundaryMode; - node->m_shiftDimension = m_shiftDimension; - node->m_insertedDimParam = m_insertedDimParam; - node->m_insertExpandShapeAt = m_insertExpandShapeAt; + node->m_shiftDimParam = m_shiftDimParam; + node->m_shiftDim = m_shiftDim; node->m_state = m_state; } } class ShiftNodeState : public INodeState { - Matrix m_delayedValue; // saves the activation of the previous step that this node points to - vector m_delayedSequences; // and associated sequence info. This is only used for consistency checking (it must match). + public: + Matrix m_delayedValue; // saves the activation of the previous step that this node points to + TensorShape m_shape; // tensor shape that describes m_delayedValue + vector m_delayedSequences; // and associated sequence info. This is only used for consistency checking (it must match). ShiftNodeState(DEVICEID_TYPE deviceId) : m_delayedValue(deviceId) { } + bool empty() const { return m_delayedSequences.empty(); } + void clear() { m_delayedValue.Resize(0, 0); m_shape = TensorShape(); m_delayedSequences.clear(); } }; typedef std::shared_ptr ShiftNodeStatePtr; // state export/import - // This is done with a shared_ptr. The moment state is exported, the internal state is cleared; ownership is transferred to the exporting entity. - // This way, the next invocation does not overwrite the exported state, but is required to create a new one if needed. - // On the other hand, once imported, the state object is owned by the node and will be overwritten with the next state. - virtual NodeStatePtr ExportState() { return std::move(m_state); } - virtual void ImportState(NodeStatePtr && state) override + // This is done with a shared_ptr. The current state is exported, the internal state is cleared. + // Ownership of members is logically transferred to the exporting entity. + // Physically, however, since we often transfer between CPU and GPU, activation data is merely copied, + // and the GPU or CPU object resized to (0,0) without giving up the memory. + virtual NodeStatePtr ExportState() // TODO: can we instead pass the shared_ptr object in? So we don't need to create a new one all the time? Or should we still take ownership of the ptr? 
{ - m_state = dynamic_pointer_cast(state); - if (state && !m_state) + auto state = make_shared(CPUDEVICE); + state->m_delayedValue.SetValue(m_state.m_delayedValue); // note: this will transfer from GPU to CPU + m_state.m_delayedValue.Resize(0, 0); + state->m_shape = std::move(m_state.m_shape); + state->m_delayedSequences = std::move(m_state.m_delayedSequences); + return state; + } + virtual void ImportState(const NodeStatePtr & statep) override + { + ShiftNodeStatePtr state = dynamic_pointer_cast(statep); + if (!state) LogicError("ImportState: Wrong state object passed (wrong type)."); + m_state.m_delayedValue.SetValue(state->m_delayedValue); // note: this will transfer from CPU to GPU + state->m_delayedValue.Resize(0, 0); + m_state.m_shape = std::move(state->m_shape); + m_state.m_delayedSequences = std::move(state->m_delayedSequences); } protected: // parameters remembered from construction - int m_fromOffset; // offset to pull from - int m_numSteps; // offset range - BoundaryMode m_boundaryMode; // how to fill at the boundary (reach across, duplicate, or trim) - int m_shiftDimension; // dimension to shift (default: time) - int m_insertedDimParam; // in case of multiple steps, this is where a new dimension will be inserted + int m_fromOffset; // offset to pull from + BoundaryMode m_boundaryMode; // how to fill at the boundary (reach across or duplicate) + int m_shiftDimParam; // dimension to shift (default: time) - // derived params set up in Validate() - size_t m_insertExpandShapeAt; // at which dimension to insert (internal 0-based index) + size_t m_shiftDim; // m_shiftDimParam matched to the real tensor index - ShiftNodeStatePtr m_state; // saves the activation of the previous step that this node points to + ShiftNodeState m_state; // state that is carried over across evaluations + // Note: The version held by this node lives in the GPU, whereas the versions being exported carry CPU-side copies - function m_attachInputsFn; // for late expansion of inputs (scripting) + function m_attachInputsFn; // for late expansion of inputs (scripting) }; // ----------------------------------------------------------------------- @@ -333,7 +491,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { // - ranges of neighbor frames as a secondary tensor dimension (i.e. can be used to implement a rolling window) // - full support/efficiency of non-recurrent use (in which case the range can be from negative to positive, e.g. a symmetric rolling window) // - denoting which tensor dimension to loop over (this may not be completed, but I will plant a seed) - // - support for Yongqiang’s sub-minibatching with BPTT (export/import state) + // - support for Yongqiang’s sub-minibatching with truncated BPTT (export/import state) // - more efficient storage of carried-over state (only store the needed frames, not a full copy of the previous MB as currently; which will on the other hand also allow windows that reach back beyond a minibatch) // ----------------------------------------------------------------------- @@ -486,7 +644,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { virtual void EndForwardProp() override // called after last iteration step of ForwardProp() { - // In BPTT, we carry over left-to-right state across minibatches. + // In truncated BPTT, we carry over left-to-right state across minibatches. // It is kept in m_delayedValue, m_delayedActivationMBLayout. 
// This could be optimized as follows: // - only keep the required number of frames (m_timeStep) @@ -620,27 +778,6 @@ namespace Microsoft { namespace MSR { namespace CNTK { } if (dir == -1) // we look into past { -#if 0 - bool allAtBoundary = true; - // if the current last frames are all sentence end or no feature , there is no need to carry on state info - if (m_pMBLayout->Is(FrameRange(nT-1), MinibatchPackingFlags::SequenceEnd | MinibatchPackingFlags::NoFeature)) - { - for (size_t u = 0; u < nU; u++) - { - if (!m_pMBLayout->Is(FrameRange(nT - 1).Sequence(u), MinibatchPackingFlags::SequenceEnd | MinibatchPackingFlags::NoFeature)) - { - allAtBoundary = false; - break; - } - } - } - else - { - allAtBoundary = false; - } - - if (allAtBoundary) -#endif if (!m_pMBLayout->HasSequenceBeyondEnd()) // only need to export state if anything crosses the MB boundary { auto pState = make_shared>(m_deviceId); @@ -655,26 +792,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { pExportedState = pState; } } - if (dir == 1) // we look into future + else if (dir == 1) // we look into future { -#if 0 - // TODO: check whether all at boundary and don't carry state if it is the case - size_t nT = m_pMBLayout->GetNumTimeSteps(); - size_t nU = m_pMBLayout->GetNumParallelSequences(); - bool allAtBoundary = true; - if (m_pMBLayout->Is(FrameRange(nullptr, 0), MinibatchPackingFlags::NoFeature | MinibatchPackingFlags::SequenceStart)) - { - for (size_t u = 0; u < nU; u++) - { - if (!m_pMBLayout->Is(FrameRange(nullptr, 0).Sequence(u), MinibatchPackingFlags::SequenceStart | MinibatchPackingFlags::NoFeature)) - { - allAtBoundary = false; - break; - } - } - } - if (allAtBoundary) -#endif if (!m_pMBLayout->HasSequenceBeyondBegin()) // only need to export state if anything crosses the MB boundary { auto pState = make_shared>(m_deviceId); @@ -689,19 +808,19 @@ namespace Microsoft { namespace MSR { namespace CNTK { pExportedState = pState; } } - if (dir != -1 && dir != 1) + else { - RuntimeError("Unrecognized direction in DelayedValueNodeBase"); + LogicError("Unrecognized direction in DelayedValueNodeBase"); } return pExportedState; } - virtual void /*IStatefulNode::*/ImportState(NodeStatePtr && pImportedState) override + virtual void /*IStatefulNode::*/ImportState(const NodeStatePtr & pImportedState) override { DelayedNodeStatePtr pState = dynamic_pointer_cast> (pImportedState); if (!pState) - RuntimeError("Expecting DelayValueNodeState after down casting"); + LogicError("Expecting DelayValueNodeState after downcasting"); pState->ExportDelayedMBLayout(m_delayedActivationMBLayout); // pstate copy to m_delayedActivationMBLayout if (pState->IsEmpty()) @@ -715,18 +834,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { int dir = direction; if (dir == -1) // looking backward - { m_delayedValue.SetColumnSlice(delayedActivation, (nT - 1)*nU, nU); - } - if (dir == 1) - { - //m_delayedValue.CopyColumnsStrided(delayedActivation, nU, 1, nT); + else if (dir == 1) m_delayedValue.SetColumnSlice(delayedActivation, 0, nU); - } - if (dir != -1 && dir == 1) - {// it is really a compile error ? 
- RuntimeError("Unrecognized direction in DelayedValueNodeBase"); - } + else + LogicError("Unrecognized direction in DelayedValueNodeBase"); } protected: diff --git a/Source/ComputationNetworkLib/TrainingCriterionNodes.h b/Source/ComputationNetworkLib/TrainingCriterionNodes.h index 0b73d69b7..a0f00586c 100644 --- a/Source/ComputationNetworkLib/TrainingCriterionNodes.h +++ b/Source/ComputationNetworkLib/TrainingCriterionNodes.h @@ -1234,8 +1234,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { } else if (inputIndex == 1) { - BackpropToRight(*m_softmaxOfRight, Input(0)->Value(), Input(inputIndex)->Gradient(), - Gradient(), *m_gammaFromLattice, m_fsSmoothingWeight, m_frameDropThreshold); + FrameRange fr(Input(0)->GetMBLayout()); + BackpropToRight(*m_softmaxOfRight, Input(0)->Value(), Input(inputIndex)->Gradient(), + Gradient(), *m_gammaFromLattice, m_fsSmoothingWeight, m_frameDropThreshold); + MaskMissingColumnsToZero(Input(inputIndex)->Gradient(), Input(0)->GetMBLayout(), fr); + #ifdef _DEBUG Input(inputIndex)->InvalidateMissingGradientColumns(FrameRange(Input(inputIndex)->GetMBLayout())); #endif @@ -1368,14 +1371,14 @@ namespace Microsoft { namespace MSR { namespace CNTK { RequestMatrixFromPool(m_gammaFromLattice, matrixPool); } - // Release gradient and temp matrices that are no longer needed after all the children's gradients are computed. - virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) - { - Base::ReleaseMatricesAfterBackprop(matrixPool); - ReleaseMatrixToPool(m_logSoftmaxOfRight, matrixPool); - ReleaseMatrixToPool(m_softmaxOfRight, matrixPool); - ReleaseMatrixToPool(m_gammaFromLattice, matrixPool); - } + //request matrices needed to do node function value evaluation + virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) + { + Base::ReleaseMatricesAfterBackprop(matrixPool); + ReleaseMatrixToPool(m_logSoftmaxOfRight, matrixPool); + ReleaseMatrixToPool(m_softmaxOfRight, matrixPool); + ReleaseMatrixToPool(m_gammaFromLattice, matrixPool); + } // TODO: method names should be CamelCase std::vector> * getLatticePtr() @@ -1415,6 +1418,17 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_doReferenceAlignment = doreferencealign; } + void SetGammarCalculationParam(const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR) + { + msra::lattices::SeqGammarCalParam param; + param.amf = amf; + param.lmf = lmf; + param.wp = wp; + param.bMMIfactor = bMMIfactor; + param.sMBRmode = sMBR; + m_gammaCalculator.SetGammarCalculationParams(param); + } + void gettime(unsigned long long &gammatime, unsigned long long &partialtime) { gammatime = m_gammatime; @@ -1427,6 +1441,11 @@ namespace Microsoft { namespace MSR { namespace CNTK { shared_ptr> m_gammaFromLattice; double m_frameDropThreshold; double m_fsSmoothingWeight; // frame-sequence criterion interpolation weight --TODO: can this be done outside? 
+ double m_seqGammarAMF; + double m_seqGammarLMF; + double m_seqGammarWP; + double m_seqGammarbMMIFactor; + double m_seqGammarUsesMBR; bool m_doReferenceAlignment; std::vector> m_lattices; msra::asr::simplesenonehmm m_hmm; diff --git a/Source/EvalDll/EvalDll.vcxproj b/Source/EvalDll/EvalDll.vcxproj index 71e515bc8..a535ca3ff 100644 --- a/Source/EvalDll/EvalDll.vcxproj +++ b/Source/EvalDll/EvalDll.vcxproj @@ -74,7 +74,7 @@ /bigobj %(AdditionalOptions) - Windows + Console true ComputationNetworkLib.lib; Math.lib; kernel32.lib; user32.lib; shell32.lib; SequenceTrainingLib.lib; %(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\; "c:\Program Files\NVIDIA Corporation\GDK\gdk_win7_amd64_release\nvml\lib" @@ -102,7 +102,7 @@ Speed - Windows + Console true true true diff --git a/Source/Math/GPUMatrixCUDAKernels.cuh b/Source/Math/GPUMatrixCUDAKernels.cuh index 65decd1b4..153dff585 100644 --- a/Source/Math/GPUMatrixCUDAKernels.cuh +++ b/Source/Math/GPUMatrixCUDAKernels.cuh @@ -137,7 +137,7 @@ struct GridDim std::vector props(numDevices); for (int i = 0; i < numDevices; i++) CUDA_CALL(cudaGetDeviceProperties(&props[i], i)); -#if 1 // on Linux, maxGridSize[0] gets reported as 0 +#if 0 // on Linux, maxGridSize[0] gets reported as 0 for (int i = 0; i < numDevices; i++) fprintf(stderr, "%d procs %d warps %d %d %d max grid on %s\n", (int)props[i].multiProcessorCount, (int)props[i].warpSize, (int)props[i].maxGridSize[0], (int)props[i].maxGridSize[1], (int)props[i].maxGridSize[2], props[i].name); #endif diff --git a/Source/Math/GPUSparseMatrix.cu b/Source/Math/GPUSparseMatrix.cu index 7e4f7a1c6..3d4635020 100644 --- a/Source/Math/GPUSparseMatrix.cu +++ b/Source/Math/GPUSparseMatrix.cu @@ -2246,7 +2246,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { slice.m_computeDevice = m_computeDevice; slice.m_numRows = m_numRows; slice.m_numCols = numCols; - slice.m_nz = SecondaryIndexValueAt(startColumn + numCols) - SecondaryIndexValueAt(startColumn); + slice.m_nz = ( numCols == m_numCols ) ? m_nz : SecondaryIndexValueAt(startColumn + numCols) - SecondaryIndexValueAt(startColumn); slice.m_elemSizeAllocated = m_elemSizeAllocated; slice.m_totalBufferSizeAllocated = m_totalBufferSizeAllocated; slice.m_pArray = m_pArray; diff --git a/Source/Math/GPUSparseMatrix.h b/Source/Math/GPUSparseMatrix.h index 63234dabe..3f125330a 100644 --- a/Source/Math/GPUSparseMatrix.h +++ b/Source/Math/GPUSparseMatrix.h @@ -87,9 +87,10 @@ namespace Microsoft { namespace MSR { namespace CNTK { return (MajorIndexLocation() + (m_format == matrixFormatSparseCSC ? SecondaryIndexValueAt(0) : 0)); } + // TODO: Comment these methods more thoroughly, e.g., why it uses numNZ instead of m_elemSizeAllocated. 
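+ // (Presumed rationale, inferred from the fix below: for CSC/CSR the major index array holds one entry per stored nonzero, so the number of meaningful entries is m_nz; m_elemSizeAllocated is only the reserved capacity and may be larger.)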
size_t MajorIndexCount() const { - return MajorIndexCount(m_numRows, m_numCols, m_elemSizeAllocated, m_format); + return MajorIndexCount(m_numRows, m_numCols, m_nz, m_format); } size_t MajorIndexCount(const size_t numRows, const size_t numCols, const size_t numNZ, const MatrixFormat format) const { @@ -113,6 +114,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { return MajorIndexLocation() + m_numRows; else return MajorIndexLocation() + m_elemSizeAllocated + m_sliceViewOffset; + //return MajorIndexLocation() + m_elemSizeAllocated + m_sliceViewOffset; } size_t SecondaryIndexCount(const size_t numRows, const size_t numCols, const size_t numNZReserved, const MatrixFormat format) const { diff --git a/Source/Math/Math.vcxproj b/Source/Math/Math.vcxproj index f33a6328b..d79abfb28 100644 --- a/Source/Math/Math.vcxproj +++ b/Source/Math/Math.vcxproj @@ -79,7 +79,7 @@ true - Windows + Console true libacml_mp_dll.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\ @@ -127,7 +127,7 @@ MultiThreadedDLL - Windows + Console true true true diff --git a/Source/Math/MathCUDA.vcxproj b/Source/Math/MathCUDA.vcxproj index 7fcb5807a..ad29f39a4 100644 --- a/Source/Math/MathCUDA.vcxproj +++ b/Source/Math/MathCUDA.vcxproj @@ -91,7 +91,7 @@ true - Windows + Console true cudart.lib;cublas.lib;cusparse.lib;curand.lib;libacml_mp_dll.lib;%(AdditionalDependencies) true diff --git a/Source/Math/Matrix.cpp b/Source/Math/Matrix.cpp index 3650db859..52a937579 100644 --- a/Source/Math/Matrix.cpp +++ b/Source/Math/Matrix.cpp @@ -1383,17 +1383,62 @@ namespace Microsoft { namespace MSR { namespace CNTK { } template - void Matrix::NormalGrad(Matrix& gradients, Matrix& functionValues, const ElemType learnRatePerSample, const ElemType momentum) + void Matrix::NormalGrad(Matrix& gradients, + Matrix& functionValues, + const ElemType learnRatePerSample, + const ElemType momentum, + const bool useNesterovMomentum + ) { DecideAndMoveToRightDevice(*this, gradients, functionValues); - - DISPATCH_MATRIX_ON_FLAG(&gradients, + + if (!useNesterovMomentum) + { + DISPATCH_MATRIX_ON_FLAG(&gradients, nullptr, ScaleAndAdd((1-momentum) * learnRatePerSample, gradients, momentum, *this); functionValues -= *this, ScaleAndAdd((1-momentum) * learnRatePerSample, gradients, momentum, *this); functionValues -= *this, if (momentum != 0) gradients.m_CPUSparseMatrix->NormalGrad(*m_CPUMatrix, momentum); ScaleAndAdd(-learnRatePerSample, gradients, functionValues), if (momentum != 0) gradients.m_GPUSparseMatrix->NormalGrad(*m_GPUMatrix, momentum); ScaleAndAdd(-learnRatePerSample, gradients, functionValues) ); + } + else + { + DISPATCH_MATRIX_ON_FLAG(&gradients, + nullptr, + {/* CPU dense */ + ScaleAndAdd((1 - momentum) * learnRatePerSample, gradients, momentum, *this); + ScaleAndAdd(-momentum, *this, functionValues); + ScaleAndAdd(-(1 - momentum)*learnRatePerSample, gradients, functionValues); + // w_t = w_{t-1} - momentum * v_t - (1 - momentum) * learnRatePerSample * gradient, where v_t (= *this) is the smoothed gradient just updated above + }, + {/* GPU dense */ + ScaleAndAdd((1 - momentum) * learnRatePerSample, gradients, momentum, *this); + ScaleAndAdd(-momentum, *this, functionValues); + ScaleAndAdd(-(1 - momentum)*learnRatePerSample, gradients, functionValues); + }, + { /* CPU sparse */ + if (momentum != 0) + { + Matrix gradientCache(gradients.GetDeviceId()); + gradientCache.SetValue(gradients); + gradients.m_CPUSparseMatrix->NormalGrad(*m_CPUMatrix, momentum); + ScaleAndAdd(-momentum, *this, functionValues); + ScaleAndAdd(-(1 - momentum)*learnRatePerSample, gradientCache,
functionValues); + } + }, + { /* GPU sparse */ + if (momentum != 0) + { + Matrix gradientCache(gradients.GetDeviceId()); + gradientCache.SetValue(gradients); + gradients.m_GPUSparseMatrix->NormalGrad(*m_GPUMatrix, momentum); + ScaleAndAdd(-momentum, *this, functionValues); + ScaleAndAdd(-(1 - momentum)*learnRatePerSample, gradientCache, functionValues); + } + } + ); + } } //both this and gradients will be changed diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h index 379169529..94eb0dd53 100644 --- a/Source/Math/Matrix.h +++ b/Source/Math/Matrix.h @@ -164,7 +164,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { void ShiftBy(int numShift); // TODO: all these scalars should be passed as doubles and cast down inside - void NormalGrad(Matrix& gradients, Matrix& functionValues, const ElemType learnRatePerSample, const ElemType momentum); + void NormalGrad(Matrix& gradients, Matrix& functionValues, const ElemType learnRatePerSample, const ElemType momentum, const bool useNesterovMomentum); ElemType Adagrad(Matrix& gradients, const bool needAveMultiplier); void FSAdagrad(size_t mbSize, Matrix& gradients, Matrix& functionValues, const ElemType learnRatePerSample, const ElemType momentum); ElemType RmsProp(Matrix& gradients, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN, const bool needAveMultiplier); diff --git a/Source/Math/TensorView.cpp b/Source/Math/TensorView.cpp index e032f2299..9d343eeea 100644 --- a/Source/Math/TensorView.cpp +++ b/Source/Math/TensorView.cpp @@ -237,8 +237,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { template void TensorView::DoUnaryOpOf(ElemType beta, const TensorView & a, ElemType alpha, ElementWiseOperator op) { - static int cc = 0; if (cc++ == 0) - fprintf(stderr, "Tensor Op: Op %d: %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(GetShape()).c_str()); + //static int cc = 0; if (cc++ == 0) + // fprintf(stderr, "Tensor Op: Op %d: %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(GetShape()).c_str()); // prepare all tensor descriptor information as needed for execution array offsets; @@ -257,8 +257,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { template void TensorView::DoBinaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, ElemType alpha, ElementWiseOperator op) { - static int cc = 0; if (cc++ == 0) - fprintf(stderr, "Tensor Op: Op %d: %s op %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(b.GetShape()).c_str(), string(GetShape()).c_str()); + //static int cc = 0; if (cc++ == 0) + // fprintf(stderr, "Tensor Op: Op %d: %s op %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(b.GetShape()).c_str(), string(GetShape()).c_str()); array offsets; array, 3> regularStrides, reducingStrides; @@ -275,8 +275,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { template void TensorView::DoTernaryOpOf(ElemType beta, const TensorView & a, const TensorView & b, const TensorView & c, ElemType alpha, ElementWiseOperator op) { - static int cc = 0; if (cc++ == 0) - fprintf(stderr, "Tensor Op: Op %d: %s, %s, %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(b.GetShape()).c_str(), string(c.GetShape()).c_str(), string(GetShape()).c_str()); + //static int cc = 0; if (cc++ == 0) + // fprintf(stderr, "Tensor Op: Op %d: %s, %s, %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(b.GetShape()).c_str(), string(c.GetShape()).c_str(), string(GetShape()).c_str()); array offsets; array, 4> regularStrides, reducingStrides;
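To make the Nesterov branch above easier to audit, here is the same update written out in scalar form (a sketch, not part of the patch; v is the smoothed gradient carried in *this, w is functionValues, g the raw gradient, lr = learnRatePerSample, m = momentum):

// Scalar restatement of the three ScaleAndAdd calls in the dense Nesterov branch above.
static void NesterovStep(float& v, float& w, float g, float lr, float m)
{
    v = m * v + (1 - m) * lr * g;  // ScaleAndAdd((1 - m) * lr, g, m, v)
    w -= m * v;                    // ScaleAndAdd(-m, v, w)
    w -= (1 - m) * lr * g;         // ScaleAndAdd(-(1 - m) * lr, g, w)
    // net effect: w_t = w_{t-1} - m * v_t - (1 - m) * lr * g
}

diff --git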
a/Source/Math/latticefunctionskernels.h b/Source/Math/latticefunctionskernels.h index 876e3c6a8..b2b7d4b08 100644 --- a/Source/Math/latticefunctionskernels.h +++ b/Source/Math/latticefunctionskernels.h @@ -356,26 +356,39 @@ struct latticefunctionskernels const size_t te = ts + numframes; // end time of current unit size_t state1step0to1 = te; // inflection point from state 0 to 1, record in state 1 + //size_t state1stepm1to1 = te; size_t state2step0to1 = te; // inflection point from state 0 to 1, record in state 2 + //size_t state2stepm1to1 = te; // inflection point from state 0 to 1, record in state 2 size_t state2step1to2 = te; // inflection point from state 1 to 2, record in state 2 + size_t state2step0to2 = te; //now we only support transition from -1 to 0 or 2 for sil - float pathscore0 = fwscore ; // log pp in state 0 - float pathscore1 = LOGZERO; // log pp in state 1 - float pathscore2 = LOGZERO; // log pp in state 2 - if(isSil) - pathscore2 = fwscore; + float pathscore0 = fwscore; // log pp in state 0 + float pathscore1 = fwscore; // log pp in state 1 + float pathscore2 = fwscore; // log pp in state 2 + + // first frame if (ts != te) // for t = ts, initialization { - if (isSil) //for sil, -1 to 2 and -1 to 0 is permitted + /* if (isSil) //for sil, -1 to 2 and -1 to 0 is permitted { pathscore0 += getlogtransp(transP,-1,0) + logLLs(senoneid0,ts); pathscore2 += getlogtransp(transP,-1,2) + logLLs(senoneid2,ts); } - else //for others, only -1 to 0 is permitted - pathscore0 += logLLs(senoneid0,ts); // Note: no need to incorporate LLs for state [1] and [2] because the path log LLs are LOGZERO anyway + else //for others, only -1 to 0 is permitted + { + pathscore0 += getlogtransp(transP, -1, 0) + logLLs(senoneid0, ts); + pathscore1 += getlogtransp(transP, -1, 1) + logLLs(senoneid1, ts); + + }*/ + pathscore2 += getlogtransp(transP, -1, 2) + logLLs(senoneid2, ts); + pathscore1 += getlogtransp(transP, -1, 1) + logLLs(senoneid1, ts); + //state1stepm1to1 = ts; + pathscore0 += getlogtransp(transP, -1, 0) + logLLs(senoneid0, ts); + + } @@ -399,17 +412,22 @@ struct latticefunctionskernels { pathscore2 = pathscore12; state2step0to1 = state1step0to1; // record the inflection point + //state2stepm1to1 = state1stepm1to1; state2step1to2 = t; // record the inflection point + state2step0to2 = te; if (isSil) backptrmatrix (2, t-ts-1) = 1; } - if (isSil) // only silence have path from 0 to 2 + //if (isSil) // only silence have path from 0 to 2 { const float pathscore02 = pathscore0 + getlogtransp(transP,0,2); // log pp from state 0 to 2 if (pathscore02 >= pathscore2) // if state 0->2 { pathscore2 = pathscore02; - backptrmatrix (2, t-ts-1) = 0; + if (isSil) + backptrmatrix (2, t-ts-1) = 0; + state2step0to2 = t; + state2step1to2 = te; } } @@ -422,9 +440,11 @@ struct latticefunctionskernels { pathscore1 = pathscore01; state1step0to1 = t; // record the inflection point + //state1stepm1to1 = te; if (isSil) backptrmatrix (1, t-ts-1) = 0; } + if (isSil) // only silence have path from 2 to 1 { const float pathscore21 = pathscore2last + getlogtransp(transP,2,1); @@ -495,19 +515,35 @@ struct latticefunctionskernels if (!isSil) { - state2step0to1 += alignindex - ts; // convert to align measure - state2step1to2 += alignindex - ts; - for (size_t t = alignindex; t < alignindex + numframes; t++) // set the final alignment - { - size_t senoneid; - if (t < state2step0to1) // in state 0 - senoneid = senoneid0; - else if(t < state2step1to2) // in state 1 - senoneid = senoneid1; - else // in state 2 - senoneid = senoneid2; - 
alignresult[t] = (unsigned short) senoneid; - } + if (state2step0to2 < te) //from 0 to 2 + { + state2step0to2 += alignindex - ts; + for (size_t t = alignindex; t < alignindex + numframes; t++) // set the final alignment + { + size_t senoneid; + if (t < state2step0to2) // in state 0 + senoneid = senoneid0; + else // in state 2 + senoneid = senoneid2; + alignresult[t] = (unsigned short)senoneid; + } + } + else //from 1 to 2 + { + state2step0to1 += alignindex - ts; // convert to align measure + state2step1to2 += alignindex - ts; + for (size_t t = alignindex; t < alignindex + numframes; t++) // set the final alignment + { + size_t senoneid; + if (state2step0to1 true - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -91,7 +91,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/DSSMReader/DSSMReader.vcxproj b/Source/Readers/DSSMReader/DSSMReader.vcxproj index 1412fac38..d607a7c9f 100644 --- a/Source/Readers/DSSMReader/DSSMReader.vcxproj +++ b/Source/Readers/DSSMReader/DSSMReader.vcxproj @@ -72,7 +72,7 @@ /bigobj %(AdditionalOptions) - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -93,7 +93,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/DataReaderTest/DataReaderTest.vcxproj b/Source/Readers/DataReaderTest/DataReaderTest.vcxproj index 8a422f187..438c7daed 100644 --- a/Source/Readers/DataReaderTest/DataReaderTest.vcxproj +++ b/Source/Readers/DataReaderTest/DataReaderTest.vcxproj @@ -100,7 +100,7 @@ true - Windows + Console true $(VCInstallDir)UnitTest\lib;%(AdditionalLibraryDirectories) @@ -115,7 +115,7 @@ true - Windows + Console true $(VCInstallDir)UnitTest\lib;%(AdditionalLibraryDirectories) ucireader.lib;Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) @@ -133,7 +133,7 @@ true - Windows + Console true true true @@ -152,7 +152,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/HTKMLFReader/HTKMLFReader.cpp b/Source/Readers/HTKMLFReader/HTKMLFReader.cpp index 0db717a99..ecc6283f6 100644 --- a/Source/Readers/HTKMLFReader/HTKMLFReader.cpp +++ b/Source/Readers/HTKMLFReader/HTKMLFReader.cpp @@ -100,6 +100,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { vector scriptpaths; vector RootPathInScripts; + wstring RootPathInLatticeTocs; vector mlfpaths; vector>mlfpathsmulti; size_t firstfilesonly = SIZE_MAX; // set to a lower value for testing @@ -263,7 +264,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { expand_wildcards(thisLattice(L"numLatTocFile"), paths); latticetocs.first.insert(latticetocs.first.end(), paths.begin(), paths.end()); } - + RootPathInLatticeTocs =(wstring) thisLattice(L"prefixPathInToc",L""); } //get HMM related file names @@ -448,7 +449,9 @@ namespace Microsoft { namespace MSR { namespace CNTK { if (!_wcsicmp(readMethod.c_str(), L"blockRandomize")) { // construct all the parameters we don't need, but need to be passed to the 
constructor... - m_lattices.reset(new msra::dbn::latticesource(latticetocs, m_hset.getsymmap())); + + m_lattices.reset(new msra::dbn::latticesource(latticetocs, m_hset.getsymmap(), RootPathInLatticeTocs)); + m_lattices->setverbosity(m_verbosity); // now get the frame source. This has better randomization and doesn't create temp files m_frameSource.reset(new msra::dbn::minibatchutterancesourcemulti(infilesmulti, labelsmulti, m_featDims, m_labelDims, numContextLeft, numContextRight, randomize, *m_lattices, m_latticeMap, m_frameMode)); @@ -941,6 +944,23 @@ namespace Microsoft { namespace MSR { namespace CNTK { { if (!skip) { + // a stopgap + if (m_numFramesToProcess[i] > 0 && m_latticeBufferMultiUtt[i] && m_latticeBufferMultiUtt[i]->getnumframes() != m_numFramesToProcess[i]) + { + // BUGBUG: we just found that (due to some bugs yet to be tracked down), + // the filled number of frames is inconsistent with the number of frames in the lattices (though it rarely occurs) + // This is just a stopgap, to be removed after the bugs are found and fixed + bool needRenew = true; + while (needRenew) + { + size_t framenum = m_numFramesToProcess[i]; + fprintf(stderr, "WARNING: mismatched number of frames filled in the reader: %d in data vs %d in lattices. Ignoring this utterance %ls\n", + (int)framenum, (int)m_latticeBufferMultiUtt[i]->getnumframes(), m_latticeBufferMultiUtt[i]->getkey().c_str()); + ReNewBufferForMultiIO(i); + needRenew = m_numFramesToProcess[i] > 0 && m_latticeBufferMultiUtt[i] && m_latticeBufferMultiUtt[i]->getnumframes() != m_numFramesToProcess[i]; + } + + } m_numValidFrames[i] = m_numFramesToProcess[i]; if (m_numValidFrames[i] > 0) { @@ -972,49 +992,50 @@ namespace Microsoft { namespace MSR { namespace CNTK { m_extraNumSeqs = 0; if (!m_frameMode) { - // insert extra utterances to parallel sequences that have enough space left - // As long as there is a gap at the end of any parallel sequence that is large enough for another utterance, fill it in. - size_t nextMinibatchUttnum = 0; - bool inserted; - // The next utterances have already been prepared under parallel-sequence indices [i], in prep for the next MB. - // For each, we will go through all parallel sequences [j] to see whether the entry currently held for the next [i] fits into [j].
- for (size_t i = 0; i < m_numSeqsPerMB; i++) + for (size_t src = 0; src < m_numSeqsPerMB; ) { - while (nextMinibatchUttnum <= i) + size_t framenum = m_numFramesToProcess[src]; + if (framenum == 0) { - size_t framenum = m_numFramesToProcess[i]; - inserted = false; - if (framenum > 0) // non-empty entry: see were it fits - { - // greedily search for a parallel sequence with enough space at the end to insert this utterance - for (size_t j = 0; j < m_numSeqsPerMB; j++) - { - if (framenum + m_numValidFrames[j] < m_mbNumTimeSteps) - { - // enough space: insert it as parallel sequence [j] (instead of [i] in the next MB) - m_extraSeqsPerMB.push_back(j); - if (m_latticeBufferMultiUtt[i] != nullptr) - { - m_extraLatticeBufferMultiUtt.push_back(m_latticeBufferMultiUtt[i]); - m_extraLabelsIDBufferMultiUtt.push_back(m_labelsIDBufferMultiUtt[i]); - m_extraPhoneboundaryIDBufferMultiUtt.push_back(m_phoneboundaryIDBufferMultiUtt[i]); - } - fillOneUttDataforParallelmode(matrices, m_numValidFrames[j], framenum, j, i); - m_pMBLayout->AddSequence(NEW_SEQUENCE_ID, j, m_numValidFrames[j], m_numValidFrames[j] + framenum); + src++; + continue; + } + if (m_latticeBufferMultiUtt[src]!=nullptr && m_latticeBufferMultiUtt[src]->getnumframes()!=framenum) + { + // BUGBUG: we just found that (due to some bugs yet to be tracked down), + // the filled number of frames is inconsistent with the number of frames in the lattices (though it rarely occurs) + // This is just a stopgap, to be removed after the bugs are found and fixed + fprintf(stderr, "WARNING: mismatched number of frames filled in the reader: %d in data vs %d in lattices. Ignoring this utterance %ls\n", + (int)framenum, (int)m_latticeBufferMultiUtt[src]->getnumframes(), m_latticeBufferMultiUtt[src]->getkey().c_str()); + src++; + continue; + } - // consume it - ReNewBufferForMultiIO(i); // replace current [i] with a new one; then try again with this new one at [i] - m_numValidFrames[j] += framenum; - m_extraNumSeqs++; - inserted = true; - break; - } + bool slotFound = false; + for (size_t des = 0; des < m_numSeqsPerMB; des++) // try to find a slot + { + if (framenum + m_numValidFrames[des] < m_mbNumTimeSteps) + { // found!
+ m_extraSeqsPerMB.push_back(des); + if (m_latticeBufferMultiUtt[src] != nullptr) + { + m_extraLatticeBufferMultiUtt.push_back(m_latticeBufferMultiUtt[src]); + m_extraLabelsIDBufferMultiUtt.push_back(m_labelsIDBufferMultiUtt[src]); + m_extraPhoneboundaryIDBufferMultiUtt.push_back(m_phoneboundaryIDBufferMultiUtt[src]); } + fillOneUttDataforParallelmode(matrices, m_numValidFrames[des], framenum, des, src); + m_pMBLayout->AddSequence(NEW_SEQUENCE_ID, des, m_numValidFrames[des], m_numValidFrames[des] + framenum); + + ReNewBufferForMultiIO(src); + m_numValidFrames[des] += framenum; + m_extraNumSeqs++; + slotFound = true; + break; } - if (!inserted) - { - nextMinibatchUttnum++; // didn't fit anywhere: done with entry [i] - } + } + if (!slotFound) + { + src++; // done with this source; try the next one } } diff --git a/Source/Readers/HTKMLFReader/HTKMLFReader.h b/Source/Readers/HTKMLFReader/HTKMLFReader.h index fd6015c28..7e64ee3e8 100644 --- a/Source/Readers/HTKMLFReader/HTKMLFReader.h +++ b/Source/Readers/HTKMLFReader/HTKMLFReader.h @@ -32,6 +32,9 @@ private: intargvector m_numSeqsPerMBForAllEpochs; size_t m_numSeqsPerMB; // requested number of parallel sequences size_t m_mbNumTimeSteps; // number of time steps to fill/filled (note: for frame randomization, this is the #frames, and not 1 as later reported) + size_t m_mbMaxNumTimeSteps; // max time steps we take in a MB layout; any sentence longer than this max will be discarded (and a warning will be issued) + // this is used to prevent CUDA out-of-memory errors + vector m_numFramesToProcess; // [seq index] number of frames available (left to return) in each parallel sequence vector m_switchFrame; /// TODO: something like the position where a new sequence starts; still supported? vector m_numValidFrames; // [seq index] valid #frames in each parallel sequence. Frames (s, t) with t >= m_numValidFrames[s] are NoInput.
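The rewritten src/des loop above is a greedy first-fit packer: each pending utterance goes into the first parallel sequence that still has room, and a source that fits nowhere waits for the next minibatch. A condensed sketch of just that policy (illustrative; the names are made up, and buffer renewal, lattice checks, and MBLayout bookkeeping are omitted):

#include <cstddef>
#include <vector>

// Greedy first-fit packing of pending utterance lengths into parallel sequences.
// Mirrors the strict '<' capacity test used in the reader loop above.
static void PackUtterances(const std::vector<size_t>& pendingLengths, // frames per pending utterance
                           std::vector<size_t>& usedSteps,            // frames already used per parallel sequence
                           size_t mbNumTimeSteps)                     // time-step capacity of each parallel sequence
{
    for (size_t src = 0; src < pendingLengths.size(); src++)
    {
        const size_t framenum = pendingLengths[src];
        if (framenum == 0)
            continue;                       // nothing pending in this slot
        for (size_t des = 0; des < usedSteps.size(); des++)
        {
            if (framenum + usedSteps[des] < mbNumTimeSteps) // found a slot
            {
                usedSteps[des] += framenum;
                break;                      // placed; move on to the next source
            }
        }
    }
}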
diff --git a/Source/Readers/HTKMLFReader/HTKMLFReader.vcxproj b/Source/Readers/HTKMLFReader/HTKMLFReader.vcxproj index de7772889..fd8f9c343 100644 --- a/Source/Readers/HTKMLFReader/HTKMLFReader.vcxproj +++ b/Source/Readers/HTKMLFReader/HTKMLFReader.vcxproj @@ -69,7 +69,7 @@ true - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) @@ -87,7 +87,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/ImageReader/ImageReader.vcxproj b/Source/Readers/ImageReader/ImageReader.vcxproj index b5061adaf..7d3a3b01c 100644 --- a/Source/Readers/ImageReader/ImageReader.vcxproj +++ b/Source/Readers/ImageReader/ImageReader.vcxproj @@ -75,7 +75,7 @@ true - Windows + Console true Math.lib;$(OpenCVLib);%(AdditionalDependencies) diff --git a/Source/Readers/LMSequenceReader/LMSequenceReader.vcxproj b/Source/Readers/LMSequenceReader/LMSequenceReader.vcxproj index 24a8a1112..93b527173 100644 --- a/Source/Readers/LMSequenceReader/LMSequenceReader.vcxproj +++ b/Source/Readers/LMSequenceReader/LMSequenceReader.vcxproj @@ -71,7 +71,7 @@ true - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -92,7 +92,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/LUSequenceReader/LUSequenceReader.vcxproj b/Source/Readers/LUSequenceReader/LUSequenceReader.vcxproj index bb68dd89d..a73d0af74 100644 --- a/Source/Readers/LUSequenceReader/LUSequenceReader.vcxproj +++ b/Source/Readers/LUSequenceReader/LUSequenceReader.vcxproj @@ -71,7 +71,7 @@ true - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -92,7 +92,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/LibSVMBinaryReader/LibSVMBinaryReader.vcxproj b/Source/Readers/LibSVMBinaryReader/LibSVMBinaryReader.vcxproj index e3a10c534..e5d8ac1fb 100644 --- a/Source/Readers/LibSVMBinaryReader/LibSVMBinaryReader.vcxproj +++ b/Source/Readers/LibSVMBinaryReader/LibSVMBinaryReader.vcxproj @@ -72,7 +72,7 @@ /bigobj %(AdditionalOptions) - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -93,7 +93,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/SparsePCReader/SparsePCReader.vcxproj b/Source/Readers/SparsePCReader/SparsePCReader.vcxproj index 72d18defe..db66c6d31 100644 --- a/Source/Readers/SparsePCReader/SparsePCReader.vcxproj +++ b/Source/Readers/SparsePCReader/SparsePCReader.vcxproj @@ -72,7 +72,7 @@ /bigobj %(AdditionalOptions) - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) 
$(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -93,7 +93,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/UCIFastReader/UCIFastReader.vcxproj b/Source/Readers/UCIFastReader/UCIFastReader.vcxproj index fc0e03ffa..e30dc6b90 100644 --- a/Source/Readers/UCIFastReader/UCIFastReader.vcxproj +++ b/Source/Readers/UCIFastReader/UCIFastReader.vcxproj @@ -70,7 +70,7 @@ true - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\;..\..\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -91,7 +91,7 @@ true - Windows + Console true true true diff --git a/Source/Readers/UCIReader/UCIReader.vcxproj b/Source/Readers/UCIReader/UCIReader.vcxproj index 2e25c2b57..08cce8205 100644 --- a/Source/Readers/UCIReader/UCIReader.vcxproj +++ b/Source/Readers/UCIReader/UCIReader.vcxproj @@ -91,7 +91,7 @@ true - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) ..\..\Source\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -107,7 +107,7 @@ ..\..\common\include;..\..\Source\Math - Windows + Console true Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies) $(SolutionDir)$(Platform)\$(Configuration)\;..\..\Source\Math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration) @@ -124,7 +124,7 @@ true - Windows + Console true true true @@ -144,7 +144,7 @@ ..\..\common\include;..\..\Source\Math - Windows + Console true true true diff --git a/Source/SGDLib/MultiNetworksSGD.h b/Source/SGDLib/MultiNetworksSGD.h index 19f3f2025..a4851fdf8 100644 --- a/Source/SGDLib/MultiNetworksSGD.h +++ b/Source/SGDLib/MultiNetworksSGD.h @@ -63,6 +63,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { using SGDBase::m_L2RegWeight; using SGDBase::m_L1RegWeight; using SGDBase::m_needAveMultiplier; + using SGDBase::m_useNesterovMomentum; using SGDBase::m_traceLevel; using SGDBase::m_numMBsToShowResult; using SGDBase::m_gradientCheckSigDigit; @@ -392,8 +393,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { { if (m_loadBestModel) { - encoderNet->ReloadPersistableParameters(GetEncoderModelNameForEpoch(i - 1)); - decoderNet->ReloadPersistableParameters(GetDecoderModelNameForEpoch(i - 1)); + encoderNet->RereadPersistableParameters(GetEncoderModelNameForEpoch(i - 1)); + decoderNet->RereadPersistableParameters(GetDecoderModelNameForEpoch(i - 1)); size_t dummyMinibatchSize = 0; this->LoadCheckPointInfo(i - 1, @@ -721,7 +722,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { //persist model and check-point info for (size_t k = 0; k < iNumNetworks; k++) { - nets[k]->ReloadPersistableParameters(GetModelNameForEpoch(i, false, msra::strfun::wstrprintf(L".%d", k))); + nets[k]->RereadPersistableParameters(GetModelNameForEpoch(i, false, msra::strfun::wstrprintf(L".%d", k))); nets[k]->ResetEvalTimeStamps(); } @@ -930,7 +931,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { Matrix& smoothedGradient = (*smoothedGradientIter); - UpdateWeights(node, smoothedGradient, learnRatePerSample, GetMomentumPerSample(epochNumber/*BUGBUG workaround:*/, 
dataReader[0]->GetNumParallelSequences()), actualMBSize, m_L2RegWeight, m_L1RegWeight, m_needAveMultiplier); + UpdateWeights(node, smoothedGradient, learnRatePerSample, GetMomentumPerSample(epochNumber/*BUGBUG workaround:*/, dataReader[0]->GetNumParallelSequences()), actualMBSize, m_L2RegWeight, m_L1RegWeight, m_needAveMultiplier, m_useNesterovMomentum); } } } diff --git a/Source/SGDLib/SGD.cpp b/Source/SGDLib/SGD.cpp index 8fe60474f..6665815a7 100644 --- a/Source/SGDLib/SGD.cpp +++ b/Source/SGDLib/SGD.cpp @@ -310,7 +310,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { // likewise for sequence training parameters if (isSequenceTrainingCriterion) { - ComputationNetwork::SetSeqParam(net, criterionNodes[0], m_hSmoothingWeight, m_frameDropThresh, m_doReferenceAlign); + ComputationNetwork::SetSeqParam(net, criterionNodes[0], m_hSmoothingWeight, m_frameDropThresh, m_doReferenceAlign, + m_seqGammarCalcAMF, m_seqGammarCalcLMF, m_seqGammarCalcWP, m_seqGammarCalcbMMIFactor, m_seqGammarCalcUsesMBR ); } // --- MAIN EPOCH LOOP @@ -519,6 +520,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { if ((m_parallelizationMethod == ParallelizationMethod::ModelAveragingSGD) && (g_mpi->NumNodesInUse() > 1)) { g_mpi->Bcast(&epochCriterion, 1, g_mpi->MainNodeRank()); + g_mpi->Bcast(&lrControlCriterion, 1, g_mpi->MainNodeRank()); } bool loadedPrevModel = false; @@ -543,7 +545,7 @@ namespace Microsoft { namespace MSR { namespace CNTK { { auto bestModelPath = GetModelNameForEpoch(i - m_learnRateAdjustInterval); fprintf(stderr, "Loading previous model with best training-criterion value: %ls.\n", bestModelPath.c_str()); - net->ReloadPersistableParameters(bestModelPath); + net->RereadPersistableParameters(bestModelPath); LoadCheckPointInfo(i - m_learnRateAdjustInterval, /*out*/ totalSamplesSeen, /*out*/ learnRatePerSample, @@ -771,13 +773,20 @@ namespace Microsoft { namespace MSR { namespace CNTK { // Sub-minibatching is used if a single minibatch is too large to fit into GPU RAM. DataReaderHelpers::SubminibatchDispatcher smbDispatcher; size_t numSubminibatchesNeeded = 0; - if (m_maxSamplesInRAM < SIZE_MAX) // user-specified maximum number of samples that fit into GPU RAM; or 0 if not enabled + if (m_maxSamplesInRAM < SIZE_MAX || m_numSubminiBatches > 1) // user-specified maximum number of samples that fit into GPU RAM; or 0 if not enabled { - // into how many pieces would we need to break the minibatch? - // TODO: The following calculation relies on the ill-devised definition of "minibatch" of the current truncated BPTT implementation. Adapt this once fixed. - size_t numParallelSequences = trainSetDataReader->GetNumParallelSequences(); - size_t estimatedMBSize = tunedMBSize * numParallelSequences; - numSubminibatchesNeeded = (size_t)std::ceil((float)estimatedMBSize / m_maxSamplesInRAM); + if (m_maxSamplesInRAM < SIZE_MAX) + { + // into how many pieces would we need to break the minibatch? + // TODO: The following calculation relies on the ill-devised definition of "minibatch" of the current truncated BPTT implementation. Adapt this once fixed. 
+ size_t numParallelSequences = trainSetDataReader->GetNumParallelSequences(); + size_t estimatedMBSize = tunedMBSize * numParallelSequences; + numSubminibatchesNeeded = (size_t)std::ceil((float)estimatedMBSize / m_maxSamplesInRAM); + } + if (m_numSubminiBatches > 1) + { + numSubminibatchesNeeded = m_numSubminiBatches; + } } // this is non-trivial, we need a manager object to handle this if (numSubminibatchesNeeded > 1) @@ -807,7 +816,10 @@ } if (numSubminibatchesNeeded > 1) { - fprintf(stderr, ", with maximum %d samples in RAM", (int)m_maxSamplesInRAM); + if (m_maxSamplesInRAM < SIZE_MAX) + fprintf(stderr, ", with maximum %d samples in RAM", (int)m_maxSamplesInRAM); + else + fprintf(stderr, ", with %d subminibatches", (int)numSubminibatchesNeeded); } fprintf(stderr, ".\n"); @@ -998,7 +1010,7 @@ UpdateWeights(node, smoothedGradient, learnRatePerSample, GetMomentumPerSample(epochNumber/*BUGBUG workaround:*/, net->GetMBLayoutPtr()->GetNumParallelSequences()), aggregateNumSamples, m_L2RegWeight, m_L1RegWeight, - m_needAveMultiplier); + m_needAveMultiplier, m_useNesterovMomentum); #ifdef _DEBUG if (dynamic_pointer_cast>(node)->Value().HasNan("TrainOneEpoch/UpdateWeights(): ")) LogicError("%ls %ls operation has NaNs in functionValues after parameter update.", node->NodeName().c_str(), node->OperationName().c_str()); @@ -1438,7 +1450,7 @@ } int baseModelEpoch = epochNumber - 1; - net->ReloadPersistableParameters(GetModelNameForEpoch(baseModelEpoch)); + net->RereadPersistableParameters(GetModelNameForEpoch(baseModelEpoch)); double learnRate = learnRatePerSample; size_t dummyMinibatchSize = 0; @@ -1598,7 +1610,7 @@ } int baseModelEpoch = epochNumber - 1; - net->ReloadPersistableParameters(GetModelNameForEpoch(baseModelEpoch)); + net->RereadPersistableParameters(GetModelNameForEpoch(baseModelEpoch)); double dummyLearnRate; double dummtPrevCriterion; @@ -2029,7 +2041,9 @@ size_t actualMBSize, const double L2RegWeight, const double L1RegWeight, - const bool needAveMultiplier) + const bool needAveMultiplier, + const bool useNesterovMomentum + ) { // we use simple linear (instead of log linear) scaling here const double momentum = MomentumPerMB(momentumPerSample, actualMBSize); @@ -2070,7 +2084,7 @@ if (adpType == GradientsUpdateType::None) { smoothedGradient.NormalGrad(gradientValues, functionValues, - (ElemType)learnRatePerSample, (ElemType)momentum); + (ElemType)learnRatePerSample, (ElemType)momentum, useNesterovMomentum); } else if (adpType == GradientsUpdateType::AdaGrad || (adpType == GradientsUpdateType::RmsProp && gradientValues.GetMatrixType() == MatrixType::SPARSE) || @@ -2120,7 +2134,9 @@ const double momentumPerSample, const size_t actualMBSize, const double L2RegWeight, const double L1RegWeight, - const bool needAveMultiplier) const + const bool needAveMultiplier, + const bool useNesterovMomentum + ) const { #if DUMPOUTPUT fprintf(stderr, "Update_%ls\n", node->NodeName().c_str()); @@ -2131,7 +2147,7 @@ UpdateWeightsS(this, dynamic_pointer_cast>(node)->Value(), dynamic_pointer_cast>(node)->Gradient(), smoothedGradient, learnRatePerSample, momentumPerSample,
actualMBSize, L2RegWeight, L1RegWeight, - needAveMultiplier); + needAveMultiplier, m_useNesterovMomentum); node->BumpEvalTimeStamp(); } @@ -2501,6 +2517,7 @@ m_mbSize = configSGD(L"minibatchSize", ConfigRecordType::Array(intargvector(vector{ 256 }))); m_truncated = configSGD(L"truncated", false); m_maxSamplesInRAM = configSGD(L"maxSamplesInRAM", (size_t)SIZE_MAX); + m_numSubminiBatches = configSGD(L"numSubminibatches", (size_t)1); // the number of samples in each epoch (0 means, use all the samples in each epoch). m_epochSize = configSGD(L"epochSize", (size_t)0); @@ -2520,6 +2537,8 @@ floatargvector momentumPerMB = configSGD(L"momentumPerMB", ConfigRecordType::Array(floatargvector())); floatargvector momentumPerSample = configSGD(L"momentumPerSample", ConfigRecordType::Array(floatargvector())); floatargvector momentumAsTimeConstant = configSGD(L"momentumAsTimeConstant", ConfigRecordType::Array(floatargvector())); + bool useNesterovMomentum = configSGD(L"useNAG", false); + m_maxTempMemSizeInSamplesForCNN = configSGD(L"maxTempMemSizeInSamplesForCNN", (size_t)0); @@ -2534,6 +2553,11 @@ m_hSmoothingWeight = configSGD(L"hSmoothingWeight", 0.95); m_frameDropThresh = configSGD(L"frameDropThresh", 1e-10); m_doReferenceAlign = configSGD(L"doReferenceAlign", false); + m_seqGammarCalcUsesMBR = configSGD(L"seqGammarUsesMBR", false); + m_seqGammarCalcAMF = configSGD(L"seqGammarAMF", 14.0); + m_seqGammarCalcLMF = configSGD(L"seqGammarLMF", 14.0); + m_seqGammarCalcbMMIFactor = configSGD(L"seqGammarBMMIFactor", 0.0); + m_seqGammarCalcWP = configSGD(L"seqGammarWordPen", 0.0); m_dropoutRates = configSGD(L"dropoutRate", ConfigRecordType::Array(floatargvector(vector{ 0.0f }))); @@ -2639,6 +2663,8 @@ m_momentumParam = floatargvector(L"0.9"); m_momentumSpecifiedForMBSize = m_mbSize; } + m_useNesterovMomentum = useNesterovMomentum; + for (int i = 0; i < m_momentumParam.size(); i++) { if (m_momentumParam[i] >= 1.0 || m_momentumParam[i] < 0.0) diff --git a/Source/SGDLib/SGD.h b/Source/SGDLib/SGD.h index 15143dfa0..a014ec1d2 100644 --- a/Source/SGDLib/SGD.h +++ b/Source/SGDLib/SGD.h @@ -111,6 +111,7 @@ protected: intargvector m_learningRatesSpecifiedForMBSize; // 1 for per sample, m_mbSize[] for per MB floatargvector m_momentumParam; intargvector m_momentumSpecifiedForMBSize; + bool m_useNesterovMomentum; // Determine the MB size used for mapping a given learning-rate or momentum parameter to a per-sample value. // MB size is the number of samples across all time steps and parallel sequences. @@ -157,7 +158,11 @@ protected: // To mitigate this issue, we adopt the sub-minibatch implementation, where // each m_mbSize[epoch] is divided by a few sub-minibatch of which size will be no more than m_maxSamplesInRAM // a forward-backward is performed for each sub-minibatch; a model update is performed after each minibatch - + size_t m_numSubminiBatches; + // alternative method to specify how to split minibatches into subminibatches + // default is 1, which means no subminibatches are used + // if m_maxSamplesInRAM == SIZE_MAX (i.e., the user did not specify that option) and m_numSubminiBatches > 1, + // we divide one minibatch into m_numSubminiBatches subminibatches // the number of samples in each epoch (0 means, use all the samples in each epoch).
size_t m_epochSize; @@ -245,6 +250,11 @@ protected: double m_hSmoothingWeight; double m_frameDropThresh; bool m_doReferenceAlign; + double m_seqGammarCalcAMF; + double m_seqGammarCalcLMF; + double m_seqGammarCalcWP; + double m_seqGammarCalcbMMIFactor; + bool m_seqGammarCalcUsesMBR; }; template class IDistGradAggregator; @@ -436,7 +446,9 @@ public: size_t actualMBSize, const double L2RegWeight, const double L1RegWeight, - const bool needAveMultiplier); + const bool needAveMultiplier, + const bool useNesterovMomentum + ); protected: // UpdateWeights - update the weights in @@ -446,7 +458,8 @@ protected: const double momentumPerSample, const size_t actualMBSize, const double L2RegWeight, const double L1RegWeight, - const bool needAveMultiplier) const; + const bool needAveMultiplier, + const bool useNesterovMomentum) const; void ClipGradient(Matrix& gradient, const size_t actualMBSize) const; diff --git a/Source/SequenceTrainingLib/SequenceTrainingLib.vcxproj b/Source/SequenceTrainingLib/SequenceTrainingLib.vcxproj index c7c9d4073..b37973541 100644 --- a/Source/SequenceTrainingLib/SequenceTrainingLib.vcxproj +++ b/Source/SequenceTrainingLib/SequenceTrainingLib.vcxproj @@ -44,7 +44,7 @@ true - Windows + Console true diff --git a/Source/SequenceTrainingLib/gammacalculation.h b/Source/SequenceTrainingLib/gammacalculation.h index 4ad7d8f46..f63c50487 100644 --- a/Source/SequenceTrainingLib/gammacalculation.h +++ b/Source/SequenceTrainingLib/gammacalculation.h @@ -11,6 +11,23 @@ #pragma warning (disable: 4127) // conditional expression is constant namespace msra { namespace lattices { + + struct SeqGammarCalParam{ + double amf; + double lmf; + double wp; + double bMMIfactor; + bool sMBRmode; + SeqGammarCalParam() + { + amf = 14.0; + lmf = 14.0; + wp = 0.0; + bMMIfactor = 0.0; + sMBRmode = false; + } + }; + template class GammaCalculation { @@ -19,9 +36,9 @@ namespace msra { namespace lattices { GammaCalculation() : cpumode(false) { initialmark = false; - lmf = 14.0f; // Note that 9 was best for Fisher --these should best be configurable + lmf = 7.0f; // Note that 9 was best for Fisher --these should best be configurable wp = 0.0f; - amf = 14.0f; + amf = 7.0f; boostmmifactor = 0.0f; seqsMBRmode = false; } @@ -30,6 +47,9 @@ namespace msra { namespace lattices { } + //======================================== + // Sec. 1 init functions + //======================================== void init(msra::asr::simplesenonehmm hset, int DeviceId) { m_deviceid = DeviceId; @@ -47,7 +67,21 @@ namespace msra { namespace lattices { } } - + //======================================== + // Sec. 2 set functions + //======================================== + void SetGammarCalculationParams(const SeqGammarCalParam& gammarParam) + { + lmf = (float)gammarParam.lmf; + amf = (float)gammarParam.amf; + wp = (float)gammarParam.wp; + seqsMBRmode = gammarParam.sMBRmode; + boostmmifactor = (float)gammarParam.bMMIfactor; + } + + //======================================== + // Sec. 
3 calculation functions + //======================================== void calgammaformb( Microsoft::MSR::CNTK::Matrix& functionValues, std::vector> &lattices, const Microsoft::MSR::CNTK::Matrix& loglikelihood, diff --git a/Source/SequenceTrainingLib/latticeforwardbackward.cpp b/Source/SequenceTrainingLib/latticeforwardbackward.cpp index 4f43bc718..4abb50d3c 100644 --- a/Source/SequenceTrainingLib/latticeforwardbackward.cpp +++ b/Source/SequenceTrainingLib/latticeforwardbackward.cpp @@ -442,6 +442,7 @@ template static bool islogzero (FLOAT v) { return v < LOGZERO/2; LogicError("invalid backpointer resulting in state index out of range"); int bp = (int) backpointers(j,t); // save the backpointer before overwriting it (gammas and backpointers are aliases of each other) + //thisedgealignmentsj[t] = (unsigned short)hmm.getsenoneid(j - js); if (!returnsenoneids) // return binary gammas (for MMI; this mode is compatible with softalignmode) for (size_t i = js; i < je; i++) loggammas(i,t) = ((int) i == j) ? 0.0f : LOGZERO; diff --git a/Source/SequenceTrainingLib/parallelforwardbackward.cpp b/Source/SequenceTrainingLib/parallelforwardbackward.cpp index 3fb27b59f..bc4baaad9 100644 --- a/Source/SequenceTrainingLib/parallelforwardbackward.cpp +++ b/Source/SequenceTrainingLib/parallelforwardbackward.cpp @@ -743,8 +743,8 @@ namespace msra { namespace lattices { double totalfwscore = 0.0f; if (!parallelstate->emulation) { - - fprintf(stderr, "parallelforwardbackwardlattice: %d launches for forward, %d launches for backward\n", (int)batchsizeforward.size(), (int)batchsizebackward.size()); + if (verbosity>=2) + fprintf(stderr, "parallelforwardbackwardlattice: %d launches for forward, %d launches for backward\n", (int)batchsizeforward.size(), (int)batchsizebackward.size()); const bool allocateframescorrect = (returnEframescorrect || boostingfactor != 0.0f); const bool copyuids = (returnEframescorrect || boostingfactor != 0.0f); diff --git a/Tests/EndToEndTests/Speech/LSTM/cntk.config b/Tests/EndToEndTests/Speech/LSTM/cntk.config index 292bbe8c1..de01d3d79 100644 --- a/Tests/EndToEndTests/Speech/LSTM/cntk.config +++ b/Tests/EndToEndTests/Speech/LSTM/cntk.config @@ -67,7 +67,7 @@ speechTrain = [ // LSTM cell # TODO: This is temporary test code for the new ShiftNode (until we switch PastValue() itself over) - PastValueShift(dimDummy, input) = Shift(input, /*fromOffsets=*/-1, /*boundaryValue=*/Constant(0.1), dim=-1, numSteps=1, insertedDim=2) + PastValueShift(dimDummy, input) = Shift(input, /*fromOffsets=*/-1, /*boundaryValue=*/Constant(0.1), dim=-1) PastValue1 = PastValue #PastValue1 = PastValueShift dh = PastValue1(outputDim, output); // hidden state(t-1) diff --git a/Tools/generate_build_info b/Tools/generate_build_info index a155fc84e..62686222e 100755 --- a/Tools/generate_build_info +++ b/Tools/generate_build_info @@ -56,6 +56,9 @@ makebuildinfo() if [ ! -z "$CUB_PATH" ]; then printf "#define _CUB_PATH_ \"%s\"\n" $CUB_PATH >> $target fi + if [ ! -z "$CUDNN_PATH" ]; then + printf "#define _CUDNN_PATH_ \"%s\"\n" $CUDNN_PATH >> $target + fi printf "#define _BUILDTYPE_ \"%s\"\n" $BUILDTYPE >> $target printf "#endif\n" >> $target }
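A closing note on the new useNAG option: the flag read by configSGD above is stored as m_useNesterovMomentum and threaded through UpdateWeights/UpdateWeightsS into Matrix::NormalGrad. As a rough per-element illustration of what such a switch typically selects (the common reformulation of Nesterov's accelerated gradient; a hypothetical helper, not CNTK's actual Matrix implementation):

#include <vector>

// Classic vs. Nesterov momentum; 'v' plays the role of the smoothed gradient,
// with the learning rate folded in, analogous to the NormalGrad call above.
void MomentumSGDUpdate(std::vector<double>& w,       // model parameters
                       const std::vector<double>& g, // minibatch gradient
                       std::vector<double>& v,       // smoothed gradient (velocity)
                       double lr, double m, bool useNesterovMomentum)
{
    for (size_t i = 0; i < w.size(); i++)
    {
        v[i] = m * v[i] + lr * g[i];         // identical accumulation in both modes
        if (!useNesterovMomentum)
            w[i] -= v[i];                    // classic momentum: step along the velocity
        else
            w[i] -= m * v[i] + lr * g[i];    // NAG: take the step one momentum look-ahead further
    }
}

With the flag plumbed through this way, enabling it from a config is just useNAG = true inside the SGD block; it defaults to false.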