changed GetRecurrenceDirections() back to operate on a single dimension only (multiple dimensions can be realized with BrainScript);

towards implementing DataTensorFor() using tensor slices, so that we can reuse that for ShiftNode
Frank Seide 2016-01-08 11:19:50 -08:00
Parent c1c818c85b
Commit ed5d40aae1
7 changed files with 203 additions and 157 deletions

View file

@@ -47,7 +47,7 @@ using namespace std;
L"PastValue(dims, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'PastValue' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]\n"
L"FutureValue(dims, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'FutureValue' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]\n"
// TODO: ^^ DelayedValues no longer need to know their dimension. That is inferred in Validation.
L"Shift(input, fromOffsets, boundaryValue, dim=-1, offsetRanges=1, multiOffsetDim=0, tag='') = new ComputationNode [ operation = 'Shift' ; inputs = (input : boundaryValue) ; fromOffset = new IntVector [ items = fromOffsets ] ; offsetRange = new SizeVector [items= new SizeVector [ items = offsetRanges ] ]/*plus the function args*/ ]\n"
L"Shift(input, fromOffset, boundaryValue, boundaryMode=-1/*context*/, dim=-1, numSteps=1, insertedDim=0, tag='') = new ComputationNode [ operation = 'Shift' ; inputs = (input : boundaryValue) /*plus the function args*/ ]\n"
L"RowSlice(startIndex, numRows, input, needGradient = false, tag='') = new ComputationNode [ operation = 'RowSlice' ; inputs = input /*plus the function args*/ ]\n"
L"RowRepeat(input, numRepeats, needGradient = false, tag='') = new ComputationNode [ operation = 'RowRepeat' ; inputs = input /*plus the function args*/ ]\n"
L"RowStack(inputs, tag='') = new ComputationNode [ operation = 'RowStack' /*plus the function args*/ ]\n"

View file

@@ -748,17 +748,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
if (!pMBLayout)
LogicError("DataFor: Attempting to retrieve a parallel sequence from data without layout.");
#if 1
else
LogicError("DataFor: To retrieve a parallel sequence, implement Matrix::RowSlice() first!");
#else
// get a reshaped view that stacks all sequences into T long vectors
auto mat = data.ColumnSlice(0, data.GetNumCols());
mat.Resize(data.GetNumRows() * pMBLayout->GetNumParallelSequences(), data.GetNumCols() / pMBLayout->GetNumParallelSequences());
return mat; // .RowSlice(fr.seqIndex * data.GetNumRows());
// TODO: Why does RowSlice() not exist? Seems simple. Is there a hidden assumption of contiguous memory?
// TODO: The tensor version of this will support it.
#endif
LogicError("DataFor: Individual parallel sequences cannot be retrieved in Matrix representation. Use TensorView instead.");
}
}
// FrameRange refers to a time slice -> return that
@@ -790,6 +781,82 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return data.ColumnSlice(columnRange.first, columnRange.second);
}
// -----------------------------------------------------------------------
// TensorSliceWithMBLayoutFor() -- return the tensor slice (per-dimension begin/end bounds) that a FrameRange selects from data with a given MBLayout
// -----------------------------------------------------------------------
template<class DimensionVector> // e.g. std::vector<size_t> or SmallVector<size_t>
static inline std::pair<DimensionVector, DimensionVector> TensorSliceWithMBLayoutFor(const DimensionVector & shape/*of data matrix to slice*/,
const FrameRange & fr/*select frame or entire batch*/,
const MBLayoutPtr & pMBLayout/*the MB layout of 'data'*/)
{
std::pair<DimensionVector, DimensionVector> result;
// this creates a slice for the entire matrix, which we will then narrow down
result.first.resize(shape.size(), 0);
result.second = shape;
fr; pMBLayout; // parameters unused until the tensor-slice logic below (#if 0) is enabled
#if 0
// MBLayout of data and of FrameRange must be identical pointers,
// or in case of broadcasting, respective parent pointers.
// MBLayouts that are identical in content but not object identity (pointer) are not admissible.
// For those cases, use a ReconcileMBLayout node.
if (fr.m_pMBLayout != pMBLayout)
{
// if broadcast allowed then it is allowed to broadcast from an outer-loop value
// Currently, the only 'outer' loop we have is to have no layout.
if (fr.m_broadcastAllowed && !pMBLayout && numCols == 1)
return std::pair<size_t, size_t>(0, numCols);
if (fr.m_pMBLayout && pMBLayout && *fr.m_pMBLayout == *pMBLayout)
LogicError("DataFor: fr's MBLayout inconsistent with matrix. They are compatible though--are you missing a ReconcileMBLayout operation?");
else
LogicError("DataFor: fr's MBLayout inconsistent with matrix");
}
// if FrameRange refers to whole minibatch (map mode)
// or if we don't even have a layout
// then return the whole matrix
// but as a reference (e.g. it cannot be resized)
if (!pMBLayout || fr.IsAllFrames())
{
if (fr.m_timeOffset != 0)
LogicError("DataFor: Time offset must not be specified for FrameRanges that reference the entire minibatch.");
// TODO: Can we allow this? Semantics would be different, it would crop frames outside.
if (fr.seqIndex == SIZE_MAX)
return std::pair<size_t, size_t>(0, numCols);
else
{
if (!pMBLayout)
LogicError("DataFor: Attempting to retrieve a parallel sequence from data without layout.");
#if 1
else
LogicError("DataFor: To retrieve a parallel sequence, implement Matrix::RowSlice() first!");
#else
// get a reshaped view that stacks all sequences into T long vectors
auto mat = data.ColumnSlice(0, data.GetNumCols());
mat.Resize(data.GetNumRows() * pMBLayout->GetNumParallelSequences(), data.GetNumCols() / pMBLayout->GetNumParallelSequences());
return mat; // .RowSlice(fr.seqIndex * data.GetNumRows());
// TODO: Why does RowSlice() not exist? Seems simple. Is there a hidden assumption of contiguous memory?
// TODO: The tensor version of this will support it.
#endif
}
}
// FrameRange refers to a time slice -> return that
else
{
size_t numParallelSequences = pMBLayout->GetNumParallelSequences();
size_t startColumn = (fr.timeIdxInSeq + fr.m_timeOffset) * numParallelSequences;
if (startColumn >= numCols)
LogicError("DataFor: FrameRange specifies a time index that is out of range.");
if (fr.seqIndex == SIZE_MAX)
return std::pair<size_t, size_t>(startColumn, numParallelSequences);
else
return std::pair<size_t, size_t>(startColumn + fr.seqIndex, 1);
}
#endif
return result;
}
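For orientation, a minimal usage sketch (an assumed caller, not code from this commit; Narrow() is a hypothetical helper). The returned pair carries per-dimension bounds: result.first holds the begin indices (all zeros so far) and result.second the end indices (the full shape):
    // hypothetical caller: narrow each dimension k of a full tensor view
    // to the half-open range [bounds.first[k], bounds.second[k])
    auto bounds = TensorSliceWithMBLayoutFor(tensorShape.GetDims(), fr, pMBLayout);
    for (size_t k = 0; k < bounds.first.size(); k++)
        Narrow(view, k, bounds.first[k], bounds.second[k]);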
// -----------------------------------------------------------------------
// MaskMissingColumnsTo() -- function to set gaps to zero or NaN
// -----------------------------------------------------------------------

View file

@@ -24,7 +24,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// The methods below determine evaluation order, which is tricky in presence of recurrent loops.
// TODO: Can this be moved to a separate class?
static const vector<int> & GetRecurrenceDirections(const ComputationNodeBasePtr &);
static int GetRecurrenceSteppingDirection(const ComputationNodeBasePtr &);
// FormRecurrentLoops() -- MAIN ENTRY POINT for network recurrent-loop analysis. All other functions in this CPP are called only from this one.
// This function analyzes the network for recurrent loops present in the computation of 'rootNode'.
@@ -92,7 +92,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
const auto & node = iter->m_nestedNodes[j];
for (size_t i = 0; i < node->GetNumInputs(); i++)
{
if (node->Input(i)->m_loopId == node->m_loopId && GetRecurrenceDirections(node).empty())
if (node->Input(i)->m_loopId == node->m_loopId && GetRecurrenceSteppingDirection(node) == 0)
{
//assert(node->Input(i)->m_indexInLoop == 0); // No. It seems this variable really counts the number of parents.
node->Input(i)->m_indexInLoop++; // BUGBUG: this is bumping up the m_indexInLoop, but I don't think it is initialized anywhere other than PurgeStateForFormingRecurrentLoops(). i-1?
@@ -168,13 +168,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
// checks whether a node is recurrent, and which direction
static vector<int> emptyVector;
static const vector<int> & GetRecurrenceDirections(const ComputationNodeBasePtr & node)
static int GetRecurrenceSteppingDirection(const ComputationNodeBasePtr & node)
{
if (node->Is<IRecurrentNode>())
return node->As<IRecurrentNode>()->GetRecurrenceDirections();
return node->As<IRecurrentNode>()->GetRecurrenceSteppingDirection();
else
return emptyVector;
return 0;
}
static int DetermineLoopDirection(const std::vector<ComputationNodeBasePtr> & nestedNodes);
@@ -308,7 +307,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
visited.insert(cur);
recStack.insert(cur);
if (GetRecurrenceDirections(cur).empty()) // recurrence stops at delays
if (GetRecurrenceSteppingDirection(cur) == 0) // recurrence stops at delay nodes
{
for (size_t i = 0; i < cur->GetNumInputs(); i++)
if (cur->Input(i)->m_loopId == cur->m_loopId)
@@ -395,25 +394,25 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// BUGBUG: Need to extend to multi-dimensional loop directions. Use a vector<int>.
static int DetermineLoopDirection(const std::vector<ComputationNodeBasePtr> & nestedNodes)
{
vector<int> recurrenceDirections;
int steppingDirection = 0;
for (auto & node : nestedNodes)
{
const auto & dirs = GetRecurrenceDirections(node);
if (dirs.empty()) // not a recurrent node
int dir = GetRecurrenceSteppingDirection(node);
if (dir == 0) // not a recurrent node
continue;
if (recurrenceDirections.empty())
recurrenceDirections = dirs;
else if (recurrenceDirections != dirs)
InvalidArgument("It is not allowed to have multiple different recurrence directions in the same loop (loop connected to %ls %ls operation).",
if (steppingDirection == 0)
steppingDirection = dir;
else if (steppingDirection != dir)
InvalidArgument("It is not allowed to have multiple different stepping directions in the same loop (loop connected to %ls %ls operation).",
nestedNodes.front()->NodeName().c_str(), nestedNodes.front()->OperationName().c_str());
}
if (recurrenceDirections.empty())
if (steppingDirection == 0)
LogicError("There is no recurrent node in the loop connected to %ls %ls operation.",
nestedNodes.front()->NodeName().c_str(), nestedNodes.front()->OperationName().c_str());
// BUGBUG: Multiple recurrence dimensions not yet supported beyond this point.
return -recurrenceDirections[0];
return steppingDirection;
}
}}}

View file

@@ -213,30 +213,6 @@ namespace Microsoft {
return maxRank;
}
// determine the full tensor dimension including padding and multiple samples (MBLayout)
// but without trailing ones (assuming they will be auto-padded by the tensor op)
TensorShape ComputationNodeBase::GetTensorShape(size_t rank, const FrameRange & fr) const
{
//GetAndValidateSampleLayout(); // no need to validate because rank comes from DetermineElementwiseTensorRank() which validates all
if (!HasMBLayout())
return GetSampleLayout().Append(GetSampleLayout().GetRank(), GetNumCols()); // last dim is column dimension
// TODO: This is not nice! Instead, if there is no MBLayout, the sample layout should describe the whole matrix.
else if (fr.IsAllFrames())
{
// we have an MBLayout, and fr refers to the entire MB
return GetSampleLayout().Append(rank, GetMBLayout()->GetNumCols());
}
//else if (fr.Sequence != SIZE_MAX) // needs a slice and a two-dim tensor
//{
// return GetAndValidateSampleLayout(); // .Append(rank, 1); // no need to append ones
//}
else
{
// we have an MBLayout, and fr refers to one frame (across all parallel sequences)
return GetSampleLayout().Append(rank, GetMBLayout()->GetNumParallelSequences());
}
}
// -----------------------------------------------------------------------
// others
// -----------------------------------------------------------------------

View file

@@ -353,8 +353,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
const TensorShape & GetAndValidateSampleLayout() const; // TODO: Once numRows is consistent with m_sampleLayout, this will go away
size_t DetermineElementwiseTensorRank() const;
public:
TensorShape GetTensorShape(size_t dims, const FrameRange & fr) const;
// access to element(0,0) without having to type-cast
virtual double Get00Element() const = 0;
@@ -1144,15 +1142,24 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// tensor variants
TensorView<ElemType> DataTensorFor(Matrix<ElemType> & data, size_t rank, const FrameRange & fr)
{
return TensorView<ElemType>(DataFor(data, fr), GetTensorShape(rank, fr));
TensorShape tensorShape = GetSampleLayout();
if (!HasMBLayout())
tensorShape.AppendInPlace(tensorShape.GetRank(), GetNumCols()); // last dim is column dimension
// TODO: This is not nice! Instead, if there is no MBLayout, the sample layout should describe the whole matrix.
else if (fr.IsAllFrames()) // we have an MBLayout, and fr refers to the entire MB
tensorShape.AppendInPlace(rank, GetMBLayout()->GetNumCols());
else // we have an MBLayout, and fr refers to one frame (across all parallel sequences)
tensorShape.AppendInPlace(rank, GetMBLayout()->GetNumParallelSequences());
// TODO: determine SmallVector begin, end bounds first, get a narrow full shape, squeeze the dims, then return the tensor
return TensorView<ElemType>(DataFor(data, fr), tensorShape);
}
TensorView<ElemType> ValueTensorFor(size_t rank, const FrameRange & fr)
{
return TensorView<ElemType>(ValueFor(fr), GetTensorShape(rank, fr));
return DataTensorFor(Value(), rank, fr);
}
TensorView<ElemType> GradientTensorFor(size_t rank, const FrameRange & fr)
{
return TensorView<ElemType>(GradientFor(fr), GetTensorShape(rank, fr));
return DataTensorFor(Gradient(), rank, fr);
}
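To make the appended dimension concrete (a worked example, not from the source): with sample layout [512], 3 parallel sequences, and 20 time steps, the shape is [512 x 60] when fr.IsAllFrames() and [512 x 3] for a single-frame fr. A hedged usage sketch in the style of the elementwise nodes (exact call sites may differ):
    // minimal sketch: a binary elementwise ForwardProp over these tensor views
    size_t rank = DetermineElementwiseTensorRank();
    auto result = ValueTensorFor(rank, fr);
    auto input0 = Input(0)->ValueTensorFor(rank, fr);
    auto input1 = Input(1)->ValueTensorFor(rank, fr);
    result.AssignSumOf(input0, input1); // broadcasting follows the tensor library's rules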
// update the actual matrix allocation for m_value based on the node dimension
@@ -1509,7 +1516,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// IRecurrentNode -- helper wrapper class for ComputationNodes that can be recurrent
// =======================================================================
struct IRecurrentNode { virtual const std::vector<int> & GetRecurrenceDirections() const = 0; };
struct IRecurrentNode { virtual int GetRecurrenceSteppingDirection() const = 0; };
// =======================================================================

View file

@@ -26,7 +26,7 @@
namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
// ShiftNode (input, fromOffset, boundaryValue, dim=-1, offsetRange=1, multiOffsetDim=0) -- delay and rolling window
// ShiftNode (input, fromOffset, boundaryValue, dim=-1, numSteps=1, insertDim=0) -- delay and rolling window
//
// This shifts the input by (-fromOffset) steps. In other words, output(t) will be input(t+fromOffset).
// E.g. for fromOffset=-1, this gives the past value.
@@ -61,9 +61,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// when all involved nodes are implemented using the tensor library. Nodes implemented using
// Matrix slices can only support iterating over time.
//
// The fromOffset can also be a tensor, e.g. (1,1). In that case, iteration will be over multiple
// consecutive dimensions. offsetRange must have the same number of dimensions.
//
// If the boundaryValue has 0 elements, the sequence will be trimmed (frames reaching beyond the boundary
// are dropped). This will initially not be implemented for the time dimension (as it would require
// change of MBLayout).
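To pin down the shift semantics documented above, a tiny reference loop (illustrative only; the real node operates on tensor views rather than per-element loops):
    // out(t) = in(t + fromOffset); frames that reach outside the sequence
    // take the boundary value. E.g. fromOffset = -1 (the past-value case):
    // out = [boundary, in[0], ..., in[T-2]]
    for (int t = 0; t < T; t++)
    {
        int s = t + fromOffset;
        out[t] = (s >= 0 && s < T) ? in[s] : boundary;
    }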
@@ -75,34 +72,26 @@ namespace Microsoft { namespace MSR { namespace CNTK {
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName() { return L"Shift"; }
public:
ShiftNode(DEVICEID_TYPE deviceId, const wstring & name, const std::vector<int> & fromOffset, int shiftDimension, const std::vector<size_t> & offsetRange, int expandDimension) :
Base(deviceId, name), m_fromOffsetBegin(fromOffset),
m_shiftDimension(shiftDimension), m_expandDimension(expandDimension),
enum BoundaryMode : int // how to fill frames at boundaries
{
reachAcross = -1, // go across the boundary: use boundaryValue. This is for recurrence.
duplicate = 0, // duplicate frame at boundary, e.g. duplicate first frame. Non-recurrent mode only.
trim = 1 // drop frames. Non-recurrent mode only.
};
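An illustrative reading of the three modes for a 4-frame sequence [a b c d] with fromOffset=-1 (hedged: duplicate and trim are only sketched by the comments above, so the exact frame alignment is an assumption):
    // reachAcross: [v a b c]   v = boundaryValue (recurrent past-value case)
    // duplicate:   [a a b c]   boundary frame filled by duplicating in[0]
    // trim:        [a b c]     the frame reaching before the start is dropped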
ShiftNode(DEVICEID_TYPE deviceId, const wstring & name, int fromOffset, BoundaryMode boundaryMode, int shiftDimension, size_t numSteps, int insertedDimParam) :
Base(deviceId, name), m_fromOffset(fromOffset), m_numSteps(numSteps),
m_boundaryMode(boundaryMode),
m_shiftDimension(shiftDimension), m_insertedDimParam(insertedDimParam),
m_insertExpandShapeAt(SIZE_MAX/*uninitialized at this point*/)
{
// determine m_fromOffsetEnd from fromOffset/offsetRange
bool anyNonRecurrent = false;
for (size_t k = 0; k < m_fromOffsetBegin.size(); k++)
{
m_fromOffsetEnd.push_back(m_fromOffsetBegin[k] + (k < offsetRange.size() ? (int)offsetRange[k] : 1));
if (m_fromOffsetEnd[k] <= 0)
m_recurrenceDirections.push_back(-1);
else if (m_fromOffsetBegin[k] > 0)
m_recurrenceDirections.push_back(+1);
else
m_recurrenceDirections.push_back(0);
anyNonRecurrent |= m_recurrenceDirections[k] == 0;
}
if (anyNonRecurrent)
m_recurrenceDirections.clear();
CreateMatrixIfNull(m_value);
SetDims(TensorShape(), 0); // empty for now
}
ShiftNode(DEVICEID_TYPE deviceId, const wstring & name) :
ShiftNode(deviceId, name, std::vector<int> { 1 }, -1, std::vector<size_t> { 1 }, 0)
ShiftNode(deviceId, name, 1, BoundaryMode::reachAcross, -1, 1, 0)
{ }
ShiftNode(const ScriptableObjects::IConfigRecordPtr configp) :
ShiftNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"fromOffset"), configp->Get(L"dim"), configp->Get(L"offsetRange"), configp->Get(L"multiOffsetDim"))
ShiftNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"fromOffset"), (BoundaryMode)(int)configp->Get(L"boundaryMode"), configp->Get(L"dim"), configp->Get(L"numSteps"), configp->Get(L"insertedDim"))
{
// We do NOT attach the inputs, as we cannot resolve the main input without causing a circular reference.
// Instead, we capture them in a lambda, which will be called by ComputationNetwork during the build process through LateAttachInputs() below.
@@ -122,23 +111,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void Save(File& fstream) const
{
Base::Save(fstream);
fstream << m_fromOffsetBegin;
fstream << m_fromOffsetEnd;
fstream << m_shiftDimension;
fstream << m_expandDimension;
fstream << m_recurrenceDirections;
fstream << m_fromOffset << m_numSteps << m_boundaryMode << m_shiftDimension << m_insertedDimParam;
}
virtual void Load(File& fstream, size_t modelVersion) override
{
Base::Load(fstream, modelVersion);
fstream >> m_fromOffsetBegin;
fstream >> m_fromOffsetEnd;
fstream >> m_shiftDimension;
fstream >> m_expandDimension;
fstream >> m_recurrenceDirections;
fstream >> m_fromOffset >> m_numSteps >> m_boundaryMode >> m_shiftDimension >> m_insertedDimParam;
}
virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
@@ -150,19 +129,47 @@ namespace Microsoft { namespace MSR { namespace CNTK {
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override {return false; }
virtual void BeginForwardProp() override // called before first iteration step of ForwardProp()
{
Base::BeginForwardProp();
// in case of trimming, narrow the layout
// We actually do not drop content, only reduce the range of sequences.
// This is meant to optimize for the case where we have multiple sequences concatenated while trimming a small amount only.
}
virtual void EndForwardProp() override // called after last iteration step of ForwardProp()
{
Base::EndForwardProp();
// In BPTT, we carry over left-to-right state across minibatches.
// TODO: package up the state using ExportState(). Then in BeginForwardProp() bring it back. In-between, the packages can be moved around.
// The necessary frames are stored in m_state->m_delayedValue.
// Only if layout has anything exceeding the MB.
}
// This function assumes BeginForwardProp/EndForwardProp() to be called before/after the iteration loop.
// TODO: In the future, there may be value for one more way of handling the boundary condition: Fill as 'NoInput'. Then we can use this to implement rolling windows (albeit inefficiently). Would require to unshare the layout.
virtual void ForwardProp(const FrameRange & fr) override
{
fr; // parameter not yet used; this implementation is still a skeleton
// STEP 1: whole-sale copy a shifted version of the input to the output
// - consider the saved parts from the last minibatch as part of the input at dimensions beyond the bounds
// - ignore boundary conditions for now
// get the tensors without shift
size_t rank = DetermineElementwiseTensorRank();
auto result = ValueTensorFor(rank, fr);
auto input = Input(0)->ValueTensorFor(rank, fr);
// shift the dimension in the input
// STEP 2: fix up the boundary conditions
// - fill in xxx
// turn selected frame and shifted frame into a tensor
// copy all that's in range
// fix up all that is not
}
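A hedged sketch of how STEP 1 could look once DataTensorFor() is built on tensor slices (the direction the commit message points at; the Narrowed()/AssignCopyOf() names are assumptions, not this node's final code):
    // shift along the time dimension by copying between two narrowed views
    // (for fromOffset < 0):
    //   auto src = input .Narrowed(timeDim, 0,           T + fromOffset); // frames that stay in range
    //   auto dst = result.Narrowed(timeDim, -fromOffset, T);              // same frames, shifted
    //   dst.AssignCopyOf(src);
    // STEP 2 then overwrites output frames [0, -fromOffset) according to m_boundaryMode.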
virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
@@ -178,40 +185,26 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (isFinalValidationPass && !m_pMBLayout)
InvalidArgument("%ls %ls operation must operate on data (must have an MB Layout).", NodeName().c_str(), OperationName().c_str());
// determine expandShape--empty if no multiple offsets; otherwise the 1 or more dimensions that need to be added at m_expandDimension
m_expandShape.clear();
for (size_t k = 0; k < m_fromOffsetBegin.size(); k++)
{
size_t dim = m_fromOffsetEnd[k] - m_fromOffsetBegin[k];
if (dim > 1)
{
m_expandShape.resize(k, 1);
m_expandShape.push_back(dim);
}
}
if (!m_expandShape.empty())
m_expandShape.resize(m_fromOffsetBegin.size(), 1); // pad ones to end
// now it either matches the dimensions to insert, or is empty if none to append
// determine final sample layout
auto inputSampleLayout = Input(0)->GetSampleLayout();
auto inputDims = inputSampleLayout.GetDims();
if (m_expandDimension < 0)
if (m_insertedDimParam < 0)
InvalidArgument("%ls %ls operation: Specified insertion location %d refers to a time dimension, but this is not allowed.",
NodeName().c_str(), OperationName().c_str(), m_expandDimension);
m_insertExpandShapeAt = m_expandShape.empty() ? 0 : (m_expandDimension > 0 ? m_expandDimension - 1 : inputDims.size());
NodeName().c_str(), OperationName().c_str(), m_insertedDimParam);
m_insertExpandShapeAt = m_numSteps > 1 ? 0 : (m_insertedDimParam > 0 ? m_insertedDimParam - 1 : inputDims.size());
if (m_insertExpandShapeAt > inputDims.size())
if (isFinalValidationPass)
InvalidArgument("%ls %ls operation: Specified insertion location %d beyond end of input sample layout [%s].",
NodeName().c_str(), OperationName().c_str(), m_expandDimension, string(inputSampleLayout).c_str());
NodeName().c_str(), OperationName().c_str(), m_insertedDimParam, string(inputSampleLayout).c_str());
else
m_insertExpandShapeAt = inputDims.size(); // this may be an error, but we want to catch that only in the final pass
SmallVector<size_t> dims;
if (!m_expandShape.empty() && inputDims.size() + m_expandShape.size() > dims.capacity())
if (m_numSteps > 1 && inputDims.size() + 1 > dims.capacity())
InvalidArgument("%ls %ls operation: Too many dimensions. Did you feed back output of this node without stripping the extra dimensions?",
NodeName().c_str(), OperationName().c_str());
dims.append(inputDims.begin(), inputDims.begin() + m_insertExpandShapeAt);
dims.append(m_expandShape.begin(), m_expandShape.end());
if (m_numSteps > 1) // insert the new dimension if we expand into more than one step
dims.push_back(m_numSteps);
dims.append(inputDims.begin() + m_insertExpandShapeAt, inputDims.end());
auto sampleLayout = TensorShape(dims);
@@ -219,9 +212,16 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
// special interface for use by loop detection
virtual const std::vector<int> & /*IRecurrentNode::*/GetRecurrenceDirections() const override
virtual int /*IRecurrentNode::*/GetRecurrenceSteppingDirection() const override
{
return m_recurrenceDirections;
if (m_boundaryMode != BoundaryMode::reachAcross)
return 0;
else if (m_fromOffset + (int)m_numSteps <= 0)
return +1;
else if (m_fromOffset > 0)
return -1;
else
return 0;
}
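Worked through (a consistency check on the cases above, not source text): a past value has fromOffset=-1, numSteps=1, so fromOffset + numSteps = 0 <= 0 and the direction is +1 (iterate forward in time); a future value has fromOffset=+1 > 0, giving -1 (iterate backward); and a window straddling the current frame, e.g. fromOffset=-1 with numSteps=3, touches both sides and returns 0, meaning it cannot drive a recurrence.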
virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
@@ -230,20 +230,21 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (flags & CopyNodeFlags::copyNodeValue)
{
auto node = dynamic_pointer_cast<ShiftNode<ElemType>>(nodeP);
node->m_fromOffsetBegin = m_fromOffsetBegin;
node->m_fromOffsetEnd = m_fromOffsetEnd;
node->m_recurrenceDirections = m_recurrenceDirections;
node->m_shiftDimension = m_shiftDimension;
node->m_expandDimension = m_expandDimension;
node->m_expandShape = m_expandShape;
node->m_insertExpandShapeAt = m_insertExpandShapeAt;
node->m_state = m_state;
node->m_fromOffset = m_fromOffset;
node->m_numSteps = m_numSteps;
node->m_boundaryMode = m_boundaryMode;
node->m_shiftDimension = m_shiftDimension;
node->m_insertedDimParam = m_insertedDimParam;
node->m_insertExpandShapeAt = m_insertExpandShapeAt;
node->m_state = m_state;
}
}
class ShiftNodeState : public INodeState
{
Matrix<ElemType> m_delayedActivation; // saves the activation of the previous step that this node points to
Matrix<ElemType> m_delayedValue; // saves the activation of the previous step that this node points to
vector<MBLayout::SequenceInfo> m_delayedSequences; // and associated sequence info. This is only used for consistency checking (it must match).
ShiftNodeState(DEVICEID_TYPE deviceId) : m_delayedValue(deviceId) { }
};
typedef std::shared_ptr<ShiftNodeState> ShiftNodeStatePtr;
@@ -260,15 +261,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
protected:
// parameters remembered from construction
std::vector<int> m_fromOffsetBegin; // offset to pull from; first offset in case of offset range
std::vector<int> m_fromOffsetEnd; // end of offset range
int m_fromOffset; // offset to pull from
int m_numSteps; // number of consecutive steps (the offset range); values > 1 insert a new dimension
BoundaryMode m_boundaryMode; // how to fill at the boundary (reach across, duplicate, or trim)
int m_shiftDimension; // dimension to shift (default: time)
int m_expandDimension; // in case of offset range, this is where a new dimension will be inserted
int m_insertedDimParam; // in case of multiple steps, this is where a new dimension will be inserted
// derived params set up in Validate()
SmallVector<size_t> m_expandShape; // offsetEnd-offsetBegin if >1 offset in any dimension; empty otherwise
size_t m_insertExpandShapeAt; // at which dimension to insert (internal 0-based index)
std::vector<int> m_recurrenceDirections; // for GetRecurrenceDirections()
ShiftNodeStatePtr m_state; // saves the activation of the previous step that this node points to
@@ -350,7 +350,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
m_initialActivationValue = initialActivationValue;
m_timeStep = 1;
m_recurrenceDirections.push_back(direction);
CreateMatrixIfNull(m_value);
SetDims(sampleLayout, 0);
m_value->SetValue(m_initialActivationValue); // is this needed?
@@ -358,13 +357,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
protected:
DelayedValueNodeBase(DEVICEID_TYPE deviceId, const wstring & name) :
Base(deviceId, name),
m_delayedActivation(deviceId)
m_delayedValue(deviceId)
{
Init(TensorShape(), (ElemType)DEFAULT_HIDDEN_ACTIVATION);
}
DelayedValueNodeBase(DEVICEID_TYPE deviceId, const wstring & name, ElemType initialActivationValue, const TensorShape & sampleLayout, size_t timeStep) :
Base(deviceId, name),
m_delayedActivation(deviceId)
m_delayedValue(deviceId)
{
Init(sampleLayout, initialActivationValue);
m_timeStep = (int)timeStep; // TODO: pass this to Init() instead as well
@@ -408,7 +407,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Note: Do we need to load cols for the delay node? I just set it to zero to see if there is any problem.
SetDims(TensorShape(rows), 0); // tensor shape will be overwritten in Validate() --TODO: We should serialize it here.
m_delayedActivation.Resize(rows, 0); // Note: If we try to access history in first minibatch, we shall crash. It would be a consequence of a missing sentence-begin flag
m_delayedValue.Resize(rows, 0); // Note: If we try to access history in first minibatch, we shall crash. It would be a consequence of a missing sentence-begin flag
if (modelVersion >= CNTK_MODEL_VERSION_2)
fstream >> m_initialActivationValue;
@@ -488,14 +487,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
virtual void EndForwardProp() override // called after last iteration step of ForwardProp()
{
// In BPTT, we carry over left-to-right state across minibatches.
// It is kept in m_delayedActivation, m_delayedActivationMBLayout.
// It is kept in m_delayedValue, m_delayedActivationMBLayout.
// This could be optimized as follows:
// - only keep the required number of frames (m_timeStep)
// - we don't need to keep anything in full-sequence mode
// - we don't need to keep anything if all sequences are closed (sentence end)
// This condition includes full-sequence mode.
// TODO: Can we optimize this and only copy if there is a sequence spanning across the end of the MB? And add a check to BeginForwardProp() to make sure we got one if there is a boundary at the start?
m_delayedActivation = Input(0)->Value();
m_delayedValue = Input(0)->Value();
if (!m_delayedActivationMBLayout) m_delayedActivationMBLayout = make_shared<MBLayout>();
m_delayedActivationMBLayout->CopyFrom(m_pMBLayout);
@@ -555,9 +554,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
// inside the sequence: access delayed value
if (t_delayed < 0)
inp = DataWithMBLayoutFor(m_delayedActivation, FrameRange(m_delayedActivationMBLayout, t_delayed + T_delayedActivation).Sequence(id), m_delayedActivationMBLayout); // delay reaches in previous minibatch
inp = DataWithMBLayoutFor(m_delayedValue, FrameRange(m_delayedActivationMBLayout, t_delayed + T_delayedActivation).Sequence(id), m_delayedActivationMBLayout); // delay reaches in previous minibatch
else if (t_delayed >= T)
inp = DataWithMBLayoutFor(m_delayedActivation, FrameRange(m_delayedActivationMBLayout, t_delayed - T).Sequence(id), m_delayedActivationMBLayout); // delay reaches in previous minibatch
inp = DataWithMBLayoutFor(m_delayedValue, FrameRange(m_delayedActivationMBLayout, t_delayed - T).Sequence(id), m_delayedActivationMBLayout); // delay reaches in previous minibatch
else
inp = Input(0)->ValueFor(frDelayed.Sequence(id));
//inp = Input(0)->ValueFor(FrameRange(m_pMBLayout, t_delayed).Sequence(id));
@@ -571,9 +570,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Matrix<ElemType> out = ValueFor(fr);
if (t_delayed < 0)
inp = DataWithMBLayoutFor(m_delayedActivation, FrameRange(m_delayedActivationMBLayout, t_delayed + T_delayedActivation), m_delayedActivationMBLayout);
inp = DataWithMBLayoutFor(m_delayedValue, FrameRange(m_delayedActivationMBLayout, t_delayed + T_delayedActivation), m_delayedActivationMBLayout);
else if (t_delayed >= T)
inp = DataWithMBLayoutFor(m_delayedActivation, FrameRange(m_delayedActivationMBLayout, t_delayed - T), m_delayedActivationMBLayout);
inp = DataWithMBLayoutFor(m_delayedValue, FrameRange(m_delayedActivationMBLayout, t_delayed - T), m_delayedActivationMBLayout);
else
inp = Input(0)->ValueFor(frDelayed);
//inp = Input(0)->ValueFor(FrameRange(m_pMBLayout, t_delayed));
@@ -587,10 +586,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ValidateUnaryMap(isFinalValidationPass);
}
// special interface for use by loop detection
virtual const std::vector<int> & /*IRecurrentNode::*/GetRecurrenceDirections() const override
virtual int /*IRecurrentNode::*/GetRecurrenceSteppingDirection() const override
{
return m_recurrenceDirections;
return -direction;
}
virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
@@ -601,7 +599,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto node = dynamic_pointer_cast<DelayedValueNodeBase<ElemType, direction/*, SequenceStart_or_End*/>>(nodeP);
node->m_timeStep = m_timeStep;
node->m_initialActivationValue = m_initialActivationValue;
node->m_delayedActivation = m_delayedActivation;
node->m_delayedValue = m_delayedValue;
if (m_delayedActivationMBLayout)
(node->m_delayedActivationMBLayout = make_shared<MBLayout>())->CopyFrom(m_delayedActivationMBLayout);
else
@@ -652,7 +650,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
else
{
auto pState = make_shared<DelayedValueNodeState<ElemType>>(m_deviceId);
pState->CacheState(m_delayedActivation.ColumnSlice((nT - 1)*nU, nU));
pState->CacheState(m_delayedValue.ColumnSlice((nT - 1)*nU, nU));
pState->CacheDelayedMBLayout(m_delayedActivationMBLayout);
pExportedState = pState;
}
@@ -686,7 +684,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
else
{
auto pState = make_shared<DelayedValueNodeState<ElemType>>(m_deviceId);
pState->CacheState(m_delayedActivation.ColumnSlice((nT-1)*nU, nU));
pState->CacheState(m_delayedValue.ColumnSlice((nT-1)*nU, nU));
pState->CacheDelayedMBLayout(m_delayedActivationMBLayout);
pExportedState = pState;
}
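For orientation (a worked reading of the indexing, not text from the source): minibatch columns are laid out sequence-major within each time step, so with nU parallel sequences and nT steps, columns [(nT-1)*nU, nT*nU) hold the final frame of every sequence. That slice is what is cached here so BPTT can carry the state over into the next minibatch.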
@@ -718,12 +716,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
int dir = direction;
if (dir == -1) // looking backward
{
m_delayedActivation.SetColumnSlice(delayedActivation, (nT - 1)*nU, nU);
m_delayedValue.SetColumnSlice(delayedActivation, (nT - 1)*nU, nU);
}
if (dir == 1)
{
//m_delayedActivation.CopyColumnsStrided(delayedActivation, nU, 1, nT);
m_delayedActivation.SetColumnSlice(delayedActivation, 0, nU);
//m_delayedValue.CopyColumnsStrided(delayedActivation, nU, 1, nT);
m_delayedValue.SetColumnSlice(delayedActivation, 0, nU);
}
if (dir != -1 && dir == 1)
{ // should this really be a compile error?
@@ -733,15 +731,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
protected:
ElemType m_initialActivationValue; // starting value for hidden activation vector at boundary
Matrix<ElemType> m_delayedActivation; // saves the activation of the previous step that this node points to
MBLayoutPtr m_delayedActivationMBLayout; // layout for m_delayedActivation
Matrix<ElemType> m_delayedValue; // saves the activation of the previous step that this node points to
MBLayoutPtr m_delayedActivationMBLayout; // layout for m_delayedValue
int m_timeStep; // delay in frames (typ. 1)
function<void()> m_attachInputsFn; // for late expansion of inputs (scripting)
std::vector<int> m_recurrenceDirections; // for GetRecurrenceDirections()
};
#define UsingDelayedValueNodeMembers UsingComputationNodeMembersBoilerplate; \
using Base::m_initialActivationValue; using Base::m_delayedActivation; using Base::m_timeStep;
using Base::m_initialActivationValue; using Base::m_delayedValue; using Base::m_timeStep;
// -----------------------------------------------------------------------
// PastValueNode (input) -- delay node

View file

@@ -67,7 +67,7 @@ speechTrain = [
// LSTM cell
# TODO: This is temporary test code for the new ShiftNode (until we switch PastValue() itself over)
PastValueShift(dimDummy, input) = Shift(input, /*fromOffsets=*/-1, /*boundaryValue=*/Constant(0.1), dim=-1, offsetRanges=1, multiOffsetDim=2)
PastValueShift(dimDummy, input) = Shift(input, /*fromOffsets=*/-1, /*boundaryValue=*/Constant(0.1), dim=-1, numSteps=1, insertedDim=2)
PastValue1 = PastValue
#PastValue1 = PastValueShift
dh = PastValue1(outputDim, output); // hidden state(t-1)