Multithreaded GPU code check-in; no speedup yet, though, because kernel launches still go through the shared stream.

Guoli Ye 2020-02-20 22:34:44 -08:00
Parent 25f4191186
Commit 1fc15ed105
22 changed files with 2376 additions and 748 deletions
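The "no speedup" in the commit message is what one would expect when every kernel launch goes through a single shared stream (t_stream in this code): work enqueued from different host threads is still serialized on the device. A minimal, hypothetical sketch in plain CUDA (made-up kernel and buffer names, not the CNTK launch path) contrasting a shared stream with per-thread streams:

#include <cuda_runtime.h>

__global__ void scaleKernel(float* p, int n) // hypothetical kernel
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        p[i] *= 2.0f;
}

// Every host thread enqueues onto the same stream: the GPU executes the
// kernels back to back, so host-side multithreading buys no overlap.
void launchOnSharedStream(cudaStream_t sharedStream, float* d_buf, int n)
{
    scaleKernel<<<(n + 255) / 256, 256, 0, sharedStream>>>(d_buf, n);
}

// Each host thread owns its own non-blocking stream: independent kernels
// may overlap on the device, subject to occupancy.
void launchOnPrivateStream(float* d_buf, int n)
{
    cudaStream_t s;
    cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking);
    scaleKernel<<<(n + 255) / 256, 256, 0, s>>>(d_buf, n);
    cudaStreamSynchronize(s);
    cudaStreamDestroy(s);
}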

The file diff is not shown because it is too large.

View file

@ -75,6 +75,9 @@ void ComputationNetwork::CopySubTree(const ComputationNetwork& fromNet,
LogicError("CopySubTree: you cannot copy a tree without copying the node values.");
ComputationNodeBasePtr fromRoot = fromNet.GetNodeFromName(fromName);
std::set<wstring> set_cur_tonode_names;
set_cur_tonode_names.clear();
if (!fromNet.EvalOrderExists(fromRoot))
const_cast<ComputationNetwork&>(fromNet).FormEvalOrder(fromRoot);
@ -84,20 +87,66 @@ void ComputationNetwork::CopySubTree(const ComputationNetwork& fromNet,
wstring fromNodeName = fromNode->NodeName();
wstring toNodeName = toNamePrefix + fromNodeName;
// fprintf(stderr, "CopySubTree node = %ls\n", fromNodeName.c_str());
ComputationNodeBasePtr toNode = CopyNode(fromNet, fromNodeName,
toNodeName,
CopyNodeFlags::copyNodeValue);
set_cur_tonode_names.insert(toNodeName);
ComputationNodeBasePtr fromNodeTmp;
fromNodeTmp = fromNet.GetNodeFromName(fromNodeName);
/* fprintf(stderr, "CopySubTree node = %ls, numrows = %d, numcols = %d \n", fromNodeName.c_str(),
int((&dynamic_pointer_cast<ComputationNode<float>>(fromNodeTmp)->Value())->GetNumRows()),
int((&dynamic_pointer_cast<ComputationNode<float>>(fromNodeTmp)->Value())->GetNumCols())
); */
if (flags & CopyNodeFlags::copyNodeInputLinks)
{
// copy the children structure but use the new nodes generated
for (int i = 0; i < fromNode->GetNumInputs(); i++)
toNode->SetInput(i, GetNodeFromName(toNamePrefix + fromNode->GetInputs()[i]->NodeName()));
{
wstring inputNodeName;
inputNodeName = (toNamePrefix + fromNode->GetInputs()[i]->NodeName());
if (set_cur_tonode_names.find(inputNodeName) == set_cur_tonode_names.end() )
{
// the eval order can be wrong here, so the input may not have been copied into the target network yet; copy it now
// fprintf(stderr, "CopySubTree i = %d, missing node = %ls\n", i, inputNodeName.c_str());
CopyNode(fromNet, fromNode->GetInputs()[i]->NodeName(),
inputNodeName,
CopyNodeFlags::copyNodeValue);
set_cur_tonode_names.insert(inputNodeName);
}
//fprintf(stderr, "CopySubTree i = %d, node = %ls\n", i, toNodeName.c_str());
//toNode->SetInput(i, GetNodeFromName(toNamePrefix + fromNode->GetInputs()[i]->NodeName()));
toNode->SetInput(i, GetNodeFromName(inputNodeName));
}
}
}
}
// print the dimensions of each node's value matrix, in the eval order rooted at fromName
void ComputationNetwork::ShowNodeMemory(const ComputationNetwork& fromNet,
const std::wstring fromName)
{
ComputationNodeBasePtr fromRoot = fromNet.GetNodeFromName(fromName);
if (!fromNet.EvalOrderExists(fromRoot))
const_cast<ComputationNetwork&>(fromNet).FormEvalOrder(fromRoot);
for (const auto& fromNode : fromNet.GetEvalOrder(fromRoot)) // BUGBUG: This probably will fail because the precomputed eval orders are invalid at this point.
{
wstring fromNodeName = fromNode->NodeName();
ComputationNodeBasePtr fromNodeTmp;
fromNodeTmp = fromNet.GetNodeFromName(fromNodeName);
fprintf(stderr, "ShowNodeMemory node = %ls, numrows = %d, numcols = %d \n", fromNodeName.c_str(),
int((&dynamic_pointer_cast<ComputationNode<float>>(fromNodeTmp)->Value())->GetNumRows()),
int((&dynamic_pointer_cast<ComputationNode<float>>(fromNodeTmp)->Value())->GetNumCols()));
}
}
// you can only copy inputs from nodes in the same network
void ComputationNetwork::CopyInputs(const std::wstring fromName, std::wstring toName)
{
CopyNode(*this, fromName, toName, CopyNodeFlags::copyNodeInputLinks);

View file

@ -13,7 +13,7 @@
#include "MatrixPool.h"
#include "ComputationEnvironment.h"
#include "Globals.h"
#include <cuda_runtime.h>
#include <unordered_set>
#include <map>
#include <string>
@ -233,7 +233,6 @@ struct ComputationNetworkOwnedNodeState
return m_parentGradientOptimization == ParentGradientOptimization::Reuse;
}
virtual void MarkValueNonSharable()
{
m_valueSharable = false;
@ -351,6 +350,8 @@ class ComputationNodeBase : public IComputationNode,
public:
typedef shared_ptr<ComputationNodeBase> ComputationNodeBasePtr;
// -----------------------------------------------------------------------
// constructors, copying, (de-)serialization
// -----------------------------------------------------------------------

View file

@ -26,7 +26,12 @@
#include <boost/algorithm/string/join.hpp>
#include <boost/range/adaptor/transformed.hpp>
namespace Microsoft { namespace MSR { namespace CNTK {
namespace Microsoft
{
namespace MSR
{
namespace CNTK
{
// -----------------------------------------------------------------------
// ReduceElements (op, axis=, input)
@ -39,10 +44,10 @@ template <class ElemType>
if (flags & CopyNodeFlags::copyNodeValue)
{
auto node = dynamic_pointer_cast<ReduceElementsNode<ElemType>>(nodeP);
node->m_axes = m_axes;
node->m_operation = m_operation;
node->m_axes = m_axes;
node->m_operation = m_operation;
node->m_reductionOp = m_reductionOp;
node->m_scale = m_scale;
node->m_scale = m_scale;
node->m_keepDimensions = m_keepDimensions;
}
}
@ -57,7 +62,7 @@ template <class ElemType>
fstream >> num_axes;
for (int i = 0; i < num_axes; ++i)
{
int axis;
int axis;
fstream >> axis;
m_axes.push_back(axis);
}
@ -123,7 +128,7 @@ template <class ElemType>
if (ReduceSequenceAxis())
{
ElemType gapPadValue = NeutralValue(m_reductionOp);
input = ComputationNode<ElemType>::Unpack(GetSampleLayout(), InputRef(0).Value(), InputRef(0).GetMBLayout(), m_tempUnpackedData, m_tempScatterIndices, m_tempMask, /*batchMajor=*/ true, &gapPadValue);
input = ComputationNode<ElemType>::Unpack(GetSampleLayout(), InputRef(0).Value(), InputRef(0).GetMBLayout(), m_tempUnpackedData, m_tempScatterIndices, m_tempMask, /*batchMajor=*/true, &gapPadValue);
}
else
input = InputRef(0).ValueTensorFor(rank, frInput);
@ -142,7 +147,24 @@ template <class ElemType>
default:
// the actual operation is a Copy with reduction, where the magic is in the reduction op
// For "Mean", m_scale is 1/#elements, and 1 otherwise.
result.DoUnaryOpOf(0, input, m_scale, ElementWiseOperator::opCopy, m_reductionOp);
{
/*
if (num_frames == 94)
{
fprintf(stderr, "reshape 1, name = %ls, result = %f, input = %f, m_scale = %f \n", NodeName().c_str(), double(result.GetSOB().FrobeniusNorm()), double(input.GetSOB().FrobeniusNorm()), double(m_scale));
}
*/
if (multi_thread)
result.DoUnaryOpOfDebug(0, input, m_scale, ElementWiseOperator::opCopy, m_reductionOp);
else
result.DoUnaryOpOf(0, input, m_scale, ElementWiseOperator::opCopy, m_reductionOp);
/*
if (num_frames == 94)
{
fprintf(stderr, "reshape 2, name = %ls, result = %f, input = %f, m_scale = %f \n", NodeName().c_str(), double(result.GetSOB().FrobeniusNorm()), double(input.GetSOB().FrobeniusNorm()), double(m_scale));
}
*/
}
}
}
@ -156,15 +178,15 @@ template <class ElemType>
{
// Broadcast along the sequence
auto result = ValueFor(fr);
ComputationNode<ElemType>::BroadcastToPacked(Gradient(), GetMBLayout(), /*beta =*/ accumulateGradient ? (ElemType)1 : (ElemType)0, InputRef(0).Gradient(), FrameRange(InputRef(0).GetMBLayout()), m_tempGatherIndices);
ComputationNode<ElemType>::BroadcastToPacked(Gradient(), GetMBLayout(), /*beta =*/accumulateGradient ? (ElemType) 1 : (ElemType) 0, InputRef(0).Gradient(), FrameRange(InputRef(0).GetMBLayout()), m_tempGatherIndices);
}
else
{
const auto frInput = (ReduceAllAxes() || ReduceBatchAxis()) ? FrameRange(InputRef(0).GetMBLayout()) : fr; // can't use 'fr' for ReduceAllAxes() as it refers to the result (same as for training criteria)
// get the args
// get the args
size_t rank = DetermineElementwiseTensorRank();
auto sliceOutputGrad = ReduceAllAxes() ? TensorView<ElemType>(GradientPtr(), GetSampleLayout()) : GradientTensorFor(rank, fr); // propagate from this one...
auto sliceInputGrad = InputRef(0).GradientTensorFor(rank, frInput); // ...to this one
auto sliceInputGrad = InputRef(0).GradientTensorFor(rank, frInput); // ...to this one
// gradients are not as simple as passing an op-code, unfortunately
switch (m_reductionOp)
@ -220,8 +242,8 @@ template <class ElemType>
break;
case ElementWiseOperator::opElementwiseProduct:
{
auto input = InputRef(inputIndex).ValueTensorFor(rank, frInput);
auto output = ValueTensorFor(rank, fr.AllowBroadcast());
auto input = InputRef(inputIndex).ValueTensorFor(rank, frInput);
auto output = ValueTensorFor(rank, fr.AllowBroadcast());
if (accumulateGradient)
sliceInputGrad.AddElementwiseProductWithQuotientOf(sliceOutputGrad, output, input);
else
@ -242,13 +264,20 @@ template <class ElemType>
{
switch (m_reductionOp)
{
case ElementWiseOperator::opSum: return false;
case ElementWiseOperator::opLogSum: return true;
case ElementWiseOperator::opMin: return true;
case ElementWiseOperator::opMax: return true;
case ElementWiseOperator::opElementwiseProduct: return true;
case ElementWiseOperator::opArgmin: return false;
case ElementWiseOperator::opArgmax: return false;
case ElementWiseOperator::opSum:
return false;
case ElementWiseOperator::opLogSum:
return true;
case ElementWiseOperator::opMin:
return true;
case ElementWiseOperator::opMax:
return true;
case ElementWiseOperator::opElementwiseProduct:
return true;
case ElementWiseOperator::opArgmin:
return false;
case ElementWiseOperator::opArgmax:
return false;
}
LogicError("Should not get here.");
}
@ -258,13 +287,20 @@ template <class ElemType>
{
switch (m_reductionOp)
{
case ElementWiseOperator::opSum: return false;
case ElementWiseOperator::opLogSum: return true;
case ElementWiseOperator::opMin: return true;
case ElementWiseOperator::opMax: return true;
case ElementWiseOperator::opElementwiseProduct: return true;
case ElementWiseOperator::opArgmin: return false;
case ElementWiseOperator::opArgmax: return false;
case ElementWiseOperator::opSum:
return false;
case ElementWiseOperator::opLogSum:
return true;
case ElementWiseOperator::opMin:
return true;
case ElementWiseOperator::opMax:
return true;
case ElementWiseOperator::opElementwiseProduct:
return true;
case ElementWiseOperator::opArgmin:
return false;
case ElementWiseOperator::opArgmax:
return false;
}
LogicError("Should not get here.");
}
@ -282,7 +318,7 @@ template <class ElemType>
// validate the opcode (in case we got instantiated empty and never updated)
ValidateOp();
m_scale = (ElemType)1;
m_scale = (ElemType) 1;
if (ReduceAllAxes())
Base::ValidateUnaryReduce(isFinalValidationPass, m_keepDimensions);
else if (ReduceSequenceAxis())
@ -319,29 +355,27 @@ template <class ElemType>
let shape = Input(0)->GetSampleLayout();
auto dims = shape.GetDims();
size_t reducedDimProd = 1;
size_t reducedDimProd = 1;
if (ReduceAllStaticAxes())
{
reducedDimProd = shape.GetNumElements();
dims = m_keepDimensions ? SmallVector<size_t>(shape.GetRank(), 1) : (Environment().IsV2Library() ? SmallVector<size_t>({}) : SmallVector<size_t>({ 1 })); // entire sample is reduced to a scalar
dims = m_keepDimensions ? SmallVector<size_t>(shape.GetRank(), 1) : (Environment().IsV2Library() ? SmallVector<size_t>({}) : SmallVector<size_t>({1})); // entire sample is reduced to a scalar
}
else if (!m_axes.empty()
&& std::all_of(m_axes.begin(),
m_axes.end(),
[&dims](int axis) { return axis - 1 >= 0 && axis - 1 < dims.size(); }))
else if (!m_axes.empty() && std::all_of(m_axes.begin(),
m_axes.end(),
[&dims](int axis) { return axis - 1 >= 0 && axis - 1 < dims.size(); }))
{
//Accumulate the number of elements for reduce_mean
reducedDimProd = std::accumulate(m_axes.begin(),
m_axes.end(),
1,
[&dims](size_t acc, int& axis) { return acc * dims[axis - 1]; });
m_axes.end(),
1,
[&dims](size_t acc, int& axis) { return acc * dims[axis - 1]; });
// axes reduced to a scalar
if (m_keepDimensions)
std::for_each(m_axes.begin(),
m_axes.end(),
[&dims](int axis) {dims[axis - 1] = 1; }
);
m_axes.end(),
[&dims](int axis) { dims[axis - 1] = 1; });
else
{
SmallVector<size_t> reducedDims(dims.size() - m_axes.size());
@ -355,13 +389,12 @@ template <class ElemType>
dims = reducedDims;
}
}
else if (isFinalValidationPass)
else if (isFinalValidationPass)
{
InvalidArgument("The shape of %ls [%ls] can not be reduced along axes [%ls]",
NodeDescription().c_str(),
wstring(shape).c_str(),
boost::algorithm::join(m_axes | boost::adaptors::transformed([](int axis) { return std::to_wstring(axis); }), ", ").c_str()
);
NodeDescription().c_str(),
wstring(shape).c_str(),
boost::algorithm::join(m_axes | boost::adaptors::transformed([](int axis) { return std::to_wstring(axis); }), ", ").c_str());
}
// for "Mean", we must divide by #elements
if (isFinalValidationPass && IsMean())
@ -384,10 +417,14 @@ struct SequenceLengthVector
{
typedef vector<vector<size_t>> SequenceVector;
typedef MBLayout::SequenceInfo SequenceInfo;
const SequenceVector& m_sequenceVector; // vector of sequences (to get sequence length)
const vector<SequenceInfo>& m_sequenceInfo; // original sequence info (for seqId)
SequenceLengthVector(const vector<SequenceInfo>& sequenceInfo, const SequenceVector& sequenceVector) : m_sequenceInfo(sequenceInfo), m_sequenceVector(sequenceVector) { }
size_t size() const { return m_sequenceInfo.size(); }
const SequenceVector& m_sequenceVector; // vector of sequences (to get sequence length)
const vector<SequenceInfo>& m_sequenceInfo; // original sequence info (for seqId)
SequenceLengthVector(const vector<SequenceInfo>& sequenceInfo, const SequenceVector& sequenceVector)
: m_sequenceInfo(sequenceInfo), m_sequenceVector(sequenceVector) {}
size_t size() const
{
return m_sequenceInfo.size();
}
MBLayout::SequenceInfo operator[](size_t i) const // return a descriptor of the new sequence
{
SequenceInfo seq;
@ -427,12 +464,12 @@ template <class ElemType>
for (size_t t = 0; t < seq.GetNumTimeSteps(); t++)
{
double delta = input(0, inMBLayout->GetColumnIndex(seq, t)); // how many frames the current time step should expand into
desiredCount += delta; // this is now how many frames we should have
desiredCount += delta; // this is now how many frames we should have
// use a margin against round-off errors, so that we get non-binary ratios like 1/3 and 1/5 right
// This really means generate a frame if too few, unless we are within machine accuracy of the target.
// The assumption is that the delta has this error, while accumulation (in double) has no error.
ElemType relativeMargin = 1 - std::numeric_limits<ElemType>::epsilon();
while ((indexSequence.empty() && desiredCount > 0) // no margin for the first frame (always include unless flag is 0)
while ((indexSequence.empty() && desiredCount > 0) // no margin for the first frame (always include unless flag is 0)
|| indexSequence.size() < desiredCount * relativeMargin)
indexSequence.push_back(t);
}
@ -441,10 +478,10 @@ template <class ElemType>
input.CollapseDataLocation(); // BUGBUG: Move back, since BOTH state is broken at present.
// create a new MBLayout
let& outMBLayout = GetMBLayout();
outMBLayout->InitAsPackedSequences(SequenceLengthVector(sequences, indexSequences), /*temp*/m_placementBuffer, /*temp*/m_rowAllocationsBuffer);
outMBLayout->InitAsPackedSequences(SequenceLengthVector(sequences, indexSequences), /*temp*/ m_placementBuffer, /*temp*/ m_rowAllocationsBuffer);
// copy to output
vector<ElemType> buf(outMBLayout->GetNumCols(), numeric_limits<ElemType>::quiet_NaN()); // STL cannot easily avoid initializing, so we might as well init with NaN for gaps
let size = min(sequences.size(), outMBLayout->GetAllSequences().size()); // no non-gap sequence has an index beyond this
let size = min(sequences.size(), outMBLayout->GetAllSequences().size()); // no non-gap sequence has an index beyond this
for (size_t i = 0; i < size; i++)
{
let& seq = outMBLayout->GetAllSequences()[i];
@ -452,7 +489,7 @@ template <class ElemType>
continue;
let& indexSequence = indexSequences[i];
for (size_t t = 0; t < seq.GetNumTimeSteps(); t++)
buf[outMBLayout->GetColumnIndex(seq, t)] = (ElemType)indexSequence[t];
buf[outMBLayout->GetColumnIndex(seq, t)] = (ElemType) indexSequence[t];
}
// there may be dangling gaps at the end. Take the opportunity to verify this.
for (size_t i = size; i < sequences.size(); i++)
@ -460,7 +497,7 @@ template <class ElemType>
for (size_t i = size; i < outMBLayout->GetAllSequences().size(); i++)
assert(outMBLayout->GetAllSequences()[i].seqId == GAP_SEQUENCE_ID);
// the result will be kept in CPUDEVICE, since most likely we will access it again in PackedIndexNode
Value().TransferToDeviceIfNotThere(CPUDEVICE, /*isBeingMoved=*/ true, /*emptyTransfer=*/ true, /*updatePreferredDevice=*/ true);
Value().TransferToDeviceIfNotThere(CPUDEVICE, /*isBeingMoved=*/true, /*emptyTransfer=*/true, /*updatePreferredDevice=*/true);
Value().SetValue(1, outMBLayout->GetNumCols(), CPUDEVICE, buf.data(), MatrixFormat::matrixFormatColMajor);
}
@ -501,9 +538,9 @@ template <class ElemType>
/*virtual*/ void PackedIndexNode<ElemType>::ForwardPropNonLooping() /*override*/
{
let& sourceMBLayout = InputRef(SOURCEDATA).GetMBLayout(); // only used for index conversion
let& indexMBLayout = InputRef(INDEXDATA).GetMBLayout();
let& index = InputRef(INDEXDATA).Value(); // per-seq index values that are to be mapped
auto& result = Value(); // packed index values as mapped to sourceData's layout
let& indexMBLayout = InputRef(INDEXDATA).GetMBLayout();
let& index = InputRef(INDEXDATA).Value(); // per-seq index values that are to be mapped
auto& result = Value(); // packed index values as mapped to sourceData's layout
// loop over sourceSequences
// Input matrix contains time indices for each sequence that refer to frames inside that sequence.
// We replace every per-sequence index by the resolved column index w.r.t. the same MBLayout.
@ -516,10 +553,10 @@ template <class ElemType>
let& indexSeq = indexMBLayout->FindMatchingSequence(sourceSequences, i); // find corresponding entry in indexMBLayout
for (size_t tIndex = 0; tIndex < indexSeq.GetNumTimeSteps(); tIndex++) // map all index values in index sequence
{
let jIndex = indexMBLayout->GetColumnIndex(indexSeq, tIndex); // map time index to actual location in the matrix storage object
let tSource = (size_t)index(0, jIndex); // the new time location (relative to source sequence)
let jIndex = indexMBLayout->GetColumnIndex(indexSeq, tIndex); // map time index to actual location in the matrix storage object
let tSource = (size_t) index(0, jIndex); // the new time location (relative to source sequence)
let jSource = sourceMBLayout->GetColumnIndex(sourceSeq, tSource); // map new time index as well. This performs a range check.
result(0, jIndex) = (ElemType)jSource;
result(0, jIndex) = (ElemType) jSource;
}
}
// Note: maybe this is no longer needed, now that we do the same inside UpdateFunctionValueSize() for all nodes.
@ -563,8 +600,8 @@ template <class ElemType>
/*virtual*/ void GatherPackedNode<ElemType>::ForwardPropNonLooping() /*override*/
{
InputRef(INDEXDATA).MaskMissingValueColumnsTo(FrameRange(InputRef(INDEXDATA).GetMBLayout()), -1); // indicates an invalid column to Gather/Scatter
let& index = InputRef(INDEXDATA) .Value(); // column indices to copy from
let& source = InputRef(SOURCEDATA).Value(); // source data to copy
let& index = InputRef(INDEXDATA).Value(); // column indices to copy from
let& source = InputRef(SOURCEDATA).Value(); // source data to copy
#ifdef _MSC_VER
auto& outputValuePtrRef = ValuePtrRef();
@ -587,9 +624,9 @@ template <class ElemType>
{
if (inputIndex == SOURCEDATA)
{
let& index = InputRef(INDEXDATA) .Value(); // column indices to copy from
let& index = InputRef(INDEXDATA).Value(); // column indices to copy from
auto& sourceGradient = InputRef(SOURCEDATA).Gradient(); // source to propagate the gradient input
auto& outputGradient = Gradient(); // output gradient to propagate
auto& outputGradient = Gradient(); // output gradient to propagate
sourceGradient.DoScatterColumnsOf(/*beta=*/1, index, outputGradient, /*alpha=*/1, true);
}
}
@ -614,7 +651,7 @@ template <class ElemType>
SetDims(Input(SOURCEDATA)->GetSampleLayout(), HasMBLayout());
else
{
SmallVector<size_t> layout = { 1 }; // Scalar
SmallVector<size_t> layout = {1}; // Scalar
if (Input(SOURCEDATA)->GetSampleLayout().GetRank() > 1)
{
auto srcLayout = Input(SOURCEDATA)->GetSampleLayout().GetDims();
@ -638,8 +675,8 @@ template <class ElemType>
if (*InputRef(INDEXDATA).GetMBLayout() != *InputRef(SOURCEDATA).GetMBLayout())
InvalidArgument("%ls %ls operation requires the minibatch layout of index and source data to be the same.", NodeName().c_str(), OperationName().c_str());
InputRef(INDEXDATA).MaskMissingValueColumnsTo(FrameRange(InputRef(INDEXDATA).GetMBLayout()), -1); // indicates an invalid column to Gather/Scatter
let& index = InputRef(INDEXDATA) .Value(); // column indices to copy from
let& source = InputRef(SOURCEDATA).Value(); // source data to copy
let& index = InputRef(INDEXDATA).Value(); // column indices to copy from
let& source = InputRef(SOURCEDATA).Value(); // source data to copy
#ifdef _MSC_VER
auto& outputValuePtrRef = ValuePtrRef();
@ -653,7 +690,7 @@ template <class ElemType>
source.GetMatrixType(),
source.GetFormat());
auto& output = Value(); // output goes here
auto& output = Value(); // output goes here
output.DoScatterColumnsOf(/*beta=*/0, index, source, /*alpha=*/1, true);
}
@ -662,9 +699,9 @@ template <class ElemType>
{
if (inputIndex == SOURCEDATA)
{
let& index = InputRef(INDEXDATA).Value(); // column indices to copy from
let& index = InputRef(INDEXDATA).Value(); // column indices to copy from
auto& sourceGradient = Input(SOURCEDATA)->Gradient(); // source to propagate the gradient input
auto& outputGradient = Gradient(); // output gradient to propagate
auto& outputGradient = Gradient(); // output gradient to propagate
sourceGradient.DoGatherColumnsOf(/*beta=*/1, index, outputGradient, /*alpha=*/1);
}
}
@ -707,8 +744,8 @@ template <class ElemType>
CropNode<ElemType>::CropNode(size_t offsetX, size_t offsetY, DEVICEID_TYPE deviceId, const wstring& name)
: CropNode(deviceId, name)
{
m_xOffset = (double)(offsetX);
m_yOffset = (double)(offsetY);
m_xOffset = (double) (offsetX);
m_yOffset = (double) (offsetY);
}
template <class ElemType>
@ -880,8 +917,7 @@ void CropNode<ElemType>::ComputeCropOffsets()
// nodeToTransformMap contains coordinate maps for all nodes traversed so far, and is updated by this function.
// Traversal stack contains all nodes traversed so far. Inputs of currNode are pushed to traversal stack so that their
// inputs can be processed later on.
auto ProcessInputs = [](ComputationNodeBase* currNode, stack<ComputationNodeBase*>& traversalStack, unordered_map<ComputationNodeBase*, SpaceTransform>& nodeToTransformMap)
{
auto ProcessInputs = [](ComputationNodeBase* currNode, stack<ComputationNodeBase*>& traversalStack, unordered_map<ComputationNodeBase*, SpaceTransform>& nodeToTransformMap) {
if (!currNode->Is<TransformerNode>())
RuntimeError("Node does not support affine transform for cropping.");
@ -1033,4 +1069,6 @@ template class CropNode<float>;
template class CropNode<double>;
template class CropNode<half>;
}}}
} // namespace CNTK
} // namespace MSR
} // namespace Microsoft

The file diff is not shown because it is too large.

View file

@ -311,6 +311,7 @@ public:
CPUMatrix<ElemType>& InplaceSoftThreshold(const ElemType threshold);
CPUMatrix<ElemType>& SetToZeroIfAbsLessThan(const ElemType threshold);
CPUMatrix<ElemType>& SetToZeroIfLessThan(const ElemType threshold);
ElemType SumOfAbsElements() const; // sum of all abs(elements)
ElemType SumOfElements() const; // sum of all elements

View file

@ -3215,6 +3215,24 @@ CPUMatrix<ElemType>& CPUMatrix<ElemType>::SetToZeroIfAbsLessThan(const ElemType
return *this;
}
template <class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::SetToZeroIfLessThan(const ElemType threshold)
{
if (IsEmpty())
LogicError("SetToZeroIfLessThan: Matrix is empty.");
auto& us = *this;
#pragma omp parallel for
foreach_coord (i, j, us)
{
if ((us(i, j)) < threshold)
us(i, j) = 0;
}
return *this;
}
//sum of all abs(elements)
template <class ElemType>
ElemType CPUMatrix<ElemType>::SumOfAbsElements() const

View file

@ -2486,6 +2486,18 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::SetToZeroIfAbsLessThan(const ElemType
return *this;
}
template <class ElemType>
GPUMatrix<ElemType>& GPUMatrix<ElemType>::SetToZeroIfLessThan(const ElemType threshold)
{
if (IsEmpty())
LogicError("SetToZeroIfLessThan: Matrix is empty.");
CUDA_LONG N = (CUDA_LONG) GetNumElements();
int blocksPerGrid = (int) ceil(N * 1.0 / GridDim::maxThreadsPerBlock);
PrepareDevice();
SyncGuard syncGuard;
_setToZeroIfLessThan<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(Data(), threshold, N);
return *this;
}
template <class ElemType>
ElemType GPUMatrix<ElemType>::SumOfAbsElements() const
{
@ -5318,6 +5330,119 @@ void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a,
return TensorOpN<ElemType, 2>(beta, array<ElemType*, 2>{a.Data(), Data()}, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
// perform unary operation 'op' on a giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
// This binds the N-ariness to a template parameter N, and gets the data pointers out from the matrix objects.
template <class ElemType>
void GPUMatrix<ElemType>::TensorOpDebug(ElemType beta, const GPUMatrix<ElemType>& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
{
if (reductionOp != ElementWiseOperator::opSum &&
reductionOp != ElementWiseOperator::opLogSum &&
reductionOp != ElementWiseOperator::opMin &&
reductionOp != ElementWiseOperator::opMax &&
reductionOp != ElementWiseOperator::opElementwiseProduct)
InvalidArgument("TensorOp: Unary reduction operations other than opMax, opMin, opSum, and opLogSum are not implemented.");
a.PrepareDevice();
if (a.GetComputeDeviceId() != GetComputeDeviceId())
InvalidArgument("All matrices must be on the same GPU");
// special case: linear processing
// The case statement has measurable impact for unary ops (but not for binary ops it seems, due to double mem access).
// Linear gap-free unary ops happen so regularly that we will eliminate the case statement from the CUDA kernel, and instead expand all.
if (regularOpDims.size() == 1 && regularStrides[0][0] == 1 && regularStrides[1][0] == 1 && reducingOpDims.size() == 0)
{
// special case: for copy, use cudaMemcpy() instead, or cublas_axpy()
// TODO: We should observe if these actually make a speed difference, and if not, remove these special cases.
if (op == ElementWiseOperator::opCopy && beta == 0 && alpha == 1)
{
//fprintf(stderr, "TensorOpDebug 1 \n");
return CUDA_CALL(cudaMemcpy(Data() + offsets[1], a.Data() + offsets[0], sizeof(ElemType) * regularOpDims[0], cudaMemcpyDeviceToDevice));
}
else if (op == ElementWiseOperator::opCopy && beta == 1)
{
//fprintf(stderr, "TensorOpDebug 2 \n");
return CUBLAS_CALL(cublasaxpyHelper(GetCublasHandle(GetComputeDeviceId()), (int) regularOpDims[0], &alpha, a.Data() + offsets[0], 1, Data() + offsets[1], 1));
}
else
{
//fprintf(stderr, "TensorOpDebug 3 \n");
return LaunchUnaryTensorOp<ElemType>(beta, a.Data() + offsets[0], Data() + offsets[1], alpha, op, regularOpDims[0]);
}
}
// special case: sum-reducing a matrix onto a column vector; can be done with SGEMM
// Note: A minor risk is that with this, our own reduction function will rarely be used.
// That function was tested to give the same results with 'double', and nearly the same with 'float' (different summation order matters).
else if (op == ElementWiseOperator::opCopy && // we are just adding to target without any further operation
reductionOp == ElementWiseOperator::opSum &&
#ifdef _DEBUG
sizeof(ElemType) == sizeof(float) && // in debug don't shortcut 'double' so we have some test of our own codepath
#endif
regularOpDims.size() == 1 && regularStrides[0][0] == 1 && regularStrides[1][0] == 1 && // we are processing a column
reducingOpDims.size() == 1 && reducingStrides[0][0] >= (ptrdiff_t) regularOpDims[0]) // reducing across columns and no overlap
{
assert(reducingStrides[1][0] == 0);
auto ARows = regularOpDims[0]; // vertical steps
auto ACols = reducingOpDims[0]; // horizontal steps (reduction)
auto ALd = reducingStrides[0][0]; // horizontal step width through matrix
cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId());
CUBLAS_CALL(cublasgemmHelper(cuHandle, CUBLAS_OP_N, CUBLAS_OP_N, (int) /*CRows=*/ARows, /*CCols=*/1, (int) ACols, &alpha,
/*A00=*/a.Data() + offsets[0], (int) ALd,
/*B00=*/GetOnesVector<ElemType>(ACols, a.GetComputeDeviceId())->Data(), (int) /*BRows=*/ACols, &beta,
/*C00=*/Data() + offsets[1], (int) /*CRows=*/ARows));
//fprintf(stderr, "TensorOpDebug 4 \n");
return;
}
// TODO: Add a special case for tensor bias reduction. cudnn is ~7% faster on Image/QuickE2E.
// regular case
else
{
/*
fprintf(stderr, "TensorOpDebug 5 \n");
for (size_t i = 0; i < 2; i++)
{
// fprintf(stderr, "i = %d, offsets = %d, regularStrides = %d, reducingStrides = %d \n ", int(i), int(offsets[i]), int(regularStrides[i]), int(reducingStrides[i]));
fprintf(stderr, "TensorOpDebug 5.1, i = %d, offsets = %d\n ", int(i), int(offsets[i]));
}
fprintf(stderr, "TensorOpDebug 5.1, regularOpDims.size() = %d \n", int(regularOpDims.size()));
for (size_t i = 0; i < regularOpDims.size(); i++)
fprintf(stderr, "TensorOpDebug 5.1, i = %d, regularOpDims = %d\n ", int(i), int(regularOpDims[i]));
fprintf(stderr, "TensorOpDebug 5.1, reducingOpDims.size() = %d \n", int(reducingOpDims.size()));
for (size_t i = 0; i < reducingOpDims.size(); i++)
fprintf(stderr, "TensorOpDebug 5.1, i = %d, reducingOpDims = %d\n ", int(i), int(reducingOpDims[i]));
for (size_t i = 0; i < 2; i++)
{
fprintf(stderr, "TensorOpDebug 5.1, i = %d, regularStrides.size() = %d \n", int(i), int(regularStrides[i].size()));
for (size_t j = 0; j < regularStrides[i].size(); j++)
{
fprintf(stderr, "TensorOpDebug 5.1, i = %d, j = %d, regularStrides = %d \n ", int(i), int(j), int(regularStrides[i][j]));
}
fprintf(stderr, "TensorOpDebug 5.1, i = %d, reducingStrides.size() = %d \n", int(i), int(reducingStrides[i].size()));
for (size_t j = 0; j < reducingStrides[i].size(); j++)
{
fprintf(stderr, "TensorOpDebug 5.1, i = %d, j = %d, reducingStrides = %d \n ", int(i), int(j), int(reducingStrides[i][j]));
}
}
fprintf(stderr, "TensorOpDebug 5.2, beta = %f, alpha = %f, a.data = %f, data = %f \n", double(beta), double(alpha), double(a.FrobeniusNorm()), double(FrobeniusNorm()));
// return TensorOpN<ElemType, 2>(beta, array<ElemType*, 2>{a.Data(), Data()}, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
*/
return TensorOpNDebug<ElemType, 2>(beta, array<ElemType*, 2>{a.Data(), Data()}, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides, a, *this);
}
}
// perform binary operation 'op' on a and b giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
template <class ElemType>
void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,

View file

@ -437,6 +437,7 @@ public:
GPUMatrix<ElemType>& InplaceSoftThreshold(const ElemType threshold);
GPUMatrix<ElemType>& SetToZeroIfAbsLessThan(const ElemType threshold);
GPUMatrix<ElemType>& SetToZeroIfLessThan(const ElemType threshold);
DeviceBoundNumber<ElemType> Sum_AsDeviceBoundNum() const;
ElemType SumOfAbsElements() const; // sum of all abs(elements)
@ -607,6 +608,11 @@ public:
const std::array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
void TensorOpDebug(ElemType beta, const GPUMatrix<ElemType>& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const std::array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
void TensorOp(ElemType beta, const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const std::array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 3>& regularStrides,

View file

@ -1306,6 +1306,19 @@ __global__ void _setToZeroIfAbsLessThan(
a[id] = 0;
}
template <class ElemType>
__global__ void _setToZeroIfLessThan(
ElemType* a,
const ElemType threshold,
const CUDA_LONG N)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
if (id >= N)
return;
if (((comp_t) a[id]) < (comp_t) threshold)
a[id] = 0;
}
template <class ElemType>
__global__ void _areEqual(
const ElemType* a,

View file

@ -2987,6 +2987,19 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::SetToZeroIfAbsLessThan(con
return *this;
}
template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::SetToZeroIfLessThan(const ElemType threshold)
{
VerifyWritable(__FUNCTION__);
if (IsEmpty())
LogicError("SetToZeroIfLessThan: Matrix is empty.");
CUDA_LONG N = (CUDA_LONG) GetNumNZElements();
int blocksPerGrid = (int) ceil(N * 1.0 / GridDim::maxThreadsPerBlock);
SyncGuard syncGuard;
_setToZeroIfLessThan<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(NzValues(), threshold, N);
return *this;
}
#pragma endregion
#pragma region Helper Functions

View file

@ -395,7 +395,7 @@ public:
GPUSparseMatrix<ElemType>& AssignTruncateTopOf(const GPUSparseMatrix<ElemType>& a, const ElemType threshold);
GPUSparseMatrix<ElemType>& SetToZeroIfAbsLessThan(const ElemType threshold);
GPUSparseMatrix<ElemType>& SetToZeroIfLessThan(const ElemType threshold);
GPUSparseMatrix<ElemType>& AssignOneHot(const GPUMatrix<ElemType>& a, vector<size_t>& shape, size_t axis);
void SetDiagonalValue(const ElemType v);
void SetDiagonalValue(const GPUMatrix<ElemType>& vector);

View file

@ -875,6 +875,12 @@ static shared_ptr<ElemType> GetReductionBuffer(size_t N)
return reductionBuffersCache[deviceId];
}
// unlike GetReductionBuffer above, this allocates a fresh buffer on every call, which makes it safe for multithreaded calling in RNNT_EMBR
template <class ElemType>
static shared_ptr<ElemType> GetReductionBufferNoCache(size_t N)
{
return AllocateReductionBuffer<ElemType>(N);
}
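GetReductionBufferNoCache avoids the per-device cache used by GetReductionBuffer above: the cached buffer is shared state, so two host threads reducing concurrently on the same device would overwrite each other's partial results in it. A host-side analogy of the two policies, with hypothetical helper names (the real code allocates device memory through AllocateReductionBuffer):

#include <memory>
#include <mutex>

static std::shared_ptr<float> s_cachedBuffer; // one scratch buffer per process in this sketch
static std::mutex s_cacheMutex;

// Cached: cheap to reuse, but every caller receives the SAME scratch storage,
// so concurrent reductions would clobber each other's partial sums.
std::shared_ptr<float> getReductionBufferCached(size_t n)
{
    std::lock_guard<std::mutex> guard(s_cacheMutex);
    if (!s_cachedBuffer)
        s_cachedBuffer = std::shared_ptr<float>(new float[n], std::default_delete<float[]>());
    return s_cachedBuffer;
}

// Uncached: a fresh buffer per call, so each reduction owns its scratch memory;
// this is the property the multithreaded RNNT_EMBR path relies on.
std::shared_ptr<float> getReductionBufferNoCache(size_t n)
{
    return std::shared_ptr<float>(new float[n], std::default_delete<float[]>());
}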
// All dimensions (N-ariness, number of input dimensions K and number of reduction dimensions M) are bound to template parameters now.
template <class ElemType, C_size_t N, C_int M, C_int K>
static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> pointerVector, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
@ -1112,6 +1118,272 @@ static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> point
}
}
template <class ElemType, C_size_t N, C_int M, C_int K>
static void LaunchTensorOpWithReductionDebug(ElemType beta, array<ElemType*, N> pointerVector, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrideVectors,
const SmallVector<size_t>& reducingOpDimVector, const array<SmallVector<ptrdiff_t>, N>& reducingStrideVectors, const GPUMatrix<ElemType>& a, GPUMatrix<ElemType>& result)
{
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 1, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
// return TensorOpN<ElemType, 2>(beta, array<ElemType*, 2>{a.Data(), Data()}, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
a;      // these debug-only parameters are referenced here solely to
result; // silence unused-parameter warnings
typedef typename TypeSelector<ElemType>::comp_t ReduceElemType;
// copy all parameters to CUDA-compatible data structures
FixedArray<ElemType*, N> pointers(pointerVector);
SmallVector<C_size_t> regularOpStrideVector; // kernel needs the strides for converting thread index back to multi-dimensional tensor index
C_size_t numElements = 1;
// input divisors
SmallVector<fast_divmod> regularOpStrideDivmodVector;
for (C_size_t k = 0; k < regularOpDims.size(); k++)
{
regularOpStrideVector.push_back(numElements); // stride for dense representation of our output elements (if they were flattened)
regularOpStrideDivmodVector.push_back(fast_divmod((unsigned int) numElements));
numElements *= (C_size_t) regularOpDims[k];
}
// output divisors
SmallVector<fast_divmod> reducingOpDimDivmodVector;
C_size_t stride = 1;
for (C_size_t k = 0; k < reducingOpDimVector.size(); ++k)
{
reducingOpDimDivmodVector.push_back(fast_divmod(stride));
stride *= (C_size_t) reducingOpDimVector[k];
}
FixedArray<C_unsigned_int, K> regularOpStrides(regularOpStrideVector);
FixedMatrix<C_int, N, K> regularStrides(regularStrideVectors);
FixedArray<C_unsigned_int, M> reducingOpDims(reducingOpDimVector);
FixedMatrix<C_int, N, M> reducingStrides(reducingStrideVectors);
// reduced divisors
FixedArray<fast_divmod, K> regularOpStrideDivmod(regularOpStrideDivmodVector);
FixedArray<fast_divmod, M> reducingOpDimDivmod(reducingOpDimDivmodVector);
// launch the kernel
CUDA_LONG NN = (CUDA_LONG) numElements; // linear space identifying each individual output element
SyncGuard syncGuard;
// do some optimization for reductions
// - example: 30 GPU procs, warp size 32 --> 960 GPU cores
// - NN elements must be computed, each involving a reduction over reductionDim elements
// Cases:
// - #output elements NN >= GPU cores --> use one proc per element, do reduction in inner loop
// E.g. if >=960 elements are computed, each gets its own GPU thread.
// - reduction dimension would benefit from multiple blocks --> multiple blocks work on a single output element
// E.g.
// - gradient of adding a bias: reducing to a bias, e.g. 512-dim
// - gradient of scalar multiplication: big elementwise product reduced to a scalar (big dot product, e.g. [1024 x 1024] = 1M elements)
// - softmax in seq-2-seq attention model: reduce over length of attention window (e.g. 20)
// - summation of criterion value: scalar reduction over a few hundred or thousand samples in the minibatch
C_size_t reductionDim = 1; // number of elements to reduce over
for (C_size_t k = 0; k < reducingOpDimVector.size(); k++)
reductionDim *= (C_size_t) reducingOpDimVector[k];
GridDim grid(NN);
let& props = GridDim::GetDeviceProps();
bool disableParallelReduction = false; // (for debugging)
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 2, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
// === arg based reduction, one thread per output element
if ((reductionOp == ElementWiseOperator::opArgmax) ||
(reductionOp == ElementWiseOperator::opArgmin))
{
_launchTensorArgOp<ElemType, N, M, K><<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream>>>(
pointers, reductionOp,
regularOpStrides, regularStrides, grid.m_N,
reducingOpDims, reducingStrides,
regularOpStrideDivmod, reducingOpDimDivmod);
}
// === simple case: NN large, one thread per output element
else if (reductionDim == 1 || // no reduction
grid.m_blocksPerGrid >= props.multiProcessorCount || // enough output elements to fill all multiprocs
reductionDim * numElements <= 2 * props.warpSize || // trivial operation not worth the trouble (2* because the more complex one also needs 2 kernel launches)
disableParallelReduction || // (for debugging)
reductionDim * numElements <= props.multiProcessorCount) // recursive call from reduction below
{
// we got enough elements to generate: do one element per thread, and reduction inside
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.1, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
_launchTensorOp<ElemType, N, M, K><<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream>>>(
beta, pointers, alpha, op, reductionOp,
regularOpStrides, regularStrides, grid.m_N,
reducingOpDims, reducingStrides,
regularOpStrideDivmod, reducingOpDimDivmod);
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.2, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
}
// === optimization: simple case would not use all multiprocs
else
{
// m_blocksPerGrid can be thought of NN / 512, with appropriate rounding
// we are reducing and are underutilizing the multiprocs we have: get more parallelism by doing reduction in parallel
// If we get here, then
// - the total number of outputs to produce is < #multiprocs * warpSize, e.g. < 960
// - each output has at least two inputs, but possibly millions
// Examples:
// (a1) NN=900
// - each multiproc processes multiple elements concurrently, each reducing over its inputs inside
// - use one block per output element
// (a2) NN=30
// - same as (a1) except 30 multiprocs run only a single block each
// (a3) NN=16
// - same as (a1) except only 16 multiproc run one block
// (b1) NN=15
// - 2 blocks work together on a single output element
// (b2) NN=1 (NN < #multiprocs, e.g. NN < 30)
// - multiple blocks work together on a single output element
// - only this case requires memory, and only K * NN
// where K = blocks that work together,
// both K and NN < #multiprocs,
// and K * NN = on the order of NN, but generally a bit larger due to rounding.
// By how much do we underutilize?
// We increase #blocks by that factor by breaking reduction into that many chunks.
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.3, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
int numReductionChunks = std::max<int>(props.multiProcessorCount / NN, 1); // only >1 for NN < multiProcessorCount
// distribute NN over block X and Y
int blockXOverBy = CeilDiv(NN, props.maxGridSize[0]);
int numBlocksX = CeilDiv(NN, blockXOverBy);
int numBlocksY = CeilDiv(NN, numBlocksX);
// while block Z is for multiple blocks working together on a single output element
int numBlocksZ = numReductionChunks;
// Block dim is now:
// - X, Y: such that X*Y covers NN
// - Z: reduction chunks
// reduction goes into thread dim X
int reductionChunkSize = CeilDiv(reductionDim, numReductionChunks);
int numThreadsX = std::min<int>(reductionChunkSize, GridDim::maxThreadsPerBlock); // any that's over will be done by looping inside the kernel
// --- cases (a1) and (a2)
// This involves no reduction across blocks.
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.4, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
if (numReductionChunks == 1)
{
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.5, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
_launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, numBlocksZ), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(
beta, pointers, alpha, op, reductionOp,
regularOpStrides, regularStrides, NN,
reducingOpDims, reducingStrides, /*reductionBegin*/ 0, reductionChunkSize,
regularOpStrideDivmod, reducingOpDimDivmod);
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.6, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
}
// --- case (b)
// Reduction across blocks. This is the difficult one.
#ifndef ALLOW_ATOMIC_REDUCTION // temporarily disabled to ensure it is not causing the non-reproducability
else
{
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.7, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
// we get here if NN <= #multiprocs
assert(NN <= props.multiProcessorCount && numBlocksX == NN && numBlocksY == 1);
// dims are:
// - numBlocksZ = numReductionChunks = how many multiprocs work together to produce one output element
// - numBlocksX = NN = number of output elements
// - numThreadsX = reductionChunkSize clipped to 512; reductionChunkSize > 512 is handled by an inner for loop inside of the kernel
// we need memory for block outputs of dimension [numBlocksX x numBlocksZ]
// - total elements = NN * Floor(#multiprocs / NN) = <= #multiprocs
let reductionBufferSize = props.multiProcessorCount;
assert(reductionBufferSize >= NN * numBlocksZ);
shared_ptr<ElemType> reductionBuffer = GetReductionBufferNoCache<ElemType>(reductionBufferSize);
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.8, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
// 'pointers', 'regularOpStrides', and 'regularStrides' are set up to point to the target memory.
// We need to reroute them to point to our reductionBuffer.
// - pointer[N-1] -> replace by reductionBuffer
// - regularStrides -> replace [N-1] by regularOpStrides which already represent the NN elements for a dense memory layout
// - beta -> 0 since we write into temp memory
// - kernel must use block.z as second index into the output buffer; add (block.z * NN) to the pointer
FixedArray<ElemType*, N> pointers1 = pointers;
pointers1[N - 1] = reductionBuffer.get();
auto regularStrideVectors1 = regularStrideVectors;
for (size_t k = 0; k < regularOpStrides.size(); k++)
regularStrideVectors1[N - 1][k] = (ptrdiff_t) regularOpStrideVector[k];
FixedMatrix<C_int, N, K> regularStrides1(regularStrideVectors1);
ElemType beta1 = 0;
ElemType alpha1 = 1;
// fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.9, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
_launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, numBlocksZ), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(
beta1, pointers1, alpha1, op, reductionOp,
regularOpStrides, regularStrides1, NN,
reducingOpDims, reducingStrides, /*reductionBegin*/ 0, reductionChunkSize,
regularOpStrideDivmod, reducingOpDimDivmod);
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 3, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
#if 1
// now reduce and redistribute
// Create a new tensor task, and execute it recursively:
// - input = reductionBuffer
// - output = true output
// - op dims/strides = output elements
// - reduce dims/strides = numBlocksZ
// - op = opCopy
array<ElemType*, 2> pointerVector2{reductionBuffer.get(), pointerVector[N - 1]};
const array<SmallVector<ptrdiff_t>, 2> regularStrideVectors2{regularStrideVectors1[N - 1], regularStrideVectors[N - 1]};
const array<SmallVector<ptrdiff_t>, 2> reducingStrideVectors2{SmallVector<ptrdiff_t>{NN}, SmallVector<ptrdiff_t>{0}};
const SmallVector<size_t> reducingOpDimVector2{(size_t) numReductionChunks};
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 3.1, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
LaunchTensorOpWithReduction<ElemType, /*N=*/2, /*M=*/1, K>(
beta, pointerVector2, alpha, ElementWiseOperator::opCopy, reductionOp,
regularOpDims, regularStrideVectors2,
reducingOpDimVector2, reducingStrideVectors2);
//fprintf(stderr, "LaunchTensorOpWithReductionDebug 3.2, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm()));
// (note: ^^this will have a nested syncGuard, which is fine)
#else
_launchTensorOp<ElemType, N, M, K><<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream>>>(
beta, pointers, alpha, op, reductionOp,
regularOpStrides, regularStrides, grid.m_N,
reducingOpDims, reducingStrides);
//for (size_t z = 0; z < numBlocksZ; z++)
// _launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, 1), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(z == 0 ? beta : 1, pointers, alpha, op,
// regularOpStrides, regularStrides, NN,
// reducingOpDims, reducingStrides, reductionChunkSize * z, reductionChunkSize);
vector<ElemType> peekPartial(NN * numBlocksZ, -42);
vector<ElemType> peekFinal(NN, -42);
CUDA_CALL(cudaMemcpy(peekPartial.data(), reductionBuffer, sizeof(ElemType) * peekPartial.size(), cudaMemcpyDeviceToHost));
CUDA_CALL(cudaMemcpy(peekFinal.data(), pointers[pointers.size() - 1], sizeof(ElemType) * peekFinal.size(), cudaMemcpyDeviceToHost));
double s1 = 0, s2 = 0;
for (auto v : peekPartial)
s1 += v;
for (auto v : peekFinal)
s2 += v;
sin(1.0);
#endif
}
#else
else if (beta == 1)
{
// no need to pre-scale; just add (common for gradients)
_launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, numBlocksZ), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(beta, pointers, alpha, op, reductionOp, regularOpStrides,
regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize,
regularOpStrideDivmod, reducingOpDimDivmod);
return;
}
else
{
// We need more than one chunk, we will use atomicAdd().
// First reset/pre-multiply input; then do the remaining chunks using atomicAdd().
_launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, 1), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(beta, pointers, alpha, op, reductionOp, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize,
regularOpStrideDivmod, reducingOpDimDivmod);
// We will leave it like this for a while, but eventually need to revisit using temporary memory.
_launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, numBlocksZ - 1), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(/*beta=*/1, pointers, alpha, op, reductionOp, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, reductionChunkSize, reductionChunkSize,
regularOpStrideDivmod, reducingOpDimDivmod);
}
#endif
}
}
// -----------------------------------------------------------------------
// kernel and launch --linear unary
// -----------------------------------------------------------------------
@ -1205,6 +1477,24 @@ static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N>& po
}
}
template <class ElemType, C_size_t N, C_int K>
static void TensorOpWithRegularLoopDebug(ElemType beta, const array<ElemType*, N>& pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{
size_t dims = reducingOpDims.size();
switch (dims)
{
case 2:
return LaunchTensorOpWithReduction<ElemType, N, 2, K>(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 1:
return LaunchTensorOpWithReduction<ElemType, N, 1, K>(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 0:
return LaunchTensorOp<ElemType, N, K>(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides);
default:
LogicError("TensorOp: %d non-flattened reduction dimensions are not supported.", (C_int) dims);
}
}
// tensor operation, generalized in number of arguments
// This function now expands into different k. It also eliminates the offsets by adding them to the pointers.
template <class ElemType, C_size_t N>
@ -1236,6 +1526,17 @@ void TensorOpN(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, Elem
}
}
template <class ElemType, C_size_t N>
void TensorOpNDebug(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, N>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides, const GPUMatrix<ElemType>& a, GPUMatrix<ElemType>& result)
{
for (C_size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled
pointers[i] += offsets[i];
return LaunchTensorOpWithReductionDebug<ElemType, N, 1, 0>(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides, a, result);
}
//------------------------------------------------------------------------
// explicit instantiations--these are being called from GPUMatrix.cu
//------------------------------------------------------------------------
@ -1277,6 +1578,21 @@ template void TensorOpN<half, 4>(half beta, array<half*, 4> pointers, half alpha
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides);
template void TensorOpNDebug<half, 2>(half beta, array<half*, 2> pointers, half alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides, const GPUMatrix<half>& a, GPUMatrix<half>& result);
template void TensorOpNDebug<double, 2>(double beta, array<double*, 2> pointers, double alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides, const GPUMatrix<double>& a, GPUMatrix<double>& result);
template void TensorOpNDebug<float, 2>(float beta, array<float*, 2> pointers, float alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides, const GPUMatrix<float>& a, GPUMatrix<float>& result);
template void LaunchUnaryTensorOp(float beta, const float* pa, float* pb, float alpha, ElementWiseOperator op, size_t regularOpDim);
template void LaunchUnaryTensorOp(double beta, const double* pa, double* pb, double alpha, ElementWiseOperator op, size_t regularOpDim);

View file

@ -23,6 +23,12 @@ void TensorOpN(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, Elem
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides);
template <class ElemType, C_size_t N>
void TensorOpNDebug(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, N>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides, const GPUMatrix<ElemType>& a, GPUMatrix<ElemType>& result);
template <class ElemType>
void LaunchUnaryTensorOp(ElemType beta, const ElemType* pa, ElemType* pb, ElemType alpha, ElementWiseOperator op, size_t regularOpDim);

View file

@ -3669,6 +3669,21 @@ Matrix<ElemType>& Matrix<ElemType>::SetToZeroIfAbsLessThan(const ElemType thresh
return *this;
}
template <class ElemType>
Matrix<ElemType>& Matrix<ElemType>::SetToZeroIfLessThan(const ElemType threshold)
{
if (IsEmpty())
LogicError("SetToZeroIfLessThan: Matrix is empty.");
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->SetToZeroIfLessThan(threshold),
m_GPUMatrix->SetToZeroIfLessThan(threshold),
NOT_IMPLEMENTED,
m_GPUSparseMatrix->SetToZeroIfLessThan(threshold));
return *this;
}
//sum of all elements
template <class ElemType>
ElemType Matrix<ElemType>::SumOfElements() const
@ -6399,6 +6414,23 @@ void Matrix<ElemType>::TensorOp(ElemType beta, const Matrix<ElemType>& a, ElemTy
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
template <class ElemType>
void Matrix<ElemType>::TensorOpDebug(ElemType beta, const Matrix<ElemType>& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
{
VerifyIsDense(*this) && VerifyIsDense(a);
DecideAndMoveToRightDevice(*this, a);
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->TensorOp(beta, *a.m_CPUMatrix, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
m_GPUMatrix->TensorOpDebug(beta, *a.m_GPUMatrix, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
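TensorOpDebug keeps the usual TensorOp contract, which is roughly result = beta * result + alpha * reductionOp(op(a)) over the described offsets/strides; only the GPU dispatch is routed to the debug launcher. A small sketch of the beta/alpha contract for the plain element-wise case (no reduction), using raw arrays instead of the stride machinery:

#include <cstddef>

// result[i] = beta * result[i] + alpha * op(a[i])
template <class ElemType, class Op>
void UnaryTensorOpDense(ElemType beta, const ElemType* a, ElemType* result, std::size_t n,
                        ElemType alpha, Op op)
{
    for (std::size_t i = 0; i < n; i++)
        result[i] = beta * result[i] + alpha * op(a[i]);
}
// beta = 0 overwrites the output, beta = 1 accumulates into it, e.g.:
// UnaryTensorOpDense<float>(1.0f, src, dst, n, 0.5f, [](float x) { return x * x; });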
template <class ElemType>
void Matrix<ElemType>::TensorOp(ElemType beta, const Matrix<ElemType>& a, const Matrix<ElemType>& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,

View file

@@ -474,6 +474,8 @@ public:
void InplaceTranspose();
Matrix<ElemType>& SetToZeroIfAbsLessThan(const ElemType threshold);
Matrix<ElemType>& SetToZeroIfLessThan(const ElemType threshold);
DeviceBoundNumber<ElemType> Sum_AsDeviceBoundNum() const;
ElemType SumOfAbsElements() const; // sum of all abs(elements)
@@ -658,6 +660,11 @@ public:
const std::array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
void TensorOpDebug(ElemType beta, const Matrix<ElemType>& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const std::array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const std::array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
void TensorOp(ElemType beta, const Matrix<ElemType>& a, const Matrix<ElemType>& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const std::array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const std::array<SmallVector<ptrdiff_t>, 3>& regularStrides,

View file

@@ -709,6 +709,11 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::SetToZeroIfAbsLessThan(con
return *this;
}
template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::SetToZeroIfLessThan(const ElemType threshold)
{
return *this; // no-op stub: no thresholding is performed here, the sparse matrix is returned unchanged
}
template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceSoftThreshold(const ElemType threshold)
{
@@ -1723,7 +1728,11 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::SetToZeroIfAbsLessThan(const ElemType
{
return *this;
}
template <class ElemType>
GPUMatrix<ElemType>& GPUMatrix<ElemType>::SetToZeroIfLessThan(const ElemType threshold)
{
return *this; // no-op stub: no thresholding is performed here, the matrix is returned unchanged
}
template <class ElemType>
ElemType GPUMatrix<ElemType>::SumOfAbsElements() const
{

View file

@@ -217,7 +217,7 @@ static void PrepareTensorOperands(array<TensorShape, N> shapes, array<size_t, N>
}
for (size_t i = 0; i < N; i++)
offsets[i] = shapes[i].GetOffset();
offsets[i] = shapes[i].GetOffset();
}
// enforce that in case of broadcasting, the output must not be an input
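For reference, the offsets and strides prepared here are, in effect, turned into flat element addresses as offset + dot(coordinates, strides); a zero stride in a dimension is what realizes broadcasting for that operand. A self-contained sketch of that address computation:

#include <cstddef>
#include <vector>

inline std::ptrdiff_t FlatIndex(std::size_t offset,
                                const std::vector<std::size_t>& coords,
                                const std::vector<std::ptrdiff_t>& strides)
{
    std::ptrdiff_t idx = static_cast<std::ptrdiff_t>(offset);
    for (std::size_t k = 0; k < coords.size() && k < strides.size(); k++)
        idx += static_cast<std::ptrdiff_t>(coords[k]) * strides[k]; // stride 0 => broadcast
    return idx;
}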
@@ -249,6 +249,47 @@ void TensorView<ElemType>::DoUnaryOpOf(ElemType beta, const TensorView& a, ElemT
GetSOB().TensorOp(beta, a.GetSOB(), alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
template <class ElemType>
void TensorView<ElemType>::DoUnaryOpOfDebug(ElemType beta, const TensorView& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp)
{
// static int cc = 0; if (cc++ == 0)
// fprintf(stderr, "Tensor Op: Op %d: %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(GetShape()).c_str());
// prepare all tensor descriptor information as needed for execution
array<size_t, 2> offsets;
array<SmallVector<ptrdiff_t>, 2> regularStrides, reducingStrides;
SmallVector<size_t> regularOpDims, reducingOpDims;
PrepareTensorOperands<ElemType, 2>(array<TensorShape, 2>{a.GetShape(), GetShape()}, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
// output cannot be input when reducing
if (reducingOpDims.size() > 0)
CheckDifferentObject(a, *this);
/*
for (size_t i = 0; i < 2; i++)
{
// fprintf(stderr, "i = %d, offsets = %d, regularStrides = %d, reducingStrides = %d \n ", int(i), int(offsets[i]), int(regularStrides[i]), int(reducingStrides[i]));
fprintf(stderr, "i = %d, offsets = %d\n ", int(i), int(offsets[i]));
}
for (size_t i = 0; i < regularOpDims.size(); i++)
fprintf(stderr, "i = %d, regularOpDims = %d\n ", int(i), int(regularOpDims[i]));
for (size_t i = 0; i < reducingOpDims.size(); i++)
fprintf(stderr, "i = %d, reducingOpDims = %d\n ", int(i), int(reducingOpDims[i]));
for (size_t i = 0; i < 2; i++)
{
for (size_t j = 0; j < regularStrides[i].size(); j++)
{
fprintf(stderr, "i = %d, j = %d, regularStrides = %d \n ", int(i), int(j), int(regularStrides[i][j]));
}
for (size_t j = 0; j < reducingStrides[i].size(); j++)
{
fprintf(stderr, "i = %d, j = %d, reducingStrides = %d \n ", int(i), int(j), int(reducingStrides[i][j]));
}
}
*/
// now perform the operation
GetSOB().TensorOpDebug(beta, a.GetSOB(), alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
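The CheckDifferentObject guard above matters because a reducing op reads many input elements per output element, so the output must not alias the input. A minimal illustration of the hazard with a plain column-sum reduction:

#include <cstddef>

// Sums each column of a row-major r x c matrix into out[0..c).
// If out aliased the first row of in, that row would be zeroed and overwritten
// before it is read, corrupting the reduction -- hence the aliasing check.
void ColumnSums(const float* in, std::size_t r, std::size_t c, float* out)
{
    for (std::size_t j = 0; j < c; j++)
        out[j] = 0.0f;
    for (std::size_t i = 0; i < r; i++)
        for (std::size_t j = 0; j < c; j++)
            out[j] += in[i * c + j];
}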
template <class ElemType>
void TensorView<ElemType>::DoBinaryOpOf(ElemType beta, const TensorView& a, const TensorView& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp)
{

View file

@@ -131,6 +131,7 @@ public:
#pragma pop_macro("DeclareTernaryTensorOp")
void DoUnaryOpOf (ElemType beta, const TensorView& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp);
void DoUnaryOpOfDebug(ElemType beta, const TensorView& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp);
void DoBinaryOpOf (ElemType beta, const TensorView& a, const TensorView& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp);
void DoTernaryOpOf(ElemType beta, const TensorView& a, const TensorView& b, const TensorView& c, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp);

View file

@@ -1051,10 +1051,18 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
string showWERMode,
bool SVD)
{
PROFILE_SCOPE(profilerEvtMainEpoch);
PROFILE_SCOPE(profilerEvtMainEpoch);
std::vector<std::wstring> decodeOutputNodeNames(outputNodeNamesVector.begin() + 1, outputNodeNamesVector.begin() + 2);
ScopedNetworkOperationMode modeGuard(net, NetworkOperationMode::training);
ComputationNetwork decode_cn_root;
decode_cn_root.CopySubTree(*net, decodeOutputNodeNames[0], L"", CopyNodeFlags::copyNodeAll);
decode_cn_root.CompileNetwork();
std::vector<ComputationNodeBasePtr> decodeOutputNodesTmp = decode_cn_root.OutputNodesByName(decodeOutputNodeNames);
decode_cn_root.FormEvalOrder(decodeOutputNodesTmp[0]);
decode_cn_root.FormNestedNetwork(decodeOutputNodesTmp[0]);
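decode_cn_root is a private, compiled copy of the decoder subnetwork, which lets the multithreaded n-best decode later in this function read a model that the training network never mutates mid-decode. A generic sketch of that pattern (plain C++ and std::thread, not the CNTK classes used above): one immutable model object is shared read-only across worker threads, each writing to its own result slot.

#include <cstdio>
#include <thread>
#include <vector>

struct FrozenModel // stand-in for a compiled, read-only decode network
{
    int DecodeUtterance(int utt) const { return utt * utt; } // placeholder work
};

int main()
{
    FrozenModel model;                        // built once, never mutated afterwards
    const int numUtterances = 8;
    std::vector<int> results(numUtterances);
    std::vector<std::thread> workers;
    for (int u = 0; u < numUtterances; u++)
        workers.emplace_back([&, u] { results[u] = model.DecodeUtterance(u); });
    for (auto& w : workers)
        w.join();
    for (int u = 0; u < numUtterances; u++)
        std::printf("utt %d -> %d\n", u, results[u]);
    return 0;
}

Concurrent const reads like this are only safe if the shared object really is immutable for the whole decode.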
// bring our 'out' values into consistent state
epochCriterion = EpochCriterion(0);
epochEvalErrors.assign(epochEvalErrors.size(), EpochCriterion(0));
@@ -1078,7 +1086,6 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
auto ContainsAccumulatedResult = [&evaluationNodesWhichAccumulateResult](ComputationNodeBasePtr node) {
return evaluationNodesWhichAccumulateResult.find(node) != evaluationNodesWhichAccumulateResult.end();
};
// MA-related variables
size_t nSamplesSinceLastModelSync = 0;
size_t blockSizePerWorker = 0;
@@ -1122,7 +1129,6 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
// this is non-trivial, we need a manager object to handle this
if (numSubminibatchesNeeded > 1)
smbDispatcher.Init(net, learnableNodes, criterionNodes, evaluationNodes);
// The following is a special feature only supported by the Kaldi2Reader for more efficient sequence training.
// This attempts to compute the error signal for the whole utterance, which will
// be fed to the neural network as features. Currently it is a workaround
@@ -1241,6 +1247,7 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
numSamplesWithLabelOfNetworkMBR = 0;
auto profGetMinibatch = ProfilerTimeBegin();
bool wasDataRead = DataReaderHelpers::GetMinibatchIntoNetwork<ElemType>(*trainSetDataReader, net, criterionNodes[0],
useDistributedMBReading, useParallelTrain, *inputMatrices, actualMBSize, m_mpi);
@@ -1321,12 +1328,10 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
//net->CompileNetwork();
std::vector<std::wstring> encodeOutputNodeNames(outputNodeNamesVector.begin(), outputNodeNamesVector.begin() + 1);
std::vector<ComputationNodeBasePtr> encodeOutputNodes = net->OutputNodesByName(encodeOutputNodeNames);
//
//net->CollectInputAndLearnableParameters(encodeOutputNodes[0]);
std::list<ComputationNodeBasePtr> InputNodesList = net->InputNodes(criterionNodes[0]);
std::vector<std::wstring> encodeInputNodeNames;
if (SVD)
encodeInputNodeNames.assign(outputNodeNamesVector.begin() + 7, outputNodeNamesVector.begin() + 8);
else
@@ -1336,7 +1341,6 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
*encodeInputMatrices = DataReaderHelpersFunctions::RetrieveInputMatrices(encodeInputNodes);
//get decode input matrix
std::vector<std::wstring> decodeOutputNodeNames(outputNodeNamesVector.begin() + 1, outputNodeNamesVector.begin() + 2);
std::vector<ComputationNodeBasePtr> decodeOutputNodes = net->OutputNodesByName(decodeOutputNodeNames);
//net->CollectInputAndLearnableParameters(decodeOutputNodes[0]);
@@ -1346,6 +1350,7 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
else
decodeInputNodeNames.assign(outputNodeNamesVector.begin() + 7, outputNodeNamesVector.begin() + 8);
std::vector<ComputationNodeBasePtr> decodeinputNodes = net->OutputNodesByName(decodeInputNodeNames);
*decodeinputMatrices = DataReaderHelpersFunctions::RetrieveInputMatrices(decodeinputNodes);
if (!ordered)
@@ -1359,7 +1364,6 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
std::vector<ComputationNodeBasePtr> Plustransnodes = net->OutputNodesByName(plusTransNodeNames);
net->FormEvalOrder(Plustransnodes[0]);
}
//form eval order for RELU
auto reffeainput = (*encodeInputMatrices).begin();
@@ -1367,9 +1371,13 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
auto reflminput = (*decodeinputMatrices).begin();
auto decodeMBLayout = reflminput->second.pMBLayout;
net->ForwardProp(encodeOutputNodes);
//form eval order for RELU
time_t my_time = time(NULL);
fprintf(stderr, "SGD time 0 = %s", ctime(&my_time));
Matrix<ElemType> encodeOutput(net->GetDeviceId());
net->ForwardProp(encodeOutputNodes);
size_t deviceid = net->GetDeviceId();
Matrix<ElemType> encodeOutput(deviceid);
encodeOutput.SetValue(*(&dynamic_pointer_cast<ComputationNode<ElemType>>(encodeOutputNodes[0])->Value()));
vector<vector<PathInfo>> uttPathsInfo;
@@ -1379,12 +1387,50 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
vector<float> vt_onebest_wer;
vt_onebest_wer.clear();
// time_t my_time = time(NULL);
// fprintf(stderr, "SGD time 1 = %s", ctime(&my_time));
my_time = time(NULL);
fprintf(stderr, "SGD time 1 = %s", ctime(&my_time));
RNNTDecodeFunctions<ElemType> rnntdfs;
rnntdfs.RNNT_decode_nbest_MBR(outputNodeNamesVector, encodeOutput, encodeMBLayout, reflminput->second.GetMatrix<ElemType>(), decodeMBLayout, decodeinputNodes, numBestMBR, lengthNorm, vt_labels, uttPathsInfo, vt_nws, vt_onebest_wer, SVD, *net);
//my_time = time(NULL);
//fprintf(stderr, "SGD time 2 = %s", ctime(&my_time));
//rnntdfs.RNNT_decode_nbest_MBR(outputNodeNamesVector, encodeOutput, encodeMBLayout, reflminput->second.GetMatrix<ElemType>(), decodeMBLayout, decodeinputNodes, numBestMBR, lengthNorm, vt_labels, uttPathsInfo, vt_nws, vt_onebest_wer, SVD, *net);
//vt_printname.push_back(L"DecodeOutputLN");
if (m_enableMultiThreadDecodeMBR)
{
ComputationNodeBasePtr WmNode, WmuNode, WmvNode, bmNode;
Matrix<ElemType> Wm(deviceid), Wmu(deviceid), Wmv(deviceid), bm(deviceid);
if (SVD)
{
WmuNode = net->GetNodeFromName(outputNodeNamesVector[4]);
WmvNode = net->GetNodeFromName(outputNodeNamesVector[5]);
bmNode = net->GetNodeFromName(outputNodeNamesVector[6]);
Wmu.SetValue(*(&dynamic_pointer_cast<ComputationNode<ElemType>>(WmuNode)->Value()));
Wmv.SetValue(*(&dynamic_pointer_cast<ComputationNode<ElemType>>(WmvNode)->Value()));
WmNode; // no-op references: the full-rank WmNode/Wm are not used in the SVD branch
Wm;
}
else
{
WmNode = net->GetNodeFromName(outputNodeNamesVector[4]);
bmNode = net->GetNodeFromName(outputNodeNamesVector[5]);
Wm.SetValue(*(&dynamic_pointer_cast<ComputationNode<ElemType>>(WmNode)->Value()));
WmuNode; // no-op references: the SVD factor nodes/matrices are not used in the non-SVD branch
WmvNode;
Wmu;
Wmv;
}
bm.SetValue(*(&dynamic_pointer_cast<ComputationNode<ElemType>>(bmNode)->Value()));
/*
size_t num_utt = 7;
size_t start_utt = 0;
*/
rnntdfs.RNNT_decode_nbest_MBR_Multithread(outputNodeNamesVector, encodeOutput, encodeMBLayout, reflminput->second.GetMatrix<ElemType>(), decodeMBLayout, decodeInputNodeNames, numBestMBR, lengthNorm, vt_labels, uttPathsInfo, vt_nws, vt_onebest_wer, SVD, decode_cn_root, Wm, Wmu, Wmv, bm); /*, num_utt, start_utt); */
}
else
rnntdfs.RNNT_decode_nbest_MBR(outputNodeNamesVector, encodeOutput, encodeMBLayout, reflminput->second.GetMatrix<ElemType>(), decodeMBLayout, decodeinputNodes, numBestMBR, lengthNorm, vt_labels, uttPathsInfo, vt_nws, vt_onebest_wer, SVD, net);
// rnntdfs.RNNT_decode_nbest_MBR_Multithread(outputNodeNamesVector, encodeOutput, encodeMBLayout, reflminput->second.GetMatrix<ElemType>(), decodeMBLayout, decodeinputNodes, numBestMBR, lengthNorm, vt_labels, uttPathsInfo, vt_nws, vt_onebest_wer, SVD, *net, decode_cn, decodeinputNodes_tmp);
my_time = time(NULL);
fprintf(stderr, "SGD time 2 = %s", ctime(&my_time));
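The commented-out num_utt / start_utt arguments above suggest each worker decodes a contiguous range of utterances. A hedged sketch of that kind of static partitioning (hypothetical DecodeInChunks helper, plain C++):

#include <algorithm>
#include <cstddef>
#include <functional>
#include <thread>
#include <vector>

// Split `total` utterances into contiguous chunks, one per worker, and run
// decodeRange(start_utt, num_utt) on each chunk in its own thread.
void DecodeInChunks(std::size_t total, std::size_t numThreads,
                    const std::function<void(std::size_t, std::size_t)>& decodeRange)
{
    if (numThreads == 0)
        numThreads = 1;
    std::size_t chunk = (total + numThreads - 1) / numThreads;
    std::vector<std::thread> workers;
    for (std::size_t start = 0; start < total; start += chunk)
    {
        std::size_t count = std::min(chunk, total - start);
        workers.emplace_back([=] { decodeRange(start, count); });
    }
    for (auto& w : workers)
        w.join();
}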
//fprintf(stderr, "decode SGD v0 .\n");
//net->BumpEvalTimeStamp(decodeinputNodes);
@@ -1400,21 +1446,11 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
refFeaMatBackup.SetValue(reffeainput->second.GetMatrix<ElemType>());
size_t numParallelSequences = encodeMBLayout->GetNumParallelSequences();
//my_time = time(NULL);
//fprintf(stderr, "SGD time 3 = %s", ctime(&my_time));
my_time = time(NULL);
fprintf(stderr, "SGD time 3 = %s", ctime(&my_time));
for (const auto& seq : encodeMBLayout->GetAllSequences())
{
/*
if (seqId == 1)
{
for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++)
{
ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
node->force_gradient_accumulate(true);
}
}
*/
if (seq.seqId == GAP_SEQUENCE_ID)
{
continue;
@@ -1423,15 +1459,12 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
{
continue;
}
//if (firstdebug)
cNode->SetMWERInfo(uttPathsInfo[seqId], lengthNorm, wordPathPosteriorFromDecodeMBR, doMBR, vt_nws[seqId]);
// get the feature MBLayout
size_t numFrames = seq.GetNumTimeSteps();
numSamplesWithLabelOfNetworkMBR += numFrames;
// if (firstdebug)
reffeainput->second.pMBLayout->Init(1, numFrames); // 1 channel, 1 utterance
Matrix<ElemType> fea(deviceID);
@@ -1447,11 +1480,9 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
fea.SetColumn(refFeaMatBackup.ColumnSlice(uID, 1), t);
}
//if (firstdebug)
{
reffeainput->second.GetMatrix<ElemType>().SetValue(fea);
reffeainput->second.pMBLayout->AddSequence(0, 0, 0, numFrames); // guoye: first 0 is the utterance ID, second 0 is the channel; the sequence spans frames 0 to numFrames
}
reffeainput->second.GetMatrix<ElemType>().SetValue(fea);
reffeainput->second.pMBLayout->AddSequence(0, 0, 0, numFrames); // guoye: first 0 is the utterance ID, second 0 is the channel; the sequence spans frames 0 to numFrames
// guoye: the two calls below reset the evaluation state, to make sure ForwardProp is always carried out
ComputationNetwork::BumpEvalTimeStamp(encodeInputNodes); // guoye: update the time stamp before doing forward prop
@@ -1463,9 +1494,9 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
size_t nBest = uttPathsInfo[seqId].size();
if (nBest > (m_maxFrameNumPerMinibatchMBR / numFrames))
{
// reset nBest to make the MB size framenum with budget
// reset nBest so that the minibatch frame count stays within the budget
nBest = (m_maxFrameNumPerMinibatchMBR / numFrames);
}
}
size_t maxPhoneSeqLen = uttPathsInfo[seqId][0].label_seq.size();
for (size_t n = 1; n < nBest; n++)
@@ -1583,8 +1614,8 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
}
}
//my_time = time(NULL);
//fprintf(stderr, "SGD time 4 = %s", ctime(&my_time));
my_time = time(NULL);
fprintf(stderr, "SGD time 4 = %s", ctime(&my_time));
}
// ===========================================================
// forward prop for evaluate eval nodes
@@ -1790,8 +1821,7 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
{
#ifdef _DEBUG
if (smoothedGradientIter->HasNan("TrainOneEpoch/UpdateWeights(): "))
LogicError("%ls %ls operation has NaNs in smoothedGradient.",
node->NodeName().c_str(), node->OperationName().c_str());
LogicError("%ls %ls operation has NaNs in smoothedGradient.", node->NodeName().c_str(), node->OperationName().c_str());
#endif
double nodeDependentLearningRatePerSample = learnRatePerSample * node->GetLearningRateMultiplier();
double nodeDependentRegMultiplier = dynamic_pointer_cast<LearnableParameter<ElemType>>(node)->GetRegMultiplier();
@@ -3460,7 +3490,7 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
m_lengthNorm = configSGD(L"LengthNorm", true);
m_showWERMode = configSGD(L"showWERMode", "average");
m_isSVD = configSGD(L"SVD", true);
m_enableMultiThreadDecodeMBR = configSGD(L"enableMultiThreadDecodeMBR", true);
m_maxFrameNumPerMinibatchMBR = configSGD(L"MaxFrameNumPerMinibatchMBR", (size_t) 2000);
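configSGD(key, default) reads the setting when present and falls back to the default otherwise, so the new multithreaded MBR decode path is enabled unless the config switches it off. A tiny sketch of that lookup-with-default idiom over a std::map (not the CNTK ConfigRecord type):

#include <map>
#include <string>

template <class T>
T GetOrDefault(const std::map<std::wstring, T>& config, const std::wstring& key, T fallback)
{
    auto it = config.find(key);
    return it == config.end() ? fallback : it->second;
}
// e.g. bool enableMultiThreadDecodeMBR = GetOrDefault<bool>(cfg, L"enableMultiThreadDecodeMBR", true);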
if (m_doGradientCheck && sizeofElemType != sizeof(double))
{

View file

@@ -292,6 +292,7 @@ protected:
string m_showWERMode;
bool m_isSVD;
size_t m_maxFrameNumPerMinibatchMBR;
bool m_enableMultiThreadDecodeMBR;
// Parallel training
MPIWrapperPtr m_mpi;

View file

@@ -421,7 +421,7 @@ public:
Matrix<ElemType> encodeOutput(deviceid);
Matrix<ElemType> decodeOutput(deviceid);
Matrix<ElemType> greedyOutput(deviceid), greedyOutputMax(deviceid);
Matrix<ElemType> sumofENandDE(deviceid), maxIdx(deviceid), maxVal(deviceid);
Matrix<ElemType> maxIdx(deviceid), maxVal(deviceid);
Matrix<ElemType> lmin(deviceid);
MatrixPool m_matrixPool;
m_matrixPool.OptimizedMemoryAllocation();
@@ -472,35 +472,7 @@ public:
CurSequences = nextSequences;
vector<typename RNNTDecodeFunctions<ElemType>::Sequence>().swap(nextSequences);
//deal with the same prefix
/*sort(CurSequences.begin(), CurSequences.end(),
[](const Sequence& a, const Sequence& b) -> bool {
return a.labelseq.size() > b.labelseq.size();
});
for (size_t n = 0; n < CurSequences.size() - 1; n++)
{
for (size_t h = n + 1; h < CurSequences.size(); h++)
{
if (isPrefix(CurSequences[h], CurSequences[n]))
{
//forward_prop the prefix
forward_decode(CurSequences[h], decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, CurSequences[h].labelseq.size());
forwardmerged(CurSequences[h], t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes);
size_t idx = CurSequences[h].labelseq.size();
ElemType curlogp = CurSequences[h].logP + decodeOutput(CurSequences[n].labelseq[idx], 0);
for (size_t k = idx; k < CurSequences[n].labelseq.size() - 1; k++)
{
forward_decode(CurSequences[n], decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, k + 1);
forwardmerged(CurSequences[n], t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes);
curlogp += decodeOutput(CurSequences[n].labelseq[k + 1], 0);
}
CurSequences[n].logP = decodeOutput.LogAdd(curlogp, CurSequences[n].logP);
}
}
}*/
//nextSequences.clear();
while (true)
{
@@ -515,8 +487,8 @@ public:
rnntdfs.prepareSequence(tempSeq);
rnntdfs.forward_decode(tempSeq, decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, tempSeq.labelseq.size(), *m_net);
if (isSVD)
rnntdfs.forwardmergedSVD(tempSeq, t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes, Wmu, Wmv, bm, *m_net);
else rnntdfs.forwardmerged(tempSeq, t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes, Wm, bm, *m_net);
rnntdfs.forwardmergedSVD(tempSeq, t, encodeOutput, decodeOutput, Plusnodes, Plustransnodes, Wmu, Wmv, bm, m_net);
else rnntdfs.forwardmerged(tempSeq, t, encodeOutput, decodeOutput, Plusnodes, Plustransnodes, Wm, bm, m_net);
//sumofENandDE.Print("sum");
//sort log posterior and get best N labels
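forwardmerged / forwardmergedSVD appear to implement the RNN-T joint step: the encoder and prediction-network outputs are summed (the Plus nodes above), passed through the joint affine layer Wm/bm, and normalized; in the SVD variant the full Wm is replaced by the low-rank factors Wmu * Wmv, applied in two stages. A standalone sketch of the non-SVD computation on plain vectors (hypothetical JointLogSoftmax helper, not the CNTK code path):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// z = W * (hEnc + hDec) + b, followed by log-softmax over the vocabulary.
std::vector<float> JointLogSoftmax(const std::vector<std::vector<float>>& W,
                                   const std::vector<float>& b,
                                   const std::vector<float>& hEnc,
                                   const std::vector<float>& hDec)
{
    std::vector<float> h(hEnc.size());
    for (std::size_t i = 0; i < h.size(); i++)
        h[i] = hEnc[i] + hDec[i];                 // sum of encoder and decoder outputs
    std::vector<float> z(W.size());
    for (std::size_t r = 0; r < W.size(); r++)
    {
        float s = b[r];
        for (std::size_t c = 0; c < h.size(); c++)
            s += W[r][c] * h[c];                  // with SVD this would be Wu * (Wv * h)
        z[r] = s;
    }
    if (z.empty())
        return z;
    float m = *std::max_element(z.begin(), z.end());
    float lse = 0.0f;
    for (float v : z)
        lse += std::exp(v - m);
    lse = m + std::log(lse);
    for (float& v : z)
        v -= lse;                                 // log posteriors, ready for n-best sorting
    return z;
}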
@@ -715,7 +687,7 @@ public:
Matrix<ElemType> encodeOutput(deviceid);
Matrix<ElemType> decodeOutput(deviceid);
Matrix<ElemType> greedyOutput(deviceid), greedyOutputMax(deviceid);
Matrix<ElemType> sumofENandDE(deviceid), maxIdx(deviceid), maxVal(deviceid);
Matrix<ElemType> maxIdx(deviceid), maxVal(deviceid);
Matrix<ElemType> lmin(deviceid);
MatrixPool m_matrixPool;
m_matrixPool.OptimizedMemoryAllocation();
@@ -760,36 +732,6 @@ public:
CurSequences = nextSequences;
vector<typename RNNTDecodeFunctions<ElemType>::Sequence>().swap(nextSequences);
//deal with the same prefix
/*sort(CurSequences.begin(), CurSequences.end(),
[](const Sequence& a, const Sequence& b) -> bool {
return a.labelseq.size() > b.labelseq.size();
});
for (size_t n = 0; n < CurSequences.size() - 1; n++)
{
for (size_t h = n + 1; h < CurSequences.size(); h++)
{
if (isPrefix(CurSequences[h], CurSequences[n]))
{
//forward_prop the prefix
forward_decode(CurSequences[h], decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, CurSequences[h].labelseq.size());
forwardmerged(CurSequences[h], t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes);
size_t idx = CurSequences[h].labelseq.size();
ElemType curlogp = CurSequences[h].logP + decodeOutput(CurSequences[n].labelseq[idx], 0);
for (size_t k = idx; k < CurSequences[n].labelseq.size() - 1; k++)
{
forward_decode(CurSequences[n], decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, k + 1);
forwardmerged(CurSequences[n], t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes);
curlogp += decodeOutput(CurSequences[n].labelseq[k + 1], 0);
}
CurSequences[n].logP = decodeOutput.LogAdd(curlogp, CurSequences[n].logP);
}
}
}*/
//nextSequences.clear();
while (true)
{
@@ -802,7 +744,7 @@ public:
CurSequences.erase(maxSeq);
rnntdfs.prepareSequence(tempSeq);
rnntdfs.forward_decode(tempSeq, decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, tempSeq.labelseq.size(), *m_net);
rnntdfs.forwardmerged(tempSeq, t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes, Wm, bm, *m_net);
rnntdfs.forwardmerged(tempSeq, t, encodeOutput, decodeOutput, Plusnodes, Plustransnodes, Wm, bm, m_net);
//sumofENandDE.Print("sum");
//sort log posterior and get best N labels