From 1fc15ed1058afc48ade968758b2d4d23974f156d Mon Sep 17 00:00:00 2001
From: Guoli Ye
Date: Thu, 20 Feb 2020 22:34:44 -0800
Subject: [PATCH] multithreaded GPU code check-in; no speedup yet, due to the stream in the kernel function

---
 .../ComputationNetwork.h                        | 1490 +++++++++++++----
 .../ComputationNetworkEditing.cpp               |   55 +-
 Source/ComputationNetworkLib/ComputationNode.h  |    5 +-
 Source/ComputationNetworkLib/ReshapingNodes.cpp |  188 ++-
 Source/ComputationNetworkLib/ReshapingNodes.h   |  611 ++++---
 Source/Math/CPUMatrix.h                         |    1 +
 Source/Math/CPUMatrixImpl.h                     |   18 +
 Source/Math/GPUMatrix.cu                        |  125 ++
 Source/Math/GPUMatrix.h                         |    6 +
 Source/Math/GPUMatrixCUDAKernels.cuh            |   13 +
 Source/Math/GPUSparseMatrix.cu                  |   13 +
 Source/Math/GPUSparseMatrix.h                   |    2 +-
 Source/Math/GPUTensor.cu                        |  316 ++++
 Source/Math/GPUTensor.h                         |    6 +
 Source/Math/Matrix.cpp                          |   32 +
 Source/Math/Matrix.h                            |    7 +
 Source/Math/NoGPU.cpp                           |   11 +-
 Source/Math/TensorView.cpp                      |   43 +-
 Source/Math/TensorView.h                        |    1 +
 Source/SGDLib/SGD.cpp                           |  112 +-
 Source/SGDLib/SGD.h                             |    1 +
 Source/SGDLib/SimpleOutputWriter.h              |   68 +-
 22 files changed, 2376 insertions(+), 748 deletions(-)

diff --git a/Source/ComputationNetworkLib/ComputationNetwork.h b/Source/ComputationNetworkLib/ComputationNetwork.h
index d05d71494..748105a1f 100644
--- a/Source/ComputationNetworkLib/ComputationNetwork.h
+++ b/Source/ComputationNetworkLib/ComputationNetwork.h
@@ -59,7 +59,6 @@ inline std::wstring ToString(const ComputationNodeBasePtr& node)
 // ===========================================================================

 // move the shared function/data structure from SimpleOutputWriter.h to this file
-
 class ComputationNetwork : public ScriptableObjects::Object, public ScriptableObjects::HasToString, public ScriptableObjects::CustomConfigRecord
@@ -239,7 +238,6 @@ public:
         }
     }
-
     static void BumpEvalTimeStamp(const std::vector<ComputationNodeBasePtr>& nodes);
     void ResetEvalTimeStamps();
     void SetEvalTimeStampsOutdatedWithRegardToAll();
@@ -494,6 +492,7 @@ public:
     ComputationNodeBasePtr CopyNode(const ComputationNetwork& fromNet, const std::wstring fromName, std::wstring toName, const CopyNodeFlags flags);
     void CopySubTree(const ComputationNetwork& fromNet, const std::wstring fromName, std::wstring toNamePrefix, const CopyNodeFlags flags);
+    void ShowNodeMemory(const ComputationNetwork& fromNet, const std::wstring fromName);
     void CopyInputs(const std::wstring fromName, std::wstring toName);
     void RenameNode(const std::wstring& nodeNameOrig, const std::wstring& nodeNameNew);
     void RenameNode(ComputationNodeBasePtr node, const std::wstring& newNodeName);
@@ -1421,8 +1420,6 @@ public:
     vector<shared_ptr<Matrix<ElemType>>> m_decodeOutputCache;
     std::vector<wstring> m_nodesToCache;
-
-
     struct Sequence
     {
         //shared_ptr<Matrix<ElemType>> LabelMatrix;
@@ -1524,6 +1521,77 @@ public:
         vector<size_t>().swap(oneSeq.labelseq);
     }
+    // The two functions below, newSeq and deleteSeq, support the multithreaded decoder, where different threads must not share m_nameToPastValueNodeCache and m_decodeOutputCache.
Rather, each thread will have its own passed explicitly by parameter ; + + Sequence newSeq(Sequence& a, DEVICEID_TYPE deviceId, unordered_map>>>& m_nameToPastValueNodeCachePerThread, vector>>& m_decodeOutputCachePerThread) + { + Sequence oneSeq; + oneSeq.labelseq = a.labelseq; + oneSeq.logP = a.logP; + oneSeq.length = a.length; + oneSeq.lengthwithblank = a.lengthwithblank; + oneSeq.processlength = a.processlength; + if (m_decodeOutputCachePerThread.size() > 0) + { + oneSeq.decodeoutput = m_decodeOutputCachePerThread.back(); + m_decodeOutputCachePerThread.pop_back(); + } + else + { + oneSeq.decodeoutput = make_shared>(a.decodeoutput->GetNumRows(), (size_t) 1, a.decodeoutput->GetDeviceId()); + } + oneSeq.decodeoutput->SetValue(*(a.decodeoutput)); + + typename unordered_map>>::iterator it; + for (it = a.nameToNodeValues.begin(); it != a.nameToNodeValues.end(); it++) + { + if (oneSeq.processlength > 0) + { + if (it->second->Value().GetNumElements() > 0 && a.realValues) + { + oneSeq.nameToParentNodeValues[it->first] = it->second; + a.refs++; + } + else + oneSeq.nameToParentNodeValues[it->first] = a.nameToParentNodeValues[it->first]; + /*size_t ab = oneSeq.nameToParentNodeValues[it->first]->Value().GetNumElements(); + if (ab > 0) + fprintf(stderr, "test %ls %zu", it->first.c_str(), ab);*/ + } + auto itin = m_nameToPastValueNodeCachePerThread.find(it->first); + if (itin != m_nameToPastValueNodeCachePerThread.end() && m_nameToPastValueNodeCachePerThread[it->first].size() > 0) + { + oneSeq.nameToNodeValues[it->first] = m_nameToPastValueNodeCachePerThread[it->first].back(); + m_nameToPastValueNodeCachePerThread[it->first].pop_back(); + } + else + { + oneSeq.nameToNodeValues[it->first] = make_shared>(deviceId, it->first); + } + /*std::ostringstream address; + address << oneSeq.nameToNodeValues[it->first]; + fprintf(stderr, "newSeq %ls %s \n", it->first.c_str(), address.str().c_str());*/ + } + + return oneSeq; + } + + void deleteSeq(Sequence oneSeq, unordered_map>>>& m_nameToPastValueNodeCachePerThread, vector>>& m_decodeOutputCachePerThread) + { + typename unordered_map>>::iterator it; + for (it = oneSeq.nameToNodeValues.begin(); it != oneSeq.nameToNodeValues.end(); it++) + { + auto itin = m_nameToPastValueNodeCachePerThread.find(it->first); + if (itin == m_nameToPastValueNodeCachePerThread.end()) + m_nameToPastValueNodeCachePerThread[it->first] = vector>>(); + if (oneSeq.refs == 0) + m_nameToPastValueNodeCachePerThread[it->first].push_back(oneSeq.nameToNodeValues[it->first]); + } + m_decodeOutputCachePerThread.push_back(oneSeq.decodeoutput); + + vector().swap(oneSeq.labelseq); + } + void extendSeq(Sequence& insequence, size_t labelId, ElemType logP) { insequence.labelseq.push_back(labelId); @@ -1531,7 +1599,7 @@ public: insequence.length++; insequence.lengthwithblank++; } - vector> getTopN(Microsoft::MSR::CNTK::Matrix& prob, size_t N, size_t& blankid) + vector> getTopN(Microsoft::MSR::CNTK::Matrix& prob, size_t N, const size_t& blankid) { vector> datapair; typedef typename vector>::value_type ValueType; @@ -1568,121 +1636,227 @@ public: } void forward_decode(Sequence& oneSeq, StreamMinibatchInputs decodeinputMatrices, DEVICEID_TYPE deviceID, const std::vector& decodeOutputNodes, - const std::vector& decodeinputNodes, size_t vocabSize, size_t plength, ComputationNetwork& cn) + const std::vector& decodeinputNodes, size_t vocabSize, size_t plength, ComputationNetwork& net, int uttFrameNum = 0) { // size_t labelLength = oneSeq.length; if (oneSeq.processlength + 1 != plength && plength != 
oneSeq.processlength) LogicError("Current implementation assumes 1 step difference"); - if (plength != oneSeq.processlength) + /* + if (uttFrameNum == 94) { - /*m_logIndex = m_logIndex + 1; - wstring fileName = L"D:\\users\\vadimma\\cntk_3\\new_opt" + std::to_wstring(m_logIndex) + L".txt"; - std::ofstream out(fileName, std::ios::out); - out << fixed; - out.precision(3); - for (size_t li = 0; li < oneSeq.labelseq.size(); li++) - out << oneSeq.labelseq[li] << " "; - - out << "\n";*/ - - Matrix lmin(deviceID); - - lmin.Resize(vocabSize, 1); - lmin.SetValue(0.0); - lmin(oneSeq.labelseq[plength - 1], 0) = 1.0; - auto lminput = decodeinputMatrices.begin(); - lminput->second.pMBLayout->Init(1, 1); - //std::swap(lminput->second.GetMatrix(), lmin); - lminput->second.GetMatrix().SetValue(lmin); - if (plength == 1) + for (const auto& node : net.GetAllNodesForRoot(decodeOutputNodes[0])) { - lminput->second.pMBLayout->AddSequence(NEW_SEQUENCE_ID, 0, 0, 1); + + if (dynamic_pointer_cast>(node)->Value().IsEmpty()) + { + fprintf(stderr, "forward_decode 0 NodeName = %ls, Empty \n", node->NodeName().c_str()); + } + else + { + double pnorm = dynamic_pointer_cast>(node)->Value().FrobeniusNorm(); + fprintf(stderr, "forward_decode 0 NodeName = %ls, Norm = %f \n", node->NodeName().c_str(), pnorm); + } } - else + } + */ + if (plength != oneSeq.processlength) { - ///lminput->second.pMBLayout->//m_sequences.erase(0); - lminput->second.pMBLayout->AddSequence(NEW_SEQUENCE_ID, 0, SentinelValueIndicatingUnspecifedSequenceBeginIdx, 1); + Matrix lmin(deviceID); - //DataReaderHelpers::NotifyChangedNodes(m_net, decodeinputMatrices); + lmin.Resize(vocabSize, 1); + lmin.SetValue(0.0); + lmin(oneSeq.labelseq[plength - 1], 0) = 1.0; + auto lminput = decodeinputMatrices.begin(); + if (lminput->second.pMBLayout == NULL) + { + lminput->second.pMBLayout = make_shared(); + } + lminput->second.pMBLayout->Init(1, 1); + //std::swap(lminput->second.GetMatrix(), lmin); + lminput->second.GetMatrix().SetValue(lmin); + if (plength == 1) + { + lminput->second.pMBLayout->AddSequence(NEW_SEQUENCE_ID, 0, 0, 1); + } + else + { + ///lminput->second.pMBLayout->//m_sequences.erase(0); + lminput->second.pMBLayout->AddSequence(NEW_SEQUENCE_ID, 0, SentinelValueIndicatingUnspecifedSequenceBeginIdx, 1); + + //DataReaderHelpers::NotifyChangedNodes(m_net, decodeinputMatrices); + + for (size_t i = 0; i < m_nodesToCache.size(); i++) + { + auto nodePtr = net.GetNodeFromName(m_nodesToCache[i]); + + if (oneSeq.nameToNodeValues[m_nodesToCache[i]]->Value().GetNumElements() > 0) + + { + oneSeq.nameToNodeValues[m_nodesToCache[i]]->CopyTo(nodePtr, m_nodesToCache[i], CopyNodeFlags::copyNodeInputLinks); + } + } + } + + net.BumpEvalTimeStamp(decodeinputNodes); + // NotifyChangedNodes(m_net, decodeinputMatrices); + + net.ForwardProp(decodeOutputNodes[0]); + /* + if (uttFrameNum == 94) + { + + for (const auto& node : net.GetAllNodesForRoot(decodeOutputNodes[0])) + { + + if (dynamic_pointer_cast>(node)->Value().IsEmpty()) + { + fprintf(stderr, "forward_decode 1 NodeName = %ls, Empty \n", node->NodeName().c_str()); + } + else + { + double pnorm = dynamic_pointer_cast>(node)->Value().FrobeniusNorm(); + fprintf(stderr, "forward_decode 1 NodeName = %ls, Norm = %f \n", node->NodeName().c_str(), pnorm); + } + } + fprintf(stderr, "forward_decode decodeOutputNodes = %f, oneSeq.decodeoutput = %f, debug 1\n", (*(&dynamic_pointer_cast>(decodeOutputNodes[0])->Value())).FrobeniusNorm(), (*(oneSeq.decodeoutput)).FrobeniusNorm()); + } + */ + //Matrix tempMatrix = 
*(&dynamic_pointer_cast>(decodeOutputNodes[0])->Value()); + oneSeq.decodeoutput->SetValue((*(&dynamic_pointer_cast>(decodeOutputNodes[0])->Value()))); + // fprintf(stderr, "forward_decode = %f \n", oneSeq.decodeoutput->FrobeniusNorm()); + /* + if (uttFrameNum == 94) + { + fprintf(stderr, "oneSeq.decodeoutput = %f, debug 2\n", (*(oneSeq.decodeoutput)).FrobeniusNorm()); + } + */ + oneSeq.processlength = plength; for (size_t i = 0; i < m_nodesToCache.size(); i++) { - auto nodePtr = cn.GetNodeFromName(m_nodesToCache[i]); - - if (oneSeq.nameToNodeValues[m_nodesToCache[i]]->Value().GetNumElements() > 0) + auto nodePtr = net.GetNodeFromName(m_nodesToCache[i]); + if (plength == 1) { - oneSeq.nameToNodeValues[m_nodesToCache[i]]->CopyTo(nodePtr, m_nodesToCache[i], CopyNodeFlags::copyNodeInputLinks); + nodePtr->CopyTo(oneSeq.nameToNodeValues[m_nodesToCache[i]], m_nodesToCache[i], CopyNodeFlags::copyNodeAll); } } + + lmin.ReleaseMemory(); } - - // cn.BumpEvalTimeStamp(decodeinputNodes); - - //m_net->DumpAllNodesToFile(true, true, L"D:\\users\\vadimma\\cntk_3\\After_model_opt.txt"); - cn.BumpEvalTimeStamp(decodeinputNodes); - // NotifyChangedNodes(m_net, decodeinputMatrices); - - cn.ForwardProp(decodeOutputNodes[0]); - //Matrix tempMatrix = *(&dynamic_pointer_cast>(decodeOutputNodes[0])->Value()); - oneSeq.decodeoutput->SetValue((*(&dynamic_pointer_cast>(decodeOutputNodes[0])->Value()))); - oneSeq.processlength = plength; - - for (size_t i = 0; i < m_nodesToCache.size(); i++) - { - auto nodePtr = cn.GetNodeFromName(m_nodesToCache[i]); - - if (plength == 1) - { - nodePtr->CopyTo(oneSeq.nameToNodeValues[m_nodesToCache[i]], m_nodesToCache[i], CopyNodeFlags::copyNodeAll); - } - } - - lmin.ReleaseMemory(); - } } - void forwardmerged(Sequence a, size_t t, Matrix& sumofENandDE, Matrix& encodeOutput, Matrix& decodeOutput, ComputationNodeBasePtr PlusNode, - ComputationNodeBasePtr PlusTransNode, std::vector Plusnodes, std::vector Plustransnodes, Matrix& Wm, Matrix& bm, ComputationNetwork& cn) + void forwardmerged(Sequence a, size_t t, const Matrix& encodeOutput, Matrix& decodeOutput, + std::vector Plusnodes, std::vector Plustransnodes, const Matrix& Wm, const Matrix& bm, const ComputationNetworkPtr& net, + int uttFrameNum = 0, DEVICEID_TYPE deviceID = CPUDEVICE) { + /* + if (uttFrameNum == 94) + { - sumofENandDE.AssignSumOf(encodeOutput.ColumnSlice(t, 1), *(a.decodeoutput)); - //sumofENandDE.InplaceLogSoftmax(true); - Matrix tempMatrix(encodeOutput.GetDeviceId()); + fprintf(stderr, "frowardmerged encodeoutput = %f, a.decodeoutput = %f, debug 1\n", encodeOutput.ColumnSlice(t, 1).FrobeniusNorm(), (*(a.decodeoutput)).FrobeniusNorm()); + } + */ + decodeOutput.AssignSumOf(encodeOutput.ColumnSlice(t, 1), *(a.decodeoutput)); // sum broadcast + //decodeOutput.AssignSumOf(encodeOutput.ColumnSlice(t, 1), encodeOutput.ColumnSlice(t, 1)); + /* + if (uttFrameNum == 94) + { + fprintf(stderr, "frowardmerged decodeOutput = %f, debug 2\n", decodeOutput.FrobeniusNorm()); + } + */ + + Matrix tempMatrix(deviceID); //plus broadcast - (&dynamic_pointer_cast>(PlusNode)->Value())->SetValue(sumofENandDE); - //SumMatrix.SetValue(sumofENandDE); - ComputationNetwork::BumpEvalTimeStamp(Plusnodes); - auto PlusMBlayout = PlusNode->GetMBLayout(); - PlusMBlayout->Init(1, 1); - PlusMBlayout->AddSequence(NEW_SEQUENCE_ID, 0, 0, 1); - // cn.FormEvalOrder(Plustransnodes[0]); - cn.ForwardPropFromTo(Plusnodes, Plustransnodes); - decodeOutput.SetValue(*(&dynamic_pointer_cast>(PlusTransNode)->Value())); + if (!net) + { + //fprintf(stderr, "debug 
forwardmerge uttNum = %d, &decodeOutput = %p , 3 \n", uttNum, (void*) (&decodeOutput)); + decodeOutput.SetToZeroIfLessThan(0); // reLU + //fprintf(stderr, "debug forwardmerge uttNum = %d, &decodeOutput = %p , 4 \n", uttNum, (void*) (&decodeOutput)); + /* + if (uttFrameNum == 94) + { + fprintf(stderr, "frowardmerged decodeOutput = %f, debug 3\n", decodeOutput.FrobeniusNorm()); + } + */ + } + else + { + (&dynamic_pointer_cast>(Plusnodes[0])->Value())->SetValue(decodeOutput); + ComputationNetwork::BumpEvalTimeStamp(Plusnodes); + auto PlusMBlayout = Plusnodes[0]->GetMBLayout(); + PlusMBlayout->Init(1, 1); + PlusMBlayout->AddSequence(NEW_SEQUENCE_ID, 0, 0, 1); + + net->ForwardPropFromTo(Plusnodes, Plustransnodes); + decodeOutput.SetValue(*(&dynamic_pointer_cast>(Plustransnodes[0])->Value())); + } + + //fprintf(stderr, "forward merge = %f \n", decodeOutput.FrobeniusNorm()); + //fprintf(stderr, "debug forwardmerge uttNum = %d, &decodeOutput = %p , 5 \n", uttNum, (void*) (&decodeOutput)); + /* + if (uttFrameNum == 94) + { + + fprintf(stderr, "frowardmerged Wm = %f, decodeoutput = %f, debug 4\n", Wm.FrobeniusNorm(), decodeOutput.FrobeniusNorm()); + } + */ tempMatrix.AssignProductOf(Wm, true, decodeOutput, false); + //fprintf(stderr, "debug forwardmerge uttNum = %d, &decodeOutput = %p , 6 \n", uttNum, (void*) (&decodeOutput)); + /* + if (uttFrameNum == 94) + { + + fprintf(stderr, "frowardmerged bm = %f, tempMatrix = %f, debug 5\n", bm.FrobeniusNorm(), tempMatrix.FrobeniusNorm()); + } + */ decodeOutput.AssignSumOf(tempMatrix, bm); + /* + if (uttFrameNum == 94) + { + + fprintf(stderr, "frowardmerged decodeOutput = %f, debug 6\n", decodeOutput.FrobeniusNorm()); + } + */ + //fprintf(stderr, "debug forwardmerge uttNum = %d, &decodeOutput = %p , 7 \n", uttNum, (void*) (&decodeOutput)); //decodeOutput.VectorMax(maxIdx, maxVal, true); decodeOutput.InplaceLogSoftmax(true); + /* + if (uttFrameNum == 94) + { + + fprintf(stderr, "frowardmerged decodeOutput = %f, debug 7\n", decodeOutput.FrobeniusNorm()); + } + */ + //fprintf(stderr, "debug forwardmerge uttNum = %d, &decodeOutput = %p , 8 \n", uttNum, (void*) (&decodeOutput)); } - void forwardmergedSVD(Sequence a, size_t t, Matrix& sumofENandDE, Matrix& encodeOutput, Matrix& decodeOutput, ComputationNodeBasePtr PlusNode, - ComputationNodeBasePtr PlusTransNode, std::vector Plusnodes, std::vector Plustransnodes, Matrix& Wmu, Matrix& Wmv, Matrix& bm, ComputationNetwork& cn) + void forwardmergedSVD(Sequence a, size_t t, const Matrix& encodeOutput, Matrix& decodeOutput, std::vector Plusnodes, std::vector Plustransnodes, const Matrix& Wmu, const Matrix& Wmv, const Matrix& bm, const ComputationNetworkPtr& net) { - sumofENandDE.AssignSumOf(encodeOutput.ColumnSlice(t, 1), *(a.decodeoutput)); - //sumofENandDE.InplaceLogSoftmax(true); - Matrix tempMatrix(encodeOutput.GetDeviceId()), tempMatrix1(encodeOutput.GetDeviceId()); + decodeOutput.AssignSumOf(encodeOutput.ColumnSlice(t, 1), *(a.decodeoutput)); + Matrix tempMatrix(encodeOutput.GetDeviceId()), tempMatrix1(encodeOutput.GetDeviceId()); //broadcast //plus broadcast - (&dynamic_pointer_cast>(PlusNode)->Value())->SetValue(sumofENandDE); - //SumMatrix.SetValue(sumofENandDE); - ComputationNetwork::BumpEvalTimeStamp(Plusnodes); - auto PlusMBlayout = PlusNode->GetMBLayout(); - PlusMBlayout->Init(1, 1); - PlusMBlayout->AddSequence(NEW_SEQUENCE_ID, 0, 0, 1); - // cn.FormEvalOrder(Plustransnodes[0]); - cn.ForwardPropFromTo(Plusnodes, Plustransnodes); - decodeOutput.SetValue(*(&dynamic_pointer_cast>(PlusTransNode)->Value())); + + 
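// The net == null branch just below is the multithreaded path: rather than forward-propping
// the small Plus/PlusTrans joint subnetwork, it applies the joint nonlinearity directly via
// SetToZeroIfLessThan(0), i.e. a ReLU. A minimal standalone sketch of that shortcut, with
// std::vector standing in for CNTK's Matrix (ReluInPlace is a hypothetical name):

#include <algorithm>
#include <cassert>
#include <vector>

void ReluInPlace(std::vector<float>& v)
{
    for (auto& x : v)
        x = std::max(x, 0.0f); // what SetToZeroIfLessThan(0) does elementwise
}

int main()
{
    std::vector<float> decodeOutput = {-1.5f, 0.0f, 2.25f}; // encoder column + prediction output
    ReluInPlace(decodeOutput); // replaces net->ForwardPropFromTo(Plusnodes, Plustransnodes)
    assert(decodeOutput[0] == 0.0f && decodeOutput[2] == 2.25f);
    return 0;
}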
if (!net) + { + decodeOutput.SetToZeroIfLessThan(0); //reLu + } + else + { + (&dynamic_pointer_cast>(Plusnodes[0])->Value())->SetValue(decodeOutput); + //SumMatrix.SetValue(sumofENandDE); + ComputationNetwork::BumpEvalTimeStamp(Plusnodes); + auto PlusMBlayout = Plusnodes[0]->GetMBLayout(); + PlusMBlayout->Init(1, 1); + PlusMBlayout->AddSequence(NEW_SEQUENCE_ID, 0, 0, 1); + + net->ForwardPropFromTo(Plusnodes, Plustransnodes); + decodeOutput.SetValue(*(&dynamic_pointer_cast>(Plustransnodes[0])->Value())); + } + + // fprintf(stderr, "forward merge SVD = %f \n", decodeOutput.FrobeniusNorm()); tempMatrix.AssignProductOf(Wmu, true, decodeOutput, false); tempMatrix1.AssignProductOf(Wmv, true, tempMatrix, false); decodeOutput.AssignSumOf(tempMatrix1, bm); @@ -1710,7 +1884,7 @@ public: } } - float compute_wer(vector& ref, vector& rec) + float compute_wer(const vector& ref, vector& rec) { short** mat; size_t i, j; @@ -1745,221 +1919,60 @@ public: delete[] mat; return wer; } - - void RNNT_decode_nbest_MBR(const std::vector& outputNodeNames, Matrix& encodeOutput, MBLayoutPtr& encodeMBLayout, - Matrix& decodeInputMatrix, MBLayoutPtr& decodeMBLayout, std::vector decodeinputNodes, size_t numBestMBR, bool lengthNorm, const vector& vt_labels, vector>& uttPathsInfo, vector& vt_nws, vector& vt_onebest_wer, - bool SVD, ComputationNetwork& cn) - { - //time_t my_time = time(NULL); - //fprintf(stderr, "RNNT_decode_nbest_MBR time 1 = %s", ctime(&my_time)); - if (outputNodeNames.size() == 0) - fprintf(stderr, "OutputNodeNames are not specified, using the default outputnodes.\n"); - std::vector outputNodes = cn.OutputNodesByName(outputNodeNames); + /* + void RNNT_decode_oneutt_MBR(std::ref(cn), std::ref(vocabSize), std::ref(blankId), std::ref(deviceid), std::ref(uttFrameNum[uttID]), + std::ref(decodeOutputNodeNames), std::ref(decodeInputNodeNames), + std::ref(uttFrameBeginIdx[uttID]), std::ref(uttFrameToChanInd[uttID]), std::ref(numParallelSequences), + std::ref(SVD), std::ref(encondeOutput), std::ref(outputNodeNames), + std::ref(numBestMBR), std::ref(lengthNorm), + std::ref(wordSeqs[uttID]), std::ref(uttPathsInfo[uttID]), std::ref(vt_onebest_wer[uttID]) + */ + void RNNT_decode_oneutt_MBR(const ComputationNetwork& net, const size_t& vocabSize, const size_t& blankId, const size_t& deviceid, const size_t& uttFrameNum, - - //prediction related nodes - std::vector decodeOutputNodeNames(outputNodeNames.begin() + 1, outputNodeNames.begin() + 2); - std::vector decodeOutputNodes = cn.OutputNodesByName(decodeOutputNodeNames); - - std::list pastValueNodes = cn.PastValueNodesForOutputs(decodeOutputNodes); - - std::list::iterator it; - for (it = pastValueNodes.begin(); it != pastValueNodes.end(); ++it) + const std::vector& decodeOutputNodeNames, + const std::vector& decodeInputNodeNames, + const size_t& uttFrameBeginIdx, const size_t& uttFrameToChanInd, const size_t& numParallelSequences, + const bool& SVD, const Matrix& encodeOutput, const std::vector& outputNodeNames, + const size_t& numBestMBR, const bool& lengthNorm, const vector& vt_labels, + const std::vector& wordSeq, vector& oneuttPathsInfo, float& onebest_wer, + const Matrix& Wm, const Matrix& Wmu, const Matrix& Wmv, const Matrix& bm, const size_t uttID) { - auto pastValueNode = dynamic_pointer_cast>(*it); //DelayedValueNodeBase - if (pastValueNode || !(*it)->NodeName().compare(0, 5, L"Loop_")) + vector CurSequences, nextSequences; + ComputationNetwork decode_net; + unordered_map>>> m_nameToPastValueNodeCachePerThread; + time_t my_time; + my_time = time(NULL); + 
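// newSeq/deleteSeq use these per-thread caches so that each worker recycles its own matrices,
// with no cross-thread sharing and hence no locking. A rough sketch of the same
// pool-per-thread idea; Buffer, Acquire, and Release are hypothetical stand-ins, not CNTK APIs:

#include <memory>
#include <string>
#include <unordered_map>
#include <vector>

using Buffer = std::vector<float>;
// One free list per node name; owned by a single thread, so no synchronization is needed.
using PerThreadCache = std::unordered_map<std::wstring, std::vector<std::shared_ptr<Buffer>>>;

std::shared_ptr<Buffer> Acquire(PerThreadCache& cache, const std::wstring& name, size_t size)
{
    auto it = cache.find(name);
    if (it != cache.end() && !it->second.empty())
    {
        auto buf = it->second.back(); // reuse a previously released buffer
        it->second.pop_back();
        return buf;
    }
    return std::make_shared<Buffer>(size); // cache miss: allocate fresh
}

void Release(PerThreadCache& cache, const std::wstring& name, std::shared_ptr<Buffer> buf)
{
    cache[name].push_back(std::move(buf)); // return to this thread's free list
}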
fprintf(stderr, "RNNT_decode_oneutt_MBR time 1 = %s, uttFrameNum = %d, uttID = %d \n", ctime(&my_time), int(uttFrameNum), int(uttID)); + + vector>> m_decodeOutputCachePerThread; + + decode_net.CopySubTree(net, decodeOutputNodeNames[0], L"", CopyNodeFlags::copyNodeAll); + // m_pMBLayout->CopyFrom(m_minibatchBuffer[index].pMBLayout); + decode_net.CompileNetwork(); + std::vector decodeOutputNodes = decode_net.OutputNodesByName(decodeOutputNodeNames); + + decode_net.FormEvalOrder(decodeOutputNodes[0]); + decode_net.FormNestedNetwork(decodeOutputNodes[0]); + + my_time = time(NULL); + fprintf(stderr, "RNNT_decode_oneutt_MBR time 2 = %s, uttFrameNum = %d, uttID = %d \n", ctime(&my_time), int(uttFrameNum), int(uttID)); + + + + for (const auto& node : decode_net.GetAllNodesForRoot(decodeOutputNodes[0])) { - m_nodesToCache.push_back((*it)->NodeName()); - } - } - //joint nodes - ComputationNodeBasePtr PlusNode = cn.GetNodeFromName(outputNodeNames[2]); - ComputationNodeBasePtr PlusTransNode = cn.GetNodeFromName(outputNodeNames[3]); - ComputationNodeBasePtr WmNode, WmuNode, WmvNode, bmNode; - WmNode; - WmuNode; - WmvNode; - if (SVD) - { - WmuNode = cn.GetNodeFromName(outputNodeNames[4]); - WmvNode = cn.GetNodeFromName(outputNodeNames[5]); - bmNode = cn.GetNodeFromName(outputNodeNames[6]); - } - else - { - WmNode = cn.GetNodeFromName(outputNodeNames[4]); - bmNode = cn.GetNodeFromName(outputNodeNames[5]); - } - std::vector Plusnodes, Plustransnodes; - Plusnodes.push_back(PlusNode); - Plustransnodes.push_back(PlusTransNode); - - size_t deviceid = decodeInputMatrix.GetDeviceId(); - std::map outputMatrices; - Matrix decodeOutput(deviceid), Wm(deviceid), Wmu(deviceid), Wmv(deviceid), bm(deviceid), tempMatrix(deviceid); - Matrix greedyOutput(deviceid); - Matrix sumofENandDE(deviceid), maxIdx(deviceid), maxVal(deviceid); - Wmu; - Wmv; - Wm; - if (SVD) - { - Wmu.SetValue(*(&dynamic_pointer_cast>(WmuNode)->Value())); - Wmv.SetValue(*(&dynamic_pointer_cast>(WmvNode)->Value())); - } - else - Wm.SetValue(*(&dynamic_pointer_cast>(WmNode)->Value())); - - bm.SetValue(*(&dynamic_pointer_cast>(bmNode)->Value())); - const size_t numIterationsBeforePrintingProgress = 100; - - //get MBlayer of encoder input - size_t numParallelSequences = encodeMBLayout->GetNumParallelSequences(); - size_t numParallelPhoneSequences = decodeMBLayout->GetNumParallelSequences(); - const auto numSequences = encodeMBLayout->GetNumSequences(); - - std::vector uttFrameBeginIdx, uttPhoneBeginIdx; - // the frame number of each utterance. The size of this vector = the number of all utterances in this minibatch - std::vector uttFrameNum, uttPhoneNum; - // map from utterance ID to minibatch channel ID. We need this because each channel may contain more than one utterance. - std::vector uttFrameToChanInd, uttPhoneToChanInd; - - uttFrameNum.clear(); - uttFrameToChanInd.clear(); - uttFrameBeginIdx.clear(); - - uttFrameNum.reserve(numSequences); - uttFrameToChanInd.reserve(numSequences); - uttFrameBeginIdx.reserve(numSequences); - - uttPhoneNum.clear(); - uttPhoneToChanInd.clear(); - uttPhoneBeginIdx.clear(); - - uttPhoneNum.reserve(numSequences); - uttPhoneToChanInd.reserve(numSequences); - uttPhoneBeginIdx.reserve(numSequences); - uttPathsInfo.clear(); - uttPathsInfo.resize(numSequences); - - vt_nws.clear(); - vt_nws.resize(numSequences); - - vt_onebest_wer.clear(); - vt_onebest_wer.resize(numSequences); - //get utt information, such as channel map id and utt begin frame, utt frame num, utt phone num for frame and phone respectively.... 
- size_t seqId = 0; //frame - size_t totalframenum = 0; - - // this->FormEvalOrder(Plustransnodes[0]); - - for (const auto& seq : encodeMBLayout->GetAllSequences()) - { - if (seq.seqId == GAP_SEQUENCE_ID) - { - continue; - } - assert(seq.seqId == seqId); - seqId++; - uttFrameToChanInd.push_back(seq.s); - size_t numFrames = seq.GetNumTimeSteps(); - uttFrameBeginIdx.push_back(seq.tBegin); - uttFrameNum.push_back(numFrames); - totalframenum += numFrames; - } - - //get utt information for prediction input.... - seqId = 0; //frame - - for (const auto& seq : decodeMBLayout->GetAllSequences()) - { - if (seq.seqId == GAP_SEQUENCE_ID) - { - continue; - } - assert(seq.seqId == seqId); - seqId++; - uttPhoneToChanInd.push_back(seq.s); - size_t numFrames = seq.GetNumTimeSteps(); - uttPhoneBeginIdx.push_back(seq.tBegin); - uttPhoneNum.push_back(numFrames); - } - - //get phone sequene - CNTK::Matrix maxIndex(deviceid), maxValue(deviceid); - decodeInputMatrix.VectorMax(maxIndex, maxValue, true); - maxIndex.TransferToDeviceIfNotThere(CPUDEVICE); - - //backup decoding input matrix and MBlayout - MBLayoutPtr decodebackupMBlayout; - decodebackupMBlayout = make_shared(); - decodebackupMBlayout->CopyFrom(decodeMBLayout); - - Matrix decodeInputMatrixBackup(deviceid); - decodeInputMatrixBackup.SetValue(decodeInputMatrix); - - std::vector> phoneSeqs; - phoneSeqs.resize(numSequences); - for (size_t utt = 0; utt < numSequences; utt++) - { - //phoneSeqs[utt].resize(uttPhoneNum[utt]); - for (size_t u = 0; u < uttPhoneNum[utt]; u++) - { - size_t uID = (u + uttPhoneBeginIdx[utt]) * numParallelPhoneSequences + uttPhoneToChanInd[utt]; - phoneSeqs[utt].push_back((size_t)(maxIndex(0, uID))); - } - } - // convert the phoneSeqs to word sequence, as reference, convert a string of "_ab_cdef_g" to word sequence of "ab cdef g". 
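// The conversion described in the comment above -- "_ab_cdef_g" becomes the word sequence
// "ab cdef g" -- amounts to splitting on '_' and dropping empty tokens. A sketch consistent
// with how convert_word_sequence_string_2_vector is used here (the real helper is defined
// elsewhere in the codebase):

#include <sstream>
#include <string>
#include <vector>

// Split "_ab_cdef_g" on '_' into {"ab", "cdef", "g"}, skipping empty pieces.
std::vector<std::string> SplitWordpieces(const std::string& s, char sep = '_')
{
    std::vector<std::string> words;
    std::stringstream ss(s);
    std::string tok;
    while (std::getline(ss, tok, sep))
        if (!tok.empty())
            words.push_back(tok);
    return words;
}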
- std::vector> wordSeqs; - wordSeqs.resize(numSequences); - for (size_t uttID = 0; uttID < numSequences; uttID++) - { - string word_sequence = ""; - for (size_t i = 0; i < phoneSeqs[uttID].size(); i++) - { - size_t labelID = phoneSeqs[uttID][i]; - if (labelID != (vt_labels.size() - 1)) // it is not + if (node->OperationName().find(L"ReduceElements") != string::npos) { - string wordpiece = vt_labels[labelID]; - word_sequence += wordpiece; + auto rNode = node->As>(); + //rNode->set_frame(uttFrameNum); + rNode->is_multi_thread(true); } } - convert_word_sequence_string_2_vector(word_sequence, wordSeqs[uttID], '_'); - vt_nws[uttID] = wordSeqs[uttID].size(); - /* - fprintf(stderr, "word sequence for uttID = %d .\n", int(uttID)); - for (size_t i = 0; i < wordSeqs[uttID].size(); i++) - { - fprintf(stderr, "%s ", wordSeqs[uttID][i].c_str()); - } - fprintf(stderr, "\n"); - */ - } + std::vector decodeinputNodes = decode_net.OutputNodesByName(decodeInputNodeNames); + StreamMinibatchInputs decodeinputMatrices = DataReaderHelpersFunctions::RetrieveInputMatrices(decodeinputNodes); - // the data structure for phone sequence - - // do decoding for the utterances, and feed in the data structure, - - size_t vocabSize = bm.GetNumRows(); - size_t blankId = vocabSize - 1; - vector CurSequences, nextSequences; - // sanity check - if (vt_labels.size() != vocabSize) - { - RuntimeError("RNNT_decode_nbest_MBR: size not match, vt_labels.size() = %d, and vocabSize = %d.", int(vt_labels.size()), int(vocabSize)); - } - - StreamMinibatchInputs decodeinputMatrices = DataReaderHelpersFunctions::RetrieveInputMatrices(decodeinputNodes); - - // this->FormEvalOrder(Plustransnodes[0]); - - //my_time = time(NULL); - //fprintf(stderr, "RNNT_decode_nbest_MBR time 2 = %s", ctime(&my_time)); - for (size_t uttID = 0; uttID < numSequences; uttID++) - { - // fprintf(stderr, "decode v0 uttID = %d .\n", int(uttID)); nextSequences.clear(); //initialize with blank ID Sequence oneSeq = newSeq(vocabSize, (size_t) 50, deviceid); @@ -1967,46 +1980,128 @@ public: nextSequences.push_back(oneSeq); + Matrix decodeOutput(deviceid); + + std::vector Plusnodes, Plustransnodes; // as a placeholder, will not be used in multithread case + + my_time = time(NULL); + fprintf(stderr, "RNNT_decode_oneutt_MBR time 3 = %s, uttFrameNum = %d, uttID = %d \n", ctime(&my_time), int(uttFrameNum), int(uttID)); + // loop for each frame - for (size_t t = 0; t < uttFrameNum[uttID]; t++) + for (size_t t = 0; t < uttFrameNum; t++) { + //fprintf(stderr, "one utt, uttframenum = %d, t = %d, 1 \n", int(uttFrameNum), int(t)); for (size_t n = 0; n < CurSequences.size(); n++) { - deleteSeq(CurSequences[n]); + deleteSeq(CurSequences[n], m_nameToPastValueNodeCachePerThread, m_decodeOutputCachePerThread); } vector().swap(CurSequences); CurSequences = nextSequences; vector().swap(nextSequences); - //fprintf(stderr, "t = %d .\n", int(t)); - - //deal with the same prefix - //int count = 0; + //fprintf(stderr,"one utt, uttframenum = %d, t = %d, 2 \n", int(uttFrameNum), int(t)); + int count = 0; while (true) { - // fprintf(stderr, "count = %d .\n", int(count++)); + //fprintf(stderr, "while, uttframenum = %d, t = %d, 1 \n", int(uttFrameNum), int(t)); + /* + if (uttFrameNum == 94) + { + fprintf(stderr, "uttframenum = %d, t = %d, count = %d, debug 1 \n", int(uttFrameNum), int(t), count); + fprintf(stderr, "uttframenum = %d, t = %d, curSequences sequence size = %d \n", int(uttFrameNum), int(t), int(CurSequences.size())); + for (size_t n = 0; n < CurSequences.size(); n++) + { + 
fprintf(stderr, "n = %d, logp = %f \n", int(n), CurSequences[n].logP); + } + + fprintf(stderr, "uttframenum = %d, t = %d, next sequence size = %d \n", int(uttFrameNum), int(t), int(nextSequences.size())); + for (size_t n = 0; n < nextSequences.size(); n++) + { + fprintf(stderr, "n = %d, logp = %f \n", int(n), nextSequences[n].logP); + } + } + */ auto maxSeq = std::max_element(CurSequences.begin(), CurSequences.end()); - Sequence tempSeq = newSeq(*maxSeq, deviceid); - deleteSeq(*maxSeq); + Sequence tempSeq = newSeq(*maxSeq, deviceid, m_nameToPastValueNodeCachePerThread, m_decodeOutputCachePerThread); + /* + if (uttFrameNum == 94) + { + fprintf(stderr, "tempSeq.logp = %f, debug 1 \n", tempSeq.logP); + } + */ + deleteSeq(*maxSeq, m_nameToPastValueNodeCachePerThread, m_decodeOutputCachePerThread); CurSequences.erase(maxSeq); + //fprintf(stderr, "while, uttframenum = %d, t = %d, 2 \n", int(uttFrameNum), int(t)); prepareSequence(tempSeq); - forward_decode(tempSeq, decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, tempSeq.labelseq.size(), cn); + /* + if (uttFrameNum == 94) + { + fprintf(stderr, "tempSeq.logp = %f, debug 2 \n", tempSeq.logP); + } + if (uttFrameNum == 94) + { - size_t tinMB = (t + uttFrameBeginIdx[uttID]) * numParallelSequences + uttFrameToChanInd[uttID]; + fprintf(stderr, "uttframenum = %d, t = %d, count = %d, debug 2 \n", int(uttFrameNum), int(t), count); + fprintf(stderr, "uttframenum = %d, t = %d, curSequences sequence size = %d \n", int(uttFrameNum), int(t), int(CurSequences.size())); + for (size_t n = 0; n < CurSequences.size(); n++) + { + fprintf(stderr, "n = %d, logp = %f \n", int(n), CurSequences[n].logP); + } + + fprintf(stderr, "uttframenum = %d, t = %d, next sequence size = %d \n", int(uttFrameNum), int(t), int(nextSequences.size())); + for (size_t n = 0; n < nextSequences.size(); n++) + { + fprintf(stderr, "n = %d, logp = %f \n", int(n), nextSequences[n].logP); + } + } + */ + // mask for debug purpose + forward_decode(tempSeq, decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, tempSeq.labelseq.size(), decode_net, int(uttFrameNum)); + /* + if (uttFrameNum == 94) + { + fprintf(stderr, "tempSeq.logp = %f, debug 3 \n", tempSeq.logP); + } + */ + size_t tinMB = (t + uttFrameBeginIdx) * numParallelSequences + uttFrameToChanInd; + //fprintf(stderr, "while, uttframenum = %d, t = %d, 3 \n", int(uttFrameNum), int(t)); if (SVD) - forwardmergedSVD(tempSeq, tinMB, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes, Wmu, Wmv, bm, cn); + forwardmergedSVD(tempSeq, tinMB, encodeOutput, decodeOutput, Plusnodes, Plustransnodes, Wmu, Wmv, bm, NULL); else - forwardmerged(tempSeq, tinMB, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes, Wm, bm, cn); + forwardmerged(tempSeq, tinMB, encodeOutput, decodeOutput, Plusnodes, Plustransnodes, Wm, bm, NULL, int(uttFrameNum), deviceid); + /* + if (uttFrameNum == 94) + { + ElemType* probdata = decodeOutput.CopyToArray(); + fprintf(stderr, "tempSeq.logp = %f, probdata[blankid] = %f, norm = %f, debug 4 \n", tempSeq.logP, probdata[blankId], decodeOutput.FrobeniusNorm()); + delete probdata; + } + */ + //fprintf(stderr, "while, uttframenum = %d, t = %d, 4 \n", int(uttFrameNum), int(t)); //sort log posterior and get best N labels vector> topN = getTopN(decodeOutput, numBestMBR, blankId); - + //fprintf(stderr, "while, uttframenum = %d, t = %d, 5 \n", int(uttFrameNum), int(t)); //expand blank - Sequence seqK = 
newSeq(tempSeq, deviceid); + Sequence seqK = newSeq(tempSeq, deviceid, m_nameToPastValueNodeCachePerThread, m_decodeOutputCachePerThread); + /* + if (uttFrameNum == 94) + { + fprintf(stderr, "tempSeq.logp = %f, debug 5 \n", tempSeq.logP); + } + */ ElemType newlogP = topN[vocabSize].second + tempSeq.logP; + /* + if (uttFrameNum == 94) + { + fprintf(stderr, "tempSeq.logp = %f, topN = %f, debug 6 \n", tempSeq.logP, topN[vocabSize].second); + } + */ seqK.logP = newlogP; bool existseq = false; + //fprintf(stderr, "while, uttframenum = %d, t = %d, 6 \n", int(uttFrameNum), int(t)); + for (auto itseq = nextSequences.begin(); itseq != nextSequences.end(); itseq++) { //merge the score with same sequence @@ -2017,15 +2112,36 @@ public: break; } } + //fprintf(stderr, "while, uttframenum = %d, t = %d, 7 \n", int(uttFrameNum), int(t)); if (!existseq) { nextSequences.push_back(seqK); } + /* + if (uttFrameNum == 94) + { + + fprintf(stderr, "uttframenum = %d, t = %d, count = %d, debug 3 \n", int(uttFrameNum), int(t), count); + fprintf(stderr, "uttframenum = %d, t = %d, curSequences sequence size = %d \n", int(uttFrameNum), int(t), int(CurSequences.size())); + for (size_t n = 0; n < CurSequences.size(); n++) + { + fprintf(stderr, "n = %d, logp = %f \n", int(n), CurSequences[n].logP); + } + + fprintf(stderr, "uttframenum = %d, t = %d, next sequence size = %d \n", int(uttFrameNum), int(t), int(nextSequences.size())); + for (size_t n = 0; n < nextSequences.size(); n++) + { + fprintf(stderr, "n = %d, logp = %f \n", int(n), nextSequences[n].logP); + } + } + */ + + //fprintf(stderr, "while, uttframenum = %d, t = %d, 8 \n", int(uttFrameNum), int(t)); int iLabel; for (iLabel = 0; iLabel < numBestMBR; iLabel++) { - seqK = newSeq(tempSeq, deviceid); + seqK = newSeq(tempSeq, deviceid, m_nameToPastValueNodeCachePerThread, m_decodeOutputCachePerThread); newlogP = topN[iLabel].second + tempSeq.logP; seqK.logP = newlogP; @@ -2037,30 +2153,111 @@ public: CurSequences.push_back(seqK); } } - vector>().swap(topN); - deleteSeq(tempSeq); + /* + if (uttFrameNum == 94) + { + fprintf(stderr, "uttframenum = %d, t = %d, count = %d, debug 4 \n", int(uttFrameNum), int(t), count); + fprintf(stderr, "uttframenum = %d, t = %d, curSequences sequence size = %d \n", int(uttFrameNum), int(t), int(CurSequences.size())); + for (size_t n = 0; n < CurSequences.size(); n++) + { + fprintf(stderr, "n = %d, logp = %f \n", int(n), CurSequences[n].logP); + } + + fprintf(stderr, "uttframenum = %d, t = %d, next sequence size = %d \n", int(uttFrameNum), int(t), int(nextSequences.size())); + for (size_t n = 0; n < nextSequences.size(); n++) + { + fprintf(stderr, "n = %d, logp = %f \n", int(n), nextSequences[n].logP); + } + } + */ + + //fprintf(stderr, "while, uttframenum = %d, t = %d, 9 \n", int(uttFrameNum), int(t)); + vector>().swap(topN); + deleteSeq(tempSeq, m_nameToPastValueNodeCachePerThread, m_decodeOutputCachePerThread); + //fprintf(stderr, "while, uttframenum = %d, t = %d, 10 \n", int(uttFrameNum), int(t)); + /* + if (uttFrameNum == 94) + { + + fprintf(stderr, "uttframenum = %d, t = %d, debug 5, count = %d, \n", int(uttFrameNum), int(t), count); + fprintf(stderr, "uttframenum = %d, t = %d, curSequences sequence size = %d \n", int(uttFrameNum), int(t), int(CurSequences.size())); + for (size_t n = 0; n < CurSequences.size(); n++) + { + fprintf(stderr, "n = %d, logp = %f \n", int(n), CurSequences[n].logP); + } + + fprintf(stderr, "uttframenum = %d, t = %d, next sequence size = %d \n", int(uttFrameNum), int(t), int(nextSequences.size())); + for 
(size_t n = 0; n < nextSequences.size(); n++) + { + fprintf(stderr, "n = %d, logp = %f \n", int(n), nextSequences[n].logP); + } + } + */ if (CurSequences.size() == 0) break; auto ya = std::max_element(CurSequences.begin(), CurSequences.end()); auto yb = std::max_element(nextSequences.begin(), nextSequences.end()); if (nextSequences.size() > numBestMBR && yb->logP > ya->logP) break; + //fprintf(stderr, "while, uttframenum = %d, t = %d, 11 \n", int(uttFrameNum), int(t)); + count++; } + //fprintf(stderr,"one utt, uttframenum = %d, t = %d, 3 \n", int(uttFrameNum), int(t)); std::sort(nextSequences.begin(), nextSequences.end()); std::reverse(nextSequences.begin(), nextSequences.end()); if (nextSequences.size() > numBestMBR) { for (size_t n = numBestMBR; n < nextSequences.size(); n++) { - deleteSeq(nextSequences[n]); + deleteSeq(nextSequences[n], m_nameToPastValueNodeCachePerThread, m_decodeOutputCachePerThread); } } + /* + if (uttFrameNum == 94) + { + + fprintf(stderr, "uttframenum = %d, t = %d, count = %d, debug 6 \n", int(uttFrameNum), int(t), count); + fprintf(stderr, "uttframenum = %d, t = %d, curSequences sequence size = %d \n", int(uttFrameNum), int(t), int(CurSequences.size())); + for (size_t n = 0; n < CurSequences.size(); n++) + { + fprintf(stderr, "n = %d, logp = %f \n", int(n), CurSequences[n].logP); + } + + fprintf(stderr, "uttframenum = %d, t = %d, next sequence size = %d \n", int(uttFrameNum), int(t), int(nextSequences.size())); + for (size_t n = 0; n < nextSequences.size(); n++) + { + fprintf(stderr, "n = %d, logp = %f \n", int(n), nextSequences[n].logP); + } + } + */ + //fprintf(stderr,"one utt, uttframenum = %d, t = %d, 4 \n", int(uttFrameNum), int(t)); for (size_t iseq = nextSequences.size(); iseq > numBestMBR; iseq--) nextSequences.pop_back(); - } + //fprintf(stderr, "one utt, uttframenum = %d, t = %d, 5 \n", int(uttFrameNum), int(t)); + /* + if (uttFrameNum == 94) + { + fprintf(stderr, "uttframenum = %d, t = %d, count = %d, debug 7 \n", int(uttFrameNum), int(t), count); + fprintf(stderr, "uttframenum = %d, t = %d, curSequences sequence size = %d \n", int(uttFrameNum), int(t), int(CurSequences.size())); + for (size_t n = 0; n < CurSequences.size(); n++) + { + fprintf(stderr, "n = %d, logp = %f \n", int(n), CurSequences[n].logP); + } + + fprintf(stderr, "uttframenum = %d, t = %d, next sequence size = %d \n", int(uttFrameNum), int(t), int(nextSequences.size())); + for (size_t n = 0; n < nextSequences.size(); n++) + { + fprintf(stderr, "n = %d, logp = %f \n", int(n), nextSequences[n].logP); + } + } + */ + } + //fprintf(stderr, "one utt, uttframenum = %d, 6 \n", int(uttFrameNum)); //nbest output + my_time = time(NULL); + fprintf(stderr, "RNNT_decode_oneutt_MBR time 4 = %s, uttFrameNum = %d, uttID = %d \n", ctime(&my_time), int(uttFrameNum), int(uttID)); if (nextSequences.size() != 0) { @@ -2113,63 +2310,648 @@ public: vector vt_words; convert_word_sequence_string_2_vector(word_sequence, vt_words, '_'); - pi.WER = compute_wer(wordSeqs[uttID], vt_words); + pi.WER = compute_wer(wordSeq, vt_words); pi.label_seq = nextSequences[n].labelseq; - uttPathsInfo[uttID].push_back(pi); + oneuttPathsInfo.push_back(pi); } - vt_onebest_wer[uttID] = uttPathsInfo[uttID][onebest_index].WER; + onebest_wer = oneuttPathsInfo[onebest_index].WER; } + //fprintf(stderr, "one utt, uttframenum = %d, 7 \n", int(uttFrameNum)); + for (size_t n = 0; n < CurSequences.size(); n++) { - deleteSeq(CurSequences[n]); + deleteSeq(CurSequences[n], m_nameToPastValueNodeCachePerThread, m_decodeOutputCachePerThread); } + 
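// When an expansion reproduces an already-queued label sequence, the merge loop above combines
// the two scores with decodeOutput.LogAdd instead of keeping duplicates; numerically that is
// log(exp(a) + exp(b)) computed without overflow. A standalone sketch of that merge:

#include <algorithm>
#include <cassert>
#include <cmath>

// Stable log(exp(a) + exp(b)): factor out the max so the exp can never overflow.
double LogAdd(double a, double b)
{
    double hi = std::max(a, b), lo = std::min(a, b);
    return hi + std::log1p(std::exp(lo - hi));
}

int main()
{
    // Two hypotheses with probability 0.5 each merge to probability 1.0:
    double merged = LogAdd(std::log(0.5), std::log(0.5));
    assert(std::abs(merged) < 1e-12);
    return 0;
}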
//fprintf(stderr, "one utt, uttframenum = %d, 8 \n", int(uttFrameNum)); + vector().swap(CurSequences); for (size_t n = 0; n < nextSequences.size(); n++) { - deleteSeq(nextSequences[n]); + deleteSeq(nextSequences[n], m_nameToPastValueNodeCachePerThread, m_decodeOutputCachePerThread); } + //fprintf(stderr, "one utt, uttframenum = %d, 9 \n", int(uttFrameNum)); + vector().swap(nextSequences); + my_time = time(NULL); + fprintf(stderr, "RNNT_decode_oneutt_MBR time 5 = %s, uttFrameNum = %d, uttID = %d \n", ctime(&my_time), int(uttFrameNum), int(uttID)); + // end here + } - } // end of for loop - decodeInputMatrix.SetValue(decodeInputMatrixBackup); - //decodeInputMatrix.Print("after ss"); - decodeMBLayout->CopyFrom(decodebackupMBlayout); - //my_time = time(NULL); - //fprintf(stderr, "RNNT_decode_nbest_MBR time 3 = %s", ctime(&my_time)); + void RNNT_decode_nbest_MBR_Multithread(const std::vector& outputNodeNames, Matrix& encodeOutput, MBLayoutPtr& encodeMBLayout, + Matrix& decodeInputMatrix, MBLayoutPtr& decodeMBLayout, const std::vector decodeInputNodeNames, + size_t numBestMBR, bool lengthNorm, const vector& vt_labels, vector>& uttPathsInfo, vector& vt_nws, vector& vt_onebest_wer, + bool SVD, ComputationNetwork& net, + const Matrix& Wm, const Matrix& Wmu, const Matrix& Wmv, const Matrix& bm) /*, size_t num_utt, size_t start_utt) */ + { + + time_t my_time = time(NULL); + fprintf(stderr, "RNNT_decode_nbest_MBR_Multithread time 1 = %s \n", ctime(&my_time)); + + if (outputNodeNames.size() == 0) + fprintf(stderr, "OutputNodeNames are not specified, using the default outputnodes.\n"); + + //prediction related nodes + std::vector decodeOutputNodeNames(outputNodeNames.begin() + 1, outputNodeNames.begin() + 2); + + std::vector decodeOutputNodes; + + std::list pastValueNodes; + decodeOutputNodes = net.OutputNodesByName(decodeOutputNodeNames); + pastValueNodes = net.PastValueNodesForOutputs(decodeOutputNodes); + + std::list::iterator it; + for (it = pastValueNodes.begin(); it != pastValueNodes.end(); ++it) + { + auto pastValueNode = dynamic_pointer_cast>(*it); //DelayedValueNodeBase + if (pastValueNode || !(*it)->NodeName().compare(0, 5, L"Loop_")) + { + m_nodesToCache.push_back((*it)->NodeName()); + } + } + //joint nodes + + int deviceid = decodeInputMatrix.GetDeviceId(); + std::map outputMatrices; + + Matrix maxIdx(deviceid), maxVal(deviceid); + + const size_t numIterationsBeforePrintingProgress = 100; + + //get MBlayer of encoder input + size_t numParallelSequences = encodeMBLayout->GetNumParallelSequences(); + size_t numParallelPhoneSequences = decodeMBLayout->GetNumParallelSequences(); + const auto numSequences = encodeMBLayout->GetNumSequences(); + + std::vector uttFrameBeginIdx, uttPhoneBeginIdx; + // the frame number of each utterance. The size of this vector = the number of all utterances in this minibatch + std::vector uttFrameNum, uttPhoneNum; + // map from utterance ID to minibatch channel ID. We need this because each channel may contain more than one utterance. 
+ std::vector uttFrameToChanInd, uttPhoneToChanInd; + + uttFrameNum.clear(); + uttFrameToChanInd.clear(); + uttFrameBeginIdx.clear(); + + uttFrameNum.reserve(numSequences); + uttFrameToChanInd.reserve(numSequences); + uttFrameBeginIdx.reserve(numSequences); + + uttPhoneNum.clear(); + uttPhoneToChanInd.clear(); + uttPhoneBeginIdx.clear(); + + uttPhoneNum.reserve(numSequences); + uttPhoneToChanInd.reserve(numSequences); + uttPhoneBeginIdx.reserve(numSequences); + uttPathsInfo.clear(); + uttPathsInfo.resize(numSequences); + + vt_nws.clear(); + vt_nws.resize(numSequences); + + vt_onebest_wer.clear(); + vt_onebest_wer.resize(numSequences); + //get utt information, such as channel map id and utt begin frame, utt frame num, utt phone num for frame and phone respectively.... + size_t seqId = 0; //frame + size_t totalframenum = 0; + + // this->FormEvalOrder(Plustransnodes[0]); + + for (const auto& seq : encodeMBLayout->GetAllSequences()) + { + if (seq.seqId == GAP_SEQUENCE_ID) + { + continue; + } + assert(seq.seqId == seqId); + seqId++; + uttFrameToChanInd.push_back(seq.s); + size_t numFrames = seq.GetNumTimeSteps(); + uttFrameBeginIdx.push_back(seq.tBegin); + uttFrameNum.push_back(numFrames); + totalframenum += numFrames; + } + + //get utt information for prediction input.... + seqId = 0; //frame + + for (const auto& seq : decodeMBLayout->GetAllSequences()) + { + if (seq.seqId == GAP_SEQUENCE_ID) + { + continue; + } + assert(seq.seqId == seqId); + seqId++; + uttPhoneToChanInd.push_back(seq.s); + size_t numFrames = seq.GetNumTimeSteps(); + uttPhoneBeginIdx.push_back(seq.tBegin); + uttPhoneNum.push_back(numFrames); + } + + //get phone sequene + CNTK::Matrix maxIndex(deviceid), maxValue(deviceid); + decodeInputMatrix.VectorMax(maxIndex, maxValue, true); + maxIndex.TransferToDeviceIfNotThere(CPUDEVICE); + + //backup decoding input matrix and MBlayout + MBLayoutPtr decodebackupMBlayout; + decodebackupMBlayout = make_shared(); + decodebackupMBlayout->CopyFrom(decodeMBLayout); + + Matrix decodeInputMatrixBackup(deviceid); + decodeInputMatrixBackup.SetValue(decodeInputMatrix); + + std::vector> phoneSeqs; + phoneSeqs.resize(numSequences); + for (size_t utt = 0; utt < numSequences; utt++) + { + //phoneSeqs[utt].resize(uttPhoneNum[utt]); + for (size_t u = 0; u < uttPhoneNum[utt]; u++) + { + size_t uID = (u + uttPhoneBeginIdx[utt]) * numParallelPhoneSequences + uttPhoneToChanInd[utt]; + phoneSeqs[utt].push_back((size_t)(maxIndex(0, uID))); + } + } + // convert the phoneSeqs to word sequence, as reference, convert a string of "_ab_cdef_g" to word sequence of "ab cdef g". 
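// A few lines below, the function fans out one std::thread per utterance and joins them all
// before restoring the minibatch state. A reduced sketch of that dispatch pattern;
// DecodeOneUtt is a simplified hypothetical stand-in for RNNT_decode_oneutt_MBR:

#include <cstdio>
#include <functional>
#include <thread>
#include <vector>

void DecodeOneUtt(const std::vector<float>& encodeOutput, float& oneBestWer, size_t uttID)
{
    // ... the per-utterance beam search would run here ...
    oneBestWer = 0.0f;
    std::printf("decoded utt %zu\n", uttID);
}

int main()
{
    const size_t numSequences = 4;
    std::vector<float> encodeOutput(1024); // shared, read-only across workers
    std::vector<float> wer(numSequences);  // one writable slot per worker, no contention
    std::vector<std::thread> threads(numSequences);
    for (size_t uttID = 0; uttID < numSequences; ++uttID)
        threads[uttID] = std::thread(DecodeOneUtt, std::cref(encodeOutput), std::ref(wer[uttID]), uttID);
    for (auto& t : threads)
        t.join(); // all utterances finish before the minibatch state is restored
    return 0;
}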
+ std::vector> wordSeqs; + wordSeqs.resize(numSequences); + for (size_t uttID = 0; uttID < numSequences; uttID++) + { + string word_sequence = ""; + for (size_t i = 0; i < phoneSeqs[uttID].size(); i++) + { + size_t labelID = phoneSeqs[uttID][i]; + if (labelID != (vt_labels.size() - 1)) // it is not + { + string wordpiece = vt_labels[labelID]; + word_sequence += wordpiece; + } + } + convert_word_sequence_string_2_vector(word_sequence, wordSeqs[uttID], '_'); + + vt_nws[uttID] = wordSeqs[uttID].size(); + } + + // the data structure for phone sequence + + // do decoding for the utterances, and feed in the data structure, + + size_t vocabSize = vt_labels.size(); + size_t blankId = vocabSize - 1; + + // this->FormEvalOrder(Plustransnodes[0]); + + my_time = time(NULL); + fprintf(stderr, "RNNT_decode_nbest_MBR_Multithread time 2 = %s, numSequences = %d, uttFrameNum = %d \n ", ctime(&my_time), int(numSequences), int(uttFrameNum[0])); + std::vector vt_threads(numSequences); + deviceid = CPUDEVICE; + for (size_t uttID = 0; uttID < numSequences; uttID++) + //for (size_t uttID = start_utt; uttID < num_utt; uttID++) + { + vt_threads[uttID] = std::thread(&RNNTDecodeFunctions::RNNT_decode_oneutt_MBR, this, std::ref(net), std::ref(vocabSize), std::ref(blankId), std::ref(deviceid), std::ref(uttFrameNum[uttID]), + std::ref(decodeOutputNodeNames), std::ref(decodeInputNodeNames), + std::ref(uttFrameBeginIdx[uttID]), std::ref(uttFrameToChanInd[uttID]), std::ref(numParallelSequences), + std::ref(SVD), std::ref(encodeOutput), std::ref(outputNodeNames), + std::ref(numBestMBR), std::ref(lengthNorm), std::ref(vt_labels), + std::ref(wordSeqs[uttID]), std::ref(uttPathsInfo[uttID]), std::ref(vt_onebest_wer[uttID]), + std::ref(Wm), std::ref(Wmu), std::ref(Wmv), std::ref(bm), (uttID)); + + } // end of for loop + + for (size_t uttID = 0; uttID < numSequences; uttID++) + // for (size_t uttID = start_utt; uttID < num_utt; uttID++) + { + vt_threads[uttID].join(); + } + decodeInputMatrix.SetValue(decodeInputMatrixBackup); + //decodeInputMatrix.Print("after ss"); + decodeMBLayout->CopyFrom(decodebackupMBlayout); + my_time = time(NULL); + fprintf(stderr, "RNNT_decode_nbest_MBR_Multithread time 3 = %s \n", ctime(&my_time)); + } + + void RNNT_decode_nbest_MBR(const std::vector& outputNodeNames, Matrix& encodeOutput, MBLayoutPtr& encodeMBLayout, + Matrix& decodeInputMatrix, MBLayoutPtr& decodeMBLayout, std::vector decodeinputNodes, size_t numBestMBR, bool lengthNorm, const vector& vt_labels, vector>& uttPathsInfo, vector& vt_nws, vector& vt_onebest_wer, + bool SVD, const ComputationNetworkPtr& net) + { + time_t my_time = time(NULL); + fprintf(stderr, "RNNT_decode_nbest_MBR time 1 = %s \n", ctime(&my_time)); + + if (outputNodeNames.size() == 0) + fprintf(stderr, "OutputNodeNames are not specified, using the default outputnodes.\n"); + std::vector outputNodes = net->OutputNodesByName(outputNodeNames); + + //prediction related nodes + std::vector decodeOutputNodeNames(outputNodeNames.begin() + 1, outputNodeNames.begin() + 2); + std::vector decodeOutputNodes = net->OutputNodesByName(decodeOutputNodeNames); + + std::list pastValueNodes = net->PastValueNodesForOutputs(decodeOutputNodes); + + std::list::iterator it; + for (it = pastValueNodes.begin(); it != pastValueNodes.end(); ++it) + { + auto pastValueNode = dynamic_pointer_cast>(*it); //DelayedValueNodeBase + if (pastValueNode || !(*it)->NodeName().compare(0, 5, L"Loop_")) + { + m_nodesToCache.push_back((*it)->NodeName()); + } + } + //joint nodes + ComputationNodeBasePtr WmNode, 
WmuNode, WmvNode, bmNode; + WmNode; + WmuNode; + WmvNode; + if (SVD) + { + WmuNode = net->GetNodeFromName(outputNodeNames[4]); + WmvNode = net->GetNodeFromName(outputNodeNames[5]); + bmNode = net->GetNodeFromName(outputNodeNames[6]); + } + else + { + WmNode = net->GetNodeFromName(outputNodeNames[4]); + bmNode = net->GetNodeFromName(outputNodeNames[5]); + } + std::vector Plusnodes, Plustransnodes; + Plusnodes.push_back(net->GetNodeFromName(outputNodeNames[2])); + Plustransnodes.push_back(net->GetNodeFromName(outputNodeNames[3])); + + size_t deviceid = decodeInputMatrix.GetDeviceId(); + std::map outputMatrices; + Matrix decodeOutput(deviceid), Wm(deviceid), Wmu(deviceid), Wmv(deviceid), bm(deviceid); + + Matrix maxIdx(deviceid), maxVal(deviceid); + Wmu; + Wmv; + Wm; + if (SVD) + { + Wmu.SetValue(*(&dynamic_pointer_cast>(WmuNode)->Value())); + Wmv.SetValue(*(&dynamic_pointer_cast>(WmvNode)->Value())); + } + else + Wm.SetValue(*(&dynamic_pointer_cast>(WmNode)->Value())); + + bm.SetValue(*(&dynamic_pointer_cast>(bmNode)->Value())); + const size_t numIterationsBeforePrintingProgress = 100; + + //get MBlayer of encoder input + size_t numParallelSequences = encodeMBLayout->GetNumParallelSequences(); + size_t numParallelPhoneSequences = decodeMBLayout->GetNumParallelSequences(); + const auto numSequences = encodeMBLayout->GetNumSequences(); + + std::vector uttFrameBeginIdx, uttPhoneBeginIdx; + // the frame number of each utterance. The size of this vector = the number of all utterances in this minibatch + std::vector uttFrameNum, uttPhoneNum; + // map from utterance ID to minibatch channel ID. We need this because each channel may contain more than one utterance. + std::vector uttFrameToChanInd, uttPhoneToChanInd; + + uttFrameNum.clear(); + uttFrameToChanInd.clear(); + uttFrameBeginIdx.clear(); + + uttFrameNum.reserve(numSequences); + uttFrameToChanInd.reserve(numSequences); + uttFrameBeginIdx.reserve(numSequences); + + uttPhoneNum.clear(); + uttPhoneToChanInd.clear(); + uttPhoneBeginIdx.clear(); + + uttPhoneNum.reserve(numSequences); + uttPhoneToChanInd.reserve(numSequences); + uttPhoneBeginIdx.reserve(numSequences); + uttPathsInfo.clear(); + uttPathsInfo.resize(numSequences); + + vt_nws.clear(); + vt_nws.resize(numSequences); + + vt_onebest_wer.clear(); + vt_onebest_wer.resize(numSequences); + //get utt information, such as channel map id and utt begin frame, utt frame num, utt phone num for frame and phone respectively.... + size_t seqId = 0; //frame + size_t totalframenum = 0; + + // this->FormEvalOrder(Plustransnodes[0]); + + for (const auto& seq : encodeMBLayout->GetAllSequences()) + { + if (seq.seqId == GAP_SEQUENCE_ID) + { + continue; + } + assert(seq.seqId == seqId); + seqId++; + uttFrameToChanInd.push_back(seq.s); + size_t numFrames = seq.GetNumTimeSteps(); + uttFrameBeginIdx.push_back(seq.tBegin); + uttFrameNum.push_back(numFrames); + totalframenum += numFrames; + } + + //get utt information for prediction input.... 
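// In the SVD branch, the joint projection Wm is stored as two low-rank factors, and
// forwardmergedSVD computes y = Wmv^T (Wmu^T x) + bm: two skinny products instead of one
// large one. A dimension-level sketch, with plain loops standing in for AssignProductOf:

#include <vector>

using Vec = std::vector<float>;
using Mat = std::vector<Vec>; // row-major: Mat[r][c]

// y = M^T x  (M is rows x cols; x has 'rows' entries; y gets 'cols' entries)
Vec TransposeTimes(const Mat& M, const Vec& x)
{
    Vec y(M[0].size(), 0.0f);
    for (size_t r = 0; r < M.size(); ++r)
        for (size_t c = 0; c < y.size(); ++c)
            y[c] += M[r][c] * x[r];
    return y;
}

// SVD path: Wm (d x V) factored as Wmu (d x r) times Wmv (r x V), with r << min(d, V).
// Per frame, the cost drops from d*V multiplies to r*(d + V).
Vec JointProjectSVD(const Mat& Wmu, const Mat& Wmv, const Vec& bm, const Vec& x)
{
    Vec t = TransposeTimes(Wmu, x); // r-dim bottleneck, as tempMatrix above
    Vec y = TransposeTimes(Wmv, t); // back up to vocabulary size
    for (size_t i = 0; i < y.size(); ++i)
        y[i] += bm[i];
    return y;
}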
+ seqId = 0; //frame + + for (const auto& seq : decodeMBLayout->GetAllSequences()) + { + if (seq.seqId == GAP_SEQUENCE_ID) + { + continue; + } + assert(seq.seqId == seqId); + seqId++; + uttPhoneToChanInd.push_back(seq.s); + size_t numFrames = seq.GetNumTimeSteps(); + uttPhoneBeginIdx.push_back(seq.tBegin); + uttPhoneNum.push_back(numFrames); + } + + //get phone sequene + CNTK::Matrix maxIndex(deviceid), maxValue(deviceid); + decodeInputMatrix.VectorMax(maxIndex, maxValue, true); + maxIndex.TransferToDeviceIfNotThere(CPUDEVICE); + + //backup decoding input matrix and MBlayout + MBLayoutPtr decodebackupMBlayout; + decodebackupMBlayout = make_shared(); + decodebackupMBlayout->CopyFrom(decodeMBLayout); + + Matrix decodeInputMatrixBackup(deviceid); + decodeInputMatrixBackup.SetValue(decodeInputMatrix); + + std::vector> phoneSeqs; + phoneSeqs.resize(numSequences); + for (size_t utt = 0; utt < numSequences; utt++) + { + //phoneSeqs[utt].resize(uttPhoneNum[utt]); + for (size_t u = 0; u < uttPhoneNum[utt]; u++) + { + size_t uID = (u + uttPhoneBeginIdx[utt]) * numParallelPhoneSequences + uttPhoneToChanInd[utt]; + phoneSeqs[utt].push_back((size_t)(maxIndex(0, uID))); + } + } + // convert the phoneSeqs to word sequence, as reference, convert a string of "_ab_cdef_g" to word sequence of "ab cdef g". + std::vector> wordSeqs; + wordSeqs.resize(numSequences); + for (size_t uttID = 0; uttID < numSequences; uttID++) + { + string word_sequence = ""; + for (size_t i = 0; i < phoneSeqs[uttID].size(); i++) + { + size_t labelID = phoneSeqs[uttID][i]; + if (labelID != (vt_labels.size() - 1)) // it is not + { + string wordpiece = vt_labels[labelID]; + word_sequence += wordpiece; + } + } + convert_word_sequence_string_2_vector(word_sequence, wordSeqs[uttID], '_'); + + vt_nws[uttID] = wordSeqs[uttID].size(); + } + + // the data structure for phone sequence + + // do decoding for the utterances, and feed in the data structure, + + size_t vocabSize = bm.GetNumRows(); + size_t blankId = vocabSize - 1; + vector CurSequences, nextSequences; + // sanity check + if (vt_labels.size() != vocabSize) + { + RuntimeError("RNNT_decode_nbest_MBR: size not match, vt_labels.size() = %d, and vocabSize = %d.", int(vt_labels.size()), int(vocabSize)); + } + + StreamMinibatchInputs decodeinputMatrices = DataReaderHelpersFunctions::RetrieveInputMatrices(decodeinputNodes); + + // this->FormEvalOrder(Plustransnodes[0]); + + my_time = time(NULL); + fprintf(stderr, "RNNT_decode_nbest_MBR time 2 = %s, num_sequence = %d \n", ctime(&my_time), int(numSequences)); + + for (size_t uttID = 0; uttID < numSequences; uttID++) + { + // fprintf(stderr, "decode v0 uttID = %d .\n", int(uttID)); + nextSequences.clear(); + //initialize with blank ID + Sequence oneSeq = newSeq(vocabSize, (size_t) 50, deviceid); + extendSeq(oneSeq, blankId, 0.0); + + nextSequences.push_back(oneSeq); + + // loop for each frame + for (size_t t = 0; t < uttFrameNum[uttID]; t++) + { + for (size_t n = 0; n < CurSequences.size(); n++) + { + deleteSeq(CurSequences[n]); + } + vector().swap(CurSequences); + CurSequences = nextSequences; + + vector().swap(nextSequences); + //fprintf(stderr, "t = %d .\n", int(t)); + + //deal with the same prefix + //int count = 0; + while (true) + { + // fprintf(stderr, "count = %d .\n", int(count++)); + + auto maxSeq = std::max_element(CurSequences.begin(), CurSequences.end()); + Sequence tempSeq = newSeq(*maxSeq, deviceid); + deleteSeq(*maxSeq); + CurSequences.erase(maxSeq); + prepareSequence(tempSeq); + forward_decode(tempSeq, decodeinputMatrices, 
deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, tempSeq.labelseq.size(), *net); + + size_t tinMB = (t + uttFrameBeginIdx[uttID]) * numParallelSequences + uttFrameToChanInd[uttID]; + if (SVD) + forwardmergedSVD(tempSeq, tinMB, encodeOutput, decodeOutput, Plusnodes, Plustransnodes, Wmu, Wmv, bm, net); + else + forwardmerged(tempSeq, tinMB, encodeOutput, decodeOutput, Plusnodes, Plustransnodes, Wm, bm, net); + + //sort log posterior and get best N labels + vector> topN = getTopN(decodeOutput, numBestMBR, blankId); + + //expand blank + Sequence seqK = newSeq(tempSeq, deviceid); + ElemType newlogP = topN[vocabSize].second + tempSeq.logP; + seqK.logP = newlogP; + bool existseq = false; + for (auto itseq = nextSequences.begin(); itseq != nextSequences.end(); itseq++) + { + //merge the score with same sequence + if (seqK.labelseq == itseq->labelseq) + { + existseq = true; + itseq->logP = decodeOutput.LogAdd(seqK.logP, itseq->logP); + break; + } + } + if (!existseq) + { + nextSequences.push_back(seqK); + } + int iLabel; + for (iLabel = 0; iLabel < numBestMBR; iLabel++) + { + + seqK = newSeq(tempSeq, deviceid); + newlogP = topN[iLabel].second + tempSeq.logP; + seqK.logP = newlogP; + + if (topN[iLabel].first != blankId) + + { + extendSeq(seqK, topN[iLabel].first, newlogP); + + CurSequences.push_back(seqK); + } + } + vector>().swap(topN); + deleteSeq(tempSeq); + + if (CurSequences.size() == 0) + break; + auto ya = std::max_element(CurSequences.begin(), CurSequences.end()); + auto yb = std::max_element(nextSequences.begin(), nextSequences.end()); + if (nextSequences.size() > numBestMBR && yb->logP > ya->logP) + break; + } + std::sort(nextSequences.begin(), nextSequences.end()); + std::reverse(nextSequences.begin(), nextSequences.end()); + if (nextSequences.size() > numBestMBR) + { + for (size_t n = numBestMBR; n < nextSequences.size(); n++) + { + deleteSeq(nextSequences[n]); + } + } + for (size_t iseq = nextSequences.size(); iseq > numBestMBR; iseq--) + nextSequences.pop_back(); + } + + //nbest output + + if (nextSequences.size() != 0) + { + float totalProb = 0; + + ElemType onebest_lnLogP = ElemType(nextSequences[0].logP / nextSequences[0].labelseq.size()); + size_t onebest_index = 0; + + ElemType lnLogP; + for (size_t n = 0; n < nextSequences.size(); n++) + { + if (n == 0) + { + lnLogP = onebest_lnLogP; + } + else + { + lnLogP = ElemType(nextSequences[n].logP / nextSequences[n].labelseq.size()); + if (lnLogP > onebest_lnLogP) + { + onebest_lnLogP = lnLogP; + onebest_index = n; + } + } + + if (lengthNorm) + nextSequences[n].logP = lnLogP; + + nextSequences[n].logP = exp(nextSequences[n].logP); // the logP actually becomes P + totalProb += float(nextSequences[n].logP); + } + + for (size_t n = 0; n < nextSequences.size(); n++) + { + PathInfo pi; + pi.prob = float(nextSequences[n].logP / totalProb); + + string word_sequence = ""; + for (size_t k = 0; k < nextSequences[n].length - 1; k++) + { + size_t labelID = nextSequences[n].labelseq[k + 1]; + if (labelID != (vt_labels.size() - 1)) // it is not + { + + string wordpiece = vt_labels[labelID]; + word_sequence += wordpiece; + } + } + + vector vt_words; + convert_word_sequence_string_2_vector(word_sequence, vt_words, '_'); + + pi.WER = compute_wer(wordSeqs[uttID], vt_words); + + pi.label_seq = nextSequences[n].labelseq; + + uttPathsInfo[uttID].push_back(pi); + } + vt_onebest_wer[uttID] = uttPathsInfo[uttID][onebest_index].WER; + } + for (size_t n = 0; n < CurSequences.size(); n++) + { + deleteSeq(CurSequences[n]); + } + 
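// The n-best block above turns path log-scores into posteriors: optionally length-normalize
// logP, exponentiate, then divide by the sum, keeping the argmax of the length-normalized
// score as the one-best. A compact sketch; Path and NormalizeNBest are hypothetical names:

#include <cmath>
#include <cstddef>
#include <vector>

struct Path { double logP; size_t len; double prob; };

// Length-normalize (optionally), exponentiate, and renormalize to posteriors.
// Returns the index of the one-best (length-normalized) hypothesis.
size_t NormalizeNBest(std::vector<Path>& paths, bool lengthNorm)
{
    size_t best = 0;
    double total = 0.0;
    for (size_t n = 0; n < paths.size(); ++n)
    {
        double score = paths[n].logP / double(paths[n].len);
        if (score > paths[best].logP / double(paths[best].len))
            best = n;
        paths[n].prob = std::exp(lengthNorm ? score : paths[n].logP);
        total += paths[n].prob;
    }
    for (auto& p : paths)
        p.prob /= total; // posteriors now sum to 1 across the n-best list
    return best;
}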
+            vector<Sequence>().swap(CurSequences);
+            for (size_t n = 0; n < nextSequences.size(); n++)
+            {
+                deleteSeq(nextSequences[n]);
+            }
+            vector<Sequence>().swap(nextSequences);
+            // end here
+            my_time = time(NULL);
+
+            fprintf(stderr, "RNNT_decode_nbest_MBR time 3 = %s, uttID = %d \n", ctime(&my_time), int(uttID));
+
+        } // end of for loop
+        decodeInputMatrix.SetValue(decodeInputMatrixBackup);
+        //decodeInputMatrix.Print("after ss");
+        decodeMBLayout->CopyFrom(decodebackupMBlayout);
+        my_time = time(NULL);
+        fprintf(stderr, "RNNT_decode_nbest_MBR time 4 = %s \n", ctime(&my_time));
+    }
+};
+
+// helper that returns 'float' or 'double' depending on ElemType
+template <class ElemType>
+static inline const wchar_t* ElemTypeName();
+template <>
+/*static*/
+inline const wchar_t* ElemTypeName<float>()
+{
+    return L"float";
+}
+template <>
+/*static*/ inline const wchar_t* ElemTypeName<double>()
+{
+    return L"double";
+}
+template <>
+/*static*/ inline const wchar_t* ElemTypeName<half>()
+{
+    return L"half"; }
-};
-// helper that returns 'float' or 'double' depending on ElemType
-template <class ElemType>
-static inline const wchar_t* ElemTypeName();
-template <>
-/*static*/ inline const wchar_t* ElemTypeName<float>()
-{
-    return L"float";
-}
-template <>
-/*static*/ inline const wchar_t* ElemTypeName<double>()
-{
-    return L"double";
-}
-template <>
-/*static*/ inline const wchar_t* ElemTypeName<half>()
-{
-    return L"half";
-}
+// The following emits the class and enables the BaseMatrix<ElemType> to be available (used by EvalDll)
+// The corresponding Matrix<ElemType> is emitted in the SetDeviceId function above.
+template class Matrix<float>;
+template class Matrix<double>;
-// The following emits the class and enables the BaseMatrix<ElemType> to be available (used by EvalDll)
-// The corresponding Matrix<ElemType> is emitted in the SetDeviceId function above.
-template class Matrix<float>;
-template class Matrix<double>;
-
-// TODOs:
-// - automatic inference of time window w.r.t. delay nodes (and related nodes such as a temporal pooling)
-// - have overrides of RuntimeError etc. in ComputationNode, which prepend the error string with the node name and operation
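[Review note] On the 'template class Matrix<float>;' lines above: the Matrix method bodies live in .cpp/.cu files, so these explicit instantiations force the compiler to emit all members into this translation unit, which is what lets EvalDll-style consumers link without seeing the definitions. A minimal illustration of the pattern (Buffer is a made-up name):

    // buffer.h -- declaration only
    template <class ElemType>
    struct Buffer
    {
        ElemType value();
    };

    // buffer.cpp -- definition plus explicit instantiations
    template <class ElemType>
    ElemType Buffer<ElemType>::value()
    {
        return ElemType(0);
    }
    template struct Buffer<float>;  // emits Buffer<float>::value into this object file
    template struct Buffer<double>; // likewise for double, so clients link without the definition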
+} // namespace CNTK
} // namespace CNTK
} // namespace MSR
-} // namespace Microsoft
diff --git a/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp b/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp
index 6175d2c9c..b814f2f03 100644
--- a/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp
+++ b/Source/ComputationNetworkLib/ComputationNetworkEditing.cpp
@@ -75,6 +75,9 @@ void ComputationNetwork::CopySubTree(const ComputationNetwork& fromNet,
         LogicError("CopySubTree: you cannot copy a tree without copying the node values.");
 
     ComputationNodeBasePtr fromRoot = fromNet.GetNodeFromName(fromName);
+    std::set<std::wstring> set_cur_tonode_names;
+
+    set_cur_tonode_names.clear();
 
     if (!fromNet.EvalOrderExists(fromRoot))
         const_cast<ComputationNetwork&>(fromNet).FormEvalOrder(fromRoot);
@@ -84,20 +87,66 @@
         wstring fromNodeName = fromNode->NodeName();
         wstring toNodeName = toNamePrefix + fromNodeName;
 
+        // fprintf(stderr, "CopySubTree node = %ls\n", fromNodeName.c_str());
+
         ComputationNodeBasePtr toNode = CopyNode(fromNet, fromNodeName, toNodeName, CopyNodeFlags::copyNodeValue);
-
+        set_cur_tonode_names.insert(toNodeName);
+        ComputationNodeBasePtr fromNodeTmp;
+        fromNodeTmp = fromNet.GetNodeFromName(fromNodeName);
+
+        /* fprintf(stderr, "CopySubTree node = %ls, numrows = %d, numcols = %d \n", fromNodeName.c_str(),
+           int((&dynamic_pointer_cast<ComputationNode<float>>(fromNodeTmp)->Value())->GetNumRows()),
+           int((&dynamic_pointer_cast<ComputationNode<float>>(fromNodeTmp)->Value())->GetNumCols())
+           ); */
         if (flags & CopyNodeFlags::copyNodeInputLinks)
         {
             // copy the children structure but use the new nodes generated
             for (int i = 0; i < fromNode->GetNumInputs(); i++)
-                toNode->SetInput(i, GetNodeFromName(toNamePrefix + fromNode->GetInputs()[i]->NodeName()));
+            {
+                wstring inputNodeName;
+                inputNodeName = (toNamePrefix + fromNode->GetInputs()[i]->NodeName());
+                if (set_cur_tonode_names.find(inputNodeName) == set_cur_tonode_names.end())
+                {
+                    // somehow the eval order went wrong and this input has not been copied into the target network yet, so copy it here
+                    // fprintf(stderr, "CopySubTree i = %d, missing node = %ls\n", i, inputNodeName.c_str());
+
+                    CopyNode(fromNet, fromNode->GetInputs()[i]->NodeName(),
+                             inputNodeName,
+                             CopyNodeFlags::copyNodeValue);
+                    set_cur_tonode_names.insert(inputNodeName);
+                }
+                //fprintf(stderr, "CopySubTree i = %d, node = %ls\n", i, toNodeName.c_str());
+
+                //toNode->SetInput(i, GetNodeFromName(toNamePrefix + fromNode->GetInputs()[i]->NodeName()));
+                toNode->SetInput(i, GetNodeFromName(inputNodeName));
+            }
         }
     }
 }
 
-// you can only copy inputs from nodes in the same network
+void ComputationNetwork::ShowNodeMemory(const ComputationNetwork& fromNet,
+                                        const std::wstring fromName)
+{
+
+    ComputationNodeBasePtr fromRoot = fromNet.GetNodeFromName(fromName);
+
+    if (!fromNet.EvalOrderExists(fromRoot))
+        const_cast<ComputationNetwork&>(fromNet).FormEvalOrder(fromRoot);
+
+    for (const auto& fromNode : fromNet.GetEvalOrder(fromRoot)) // BUGBUG: This probably will fail because the precomputed eval orders are invalid at this point.
+ { + wstring fromNodeName = fromNode->NodeName(); + ComputationNodeBasePtr fromNodeTmp; + fromNodeTmp = fromNet.GetNodeFromName(fromNodeName); + + fprintf(stderr, "ShowNodeMemory node = %ls, numrows = %d, numcols = %d \n", fromNodeName.c_str(), + int((&dynamic_pointer_cast>(fromNodeTmp)->Value())->GetNumRows()), + int((&dynamic_pointer_cast>(fromNodeTmp)->Value())->GetNumCols())); + } +} + // you can only copy inputs from nodes in the same network void ComputationNetwork::CopyInputs(const std::wstring fromName, std::wstring toName) { CopyNode(*this, fromName, toName, CopyNodeFlags::copyNodeInputLinks); diff --git a/Source/ComputationNetworkLib/ComputationNode.h b/Source/ComputationNetworkLib/ComputationNode.h index 8a6fca012..9459289a6 100644 --- a/Source/ComputationNetworkLib/ComputationNode.h +++ b/Source/ComputationNetworkLib/ComputationNode.h @@ -13,7 +13,7 @@ #include "MatrixPool.h" #include "ComputationEnvironment.h" #include "Globals.h" - +#include #include #include #include @@ -233,7 +233,6 @@ struct ComputationNetworkOwnedNodeState return m_parentGradientOptimization == ParentGradientOptimization::Reuse; } - virtual void MarkValueNonSharable() { m_valueSharable = false; @@ -351,6 +350,8 @@ class ComputationNodeBase : public IComputationNode, public: typedef shared_ptr ComputationNodeBasePtr; + + // ----------------------------------------------------------------------- // constructors, copying, (de-)serialization // ----------------------------------------------------------------------- diff --git a/Source/ComputationNetworkLib/ReshapingNodes.cpp b/Source/ComputationNetworkLib/ReshapingNodes.cpp index bd1fbf816..62f970c09 100644 --- a/Source/ComputationNetworkLib/ReshapingNodes.cpp +++ b/Source/ComputationNetworkLib/ReshapingNodes.cpp @@ -26,7 +26,12 @@ #include #include -namespace Microsoft { namespace MSR { namespace CNTK { +namespace Microsoft +{ +namespace MSR +{ +namespace CNTK +{ // ----------------------------------------------------------------------- // ReduceElements (op, axis=, input) @@ -39,10 +44,10 @@ template if (flags & CopyNodeFlags::copyNodeValue) { auto node = dynamic_pointer_cast>(nodeP); - node->m_axes = m_axes; - node->m_operation = m_operation; + node->m_axes = m_axes; + node->m_operation = m_operation; node->m_reductionOp = m_reductionOp; - node->m_scale = m_scale; + node->m_scale = m_scale; node->m_keepDimensions = m_keepDimensions; } } @@ -57,7 +62,7 @@ template fstream >> num_axes; for (int i = 0; i < num_axes; ++i) { - int axis; + int axis; fstream >> axis; m_axes.push_back(axis); } @@ -123,7 +128,7 @@ template if (ReduceSequenceAxis()) { ElemType gapPadValue = NeutralValue(m_reductionOp); - input = ComputationNode::Unpack(GetSampleLayout(), InputRef(0).Value(), InputRef(0).GetMBLayout(), m_tempUnpackedData, m_tempScatterIndices, m_tempMask, /*batchMajor=*/ true, &gapPadValue); + input = ComputationNode::Unpack(GetSampleLayout(), InputRef(0).Value(), InputRef(0).GetMBLayout(), m_tempUnpackedData, m_tempScatterIndices, m_tempMask, /*batchMajor=*/true, &gapPadValue); } else input = InputRef(0).ValueTensorFor(rank, frInput); @@ -142,7 +147,24 @@ template default: // the actual operation is a Copy with reduction, where the magic is in the reduction op // For "Mean", m_scale is 1/#elements, and 1 otherwise. 
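[Review note] The hunk below switches the copy to DoUnaryOpOfDebug when multi_thread is set; both calls have the same semantics: an elementwise opCopy whose outputs are folded together by m_reductionOp, with m_scale = 1/#elements (set during validation) turning a plain sum into a mean. A scalar sketch of those semantics (reduceMean is a made-up reference, not the TensorView API):

    #include <cstddef>

    double reduceMean(const double* input, size_t n)
    {
        const double scale = 1.0 / double(n); // m_scale analogue for "Mean"
        double acc = 0;                       // neutral value of opSum
        for (size_t i = 0; i < n; i++)
            acc += scale * input[i];          // opCopy per element, folded by the sum reduction
        return acc;
    }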
- result.DoUnaryOpOf(0, input, m_scale, ElementWiseOperator::opCopy, m_reductionOp); + { + /* + if (num_frames == 94) + { + fprintf(stderr, "reshape 1, name = %ls, result = %f, input = %f, m_scale = %f \n", NodeName().c_str(), double(result.GetSOB().FrobeniusNorm()), double(input.GetSOB().FrobeniusNorm()), double(m_scale)); + } + */ + if (multi_thread) + result.DoUnaryOpOfDebug(0, input, m_scale, ElementWiseOperator::opCopy, m_reductionOp); + else + result.DoUnaryOpOf(0, input, m_scale, ElementWiseOperator::opCopy, m_reductionOp); + /* + if (num_frames == 94) + { + fprintf(stderr, "reshape 2, name = %ls, result = %f, input = %f, m_scale = %f \n", NodeName().c_str(), double(result.GetSOB().FrobeniusNorm()), double(input.GetSOB().FrobeniusNorm()), double(m_scale)); + } + */ + } } } @@ -156,15 +178,15 @@ template { // Broadcast along the sequence auto result = ValueFor(fr); - ComputationNode::BroadcastToPacked(Gradient(), GetMBLayout(), /*beta =*/ accumulateGradient ? (ElemType)1 : (ElemType)0, InputRef(0).Gradient(), FrameRange(InputRef(0).GetMBLayout()), m_tempGatherIndices); + ComputationNode::BroadcastToPacked(Gradient(), GetMBLayout(), /*beta =*/accumulateGradient ? (ElemType) 1 : (ElemType) 0, InputRef(0).Gradient(), FrameRange(InputRef(0).GetMBLayout()), m_tempGatherIndices); } else { const auto frInput = (ReduceAllAxes() || ReduceBatchAxis()) ? FrameRange(InputRef(0).GetMBLayout()) : fr; // can't use 'fr' for ReduceAllAxes() as it refers to the result (same as for training criteria) - // get the args + // get the args size_t rank = DetermineElementwiseTensorRank(); auto sliceOutputGrad = ReduceAllAxes() ? TensorView(GradientPtr(), GetSampleLayout()) : GradientTensorFor(rank, fr); // propagate from this one... - auto sliceInputGrad = InputRef(0).GradientTensorFor(rank, frInput); // ...to this one + auto sliceInputGrad = InputRef(0).GradientTensorFor(rank, frInput); // ...to this one // gradients are not as simple as passing an op-code, unfortunately switch (m_reductionOp) @@ -220,8 +242,8 @@ template break; case ElementWiseOperator::opElementwiseProduct: { - auto input = InputRef(inputIndex).ValueTensorFor(rank, frInput); - auto output = ValueTensorFor(rank, fr.AllowBroadcast()); + auto input = InputRef(inputIndex).ValueTensorFor(rank, frInput); + auto output = ValueTensorFor(rank, fr.AllowBroadcast()); if (accumulateGradient) sliceInputGrad.AddElementwiseProductWithQuotientOf(sliceOutputGrad, output, input); else @@ -242,13 +264,20 @@ template { switch (m_reductionOp) { - case ElementWiseOperator::opSum: return false; - case ElementWiseOperator::opLogSum: return true; - case ElementWiseOperator::opMin: return true; - case ElementWiseOperator::opMax: return true; - case ElementWiseOperator::opElementwiseProduct: return true; - case ElementWiseOperator::opArgmin: return false; - case ElementWiseOperator::opArgmax: return false; + case ElementWiseOperator::opSum: + return false; + case ElementWiseOperator::opLogSum: + return true; + case ElementWiseOperator::opMin: + return true; + case ElementWiseOperator::opMax: + return true; + case ElementWiseOperator::opElementwiseProduct: + return true; + case ElementWiseOperator::opArgmin: + return false; + case ElementWiseOperator::opArgmax: + return false; } LogicError("Should not get here."); } @@ -258,13 +287,20 @@ template { switch (m_reductionOp) { - case ElementWiseOperator::opSum: return false; - case ElementWiseOperator::opLogSum: return true; - case ElementWiseOperator::opMin: return true; - case ElementWiseOperator::opMax: return true; 
- case ElementWiseOperator::opElementwiseProduct: return true; - case ElementWiseOperator::opArgmin: return false; - case ElementWiseOperator::opArgmax: return false; + case ElementWiseOperator::opSum: + return false; + case ElementWiseOperator::opLogSum: + return true; + case ElementWiseOperator::opMin: + return true; + case ElementWiseOperator::opMax: + return true; + case ElementWiseOperator::opElementwiseProduct: + return true; + case ElementWiseOperator::opArgmin: + return false; + case ElementWiseOperator::opArgmax: + return false; } LogicError("Should not get here."); } @@ -282,7 +318,7 @@ template // validate the opcode (in case we got instantiated empty and never updated) ValidateOp(); - m_scale = (ElemType)1; + m_scale = (ElemType) 1; if (ReduceAllAxes()) Base::ValidateUnaryReduce(isFinalValidationPass, m_keepDimensions); else if (ReduceSequenceAxis()) @@ -319,29 +355,27 @@ template let shape = Input(0)->GetSampleLayout(); auto dims = shape.GetDims(); - size_t reducedDimProd = 1; + size_t reducedDimProd = 1; if (ReduceAllStaticAxes()) { reducedDimProd = shape.GetNumElements(); - dims = m_keepDimensions ? SmallVector(shape.GetRank(), 1) : (Environment().IsV2Library() ? SmallVector({}) : SmallVector({ 1 })); // entire sample is reduced to a scalar + dims = m_keepDimensions ? SmallVector(shape.GetRank(), 1) : (Environment().IsV2Library() ? SmallVector({}) : SmallVector({1})); // entire sample is reduced to a scalar } - else if (!m_axes.empty() - && std::all_of(m_axes.begin(), - m_axes.end(), - [&dims](int axis) { return axis - 1 >= 0 && axis - 1 < dims.size(); })) + else if (!m_axes.empty() && std::all_of(m_axes.begin(), + m_axes.end(), + [&dims](int axis) { return axis - 1 >= 0 && axis - 1 < dims.size(); })) { //Accumulate the number of elements for reduce_mean reducedDimProd = std::accumulate(m_axes.begin(), - m_axes.end(), - 1, - [&dims](size_t acc, int& axis) { return acc * dims[axis - 1]; }); + m_axes.end(), + 1, + [&dims](size_t acc, int& axis) { return acc * dims[axis - 1]; }); // axes reduced to a scalar if (m_keepDimensions) std::for_each(m_axes.begin(), - m_axes.end(), - [&dims](int axis) {dims[axis - 1] = 1; } - ); + m_axes.end(), + [&dims](int axis) { dims[axis - 1] = 1; }); else { SmallVector reducedDims(dims.size() - m_axes.size()); @@ -355,13 +389,12 @@ template dims = reducedDims; } } - else if (isFinalValidationPass) + else if (isFinalValidationPass) { InvalidArgument("The shape of %ls [%ls] can not be reduced along axes [%ls]", - NodeDescription().c_str(), - wstring(shape).c_str(), - boost::algorithm::join(m_axes | boost::adaptors::transformed([](int axis) { return std::to_wstring(axis); }), ", ").c_str() - ); + NodeDescription().c_str(), + wstring(shape).c_str(), + boost::algorithm::join(m_axes | boost::adaptors::transformed([](int axis) { return std::to_wstring(axis); }), ", ").c_str()); } // for "Mean", we must divide by #elements if (isFinalValidationPass && IsMean()) @@ -384,10 +417,14 @@ struct SequenceLengthVector { typedef vector> SequenceVector; typedef MBLayout::SequenceInfo SequenceInfo; - const SequenceVector& m_sequenceVector; // vector of sequences (to get sequence length) - const vector& m_sequenceInfo; // original sequence info (for seqId) - SequenceLengthVector(const vector& sequenceInfo, const SequenceVector& sequenceVector) : m_sequenceInfo(sequenceInfo), m_sequenceVector(sequenceVector) { } - size_t size() const { return m_sequenceInfo.size(); } + const SequenceVector& m_sequenceVector; // vector of sequences (to get sequence length) + const 
vector& m_sequenceInfo; // original sequence info (for seqId) + SequenceLengthVector(const vector& sequenceInfo, const SequenceVector& sequenceVector) + : m_sequenceInfo(sequenceInfo), m_sequenceVector(sequenceVector) {} + size_t size() const + { + return m_sequenceInfo.size(); + } MBLayout::SequenceInfo operator[](size_t i) const // return a descriptor of the new sequence { SequenceInfo seq; @@ -427,12 +464,12 @@ template for (size_t t = 0; t < seq.GetNumTimeSteps(); t++) { double delta = input(0, inMBLayout->GetColumnIndex(seq, t)); // how many frames the current time step should expand into - desiredCount += delta; // this is now how many frames we should have + desiredCount += delta; // this is now how many frames we should have // use a margin against round-off errors, so that we get non-binary ratios like 1/3 and 1/5 right // This really means generate a frame if too few, unless we are within machine accuracy of the target. // The assumption is that the delta has this error, while accumulation (in double) has no error. ElemType relativeMargin = 1 - std::numeric_limits::epsilon(); - while ((indexSequence.empty() && desiredCount > 0) // no margin for the first frame (always include unless flag is 0) + while ((indexSequence.empty() && desiredCount > 0) // no margin for the first frame (always include unless flag is 0) || indexSequence.size() < desiredCount * relativeMargin) indexSequence.push_back(t); } @@ -441,10 +478,10 @@ template input.CollapseDataLocation(); // BUGBUG: Move back, since BOTH state is broken at present. // create a new MBLayout let& outMBLayout = GetMBLayout(); - outMBLayout->InitAsPackedSequences(SequenceLengthVector(sequences, indexSequences), /*temp*/m_placementBuffer, /*temp*/m_rowAllocationsBuffer); + outMBLayout->InitAsPackedSequences(SequenceLengthVector(sequences, indexSequences), /*temp*/ m_placementBuffer, /*temp*/ m_rowAllocationsBuffer); // copy to output vector buf(outMBLayout->GetNumCols(), numeric_limits::quiet_NaN()); // STL cannot easily avoid initializing, so we might as well init with NaN for gaps - let size = min(sequences.size(), outMBLayout->GetAllSequences().size()); // no non-gap sequence has an index beyond this + let size = min(sequences.size(), outMBLayout->GetAllSequences().size()); // no non-gap sequence has an index beyond this for (size_t i = 0; i < size; i++) { let& seq = outMBLayout->GetAllSequences()[i]; @@ -452,7 +489,7 @@ template continue; let& indexSequence = indexSequences[i]; for (size_t t = 0; t < seq.GetNumTimeSteps(); t++) - buf[outMBLayout->GetColumnIndex(seq, t)] = (ElemType)indexSequence[t]; + buf[outMBLayout->GetColumnIndex(seq, t)] = (ElemType) indexSequence[t]; } // there may be dangling gaps at the end. Take the opportunity to verify this. 
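[Review note] To make the margin logic above concrete, here is the expansion loop lifted into a standalone function (expandIndices is a made-up name): delta[t] says how many output frames time step t should produce, possibly fractional, and relativeMargin absorbs accumulated round-off so ratios like 1/3 or 1/5 come out exact. The dangling-gap verification loop follows below.

    #include <cstddef>
    #include <limits>
    #include <vector>

    std::vector<size_t> expandIndices(const std::vector<double>& delta)
    {
        std::vector<size_t> indexSequence;
        double desiredCount = 0; // accumulated in double, assumed error-free
        const double relativeMargin = 1 - std::numeric_limits<double>::epsilon();
        for (size_t t = 0; t < delta.size(); t++)
        {
            desiredCount += delta[t];
            while ((indexSequence.empty() && desiredCount > 0) // always emit the first frame unless delta is 0
                   || indexSequence.size() < desiredCount * relativeMargin)
                indexSequence.push_back(t);
        }
        return indexSequence; // e.g. delta = {2, 2} yields {0, 0, 1, 1}
    }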
for (size_t i = size; i < sequences.size(); i++) @@ -460,7 +497,7 @@ template for (size_t i = size; i < outMBLayout->GetAllSequences().size(); i++) assert(outMBLayout->GetAllSequences()[i].seqId == GAP_SEQUENCE_ID); // the result will be kept in CPUDEVICE, since most likely we will access it again in PackedIndexNode - Value().TransferToDeviceIfNotThere(CPUDEVICE, /*isBeingMoved=*/ true, /*emptyTransfer=*/ true, /*updatePreferredDevice=*/ true); + Value().TransferToDeviceIfNotThere(CPUDEVICE, /*isBeingMoved=*/true, /*emptyTransfer=*/true, /*updatePreferredDevice=*/true); Value().SetValue(1, outMBLayout->GetNumCols(), CPUDEVICE, buf.data(), MatrixFormat::matrixFormatColMajor); } @@ -501,9 +538,9 @@ template /*virtual*/ void PackedIndexNode::ForwardPropNonLooping() /*override*/ { let& sourceMBLayout = InputRef(SOURCEDATA).GetMBLayout(); // only used for index conversion - let& indexMBLayout = InputRef(INDEXDATA).GetMBLayout(); - let& index = InputRef(INDEXDATA).Value(); // per-seq index values that are to be mapped - auto& result = Value(); // packed index values as mapped to sourceData's layout + let& indexMBLayout = InputRef(INDEXDATA).GetMBLayout(); + let& index = InputRef(INDEXDATA).Value(); // per-seq index values that are to be mapped + auto& result = Value(); // packed index values as mapped to sourceData's layout // loop over sourceSequences // Input matrix contains time indices for each sequence that refer to frames inside that sequence. // We replace every per-sequence index by the resolved column index w.r.t. the same MBLayout. @@ -516,10 +553,10 @@ template let& indexSeq = indexMBLayout->FindMatchingSequence(sourceSequences, i); // find corresponding entry in indexMBLayout for (size_t tIndex = 0; tIndex < indexSeq.GetNumTimeSteps(); tIndex++) // map all index values in index sequence { - let jIndex = indexMBLayout->GetColumnIndex(indexSeq, tIndex); // map time index to actual location in the matrix storage object - let tSource = (size_t)index(0, jIndex); // the new time location (relative to source sequence) + let jIndex = indexMBLayout->GetColumnIndex(indexSeq, tIndex); // map time index to actual location in the matrix storage object + let tSource = (size_t) index(0, jIndex); // the new time location (relative to source sequence) let jSource = sourceMBLayout->GetColumnIndex(sourceSeq, tSource); // map new time index as well. This performs a range check. - result(0, jIndex) = (ElemType)jSource; + result(0, jIndex) = (ElemType) jSource; } } // Note: maybe this is no longer needed, now that we do the same inside UpdateFunctionValueSize() for all nodes. 
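[Review note] The index mapping above leans on the packed-minibatch layout: parallel sequences are interleaved column-wise, so time step t of the sequence in parallel slot s (with begin offset tBegin) lives at column (tBegin + t) * numParallelSequences + s. This is the same arithmetic used earlier to read the phone sequences out of maxIndex; MBLayout::GetColumnIndex implements it with range checks. A one-line sketch:

    #include <cstddef>

    inline size_t columnIndex(size_t t, size_t tBegin, size_t s, size_t numParallelSequences)
    {
        return (tBegin + t) * numParallelSequences + s;
    }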
@@ -563,8 +600,8 @@ template /*virtual*/ void GatherPackedNode::ForwardPropNonLooping() /*override*/ { InputRef(INDEXDATA).MaskMissingValueColumnsTo(FrameRange(InputRef(INDEXDATA).GetMBLayout()), -1); // indicates an invalid column to Gather/Scatter - let& index = InputRef(INDEXDATA) .Value(); // column indices to copy from - let& source = InputRef(SOURCEDATA).Value(); // source data to copy + let& index = InputRef(INDEXDATA).Value(); // column indices to copy from + let& source = InputRef(SOURCEDATA).Value(); // source data to copy #ifdef _MSC_VER auto& outputValuePtrRef = ValuePtrRef(); @@ -587,9 +624,9 @@ template { if (inputIndex == SOURCEDATA) { - let& index = InputRef(INDEXDATA) .Value(); // column indices to copy from + let& index = InputRef(INDEXDATA).Value(); // column indices to copy from auto& sourceGradient = InputRef(SOURCEDATA).Gradient(); // source to propagate the gradient intpu - auto& outputGradient = Gradient(); // output gradient to propagate + auto& outputGradient = Gradient(); // output gradient to propagate sourceGradient.DoScatterColumnsOf(/*beta=*/1, index, outputGradient, /*alpha=*/1, true); } } @@ -614,7 +651,7 @@ template SetDims(Input(SOURCEDATA)->GetSampleLayout(), HasMBLayout()); else { - SmallVector layout = { 1 }; // Scalar + SmallVector layout = {1}; // Scalar if (Input(SOURCEDATA)->GetSampleLayout().GetRank() > 1) { auto srcLayout = Input(SOURCEDATA)->GetSampleLayout().GetDims(); @@ -638,8 +675,8 @@ template if (*InputRef(INDEXDATA).GetMBLayout() != *InputRef(SOURCEDATA).GetMBLayout()) InvalidArgument("%ls %ls operation requires the minibatch layout of index and source data to be the same.", NodeName().c_str(), OperationName().c_str()); InputRef(INDEXDATA).MaskMissingValueColumnsTo(FrameRange(InputRef(INDEXDATA).GetMBLayout()), -1); // indicates an invalid column to Gather/Scatter - let& index = InputRef(INDEXDATA) .Value(); // column indices to copy from - let& source = InputRef(SOURCEDATA).Value(); // source data to copy + let& index = InputRef(INDEXDATA).Value(); // column indices to copy from + let& source = InputRef(SOURCEDATA).Value(); // source data to copy #ifdef _MSC_VER auto& outputValuePtrRef = ValuePtrRef(); @@ -653,7 +690,7 @@ template source.GetMatrixType(), source.GetFormat()); - auto& output = Value(); // output goes here + auto& output = Value(); // output goes here output.DoScatterColumnsOf(/*beta=*/0, index, source, /*alpha=*/1, true); } @@ -662,9 +699,9 @@ template { if (inputIndex == SOURCEDATA) { - let& index = InputRef(INDEXDATA).Value(); // column indices to copy from + let& index = InputRef(INDEXDATA).Value(); // column indices to copy from auto& sourceGradient = Input(SOURCEDATA)->Gradient(); // source to propagate the gradient input - auto& outputGradient = Gradient(); // output gradient to propagate + auto& outputGradient = Gradient(); // output gradient to propagate sourceGradient.DoGatherColumnsOf(/*beta=*/1, index, outputGradient, /*alpha=*/1); } } @@ -707,8 +744,8 @@ template CropNode::CropNode(size_t offsetX, size_t offsetY, DEVICEID_TYPE deviceId, const wstring& name) : CropNode(deviceId, name) { - m_xOffset = (double)(offsetX); - m_yOffset = (double)(offsetY); + m_xOffset = (double) (offsetX); + m_yOffset = (double) (offsetY); } template @@ -880,8 +917,7 @@ void CropNode::ComputeCropOffsets() // nodeToTransformMap contains coordinate maps for all nodes traversed so far, and is updated by this function. // Traversal stack contains all nodes traversed so far. 
Inputs of currNode are pushed to traversal stack so that their // inputs can be processed later on. - auto ProcessInputs = [](ComputationNodeBase* currNode, stack& traversalStack, unordered_map& nodeToTransformMap) - { + auto ProcessInputs = [](ComputationNodeBase* currNode, stack& traversalStack, unordered_map& nodeToTransformMap) { if (!currNode->Is()) RuntimeError("Node does not support affine transform for cropping."); @@ -1033,4 +1069,6 @@ template class CropNode; template class CropNode; template class CropNode; -}}} +} // namespace CNTK +} // namespace MSR +} // namespace Microsoft diff --git a/Source/ComputationNetworkLib/ReshapingNodes.h b/Source/ComputationNetworkLib/ReshapingNodes.h index 25bb1aaf2..c84fd1263 100644 --- a/Source/ComputationNetworkLib/ReshapingNodes.h +++ b/Source/ComputationNetworkLib/ReshapingNodes.h @@ -21,7 +21,12 @@ #include #include -namespace Microsoft { namespace MSR { namespace CNTK { +namespace Microsoft +{ +namespace MSR +{ +namespace CNTK +{ // ----------------------------------------------------------------------- // Reshape(x, tensorShape, beginAxis=0, endAxis=0) -- reinterpret input samples as having different tensor dimensions @@ -46,8 +51,12 @@ namespace Microsoft { namespace MSR { namespace CNTK { template class ReshapeNode : public UnaryElementWiseNode { - typedef UnaryElementWiseNode Base; UsingUnaryElementwiseNodeBaseMembers; - static const std::wstring TypeName() { return L"Reshape"; } + typedef UnaryElementWiseNode Base; + UsingUnaryElementwiseNodeBaseMembers; + static const std::wstring TypeName() + { + return L"Reshape"; + } public: ReshapeNode(DEVICEID_TYPE deviceId, const wstring& name, const TensorShape& replacementSampleLayout = TensorShape(), int beginAxis = 1, int endAxis = 0) @@ -70,7 +79,7 @@ public: { auto node = dynamic_pointer_cast>(nodeP); node->m_beginDimParameter = m_beginDimParameter; - node->m_endDimParameter = m_endDimParameter; + node->m_endDimParameter = m_endDimParameter; node->m_replacementSampleLayout = m_replacementSampleLayout; } } @@ -152,25 +161,25 @@ public: virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override { - auto result = ValueFor(fr); + auto result = ValueFor(fr); auto inputValue = InputRef(0).ValueFor(fr); ForwardPropImpl(result, inputValue); } - static void ForwardPropImpl(Matrix &result, Matrix &input) + static void ForwardPropImpl(Matrix& result, Matrix& input) { result.AssignValuesOf(input.Reshaped(result.GetNumRows(), result.GetNumCols())); } virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override { - auto gradient = GradientFor(fr); + auto gradient = GradientFor(fr); auto inputGradient = InputRef(inputIndex).GradientFor(fr); BackpropImpl(gradient, inputGradient, Input(inputIndex)->IsGradientOptimized(this), Input(inputIndex)->ParentGradientReused()); } - static void BackpropImpl(Matrix &gradient, Matrix &inputGradient, bool isGradientOptimized, bool isParentGradientReused) + static void BackpropImpl(Matrix& gradient, Matrix& inputGradient, bool isGradientOptimized, bool isParentGradientReused) { if (isGradientOptimized) { @@ -188,10 +197,19 @@ public: inputGradient += gradient.Reshaped(inputGradient.GetNumRows(), inputGradient.GetNumCols()); } - virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } - virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } + virtual bool OutputUsedInComputingInputNodesGradients() const override + { + return 
false; + } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override + { + return false; + } - virtual ParentGradientOptimization ImplementsGradientOptimization(const ComputationNodeBase* input) const override { return ParentGradientOptimization::Reuse; } + virtual ParentGradientOptimization ImplementsGradientOptimization(const ComputationNodeBase* input) const override + { + return ParentGradientOptimization::Reuse; + } private: TensorShape m_replacementSampleLayout; // user-specified dimensions to replace dimensions [beginAxis, endAxis] @@ -222,12 +240,19 @@ template class ReshapeNode; template class ReduceElementsNode : public ComputationNode, public NumInputs<1> { - typedef ComputationNode Base; UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() { return L"ReduceElements"; } + typedef ComputationNode Base; + UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() + { + return L"ReduceElements"; + } void ValidateOp(); - static inline bool Contains(const std::vector& axes, int axis) { return std::find(axes.begin(), axes.end(), axis) != axes.end(); } + static inline bool Contains(const std::vector& axes, int axis) + { + return std::find(axes.begin(), axes.end(), axis) != axes.end(); + } static bool DefaultKeepDimensionsSetting(int axis) { return !((axis == CNTKInternalIdxValueForAllStaticAxes) || (axis == CNTKInternalIdxValueForAllAxes)); @@ -245,11 +270,16 @@ public: { switch (op) { - case ElementWiseOperator::opSum: return 0; - case ElementWiseOperator::opLogSum: return -std::numeric_limits::infinity(); - case ElementWiseOperator::opMin: return std::numeric_limits::infinity(); - case ElementWiseOperator::opMax: return -std::numeric_limits::infinity(); - case ElementWiseOperator::opElementwiseProduct: return 1; + case ElementWiseOperator::opSum: + return 0; + case ElementWiseOperator::opLogSum: + return -std::numeric_limits::infinity(); + case ElementWiseOperator::opMin: + return std::numeric_limits::infinity(); + case ElementWiseOperator::opMax: + return -std::numeric_limits::infinity(); + case ElementWiseOperator::opElementwiseProduct: + return 1; default: InvalidArgument("ReduceElementsNode::NeutralValue: Invalid operation code; allowed are: 'opSum', 'opMax', 'opMin', 'opElementwiseProduct', 'opLogSum'."); } @@ -258,47 +288,57 @@ public: // map the operation specified as a string to an ElementWiseOperator value. 
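[Review note] Backing up to NeutralValue above: each value is the identity element of its reduction, so gap columns padded with it cannot change the result: 0 for Sum, 1 for Prod, +infinity for Min, -infinity for Max, and -infinity for LogSum, since log(exp(x) + exp(-inf)) = x. A small sketch of the LogSum case (logAdd is a made-up helper):

    #include <cmath>
    #include <limits>
    #include <utility>

    static double logAdd(double a, double b)
    {
        if (a < b)
            std::swap(a, b); // ensure a >= b
        if (b == -std::numeric_limits<double>::infinity())
            return a;        // the neutral element contributes nothing
        return a + std::log1p(std::exp(b - a));
    }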
static ElementWiseOperator ReductionOpEnumValue(const std::wstring& opName) { - if (opName == L"Plus") return ElementWiseOperator::opSum; - else if (opName == L"Sum") return ElementWiseOperator::opSum; - else if (opName == L"Mean") return ElementWiseOperator::opSum; - else if (opName == L"LogSum") return ElementWiseOperator::opLogSum; - else if (opName == L"Min") return ElementWiseOperator::opMin; - else if (opName == L"Max") return ElementWiseOperator::opMax; - else if (opName == L"Prod") return ElementWiseOperator::opElementwiseProduct; - else if (opName == L"Argmin") return ElementWiseOperator::opArgmin; - else if (opName == L"Argmax") return ElementWiseOperator::opArgmax; + if (opName == L"Plus") + return ElementWiseOperator::opSum; + else if (opName == L"Sum") + return ElementWiseOperator::opSum; + else if (opName == L"Mean") + return ElementWiseOperator::opSum; + else if (opName == L"LogSum") + return ElementWiseOperator::opLogSum; + else if (opName == L"Min") + return ElementWiseOperator::opMin; + else if (opName == L"Max") + return ElementWiseOperator::opMax; + else if (opName == L"Prod") + return ElementWiseOperator::opElementwiseProduct; + else if (opName == L"Argmin") + return ElementWiseOperator::opArgmin; + else if (opName == L"Argmax") + return ElementWiseOperator::opArgmax; // more here - else InvalidArgument("Invalid operation code '%ls'. Allowed are: 'Sum', 'Max', 'Min', 'Prod', 'Argmax', 'Argmin'.", opName.c_str()); + else + InvalidArgument("Invalid operation code '%ls'. Allowed are: 'Sum', 'Max', 'Min', 'Prod', 'Argmax', 'Argmin'.", opName.c_str()); } public: - ReduceElementsNode(DEVICEID_TYPE deviceId, const wstring& name, const std::wstring& operation, int axis, bool keepDimensions) : - Base(deviceId, name), m_operation(operation), m_axes({ axis }), m_reductionOp((ElementWiseOperator)-1/*invalid*/), m_scale(0/*invalid*/), m_keepDimensions(keepDimensions) + ReduceElementsNode(DEVICEID_TYPE deviceId, const wstring& name, const std::wstring& operation, int axis, bool keepDimensions) + : Base(deviceId, name), m_operation(operation), m_axes({axis}), m_reductionOp((ElementWiseOperator) -1 /*invalid*/), m_scale(0 /*invalid*/), m_keepDimensions(keepDimensions) { if (!m_operation.empty()) // verify validity already here out of courtesy (would otherwise be caught in Validate()) ValidateOp(); } - ReduceElementsNode(DEVICEID_TYPE deviceId, const wstring& name, const std::wstring& operation = std::wstring(), int axis = CNTKInternalIdxValueForAllStaticAxes) : - ReduceElementsNode(deviceId, name, operation, { axis }, DefaultKeepDimensionsSetting(axis)) + ReduceElementsNode(DEVICEID_TYPE deviceId, const wstring& name, const std::wstring& operation = std::wstring(), int axis = CNTKInternalIdxValueForAllStaticAxes) + : ReduceElementsNode(deviceId, name, operation, {axis}, DefaultKeepDimensionsSetting(axis)) { } - ReduceElementsNode(DEVICEID_TYPE deviceId, const wstring& name, const std::wstring& operation, const std::vector& axis, bool keepDimensions) : - Base(deviceId, name), m_operation(operation), m_axes(axis), m_reductionOp((ElementWiseOperator)-1/*invalid*/), m_scale(0/*invalid*/), m_keepDimensions(keepDimensions) + ReduceElementsNode(DEVICEID_TYPE deviceId, const wstring& name, const std::wstring& operation, const std::vector& axis, bool keepDimensions) + : Base(deviceId, name), m_operation(operation), m_axes(axis), m_reductionOp((ElementWiseOperator) -1 /*invalid*/), m_scale(0 /*invalid*/), m_keepDimensions(keepDimensions) { if (!m_operation.empty()) // verify validity already 
here out of courtesy (would otherwise be caught in Validate()) ValidateOp(); } - ReduceElementsNode(DEVICEID_TYPE deviceId, const wstring& name, const std::wstring& operation, const std::vector& axis) : - ReduceElementsNode(deviceId, name, operation, axis, DefaultKeepDimensionsSetting(axis)) + ReduceElementsNode(DEVICEID_TYPE deviceId, const wstring& name, const std::wstring& operation, const std::vector& axis) + : ReduceElementsNode(deviceId, name, operation, axis, DefaultKeepDimensionsSetting(axis)) { } - ReduceElementsNode(const ScriptableObjects::IConfigRecordPtr configp) : - ReduceElementsNode(configp->Get(L"deviceId"), L"", configp->Get(L"reductionOp"), (int) configp->Get(L"axis")) + ReduceElementsNode(const ScriptableObjects::IConfigRecordPtr configp) + : ReduceElementsNode(configp->Get(L"deviceId"), L"", configp->Get(L"reductionOp"), (int) configp->Get(L"axis")) { AttachInputsFromConfig(configp, this->GetExpectedNumInputs()); } @@ -352,25 +392,55 @@ public: ReleaseMatrixToPool(m_tempGatherIndices, matrixPool); } - std::wstring ReductionOpName() const { return m_operation; } - const std::vector& ReductionAxis() const { return m_axes; } + std::wstring ReductionOpName() const + { + return m_operation; + } + const std::vector& ReductionAxis() const + { + return m_axes; + } + // for debug purpose + void set_frame(int nfs) + { + num_frames = nfs; + } + void is_multi_thread(bool mt) + { + multi_thread = mt; + } - static const int CNTKInternalIdxValueForAllStaticAxes = 0; - static const int CNTKInternalIdxValueForAllAxes = -1; - static const int CNTKInternalIdxValueForSequenceAxis = -2; - static const int CNTKInternalIdxValueForBatchAxis = -3; + static const int CNTKInternalIdxValueForAllStaticAxes = 0; + static const int CNTKInternalIdxValueForAllAxes = -1; + static const int CNTKInternalIdxValueForSequenceAxis = -2; + static const int CNTKInternalIdxValueForBatchAxis = -3; private: - bool IsMean() const { return (m_operation == L"Mean"); } - bool ReduceAllStaticAxes() const { return Contains(m_axes, CNTKInternalIdxValueForAllStaticAxes); } - bool ReduceAllAxes() const { return Contains(m_axes, CNTKInternalIdxValueForAllAxes); } - bool ReduceSequenceAxis() const { return Contains(m_axes, CNTKInternalIdxValueForSequenceAxis); } - bool ReduceBatchAxis() const { return Contains(m_axes, CNTKInternalIdxValueForBatchAxis); } + bool IsMean() const + { + return (m_operation == L"Mean"); + } + bool ReduceAllStaticAxes() const + { + return Contains(m_axes, CNTKInternalIdxValueForAllStaticAxes); + } + bool ReduceAllAxes() const + { + return Contains(m_axes, CNTKInternalIdxValueForAllAxes); + } + bool ReduceSequenceAxis() const + { + return Contains(m_axes, CNTKInternalIdxValueForSequenceAxis); + } + bool ReduceBatchAxis() const + { + return Contains(m_axes, CNTKInternalIdxValueForBatchAxis); + } private: // operation attributes std::vector m_axes; - std::wstring m_operation; // the operation as a string, e.g. "Sum", see ValidateOp() + std::wstring m_operation; // the operation as a string, e.g. 
"Sum", see ValidateOp() bool m_keepDimensions; // things cached during validation @@ -381,6 +451,9 @@ private: shared_ptr> m_tempMask; shared_ptr> m_tempScatterIndices; shared_ptr> m_tempUnpackedData; + // debug + size_t num_frames; + bool multi_thread = false; }; // ----------------------------------------------------------------------- @@ -395,8 +468,12 @@ private: template class ReconcileDynamicAxisNode : public ComputationNode, public NumInputs<2> { - typedef ComputationNode Base; UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() { return L"ReconcileDynamicAxis"; } + typedef ComputationNode Base; + UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() + { + return L"ReconcileDynamicAxis"; + } public: DeclareConstructorFromConfigWithNumInputs(ReconcileDynamicAxisNode); @@ -412,9 +489,9 @@ public: m_layoutsMatch = InputRef(0).HasMBLayout() && *m_pMBLayout == *InputRef(0).GetMBLayout(); // this does a deep value-level comparison - if (InputRef(0).HasMBLayout() && !m_layoutsMatch && // input is a mismatching data input --only allowed case is broadcast_as() - ((InputRef(0).GetMBLayout()->GetNumTimeSteps() != 1) || // not broadcast_as() - (InputRef(0).GetMBLayout()->GetNumSequences() != m_pMBLayout->GetNumSequences()))) // different batch?? + if (InputRef(0).HasMBLayout() && !m_layoutsMatch && // input is a mismatching data input --only allowed case is broadcast_as() + ((InputRef(0).GetMBLayout()->GetNumTimeSteps() != 1) || // not broadcast_as() + (InputRef(0).GetMBLayout()->GetNumSequences() != m_pMBLayout->GetNumSequences()))) // different batch?? { InvalidArgument("%ls %ls operation discovered that %ls %ls operation produced an MB layout that is incompatible with that of %ls %ls.", NodeName().c_str(), OperationName().c_str(), @@ -422,11 +499,11 @@ public: InputRef(1).NodeName().c_str(), InputRef(1).OperationName().c_str()); } - if (!InputRef(0).HasMBLayout() || m_layoutsMatch) // no shuffle-case: everything matches or non-data that can use tensor broadcast + if (!InputRef(0).HasMBLayout() || m_layoutsMatch) // no shuffle-case: everything matches or non-data that can use tensor broadcast { // copy the data from 'dataInput' size_t rank = GetSampleLayout().GetRank(); - auto result = ValueTensorFor(rank, fr); + auto result = ValueTensorFor(rank, fr); auto input0 = InputRef(0).ValueTensorFor(rank, InputRef(0).HasMBLayout() ? fr.WithLayout(InputRef(0).GetMBLayout()) : fr.AllowBroadcast()); // If data input has a layout (which is known to match), then replace the pointer here ^^ to avoid another runtime check. // If it has no layout, then set the broadcast-allowed flag, which will accept any layout to be passed in. @@ -436,7 +513,7 @@ public: else // Broadcasting along the sequence case: must reshuffle { auto result = ValueFor(fr); - ComputationNode::BroadcastToPacked(InputRef(0).Value(), InputRef(0).GetMBLayout(), /*beta =*/ 0, result, fr, m_tempGatherIndices); + ComputationNode::BroadcastToPacked(InputRef(0).Value(), InputRef(0).GetMBLayout(), /*beta =*/0, result, fr, m_tempGatherIndices); } } @@ -454,7 +531,7 @@ public: TensorView inputGradient; if (!InputRef(0).GetMBLayout() || m_layoutsMatch) { - gradient = GradientTensorFor(rank, fr); + gradient = GradientTensorFor(rank, fr); inputGradient = InputRef(inputIndex).GradientTensorFor(rank, InputRef(inputIndex).HasMBLayout() ? 
fr.WithLayout(InputRef(inputIndex).GetMBLayout()) : fr.AllowBroadcast()); } else @@ -464,7 +541,7 @@ public: InvalidArgument("%ls %ls operation does not support broadcasting the left operand to the right operand's dynamic axis, inside a recurrent loop.", NodeName().c_str(), OperationName().c_str()); ElemType gapPadValue = 0; - gradient = ComputationNode::Unpack(GetSampleLayout(), GradientFor(fr), m_pMBLayout, m_tempUnpackedData, m_tempScatterIndices, std::shared_ptr>(nullptr), /*batchMajor=*/ true, &gapPadValue); + gradient = ComputationNode::Unpack(GetSampleLayout(), GradientFor(fr), m_pMBLayout, m_tempUnpackedData, m_tempScatterIndices, std::shared_ptr>(nullptr), /*batchMajor=*/true, &gapPadValue); inputGradient = Input(inputIndex)->GradientTensorFor(rank, FrameRange(InputRef(inputIndex).GetMBLayout(), 0)); } @@ -477,8 +554,14 @@ public: } } - virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } - virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } + virtual bool OutputUsedInComputingInputNodesGradients() const override + { + return false; + } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override + { + return false; + } virtual ParentGradientOptimization ImplementsGradientOptimization(const ComputationNodeBase* input) const override { return (Input(0).get() == input) ? ParentGradientOptimization::Overwrite : ParentGradientOptimization::None; // no gradient propagation to input1 @@ -533,17 +616,19 @@ template class ReconcileDynamicAxisNode; template class ToBatchAxisNode : public ComputationNodeNonLooping, public NumInputs<1> { - typedef ComputationNodeNonLooping Base; UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() { + typedef ComputationNodeNonLooping Base; + UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() + { return L"ToBatchAxisNode"; } + public: ToBatchAxisNode(DEVICEID_TYPE deviceId, const wstring& name) : Base(deviceId, name) { - } - + virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override { auto& inputValue = InputRef(0).Value(); @@ -570,7 +655,7 @@ public: return false; } - virtual ParentGradientOptimization ImplementsGradientOptimization(const ComputationNodeBase* input) const override + virtual ParentGradientOptimization ImplementsGradientOptimization(const ComputationNodeBase* input) const override { return ParentGradientOptimization::Reuse; } @@ -583,7 +668,7 @@ public: virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override { Base::Validate(isFinalValidationPass); - + if (!m_pMBLayout) { m_pMBLayout = make_shared(); // this generates a new layout @@ -614,14 +699,16 @@ public: template class ToBatchAxisNode; template class ToBatchAxisNode; - template class UnpackBatchAxisNode : public ComputationNodeNonLooping, public NumInputs<1> { - typedef ComputationNodeNonLooping Base; UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() { + typedef ComputationNodeNonLooping Base; + UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() + { return L"UnpackBatchAxis"; } + public: UnpackBatchAxisNode(DEVICEID_TYPE deviceId, const wstring& name) : Base(deviceId, name) @@ -644,12 +731,12 @@ public: ReshapeNode::BackpropImpl(gradient, inputGradient, Input(0)->IsGradientOptimized(this), Input(0)->ParentGradientReused()); } - virtual bool OutputUsedInComputingInputNodesGradients() const override + 
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } - virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } @@ -659,7 +746,7 @@ public: return ParentGradientOptimization::Reuse; } - bool ForceDynamicValidation() const override + bool ForceDynamicValidation() const override { return true; } @@ -705,19 +792,23 @@ template class UnpackBatchAxisNode; template class SliceNode : public ComputationNode, public NumInputs<1> { - typedef ComputationNode Base; UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() { return L"Slice"; } + typedef ComputationNode Base; + UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() + { + return L"Slice"; + } public: SliceNode(DEVICEID_TYPE deviceId, const wstring& name, std::vector beginIndex = {0}, std::vector endIndex = {0}, std::vector axis = {1}, std::vector stride_multiplier = {1}) : Base(deviceId, name), m_beginIndex(beginIndex), m_endIndex(endIndex), m_axis(axis), m_stride_multiplier(stride_multiplier) { if (m_beginIndex.size() != m_endIndex.size() || m_beginIndex.size() != m_axis.size()) - InvalidArgument("%ls %ls operation: invalid size of beginIndex (%d), endIndx (%d) and axis (%d). They must agree.", NodeName().c_str(), OperationName().c_str(), (int)m_beginIndex.size(), (int)m_endIndex.size(), (int)m_axis.size()); + InvalidArgument("%ls %ls operation: invalid size of beginIndex (%d), endIndx (%d) and axis (%d). They must agree.", NodeName().c_str(), OperationName().c_str(), (int) m_beginIndex.size(), (int) m_endIndex.size(), (int) m_axis.size()); } SliceNode(const ScriptableObjects::IConfigRecordPtr configp) - : SliceNode(configp->Get(L"deviceId"), L"", { configp->Get(L"beginIndex") }, { configp->Get(L"endIndex") }, { configp->Get(L"axis") }) + : SliceNode(configp->Get(L"deviceId"), L"", {configp->Get(L"beginIndex")}, {configp->Get(L"endIndex")}, {configp->Get(L"axis")}) { AttachInputsFromConfig(configp, this->GetExpectedNumInputs()); } @@ -735,35 +826,35 @@ public: Base::CopyTo(nodeP, newName, flags); auto node = dynamic_pointer_cast>(nodeP); node->m_beginIndex = m_beginIndex; - node->m_endIndex = m_endIndex; - node->m_axis = m_axis; + node->m_endIndex = m_endIndex; + node->m_axis = m_axis; node->m_stride_multiplier = m_stride_multiplier; } virtual void Load(File& fstream, size_t modelVersion) override { Base::Load(fstream, modelVersion); - int num = 1, axis = 1, stride_multiplier = 1; // axis = 1 to emulate old RowSliceNode + int num = 1, axis = 1, stride_multiplier = 1; // axis = 1 to emulate old RowSliceNode ptrdiff_t beginIndex, height; if (modelVersion >= CNTK_MODEL_VERSION_22) - fstream >> num; + fstream >> num; if (num < 1) - InvalidArgument("Slice node number of axes (%d) invalid, must be >=1", num); + InvalidArgument("Slice node number of axes (%d) invalid, must be >=1", num); - m_beginIndex.clear(); + m_beginIndex.clear(); m_endIndex.clear(); - m_axis.clear(); + m_axis.clear(); m_stride_multiplier.clear(); for (int i = 0; i < num; i++) { fstream >> beginIndex >> height; // legacy format stored (end-begin) - m_beginIndex.push_back((int)beginIndex); - m_endIndex.push_back((int)(beginIndex + height)); + m_beginIndex.push_back((int) beginIndex); + m_endIndex.push_back((int) (beginIndex + height)); if (modelVersion >= CNTK_MODEL_VERSION_3) fstream >> axis; if (modelVersion >= CNTK_MODEL_VERSION_27) fstream >> 
stride_multiplier; - m_axis.push_back(axis); + m_axis.push_back(axis); m_stride_multiplier.push_back(stride_multiplier); } } @@ -771,57 +862,64 @@ public: virtual void Save(File& fstream) const override { Base::Save(fstream); - int num = (int)m_axis.size(); - fstream << num; + int num = (int) m_axis.size(); + fstream << num; for (auto i = 0; i < num; i++) { - fstream << (ptrdiff_t)m_beginIndex[i] << (ptrdiff_t)(m_endIndex[i] - m_beginIndex[i]); // legacy file format stores (end-begin), we keep it that way + fstream << (ptrdiff_t) m_beginIndex[i] << (ptrdiff_t)(m_endIndex[i] - m_beginIndex[i]); // legacy file format stores (end-begin), we keep it that way fstream << m_axis[i]; fstream << m_stride_multiplier[i]; } } // these implement numpy-style negative bound values to index from the end - std::vector BeginIndex() const { return m_beginIndex; } - size_t BeginIndex(int idx) const + std::vector BeginIndex() const { - if (idx >= (int)m_axis.size()) - InvalidArgument("Slice BeginIndex call with invalid index (%d) >= axis size (%d)", idx, (int)m_axis.size()); - return m_beginIndex[idx] >= 0 ? (size_t)m_beginIndex[idx] : (size_t)(m_beginIndex[idx] + InputRef(0).GetSampleLayout()[m_axis[idx] - 1]); + return m_beginIndex; } - std::vector EndIndex() const { return m_endIndex; } - size_t EndIndex(int idx) const + size_t BeginIndex(int idx) const { - if (idx >= (int)m_axis.size()) - InvalidArgument("Slice EndIndex call with invalid index (%d) >= axis size (%d)", idx, (int)m_axis.size()); - return m_endIndex[idx] > 0 ? (size_t)m_endIndex[idx] : (size_t)(m_endIndex[idx] + InputRef(0).GetSampleLayout()[m_axis[idx] - 1]); + if (idx >= (int) m_axis.size()) + InvalidArgument("Slice BeginIndex call with invalid index (%d) >= axis size (%d)", idx, (int) m_axis.size()); + return m_beginIndex[idx] >= 0 ? (size_t) m_beginIndex[idx] : (size_t)(m_beginIndex[idx] + InputRef(0).GetSampleLayout()[m_axis[idx] - 1]); } - std::vector Axis() const { return m_axis; } - int Axis(int idx) const - { - if (idx >= (int)m_axis.size()) - InvalidArgument("Slice Axis call with invalid index (%d) >= axis size (%d)", idx, (int)m_axis.size()); - return m_axis[idx]; + std::vector EndIndex() const + { + return m_endIndex; + } + size_t EndIndex(int idx) const + { + if (idx >= (int) m_axis.size()) + InvalidArgument("Slice EndIndex call with invalid index (%d) >= axis size (%d)", idx, (int) m_axis.size()); + return m_endIndex[idx] > 0 ? 
(size_t) m_endIndex[idx] : (size_t)(m_endIndex[idx] + InputRef(0).GetSampleLayout()[m_axis[idx] - 1]); + } + std::vector Axis() const + { + return m_axis; + } + int Axis(int idx) const + { + if (idx >= (int) m_axis.size()) + InvalidArgument("Slice Axis call with invalid index (%d) >= axis size (%d)", idx, (int) m_axis.size()); + return m_axis[idx]; } private: - // determine the tensor shape that represents slice of the input that we are taking - TensorShape GetInputSlice(size_t rank, const FrameRange & fr) const + TensorShape GetInputSlice(size_t rank, const FrameRange& fr) const { - auto inputSlice = InputRef(0).GetTensorSliceFor(rank, fr); // input must be narrowed down - for (int i = 0; i < (int)m_axis.size(); i++) - inputSlice.NarrowTo(Axis(i)-1, BeginIndex(i), EndIndex(i), m_stride_multiplier[i]); + auto inputSlice = InputRef(0).GetTensorSliceFor(rank, fr); // input must be narrowed down + for (int i = 0; i < (int) m_axis.size(); i++) + inputSlice.NarrowTo(Axis(i) - 1, BeginIndex(i), EndIndex(i), m_stride_multiplier[i]); return inputSlice; } public: - virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override { size_t rank = DetermineElementwiseTensorRank(); auto output = ValueTensorFor(rank, fr); - let input = TensorView(InputRef(0).ValuePtr(), GetInputSlice(rank, fr.AllowBroadcast())); + let input = TensorView(InputRef(0).ValuePtr(), GetInputSlice(rank, fr.AllowBroadcast())); output.AssignCopyOf(input); } @@ -833,8 +931,14 @@ public: inputGrad.AddCopyOf(outputGrad); } - virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } - virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } + virtual bool OutputUsedInComputingInputNodesGradients() const override + { + return false; + } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override + { + return false; + } virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override { @@ -842,13 +946,13 @@ public: InferMBLayoutFromInputsForStandardCase(isFinalValidationPass); auto sampleLayout = Input(0)->GetSampleLayout(); - for (int i = 0; i < (int)m_axis.size(); i++) + for (int i = 0; i < (int) m_axis.size(); i++) { if (m_axis[i] < 1 || (isFinalValidationPass && m_axis[i] > sampleLayout.GetRank())) RuntimeError("%ls %ls operation: axis parameter %d (%d) must be in range 1..rank of input ([%s]).", NodeName().c_str(), OperationName().c_str(), i, m_axis[i], string(sampleLayout).c_str()); if (isFinalValidationPass && (sampleLayout[m_axis[i] - 1] < EndIndex(i) || EndIndex(i) < BeginIndex(i) || BeginIndex(i) < 0)) - RuntimeError("%ls %ls operation: Index range [%d,%d), interpreted as [%d,%d), is invalid for input ([%s]).", NodeName().c_str(), OperationName().c_str(), m_beginIndex[i], m_endIndex[i], (int)BeginIndex(i), (int)EndIndex(i), string(sampleLayout).c_str()); + RuntimeError("%ls %ls operation: Index range [%d,%d), interpreted as [%d,%d), is invalid for input ([%s]).", NodeName().c_str(), OperationName().c_str(), m_beginIndex[i], m_endIndex[i], (int) BeginIndex(i), (int) EndIndex(i), string(sampleLayout).c_str()); // propagate as much as we can if (isFinalValidationPass || (m_axis[i] - 1 < sampleLayout.GetRank() && 0 <= BeginIndex(i) && BeginIndex(i) <= EndIndex(i) && EndIndex(i) <= sampleLayout[m_axis[i] - 1])) // (the second condition guards against failing an out-of-bounds error if not isFinalValidationPass) @@ -868,40 +972,40 @@ template class SliceNode; enum class PaddingType { - 
CONSTANTPAD = 0, // the default, fill the padding cells with 0 - REFLECTPAD = 1, // Padding with reflect mode + CONSTANTPAD = 0, // the default, fill the padding cells with 0 + REFLECTPAD = 1, // Padding with reflect mode SYMMETRICPAD = 2, // Padding with symmetric mode }; template class PaddingNode : public ComputationNode, public NumInputs<1> { - typedef ComputationNode Base; + typedef ComputationNode Base; UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() + static const std::wstring TypeName() { return L"Padding"; } -public: - +public: public: PaddingNode(DEVICEID_TYPE deviceId, const wstring& name, std::vector head, std::vector foot, PaddingType mode = PaddingType::CONSTANTPAD, double constantValue = 0) - : Base(deviceId, name), m_head(head), m_foot(foot), m_mode(mode), m_constant_value((ElemType)constantValue) + : Base(deviceId, name), m_head(head), m_foot(foot), m_mode(mode), m_constant_value((ElemType) constantValue) { } - PaddingNode(DEVICEID_TYPE deviceId, const wstring& name) : Base(deviceId, name) + PaddingNode(DEVICEID_TYPE deviceId, const wstring& name) + : Base(deviceId, name) { } - + public: virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override { size_t rank = DetermineElementwiseTensorRank(); auto outputSlice = GetTensorSliceFor(rank, fr); // tensor slice that represents the entire output for FrameRange let input = InputRef(0).ValueTensorFor(rank, fr.AllowBroadcast()); - int maxRank = (int)(Input(0)->GetSampleLayout().GetRank()); + int maxRank = (int) (Input(0)->GetSampleLayout().GetRank()); let dims = Input(0)->GetSampleLayout().GetDims(); let outputDims = GetSampleLayout().GetDims(); auto outputSubSlice = outputSlice; @@ -935,14 +1039,14 @@ public: } } } - + virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override { size_t rank = DetermineElementwiseTensorRank(); let outputSlice = GetTensorSliceFor(rank, fr); // tensor slice that represents the entire output for FrameRange auto inputGrad = InputRef(inputIndex).GradientTensorFor(rank, fr.AllowBroadcast()); - int maxRank = (int)(Input(inputIndex)->GetSampleLayout().GetRank()); + int maxRank = (int) (Input(inputIndex)->GetSampleLayout().GetRank()); let dims = Input(inputIndex)->GetSampleLayout().GetDims(); let outputDims = GetSampleLayout().GetDims(); // first folder the gradients if its padding mode is reflect or symmetric @@ -973,12 +1077,12 @@ public: inputGrad.AddCopyOf(outputGrad); } - virtual bool OutputUsedInComputingInputNodesGradients() const override + virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } - - virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override + + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } @@ -1017,7 +1121,6 @@ public: } private: - void FillPaddingCells(const FrameRange& fr, size_t rank, size_t axis, size_t outputIndex, size_t inputIndex, size_t size, bool reverse) { if (size > 0) @@ -1101,7 +1204,10 @@ class CropNode : public ComputationNode, public TransformerNode typedef ComputationNode Base; UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() { return L"Crop"; } + static const std::wstring TypeName() + { + return L"Crop"; + } public: CropNode(DEVICEID_TYPE deviceId, const std::wstring& name); @@ -1127,17 +1233,18 @@ private: using TransformerNode::m_transforms; // Declaration of matrix getting method to unify accessing values and 
gradients. - typedef MatrixBasePtr(ComputationNode::*MatrixGetter)() const; + typedef MatrixBasePtr (ComputationNode::*MatrixGetter)() const; // Helper structure to store input/output views which define parts of input and output we work with. struct CroppedIOViews { - CroppedIOViews(CropNode* cropNode, MatrixGetter matrixGetter, TensorShape inputShapeCropped, TensorShape outputShape) : - // Input view is derived from first input. - inputViewCropped((cropNode->Input(0).get()->*matrixGetter)(), inputShapeCropped), - // Output view corresponds to single output. - outputView((cropNode->*matrixGetter)(), outputShape) - {} + CroppedIOViews(CropNode* cropNode, MatrixGetter matrixGetter, TensorShape inputShapeCropped, TensorShape outputShape) + : // Input view is derived from first input. + inputViewCropped((cropNode->Input(0).get()->*matrixGetter)(), inputShapeCropped), + // Output view corresponds to single output. + outputView((cropNode->*matrixGetter)(), outputShape) + { + } TensorView inputViewCropped; TensorView outputView; @@ -1151,9 +1258,9 @@ private: // Performs offsets computation if necessary. void ComputeCropOffsets(); - virtual void /*TransformerNode::*/ComputeTransforms() override; + virtual void /*TransformerNode::*/ ComputeTransforms() override; - virtual bool /*TransformerNode::*/SupportsTransformOnInput(size_t inputIndex) override; + virtual bool /*TransformerNode::*/ SupportsTransformOnInput(size_t inputIndex) override; protected: // Offset along x axis. We need to store offsets as floats for precision if one crop node affects computation of other. @@ -1172,11 +1279,15 @@ protected: template class RowStackNode : public ComputationNode // note: not deriving from NumInputs<> like most other nodes, because this one takes a variable number of inputs { - typedef ComputationNode Base; UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() { return L"RowStack"; } + typedef ComputationNode Base; + UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() + { + return L"RowStack"; + } public: - RowStackNode(DEVICEID_TYPE deviceId, const wstring& name, int spliceDim = 1/*TODO: complete this*/) + RowStackNode(DEVICEID_TYPE deviceId, const wstring& name, int spliceDim = 1 /*TODO: complete this*/) : Base(deviceId, name), m_spliceDim(spliceDim) { } @@ -1194,7 +1305,7 @@ public: { auto node = dynamic_pointer_cast>(nodeP); node->m_firstIndices = m_firstIndices; - node->m_spliceDim = m_spliceDim; + node->m_spliceDim = m_spliceDim; } } @@ -1215,11 +1326,11 @@ public: private: // changes the result slice (which includes all stacked inputs) to the stripe that matches where one of the inputs goes - TensorShape NarrowToStripe(const TensorShape & resultSlice, size_t inputIndex) + TensorShape NarrowToStripe(const TensorShape& resultSlice, size_t inputIndex) { auto resultSubSlice = resultSlice; assert(m_spliceDim > 0); - size_t index = (size_t)m_spliceDim - 1; + size_t index = (size_t) m_spliceDim - 1; resultSubSlice.NarrowTo(index, m_firstIndices[inputIndex], m_firstIndices[inputIndex + 1]); return resultSubSlice; } @@ -1250,8 +1361,14 @@ public: inputGrad.AddCopyOf(outputGrad); } - virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } - virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } + virtual bool OutputUsedInComputingInputNodesGradients() const override + { + return false; + } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const 
override + { + return false; + } virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override { @@ -1263,7 +1380,7 @@ public: // determine maximum rank (we can stack tensors with lower rank, which will have their dimensions padded to max automatically) assert(m_spliceDim > 0); - size_t index = (size_t)m_spliceDim - 1; + size_t index = (size_t) m_spliceDim - 1; size_t maxRank = index + 1; // spliceDim may exceed all of them, which will create a new dimension, e.g. stacking column vectors into a matrix for (int i = 0; i < GetNumInputs(); i++) if (maxRank < Input(i)->GetSampleLayout().GetRank()) @@ -1278,7 +1395,7 @@ public: for (int i = 0; i < GetNumInputs(); i++) { // check/fuse dims and accumulate the spliced dimension - let & shape = Input(i)->GetSampleLayout(); + let& shape = Input(i)->GetSampleLayout(); for (size_t k = 0; k < maxRank; k++) { size_t dim = shape.GetDimPadded(k); @@ -1286,15 +1403,15 @@ { // accumulate the spliced dimension dims[index] += dim; - m_firstIndices.push_back(dims[index]); // and remember it + m_firstIndices.push_back(dims[index]); // and remember it } else { // check/fuse dimensions if (isFinalValidationPass && dim != dims[k] && dim != 1 && dims[k] != 1) InvalidArgument("%ls %ls operation: Conflicting dimension %d between %ls %ls operation (%d) and other(s) (%d)", - NodeName().c_str(), OperationName().c_str(), (int)k, Input(i)->NodeName().c_str(), Input(i)->OperationName().c_str(), (int)dim, (int)dims[k]); - if (dims[k] == 1) // broadcast + NodeName().c_str(), OperationName().c_str(), (int) k, Input(i)->NodeName().c_str(), Input(i)->OperationName().c_str(), (int) dim, (int) dims[k]); + if (dims[k] == 1) // broadcast dims[k] = dim; } } @@ -1303,7 +1420,10 @@ public: SetDims(TensorShape(dims), HasMBLayout()); } - int GetSpliceDim() const { return m_spliceDim; } + int GetSpliceDim() const + { + return m_spliceDim; + } private: std::vector m_firstIndices; // start row number in the stacked matrix of each input (child) (cumsum of matrix heights); plus one final entry that equals the total dimension @@ -1320,8 +1440,12 @@ template class RowStackNode; template class RowRepeatNode : public ComputationNode, public NumInputs<1> { - typedef ComputationNode Base; UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() { return L"RowRepeat"; } + typedef ComputationNode Base; + UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() + { + return L"RowRepeat"; + } public: RowRepeatNode(DEVICEID_TYPE deviceId, const wstring& name, size_t numRepeats = 1) @@ -1385,8 +1509,14 @@ public: InputRef(0).GradientFor(fr).AddToRowRepeatValuesOf(GradientFor(fr), m_numRepeat); } - virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } - virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } + virtual bool OutputUsedInComputingInputNodesGradients() const override + { + return false; + } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override + { + return false; + } private: size_t m_numRepeat; @@ -1430,22 +1560,36 @@ and Scatter().
template class WhereNode : public ComputationNodeNonLooping, public NumInputs<1> { - typedef ComputationNodeNonLooping Base; UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() { return L"Where"; } + typedef ComputationNodeNonLooping Base; + UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() + { + return L"Where"; + } + + static const std::wstring DefaultWhereNodeDynamicAxisName() + { + return L"WhereNodeAxis"; + } - static const std::wstring DefaultWhereNodeDynamicAxisName() { return L"WhereNodeAxis"; } public: DeclareConstructorFromConfigWithNumInputs(WhereNode); - WhereNode(DEVICEID_TYPE deviceId, const wstring& name, const wstring& dynamicAxisName = DefaultWhereNodeDynamicAxisName()) : - Base(deviceId, name), m_dynamicAxisName(dynamicAxisName) + WhereNode(DEVICEID_TYPE deviceId, const wstring& name, const wstring& dynamicAxisName = DefaultWhereNodeDynamicAxisName()) + : Base(deviceId, name), m_dynamicAxisName(dynamicAxisName) { MarkValueNonSharable(); } virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override; virtual void /*ComputationNodeNonLooping::*/ BackpropToNonLooping(size_t /*inputIndex*/) override; - virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } - virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } + virtual bool OutputUsedInComputingInputNodesGradients() const override + { + return false; + } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override + { + return false; + } virtual void Validate(bool isFinalValidationPass) override; virtual void Load(File& fstream, size_t modelVersion) override @@ -1463,12 +1607,15 @@ public: fstream << m_dynamicAxisName; } - std::wstring DynamicAxisName() const { return m_dynamicAxisName; } + std::wstring DynamicAxisName() const + { + return m_dynamicAxisName; + } private: // buffers for creating the result sequences (kept as object state to avoid memory allocations) - std::vector> m_indexSequenceBuffer; // [sequenceIndex][t] for creating the result sequences - std::vector m_rowAllocationsBuffer; // [row] for determining new MBLayout packing + std::vector> m_indexSequenceBuffer; // [sequenceIndex][t] for creating the result sequences + std::vector m_rowAllocationsBuffer; // [row] for determining new MBLayout packing std::vector> m_placementBuffer; // [sequenceIndex] assigned location for a sequence std::wstring m_dynamicAxisName; }; @@ -1486,25 +1633,35 @@ private: template class PackedIndexNode : public ComputationNodeNonLooping, public NumInputs<2> { - typedef ComputationNodeNonLooping Base; UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() { return L"PackedIndex"; } + typedef ComputationNodeNonLooping Base; + UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() + { + return L"PackedIndex"; + } // our inputs static const size_t SOURCEDATA = 0; - static const size_t INDEXDATA = 1; + static const size_t INDEXDATA = 1; public: DeclareConstructorFromConfigWithNumInputs(PackedIndexNode); - PackedIndexNode(DEVICEID_TYPE deviceId, const wstring& name) : - Base(deviceId, name) + PackedIndexNode(DEVICEID_TYPE deviceId, const wstring& name) + : Base(deviceId, name) { MarkValueNonSharable(); } virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override; virtual void /*ComputationNodeNonLooping::*/ BackpropToNonLooping(size_t /*inputIndex*/) override; - virtual bool 
OutputUsedInComputingInputNodesGradients() const override { return false; } - virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } + virtual bool OutputUsedInComputingInputNodesGradients() const override + { + return false; + } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override + { + return false; + } virtual void Validate(bool isFinalValidationPass) override; }; @@ -1519,8 +1676,12 @@ public: template class GatherPackedNode : public ComputationNodeNonLooping, public NumInputs<2> { - typedef ComputationNodeNonLooping Base; UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() { return L"GatherPacked"; } + typedef ComputationNodeNonLooping Base; + UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() + { + return L"GatherPacked"; + } // our inputs static const size_t INDEXDATA = 0; @@ -1528,15 +1689,21 @@ class GatherPackedNode : public ComputationNodeNonLooping, public NumI public: DeclareConstructorFromConfigWithNumInputs(GatherPackedNode); - GatherPackedNode(DEVICEID_TYPE deviceId, const wstring& name) : - Base(deviceId, name) + GatherPackedNode(DEVICEID_TYPE deviceId, const wstring& name) + : Base(deviceId, name) { } virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override; virtual void /*ComputationNodeNonLooping::*/ BackpropToNonLooping(size_t inputIndex) override; - virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } - virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override { return childIndex == INDEXDATA; } + virtual bool OutputUsedInComputingInputNodesGradients() const override + { + return false; + } + virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override + { + return childIndex == INDEXDATA; + } virtual void Validate(bool isFinalValidationPass) override; }; @@ -1552,25 +1719,35 @@ public: template class ScatterPackedNode : public ComputationNodeNonLooping, public NumInputs<3> { - typedef ComputationNodeNonLooping Base; UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() { return L"ScatterPacked"; } + typedef ComputationNodeNonLooping Base; + UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() + { + return L"ScatterPacked"; + } // our inputs static const size_t LAYOUTDATA = 0; - static const size_t INDEXDATA = 1; + static const size_t INDEXDATA = 1; static const size_t SOURCEDATA = 2; public: DeclareConstructorFromConfigWithNumInputs(ScatterPackedNode); - ScatterPackedNode(DEVICEID_TYPE deviceId, const wstring& name) : - Base(deviceId, name) + ScatterPackedNode(DEVICEID_TYPE deviceId, const wstring& name) + : Base(deviceId, name) { } virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override; virtual void /*ComputationNodeNonLooping::*/ BackpropToNonLooping(size_t inputIndex) override; - virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } - virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override { return childIndex == INDEXDATA; } + virtual bool OutputUsedInComputingInputNodesGradients() const override + { + return false; + } + virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override + { + return childIndex == INDEXDATA; + } virtual void Validate(bool isFinalValidationPass) override; }; @@ -1581,8 +1758,12 @@ public: template class DiagonalNode : 
public ComputationNodeNonLooping, public NumInputs<1> { - typedef ComputationNodeNonLooping Base; UsingComputationNodeMembersBoilerplate; - static const std::wstring TypeName() { return L"Diagonal"; } + typedef ComputationNodeNonLooping Base; + UsingComputationNodeMembersBoilerplate; + static const std::wstring TypeName() + { + return L"Diagonal"; + } public: DeclareConstructorFromConfigWithNumInputs(DiagonalNode); @@ -1620,8 +1801,14 @@ public: #endif } - virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } - virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } + virtual bool OutputUsedInComputingInputNodesGradients() const override + { + return false; + } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override + { + return false; + } virtual void Validate(bool isFinalValidationPass) override { @@ -1802,7 +1989,7 @@ public: { } LegacyReshapeNode(const ScriptableObjects::IConfigRecordPtr configp) - : LegacyReshapeNode(configp->Get(L"deviceId"), L"", configp->Get(L"numRows"), ImageDimensions::AsTensorShape(configp->Get(L"imageWidth"), configp->Get(L"imageHeight"), configp->Get(L"imageChannels"), ImageLayoutKindFrom(configp->Get(L"imageLayout"))/*legacy*/)) + : LegacyReshapeNode(configp->Get(L"deviceId"), L"", configp->Get(L"numRows"), ImageDimensions::AsTensorShape(configp->Get(L"imageWidth"), configp->Get(L"imageHeight"), configp->Get(L"imageChannels"), ImageLayoutKindFrom(configp->Get(L"imageLayout")) /*legacy*/)) { // BUGBUG: We should not operate on image layouts here, but on a proper tensor layout. AttachInputsFromConfig(configp, this->GetExpectedNumInputs()); @@ -1919,8 +2106,14 @@ public: } } - virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; } - virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; } + virtual bool OutputUsedInComputingInputNodesGradients() const override + { + return false; + } + virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override + { + return false; + } virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override { @@ -2051,7 +2244,6 @@ private: template class LegacyReshapeNode; template class LegacyReshapeNode; - template class GatherNode : public ComputationNodeNonLooping, public NumInputs<2> { @@ -2063,7 +2255,8 @@ class GatherNode : public ComputationNodeNonLooping, public NumInputs< } public: - GatherNode(DEVICEID_TYPE deviceId, const wstring& name) : Base(deviceId, name) + GatherNode(DEVICEID_TYPE deviceId, const wstring& name) + : Base(deviceId, name) { } @@ -2092,7 +2285,7 @@ public: { if (inputIndex == 1) // only the right operand needs a gradient computed { - let& indices = InputRef(0).Value(); + let& indices = InputRef(0).Value(); const auto& indicesMask = InputRef(0).GetMBLayout()->GetColumnsValidityMask(indices.GetDeviceId()); auto& sourceGradient = InputRef(1).Gradient(); auto& outputGradient = Gradient(); @@ -2118,10 +2311,12 @@ public: } } - virtual bool OutputUsedInComputingInputNodesGradients() const override { + virtual bool OutputUsedInComputingInputNodesGradients() const override + { return false; } - virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override { + virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override + { return childIndex == 0; } @@ -2143,7 +2338,7 @@ public: const auto& inputDims2 =
inputSampleLayout2.GetDims(); SmallVector dims; - dims.append(inputDims2.begin(), inputDims2.end() - 1);//pop the last dim of right operand + dims.append(inputDims2.begin(), inputDims2.end() - 1); //pop the last dim of right operand dims.append(inputDims1.begin(), inputDims1.end()); auto sampleLayout = TensorShape(dims); @@ -2353,4 +2548,6 @@ other */ -}}} +} // namespace CNTK +} // namespace MSR +} // namespace Microsoft diff --git a/Source/Math/CPUMatrix.h b/Source/Math/CPUMatrix.h index 3ac37bfc9..f9a0c1a8a 100755 --- a/Source/Math/CPUMatrix.h +++ b/Source/Math/CPUMatrix.h @@ -311,6 +311,7 @@ public: CPUMatrix& InplaceSoftThreshold(const ElemType threshold); CPUMatrix& SetToZeroIfAbsLessThan(const ElemType threshold); + CPUMatrix& SetToZeroIfLessThan(const ElemType threshold); ElemType SumOfAbsElements() const; // sum of all abs(elements) ElemType SumOfElements() const; // sum of all elements diff --git a/Source/Math/CPUMatrixImpl.h b/Source/Math/CPUMatrixImpl.h index 2f997f564..df09d92db 100644 --- a/Source/Math/CPUMatrixImpl.h +++ b/Source/Math/CPUMatrixImpl.h @@ -3215,6 +3215,24 @@ CPUMatrix& CPUMatrix::SetToZeroIfAbsLessThan(const ElemType return *this; } +template +CPUMatrix& CPUMatrix::SetToZeroIfLessThan(const ElemType threshold) +{ + if (IsEmpty()) + LogicError("SetToZeroIfLessThan: Matrix is empty."); + + auto& us = *this; + +#pragma omp parallel for + foreach_coord (i, j, us) + { + if ((us(i, j)) < threshold) + us(i, j) = 0; + } + + return *this; +} + //sum of all abs(elements) template ElemType CPUMatrix::SumOfAbsElements() const diff --git a/Source/Math/GPUMatrix.cu b/Source/Math/GPUMatrix.cu index c9b27cdb3..ae7661e65 100755 --- a/Source/Math/GPUMatrix.cu +++ b/Source/Math/GPUMatrix.cu @@ -2486,6 +2486,18 @@ GPUMatrix& GPUMatrix::SetToZeroIfAbsLessThan(const ElemType return *this; } +template +GPUMatrix& GPUMatrix::SetToZeroIfLessThan(const ElemType threshold) +{ + if (IsEmpty()) + LogicError("SetToZeroIfLessThan: Matrix is empty."); + CUDA_LONG N = (CUDA_LONG) GetNumElements(); + int blocksPerGrid = (int) ceil(N * 1.0 / GridDim::maxThreadsPerBlock); + PrepareDevice(); + SyncGuard syncGuard; + _setToZeroIfLessThan<<>>(Data(), threshold, N); + return *this; +} template ElemType GPUMatrix::SumOfAbsElements() const { @@ -5318,6 +5330,119 @@ void GPUMatrix::TensorOp(ElemType beta, const GPUMatrix& a, return TensorOpN(beta, array{a.Data(), Data()}, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); } +// perform unary operation 'op' on a giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides +// This binds the N-ariness to a template parameter N, and gets the data pointers out from the matrix objects. 
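+// (TensorOpDebug below mirrors TensorOp, but additionally threads the source and destination matrices through to the launcher, which carries optional, commented-out norm printouts and allocates its reduction buffer per call via GetReductionBufferNoCache so that concurrent decode threads do not share state.)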
+template +void GPUMatrix::TensorOpDebug(ElemType beta, const GPUMatrix& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const array& offsets, + const SmallVector& regularOpDims, const array, 2>& regularStrides, + const SmallVector& reducingOpDims, const array, 2>& reducingStrides) +{ + if (reductionOp != ElementWiseOperator::opSum && + reductionOp != ElementWiseOperator::opLogSum && + reductionOp != ElementWiseOperator::opMin && + reductionOp != ElementWiseOperator::opMax && + reductionOp != ElementWiseOperator::opElementwiseProduct) + InvalidArgument("TensorOp: Unary reduction operations other than opSum, opLogSum, opMin, opMax, and opElementwiseProduct are not implemented."); + + a.PrepareDevice(); + if (a.GetComputeDeviceId() != GetComputeDeviceId()) + InvalidArgument("All matrices must be on the same GPU"); + + // special case: linear processing + // The case statement has measurable impact for unary ops (but not for binary ops it seems, due to double mem access). + // Linear gap-free unary ops happen so regularly that we will eliminate the case statement from the CUDA kernel, and instead expand all. + if (regularOpDims.size() == 1 && regularStrides[0][0] == 1 && regularStrides[1][0] == 1 && reducingOpDims.size() == 0) + { + // special case: for copy, use cudaMemcpy() instead, or cublas_axpy() + // TODO: We should observe if these actually make a speed difference, and if not, remove these special cases. + if (op == ElementWiseOperator::opCopy && beta == 0 && alpha == 1) + { + //fprintf(stderr, "TensorOpDebug 1 \n"); + return CUDA_CALL(cudaMemcpy(Data() + offsets[1], a.Data() + offsets[0], sizeof(ElemType) * regularOpDims[0], cudaMemcpyDeviceToDevice)); + } + else if (op == ElementWiseOperator::opCopy && beta == 1) + { + //fprintf(stderr, "TensorOpDebug 2 \n"); + return CUBLAS_CALL(cublasaxpyHelper(GetCublasHandle(GetComputeDeviceId()), (int) regularOpDims[0], &alpha, a.Data() + offsets[0], 1, Data() + offsets[1], 1)); + } + else + { + //fprintf(stderr, "TensorOpDebug 3 \n"); + + return LaunchUnaryTensorOp(beta, a.Data() + offsets[0], Data() + offsets[1], alpha, op, regularOpDims[0]); + } + } + + // special case: sum-reducing a matrix onto a column vector; can be done with SGEMM + // Note: A minor risk is that with this, our own reduction function will rarely be used. + // That function was tested to give the same results with 'double', and nearly the same with 'float' (different summation order matters).
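+ // (Illustration of the GEMM trick used in this branch: summing the ACols columns of an [ARows x ACols] block into a single column is expressed as result = alpha * A * ones(ACols, 1) + beta * result, which is exactly what the cublasgemmHelper call below computes.)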
+ else if (op == ElementWiseOperator::opCopy && // we are just adding to target without any further operation + reductionOp == ElementWiseOperator::opSum && +#ifdef _DEBUG + sizeof(ElemType) == sizeof(float) && // in debug don't shortcut 'double' so we have some test of our own codepath +#endif + regularOpDims.size() == 1 && regularStrides[0][0] == 1 && regularStrides[1][0] == 1 && // we are processing a column + reducingOpDims.size() == 1 && reducingStrides[0][0] >= (ptrdiff_t) regularOpDims[0]) // reducing across columns and no overlap + { + assert(reducingStrides[1][0] == 0); + auto ARows = regularOpDims[0]; // vertical steps + auto ACols = reducingOpDims[0]; // horizontal steps (reduction) + auto ALd = reducingStrides[0][0]; // horizontal step width through matrix + cublasHandle_t cuHandle = GetCublasHandle(a.GetComputeDeviceId()); + CUBLAS_CALL(cublasgemmHelper(cuHandle, CUBLAS_OP_N, CUBLAS_OP_N, (int) /*CRows=*/ARows, /*CCols=*/1, (int) ACols, &alpha, + /*A00=*/a.Data() + offsets[0], (int) ALd, + /*B00=*/GetOnesVector(ACols, a.GetComputeDeviceId())->Data(), (int) /*BRows=*/ACols, &beta, + /*C00=*/Data() + offsets[1], (int) /*CRows=*/ARows)); + //fprintf(stderr, "TensorOpDebug 4 \n"); + + return; + } + + // TODO: Add a special case for tensor bias reduction. cudnn is ~7% faster on Image/QuickE2E. + + // regular case + else + { + /* + fprintf(stderr, "TensorOpDebug 5 \n"); + for (size_t i = 0; i < 2; i++) + { + // fprintf(stderr, "i = %d, offsets = %d, regularStrides = %d, reducingStrides = %d \n ", int(i), int(offsets[i]), int(regularStrides[i]), int(reducingStrides[i])); + fprintf(stderr, "TensorOpDebug 5.1, i = %d, offsets = %d\n ", int(i), int(offsets[i])); + } + fprintf(stderr, "TensorOpDebug 5.1, regularOpDims.size() = %d \n", int(regularOpDims.size())); + + for (size_t i = 0; i < regularOpDims.size(); i++) + fprintf(stderr, "TensorOpDebug 5.1, i = %d, regularOpDims = %d\n ", int(i), int(regularOpDims[i])); + + fprintf(stderr, "TensorOpDebug 5.1, reducingOpDims.size() = %d \n", int(reducingOpDims.size())); + for (size_t i = 0; i < reducingOpDims.size(); i++) + fprintf(stderr, "TensorOpDebug 5.1, i = %d, reducingOpDims = %d\n ", int(i), int(reducingOpDims[i])); + + for (size_t i = 0; i < 2; i++) + { + fprintf(stderr, "TensorOpDebug 5.1, i = %d, regularStrides.size() = %d \n", int(i), int(regularStrides[i].size())); + for (size_t j = 0; j < regularStrides[i].size(); j++) + { + fprintf(stderr, "TensorOpDebug 5.1, i = %d, j = %d, regularStrides = %d \n ", int(i), int(j), int(regularStrides[i][j])); + } + + fprintf(stderr, "TensorOpDebug 5.1, i = %d, reducingStrides.size() = %d \n", int(i), int(reducingStrides[i].size())); + for (size_t j = 0; j < reducingStrides[i].size(); j++) + { + fprintf(stderr, "TensorOpDebug 5.1, i = %d, j = %d, reducingStrides = %d \n ", int(i), int(j), int(reducingStrides[i][j])); + } + } + + fprintf(stderr, "TensorOpDebug 5.2, beta = %f, alpha = %f, a.data = %f, data = %f \n", double(beta), double(alpha), double(a.FrobeniusNorm()), double(FrobeniusNorm())); + // return TensorOpN(beta, array{a.Data(), Data()}, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + */ + return TensorOpNDebug(beta, array{a.Data(), Data()}, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides, a, *this); + + } +} + // perform binary operation 'op' on a and b giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides template void GPUMatrix::TensorOp(ElemType 
beta, const GPUMatrix& a, const GPUMatrix& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, diff --git a/Source/Math/GPUMatrix.h b/Source/Math/GPUMatrix.h index 02391dc0a..628e88fd1 100755 --- a/Source/Math/GPUMatrix.h +++ b/Source/Math/GPUMatrix.h @@ -437,6 +437,7 @@ public: GPUMatrix& InplaceSoftThreshold(const ElemType threshold); GPUMatrix& SetToZeroIfAbsLessThan(const ElemType threshold); + GPUMatrix& SetToZeroIfLessThan(const ElemType threshold); DeviceBoundNumber Sum_AsDeviceBoundNum() const; ElemType SumOfAbsElements() const; // sum of all abs(elements) @@ -607,6 +608,11 @@ public: const std::array& offsets, const SmallVector& regularOpDims, const std::array, 2>& regularStrides, const SmallVector& reducingOpDims, const std::array, 2>& reducingStrides); + void TensorOpDebug(ElemType beta, const GPUMatrix& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const std::array& offsets, + const SmallVector& regularOpDims, const std::array, 2>& regularStrides, + const SmallVector& reducingOpDims, const std::array, 2>& reducingStrides); + void TensorOp(ElemType beta, const GPUMatrix& a, const GPUMatrix& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, const std::array& offsets, const SmallVector& regularOpDims, const std::array, 3>& regularStrides, diff --git a/Source/Math/GPUMatrixCUDAKernels.cuh b/Source/Math/GPUMatrixCUDAKernels.cuh index 281a50391..60f6db240 100755 --- a/Source/Math/GPUMatrixCUDAKernels.cuh +++ b/Source/Math/GPUMatrixCUDAKernels.cuh @@ -1306,6 +1306,19 @@ __global__ void _setToZeroIfAbsLessThan( a[id] = 0; } +template +__global__ void _setToZeroIfLessThan( + ElemType* a, + const ElemType threshold, + const CUDA_LONG N) +{ + typedef typename TypeSelector::comp_t comp_t; + CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x; + if (id >= N) + return; + if (((comp_t) a[id]) < (comp_t) threshold) + a[id] = 0; +} template __global__ void _areEqual( const ElemType* a, diff --git a/Source/Math/GPUSparseMatrix.cu b/Source/Math/GPUSparseMatrix.cu index eee40d1b5..4284b2b56 100755 --- a/Source/Math/GPUSparseMatrix.cu +++ b/Source/Math/GPUSparseMatrix.cu @@ -2987,6 +2987,19 @@ GPUSparseMatrix& GPUSparseMatrix::SetToZeroIfAbsLessThan(con return *this; } +template +GPUSparseMatrix& GPUSparseMatrix::SetToZeroIfLessThan(const ElemType threshold) +{ + VerifyWritable(__FUNCTION__); + + if (IsEmpty()) + LogicError("SetToZeroIfLessThan: Matrix is empty."); + CUDA_LONG N = (CUDA_LONG) GetNumNZElements(); + int blocksPerGrid = (int) ceil(N * 1.0 / GridDim::maxThreadsPerBlock); + SyncGuard syncGuard; + _setToZeroIfLessThan<<>>(NzValues(), threshold, N); + return *this; +} #pragma endregion #pragma region Helper Functions diff --git a/Source/Math/GPUSparseMatrix.h b/Source/Math/GPUSparseMatrix.h index 49b9439dd..00832b4c4 100755 --- a/Source/Math/GPUSparseMatrix.h +++ b/Source/Math/GPUSparseMatrix.h @@ -395,7 +395,7 @@ public: GPUSparseMatrix& AssignTruncateTopOf(const GPUSparseMatrix& a, const ElemType threshold); GPUSparseMatrix& SetToZeroIfAbsLessThan(const ElemType threshold); - + GPUSparseMatrix& SetToZeroIfLessThan(const ElemType threshold); GPUSparseMatrix& AssignOneHot(const GPUMatrix& a, vector& shape, size_t axis); void SetDiagonalValue(const ElemType v); void SetDiagonalValue(const GPUMatrix& vector); diff --git a/Source/Math/GPUTensor.cu b/Source/Math/GPUTensor.cu index 2fdfdb515..3a94e8ab1 100644 --- a/Source/Math/GPUTensor.cu +++ b/Source/Math/GPUTensor.cu @@ -875,6 +875,12 @@ static shared_ptr 
GetReductionBuffer(size_t N) { return reductionBuffersCache[deviceId]; } +// allocates a fresh buffer on every call (no shared cache), so this is safe for multithreaded calling in RNNT_EMBR +template +static shared_ptr GetReductionBufferNoCache(size_t N) +{ + return AllocateReductionBuffer(N); +} // All dimensions (N-ariness, number of input dimensions K and number of reduction dimensions M) are bound to template parameters now. template static void LaunchTensorOpWithReduction(ElemType beta, array pointerVector, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, @@ -1112,6 +1118,272 @@ } + + +template +static void LaunchTensorOpWithReductionDebug(ElemType beta, array pointerVector, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const SmallVector& regularOpDims, const array, N>& regularStrideVectors, + const SmallVector& reducingOpDimVector, const array, N>& reducingStrideVectors, const GPUMatrix& a, GPUMatrix& result) +{ + //fprintf(stderr, "LaunchTensorOpWithReductionDebug 1, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm())); + // return TensorOpN(beta, array{a.Data(), Data()}, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + a; + result; // no-op references: 'a' and 'result' are otherwise used only by the commented-out diagnostics below; touching them avoids unused-parameter warnings + typedef typename TypeSelector::comp_t ReduceElemType; + // copy all parameters to CUDA-compatible data structures + FixedArray pointers(pointerVector); + SmallVector regularOpStrideVector; // kernel needs the strides for converting thread index back to multi-dimensional tensor index + C_size_t numElements = 1; + // input divisors + SmallVector regularOpStrideDivmodVector; + for (C_size_t k = 0; k < regularOpDims.size(); k++) + { + regularOpStrideVector.push_back(numElements); // stride for dense representation of our output elements (if they were flattened) + regularOpStrideDivmodVector.push_back(fast_divmod((unsigned int) numElements)); + numElements *= (C_size_t) regularOpDims[k]; + } + // output divisors + SmallVector reducingOpDimDivmodVector; + C_size_t stride = 1; + for (C_size_t k = 0; k < reducingOpDimVector.size(); ++k) + { + reducingOpDimDivmodVector.push_back(fast_divmod(stride)); + stride *= (C_size_t) reducingOpDimVector[k]; + } + + FixedArray regularOpStrides(regularOpStrideVector); + FixedMatrix regularStrides(regularStrideVectors); + FixedArray reducingOpDims(reducingOpDimVector); + FixedMatrix reducingStrides(reducingStrideVectors); + // reduced divisors + FixedArray regularOpStrideDivmod(regularOpStrideDivmodVector); + FixedArray reducingOpDimDivmod(reducingOpDimDivmodVector); + + // launch the kernel + CUDA_LONG NN = (CUDA_LONG) numElements; // linear space identifying each individual output element + SyncGuard syncGuard; + + // do some optimization for reductions + // - example: 30 GPU procs, warp size 32 --> 960 GPU cores + // - NN elements must be computed, each involving a reduction over reductionDim elements + // Cases: + // - #output elements NN >= GPU cores --> use one proc per element, do reduction in inner loop + // E.g. if >=960 elements are computed, each gets its own GPU thread. + // - reduction dimension would benefit from multiple blocks --> multiple blocks work on a single output element + // E.g. + // - gradient of adding a bias: reducing to a bias, e.g. 512-dim + // - gradient of scalar multiplication: big elementwise product reduced to a scalar (big dot product, e.g.
[1024 x 1024] = 1M elements) + // - softmax in seq-2-seq attention model: reduce over length of attention window (e.g. 20) + // - summation of criterion value: scalar reduction over a few hundred or thousand samples in the minibatch + C_size_t reductionDim = 1; // number of elements to reduce over + for (C_size_t k = 0; k < reducingOpDimVector.size(); k++) + reductionDim *= (C_size_t) reducingOpDimVector[k]; + GridDim grid(NN); + let& props = GridDim::GetDeviceProps(); + bool disableParallelReduction = false; // (for debugging) + //fprintf(stderr, "LaunchTensorOpWithReductionDebug 2, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm())); + + // === arg based reduction, one thread per output element + if ((reductionOp == ElementWiseOperator::opArgmax) || + (reductionOp == ElementWiseOperator::opArgmin)) + { + + _launchTensorArgOp<<>>( + pointers, reductionOp, + regularOpStrides, regularStrides, grid.m_N, + reducingOpDims, reducingStrides, + regularOpStrideDivmod, reducingOpDimDivmod); + } + // === simple case: NN large, one thread per output element + else if (reductionDim == 1 || // no reduction + grid.m_blocksPerGrid >= props.multiProcessorCount || // enough output elements to fill all multiprocs + reductionDim * numElements <= 2 * props.warpSize || // trivial operation not worth the trouble (2* because the more complex one also needs 2 kernel launches) + disableParallelReduction || // (for debugging) + reductionDim * numElements <= props.multiProcessorCount) // recursive call from reduction below + { + // we got enough elements to generate: do one element per thread, and reduction inside + //fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.1, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm())); + + _launchTensorOp<<>>( + beta, pointers, alpha, op, reductionOp, + regularOpStrides, regularStrides, grid.m_N, + reducingOpDims, reducingStrides, + regularOpStrideDivmod, reducingOpDimDivmod); + //fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.2, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm())); + + } + // === optimization: simple case would not use all multiprocs + else + { + // m_blocksPerGrid can be thought of NN / 512, with appropriate rounding + + // we are reducing and are underutilizing the multiprocs we have: get more parallelism by doing reduction in parallel + // If we get here, then + // - the total number of outputs to produce is < #multiprocs * warpSize, e.g. < 960 + // - each output has at least two inputs, but possibly millions + // Examples: + // (a1) NN=900 + // - each multiproc processes multiple elements concurrently, each reducing over its inputs inside + // - use one block per output element + // (a2) NN=30 + // - same as (a1) except 30 multiprocs run only a single block each + // (a3) NN=16 + // - same as (a1) except only 16 multiproc run one block + // (b1) NN=15 + // - 2 blocks work together on a single output element + // (b2) NN=1 (NN < #multiprocs, e.g. NN < 30) + // - multiple blocks work together on a single output element + // - only this case requires memory, and only K * NN + // where K = blocks that work together, + // both K and NN < #multiprocs, + // and K * NN = on the order of NN, but generally a bit larger due to rounding. + + // By how much do we underutilize? + // We increase #blocks by that factor by breaking reduction into that many chunks. 
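+ // (Worked example, illustrative numbers only: with 30 multiprocessors and NN = 6 output elements, numReductionChunks = max(30 / 6, 1) = 5 below, so 5 blocks cooperate on each output element and each block reduces a chunk of roughly reductionDim / 5 inputs.)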
+ //fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.3, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm())); + + int numReductionChunks = std::max(props.multiProcessorCount / NN, 1); // only >1 for NN < multiProcessorCount + + // distribute NN over block X and Y + int blockXOverBy = CeilDiv(NN, props.maxGridSize[0]); + int numBlocksX = CeilDiv(NN, blockXOverBy); + int numBlocksY = CeilDiv(NN, numBlocksX); + // while block Z is for multiple blocks working together on a single output element + int numBlocksZ = numReductionChunks; + // Block dim is now: + // - X, Y: such that X*Y covers NN + // - Z: reduction chunks + + // reduction goes into thread dim X + int reductionChunkSize = CeilDiv(reductionDim, numReductionChunks); + int numThreadsX = std::min(reductionChunkSize, GridDim::maxThreadsPerBlock); // any that's over will be done by looping inside the kernel + + // --- cases (a1) and (a2) + // This involves no reduction across blocks. + //fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.4, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm())); + + if (numReductionChunks == 1) + { + //fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.5, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm())); + + _launchTensorOpWithReduction<<>>( + beta, pointers, alpha, op, reductionOp, + regularOpStrides, regularStrides, NN, + reducingOpDims, reducingStrides, /*reductionBegin*/ 0, reductionChunkSize, + regularOpStrideDivmod, reducingOpDimDivmod); + //fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.6, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm())); + + } + // --- case (b) + // Reduction across blocks. This is the difficult one. +#ifndef ALLOW_ATOMIC_REDUCTION // temporarily disabled to ensure it is not causing the non-reproducibility + else + { + //fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.7, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm())); + + // we get here if NN <= #multiprocs + assert(NN <= props.multiProcessorCount && numBlocksX == NN && numBlocksY == 1); + // dims are: + // - numBlocksZ = numReductionChunks = how many multiprocs work together to produce one output element + // - numBlocksX = NN = number of output elements + // - numThreadsX = reductionChunkSize clipped to 512; reductionChunkSize > 512 is handled by an inner for loop inside of the kernel + + // we need memory for block outputs of dimension [numBlocksX x numBlocksZ] + // - total elements = NN * Floor(#multiprocs / NN), which is <= #multiprocs + let reductionBufferSize = props.multiProcessorCount; + assert(reductionBufferSize >= NN * numBlocksZ); + shared_ptr reductionBuffer = GetReductionBufferNoCache(reductionBufferSize); + //fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.8, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm())); + + // 'pointers', 'regularOpStrides', and 'regularStrides' are set up to point to the target memory. + // We need to reroute them to point to our reductionBuffer.
+ // - pointer[N-1] -> replace by reductionBuffer + // - regularStrides -> replace [N-1] by regularOpStrides which already represent the NN elements for a dense memory layout + // - beta -> 0 since we write into temp memory + // - kernel must use block.z as second index into the output buffer; add (block.z * NN) to the pointer + FixedArray pointers1 = pointers; + pointers1[N - 1] = reductionBuffer.get(); + auto regularStrideVectors1 = regularStrideVectors; + for (size_t k = 0; k < regularOpStrides.size(); k++) + regularStrideVectors1[N - 1][k] = (ptrdiff_t) regularOpStrideVector[k]; + FixedMatrix regularStrides1(regularStrideVectors1); + ElemType beta1 = 0; + ElemType alpha1 = 1; + // fprintf(stderr, "LaunchTensorOpWithReductionDebug 2.9, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm())); + + _launchTensorOpWithReduction<<>>( + beta1, pointers1, alpha1, op, reductionOp, + regularOpStrides, regularStrides1, NN, + reducingOpDims, reducingStrides, /*reductionBegin*/ 0, reductionChunkSize, + regularOpStrideDivmod, reducingOpDimDivmod); + //fprintf(stderr, "LaunchTensorOpWithReductionDebug 3, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm())); + + +#if 1 + // now reduce and redistribute + // Create a new tensor task, and execute it recursively: + // - input = reductionBuffer + // - output = true output + // - op dims/strides = output elements + // - reduce dims/strides = numBlocksZ + // - op = opCopy + array pointerVector2{reductionBuffer.get(), pointerVector[N - 1]}; + const array, 2> regularStrideVectors2{regularStrideVectors1[N - 1], regularStrideVectors[N - 1]}; + const array, 2> reducingStrideVectors2{SmallVector{NN}, SmallVector{0}}; + const SmallVector reducingOpDimVector2{(size_t) numReductionChunks}; + //fprintf(stderr, "LaunchTensorOpWithReductionDebug 3.1, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm())); + + LaunchTensorOpWithReduction( + beta, pointerVector2, alpha, ElementWiseOperator::opCopy, reductionOp, + regularOpDims, regularStrideVectors2, + reducingOpDimVector2, reducingStrideVectors2); + //fprintf(stderr, "LaunchTensorOpWithReductionDebug 3.2, a.data = %f, result.data = %f \n", double(a.FrobeniusNorm()), double(result.FrobeniusNorm())); + + // (note: ^^this will have a nested syncGuard, which is fine) + +#else + _launchTensorOp<<>>( + beta, pointers, alpha, op, reductionOp, + regularOpStrides, regularStrides, grid.m_N, + reducingOpDims, reducingStrides); + //for (size_t z = 0; z < numBlocksZ; z++) + // _launchTensorOpWithReduction<<>>(z == 0 ? 
beta : 1, pointers, alpha, op, + // regularOpStrides, regularStrides, NN, + // reducingOpDims, reducingStrides, reductionChunkSize * z, reductionChunkSize); + vector peekPartial(NN * numBlocksZ, -42); + vector peekFinal(NN, -42); + CUDA_CALL(cudaMemcpy(peekPartial.data(), reductionBuffer, sizeof(ElemType) * peekPartial.size(), cudaMemcpyDeviceToHost)); + CUDA_CALL(cudaMemcpy(peekFinal.data(), pointers[pointers.size() - 1], sizeof(ElemType) * peekFinal.size(), cudaMemcpyDeviceToHost)); + double s1 = 0, s2 = 0; + for (auto v : peekPartial) + s1 += v; + for (auto v : peekFinal) + s2 += v; + sin(1.0); +#endif + } +#else + else if (beta == 1) + { + // no need to pre-scale; just add (common for gradients) + _launchTensorOpWithReduction<<>>(beta, pointers, alpha, op, reductionOp, regularOpStrides, + regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize, + regularOpStrideDivmod, reducingOpDimDivmod); + return; + } + else + { + // We need more than one chunk, we will use atomicAdd(). + // First reset/pre-multiply input; then do the remaining chunks using atomicAdd(). + _launchTensorOpWithReduction<<>>(beta, pointers, alpha, op, reductionOp, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize, + regularOpStrideDivmod, reducingOpDimDivmod); + // We will leave it like this for a while, but eventually need to revisit using temporary memory. + _launchTensorOpWithReduction<<>>(/*beta=*/1, pointers, alpha, op, reductionOp, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, reductionChunkSize, reductionChunkSize, + regularOpStrideDivmod, reducingOpDimDivmod); + } +#endif + } +} // ----------------------------------------------------------------------- // kernel and launch --linear unary // ----------------------------------------------------------------------- @@ -1205,6 +1477,24 @@ static void TensorOpWithRegularLoop(ElemType beta, const array& po } } +template +static void TensorOpWithRegularLoopDebug(ElemType beta, const array& pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const SmallVector& regularOpDims, const array, N>& regularStrides, + const SmallVector& reducingOpDims, const array, N>& reducingStrides) +{ + size_t dims = reducingOpDims.size(); + switch (dims) + { + case 2: + return LaunchTensorOpWithReduction(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 1: + return LaunchTensorOpWithReduction(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + case 0: + return LaunchTensorOp(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides); + default: + LogicError("TensorOp: %d non-flattened reduction dimensions are not supported.", (C_int) dims); + } +} // tensor operation, generalized in number of arguments // This function now expands into different k. It also eliminates the offsets by adding them to the pointers. 
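// (For example, with N = 2 the offset elimination below unrolls to pointers[0] += offsets[0]; pointers[1] += offsets[1];, after which the launch helpers can treat every pointer as zero-based.)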
template @@ -1236,6 +1526,17 @@ void TensorOpN(ElemType beta, array pointers, ElemType alpha, Elem } } +template +void TensorOpNDebug(ElemType beta, array pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const array& offsets, + const SmallVector& regularOpDims, const array, N>& regularStrides, + const SmallVector& reducingOpDims, const array, N>& reducingStrides, const GPUMatrix& a, GPUMatrix& result) +{ + for (C_size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled + pointers[i] += offsets[i]; + return LaunchTensorOpWithReductionDebug(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides, a, result); +} + //------------------------------------------------------------------------ // explicit instantiations--these are being called from GPUMatrix.cu //------------------------------------------------------------------------ @@ -1277,6 +1578,21 @@ template void TensorOpN(half beta, array pointers, half alpha const SmallVector& regularOpDims, const array, 4>& regularStrides, const SmallVector& reducingOpDims, const array, 4>& reducingStrides); +template void TensorOpNDebug(half beta, array pointers, half alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const array& offsets, + const SmallVector& regularOpDims, const array, 2>& regularStrides, + const SmallVector& reducingOpDims, const array, 2>& reducingStrides, const GPUMatrix& a, GPUMatrix& result); + +template void TensorOpNDebug(double beta, array pointers, double alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const array& offsets, + const SmallVector& regularOpDims, const array, 2>& regularStrides, + const SmallVector& reducingOpDims, const array, 2>& reducingStrides, const GPUMatrix& a, GPUMatrix& result); + +template void TensorOpNDebug(float beta, array pointers, float alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const array& offsets, + const SmallVector& regularOpDims, const array, 2>& regularStrides, + const SmallVector& reducingOpDims, const array, 2>& reducingStrides, const GPUMatrix& a, GPUMatrix& result); + template void LaunchUnaryTensorOp(float beta, const float* pa, float* pb, float alpha, ElementWiseOperator op, size_t regularOpDim); template void LaunchUnaryTensorOp(double beta, const double* pa, double* pb, double alpha, ElementWiseOperator op, size_t regularOpDim); diff --git a/Source/Math/GPUTensor.h b/Source/Math/GPUTensor.h index dfe0b6ad4..b66559b7b 100644 --- a/Source/Math/GPUTensor.h +++ b/Source/Math/GPUTensor.h @@ -23,6 +23,12 @@ void TensorOpN(ElemType beta, array pointers, ElemType alpha, Elem const SmallVector& regularOpDims, const array, N>& regularStrides, const SmallVector& reducingOpDims, const array, N>& reducingStrides); +template +void TensorOpNDebug(ElemType beta, array pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const array& offsets, + const SmallVector& regularOpDims, const array, N>& regularStrides, + const SmallVector& reducingOpDims, const array, N>& reducingStrides, const GPUMatrix& a, GPUMatrix& result); + template void LaunchUnaryTensorOp(ElemType beta, const ElemType* pa, ElemType* pb, ElemType alpha, ElementWiseOperator op, size_t regularOpDim); diff --git a/Source/Math/Matrix.cpp b/Source/Math/Matrix.cpp index 17df4cf4c..0b27957b4 100755 --- a/Source/Math/Matrix.cpp +++ b/Source/Math/Matrix.cpp @@ -3669,6 +3669,21 @@ Matrix& Matrix::SetToZeroIfAbsLessThan(const ElemType thresh return *this; } 
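+// SetToZeroIfLessThan is the one-sided counterpart of SetToZeroIfAbsLessThan: it zeroes, in place, every entry strictly below 'threshold', dispatching to the CPU, dense-GPU, or sparse-GPU implementation.
+// Usage sketch (illustrative only, assuming an already populated Matrix m): m.SetToZeroIfLessThan(0); // ReLU-style clamp of all negative entries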
+template +Matrix& Matrix::SetToZeroIfLessThan(const ElemType threshold) +{ + if (IsEmpty()) + LogicError("SetToZeroIfLessThan: Matrix is empty."); + + DISPATCH_MATRIX_ON_FLAG(this, + this, + m_CPUMatrix->SetToZeroIfLessThan(threshold), + m_GPUMatrix->SetToZeroIfLessThan(threshold), + NOT_IMPLEMENTED, + m_GPUSparseMatrix->SetToZeroIfLessThan(threshold)); + + return *this; +} //sum of all elements template ElemType Matrix::SumOfElements() const @@ -6399,6 +6414,23 @@ void Matrix::TensorOp(ElemType beta, const Matrix& a, ElemTy NOT_IMPLEMENTED, NOT_IMPLEMENTED); } +template +void Matrix::TensorOpDebug(ElemType beta, const Matrix& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const array& offsets, + const SmallVector& regularOpDims, const array, 2>& regularStrides, + const SmallVector& reducingOpDims, const array, 2>& reducingStrides) +{ + VerifyIsDense(*this) && VerifyIsDense(a); + + DecideAndMoveToRightDevice(*this, a); + + DISPATCH_MATRIX_ON_FLAG(this, + this, + m_CPUMatrix->TensorOp(beta, *a.m_CPUMatrix, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides), + m_GPUMatrix->TensorOpDebug(beta, *a.m_GPUMatrix, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides), + NOT_IMPLEMENTED, + NOT_IMPLEMENTED); +} template void Matrix::TensorOp(ElemType beta, const Matrix& a, const Matrix& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h index 241be55d2..529a91c52 100755 --- a/Source/Math/Matrix.h +++ b/Source/Math/Matrix.h @@ -474,6 +474,8 @@ public: void InplaceTranspose(); Matrix& SetToZeroIfAbsLessThan(const ElemType threshold); + Matrix& SetToZeroIfLessThan(const ElemType threshold); + DeviceBoundNumber Sum_AsDeviceBoundNum() const; ElemType SumOfAbsElements() const; // sum of all abs(elements) @@ -658,6 +660,11 @@ public: const std::array& offsets, const SmallVector& regularOpDims, const std::array, 2>& regularStrides, const SmallVector& reducingOpDims, const std::array, 2>& reducingStrides); + void TensorOpDebug(ElemType beta, const Matrix& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, + const std::array& offsets, + const SmallVector& regularOpDims, const std::array, 2>& regularStrides, + const SmallVector& reducingOpDims, const std::array, 2>& reducingStrides); + void TensorOp(ElemType beta, const Matrix& a, const Matrix& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp, const std::array& offsets, const SmallVector& regularOpDims, const std::array, 3>& regularStrides, diff --git a/Source/Math/NoGPU.cpp b/Source/Math/NoGPU.cpp index 995932e34..720343e43 100755 --- a/Source/Math/NoGPU.cpp +++ b/Source/Math/NoGPU.cpp @@ -709,6 +709,11 @@ GPUSparseMatrix& GPUSparseMatrix::SetToZeroIfAbsLessThan(con return *this; } +template +GPUSparseMatrix& GPUSparseMatrix::SetToZeroIfLessThan(const ElemType threshold) +{ + return *this; +} template GPUSparseMatrix& GPUSparseMatrix::InplaceSoftThreshold(const ElemType threshold) { @@ -1723,7 +1728,11 @@ GPUMatrix& GPUMatrix::SetToZeroIfAbsLessThan(const ElemType { return *this; } - +template +GPUMatrix& GPUMatrix::SetToZeroIfLessThan(const ElemType threshold) +{ + return *this; +} template ElemType GPUMatrix::SumOfAbsElements() const { diff --git a/Source/Math/TensorView.cpp b/Source/Math/TensorView.cpp index 42175e8e0..fefd95945 100644 --- a/Source/Math/TensorView.cpp +++ b/Source/Math/TensorView.cpp @@ 
-217,7 +217,7 @@ static void PrepareTensorOperands(array shapes, array } for (size_t i = 0; i < N; i++) - offsets[i] = shapes[i].GetOffset(); + offsets[i] = shapes[i].GetOffset(); } // enforce that in case of broadcasting, the output must not be an input @@ -249,6 +249,47 @@ void TensorView::DoUnaryOpOf(ElemType beta, const TensorView& a, ElemT GetSOB().TensorOp(beta, a.GetSOB(), alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); } +template +void TensorView::DoUnaryOpOfDebug(ElemType beta, const TensorView& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp) +{ + // static int cc = 0; if (cc++ == 0) + // fprintf(stderr, "Tensor Op: Op %d: %s -> %s\n", (int)op, string(a.GetShape()).c_str(), string(GetShape()).c_str()); + + // prepare all tensor descriptor information as needed for execution + array offsets; + array, 2> regularStrides, reducingStrides; + SmallVector regularOpDims, reducingOpDims; + PrepareTensorOperands(array{a.GetShape(), GetShape()}, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); + + // output cannot be input when reducing + if (reducingOpDims.size() > 0) + CheckDifferentObject(a, *this); + /* + for (size_t i = 0; i < 2; i++) + { + // fprintf(stderr, "i = %d, offsets = %d, regularStrides = %d, reducingStrides = %d \n ", int(i), int(offsets[i]), int(regularStrides[i]), int(reducingStrides[i])); + fprintf(stderr, "i = %d, offsets = %d\n ", int(i), int(offsets[i])); + } + for (size_t i = 0; i < regularOpDims.size(); i++) + fprintf(stderr, "i = %d, regularOpDims = %d\n ", int(i), int(regularOpDims[i])); + for (size_t i = 0; i < reducingOpDims.size(); i++) + fprintf(stderr, "i = %d, reducingOpDims = %d\n ", int(i), int(reducingOpDims[i])); + + for (size_t i = 0; i < 2; i++) + { + for (size_t j = 0; j < regularStrides[i].size(); j++) + { + fprintf(stderr, "i = %d, j = %d, regularStrides = %d \n ", int(i), int(j), int(regularStrides[i][j])); + } + for (size_t j = 0; j < reducingStrides[i].size(); j++) + { + fprintf(stderr, "i = %d, j = %d, reducingStrides = %d \n ", int(i), int(j), int(reducingStrides[i][j])); + } + } + */ + // now perform the operation + GetSOB().TensorOpDebug(beta, a.GetSOB(), alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides); +} template void TensorView::DoBinaryOpOf(ElemType beta, const TensorView& a, const TensorView& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp) { diff --git a/Source/Math/TensorView.h b/Source/Math/TensorView.h index a462fce7e..2cd76ba53 100644 --- a/Source/Math/TensorView.h +++ b/Source/Math/TensorView.h @@ -131,6 +131,7 @@ public: #pragma pop_macro("DeclareTernaryTensorOp") void DoUnaryOpOf (ElemType beta, const TensorView& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp); + void DoUnaryOpOfDebug(ElemType beta, const TensorView& a, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp); void DoBinaryOpOf (ElemType beta, const TensorView& a, const TensorView& b, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp); void DoTernaryOpOf(ElemType beta, const TensorView& a, const TensorView& b, const TensorView& c, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp); diff --git a/Source/SGDLib/SGD.cpp b/Source/SGDLib/SGD.cpp index ef06834df..ae3a28a84 100644 --- a/Source/SGDLib/SGD.cpp +++ b/Source/SGDLib/SGD.cpp @@ -1051,10 +1051,18 @@ size_t SGD::TrainOneEpoch(ComputationNetworkPtr 
net, string showWERMode, bool SVD) { - PROFILE_SCOPE(profilerEvtMainEpoch); + PROFILE_SCOPE(profilerEvtMainEpoch); + std::vector decodeOutputNodeNames(outputNodeNamesVector.begin() + 1, outputNodeNamesVector.begin() + 2); ScopedNetworkOperationMode modeGuard(net, NetworkOperationMode::training); + ComputationNetwork decode_cn_root; + decode_cn_root.CopySubTree(*net, decodeOutputNodeNames[0], L"", CopyNodeFlags::copyNodeAll); + decode_cn_root.CompileNetwork(); + std::vector decodeOutputNodesTmp = decode_cn_root.OutputNodesByName(decodeOutputNodeNames); + decode_cn_root.FormEvalOrder(decodeOutputNodesTmp[0]); + decode_cn_root.FormNestedNetwork(decodeOutputNodesTmp[0]); + // bring our 'out' values into consistent state epochCriterion = EpochCriterion(0); epochEvalErrors.assign(epochEvalErrors.size(), EpochCriterion(0)); @@ -1078,7 +1086,6 @@ size_t SGD::TrainOneEpoch(ComputationNetworkPtr net, auto ContainsAccumulatedResult = [&evaluationNodesWhichAccumulateResult](ComputationNodeBasePtr node) { return evaluationNodesWhichAccumulateResult.find(node) != evaluationNodesWhichAccumulateResult.end(); }; - // MA-related variables size_t nSamplesSinceLastModelSync = 0; size_t blockSizePerWorker = 0; @@ -1122,7 +1129,6 @@ size_t SGD::TrainOneEpoch(ComputationNetworkPtr net, // this is non-trivial, we need a manager object to handle this if (numSubminibatchesNeeded > 1) smbDispatcher.Init(net, learnableNodes, criterionNodes, evaluationNodes); - // The following is a special feature only supported by the Kaldi2Reader for more efficient sequence training. // This attempts to compute the error signal for the whole utterance, which will // be fed to the neural network as features. Currently it is a workaround @@ -1241,6 +1247,7 @@ size_t SGD::TrainOneEpoch(ComputationNetworkPtr net, numSamplesWithLabelOfNetworkMBR = 0; auto profGetMinibatch = ProfilerTimeBegin(); + bool wasDataRead = DataReaderHelpers::GetMinibatchIntoNetwork(*trainSetDataReader, net, criterionNodes[0], useDistributedMBReading, useParallelTrain, *inputMatrices, actualMBSize, m_mpi); @@ -1321,12 +1328,10 @@ size_t SGD::TrainOneEpoch(ComputationNetworkPtr net, //net->CompileNetwork(); std::vector encodeOutputNodeNames(outputNodeNamesVector.begin(), outputNodeNamesVector.begin() + 1); std::vector encodeOutputNodes = net->OutputNodesByName(encodeOutputNodeNames); - // //net->CollectInputAndLearnableParameters(encodeOutputNodes[0]); std::list InputNodesList = net->InputNodes(criterionNodes[0]); std::vector encodeInputNodeNames; - if (SVD) encodeInputNodeNames.assign(outputNodeNamesVector.begin() + 7, outputNodeNamesVector.begin() + 8); else @@ -1336,7 +1341,6 @@ size_t SGD::TrainOneEpoch(ComputationNetworkPtr net, *encodeInputMatrices = DataReaderHelpersFunctions::RetrieveInputMatrices(encodeInputNodes); //get decode input matrix - std::vector decodeOutputNodeNames(outputNodeNamesVector.begin() + 1, outputNodeNamesVector.begin() + 2); std::vector decodeOutputNodes = net->OutputNodesByName(decodeOutputNodeNames); //net->CollectInputAndLearnableParameters(decodeOutputNodes[0]); @@ -1346,6 +1350,7 @@ size_t SGD::TrainOneEpoch(ComputationNetworkPtr net, else decodeInputNodeNames.assign(outputNodeNamesVector.begin() + 7, outputNodeNamesVector.begin() + 8); std::vector decodeinputNodes = net->OutputNodesByName(decodeInputNodeNames); + *decodeinputMatrices = DataReaderHelpersFunctions::RetrieveInputMatrices(decodeinputNodes); if (!ordered) @@ -1359,7 +1364,6 @@ size_t SGD::TrainOneEpoch(ComputationNetworkPtr net, std::vector Plustransnodes = 
@@ -1359,7 +1364,6 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
             std::vector<ComputationNodeBasePtr> Plustransnodes = net->OutputNodesByName(plusTransNodeNames);
             net->FormEvalOrder(Plustransnodes[0]);
         }
-
         //form eval order for RELU

         auto reffeainput = (*encodeInputMatrices).begin();
@@ -1367,9 +1371,13 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
         auto reflminput = (*decodeinputMatrices).begin();
         auto decodeMBLayout = reflminput->second.pMBLayout;

-        net->ForwardProp(encodeOutputNodes);
+        //form eval order for RELU
+        time_t my_time = time(NULL);
+        fprintf(stderr, "SGD time 0 = %s", ctime(&my_time));

-        Matrix<ElemType> encodeOutput(net->GetDeviceId());
+        net->ForwardProp(encodeOutputNodes);
+        size_t deviceid = net->GetDeviceId();
+        Matrix<ElemType> encodeOutput(deviceid);
         encodeOutput.SetValue(*(&dynamic_pointer_cast<ComputationNode<ElemType>>(encodeOutputNodes[0])->Value()));

         vector<vector<PathInfo>> uttPathsInfo;
@@ -1379,12 +1387,50 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
         vector<float> vt_onebest_wer;
         vt_onebest_wer.clear();
-        // time_t my_time = time(NULL);
-        // fprintf(stderr, "SGD time 1 = %s", ctime(&my_time));
+        my_time = time(NULL);
+        fprintf(stderr, "SGD time 1 = %s", ctime(&my_time));

         RNNTDecodeFunctions<ElemType> rnntdfs;
-        rnntdfs.RNNT_decode_nbest_MBR(outputNodeNamesVector, encodeOutput, encodeMBLayout, reflminput->second.GetMatrix(), decodeMBLayout, decodeinputNodes, numBestMBR, lengthNorm, vt_labels, uttPathsInfo, vt_nws, vt_onebest_wer, SVD, *net);
-        //my_time = time(NULL);
-        //fprintf(stderr, "SGD time 2 = %s", ctime(&my_time));
+        //rnntdfs.RNNT_decode_nbest_MBR(outputNodeNamesVector, encodeOutput, encodeMBLayout, reflminput->second.GetMatrix(), decodeMBLayout, decodeinputNodes, numBestMBR, lengthNorm, vt_labels, uttPathsInfo, vt_nws, vt_onebest_wer, SVD, *net);
+
+        //vt_printname.push_back(L"DecodeOutputLN");
+
+        if (m_enableMultiThreadDecodeMBR)
+        {
+            ComputationNodeBasePtr WmNode, WmuNode, WmvNode, bmNode;
+            Matrix<ElemType> Wm(deviceid), Wmu(deviceid), Wmv(deviceid), bm(deviceid);
+            if (SVD)
+            {
+                WmuNode = net->GetNodeFromName(outputNodeNamesVector[4]);
+                WmvNode = net->GetNodeFromName(outputNodeNamesVector[5]);
+                bmNode = net->GetNodeFromName(outputNodeNamesVector[6]);
+                Wmu.SetValue(*(&dynamic_pointer_cast<ComputationNode<ElemType>>(WmuNode)->Value()));
+                Wmv.SetValue(*(&dynamic_pointer_cast<ComputationNode<ElemType>>(WmvNode)->Value()));
+                WmNode;
+                Wm;
+            }
+            else
+            {
+                WmNode = net->GetNodeFromName(outputNodeNamesVector[4]);
+                bmNode = net->GetNodeFromName(outputNodeNamesVector[5]);
+                Wm.SetValue(*(&dynamic_pointer_cast<ComputationNode<ElemType>>(WmNode)->Value()));
+                WmuNode;
+                WmvNode;
+                Wmu;
+                Wmv;
+            }
+
+            bm.SetValue(*(&dynamic_pointer_cast<ComputationNode<ElemType>>(bmNode)->Value()));
+            /*
+            size_t num_utt = 7;
+            size_t start_utt = 0;
+            */
+            rnntdfs.RNNT_decode_nbest_MBR_Multithread(outputNodeNamesVector, encodeOutput, encodeMBLayout, reflminput->second.GetMatrix(), decodeMBLayout, decodeInputNodeNames, numBestMBR, lengthNorm, vt_labels, uttPathsInfo, vt_nws, vt_onebest_wer, SVD, decode_cn_root, Wm, Wmu, Wmv, bm); /*, num_utt, start_utt); */
+        }
+        else
+            rnntdfs.RNNT_decode_nbest_MBR(outputNodeNamesVector, encodeOutput, encodeMBLayout, reflminput->second.GetMatrix(), decodeMBLayout, decodeinputNodes, numBestMBR, lengthNorm, vt_labels, uttPathsInfo, vt_nws, vt_onebest_wer, SVD, net);
+        // rnntdfs.RNNT_decode_nbest_MBR_Multithread(outputNodeNamesVector, encodeOutput, encodeMBLayout, reflminput->second.GetMatrix(), decodeMBLayout, decodeinputNodes, numBestMBR, lengthNorm, vt_labels, uttPathsInfo, vt_nws, vt_onebest_wer, SVD, *net, decode_cn, decodeinputNodes_tmp);
+        my_time = time(NULL);
+        fprintf(stderr, "SGD time 2 = %s", ctime(&my_time));

         //fprintf(stderr, "decode SGD v0 .\n");
         //net->BumpEvalTimeStamp(decodeinputNodes);
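The SVD branch above exists because the joint-network projection can ship either as one dense matrix Wm or, after low-rank SVD compression, as the factor pair Wmu and Wmv, in which case the decoder applies two thin products instead of one wide one. A sketch of the intended arithmetic; it assumes Matrix::AssignProductOf and operator+= as declared in Source/Math/Matrix.h, and h, proj, and the function itself are illustrative:

    // Project a decoder hidden vector h through the joint network's output
    // layer: proj = Wm*h + bm, or proj = Wmu*(Wmv*h) + bm in the SVD case.
    template <class ElemType>
    void ProjectJoint(bool SVD, int deviceid,
                      const Matrix<ElemType>& Wm, const Matrix<ElemType>& Wmu,
                      const Matrix<ElemType>& Wmv, const Matrix<ElemType>& bm,
                      const Matrix<ElemType>& h, Matrix<ElemType>& proj)
    {
        if (SVD)
        {
            Matrix<ElemType> tmp(deviceid);
            tmp.AssignProductOf(Wmv, false, h, false);    // [r x 1], r = SVD rank
            proj.AssignProductOf(Wmu, false, tmp, false); // [out x 1]
        }
        else
            proj.AssignProductOf(Wm, false, h, false);    // [out x 1]
        proj += bm; // bias added before the softmax over output labels
    }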
@@ -1400,21 +1446,11 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
         refFeaMatBackup.SetValue(reffeainput->second.GetMatrix());

         size_t numParallelSequences = encodeMBLayout->GetNumParallelSequences();
-        //my_time = time(NULL);
-        //fprintf(stderr, "SGD time 3 = %s", ctime(&my_time));
+        my_time = time(NULL);
+        fprintf(stderr, "SGD time 3 = %s", ctime(&my_time));
         for (const auto& seq : encodeMBLayout->GetAllSequences())
         {
-            /*
-            if (seqId == 1)
-            {
-                for (auto nodeIter = learnableNodes.begin(); nodeIter != learnableNodes.end(); nodeIter++)
-                {
-                    ComputationNodePtr node = dynamic_pointer_cast<ComputationNode<ElemType>>(*nodeIter);
-                    node->force_gradient_accumulate(true);
-                }
-            }
-            */
             if (seq.seqId == GAP_SEQUENCE_ID)
             {
                 continue;
@@ -1423,15 +1459,12 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
             {
                 continue;
             }
-
-            //if (firstdebug)
             cNode->SetMWERInfo(uttPathsInfo[seqId], lengthNorm, wordPathPosteriorFromDecodeMBR, doMBR, vt_nws[seqId]);

             // get the feature MBLayout
             size_t numFrames = seq.GetNumTimeSteps();

             numSamplesWithLabelOfNetworkMBR += numFrames;
-            // if (firstdebug)
             reffeainput->second.pMBLayout->Init(1, numFrames); // 1 channel, 1 utterance

             Matrix<ElemType> fea(deviceID);
@@ -1447,11 +1480,9 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
                 fea.SetColumn(refFeaMatBackup.ColumnSlice(uID, 1), t);
             }

-            //if (firstdebug)
-            {
-                reffeainput->second.GetMatrix().SetValue(fea);
-                reffeainput->second.pMBLayout->AddSequence(0, 0, 0, numFrames); // guoye: first 0 is for utterance ID, second 0 means 0th channel, lenght is 0 to numFrames
-            }
+
+            reffeainput->second.GetMatrix().SetValue(fea);
+            reffeainput->second.pMBLayout->AddSequence(0, 0, 0, numFrames); // guoye: first 0 is the utterance ID, second 0 means 0th channel, and the sequence runs from frame 0 to numFrames

             // guoye: the two commands below reset the state, to make sure ForwardProp always gets carried out
             ComputationNetwork::BumpEvalTimeStamp(encodeInputNodes); // guoye: update the time stamp before doing forward prop
@@ -1463,9 +1494,9 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
             size_t nBest = uttPathsInfo[seqId].size();
             if (nBest > (m_maxFrameNumPerMinibatchMBR / numFrames))
             {
-                // reset nBest to make the MB size framenum with budget
+                // reset nBest to keep the minibatch frame count within budget
                 nBest = (m_maxFrameNumPerMinibatchMBR / numFrames);
-            }
+            }

             size_t maxPhoneSeqLen = uttPathsInfo[seqId][0].label_seq.size();
             for (size_t n = 1; n < nBest; n++)
@@ -1583,8 +1614,8 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
             }
         }

-        //my_time = time(NULL);
-        //fprintf(stderr, "SGD time 4 = %s", ctime(&my_time));
+        my_time = time(NULL);
+        fprintf(stderr, "SGD time 4 = %s", ctime(&my_time));
     }

     // ===========================================================
     // forward prop for evaluate eval nodes
@@ -1790,8 +1821,7 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
     {
 #ifdef _DEBUG
         if (smoothedGradientIter->HasNan("TrainOneEpoch/UpdateWeights(): "))
-            LogicError("%ls %ls operation has NaNs in smoothedGradient.",
-                       node->NodeName().c_str(), node->OperationName().c_str());
+            LogicError("%ls %ls operation has NaNs in smoothedGradient.", node->NodeName().c_str(), node->OperationName().c_str());
 #endif
         double nodeDependentLearningRatePerSample = learnRatePerSample * node->GetLearningRateMultiplier();
         double nodeDependentRegMultiplier = dynamic_pointer_cast<ComputationNode<ElemType>>(node)->GetRegMultiplier();
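The nBest cap in hunk @@ -1463,9 +1494,9 @@ above keeps nBest * numFrames under m_maxFrameNumPerMinibatchMBR, since the per-utterance MBR minibatch evidently stacks one copy of the utterance's frames per surviving hypothesis. Worked numbers with the default budget of 2000 frames (the concrete values below are illustrative):

    // Integer-division cap: a 500-frame utterance admits at most 2000/500 = 4
    // hypotheses; a 1200-frame utterance admits only 2000/1200 = 1.
    size_t maxFrameBudget = 2000; // m_maxFrameNumPerMinibatchMBR default
    size_t numFrames = 500;       // frames in the current utterance
    size_t nBest = 16;            // hypotheses returned by the decoder
    if (nBest > maxFrameBudget / numFrames)
        nBest = maxFrameBudget / numFrames; // -> 4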
@@ -3460,7 +3490,7 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
     m_lengthNorm = configSGD(L"LengthNorm", true);
     m_showWERMode = configSGD(L"showWERMode", "average");
     m_isSVD = configSGD(L"SVD", true);
-
+    m_enableMultiThreadDecodeMBR = configSGD(L"enableMultiThreadDecodeMBR", true);
     m_maxFrameNumPerMinibatchMBR = configSGD(L"MaxFrameNumPerMinibatchMBR", (size_t) 2000);
     if (m_doGradientCheck && sizeofElemType != sizeof(double))
     {
diff --git a/Source/SGDLib/SGD.h b/Source/SGDLib/SGD.h
index 8db27b79e..106d06b2e 100644
--- a/Source/SGDLib/SGD.h
+++ b/Source/SGDLib/SGD.h
@@ -292,6 +292,7 @@ protected:
     string m_showWERMode;
     bool m_isSVD;
     size_t m_maxFrameNumPerMinibatchMBR;
+    bool m_enableMultiThreadDecodeMBR;

     // Parallel training
     MPIWrapperPtr m_mpi;
diff --git a/Source/SGDLib/SimpleOutputWriter.h b/Source/SGDLib/SimpleOutputWriter.h
index a008636c7..0926a9eeb 100644
--- a/Source/SGDLib/SimpleOutputWriter.h
+++ b/Source/SGDLib/SimpleOutputWriter.h
@@ -421,7 +421,7 @@ public:
         Matrix<ElemType> encodeOutput(deviceid);
         Matrix<ElemType> decodeOutput(deviceid);
         Matrix<ElemType> greedyOutput(deviceid), greedyOutputMax(deviceid);
-        Matrix<ElemType> sumofENandDE(deviceid), maxIdx(deviceid), maxVal(deviceid);
+        Matrix<ElemType> maxIdx(deviceid), maxVal(deviceid);
         Matrix<ElemType> lmin(deviceid);
         MatrixPool m_matrixPool;
         m_matrixPool.OptimizedMemoryAllocation();
@@ -472,35 +472,7 @@ public:
             CurSequences = nextSequences;
             vector<typename RNNTDecodeFunctions<ElemType>::Sequence>().swap(nextSequences);

-            //deal with the same prefix
-            /*sort(CurSequences.begin(), CurSequences.end(),
-                  [](const Sequence& a, const Sequence& b) -> bool {
-                      return a.labelseq.size() > b.labelseq.size();
-                  });
-            for (size_t n = 0; n < CurSequences.size() - 1; n++)
-            {
-                for (size_t h = n + 1; h < CurSequences.size(); h++)
-                {
-                    if (isPrefix(CurSequences[h], CurSequences[n]))
-                    {
-                        //forward_prop the prefix
-                        forward_decode(CurSequences[h], decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, CurSequences[h].labelseq.size());
-                        forwardmerged(CurSequences[h], t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes);
-
-                        size_t idx = CurSequences[h].labelseq.size();
-                        ElemType curlogp = CurSequences[h].logP + decodeOutput(CurSequences[n].labelseq[idx], 0);
-                        for (size_t k = idx; k < CurSequences[n].labelseq.size() - 1; k++)
-                        {
-                            forward_decode(CurSequences[n], decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, k + 1);
-                            forwardmerged(CurSequences[n], t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes);
-
-                            curlogp += decodeOutput(CurSequences[n].labelseq[k + 1], 0);
-                        }
-                        CurSequences[n].logP = decodeOutput.LogAdd(curlogp, CurSequences[n].logP);
-                    }
-                }
-            }*/
             //nextSequences.clear();
             while (true)
             {
@@ -515,8 +487,8 @@ public:
                 rnntdfs.prepareSequence(tempSeq);
                 rnntdfs.forward_decode(tempSeq, decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, tempSeq.labelseq.size(), *m_net);
                 if (isSVD)
-                    rnntdfs.forwardmergedSVD(tempSeq, t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes, Wmu, Wmv, bm, *m_net);
-                else rnntdfs.forwardmerged(tempSeq, t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes, Wm, bm, *m_net);
+                    rnntdfs.forwardmergedSVD(tempSeq, t, encodeOutput, decodeOutput, Plusnodes, Plustransnodes, Wmu, Wmv, bm, m_net);
+                else rnntdfs.forwardmerged(tempSeq, t, encodeOutput, decodeOutput, Plusnodes, Plustransnodes, Wm, bm, m_net);

                 //sumofENandDE.Print("sum");
                 //sort log posterior and get best N labels
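For reference, the while (true) loop above repeatedly expands the current best hypothesis: pop the highest-logP sequence, run the prediction network over its label history, and merge the result with encoder frame t. A condensed sketch of one iteration built from the calls visible in this hunk; the max-selection lambda and the copy of the winner are illustrative, not a verbatim excerpt:

    // One expansion step of the breadth-first beam loop.
    auto maxSeq = std::max_element(CurSequences.begin(), CurSequences.end(),
                                   [](const auto& a, const auto& b) { return a.logP < b.logP; });
    auto tempSeq = *maxSeq; // copy the winner out before erasing it
    CurSequences.erase(maxSeq);
    rnntdfs.prepareSequence(tempSeq);
    rnntdfs.forward_decode(tempSeq, decodeinputMatrices, deviceid, decodeOutputNodes,
                           decodeinputNodes, vocabSize, tempSeq.labelseq.size(), *m_net);
    // decodeOutput then holds the log-posteriors for frame t, from which the
    // top-N labels are taken to extend the beam.
    rnntdfs.forwardmerged(tempSeq, t, encodeOutput, decodeOutput,
                          Plusnodes, Plustransnodes, Wm, bm, m_net);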
@@ -715,7 +687,7 @@ public:
         Matrix<ElemType> encodeOutput(deviceid);
         Matrix<ElemType> decodeOutput(deviceid);
         Matrix<ElemType> greedyOutput(deviceid), greedyOutputMax(deviceid);
-        Matrix<ElemType> sumofENandDE(deviceid), maxIdx(deviceid), maxVal(deviceid);
+        Matrix<ElemType> maxIdx(deviceid), maxVal(deviceid);
         Matrix<ElemType> lmin(deviceid);
         MatrixPool m_matrixPool;
         m_matrixPool.OptimizedMemoryAllocation();
@@ -760,36 +732,6 @@ public:
             CurSequences = nextSequences;
             vector<typename RNNTDecodeFunctions<ElemType>::Sequence>().swap(nextSequences);

-            //deal with the same prefix
-            /*sort(CurSequences.begin(), CurSequences.end(),
-                  [](const Sequence& a, const Sequence& b) -> bool {
-                      return a.labelseq.size() > b.labelseq.size();
-                  });
-            for (size_t n = 0; n < CurSequences.size() - 1; n++)
-            {
-                for (size_t h = n + 1; h < CurSequences.size(); h++)
-                {
-                    if (isPrefix(CurSequences[h], CurSequences[n]))
-                    {
-                        //forward_prop the prefix
-                        forward_decode(CurSequences[h], decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, CurSequences[h].labelseq.size());
-
-                        forwardmerged(CurSequences[h], t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes);
-
-                        size_t idx = CurSequences[h].labelseq.size();
-                        ElemType curlogp = CurSequences[h].logP + decodeOutput(CurSequences[n].labelseq[idx], 0);
-                        for (size_t k = idx; k < CurSequences[n].labelseq.size() - 1; k++)
-                        {
-                            forward_decode(CurSequences[n], decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, k + 1);
-                            forwardmerged(CurSequences[n], t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes);
-
-                            curlogp += decodeOutput(CurSequences[n].labelseq[k + 1], 0);
-                        }
-                        CurSequences[n].logP = decodeOutput.LogAdd(curlogp, CurSequences[n].logP);
-                    }
-                }
-            }*/
-
             //nextSequences.clear();
             while (true)
             {
@@ -802,7 +744,7 @@ public:
                 CurSequences.erase(maxSeq);
                 rnntdfs.prepareSequence(tempSeq);
                 rnntdfs.forward_decode(tempSeq, decodeinputMatrices, deviceid, decodeOutputNodes, decodeinputNodes, vocabSize, tempSeq.labelseq.size(), *m_net);
-                rnntdfs.forwardmerged(tempSeq, t, sumofENandDE, encodeOutput, decodeOutput, PlusNode, PlusTransNode, Plusnodes, Plustransnodes, Wm, bm, *m_net);
+                rnntdfs.forwardmerged(tempSeq, t, encodeOutput, decodeOutput, Plusnodes, Plustransnodes, Wm, bm, m_net);

                 //sumofENandDE.Print("sum");
                 //sort log posterior and get best N labels