This commit is contained in:
Linquan Liu 2018-12-19 14:34:26 +08:00
Parent 5771cb85cb
Commit 57d3295811
20 changed files with 3369 additions and 207 deletions

View file

@@ -0,0 +1,631 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "IDistGradAggregator.h"
#include "CUDAPageLockedMemAllocator.h"
#include "QuantizedMatrix.h"
#include "MatrixQuantizer.h"
#include "MatrixQuantizerGPU.h"
#include <future>
#include "TimerUtility.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// =======================================================================
// AllReduceDistGradAggregator -- 1-bit SGD.
// This implements
// Frank Seide, Hao Fu, Jasha Droppo, Gang Li, and Dong Yu:
// "1-bit stochastic gradient descent and its application to data-parallel distributed training of speech DNNs"
// In Proc. Interspeech 2014.
// =======================================================================
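// Each gradient matrix is quantized to m_numQuantizationBits per value before being exchanged; the
// quantization error is kept locally in the quantizer's residual and folded into the next minibatch's
// gradient before it is quantized again (error feedback), which is what makes 1-bit quantization viable.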
template <class ElemType>
class AllReduceDistGradAggregator : public IDistGradAggregator<ElemType>
{
struct Stripe
{
size_t m_startCol;
size_t m_numCols;
};
UsingIDistGradAggregatorMembers;
static const int DEBUG_OUTPUT_TRACE_LEVEL = 3;
public:
AllReduceDistGradAggregator(const std::shared_ptr<MPIWrapper>& mpi, int nBits, bool zeroThresholdFor1Bit, bool useQuantizationForSelfStripe, bool useAsyncAggregation, int traceLevel, int syncStatsTrace)
: IDistGradAggregator<ElemType>(mpi), m_numQuantizationBits(nBits), m_zeroThresholdFor1Bit(zeroThresholdFor1Bit), m_useQuantizationForSelfStripe(useQuantizationForSelfStripe),
m_traceLevel(traceLevel), m_initialized(false), m_useAsyncAggregation(useAsyncAggregation), m_bufferedGradHeader(nullptr), m_syncStatsTrace(syncStatsTrace), m_iterationCount(0)
{}
~AllReduceDistGradAggregator()
{
for (size_t i = 0; i < m_recvHeaders.size(); ++i)
DistGradHeader::Destroy(m_recvHeaders[i]);
if (m_bufferedGradHeader != nullptr)
DistGradHeader::Destroy(m_bufferedGradHeader);
}
// Gets the range of columns to be processed by the node with the specified rank
// when processing in parallel across 'numNodes' nodes
static Stripe GetStripeForNode(size_t numCols, size_t nodeRank, size_t numNodes)
{
// Determine which stripe of the gradient this node is responsible for
size_t numColsPerNode = numCols / numNodes;
size_t residue = numCols % numNodes;
size_t startColNumofStripe = (numColsPerNode * nodeRank) + min(residue, nodeRank);
size_t numColsinStripe = numColsPerNode + ((nodeRank < residue) ? 1 : 0);
return Stripe({startColNumofStripe, numColsinStripe});
}
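// Worked example (illustrative values): with numCols = 10 and numNodes = 3, numColsPerNode = 3 and
// residue = 1, so rank 0 owns columns [0, 4), rank 1 owns [4, 7) and rank 2 owns [7, 10); the first
// 'residue' ranks each take one extra column.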
void ResetState(const std::vector<Matrix<ElemType>*>& gradients, int numEvalNodes, bool resetState)
{
// When called the first time, set up the quantizers and matrices for holding the quantized values.
// These can live for the lifetime of the aggregator since the gradient matrix dimensions for learnable parameters
// do not change
if (!m_initialized)
{
m_initialized = true;
int deviceId = gradients[0]->GetDeviceId();
if (deviceId != CPUDEVICE)
m_allocator.reset(new CUDAPageLockedMemAllocator(deviceId));
for (size_t i = 0; i < gradients.size(); i++)
{
// Make sure none of the gradient matrices are sparse - we currently do not support aggregation of sparse gradient matrices
if (gradients[i]->GetMatrixType() != DENSE)
RuntimeError("Gradient aggregation for sparse gradient matrices is currently unsupported!");
size_t nRow = gradients[i]->GetNumRows();
size_t nCol = gradients[i]->GetNumCols();
m_preAggGradQuantizers.push_back(std::unique_ptr<MatrixQuantizer<ElemType>>(new MatrixQuantizer<ElemType>(nRow, nCol, deviceId, m_useAsyncAggregation)));
m_gradQuantized.push_back(std::unique_ptr<QuantizedMatrix<ElemType>>(new QuantizedMatrix<ElemType>(nRow, nCol, m_numQuantizationBits, CPUDEVICE, m_allocator.get())));
// Determine which stripe of the gradient this node is responsible for
Stripe stripe = GetStripeForNode(nCol, MyRank(), NumProc());
MatrixQuantizer<ElemType>* currAggGradQuantizer = nullptr;
std::vector<std::unique_ptr<QuantizedMatrix<ElemType>>> currRecvGradStripesQuantized;
if (stripe.m_numCols > 0)
{
currAggGradQuantizer = new MatrixQuantizer<ElemType>(nRow, stripe.m_numCols, deviceId, m_useAsyncAggregation);
for (size_t j = 0; j < NumProc() - 1; ++j)
currRecvGradStripesQuantized.push_back(std::unique_ptr<QuantizedMatrix<ElemType>>(new QuantizedMatrix<ElemType>(nRow, stripe.m_numCols, m_numQuantizationBits, CPUDEVICE, m_allocator.get())));
}
m_aggGradStripeQuantizers.push_back(std::unique_ptr<MatrixQuantizer<ElemType>>(currAggGradQuantizer));
m_recvGradStripesQuantized.push_back(std::move(currRecvGradStripesQuantized));
if (m_useAsyncAggregation)
m_bufferedGradients[gradients[i]].reset(new Matrix<ElemType>(gradients[i]->GetNumRows(), gradients[i]->GetNumCols(), deviceId));
}
if (m_useAsyncAggregation)
{
m_bufferedGradHeader = DistGradHeader::Create(numEvalNodes);
m_bufferedGradHeader->Clear();
}
if (m_mpi->IsMainNode())
{
for (size_t i = 0; i < NumProc() - 1; ++i)
m_recvHeaders.push_back(DistGradHeader::Create(numEvalNodes));
}
}
else if (resetState)
{
// If we are resetting state, let's clear previous quantization residues
// Make sure there is no pending async aggregation
if (m_useAsyncAggregation && m_pendingAsyncAggregation.valid())
LogicError("Unexpected pending async gradient aggregation found when resetting aggregator state!");
for (size_t i = 0; i < m_preAggGradQuantizers.size(); ++i)
m_preAggGradQuantizers[i]->ResetResidue();
for (size_t i = 0; i < m_aggGradStripeQuantizers.size(); ++i)
{
if (m_aggGradStripeQuantizers[i] != nullptr)
m_aggGradStripeQuantizers[i]->ResetResidue();
}
// Zero out the buffered gradients if resetting state
if (m_useAsyncAggregation)
{
for (size_t i = 0; i < gradients.size(); i++)
m_bufferedGradients[gradients[i]]->SetValue(0);
m_bufferedGradHeader->Clear();
}
}
}
// Aggregate the gradient matrices across all nodes
bool AggregateGradients(const std::vector<Matrix<ElemType>*>& gradients, DistGradHeader* headerCPU, bool resetState) override
{
ResetState(gradients, headerCPU->numEvalNode, resetState);
bool showSyncPerfStats = (m_syncStatsTrace > 0) && ((m_iterationCount % m_syncStatsTrace) == 0);
m_iterationCount++;
if (m_useAsyncAggregation)
{
// If we are performing async gradient aggregation, let's wait for the pending gradient aggregation to finish
// then swap the contents of the buffered gradients and the new gradient matrices and fire an async aggregation
// of the new gradient matrices
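// Note that with async aggregation the model is therefore updated with gradients that are one
// minibatch stale: iteration t applies the aggregated gradients computed in iteration t-1.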
if (m_pendingAsyncAggregation.valid())
{
Timer aggregationTimer;
if (showSyncPerfStats)
aggregationTimer.Start();
m_pendingAsyncAggregation.get();
if (showSyncPerfStats)
{
aggregationTimer.Stop();
double gradientAggregationTime = aggregationTimer.ElapsedSeconds();
fprintf(stderr, "Async gradient aggregation wait time: %.6g\n", gradientAggregationTime);
}
}
std::vector<Matrix<ElemType>*> newGradients;
size_t numGradMatrices = gradients.size();
for (size_t i = 0; i < numGradMatrices; i++)
{
Matrix<ElemType>* bufferedGradientMatrix = m_bufferedGradients[gradients[i]].get();
if ((bufferedGradientMatrix == nullptr) ||
(bufferedGradientMatrix->GetNumCols() != gradients[i]->GetNumCols()) ||
(bufferedGradientMatrix->GetNumRows() != gradients[i]->GetNumRows()) ||
(bufferedGradientMatrix->GetDeviceId() != gradients[i]->GetDeviceId()))
{
LogicError("No buffered gradient matrix found corresponding to a gradient matrix to be aggregated!");
}
// Swap the gradient matrix contents with the buffered matrices
std::swap(*(gradients[i]), *bufferedGradientMatrix);
newGradients.push_back(bufferedGradientMatrix);
}
// Swap the grad header contents with the buffered grad header
swap(*headerCPU, *m_bufferedGradHeader);
// Initiate aggregation only if any samples were processed in the previous iteration
if (resetState || (headerCPU->numSamples != 0))
{
int deviceId = gradients[0]->GetDeviceId();
DistGradHeader* newGradHeader = m_bufferedGradHeader;
// Since we will be aggregating the gradients asynchronously, let us
// ensure that the gradient matrices have been computed before starting to aggregate
// them asynchronously on another thread. This essentially means that when we are using
// a GPU device, we will synchronize on the main GPU compute stream before starting
// the gradient aggregation asynchronously on a separate stream
MatrixComputeStreamEvent* mainStreamSyncEvent = MatrixComputeStreamEvent::Create(deviceId);
m_pendingAsyncAggregation = std::async(std::launch::async, [=] {
// We are starting on a new thread. Make sure the new thread is
// setup to use the right device
Matrix<ElemType>::SetDevice(deviceId);
// Synchronize the Quantization compute stream with the completion of
// compute of the gradient matrices on the main compute stream
mainStreamSyncEvent->SynchronizeQuantizationComputeStreamWithEvent<ElemType>();
delete mainStreamSyncEvent;
AggregateGradientsImpl(newGradients, newGradHeader, showSyncPerfStats);
});
return true;
}
return false;
}
else
{
AggregateGradientsImpl(gradients, headerCPU, showSyncPerfStats);
return (headerCPU->numSamples != 0);
}
}
void AggregateGradientsImpl(const std::vector<Matrix<ElemType>*>& gradients, DistGradHeader* headerCPU, bool showSyncPerfStats)
{
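// Overview of the aggregation protocol implemented below (summarizing the steps in this function):
// 1. Quantize the local gradients and post non-blocking receives for the stripes this node aggregates.
// 2. Send each quantized stripe to the node that owns its aggregation; the main node also gathers headers.
// 3. As stripes arrive, unquantize and accumulate them into this node's stripe, then re-quantize the aggregate.
// 4. Exchange the aggregated stripes with all other nodes and unquantize them back into the gradient matrices;
//    the main node sends the aggregated header to everyone.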
Timer aggregationTimer;
int deviceId = gradients[0]->GetDeviceId();
if (showSyncPerfStats)
{
std::unique_ptr<MatrixComputeStreamEvent> mainStreamSyncEvent(MatrixComputeStreamEvent::Create(deviceId));
mainStreamSyncEvent->SynchronizeEvent();
aggregationTimer.Start();
}
size_t numGradMatrices = gradients.size();
if (headerCPU->numSamples == 0)
{
assert(headerCPU->criterion == 0.0);
assert(headerCPU->numSamplesWithLabel == 0);
for (int i = 0; i < headerCPU->numEvalNode; ++i)
assert(headerCPU->evalErrors[i].first == 0 && headerCPU->evalErrors[i].second == 0);
// If the current node did not process any samples, the gradients should be zero'd
for (size_t i = 0; i < numGradMatrices; ++i)
gradients[i]->SetValue(0);
if (m_useAsyncAggregation)
{
std::unique_ptr<MatrixComputeStreamEvent> mainStreamSyncEvent(MatrixComputeStreamEvent::Create(deviceId));
mainStreamSyncEvent->SynchronizeQuantizationComputeStreamWithEvent<ElemType>();
}
}
std::vector<std::unique_ptr<Matrix<ElemType>>> aggGradStripes;
std::vector<std::unique_ptr<QuantizedMatrix<ElemType>>> aggGradStripesQuantized;
for (size_t i = 0; i < gradients.size(); i++)
{
size_t nCol = gradients[i]->GetNumCols();
// Determine which stripe of the gradient this node is responsible for
Stripe stripe = GetStripeForNode(nCol, MyRank(), NumProc());
Matrix<ElemType>* currAggGradStripe = nullptr;
QuantizedMatrix<ElemType>* currAggGradStripeQuantized = nullptr;
if (stripe.m_numCols > 0)
{
currAggGradStripe = new Matrix<ElemType>(gradients[i]->ColumnSlice(stripe.m_startCol, stripe.m_numCols));
currAggGradStripeQuantized = new QuantizedMatrix<ElemType>(m_gradQuantized[i]->ColumnSlice(stripe.m_startCol, stripe.m_numCols));
}
aggGradStripes.push_back(std::unique_ptr<Matrix<ElemType>>(currAggGradStripe));
aggGradStripesQuantized.push_back(std::unique_ptr<QuantizedMatrix<ElemType>>(currAggGradStripeQuantized));
}
// Initiate quantization of the gradient matrices
for (size_t i = 0; i < numGradMatrices; ++i)
{
if (m_traceLevel >= DEBUG_OUTPUT_TRACE_LEVEL)
{
char printHeaderBuf[1024];
sprintf(printHeaderBuf, "MPI Rank: %d, Original Gradient Matrix No. %d", (int) MyRank(), (int) i);
PrintMatrix(printHeaderBuf, gradients[i]);
}
m_preAggGradQuantizers[i]->QuantizeAsync(*(gradients[i]), *(m_gradQuantized[i]), m_zeroThresholdFor1Bit);
}
// Initiate receive of the stripe to be aggregated by the current node, from all other nodes
std::vector<MPI_Request> recvGradStripesQuantizedRequests;
std::vector<int> recvRequestIdxToGradientMatrixIdxMap;
for (size_t i = 0; i < numGradMatrices; ++i)
{
Stripe stripe = GetStripeForNode(gradients[i]->GetNumCols(), MyRank(), NumProc());
if (stripe.m_numCols > 0)
{
recvRequestIdxToGradientMatrixIdxMap.push_back(i);
for (size_t j = 0; j < NumProc() - 1; ++j)
{
int source = (j >= MyRank()) ? (j + 1) : j;
recvGradStripesQuantizedRequests.push_back(MPI_Request());
int recvRequestIdx = recvGradStripesQuantizedRequests.size() - 1;
m_mpi->Irecv(m_recvGradStripesQuantized[i][j]->Buffer(), m_recvGradStripesQuantized[i][j]->GetSize(), MPI_CHAR, source, i, &(recvGradStripesQuantizedRequests[recvRequestIdx])) || MpiFail("MPI_Irecv");
}
}
}
// Initiate receive of the header on the main node
std::vector<MPI_Request> recvHeaderRequests(NumProc() - 1);
if (m_mpi->IsMainNode())
{
for (size_t j = 0; j < NumProc() - 1; ++j)
{
int source = (j >= MyRank()) ? (j + 1) : j;
// We use a tag of 'numGradMatrices' for the pre-aggregation header
m_mpi->Irecv(m_recvHeaders[j], m_recvHeaders[j]->Size(), MPI_CHAR, source, numGradMatrices, &(recvHeaderRequests[j])) || MpiFail("MPI_Irecv");
}
}
// Asynchronously send stripes of the quantized gradient matrices to the respective nodes that own aggregation of that stripe
std::vector<std::vector<MPI_Request>> sendGradStripesQuantizedRequests(numGradMatrices);
for (size_t i = 0; i < numGradMatrices; ++i)
{
m_preAggGradQuantizers[i]->WaitQuantizeAsyncDone();
size_t sendRequestIdx = 0;
for (size_t j = 0; j < NumProc(); ++j)
{
Stripe stripe = GetStripeForNode(gradients[i]->GetNumCols(), j, NumProc());
if (stripe.m_numCols > 0)
{
// Do not send stripe for self
if (j != MyRank())
{
sendGradStripesQuantizedRequests[i].push_back(MPI_Request());
QuantizedMatrix<ElemType> quantizedStripe = m_gradQuantized[i]->ColumnSlice(stripe.m_startCol, stripe.m_numCols);
if (m_traceLevel >= DEBUG_OUTPUT_TRACE_LEVEL)
{
char printHeaderBuf[1024];
sprintf(printHeaderBuf, "MPI Rank: %d, Sending Gradient Matrix No. %d slice", (int) MyRank(), (int) i);
const size_t numRowsToPeek = 3;
const size_t numColsToPeek = 3;
size_t numRowsToPrint = (std::min)(numRowsToPeek, quantizedStripe.GetNumRows());
size_t numColsToPrint = (std::min)(numColsToPeek, quantizedStripe.GetNumCols());
quantizedStripe.Print(printHeaderBuf, 0, numRowsToPrint - 1, 0, numColsToPrint - 1);
}
m_mpi->Isend(quantizedStripe.Buffer(), quantizedStripe.GetSize(), MPI_CHAR, j, i, &(sendGradStripesQuantizedRequests[i][sendRequestIdx])) || MpiFail("MPI_Isend");
sendRequestIdx++;
}
else
{
// Initialize the aggregate for the stripe with the quantized gradients instead of the original
// gradients themselves, if so desired
if (m_useQuantizationForSelfStripe)
{
QuantizedMatrix<ElemType> preAggGradSelfStripeQuantized = m_gradQuantized[i]->ColumnSlice(stripe.m_startCol, stripe.m_numCols);
m_aggGradStripeQuantizers[i]->UnquantizeAsync(preAggGradSelfStripeQuantized, *(aggGradStripes[i]), false);
}
}
}
}
}
// Send the headers from all nodes but the main node
MPI_Request sendHeaderRequest;
if (!m_mpi->IsMainNode())
m_mpi->Isend(headerCPU, headerCPU->Size(), MPI_CHAR, m_mpi->MainNodeRank(), numGradMatrices, &sendHeaderRequest) || MpiFail("MPI_Isend");
// Wait for the stripes to arrive from each node and unquantize and aggregate
size_t numReceivesExpected = recvGradStripesQuantizedRequests.size();
size_t numActualReceives = 0;
std::vector<int> perGradMatrixReceiveCount(recvRequestIdxToGradientMatrixIdxMap.size(), 0);
while (numActualReceives < numReceivesExpected)
{
int idx = MPI_UNDEFINED;
m_mpi->Waitany(recvGradStripesQuantizedRequests.size(), recvGradStripesQuantizedRequests.data(), &idx, MPI_STATUS_IGNORE) || MpiFail("MPI_Waitany");
if (idx == MPI_UNDEFINED)
{
break;
}
numActualReceives++;
int gradMatrixIdxPosition = idx / (NumProc() - 1);
int recvBufferSubIndex = idx % (NumProc() - 1);
// Map idx back to the actual gradient matrix index
int gradMatrixIdx = recvRequestIdxToGradientMatrixIdxMap[gradMatrixIdxPosition];
// Wait for the previous Unquantize to finish before issuing a new one
if (m_useQuantizationForSelfStripe || (perGradMatrixReceiveCount[gradMatrixIdxPosition] > 0))
m_aggGradStripeQuantizers[gradMatrixIdx]->WaitUnquantizeAsyncDone();
if (m_traceLevel >= DEBUG_OUTPUT_TRACE_LEVEL)
{
char printHeaderBuf[1024];
sprintf(printHeaderBuf, "MPI Rank: %d, Received Gradient Matrix No. %d slice", (int) MyRank(), gradMatrixIdx);
const size_t numRowsToPeek = 3;
const size_t numColsToPeek = 3;
size_t numRowsToPrint = (std::min)(numRowsToPeek, m_recvGradStripesQuantized[gradMatrixIdx][recvBufferSubIndex]->GetNumRows());
size_t numColsToPrint = (std::min)(numColsToPeek, m_recvGradStripesQuantized[gradMatrixIdx][recvBufferSubIndex]->GetNumCols());
m_recvGradStripesQuantized[gradMatrixIdx][recvBufferSubIndex]->Print(printHeaderBuf, 0, numRowsToPrint - 1, 0, numColsToPrint - 1);
}
m_aggGradStripeQuantizers[gradMatrixIdx]->UnquantizeAsync(*(m_recvGradStripesQuantized[gradMatrixIdx][recvBufferSubIndex]), *(aggGradStripes[gradMatrixIdx]), true);
perGradMatrixReceiveCount[gradMatrixIdxPosition]++;
// Also issue the quantization if this stripe was the last one expected for this matrix
// Note: We issue the quantization without waiting for the unquantization since the same stream
// is used for both and they are implicitly sequenced
// We reuse the buffer that we used for quantizing and sending out the pre-aggregation gradient
if (perGradMatrixReceiveCount[gradMatrixIdxPosition] == (NumProc() - 1))
{
Stripe stripe = GetStripeForNode(gradients[gradMatrixIdx]->GetNumCols(), MyRank(), NumProc());
UNUSED(stripe);
assert(stripe.m_numCols > 0);
m_aggGradStripeQuantizers[gradMatrixIdx]->QuantizeAsync(*(aggGradStripes[gradMatrixIdx]), *(aggGradStripesQuantized[gradMatrixIdx]), m_zeroThresholdFor1Bit);
}
}
assert(numActualReceives == numReceivesExpected);
// On the main node wait for the headers to arrive and aggregate
if (m_mpi->IsMainNode())
{
size_t numNodesHeadersReceivedFrom = 0;
while (numNodesHeadersReceivedFrom < (NumProc() - 1))
{
int idx = MPI_UNDEFINED;
m_mpi->Waitany(recvHeaderRequests.size(), recvHeaderRequests.data(), &idx, MPI_STATUS_IGNORE) || MpiFail("MPI_Waitany");
if (idx == MPI_UNDEFINED)
break;
numNodesHeadersReceivedFrom++;
headerCPU->Aggregate(m_recvHeaders[idx], true);
}
assert(numNodesHeadersReceivedFrom == (NumProc() - 1));
}
std::vector<std::vector<MPI_Request>> recvAggGradStripesQuantizedRequests(numGradMatrices);
// Initiate receive of stripes of quantized aggregated gradients from different nodes
for (size_t i = 0; i < numGradMatrices; ++i)
{
size_t recvRequestIdx = 0;
for (size_t j = 0; j < NumProc(); ++j)
{
// Do not recv stripe for self
if (j != MyRank())
{
Stripe stripe = GetStripeForNode(gradients[i]->GetNumCols(), j, NumProc());
if (stripe.m_numCols > 0)
{
recvAggGradStripesQuantizedRequests[i].push_back(MPI_Request());
QuantizedMatrix<ElemType> quantizedStripe = m_gradQuantized[i]->ColumnSlice(stripe.m_startCol, stripe.m_numCols);
m_mpi->Irecv(quantizedStripe.Buffer(), quantizedStripe.GetSize(), MPI_CHAR, j, numGradMatrices + 1 + i, &(recvAggGradStripesQuantizedRequests[i][recvRequestIdx])) || MpiFail("MPI_Irecv");
recvRequestIdx++;
}
}
}
}
MPI_Request recvAggHeaderRequest;
// Initiate receive of the aggregate header
if (!m_mpi->IsMainNode())
m_mpi->Irecv(headerCPU, headerCPU->Size(), MPI_CHAR, m_mpi->MainNodeRank(), numGradMatrices + 1 + numGradMatrices, &recvAggHeaderRequest) || MpiFail("MPI_Irecv");
// Initiate broadcast of quantized aggregated gradient stripes to all other nodes
std::vector<std::vector<MPI_Request>> sendAggGradStripeQuantizedRequests(numGradMatrices);
for (size_t i = 0; i < numGradMatrices; ++i)
{
Stripe stripe = GetStripeForNode(gradients[i]->GetNumCols(), MyRank(), NumProc());
if (stripe.m_numCols > 0)
{
sendAggGradStripeQuantizedRequests[i] = std::vector<MPI_Request>(NumProc() - 1);
m_aggGradStripeQuantizers[i]->WaitQuantizeAsyncDone();
for (size_t j = 0; j < NumProc() - 1; ++j)
{
int dest = (j >= MyRank()) ? (j + 1) : j;
// TODO: Should we use MPI_Bcast instead for better performance
m_mpi->Isend(aggGradStripesQuantized[i]->Buffer(), aggGradStripesQuantized[i]->GetSize(), MPI_CHAR, dest, numGradMatrices + 1 + i, &(sendAggGradStripeQuantizedRequests[i][j])) || MpiFail("MPI_Isend");
}
}
}
// Initiate send of the aggregate header from main node
std::vector<MPI_Request> sendAggHeaderRequests(NumProc() - 1);
if (m_mpi->IsMainNode())
{
for (size_t j = 0; j < NumProc() - 1; ++j)
{
int dest = (j >= MyRank()) ? (j + 1) : j;
// TODO: Should we use MPI_Bcast instead for better performance
m_mpi->Isend(headerCPU, headerCPU->Size(), MPI_CHAR, dest, numGradMatrices + 1 + numGradMatrices, &(sendAggHeaderRequests[j])) || MpiFail("MPI_Isend");
}
}
// Wait to receive all aggregated stripes and unquantize
for (size_t i = 0; i < numGradMatrices; ++i)
{
m_mpi->Waitall(recvAggGradStripesQuantizedRequests[i].size(), recvAggGradStripesQuantizedRequests[i].data(), MPI_STATUSES_IGNORE) || MpiFail("MPI_Waitall");
m_preAggGradQuantizers[i]->UnquantizeAsync(*(m_gradQuantized[i]), *(gradients[i]), false);
}
// Wait to receive aggregate header
if (!m_mpi->IsMainNode())
m_mpi->Wait(&recvAggHeaderRequest, MPI_STATUSES_IGNORE) || MpiFail("MPI_Wait");
// Wait for all the unquantizations to finish
for (size_t i = 0; i < numGradMatrices; ++i)
{
m_preAggGradQuantizers[i]->WaitUnquantizeAsyncDone();
if (m_traceLevel >= DEBUG_OUTPUT_TRACE_LEVEL)
{
char printHeaderBuf[1024];
sprintf(printHeaderBuf, "MPI Rank: %d, Aggregated Gradient Matrix No. %d", (int) MyRank(), (int) i);
PrintMatrix(printHeaderBuf, gradients[i]);
}
}
// Wait for completion of the async send requests
for (int i = 0; i < sendGradStripesQuantizedRequests.size(); ++i)
{
if (sendGradStripesQuantizedRequests[i].size() > 0)
m_mpi->Waitall(sendGradStripesQuantizedRequests[i].size(), sendGradStripesQuantizedRequests[i].data(), MPI_STATUSES_IGNORE) || MpiFail("MPI_Waitall");
}
if (!m_mpi->IsMainNode())
m_mpi->Wait(&sendHeaderRequest, MPI_STATUSES_IGNORE) || MpiFail("MPI_Wait");
for (int i = 0; i < sendAggGradStripeQuantizedRequests.size(); ++i)
{
if (sendAggGradStripeQuantizedRequests[i].size() > 0)
m_mpi->Waitall(sendAggGradStripeQuantizedRequests[i].size(), sendAggGradStripeQuantizedRequests[i].data(), MPI_STATUSES_IGNORE) || MpiFail("MPI_Waitall");
}
if (m_mpi->IsMainNode())
m_mpi->Waitall(sendAggHeaderRequests.size(), sendAggHeaderRequests.data(), MPI_STATUSES_IGNORE) || MpiFail("MPI_Waitall");
if (showSyncPerfStats)
{
aggregationTimer.Stop();
double gradientAggregationTime = aggregationTimer.ElapsedSeconds();
fprintf(stderr, "Actual gradient aggregation time: %.6g\n", gradientAggregationTime);
}
}
// Debug helper to print matrix contents
static void PrintMatrix(const char* printHeader, Matrix<ElemType>* matrixToPrint, bool peek = true)
{
if (peek)
{
const size_t numRowsToPeek = 3;
const size_t numColsToPeek = 3;
size_t numRowsToPrint = (std::min)(numRowsToPeek, matrixToPrint->GetNumRows());
size_t numColsToPrint = (std::min)(numColsToPeek, matrixToPrint->GetNumCols());
matrixToPrint->Print(printHeader, 0, numRowsToPrint - 1, 0, numColsToPrint - 1);
}
else
{
matrixToPrint->Print(printHeader);
}
fflush(stderr);
}
private:
std::unique_ptr<CUDAPageLockedMemAllocator> m_allocator;
std::vector<std::unique_ptr<MatrixQuantizer<ElemType>>> m_preAggGradQuantizers;
std::vector<std::unique_ptr<QuantizedMatrix<ElemType>>> m_gradQuantized;
std::vector<std::unique_ptr<MatrixQuantizer<ElemType>>> m_aggGradStripeQuantizers;
std::vector<std::vector<std::unique_ptr<QuantizedMatrix<ElemType>>>> m_recvGradStripesQuantized;
std::vector<DistGradHeader*> m_recvHeaders;
// Number of bits that each gradient value is quantized to before communication
// with other nodes
int m_numQuantizationBits;
// option for handling the mean for 1-bit quantization
// force 1-bit quant to threshold against 0 rather than the midpoint between lower and upper
bool m_zeroThresholdFor1Bit;
// Since the self-stripe in an all-reduce is not communicated, there is really no reason to
// quantize it for reduced communication. However, we add this as an option for consistency
// across all stripes if desired
bool m_useQuantizationForSelfStripe;
// Perform asynchronous gradient aggregation using double buffering of the gradient matrices
bool m_useAsyncAggregation;
// Future corresponding to the current in-flight async gradient aggregation
std::future<void> m_pendingAsyncAggregation;
// Buffered gradients that we asynchronously aggregate
std::unordered_map<Matrix<ElemType>*, std::unique_ptr<Matrix<ElemType>>> m_bufferedGradients;
DistGradHeader* m_bufferedGradHeader;
int m_traceLevel;
int m_syncStatsTrace;
// Only used for controlling frequency of measuring/showing gradient aggregation perf stats
size_t m_iterationCount;
bool m_initialized;
};
} } }

View file

@@ -0,0 +1,660 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include <vector>
#include "CNTKLibrary.h"
#include "DistributedLearnerBase.h"
#include <numeric>
#include <iostream>
#include <sstream>
namespace CNTK
{
///
/// Block Momentum Trainer.
///
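/// Each worker runs local SGD until it has seen m_syncPeriodPerWorker samples, then all workers
/// aggregate their block gradients (previous global model minus current local model), smooth them
/// with block momentum, and apply the smoothed update so every worker continues from the same model.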
class BlockMomentumDistributedLearner : public DistributedLearnerBase
{
private:
enum class Action;
friend std::ostream& operator<<(std::ostream& out, const Action action)
{
static std::map<Action, std::string> actionStr;
if (actionStr.size() == 0)
{
actionStr[Action::Aggregate] = "Aggregate";
actionStr[Action::AggregateMetrics] = "AggregateMetrics";
actionStr[Action::Checkpoint] = "Checkpoint";
actionStr[Action::Shutdown] = "Shutdown";
actionStr[Action::Wait] = "Wait";
}
return out << actionStr[action];
}
// Print debug info about synchronization action requested and granted
void DebugPrintSynchronizeInfo(Action requestedAction, Action grantedAction)
{
if (GetTraceLevel() >= TraceLevel::Info)
{
std::ostringstream outString;
outString << "BMUF Rank " << m_communicator->CurrentWorker().m_globalRank << " Action requested " << requestedAction << " Action returned " << grantedAction << std::endl;
std::cerr << outString.str(); //stderr output
}
}
template<class T> using Matrix = Microsoft::MSR::CNTK::Matrix<T>;
public:
BlockMomentumDistributedLearner(
DistributedCommunicatorPtr communicator,
LearnerPtr learner,
size_t distributedAfterSamples,
size_t globalModelAggregationBlockSize,
bool useNesterovMomentum,
bool resetSGDMomentumAfterAggregation,
double blockLearningRate)
: BlockMomentumDistributedLearner(
communicator,
learner,
distributedAfterSamples,
globalModelAggregationBlockSize,
useNesterovMomentum,
resetSGDMomentumAfterAggregation,
blockLearningRate,
Momentum2TimeConstant(1.0 - 1.0 / (double)communicator->Workers().size(), globalModelAggregationBlockSize))
{}
BlockMomentumDistributedLearner(
DistributedCommunicatorPtr communicator,
LearnerPtr learner,
size_t distributedAfterSamples,
size_t globalModelAggregationBlockSize,
bool useNesterovMomentum,
bool resetSGDMomentumAfterAggregation,
double blockLearningRate,
double blockMomentumAsTimeConstant)
: DistributedLearnerBase(communicator, learner, distributedAfterSamples),
m_useNesterovMomentum(useNesterovMomentum),
m_resetSGDMomentumAfterAggregation(resetSGDMomentumAfterAggregation),
m_blockLearningRate(blockLearningRate),
m_blockMomentumAsTimeConstantPerWorker(blockMomentumAsTimeConstant / communicator->Workers().size()),
m_globalModelAggregationBlockSize(globalModelAggregationBlockSize),
m_numSamplesSeenInCurrentBlock(0),
m_endOfDataReached(false),
m_localTotalNumSamplesSeen(0),
m_syncPeriodPerWorker(globalModelAggregationBlockSize / communicator->Workers().size())
{
if (m_syncPeriodPerWorker == 0)
InvalidArgument("Sync period is too small.");
// Need to allocate memory here to make sure not hitting OOM
std::vector<NDArrayViewPtr> parameterValues;
GetParameterValues(learner->Parameters(), parameterValues);
m_blockLevelSmoothedGradient.resize(parameterValues.size());
m_prevParameters.resize(parameterValues.size());
m_tempBlockGradient.resize(parameterValues.size());
Reset(parameterValues);
}
size_t MinibatchSizeScaleFactor() override
{
return m_communicator->Workers().size();
}
bool Update(std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, MinibatchInfo& info) override
{
// mark start of block before local update
std::vector<NDArrayViewPtr> values;
GetParameterValues(m_learner->Parameters(), values);
// note this is only for the first update, after that SyncBlock handles the bookkeeping
if (!m_prevParamInitialized)
{
Reset(values);
m_prevParamInitialized = true;
}
// do the local update first, then the block update. The local update uses a different gradient on each worker,
// and this ordering ensures that all workers end up with the same model after the block update
if (!info.IsEmpty())
{
// For block momentum the number of aggregate/checkpoint calls should match across workers, so for now we ignore the return value of the local learners.
auto profWeights = Microsoft::MSR::CNTK::ScopeProfile(Microsoft::MSR::CNTK::profilerEvtMainWeights);
m_learner->Update(gradientValues, info.numberOfSamples, info.atEndOfSweep);
// after local update, use the latest model for block update
values.clear();
GetParameterValues(m_learner->Parameters(), values);
}
auto profGradientAgg = Microsoft::MSR::CNTK::ProfilerTimeBegin();
bool updated = PerformDistributedUpdateIfNeeded(values, info);
Microsoft::MSR::CNTK::ProfilerTimeEnd(profGradientAgg, Microsoft::MSR::CNTK::profilerEvtMainGradient);
return updated;
}
// Optionally overridable method to get checkpoint state associated with this Distributed train method
Dictionary CreateCheckpoint() override
{
std::vector<NDArrayViewPtr> values;
GetParameterValues(m_learner->Parameters(), values);
// During checkpoint, other workers could be in aggregation state. Let's allow them to finish aggregation.
Action action;
while ((action = SynchronizeAction(Action::Checkpoint)) != Action::Checkpoint)
{
DebugPrintSynchronizeInfo(Action::Checkpoint, action);
if (action == Action::Wait)
continue;
if (action == Action::Aggregate)
AggregateImpl(values);
else
RuntimeError("Unexpected action received.");
}
DebugPrintSynchronizeInfo(Action::Checkpoint, action);
// Always aggregate before the checkpoint, so prevParameter and m_numSamplesSeenInCurrentBlock don't need to be saved
SynchronizeAction(Action::Aggregate);
AggregateImpl(values);
std::vector<DictionaryValue> serializedSmoothedGradients;
for (auto sg : m_blockLevelSmoothedGradient)
{
serializedSmoothedGradients.push_back(*sg);
}
Dictionary result;
result[L"base"] = DistributedLearnerBase::CreateCheckpoint();
result[L"localTotalNumSamplesSeen"] = m_localTotalNumSamplesSeen;
result[L"blockLevelSmoothedGradient"] = serializedSmoothedGradients;
return result;
}
void RestoreFromCheckpoint(const Dictionary& checkpoint) override
{
DistributedLearnerBase::RestoreFromCheckpoint(checkpoint[L"base"].Value<Dictionary>());
m_localTotalNumSamplesSeen = checkpoint[L"localTotalNumSamplesSeen"].Value<size_t>();
const auto& smoothedGradients = checkpoint[L"blockLevelSmoothedGradient"].Value<std::vector<DictionaryValue>>();
if (m_blockLevelSmoothedGradient.size() != smoothedGradients.size())
RuntimeError("Inconsistent parameter size between learner and checkpoint");
for (size_t i = 0; i < m_blockLevelSmoothedGradient.size(); i++)
{
m_blockLevelSmoothedGradient[i]->CopyFrom(smoothedGradients[i].Value<NDArrayView>());
}
m_prevParamInitialized = false;
}
private:
// Block momentum needs to do aggregation of loss and eval across workers.
virtual void DoAggregateMetricsIfNeeded(NDArrayViewPtr& localTrainingLoss, NDArrayViewPtr& localEvalCriterion) override
{
m_shutDownSeenBefore = false;
// If shutdown has been agreed upon before, then return from metrics aggregation. Other shutdown workers won't be able to sync now.
if (m_communicator->Workers().size() == 1 || m_shutDownSeenBefore)
{
return;
}
Action action;
while ((action = SynchronizeAction(Action::AggregateMetrics)) != Action::AggregateMetrics)
{
DebugPrintSynchronizeInfo(Action::AggregateMetrics, action);
std::vector<NDArrayViewPtr> paramValues;
GetParameterValues(m_learner->Parameters(), paramValues);
switch (action)
{
// Aggregate params first and try for aggregate metrics again
case Action::Aggregate:
AggregateImpl(paramValues);
break;
// Can't do checkpointing here since not called from checkpointing code, so return. Checkpointing will be called again eventually.
case Action::Checkpoint:
return;
// Can't aggregate metrics since others are going in shutdown.
case Action::Shutdown:
m_shutDownSeenBefore = true;
return; // Can't aggregate if another worker is in shutdown mode
}
}
DebugPrintSynchronizeInfo(Action::AggregateMetrics, action);
// Synchronization complete - Start the loss and eval aggregation
float averageTrainingLoss = 0;
if (localTrainingLoss)
{
averageTrainingLoss = localTrainingLoss->AsScalar<float>();
}
float averageEvalCriterion = 0;
if (localEvalCriterion)
{
averageEvalCriterion = localEvalCriterion->AsScalar<float>();
}
NDArrayViewPtr inPlaceAggregateTrainingLoss = std::make_shared<NDArrayView>(averageTrainingLoss, NDShape{}, DeviceDescriptor::CPUDevice());
NDArrayViewPtr inPlaceAggregateEvalCriterion = std::make_shared<NDArrayView>(averageEvalCriterion, NDShape{}, DeviceDescriptor::CPUDevice());
vector<NDArrayViewPtr> inPlaceAggregateVector = { inPlaceAggregateTrainingLoss, inPlaceAggregateEvalCriterion };
m_communicator->AggregateInPlace(inPlaceAggregateVector, m_communicator->Workers());
if (localTrainingLoss)
{
inPlaceAggregateTrainingLoss->SetValue(inPlaceAggregateTrainingLoss->AsScalar<float>() / m_communicator->Workers().size());
localTrainingLoss->CopyFrom(*inPlaceAggregateTrainingLoss);
}
if (localEvalCriterion)
{
inPlaceAggregateEvalCriterion->SetValue(inPlaceAggregateEvalCriterion->AsScalar<float>() / m_communicator->Workers().size());
localEvalCriterion->CopyFrom(*inPlaceAggregateEvalCriterion);
}
}
// Optional override that gets called per minibatch after finishing gradient computation but before updating model parameters
bool PerformDistributedUpdateIfNeeded(std::vector<NDArrayViewPtr>& parameterValues, MinibatchInfo& info)
{
// If the last minibatch, set the end of data state.
if (info.atEndOfData)
m_endOfDataReached = true;
m_localTotalNumSamplesSeen += info.numberOfSamples;
m_sampleCount += info.numberOfSamples;
if (m_distributeAfterSamples > m_sampleCount)
{
if (m_endOfDataReached)
{
// We have not even reached distributed state,
// simply stop processing by returning false.
return false;
}
return true;
}
if (!m_endOfDataReached)
{
m_numSamplesSeenInCurrentBlock += info.numberOfSamples;
if (m_numSamplesSeenInCurrentBlock < m_syncPeriodPerWorker)
return true;
Aggregate(parameterValues);
return true;
}
return Shutdown(parameterValues);
}
// Before doing any work, the distributed learner synchronizes with other learners to
// decide what to do next.
// The priorities of actions are:
// 1) If any worker wants to aggregate - aggregation is done.
// 2) If any worker wants to checkpoint and nobody wants to aggregate - checkpointing is done. If anyone wants to aggregate metrics, wait to allow it to reach the checkpoint state.
// 3) If all want to shut down - it means we have reached the end of the data and shutdown can be done. If anyone wants to aggregate metrics, wait to allow it to reach the shutdown state.
// 4) If all workers want to aggregate metrics - metrics aggregation is done. Otherwise aggregate, checkpoint or shutdown is returned if any other worker wants it.
// These priorities resolve situations where some of the workers run out of data
// while other workers require checkpointing or aggregation.
enum class Action
{
Wait, // Waits in the current state without doing anything.
Aggregate,
AggregateMetrics, // Used to allow aggregation of loss and eval metrics.
Checkpoint,
Shutdown
};
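// Example of the priority resolution above (illustrative): if one worker requests Checkpoint while
// another requests Aggregate, SynchronizeAction returns Aggregate on both, the aggregation is performed,
// and the checkpointing worker then re-requests Checkpoint in its loop until it is granted.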
void GetParameterValues(const std::vector<Parameter>& parameters, std::vector<NDArrayViewPtr>& result)
{
for (auto p : parameters)
result.push_back(p.Value());
}
void Aggregate(std::vector<NDArrayViewPtr>& parameters)
{
// Synchronization action. Aggregate has the highest priority, so the expected result is aggregate.
Action action = SynchronizeAction(Action::Aggregate);
if (action != Action::Aggregate)
LogicError("Unexpected action during aggregation.");
AggregateImpl(parameters);
}
bool Shutdown(std::vector<NDArrayViewPtr>& parameters)
{
// During shutdown, other workers could be in checkpointing or aggregation state.
// Finished workers should properly behave in this case.
Action action;
while ((action = SynchronizeAction(Action::Shutdown)) != Action::Shutdown)
{
DebugPrintSynchronizeInfo(Action::Shutdown, action);
switch (action)
{
case Action::Aggregate:
AggregateImpl(parameters);
break;
case Action::Checkpoint:
// Somebody still has to call the checkpoint from the outside.
return true;
case Action::Wait:
// Someone is in aggregate metrics. Wait for it to come to shutdown.
continue;
default:
RuntimeError("Unexpected action received.");
}
}
DebugPrintSynchronizeInfo(Action::Shutdown, action);
// Last synchronization
AggregateImpl(parameters);
return false; // Make compiler happy.
}
// Synchronize (agree) on an action before doing it. This is needed to prevent deadlocks in MPI.
// Aggregate has the highest priority, so AggregateImpl can be called right after SynchronizeAction(Action::Aggregate) returns.
// All other actions need to ask for permission in a loop.
Action SynchronizeAction(Action self)
{
assert(self == Action::Checkpoint || self == Action::Aggregate || self == Action::Shutdown || self == Action::AggregateMetrics);
double data[2] = { static_cast<double>(self), static_cast<double>(m_localTotalNumSamplesSeen) };
auto a = std::make_shared<NDArrayView>(DataType::Double, NDShape{ 2 }, &data, sizeof(double) * 2, DeviceDescriptor::CPUDevice());
m_communicator->Concatenate(std::vector<NDArrayViewPtr> { a }, m_actionBuffer, m_communicator->Workers());
assert(m_actionBuffer.size() == 1);
auto buffer = m_actionBuffer.front()->DataBuffer<double>();
auto bufferSize = m_actionBuffer.front()->Shape().TotalSize();
auto bufferEnd = buffer + bufferSize;
std::vector<Action> actions;
actions.reserve(m_communicator->Workers().size());
std::vector<size_t> localNumberOfSamples;
localNumberOfSamples.reserve(m_communicator->Workers().size());
for (const double* start = buffer; start != bufferEnd; start +=2)
{
actions.push_back(static_cast<Action>((int)*start));
localNumberOfSamples.push_back(static_cast<size_t>(*(start + 1)));
}
m_sampleCount = std::accumulate(localNumberOfSamples.begin(), localNumberOfSamples.end(), (size_t)0);
// If all want to aggregate metrics, only then we aggregate metrics.
if (std::all_of(actions.begin(), actions.end(), [](Action c) { return c == Action::AggregateMetrics; }))
return Action::AggregateMetrics;
// If all want to shutdown - we shutdown.
if (std::all_of(actions.begin(), actions.end(), [](Action c) { return c == Action::Shutdown; }))
return Action::Shutdown;
// If all want to checkpoint - we checkpoint.
if (std::all_of(actions.begin(), actions.end(), [](Action c) { return c == Action::Checkpoint; }))
return Action::Checkpoint;
// If all are either in Checkpoint, Shutdown or AggregateMetrics state:
// AggregateMetrics has the lowest priority - workers in it return without doing anything, while the other workers wait for the AggregateMetrics workers to reach their state.
// Between Checkpoint and Shutdown, Shutdown has the lower priority - the shutdown worker returns and the checkpoint worker waits for the others to reach the checkpoint state.
if (std::all_of(actions.begin(), actions.end(), [](Action c) { return c == Action::Checkpoint || c == Action::Shutdown || c == Action::AggregateMetrics; }))
{
bool isAnyCheckpoint = std::any_of(actions.begin(), actions.end(), [](Action c) { return c == Action::Checkpoint; });
bool isAnyShutdown = std::any_of(actions.begin(), actions.end(), [](Action c) { return c == Action::Shutdown; });
bool isAnyAggregateMetrics = std::any_of(actions.begin(), actions.end(), [](Action c) { return c == Action::AggregateMetrics; });
if (self == Action::Shutdown)
{
// Do checkpoint first if any other requests checkpoint. Then come back to shutdown.
if (isAnyCheckpoint)
{
return Action::Checkpoint;
}
// Allow the aggregate metrics to come in shutdown state and request again.
if (isAnyAggregateMetrics)
{
return Action::Wait;
}
return Action::Shutdown;
}
else if (self == Action::Checkpoint)
{
// Wait for other in shutdown or aggregate metrics state to come to checkpoint state
if (isAnyShutdown || isAnyAggregateMetrics)
{
return Action::Wait;
}
return Action::Checkpoint;
}
else if (self == Action::AggregateMetrics)
{
// AggregateMetrics can't do aggregate metrics if anyone is in shutdown
if (isAnyShutdown)
{
return Action::Shutdown;
}
// If all others are either metrics aggregate or checkpoint then state returned is checkpoint and we don't do metrics aggregation
return Action::Checkpoint;
}
}
// Otherwise we aggregate. This is given priority by all other workers in checkpoint, shutdown or aggregate metrics states.
return Action::Aggregate;
}
void AggregateImpl(std::vector<NDArrayViewPtr>& parameters)
{
// Let's update the weights.
if (parameters.front()->GetDataType() == DataType::Double)
SynchronizeModel<double>(parameters);
else if (parameters.front()->GetDataType() == DataType::Float)
SynchronizeModel<float>(parameters);
else if (parameters.front()->GetDataType() == DataType::Float16)
SynchronizeModel<half>(parameters);
else
RuntimeError("Unsupported type.");
m_numSamplesSeenInCurrentBlock = 0;
if (m_resetSGDMomentumAfterAggregation)
m_learner->ResetSmoothedGradients();
}
Dictionary CreateCheckpointImpl(std::vector<NDArrayViewPtr>& parameters)
{
// During checkpoint, other workers could be in aggregation state. Let's allow them to finish aggregation.
Action action;
while ((action = SynchronizeAction(Action::Checkpoint)) != Action::Checkpoint)
{
DebugPrintSynchronizeInfo(Action::Checkpoint, action);
if (action == Action::Wait)
continue;
if (action == Action::Aggregate)
AggregateImpl(parameters);
else
RuntimeError("Unexpected action received.");
}
DebugPrintSynchronizeInfo(Action::Checkpoint, action);
return DistributedLearnerBase::CreateCheckpoint();
}
bool IsResetRequired(std::vector<NDArrayViewPtr>& parameters) const
{
if (m_prevParameters.size() != parameters.size() ||
m_blockLevelSmoothedGradient.size() != parameters.size())
return true;
for (size_t i = 0; i < parameters.size(); ++i)
{
if (m_prevParameters[i]->Shape() != parameters[i]->Shape() ||
m_prevParameters[i]->Device() != parameters[i]->Device() ||
m_blockLevelSmoothedGradient[i]->Shape() != parameters[i]->Shape() ||
m_blockLevelSmoothedGradient[i]->Device() != parameters[i]->Device())
{
return true;
}
}
return false;
}
void Reset(const std::vector<NDArrayViewPtr>& parameters)
{
for (size_t i = 0; i < parameters.size(); ++i)
{
auto& p = parameters[i];
if (p->GetDataType() == DataType::Double)
ResetBuffer<double>(i, p);
else if (p->GetDataType() == DataType::Float)
ResetBuffer<float>(i, p);
else
RuntimeError("Unsupported type.");
}
}
template<class ElemType>
void ResetBuffer(size_t index, const NDArrayViewPtr& p)
{
auto data = p->GetMatrix<ElemType>();
if (!m_blockLevelSmoothedGradient[index])
{
// has not been initialized yet
auto pSmoothedGrad = std::make_shared<NDArrayView>(AsDataType<ElemType>(), p->Shape(), AsDeviceDescriptor(data->GetDeviceId()));
pSmoothedGrad->SetValue(static_cast<ElemType>(0));
m_blockLevelSmoothedGradient[index] = pSmoothedGrad;
}
if (!m_prevParameters[index])
{
NDArrayViewPtr newValue = std::make_shared<NDArrayView>(AsDataType<ElemType>(), p->Shape(), AsDeviceDescriptor(data->GetDeviceId()));
std::shared_ptr<Matrix<ElemType>> newData = newValue->GetWritableMatrix<ElemType>();
newData->SetValue(*data);
m_prevParameters[index] = newValue;
}
else
{
m_prevParameters[index]->GetWritableMatrix<ElemType>()->SetValue(*data);
}
if (!m_tempBlockGradient[index])
{
m_tempBlockGradient[index] = std::make_shared<NDArrayView>(AsDataType<ElemType>(), p->Shape(), AsDeviceDescriptor(data->GetDeviceId()));
}
}
template<class ElemType>
void SynchronizeModel(const std::vector<NDArrayViewPtr>& parameterValues)
{
ElemType blockMomentum = (ElemType)TimeConstant2Momentum(m_blockMomentumAsTimeConstantPerWorker, m_numSamplesSeenInCurrentBlock);
// 1. Let's aggregate weights
for (size_t i = 0; i < parameterValues.size(); ++i)
{
// Get current model
Matrix<ElemType>& previousWeight = *m_prevParameters[i]->GetWritableMatrix<ElemType>(); // prev model value
Matrix<ElemType>& currentWeight = *parameterValues[i]->GetWritableMatrix<ElemType>();
Matrix<ElemType>& blockGrad = *m_tempBlockGradient[i]->GetWritableMatrix<ElemType>();
// Subtract it from the previous model
blockGrad = previousWeight - currentWeight; // blockGrad becomes the local block gradient (of one worker)
}
// Send block gradient over MPI nodes.
m_communicator->AggregateInPlace(m_tempBlockGradient, m_communicator->Workers());
// 2. Let's update the model
for (size_t i = 0; i < parameterValues.size(); ++i)
{
// 2 block gradient aggregation
// 2.1. get current model
Matrix<ElemType>& previousWeight = *m_prevParameters[i]->GetWritableMatrix<ElemType>(); // prev model value
Matrix<ElemType>& currentWeight = *parameterValues[i]->GetWritableMatrix<ElemType>();
Matrix<ElemType>& blockGrad = *m_tempBlockGradient[i]->GetWritableMatrix<ElemType>();
// 2.2. model update
{
Matrix<ElemType>& sg = *m_blockLevelSmoothedGradient[i]->GetWritableMatrix<ElemType>(); // smoothed gradient
// 2.2.1 update block level smoothed gradient;
// This is essentially a first-order infinite impulse response (IIR) filter with the gain (1 - blockMomentum)*m_blockLearningRate:
// smoothedGradient(t)=blockMomentum * smoothedGradients(t-1) + (1 - blockMomentum)*m_blockLearningRate*blockGrad(t)
Matrix<ElemType>::ScaleAndAdd((ElemType)((1 - blockMomentum)*m_blockLearningRate), blockGrad, (ElemType)blockMomentum, sg);
// 2.2.2 update parameters;
currentWeight.SetValue(previousWeight);
currentWeight -= sg;
// 2.2.3 Nesterov Momentum
// A Nesterov momentum here is to do a partial weight update before calculating the gradient, i.e.,
// (step 1) w(t) <-- w(t) - \eta* v(t)
// (step 2) g(t+1) <-- forwardbackward on minibatches with initial model as w(t)
// (step 3) v(t+1) <-- \eta*v(t) + (1-\eta)*learningRate*g(t+1)
// (step 4) w(t+1) <-- w(t)-v(t)
// (step 5) t <-- t+1
// without step 1, this becomes standard momentum
if (m_useNesterovMomentum)
{
Matrix<ElemType>::ScaleAndAdd((ElemType)-blockMomentum, sg, currentWeight);
}
// 2.2.4 update bookkeeping
previousWeight.SetValue(currentWeight);
}
}
}
static double TimeConstant2Momentum(double timeConstant, size_t syncPeroid)
{
if (timeConstant == 0)
return 0;
else
return exp(-((double)syncPeroid) / timeConstant);
}
static double Momentum2TimeConstant(double bm, size_t syncPeroid)
{
if (bm >= 1.0 || bm < 0.0)
{
InvalidArgument("Unexpected block momentum (%.2f). Block momentum should be in the range of [0,1)\n", bm);
}
return -(double)syncPeroid / log(bm);
}
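// Numeric example (illustrative): with a per-worker sync period of 1024 samples and a block momentum
// time constant of 4096 samples, TimeConstant2Momentum gives exp(-1024/4096) ~= 0.7788, and
// Momentum2TimeConstant(0.7788, 1024) maps back to roughly 4096.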
const bool m_resetSGDMomentumAfterAggregation;
const bool m_useNesterovMomentum;
const double m_blockLearningRate;
const double m_blockMomentumAsTimeConstantPerWorker;
const size_t m_syncPeriodPerWorker;
const size_t m_globalModelAggregationBlockSize;
size_t m_numSamplesSeenInCurrentBlock;
size_t m_localTotalNumSamplesSeen;
// parameters at the last model aggregation point
std::vector<NDArrayViewPtr> m_prevParameters;
std::vector<NDArrayViewPtr> m_blockLevelSmoothedGradient;
std::vector<NDArrayViewPtr> m_tempBlockGradient;
// temp storage for MPI
std::vector<NDArrayViewPtr> m_actionBuffer;
bool m_prevParamInitialized = false;
bool m_endOfDataReached;
bool m_shutDownSeenBefore = false;
DISABLE_COPY_AND_MOVE(BlockMomentumDistributedLearner);
};
}

View file

@@ -0,0 +1,298 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "../SGDLib/MASGD.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// Implementation of Blockwise Model Update and Filtering (BMUF, a.k.a. block momentum)
// For detail, see the following paper
// Kai Chen and Qiang Huo, "Scalable training of deep learning machines by incremental block training
// with intra-block parallel optimization and blockwise model-update filtering",
// in International Conference on Acoustics, Speech and Signal Processing , March 2016, Shanghai, China.
template<typename ElemType>
class BlockMomentumSGD : public IMASGD<ElemType>
{
typedef IMASGD<ElemType> Base;
using Base::m_pMPI;
using Base::m_deviceId;
using Base::DownCast;
protected:
bool m_resetSGDMomentumAfterAggregation;
bool m_useNesterovMomentum;
double m_blockLearningRate;
double m_blockMomentumAsTimeConstantPerWorker;
size_t m_syncPeriodPerWorker;
map < wstring, shared_ptr<Matrix<ElemType>>> m_prevParameters; // parameters at the last model aggregation point
map < wstring, shared_ptr<Matrix<ElemType>>> m_blockLevelSmoothedGradient;
public:
BlockMomentumSGD(const MPIWrapperPtr& pMPI, size_t reportFreq, DEVICEID_TYPE devID,
bool useNestrovMomentum, bool resetSGDM,
double blockLearningRate,
double blockMomentumAsTimeConstant, size_t syncPeriod)
:IMASGD<ElemType>(pMPI, reportFreq, devID)
{
m_syncPeriodPerWorker = syncPeriod / pMPI->NumNodesInUse();
m_blockMomentumAsTimeConstantPerWorker = blockMomentumAsTimeConstant / pMPI->NumNodesInUse();
m_useNesterovMomentum = useNestrovMomentum;
m_resetSGDMomentumAfterAggregation = resetSGDM;
m_blockLearningRate = blockLearningRate;
}
/*virtual*/ void OnEpochStart(const std::list<ComputationNodeBasePtr>& LearnableNodes) override
{
Base::OnEpochStart(LearnableNodes);
for (auto& pNode : LearnableNodes)
{
auto pnode = DownCast(pNode);
wstring name = pNode->NodeName();
Matrix<ElemType>& NodeValue = pnode->Value();
if (m_blockLevelSmoothedGradient.find(name) == m_blockLevelSmoothedGradient.end())
{
// has not been initialized yet
auto pSmoothedGrad = make_shared<Matrix<ElemType>> (NodeValue.GetDeviceId());
pSmoothedGrad->Resize(NodeValue.GetNumRows(), NodeValue.GetNumCols());
pSmoothedGrad->SetValue((ElemType)0);
m_blockLevelSmoothedGradient[name] = pSmoothedGrad;
}
if (m_prevParameters.find(name) == m_prevParameters.end())
{
auto pValue = make_shared<Matrix<ElemType>> (NodeValue.GetDeviceId());
pValue->SetValue(NodeValue);
m_prevParameters[name] = pValue;
}
else
{
m_prevParameters[name]->SetValue(NodeValue);
}
}
fprintf(stderr, "Parallel training (%d workers) using BlockMomentumSGD with "
"block momentum = %6.4f, "
"block momentum time constant (per worker) = %6.4f, "
"block learning rate = %6.4f, "
"block size per worker = %d samples, "
"%s"
"%s"
"\n",
(int)m_pMPI->NumNodesInUse(),
BlockMomentumSGD<double>::TimeConstant2Momentum(m_blockMomentumAsTimeConstantPerWorker, m_syncPeriodPerWorker),
m_blockMomentumAsTimeConstantPerWorker,
m_blockLearningRate,
(int)m_syncPeriodPerWorker,
m_useNesterovMomentum ? "using Nesterov-style block momentum, " : "" ,
m_resetSGDMomentumAfterAggregation ? "resetting SGD momentum after sync." : "."
);
}
/*virtual*/ void OnEpochEnd(const std::list<ComputationNodeBasePtr>& LearnableNodes,
std::list<Matrix<ElemType>>& smoothedGradient,
size_t samplesSinceLastSync) override
{
Base::OnEpochEnd(LearnableNodes, smoothedGradient, samplesSinceLastSync);
}
/*virtual*/ void ModelAggregationProcessing(
size_t samplesSinceLastSync,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradient,
size_t& totalSamplesProcessed,
float& secondsOnCommunication
) override
{
//----------------------------------------
// 1. communicate with other nodes to negotiate contribution weights
//----------------------------------------
int nTotalSamples = samplesSinceLastSync;
ElemType blockMomentum = (ElemType)BlockMomentumSGD<double>::TimeConstant2Momentum(m_blockMomentumAsTimeConstantPerWorker, m_syncPeriodPerWorker);
Timer commTimer;
secondsOnCommunication = 0.0f;
commTimer.Start();
m_pMPI->AllReduce(&nTotalSamples, 1);
commTimer.Stop();
secondsOnCommunication += (float)commTimer.ElapsedSeconds();
totalSamplesProcessed = nTotalSamples;
for (auto& pBaseNode : learnableNodes)
{
if (!pBaseNode->IsParameterUpdateRequired())
{
continue;
}
wstring name = pBaseNode->NodeName();
// 2 block gradient aggregation
auto pNode = DownCast(pBaseNode);
// 2.1. get current model
Matrix<ElemType>& prevWeight = *m_prevParameters[name]; // prev model value
Matrix<ElemType>& currentWeight = pNode->Value(); // current model
// 2.1.2. subtract it from the previous model
Matrix<ElemType> blockGrad(prevWeight.DeepClone());
blockGrad -= currentWeight; // blockGrad becomes the local block gradient (of one worker)
// 2.1.3. send block gradient over MPI nodes;
unique_ptr<ElemType[]> px(blockGrad.CopyToArray());
size_t nx = blockGrad.GetNumElements();
// 2.1.4. inplace sum
commTimer.Restart();
m_pMPI->AllReduce(px.get(), nx);
commTimer.Stop();
secondsOnCommunication += (float)commTimer.ElapsedSeconds();
// 2.1.5. global block gradient
blockGrad.SetValue(blockGrad.GetNumRows(),
blockGrad.GetNumCols(),
blockGrad.GetDeviceId(),
px.get()
);
// 2.2. model update
{
// alias for better readability
Matrix<ElemType>& smoothedGradientUpdate = *m_blockLevelSmoothedGradient[name]; // smoothed gradient
// 2.2.1 update block level smoothed gradient;
// This is essentially a first-order infinite impulse response (IIR) filter with the gain (1 - blockMomentum)*m_blockLearningRate:
// smoothedGradientUpdate(t)=blockMomentum * smoothedGradients(t-1) + (1 - blockMomentum)*m_blockLearningRate*blockGrad(t)
Matrix<ElemType>::ScaleAndAdd((ElemType)((1 - blockMomentum)*m_blockLearningRate), blockGrad, (ElemType)blockMomentum, smoothedGradientUpdate);
// 2.2.2 update parameters;
currentWeight.SetValue(prevWeight);
currentWeight -= smoothedGradientUpdate;
// 2.2.3 Nesterov Momentum
// A Nesterov momentum here is to do a partial weight update before calculating the gradient, i.e.,
// (step 1) w(t) <-- w(t) - \eta* v(t)
// (step 2) g(t+1) <-- forwardbackward on minibatches with initial model as w(t)
// (step 3) v(t+1) <-- \eta*v(t) + (1-\eta)*learningRate*g(t+1)
// (step 4) w(t+1) <-- w(t)-v(t)
// (step 5) t <-- t+1
// without step 1, this becomes standard momentum
if (m_useNesterovMomentum)
{
Matrix<ElemType>::ScaleAndAdd((ElemType)-blockMomentum, smoothedGradientUpdate, currentWeight);
}
// 2.2.4 update bookkeeping
prevWeight.SetValue(currentWeight);
}
}
//----------------------------------------
// 3. reset SGD momentum if necessary
//----------------------------------------
if (m_resetSGDMomentumAfterAggregation)
{
for (Matrix<ElemType>& x : smoothedGradient)
{
x.SetValue((ElemType)0);
}
}
}
/*virtual*/ void SaveToCheckPoint(File& fstream) override
{
if (m_pMPI->IsMainNode())
{
fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BMACKP");
fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BOptions");
fstream << m_resetSGDMomentumAfterAggregation;
fstream.PutMarker(FileMarker::fileMarkerEndSection, L"EOptions");
fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BMomentumAsTimeConstant");
fstream << m_blockMomentumAsTimeConstantPerWorker;
fstream.PutMarker(FileMarker::fileMarkerEndSection, L"EMomentumAsTimeConstant");
fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BSyncPeriodInSamples");
fstream << m_syncPeriodPerWorker;
fstream.PutMarker(FileMarker::fileMarkerEndSection, L"ESyncPeriodInSamples");
fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BParam");
SaveParameters(fstream, m_prevParameters);
SaveParameters(fstream, m_blockLevelSmoothedGradient);
fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"EParam");
fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"EMACKP");
}
}
/*virtual*/ void LoadFromCheckPoint(File& fstream) override
{
if (fstream.TryGetMarker(FileMarker::fileMarkerBeginSection, L"BMACKP"))
{
fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BOptions");
fstream >> m_resetSGDMomentumAfterAggregation;
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EOptions");
fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BMomentumAsTimeConstant");
fstream >> m_blockMomentumAsTimeConstantPerWorker;
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EMomentumAsTimeConstant");
fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BSyncPeriodInSamples");
fstream >> m_syncPeriodPerWorker;
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ESyncPeriodInSamples");
fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BParam");
LoadParameters(fstream, m_prevParameters, m_deviceId);
LoadParameters(fstream, m_blockLevelSmoothedGradient, m_deviceId);
fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"EParam");
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EMACKP");
}
}
private:
// helper function to save/load map<wstring, shared_ptr<Matrix<ElemType>> structure
void SaveParameters(File& f, const map<wstring, shared_ptr<Matrix<ElemType>>>& parameters) const
{
// save sizeof(ElemType)
unsigned int size = sizeof(ElemType);
f << size;
// save number of pairs
unsigned int numPairs = parameters.size();
f << numPairs;
for (auto& x : parameters)
{
f << x.first;
f << *x.second;
}
f.Flush();
return;
}
void LoadParameters(File& f, map<wstring, shared_ptr<Matrix<ElemType>>>& parameters, DEVICEID_TYPE deviceID)
{
unsigned int size = 0;
unsigned int pair = 0;
f >> size;
f >> pair;
if (size != sizeof(ElemType))
{
LogicError("Mismatched ElemType in loading BlockMomentumSGD checkpoint. Expecting %s, while loading element size=%d\n",
sizeof(ElemType) == 4 ? "float" : "double",
size
);
}
parameters.clear();
for (size_t i = 0; i < pair; i++)
{
wstring name;
f >> name;
shared_ptr<Matrix<ElemType>> mat = make_shared<Matrix<ElemType>>(deviceID);
f >> *mat;
parameters[name] = mat;
}
}
public:
static double TimeConstant2Momentum(double timeConstant, size_t syncPeriod)
{
return exp(-((double)syncPeriod) / timeConstant);
}
static double Momentum2TimeConstant(double bm, size_t syncPeriod)
{
if (bm >= 1.0 || bm < 0.0)
{
InvalidArgument("Unexpected block momentum (%.2f). Block momentum should be in the range of [0,1)\n", bm);
}
return -(double)syncPeriod / log(bm);
}
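// Illustrative example: with a sync period of 120000 samples and a time constant of 120000 samples,
// TimeConstant2Momentum returns exp(-1) ~= 0.368; Momentum2TimeConstant(0.368, 120000) recovers a time
// constant of roughly 120000 samples.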
};
} } }

View file

@ -0,0 +1,87 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "ColumnQuantizer.h"
#include "QuantizedMatrix.h"
#include "MatrixQuantizerImpl.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// This type does the quantization on a matrix
// This is a technique to reduce the cost of communicating
// the gradient matrices during aggregation across all nodes in
// data-parallel SGD training, at the end of each minibatch.
// Refer to this paper http://research.microsoft.com/apps/pubs/?id=230137
// for details.
class MatrixQuantizerBase
{};
template <class ElemType>
class MatrixQuantizer final : public MatrixQuantizerBase
{
public:
MatrixQuantizer(size_t numRows, size_t numCols, int deviceId, bool useAsync) : MatrixQuantizer(deviceId, useAsync)
{
m_residual = std::make_shared<Matrix<ElemType>>(numRows, numCols, deviceId, DENSE);
}
MatrixQuantizer(int deviceId, bool useAsync) : m_residual(nullptr)
{
m_quantizerImpl.reset(MatrixQuantizerImpl<ElemType>::Create(deviceId, useAsync));
}
// Disallow copy and move construction and assignment
DISABLE_COPY_AND_MOVE(MatrixQuantizer);
void QuantizeAsync(const Matrix<ElemType>& inMatrix, QuantizedMatrix<ElemType>& outQMatrix, bool zeroThresholdFor1Bit)
{
m_quantizerImpl->QuantizeAsync(inMatrix, *m_residual, outQMatrix, *m_residual, zeroThresholdFor1Bit);
}
void QuantizeAsync(const Matrix<ElemType>& inMatrix, const Matrix<ElemType>& inResidual, QuantizedMatrix<ElemType>& outQMatrix, Matrix<ElemType>& outResidual, bool zeroThresholdFor1Bit)
{
m_quantizerImpl->QuantizeAsync(inMatrix, inResidual, outQMatrix, outResidual, zeroThresholdFor1Bit);
}
void WaitQuantizeAsyncDone()
{
m_quantizerImpl->WaitQuantizeAsyncDone();
}
void UnquantizeAsync(QuantizedMatrix<ElemType>& inQMatrix, Matrix<ElemType>& outMatrix, bool add = false)
{
m_quantizerImpl->UnquantizeAsync(inQMatrix, outMatrix, add);
}
void WaitUnquantizeAsyncDone()
{
m_quantizerImpl->WaitUnquantizeAsyncDone();
}
int GetDeviceId() const
{
return m_quantizerImpl->GetDeviceId();
}
void ResetResidue()
{
m_residual->SetValue(0.0);
}
const Matrix<ElemType>& GetResidualMatrix() const
{
return *m_residual;
}
private:
std::unique_ptr<MatrixQuantizerImpl<ElemType>> m_quantizerImpl;
// the residual matrix
std::shared_ptr<Matrix<ElemType>> m_residual;
};
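// Illustrative usage sketch (hypothetical dense matrices 'grad' and 'aggregated' and a pre-sized
// QuantizedMatrix 'qGrad'; sizes and device ids are assumed to match):
//   MatrixQuantizer<float> quantizer(nRows, nCols, deviceId, /*useAsync=*/true);
//   quantizer.QuantizeAsync(grad, qGrad, /*zeroThresholdFor1Bit=*/true); // residual is kept internally
//   quantizer.WaitQuantizeAsyncDone();
//   ... exchange qGrad with the other workers ...
//   quantizer.UnquantizeAsync(qGrad, aggregated, /*add=*/true);
//   quantizer.WaitUnquantizeAsyncDone();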
} } }

View file

@ -0,0 +1,99 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include <vector>
#include "CNTKLibrary.h"
#include "DistributedLearnerBase.h"
#include "PerformanceProfiler.h"
namespace CNTK
{
///
/// Quantized data-parallel distributed learner.
///
class QuantizedDataParallelDistributedLearner : public DistributedLearnerBase
{
public:
QuantizedDataParallelDistributedLearner(QuantizedDistributedCommunicatorPtr communicator, LearnerPtr learner, size_t distributeAfterSamples, bool useAsyncBufferedParameterUpdate)
: DistributedLearnerBase(communicator, learner, distributeAfterSamples)
{
if (useAsyncBufferedParameterUpdate)
LogicError("Asynchronous parameter update is not yet supported.");
}
// Optional override that gets called per minibatch after finishing gradient computation but before updating model parameters
bool Update(std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, MinibatchInfo& info) override
{
if (m_sampleCount >= m_distributeAfterSamples)
{
auto profGradientAgg = Microsoft::MSR::CNTK::ScopeProfile(Microsoft::MSR::CNTK::profilerEvtMainGradient);
if (info.IsEmpty())
PrepaireZeroGradients(gradientValues);
ConvertToOrdered(gradientValues, m_gradientBuffer);
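// The per-minibatch header (eval criterion, training loss, sample count) is aggregated with a plain
// all-reduce; the quantized exchange below is applied only to the gradients themselves.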
std::vector<NDArrayViewPtr> headerToAggregate;
headerToAggregate.push_back(info.evalCriterionValue);
headerToAggregate.push_back(info.trainingLossValue);
auto value = MakeSharedObject<NDArrayView>(static_cast<double>(info.numberOfSamples), NDShape{ 1 }, DeviceDescriptor::CPUDevice());
headerToAggregate.push_back(value);
m_communicator->AggregateInPlace(headerToAggregate, m_communicator->Workers());
info.numberOfSamples = static_cast<size_t>(*headerToAggregate.back()->DataBuffer<double>());
std::vector<NDArrayViewPtr> gradients;
for (const auto& i : m_gradientBuffer)
gradients.push_back(i.second);
m_gradientBuffer.clear();
dynamic_cast<QuantizedDistributedCommunicator*>(m_communicator.get())->QuantizedAggregateInPlace(
gradients,
m_residuals,
m_stripeResiduals,
m_communicator->Workers());
}
auto profWeights = Microsoft::MSR::CNTK::ScopeProfile(Microsoft::MSR::CNTK::profilerEvtMainWeights);
m_sampleCount += info.numberOfSamples;
if (info.IsEmpty())
return false;
return m_learner->Update(gradientValues, info.numberOfSamples, info.atEndOfSweep);
}
// Optionally overridable method to get checkpoint state associated with this Distributed train method
Dictionary CreateCheckpoint() override
{
// Resetting the residuals.
// We do this to make sure that the returned checkpoint state is consistent with the in-memory state, since we do not checkpoint the residuals.
for (size_t i = 0; i < m_residuals.size(); ++i)
if (m_residuals[i]->GetDataType() == DataType::Double)
m_residuals[i]->SetValue(0.0);
else
m_residuals[i]->SetValue(0.0f);
for (size_t i = 0; i < m_stripeResiduals.size(); ++i)
if (m_stripeResiduals[i])
if (m_stripeResiduals[i]->GetDataType() == DataType::Double)
m_stripeResiduals[i]->SetValue(0.0);
else
m_stripeResiduals[i]->SetValue(0.0f);
return DistributedLearnerBase::CreateCheckpoint();
}
private:
// Residuals of quantized gradients.
std::vector<NDArrayViewPtr> m_residuals;
// Residuals of quantized aggregated stripes this node is responsible for.
std::vector<NDArrayViewPtr> m_stripeResiduals;
};
}

View file

@ -0,0 +1,567 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "Basics.h"
#include "MPIWrapper.h"
#include "CNTKLibrary.h"
#include "MatrixQuantizerImpl.h"
#include "MatrixQuantizer.h"
#include "CUDAPageLockedMemAllocator.h"
#include "Utils.h"
#include "DistributedCommunicator.h"
namespace Microsoft { namespace MSR { namespace CNTK {
class MatrixQuantizerBase;
class QuantizedMatrixBase;
typedef std::shared_ptr<QuantizedMatrixBase> QuantizedMatrixBasePtr;
class CUDAPageLockedMemAllocator;
} } }
namespace CNTK
{
class QuantizedMPICommunicatorImpl final : public MPICommunicatorImpl, public QuantizedDistributedCommunicator
{
using Base = MPICommunicatorImpl;
template<class T> using vector = std::vector<T>;
template<class T> using shared_ptr = std::shared_ptr<T>;
template<class T> using unordered_set = std::unordered_set<T>;
using MpiFail = Microsoft::MSR::CNTK::MpiFail;
using QuantizedMatrixBase = Microsoft::MSR::CNTK::QuantizedMatrixBase;
using QuantizedMatrixBasePtr = shared_ptr<QuantizedMatrixBase>;
using MatrixQuantizerBase = Microsoft::MSR::CNTK::MatrixQuantizerBase;
using CUDAPageLockedMemAllocator = Microsoft::MSR::CNTK::CUDAPageLockedMemAllocator;
template<class T> using MatrixQuantizer = Microsoft::MSR::CNTK::MatrixQuantizer<T>;
template<class T> using QuantizedMatrix = Microsoft::MSR::CNTK::QuantizedMatrix<T>;
template<class T> using Matrix = Microsoft::MSR::CNTK::Matrix<T>;
public:
QuantizedMPICommunicatorImpl(bool zeroThresholdFor1Bit, bool useQuantizationForSelfStripe, size_t numQuantizationBits)
: m_zeroThresholdFor1Bit(zeroThresholdFor1Bit), m_useQuantizationForSelfStripe(useQuantizationForSelfStripe), m_numQuantizationBits(numQuantizationBits)
{}
void QuantizedAggregateInPlace(
std::vector<NDArrayViewPtr>& inValues,
std::vector<NDArrayViewPtr>& valueQuantizationResidues,
std::vector<NDArrayViewPtr>& stripeQuantizationResidues,
const std::unordered_set<DistributedWorkerDescriptor>& sendToWorkers) override
{
QuantizedAggregate(
inValues, valueQuantizationResidues, stripeQuantizationResidues,
inValues, valueQuantizationResidues, stripeQuantizationResidues,
sendToWorkers);
}
// A collective communication API to perform quantized aggregation of values across all workers of this communicator
void QuantizedAggregate(
const vector<NDArrayViewPtr>& inValues,
const vector<NDArrayViewPtr>& valueQuantizationResidues,
const vector<NDArrayViewPtr>& stripeQuantizationResidues,
vector<NDArrayViewPtr>& aggregatedOutputs,
vector<NDArrayViewPtr>& newQuantizationResidues,
vector<NDArrayViewPtr>& newStripeQuantizationResidues,
const unordered_set<DistributedWorkerDescriptor>& sendToWorkers) override
{
CheckWorkers(sendToWorkers);
if (Workers().size() == 1) // No need to aggregate anything.
{
aggregatedOutputs = inValues;
newQuantizationResidues = valueQuantizationResidues;
newStripeQuantizationResidues = stripeQuantizationResidues;
return;
}
if (inValues.empty())
return;
DataType dataType = inValues.front()->GetDataType();
for (const auto& v : inValues)
{
if (v->GetDataType() != dataType)
RuntimeError("Currently values of different types are not supported for quantize.");
}
if (dataType == DataType::Float)
QuantizedAggregate<float>(inValues, valueQuantizationResidues, stripeQuantizationResidues, aggregatedOutputs, newQuantizationResidues, newStripeQuantizationResidues, sendToWorkers);
else if (dataType == DataType::Double)
QuantizedAggregate<double>(inValues, valueQuantizationResidues, stripeQuantizationResidues, aggregatedOutputs, newQuantizationResidues, newStripeQuantizationResidues, sendToWorkers);
else
LogicError("Unexpected type value.");
}
// Redefining inherited members.
// TODO: Use using and virtual inheritance after switching to VS2015.
const std::unordered_set<DistributedWorkerDescriptor>& Workers() const override { return Base::Workers(); }
const DistributedWorkerDescriptor& CurrentWorker() const override { return Base::CurrentWorker(); }
DistributedCommunicatorPtr SubGroup(const std::unordered_set<DistributedWorkerDescriptor>& g) const override { return Base::SubGroup(g); }
void Concatenate(
const std::vector<ValuePtr>& in,
std::vector<ValuePtr>& out,
const std::unordered_set<DistributedWorkerDescriptor>& w) override
{
Base::Concatenate(in, out, w);
}
void AggregateInPlace(
const std::vector<NDArrayViewPtr>& values,
const std::unordered_set<DistributedWorkerDescriptor>& sendToWorkers) override
{
Base::AggregateInPlace(values, sendToWorkers);
}
void Aggregate(
const std::vector<NDArrayViewPtr>& values,
std::vector<NDArrayViewPtr>& outputValues,
const std::unordered_set<DistributedWorkerDescriptor>& sendToWorkers) override
{
Base::Aggregate(values, outputValues, sendToWorkers);
}
void Barrier() override
{
Base::Barrier();
}
virtual void Concatenate(
const std::vector<NDArrayViewPtr>& input,
std::vector<NDArrayViewPtr>& output,
const std::unordered_set<DistributedWorkerDescriptor>& sendToWorkers) override
{
Base::Concatenate(input, output, sendToWorkers);
}
virtual void Gather(
const Dictionary& input,
std::vector<DictionaryPtr>& output,
const std::unordered_set<DistributedWorkerDescriptor>& sendToWorkers) override
{
Base::Gather(input, output, sendToWorkers);
}
private:
struct Stripe
{
size_t m_startCol;
size_t m_numCols;
};
// Determine which stripe of the gradient is this node responsible for
Stripe GetStripeForNode(size_t numCols, size_t nodeRank, size_t numNodes)
{
size_t numColsPerNode = numCols / numNodes;
size_t residue = numCols % numNodes;
size_t startColNumofStripe = (numColsPerNode * nodeRank) + min(residue, nodeRank);
size_t numColsinStripe = numColsPerNode + ((nodeRank < residue) ? 1 : 0);
return Stripe{ startColNumofStripe, numColsinStripe };
}
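// Illustrative example: numCols = 10, numNodes = 3 yields the stripes
// node 0: cols [0, 4), node 1: cols [4, 7), node 2: cols [7, 10).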
template <typename ElementType>
MatrixQuantizer<ElementType>& GetQuantizer(const shared_ptr<MatrixQuantizerBase>& quantizer)
{
return static_cast<MatrixQuantizer<ElementType>&>(*quantizer);
}
template <typename ElementType>
QuantizedMatrix<ElementType>& GetQuantizedMatrix(QuantizedMatrixBase& matrix)
{
return static_cast<QuantizedMatrix<ElementType>&>(matrix);
}
void InitializeBuffers(
const vector<NDArrayViewPtr>& inValues,
vector<NDArrayViewPtr>& valueQuantizationResidues,
vector<NDArrayViewPtr>& stripeQuantizationResidues,
vector<NDArrayViewPtr>& aggregatedOutputs,
vector<NDArrayViewPtr>& newQuantizationResidues,
vector<NDArrayViewPtr>& newStripeQuantizationResidues)
{
m_preAggregatedGradientQuantizers.resize(std::max(inValues.size(), valueQuantizationResidues.size()));
if (inValues.size() != m_preAggregatedGradientQuantizers.size())
LogicError("Number of aggregated values should be equal number of quantized residuals.");
m_quantizedGradients.resize(inValues.size());
m_aggregatedGradientStripeQuantizers.resize(std::max(inValues.size(), stripeQuantizationResidues.size()));
if (inValues.size() != m_aggregatedGradientStripeQuantizers.size())
LogicError("Number of aggregated values should be equal number of striped quantized residuals.");
m_recvGradientStripesQuantized.resize(inValues.size());
if (valueQuantizationResidues.empty())
valueQuantizationResidues.resize(inValues.size());
if (stripeQuantizationResidues.empty())
stripeQuantizationResidues.resize(inValues.size());
if (newQuantizationResidues.empty())
newQuantizationResidues.resize(inValues.size());
if (newStripeQuantizationResidues.empty())
newStripeQuantizationResidues.resize(inValues.size());
for (auto i = 0; i < inValues.size(); ++i)
{
auto view = inValues[i];
// Make sure none of the values are sparse - we currently do not support aggregation of sparse matrices
if (view->GetStorageFormat() != StorageFormat::Dense)
RuntimeError("Aggregation for sparse matrices is currently not supported!");
// TODO: we currently always use async aggregation; verify that this is the intended behavior.
if (view->GetDataType() == DataType::Float)
InitializeBuffer<float>(inValues, valueQuantizationResidues, stripeQuantizationResidues, aggregatedOutputs, newQuantizationResidues, newStripeQuantizationResidues, i);
else if (view->GetDataType() == DataType::Double)
InitializeBuffer<double>(inValues, valueQuantizationResidues, stripeQuantizationResidues, aggregatedOutputs, newQuantizationResidues, newStripeQuantizationResidues, i);
else
LogicError("Unsupported type");
}
}
template<class ElemType>
void InitializeBuffer(
const vector<NDArrayViewPtr>& inValues,
vector<NDArrayViewPtr>& valueQuantizationResidues,
vector<NDArrayViewPtr>& stripeQuantizationResidues,
vector<NDArrayViewPtr>& /*aggregatedOutputs*/,
vector<NDArrayViewPtr>& newQuantizationResidues,
vector<NDArrayViewPtr>& newStripeQuantizationResidues,
size_t index)
{
int rank = static_cast<int>(CurrentWorker().m_globalRank);
int numWorkers = static_cast<int>(Workers().size());
auto value = inValues[index];
auto v = GetMatrix<ElemType>(value);
size_t nRow = v->GetNumRows();
size_t nCol = v->GetNumCols();
if (!valueQuantizationResidues[index])
{
auto residual = MakeSharedObject<NDArrayView>(AsDataType<ElemType>(), NDShape{ nRow, nCol }, AsDeviceDescriptor(v->GetDeviceId()));
auto outputResidual = MakeSharedObject<NDArrayView>(AsDataType<ElemType>(), NDShape{ nRow, nCol }, AsDeviceDescriptor(v->GetDeviceId()));
valueQuantizationResidues[index] = residual;
newQuantizationResidues[index] = outputResidual;
}
Stripe stripe = GetStripeForNode(v->GetNumCols(), rank, numWorkers);
if (!stripeQuantizationResidues[index] && stripe.m_numCols > 0)
{
auto residual = MakeSharedObject<NDArrayView>(::CNTK::AsDataType<ElemType>(), NDShape{ nRow, stripe.m_numCols }, AsDeviceDescriptor(v->GetDeviceId()));
auto outputResidual = MakeSharedObject<NDArrayView>(::CNTK::AsDataType<ElemType>(), NDShape{ nRow, stripe.m_numCols }, AsDeviceDescriptor(v->GetDeviceId()));
stripeQuantizationResidues[index] = residual;
newStripeQuantizationResidues[index] = outputResidual;
}
auto inResidual = valueQuantizationResidues[index];
// Initialize buffer.
m_quantizedGradients[index] = std::make_shared<QuantizedMatrix<ElemType>>(v->GetNumRows(), v->GetNumCols(), m_numQuantizationBits, CPUDEVICE, m_allocator.get());
// Initialize gradient quantizer.
m_preAggregatedGradientQuantizers[index] = std::make_shared<MatrixQuantizer<ElemType>>(GetMatrix<ElemType>(inResidual)->GetDeviceId(), true);
// Determine which stripe of the gradient is this node responsible for
MatrixQuantizer<ElemType>* aggregatedGradientStripeQuantizers = nullptr;
if (stripe.m_numCols > 0)
{
// Initialize quantizer
aggregatedGradientStripeQuantizers = new MatrixQuantizer<ElemType>(GetMatrix<ElemType>(inResidual)->GetDeviceId(), true);
m_recvGradientStripesQuantized[index].resize(numWorkers - 1);
for (size_t j = 0; j < numWorkers - 1; ++j)
m_recvGradientStripesQuantized[index][j]= std::unique_ptr<QuantizedMatrix<ElemType>>(new QuantizedMatrix<ElemType>(v->GetNumRows(), stripe.m_numCols, m_numQuantizationBits, CPUDEVICE, m_allocator.get()));
}
m_aggregatedGradientStripeQuantizers[index] = std::unique_ptr<MatrixQuantizer<ElemType>>(aggregatedGradientStripeQuantizers);
}
template<class ElemType>
void QuantizedAggregate(
const vector<NDArrayViewPtr>& inValues,
const vector<NDArrayViewPtr>& formalValueQuantizationResidues,
const vector<NDArrayViewPtr>& formalStripeQuantizationResidues,
vector<NDArrayViewPtr>& aggregatedOutputs,
vector<NDArrayViewPtr>& newQuantizationResidues,
vector<NDArrayViewPtr>& newStripeQuantizationResidues,
const unordered_set<DistributedWorkerDescriptor>& sendToWorkers)
{
CheckWorkers(sendToWorkers);
const int numWorkers = static_cast<int>(Workers().size());
const int rank = static_cast<int>(CurrentWorker().m_globalRank);
auto valueQuantizationResidues = formalValueQuantizationResidues;
auto stripeQuantizationResidues = formalStripeQuantizationResidues;
InitializeBuffers(
inValues,
valueQuantizationResidues,
stripeQuantizationResidues,
aggregatedOutputs,
newQuantizationResidues,
newStripeQuantizationResidues);
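// Overall flow of the quantized all-reduce (each worker owns one column stripe of every gradient):
// 1. Quantize the local gradients and send each stripe to the worker that owns it.
// 2. Receive the corresponding stripes from all peers, unquantize, and accumulate them into the owned stripe.
// 3. Re-quantize the aggregated stripe and send it back to all peers.
// 4. Receive the aggregated stripes owned by the other workers and unquantize them into the outputs.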
vector<shared_ptr<Matrix<ElemType>>> inputValues;
vector<shared_ptr<Matrix<ElemType>>> outputValues;
vector<shared_ptr<Matrix<ElemType>>> inputResiduals;
vector<shared_ptr<Matrix<ElemType>>> outputResiduals;
vector<shared_ptr<Matrix<ElemType>>> inputStripeResiduals;
vector<shared_ptr<Matrix<ElemType>>> outputStripeResiduals;
// Check that input corresponds to output and convert NDArrayViews to the corresponding matrices.
for (size_t i = 0; i < inValues.size(); i++)
{
assert(inValues[i]->Shape().TotalSize() == aggregatedOutputs[i]->Shape().TotalSize());
assert(inValues[i]->GetDataType() == aggregatedOutputs[i]->GetDataType());
assert(inValues[i]->Device() == aggregatedOutputs[i]->Device());
assert(inValues[i] != nullptr);
inputValues.push_back(GetWritableMatrix<ElemType>(inValues[i]));
assert(aggregatedOutputs[i] != nullptr);
outputValues.push_back(GetWritableMatrix<ElemType>(aggregatedOutputs[i]));
assert(valueQuantizationResidues[i] != nullptr);
inputResiduals.push_back(GetWritableMatrix<ElemType>(valueQuantizationResidues[i]));
assert(newQuantizationResidues[i] != nullptr);
outputResiduals.push_back(GetWritableMatrix<ElemType>(newQuantizationResidues[i]));
// Stripe residuals can be null when the stripe does not belong to this node.
inputStripeResiduals.push_back(stripeQuantizationResidues[i] ? GetWritableMatrix<ElemType>(stripeQuantizationResidues[i]) : nullptr);
outputStripeResiduals.push_back(newStripeQuantizationResidues[i]? GetWritableMatrix<ElemType>(newStripeQuantizationResidues[i]) : nullptr);
}
// Prepare receiving buffers.
vector<std::unique_ptr<Matrix<ElemType>>> aggGradStripes;
vector<std::unique_ptr<QuantizedMatrix<ElemType>>> aggGradStripesQuantized;
for (size_t i = 0; i < inputValues.size(); i++)
{
size_t nCol = inputValues[i]->GetNumCols();
// Determine which stripe of the gradient is this node responsible for
Stripe stripe = GetStripeForNode(nCol, rank, numWorkers);
Matrix<ElemType>* currAggGradStripe = nullptr;
QuantizedMatrix<ElemType>* currAggGradStripeQuantized = nullptr;
if (stripe.m_numCols > 0)
{
currAggGradStripe = new Matrix<ElemType>(inputValues[i]->ColumnSlice(stripe.m_startCol, stripe.m_numCols));
currAggGradStripeQuantized = new QuantizedMatrix<ElemType>(GetQuantizedMatrix<ElemType>(*m_quantizedGradients[i]).ColumnSlice(stripe.m_startCol, stripe.m_numCols));
}
aggGradStripes.push_back(std::unique_ptr<Matrix<ElemType>>(currAggGradStripe));
aggGradStripesQuantized.push_back(std::unique_ptr<QuantizedMatrix<ElemType>>(currAggGradStripeQuantized));
}
// Initiate quantization of the gradient matrices
for (size_t i = 0; i < inValues.size(); ++i)
GetQuantizer<ElemType>(m_preAggregatedGradientQuantizers[i]).QuantizeAsync(*(inputValues[i]), *(inputResiduals[i]), GetQuantizedMatrix<ElemType>(*(m_quantizedGradients[i])), *(outputResiduals[i]), m_zeroThresholdFor1Bit);
// Initiate receive of the stripe to be aggregated by the current node, from all other nodes
vector<MPI_Request> recvGradStripesQuantizedRequests;
vector<int> recvRequestIdxToGradientMatrixIdxMap;
for (int i = 0; i < inputValues.size(); ++i)
{
Stripe stripe = GetStripeForNode(inputValues[i]->GetNumCols(), rank, numWorkers);
if (stripe.m_numCols > 0)
{
recvRequestIdxToGradientMatrixIdxMap.push_back(i);
for (int j = 0; j < numWorkers - 1; ++j)
{
int source = (j >= rank) ? (j + 1) : j;
recvGradStripesQuantizedRequests.push_back(MPI_Request());
int recvRequestIdx = (int)recvGradStripesQuantizedRequests.size() - 1;
m_mpi->Irecv(GetQuantizedMatrix<ElemType>(*m_recvGradientStripesQuantized[i][j]).Buffer(), (int)GetQuantizedMatrix<ElemType>(*m_recvGradientStripesQuantized[i][j]).GetSize(), MPI_CHAR, source, i, &(recvGradStripesQuantizedRequests[recvRequestIdx])) || MpiFail("MPI_Irecv");
}
}
}
// Asynchronously send stripes of the quantized gradient matrices to the respective nodes that own aggregation of that stripe
std::vector<std::vector<MPI_Request>> sendGradStripesQuantizedRequests(inValues.size());
for (int i = 0; i < inValues.size(); ++i)
{
GetQuantizer<ElemType>(m_preAggregatedGradientQuantizers[i]).WaitQuantizeAsyncDone();
size_t sendRequestIdx = 0;
for (int j = 0; j < numWorkers; ++j)
{
Stripe stripe = GetStripeForNode(inputValues[i]->GetNumCols(), j, numWorkers);
if (stripe.m_numCols > 0)
{
// Do not send stripe for self
if (j != rank)
{
sendGradStripesQuantizedRequests[i].push_back(MPI_Request());
QuantizedMatrix<ElemType> quantizedStripe = GetQuantizedMatrix<ElemType>(*m_quantizedGradients[i]).ColumnSlice(stripe.m_startCol, stripe.m_numCols);
m_mpi->Isend(quantizedStripe.Buffer(), (int)quantizedStripe.GetSize(), MPI_CHAR, j, i, &(sendGradStripesQuantizedRequests[i][sendRequestIdx])) || MpiFail("MPI_Isend");
sendRequestIdx++;
}
else
{
// Initialize the aggregate for the stripe with the quantized gradients instead of the original
// gradients themselves, if so desired
if (m_useQuantizationForSelfStripe)
{
QuantizedMatrix<ElemType> preAggGradSelfStripeQuantized = GetQuantizedMatrix<ElemType>(*m_quantizedGradients[i]).ColumnSlice(stripe.m_startCol, stripe.m_numCols);
GetQuantizer<ElemType>(m_aggregatedGradientStripeQuantizers[i]).UnquantizeAsync(preAggGradSelfStripeQuantized, *(aggGradStripes[i]), false);
}
}
}
}
}
// Wait for the stripes to arrive from each node and unquantize and aggregate
size_t numReceivesExpected = recvGradStripesQuantizedRequests.size();
size_t numActualReceives = 0;
std::vector<int> perGradMatrixReceiveCount(recvRequestIdxToGradientMatrixIdxMap.size(), 0);
while (numActualReceives < numReceivesExpected)
{
int idx = MPI_UNDEFINED;
m_mpi->Waitany((int)recvGradStripesQuantizedRequests.size(), recvGradStripesQuantizedRequests.data(), &idx, MPI_STATUS_IGNORE) || MpiFail("MPI_Waitany");
if (idx == MPI_UNDEFINED)
{
break;
}
numActualReceives++;
int gradMatrixIdxPosition = idx / (numWorkers - 1);
int recvBufferSubIndex = idx % (numWorkers - 1);
// Map idx back to the actual gradient matrix index
int gradMatrixIdx = recvRequestIdxToGradientMatrixIdxMap[gradMatrixIdxPosition];
// Wait for the previous Unquantize to finish before issuing a new one
if (m_useQuantizationForSelfStripe || (perGradMatrixReceiveCount[gradMatrixIdxPosition] > 0))
GetQuantizer<ElemType>(m_aggregatedGradientStripeQuantizers[gradMatrixIdx]).WaitUnquantizeAsyncDone();
GetQuantizer<ElemType>(m_aggregatedGradientStripeQuantizers[gradMatrixIdx]).UnquantizeAsync(
GetQuantizedMatrix<ElemType>(*m_recvGradientStripesQuantized[gradMatrixIdx][recvBufferSubIndex]),
*(aggGradStripes[gradMatrixIdx]),
true);
perGradMatrixReceiveCount[gradMatrixIdxPosition]++;
// Also issue the quantization if this stripe was the last one expected for this matrix
// Note: We issue the quantization without waiting for the unquantization since the same stream
// is used for both and they are implicitly sequenced
// We reuse the buffer that we used for quantizing and sending out the pre-aggregation gradient
if (perGradMatrixReceiveCount[gradMatrixIdxPosition] == (numWorkers - 1))
{
Stripe stripe = GetStripeForNode(inputValues[gradMatrixIdx]->GetNumCols(), rank, numWorkers);
UNUSED(stripe);
assert(stripe.m_numCols > 0);
GetQuantizer<ElemType>(m_aggregatedGradientStripeQuantizers[gradMatrixIdx]).QuantizeAsync(
*(aggGradStripes[gradMatrixIdx]),
*(inputStripeResiduals[gradMatrixIdx]),
*(aggGradStripesQuantized[gradMatrixIdx]),
*(outputStripeResiduals[gradMatrixIdx]),
m_zeroThresholdFor1Bit);
}
}
assert(numActualReceives == numReceivesExpected);
vector<vector<MPI_Request>> recvAggGradStripesQuantizedRequests(inValues.size());
// Initiate receive of stripes of quantized aggregated gradients from different nodes
for (int i = 0; i < inValues.size(); ++i)
{
int recvRequestIdx = 0;
for (int j = 0; j < numWorkers; ++j)
{
// Do not recv stripe for self
if (j != rank)
{
Stripe stripe = GetStripeForNode(inputValues[i]->GetNumCols(), j, numWorkers);
if (stripe.m_numCols > 0)
{
recvAggGradStripesQuantizedRequests[i].push_back(MPI_Request());
QuantizedMatrix<ElemType> quantizedStripe = GetQuantizedMatrix<ElemType>(*m_quantizedGradients[i]).ColumnSlice(stripe.m_startCol, stripe.m_numCols);
m_mpi->Irecv(quantizedStripe.Buffer(), (int)quantizedStripe.GetSize(), MPI_CHAR, j, (int)inValues.size() + 1 + i, &(recvAggGradStripesQuantizedRequests[i][recvRequestIdx])) || MpiFail("MPI_Irecv");
recvRequestIdx++;
}
}
}
}
// Initiate broadcast of quantized aggregated gradient stripes to all other nodes
vector<vector<MPI_Request>> sendAggGradStripeQuantizedRequests(inValues.size());
for (int i = 0; i < inValues.size(); ++i)
{
Stripe stripe = GetStripeForNode(inputValues[i]->GetNumCols(), rank, numWorkers);
if (stripe.m_numCols > 0)
{
sendAggGradStripeQuantizedRequests[i] = std::vector<MPI_Request>(numWorkers - 1);
GetQuantizer<ElemType>(m_aggregatedGradientStripeQuantizers[i]).WaitQuantizeAsyncDone();
for (int j = 0; j < numWorkers - 1; ++j)
{
int dest = (j >= rank) ? (j + 1) : j;
// TODO: Should we use MPI_Bcast instead for better performance
m_mpi->Isend(aggGradStripesQuantized[i]->Buffer(), (int)aggGradStripesQuantized[i]->GetSize(), MPI_CHAR, dest, (int)inValues.size() + 1 + i, &(sendAggGradStripeQuantizedRequests[i][j])) || MpiFail("MPI_Isend");
}
}
}
// Wait to receive all aggregated stripes and unquantize
for (size_t i = 0; i < inValues.size(); ++i)
{
m_mpi->Waitall((int)recvAggGradStripesQuantizedRequests[i].size(), recvAggGradStripesQuantizedRequests[i].data(), MPI_STATUSES_IGNORE) || MpiFail("MPI_Waitall");
GetQuantizer<ElemType>(m_preAggregatedGradientQuantizers[i]).UnquantizeAsync(GetQuantizedMatrix<ElemType>(*m_quantizedGradients[i]), *(outputValues[i]), false);
}
// Wait for all the unquantizations to finish
for (size_t i = 0; i < inValues.size(); ++i)
GetQuantizer<ElemType>(m_preAggregatedGradientQuantizers[i]).WaitUnquantizeAsyncDone();
// Wait for completion of the async send requests
for (int i = 0; i < sendGradStripesQuantizedRequests.size(); ++i)
{
if (sendGradStripesQuantizedRequests[i].size() > 0)
m_mpi->Waitall((int)sendGradStripesQuantizedRequests[i].size(), sendGradStripesQuantizedRequests[i].data(), MPI_STATUSES_IGNORE) || MpiFail("MPI_Waitall");
}
for (int i = 0; i < sendAggGradStripeQuantizedRequests.size(); ++i)
{
if (sendAggGradStripeQuantizedRequests[i].size() > 0)
m_mpi->Waitall((int)sendAggGradStripeQuantizedRequests[i].size(), sendAggGradStripeQuantizedRequests[i].data(), MPI_STATUSES_IGNORE) || MpiFail("MPI_Waitall");
}
}
// option for handling the mean for 1-bit quantization
// force 1-bit quant to threshold against 0 rather than the midpoint between lower and upper
const bool m_zeroThresholdFor1Bit;
// Number of bits that each gradient value is quantized to before communication with other nodes.
const size_t m_numQuantizationBits;
// Since the self-stripe in an all-reduce is not communicated, there is really no reason to
// quantize it for reduced communication. However, we add this as an option for consistency
// across all stripes if desired
const bool m_useQuantizationForSelfStripe;
const std::unique_ptr<CUDAPageLockedMemAllocator> m_allocator;
// Buffer for quantized gradients.
vector<QuantizedMatrixBasePtr> m_quantizedGradients;
// Buffer for quantized stripes.
vector<vector<QuantizedMatrixBasePtr>> m_recvGradientStripesQuantized;
// Quantizers to quantize initial gradients.
vector<shared_ptr<MatrixQuantizerBase>> m_preAggregatedGradientQuantizers;
// Quantizers to quantize aggregated stripes.
vector<shared_ptr<MatrixQuantizerBase>> m_aggregatedGradientStripeQuantizers;
};
}

View file

@ -0,0 +1,299 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#undef _SCL_SECURE_NO_WARNINGS
#include "CNTKLibrary.h"
#include "Utils.h"
#include "IDistGradAggregator.h"
#include "CUDAPageLockedMemAllocator.h"
#include "QuantizedMatrix.h"
#include "MatrixQuantizer.h"
#include "MatrixQuantizerGPU.h"
#include <future>
#include "TimerUtility.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// =======================================================================
// AllReduceDistGradAggregator -- 1-bit SGD.
// This implements
// Frank Seide, Hao Fu, Jasha Droppo, Gang Li, and Dong Yu:
// "1-bit stochastic gradient descent and its application to data-parallel distributed training of speech DNNs"
// In Proc. Interspeech 2014.
// =======================================================================
template <class ElemType>
class V2AllReduceDistGradAggregator : public IDistGradAggregator<ElemType>
{
UsingIDistGradAggregatorMembers;
static const int DEBUG_OUTPUT_TRACE_LEVEL = 3;
::CNTK::QuantizedDistributedCommunicatorPtr m_communicator;
public:
V2AllReduceDistGradAggregator(::CNTK::QuantizedDistributedCommunicatorPtr communicator, bool useAsyncAggregation, int traceLevel, int syncStatsTrace)
: IDistGradAggregator<ElemType>(nullptr), m_traceLevel(traceLevel), m_initialized(false), m_useAsyncAggregation(useAsyncAggregation), m_bufferedGradHeader(nullptr), m_syncStatsTrace(syncStatsTrace), m_iterationCount(0),
m_communicator(communicator)
{}
~V2AllReduceDistGradAggregator()
{
if (m_bufferedGradHeader != nullptr)
DistGradHeader::Destroy(m_bufferedGradHeader);
}
void Initialize(const std::vector<Matrix<ElemType>*>& gradients, int numEvalNodes)
{
// When called the first time let's setup the quantizers and matrices for holding quantized values.
// These can live for the lifetime of the aggregator since the gradient matrix dimensions for learnable parameters
// do not change
m_initialized = true;
int deviceId = gradients[0]->GetDeviceId();
for (size_t i = 0; i < gradients.size(); i++)
{
// Make sure none of the gradient matrices are sparse - we currently do not support aggregation of sparse gradient matrices
if (gradients[i]->GetMatrixType() != DENSE)
RuntimeError("Gradient aggregation for sparse gradient matrices is currently unsupported!");
if (m_useAsyncAggregation)
m_bufferedGradients[gradients[i]].reset(new Matrix<ElemType>(gradients[i]->GetNumRows(), gradients[i]->GetNumCols(), deviceId));
}
if (m_useAsyncAggregation)
{
m_bufferedGradHeader = DistGradHeader::Create(numEvalNodes);
m_bufferedGradHeader->Clear();
}
}
void ResetState(const std::vector<Matrix<ElemType>*>& gradients)
{
// If we are resetting state, let's clear previous quantization residues
// Make sure there is no pending async aggregation
if (m_useAsyncAggregation && m_pendingAsyncAggregation.valid())
LogicError("Unexpected pending async gradient aggregation found when resetting aggregator state!");
for (size_t i = 0; i < m_residuals.size(); ++i)
m_residuals[i]->SetValue(static_cast<ElemType>(0.0));
for (size_t i = 0; i < m_stripeResiduals.size(); ++i)
if (m_stripeResiduals[i])
m_stripeResiduals[i]->SetValue(static_cast<ElemType>(0.0));
// Zero out the buffered gradients if resetting state
if (m_useAsyncAggregation)
{
for (size_t i = 0; i < gradients.size(); i++)
m_bufferedGradients[gradients[i]]->SetValue(static_cast<ElemType>(0));
m_bufferedGradHeader->Clear();
}
}
// Aggregate the gradient matrices across all nodes
bool AggregateGradients(const std::vector<Matrix<ElemType>*>& gradients, DistGradHeader* headerCPU, bool resetState) override
{
if (!m_initialized)
Initialize(gradients, headerCPU->numEvalNode);
else if (resetState)
ResetState(gradients);
bool showSyncPerfStats = (m_syncStatsTrace > 0) && ((m_iterationCount % m_syncStatsTrace) == 0);
m_iterationCount++;
if (m_useAsyncAggregation)
{
// If we are performing async gradient aggregation, let's wait for the pending gradient aggregation to finish
// then swap the contents of the buffered gradients and the new gradient matrices and fire an async aggregation
// of the new gradient matrices
if (m_pendingAsyncAggregation.valid())
{
Timer aggregationTimer;
if (showSyncPerfStats)
aggregationTimer.Start();
m_pendingAsyncAggregation.get();
if (showSyncPerfStats)
{
aggregationTimer.Stop();
double gradientAggregationTime = aggregationTimer.ElapsedSeconds();
fprintf(stderr, "Async gradient aggregation wait time: %.6g\n", gradientAggregationTime);
}
}
std::vector<Matrix<ElemType>*> newGradients;
size_t numGradMatrices = gradients.size();
for (size_t i = 0; i < numGradMatrices; i++)
{
Matrix<ElemType>* bufferedGradientMatrix = m_bufferedGradients[gradients[i]].get();
if ((bufferedGradientMatrix == nullptr) ||
(bufferedGradientMatrix->GetNumCols() != gradients[i]->GetNumCols()) ||
(bufferedGradientMatrix->GetNumRows() != gradients[i]->GetNumRows()) ||
(bufferedGradientMatrix->GetDeviceId() != gradients[i]->GetDeviceId()))
{
LogicError("No buffered gradient matrix found corresponding to a gradient matrix to be aggregated!");
}
// Swap the gradient matrix contents with the buffered matrices
std::swap(*(gradients[i]), *bufferedGradientMatrix);
newGradients.push_back(bufferedGradientMatrix);
}
// Swap the grad header contents with the buffered grad header
swap(*headerCPU, *m_bufferedGradHeader);
// Initiate aggregation only if any samples were processed in previous iteration
if (resetState || (headerCPU->numSamples != 0))
{
int deviceId = gradients[0]->GetDeviceId();
DistGradHeader* newGradHeader = m_bufferedGradHeader;
// Since we will be aggregating the gradients asynchronously, let us
// ensure that the gradient matrices have been computed before starting to aggregate
// them asynchronously on another thread. This essentially means that when we are using
// a GPU device, we will synchronize on the main GPU compute stream before starting
// the gradient aggregation asynchronously on a separate stream
MatrixComputeStreamEvent* mainStreamSyncEvent = MatrixComputeStreamEvent::Create(deviceId);
m_pendingAsyncAggregation = std::async(std::launch::async, [=] {
// We are starting on a new thread. Make sure the new thread is
// setup to use the right device
Matrix<ElemType>::SetDevice(deviceId);
// Synchronize the Quantization compute stream with the completion of
// compute of the gradient matrices on the main compute stream
mainStreamSyncEvent->SynchronizeQuantizationComputeStreamWithEvent<ElemType>();
delete mainStreamSyncEvent;
AggregateGradientsImpl(newGradients, newGradHeader, showSyncPerfStats);
});
return true;
}
return false;
}
else
{
AggregateGradientsImpl(gradients, headerCPU, showSyncPerfStats);
return (headerCPU->numSamples != 0);
}
}
void AggregateGradientsImpl(const std::vector<Matrix<ElemType>*>& gradients, DistGradHeader* headerCPU, bool showSyncPerfStats)
{
Timer aggregationTimer;
int deviceId = gradients[0]->GetDeviceId();
if (showSyncPerfStats)
{
std::unique_ptr<MatrixComputeStreamEvent> mainStreamSyncEvent(MatrixComputeStreamEvent::Create(deviceId));
mainStreamSyncEvent->SynchronizeEvent();
aggregationTimer.Start();
}
size_t numGradMatrices = gradients.size();
if (headerCPU->numSamples == 0)
{
assert(headerCPU->criterion == 0.0);
assert(headerCPU->numSamplesWithLabel == 0);
for (int i = 0; i < headerCPU->numEvalNode; ++i)
assert(headerCPU->evalErrors[i].first == 0 && headerCPU->evalErrors[i].second == 0);
// If the current node did not process any samples, the gradients should be zero'd
for (size_t i = 0; i < numGradMatrices; ++i)
gradients[i]->SetValue(static_cast<ElemType>(0));
if (m_useAsyncAggregation)
{
std::unique_ptr<MatrixComputeStreamEvent> mainStreamSyncEvent(MatrixComputeStreamEvent::Create(deviceId));
mainStreamSyncEvent->SynchronizeQuantizationComputeStreamWithEvent<ElemType>();
}
}
// Aggregate header.
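// Layout of the flattened header buffer (all doubles, so a single all-reduce suffices):
// [0] = criterion, [1] = numSamples, [2] = numSamplesWithLabel,
// [3 + 2*i] = evalErrors[i].first, [3 + 2*i + 1] = evalErrors[i].second.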
size_t numberOfElements = 1 + 1 + 1 + headerCPU->numEvalNode * 2;
std::unique_ptr<double[]> headerBuffer(new double[numberOfElements]);
headerBuffer[0] = headerCPU->criterion;
headerBuffer[1] = static_cast<double>(headerCPU->numSamples);
headerBuffer[2] = static_cast<double>(headerCPU->numSamplesWithLabel);
for (size_t i = 0; i < headerCPU->numEvalNode; ++i)
{
headerBuffer[3 + 2 * i] = headerCPU->evalErrors[i].first;
headerBuffer[3 + 2 * i + 1] = static_cast<double>(headerCPU->evalErrors[i].second);
}
auto headerData = ::CNTK::MakeSharedObject<::CNTK::NDArrayView>(::CNTK::DataType::Double, ::CNTK::NDShape{ numberOfElements }, headerBuffer.get(), numberOfElements * sizeof(double), ::CNTK::DeviceDescriptor::CPUDevice());
std::vector<::CNTK::NDArrayViewPtr> valuesToAggregate{ headerData };
// TODO: Should be async
m_communicator->AggregateInPlace(valuesToAggregate, m_communicator->Workers());
// Copy data back to the header
headerCPU->criterion = headerBuffer[0];
headerCPU->numSamples = static_cast<size_t>(headerBuffer[1]);
headerCPU->numSamplesWithLabel = static_cast<size_t>(headerBuffer[2]);
for (size_t i = 0; i < headerCPU->numEvalNode; ++i)
{
headerCPU->evalErrors[i].first = headerBuffer[3 + 2 * i];
headerCPU->evalErrors[i].second = static_cast<size_t>(headerBuffer[3 + 2 * i + 1]);
}
// Aggregate gradients.
std::vector<::CNTK::NDArrayViewPtr> gradientValues;
for (size_t i = 0; i < gradients.size(); ++i)
{
assert(gradients[i]->Data() != nullptr);
::CNTK::NDShape shape{ gradients[i]->GetNumRows(), gradients[i]->GetNumCols() };
auto data = ::CNTK::MakeSharedObject<::CNTK::NDArrayView>(::CNTK::AsDataType<ElemType>(), shape, gradients[i]->Data(), gradients[i]->GetNumElements() * sizeof(ElemType), ::CNTK::AsDeviceDescriptor(gradients[i]->GetDeviceId()));
gradientValues.push_back(data);
}
m_communicator->QuantizedAggregateInPlace(
gradientValues,
m_residuals,
m_stripeResiduals,
m_communicator->Workers());
if (showSyncPerfStats)
{
aggregationTimer.Stop();
double gradientAggregationTime = aggregationTimer.ElapsedSeconds();
fprintf(stderr, "Actual gradient aggregation time: %.6g\n", gradientAggregationTime);
}
}
private:
// Perform asynchronous gradient aggregation using double buffering of the gradient matrices
bool m_useAsyncAggregation;
// Future corresponding to the current in-flight async gradient aggregation
std::future<void> m_pendingAsyncAggregation;
// Buffered gradients that we asynchronously aggregate
std::unordered_map<Matrix<ElemType>*, std::unique_ptr<Matrix<ElemType>>> m_bufferedGradients;
DistGradHeader* m_bufferedGradHeader;
int m_traceLevel;
int m_syncStatsTrace;
// Only used for controlling frequency of measuring/showing gradient aggregation perf stats
size_t m_iterationCount;
bool m_initialized;
// Residuals of quantized gradients.
std::vector<::CNTK::NDArrayViewPtr> m_residuals;
// Residuals of quantized aggregated stripes this node is responsible for.
std::vector<::CNTK::NDArrayViewPtr> m_stripeResiduals;
};
} } }

View file

@ -0,0 +1,361 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "../SGDLib/MASGD.h"
#include <map>
#include <string>
#include <memory>
namespace Microsoft { namespace MSR { namespace CNTK {
// Implementation of Blockwise Model Update and Filtering (BMUF, a.k.a. block momentum)
// For detail, see the following paper
// Kai Chen and Qiang Huo, "Scalable training of deep learning machines by incremental block training
// with intra-block parallel optimization and blockwise model-update filtering",
// in International Conference on Acoustics, Speech and Signal Processing , March 2016, Shanghai, China.
template<typename ElemType>
class V2BlockMomentumSGD : public IMASGD<ElemType>
{
typedef IMASGD<ElemType> Base;
using Base::m_deviceId;
using Base::DownCast;
bool m_resetSGDMomentumAfterAggregation;
bool m_useNesterovMomentum;
double m_blockLearningRate;
double m_blockMomentumAsTimeConstantPerWorker;
size_t m_syncPeriodPerWorker;
::CNTK::DistributedCommunicatorPtr m_communicator;
bool m_someWorkerHasFinished;
// parameters at the last model aggregation point
std::map<std::wstring, std::shared_ptr<Matrix<ElemType>>> m_prevParameters;
std::map<std::wstring, std::shared_ptr<Matrix<ElemType>>> m_blockLevelSmoothedGradient;
public:
V2BlockMomentumSGD(const MPIWrapperPtr& pMPI,
::CNTK::DistributedCommunicatorPtr communicator,
size_t reportFrequency,
DEVICEID_TYPE deviceId,
bool useNesterovMomentum,
bool resetSGDM,
double blockLearningRate,
double blockMomentumAsTimeConstant,
size_t syncPeriod)
: IMASGD<ElemType>(pMPI, reportFrequency, deviceId),
m_communicator(communicator),
m_useNesterovMomentum(useNesterovMomentum),
m_resetSGDMomentumAfterAggregation(resetSGDM),
m_blockLearningRate(blockLearningRate),
m_blockMomentumAsTimeConstantPerWorker(blockMomentumAsTimeConstant / communicator->Workers().size())
{
m_syncPeriodPerWorker = syncPeriod / communicator->Workers().size();
if (m_syncPeriodPerWorker == 0)
InvalidArgument("Sync period is too small.");
}
void OnEpochStart(const std::list<ComputationNodeBasePtr>& learnableNodes) override
{
m_someWorkerHasFinished = false;
for (auto& n : learnableNodes)
{
auto node = DownCast(n);
std::wstring name = node->NodeName();
Matrix<ElemType>& value = node->Value();
if (m_blockLevelSmoothedGradient.find(name) == m_blockLevelSmoothedGradient.end())
{
// has not been initialized yet
auto pSmoothedGrad = make_shared<Matrix<ElemType>> (value.GetDeviceId());
pSmoothedGrad->Resize(value.GetNumRows(), value.GetNumCols());
pSmoothedGrad->SetValue((ElemType)0);
m_blockLevelSmoothedGradient[name] = pSmoothedGrad;
}
if (m_prevParameters.find(name) == m_prevParameters.end())
{
auto newValue = make_shared<Matrix<ElemType>>(value.GetDeviceId());
newValue->SetValue(value);
m_prevParameters[name] = newValue;
}
else
{
m_prevParameters[name]->SetValue(value);
}
}
fprintf(stderr, "Parallel training (%d workers) using BlockMomentumSGD with "
"block momentum = %6.4f, "
"block momentum time constant (per worker) = %6.4f, "
"block learning rate = %6.4f, "
"block size per worker = %d samples, "
"%s"
"%s"
"\n",
(int)m_communicator->Workers().size(),
BlockMomentumSGD<double>::TimeConstant2Momentum(m_blockMomentumAsTimeConstantPerWorker, m_syncPeriodPerWorker),
m_blockMomentumAsTimeConstantPerWorker,
m_blockLearningRate,
(int)m_syncPeriodPerWorker,
m_useNesterovMomentum ? "using Nesterov-style block momentum, " : "" ,
m_resetSGDMomentumAfterAggregation ? "resetting SGD momentum after sync." : ".");
}
bool OnArrivingAtSyncPoint(
const std::list<ComputationNodeBasePtr>& learnableNodes, /* input/output: */
std::list<Matrix<ElemType>>& smoothedGradient, /* input/output: under some setup, it will reset to zero*/
size_t samplesSinceLastSync /* input: samples processed since last sync on this worker only */
) override
{
if (m_someWorkerHasFinished)
return false;
// Let's check the status.
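// Status convention (see OnEpochEnd): a worker still inside the epoch contributes 0, a finished worker
// contributes 1, so a positive aggregated value means at least one worker has already finished.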
double statusValue = 0;
auto status = ::CNTK::MakeSharedObject<::CNTK::NDArrayView>(::CNTK::DataType::Double, ::CNTK::NDShape{ 1 }, &statusValue, sizeof(double), ::CNTK::DeviceDescriptor::CPUDevice());
std::vector<::CNTK::NDArrayViewPtr> aggregatedStatus { status };
m_communicator->AggregateInPlace(aggregatedStatus, m_communicator->Workers());
if (statusValue > 0)
{
m_someWorkerHasFinished = true;
return false;
}
// Otherwise let's update the weights.
float secondsOnCommunication = 0.0f;
size_t totalSamples = 0;
ModelAggregationProcessing(samplesSinceLastSync, learnableNodes, smoothedGradient, totalSamples, secondsOnCommunication);
return true;
}
/*virtual*/ void OnEpochEnd(const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradient,
size_t samplesSinceLastSync) override
{
if (!m_someWorkerHasFinished)
{
// Let's notify the other workers that we have finished.
m_someWorkerHasFinished = true;
double statusValue = 1;
auto status = ::CNTK::MakeSharedObject<::CNTK::NDArrayView>(::CNTK::DataType::Double, ::CNTK::NDShape{ 1 }, &statusValue, sizeof(double), ::CNTK::DeviceDescriptor::CPUDevice());
std::vector<::CNTK::NDArrayViewPtr> aggregatedStatus{ status };
m_communicator->AggregateInPlace(aggregatedStatus, m_communicator->Workers());
}
// Let's update our weights no matter what.
float secondsOnCommunication = 0.0f;
size_t totalSamples = 0;
ModelAggregationProcessing(samplesSinceLastSync, learnableNodes, smoothedGradient, totalSamples, secondsOnCommunication);
}
/*virtual*/ void ModelAggregationProcessing(
size_t /*samplesSinceLastSync*/,
const std::list<ComputationNodeBasePtr>& learnableNodes,
std::list<Matrix<ElemType>>& smoothedGradient,
size_t& /*totalSamplesProcessed*/, /* out */
float& secondsOnCommunication /* out */
) override
{
ElemType blockMomentum = (ElemType)BlockMomentumSGD<double>::TimeConstant2Momentum(m_blockMomentumAsTimeConstantPerWorker, m_syncPeriodPerWorker);
Timer commTimer;
secondsOnCommunication = 0.0f;
// 1. Let's aggregate weights
std::map<std::wstring, std::shared_ptr<Matrix<ElemType>>> aggregatedWeights;
std::vector<::CNTK::NDArrayViewPtr> aggregatedWeightsPrepared;
for (auto& pBaseNode : learnableNodes)
{
if (!pBaseNode->IsParameterUpdateRequired())
continue;
wstring name = pBaseNode->NodeName();
auto pNode = DownCast(pBaseNode);
// Get current model
Matrix<ElemType>& prevWeight = *m_prevParameters[name]; // prev model value
Matrix<ElemType>& currentWeight = pNode->Value(); // current model
// Subtract it from the previous model
auto blockGrad = std::make_shared<Matrix<ElemType>>(prevWeight, CPUDEVICE);
*blockGrad -= currentWeight; // blockGrad now holds the local block gradient of this worker
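// i.e. blockGrad = w(last sync) - w(current); AggregateInPlace below aggregates these block gradients across all workers.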
aggregatedWeights[name] = blockGrad;
::CNTK::NDShape shape{ blockGrad->GetNumElements() };
auto data = ::CNTK::MakeSharedObject<::CNTK::NDArrayView>(::CNTK::AsDataType<ElemType>(), shape, blockGrad->Data(), blockGrad->GetNumElements() * sizeof(ElemType), ::CNTK::AsDeviceDescriptor(blockGrad->GetDeviceId()));
aggregatedWeightsPrepared.push_back(data);
}
// Send block gradient over MPI nodes.
m_communicator->AggregateInPlace(aggregatedWeightsPrepared, m_communicator->Workers());
// 2. Let's update the model
for (auto& pBaseNode : learnableNodes)
{
if (!pBaseNode->IsParameterUpdateRequired())
continue;
wstring name = pBaseNode->NodeName();
auto pNode = DownCast(pBaseNode);
// 2 block gradient aggregation
// 2.1. get current model
Matrix<ElemType>& prevWeight = *m_prevParameters[name]; // prev model value
Matrix<ElemType>& currentWeight = pNode->Value(); // current model
auto blockGrad = aggregatedWeights[name];
// 2.2. model update
{
Matrix<ElemType>& sg = *m_blockLevelSmoothedGradient[name]; // smoothed gradient
blockGrad->TransferToDeviceIfNotThere(sg.GetDeviceId());
// 2.2.1 update block level smoothed gradient;
// This is essentially a first-order infinite impulse response (IIR) filter with the gain (1 - blockMomentum)*m_blockLearningRate:
// smoothedGradient(t) = blockMomentum * smoothedGradient(t-1) + (1 - blockMomentum) * m_blockLearningRate * blockGrad(t)
Matrix<ElemType>::ScaleAndAdd((ElemType)((1 - blockMomentum)*m_blockLearningRate), *blockGrad, (ElemType)blockMomentum, sg);
// 2.2.2 update parameters;
currentWeight.SetValue(prevWeight);
currentWeight -= sg;
// 2.2.3 Nesterov Momentum
// Nesterov momentum here performs a partial weight update before calculating the gradient, i.e.,
// (step 1) w(t) <-- w(t) - \eta* v(t)
// (step 2) g(t+1) <-- forwardbackward on minibatches with initial model as w(t)
// (step 3) v(t+1) <-- \eta*v(t) + (1-\eta)*learningRate*g(t+1)
// (step 4) w(t+1) <-- w(t)-v(t)
// (step 5) t <-- t+1
// without step 1, this becomes standard momentum
if (m_useNesterovMomentum)
{
Matrix<ElemType>::ScaleAndAdd((ElemType)-blockMomentum, sg, currentWeight);
}
// 2.2.4 update bookkeeping
prevWeight.SetValue(currentWeight);
}
}
//----------------------------------------
// 3. reset SGD momentum if necessary
//----------------------------------------
if (m_resetSGDMomentumAfterAggregation)
{
for (Matrix<ElemType>& x : smoothedGradient)
{
x.SetValue((ElemType)0);
}
}
}
void SaveToCheckPoint(File& fstream) override
{
if (!m_communicator->CurrentWorker().IsMain())
return;
fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BMACKP");
fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BOptions");
fstream << m_resetSGDMomentumAfterAggregation;
fstream.PutMarker(FileMarker::fileMarkerEndSection, L"EOptions");
fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BMomentumAsTimeConstant");
fstream << m_blockMomentumAsTimeConstantPerWorker;
fstream.PutMarker(FileMarker::fileMarkerEndSection, L"EMomentumAsTimeConstant");
fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BSyncPeriodInSamples");
fstream << m_syncPeriodPerWorker;
fstream.PutMarker(FileMarker::fileMarkerEndSection, L"ESyncPeriodInSamples");
fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"BParam");
SaveParameters(fstream, m_prevParameters);
SaveParameters(fstream, m_blockLevelSmoothedGradient);
fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"EParam");
fstream.PutMarker(FileMarker::fileMarkerBeginSection, L"EMACKP");
}
void LoadFromCheckPoint(File& fstream) override
{
if (!fstream.TryGetMarker(FileMarker::fileMarkerBeginSection, L"BMACKP"))
return;
fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BOptions");
fstream >> m_resetSGDMomentumAfterAggregation;
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EOptions");
fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BMomentumAsTimeConstant");
fstream >> m_blockMomentumAsTimeConstantPerWorker;
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EMomentumAsTimeConstant");
fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BSyncPeriodInSamples");
fstream >> m_syncPeriodPerWorker;
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"ESyncPeriodInSamples");
fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"BParam");
LoadParameters(fstream, m_prevParameters, m_deviceId);
LoadParameters(fstream, m_blockLevelSmoothedGradient, m_deviceId);
fstream.GetMarker(FileMarker::fileMarkerBeginSection, L"EParam");
fstream.GetMarker(FileMarker::fileMarkerEndSection, L"EMACKP");
}
private:
// helper function to save/load map<wstring, shared_ptr<Matrix<ElemType>> structure
void SaveParameters(File& f, const map<wstring, shared_ptr<Matrix<ElemType>>>& parameters) const
{
// save sizeof(ElemType)
unsigned int size = sizeof(ElemType);
f << size;
// save number of pairs
unsigned int numPairs = parameters.size();
f << numPairs;
for (auto& x : parameters)
{
f << x.first;
f << *x.second;
}
f.Flush();
return;
}
void LoadParameters(File& f, map<wstring, shared_ptr<Matrix<ElemType>>>& parameters, DEVICEID_TYPE deviceID)
{
unsigned int size = 0;
unsigned int pair = 0;
f >> size;
f >> pair;
if (size != sizeof(ElemType))
{
LogicError("Mismatched ElemType in loading BlockMomentumSGD checkpoint. Expecting %s, while loading element size=%d\n",
sizeof(ElemType) == 4 ? "float" : "double",
size
);
}
parameters.clear();
for (size_t i = 0; i < pair; i++)
{
wstring name;
f >> name;
shared_ptr<Matrix<ElemType>> mat = make_shared<Matrix<ElemType>>(deviceID);
f >> *mat;
parameters[name] = mat;
}
}
public:
static double TimeConstant2Momentum(double timeConstant, size_t syncPeriod)
{
return exp(-((double)syncPeriod) / timeConstant);
}
static double Momentum2TimeConstant(double bm, size_t syncPeriod)
{
if (bm >= 1.0 || bm < 0.0)
{
InvalidArgument("Unexpected block momentum (%.2f). Block momentum should be in the range of [0,1)\n", bm);
}
return -(double)syncPeriod / log(bm);
}
};
} } }

View file

@ -72,6 +72,8 @@
<ItemDefinitionGroup Condition="$(GpuBuild)">
<ClCompile>
<AdditionalIncludeDirectories>$(BOOST_INCLUDE_PATH);%(AdditionalIncludeDirectories);$(CudaInclude)</AdditionalIncludeDirectories>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</TreatWarningAsError>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
</ClCompile>
<Link>
<AdditionalLibraryDirectories>%(AdditionalLibraryDirectories);$(CudaLibPath)</AdditionalLibraryDirectories>

View file

@ -24,7 +24,10 @@
#include "simplesenonehmm.h"
#include "Matrix.h"
#include <set>
namespace msra { namespace math {
namespace msra
{
namespace math
{
class ssematrixbase;
template <class ssematrixbase>
@ -34,14 +37,20 @@ class ssematrixstriperef;
};
};
namespace msra { namespace lm {
namespace msra
{
namespace lm
{
class CMGramLM;
class CSymbolSet;
};
}; // for numer-lattice building
namespace msra { namespace asr {
namespace msra
{
namespace asr
{
template <typename A, typename B>
class htkmlfreader;
@ -49,7 +58,10 @@ struct htkmlfentry;
};
}; // for numer lattice building
namespace msra { namespace lattices {
namespace msra
{
namespace lattices
{
typedef msra::math::ssematrixbase matrixbase;
typedef msra::math::ssematrix<matrixbase> matrix;
@ -72,8 +84,8 @@ class lattice
// define structure for nbest EMBR
struct TokenInfo
{
double score; // the score of the token
size_t prev_edge_index; // edge ending with this token, edge start points to the previous node
double score; // the score of the token
size_t prev_edge_index; // edge ending with this token, edge start points to the previous node
size_t prev_token_index; // the token index in the previous node
};
struct PrevTokenInfo
@ -87,8 +99,8 @@ class lattice
{
// for sorting purpose
// make sure the map is stored with keys in descending order
std::map<double, std::vector<PrevTokenInfo>, std::greater <double>> mp_score_token_infos; // for sorting the tokens in map
std::vector<TokenInfo> vt_nbest_tokens; // stores the nbest tokens in the node
std::map<double, std::vector<PrevTokenInfo>, std::greater<double>> mp_score_token_infos; // for sorting the tokens in map
std::vector<TokenInfo> vt_nbest_tokens; // stores the nbest tokens in the node
};
struct header_v1_v2
@ -118,10 +130,13 @@ private:
static_assert(sizeof(aligninfo) == 4, "unexpected size of aligninfo");
std::vector<nodeinfo> nodes;
mutable std::vector<std::vector<uint64_t>> vt_node_out_edge_indices; // vt_node_out_edge_indices[i]: it stores the outgoing edge indices starting from node i
std::vector<bool> is_special_words; // true if it is special words that do not count to WER computation, false if it is not
std::vector<bool> is_special_words; // true if it is special words that do not count to WER computation, false if it is not
std::vector<edgeinfowithscores> edges;
std::vector<aligninfo> align;
//linquan
std::unordered_map<int, std::wstring> id2wordmap4node; //keep id2word mapping specific for nodes of current lattice
// V2 lattices --for a while, we will store both in RAM, until all code is updated
static int fsgn(float f)
{
@ -286,10 +301,9 @@ public: // TODO: make private again once
}
// sort edges
sort(edges2.begin(), edges2.end(), [&](const edgeinfo& e1, const edgeinfo& e2)
{
return uniqueorder(e1, e2) < 0;
});
sort(edges2.begin(), edges2.end(), [&](const edgeinfo& e1, const edgeinfo& e2) {
return uniqueorder(e1, e2) < 0;
});
// create a uniq'ed version of the align[] array, into uniquededgedatatokens[]
uniquededgedatatokens.resize(0);
@ -363,10 +377,9 @@ public: // TODO: make private again once
(int) align.size(), (int) numimpliedsp, (int) nonuniquenonsptokens, (int) uniquealigntokens, 100.0f * uniquealigntokens / nonuniquenonsptokens);
// sort it back into original order (sorted by E, then by S)
sort(edges2.begin(), edges2.end(), [&](const edgeinfo& e1, const edgeinfo& e2)
{
return latticeorder(e1, e2) < 0;
});
sort(edges2.begin(), edges2.end(), [&](const edgeinfo& e1, const edgeinfo& e2) {
return latticeorder(e1, e2) < 0;
});
// TODO: be more consistent--we should clear out edges[] at this point!
}
@ -731,7 +744,6 @@ private:
const float lmf, const float wp, const float amf, const_array_ref<size_t>& uids,
const edgealignments& thisedgealignments, std::vector<double>& Eframescorrect) const;
void sMBRerrorsignal(parallelstate& parallelstate,
msra::math::ssematrixbase& errorsignal, msra::math::ssematrixbase& errorsignalneg,
const std::vector<double>& logpps, const float amf, double minlogpp,
@ -768,7 +780,7 @@ private:
const std::vector<double>& logEframescorrect, const double logEframescorrecttotal,
msra::math::ssematrixbase& errorsignal, msra::math::ssematrixbase& errorsignalneg) const;
void parallelEMBRerrorsignal(parallelstate& parallelstate, const edgealignments& thisedgealignments,
const std::vector<double>& edgeweights, msra::math::ssematrixbase& errorsignal) const;
const std::vector<double>& edgeweights, msra::math::ssematrixbase& errorsignal) const;
void parallelmmierrorsignal(parallelstate& parallelstate, const edgealignments& thisedgealignments,
const std::vector<double>& logpps, msra::math::ssematrixbase& errorsignal) const;
@ -780,14 +792,14 @@ private:
std::vector<double>& Eframescorrectbuf, double& logEframescorrecttotal) const;
double parallelbackwardlatticeEMBR(parallelstate& parallelstate, const std::vector<float>& edgeacscores,
const float lmf, const float wp,
const float amf, std::vector<double>& edgelogbetas,
std::vector<double>& logbetas) const;
void EMBRsamplepaths(const std::vector<double> &edgelogbetas,
const std::vector<double> &logbetas, const size_t numPathsEMBR, const bool enforceValidPathEMBR, const bool excludeSpecialWords, std::vector< std::vector<size_t> > & vt_paths) const;
const float lmf, const float wp,
const float amf, std::vector<double>& edgelogbetas,
std::vector<double>& logbetas) const;
void EMBRnbestpaths(std::vector<NBestToken>& tokenlattice, std::vector<std::vector<size_t>> & vt_paths, std::vector<double>& path_posterior_probs) const;
void EMBRsamplepaths(const std::vector<double>& edgelogbetas,
const std::vector<double>& logbetas, const size_t numPathsEMBR, const bool enforceValidPathEMBR, const bool excludeSpecialWords, std::vector<std::vector<size_t>>& vt_paths) const;
void EMBRnbestpaths(std::vector<NBestToken>& tokenlattice, std::vector<std::vector<size_t>>& vt_paths, std::vector<double>& path_posterior_probs) const;
double get_edge_weights(std::vector<size_t>& wids, std::vector<std::vector<size_t>>& vt_paths, std::vector<double>& vt_edge_weights, std::vector<double>& vt_path_posterior_probs, std::string getPathMethodEMBR, double& onebestwer) const;
@ -806,14 +818,15 @@ private:
std::vector<double>& logEframescorrect, std::vector<double>& Eframescorrectbuf,
double& logEframescorrecttotal) const;
double backwardlatticeEMBR(const std::vector<float>& edgeacscores, parallelstate& parallelstate, std::vector<double> &edgelogbetas,
std::vector<double>& logbetas,
const float lmf, const float wp, const float amf) const;
double backwardlatticeEMBR(const std::vector<float>& edgeacscores, parallelstate& parallelstate, std::vector<double>& edgelogbetas,
std::vector<double>& logbetas,
const float lmf, const float wp, const float amf) const;
void constructnodenbestoken(std::vector<NBestToken> &tokenlattice, const bool wordNbest, size_t numtokens2keep, size_t nidx) const;
void constructnodenbestoken(std::vector<NBestToken>& tokenlattice, const bool wordNbest, size_t numtokens2keep, size_t nidx) const;
double nbestlatticeEMBR(const std::vector<float>& edgeacscores, parallelstate& parallelstate, std::vector<NBestToken>& vt_nbesttokens, const size_t numtokens, const bool enforceValidPathEMBR, const bool excludeSpecialWords,
const float lmf, const float wp, const float amf, const bool wordNbest, const bool useAccInNbest, const float accWeightInNbest, const size_t numPathsEMBR, std::vector<size_t> wids) const;
double nbestlatticeEMBR(const std::vector<float> &edgeacscores, parallelstate &parallelstate, std::vector<NBestToken> &vt_nbesttokens, const size_t numtokens, const bool enforceValidPathEMBR, const bool excludeSpecialWords,
const float lmf, const float wp, const float amf, const bool wordNbest, const bool useAccInNbest, const float accWeightInNbest, const size_t numPathsEMBR, std::vector<size_t> wids) const;
public:
// construct from a HTK lattice file
void fromhtklattice(const std::wstring& path, const std::unordered_map<std::string, size_t>& unitmap);
@ -822,7 +835,6 @@ public:
void frommlf(const std::wstring& key, const std::unordered_map<std::string, size_t>& unitmap, const msra::asr::htkmlfreader<msra::asr::htkmlfentry, lattice::htkmlfwordsequence>& labels,
const msra::lm::CMGramLM& lm, const msra::lm::CSymbolSet& unigramsymbols);
// check consistency
// - only one end node
// - only forward edges
@ -929,7 +941,7 @@ public:
template <typename HMMLOOKUPFUNCTION>
void dump(FILE* f, const HMMLOOKUPFUNCTION& gethmmname) const // dump a lattice in HTK-like format
{
fprintf(f, "N=%lu L=%lu\n", (unsigned long)nodes.size(), (unsigned long)edges.size());
fprintf(f, "N=%lu L=%lu\n", (unsigned long) nodes.size(), (unsigned long) edges.size());
// foreach_index (i, nodes)
// fprintf (f, "I=%d\tt=%.2f\n", i, nodes[i].t * 0.01f);
foreach_index (j, edges)
@ -1011,8 +1023,8 @@ public:
RuntimeError("freadvector: malformed file, number of vector elements differs from head, for tag %s", tag);
freadOrDie(v, sz, f);
}
bool CheckTag(const char*& buffer, const std::string& expectedTag)
bool CheckTag(const char*& buffer, const std::string& expectedTag)
{
std::string tag(buffer, expectedTag.length());
if (tag != expectedTag)
@ -1020,15 +1032,16 @@ public:
buffer += expectedTag.length();
return true;
}
int ReadTagFromBuffer(const char*& buffer, const std::string& expectedTag, size_t expectedSize = SIZE_MAX)
{
if (!CheckTag(buffer, expectedTag)) {
if (!CheckTag(buffer, expectedTag))
{
// since lattice is packed densely by the reader, we may need to shift the buffer by 2 bytes.
if (!CheckTag(buffer, expectedTag.substr(2)))
RuntimeError("ReadTagFromBuffer: malformed file, missing expected tag: %s,", expectedTag.c_str());
}
int* sz = (int*)buffer;
int* sz = (int*) buffer;
if (expectedSize != SIZE_MAX && *sz != expectedSize)
RuntimeError("ReadTagFromBuffer: malformed file, number of vector elements differs from head, for tag %zu", expectedSize);
@ -1041,7 +1054,8 @@ public:
{
int sz = ReadTagFromBuffer(buffer, expectedTag, expectedsize);
v.resize(sz);
for (size_t i = 0;i < sz;i++) {
for (size_t i = 0; i < sz; i++)
{
const T* element = reinterpret_cast<const T*>(buffer);
v[i] = *element;
buffer += sizeof(T);
@ -1056,7 +1070,8 @@ public:
// This will also map the aligninfo entries to the new symbol table, through idmap.
// V1 lattices will be converted. 'spsenoneid' is used in that process.
template <class IDMAP>
void fread(FILE* f, const IDMAP& idmap, size_t spunit, std::set<int>& specialwordids)
void fread(FILE* f, const IDMAP& idmap, size_t spunit,
std::unordered_map<int, std::wstring>& id2wordmapping, std::set<int>& specialwordids)
{
size_t version = freadtag(f, "LAT ");
if (version == 1)
@ -1089,7 +1104,7 @@ public:
freadvector(f, "EDGS", edges2, info.numedges); // uniqued edges
freadvector(f, "ALNS", uniquededgedatatokens); // uniqued alignments
fcheckTag(f, "END ");
ProcessV2EMBRLattice(spunit, info, uniquededgedatatokens, idmap, specialwordids);
ProcessV2EMBRLattice(spunit, info, uniquededgedatatokens, idmap, id2wordmapping, specialwordids);
}
else
RuntimeError("fread: unsupported lattice format version");
@ -1114,25 +1129,23 @@ public:
ProcessV2Lattice(spunit, info, uniquededgedatatokens, idmap);
}
// Helper method to process v2 Lattice format
template <class IDMAP>
void ProcessV2Lattice(size_t spunit, header_v1_v2& info, std::vector<aligninfo>& uniquededgedatatokens, const IDMAP& idmap)
void ProcessV2Lattice(size_t spunit, header_v1_v2& info, std::vector<aligninfo>& uniquededgedatatokens, const IDMAP& idmap)
{
// check if we need to map
if (info.impliedspunitid != SIZE_MAX && info.impliedspunitid >= idmap.size()) // we have buggy lattices like that--what do they mean??
{
fprintf(stderr, "ProcessV2Lattice: detected buggy spunit id %d which is out of range (%d entries in map)\n", (int)info.impliedspunitid, (int)idmap.size());
fprintf(stderr, "ProcessV2Lattice: detected buggy spunit id %d which is out of range (%d entries in map)\n", (int) info.impliedspunitid, (int) idmap.size());
RuntimeError("ProcessV2Lattice: out of bounds spunitid");
}
// This is critical--we have a buggy lattice set that requires no mapping where mapping would fail
bool needsmapping = false;
foreach_index(k, idmap)
foreach_index (k, idmap)
{
if (idmap[k] != (size_t)k
&& (k != (int)idmap.size() - 1 || idmap[k] != spunit) // that HACK that we add one more /sp/ entry at the end...
)
if (idmap[k] != (size_t) k && (k != (int) idmap.size() - 1 || idmap[k] != spunit) // that HACK that we add one more /sp/ entry at the end...
)
{
needsmapping = true;
break;
@ -1174,7 +1187,7 @@ public:
k += skipscoretokens;
uniquealignments++;
}
fprintf(stderr, "ProcessV2Lattice: mapped %d unique alignments\n", (int)uniquealignments);
fprintf(stderr, "ProcessV2Lattice: mapped %d unique alignments\n", (int) uniquealignments);
}
if (info.impliedspunitid != spunit)
{
@ -1184,38 +1197,58 @@ public:
}
// reconstruct old lattice format from this --TODO: remove once we change to new data representation
rebuildedges(info.impliedspunitid != spunit /*to be able to read somewhat broken V2 lattice archives*/);
}
template <class IDMAP>
void ProcessV2EMBRLattice(size_t spunit, header_v1_v2& info, std::vector<aligninfo>& uniquededgedatatokens, const IDMAP& idmap, std::set<int>& specialwordids)
template <class IDMAP>
void ProcessV2EMBRLattice(size_t spunit, header_v1_v2& info, std::vector<aligninfo>& uniquededgedatatokens, const IDMAP& idmap,
std::unordered_map<int, std::wstring>& id2wordmapping, std::set<int>& specialwordids)
{
std::unordered_map<int, std::wstring>::const_iterator maptable_itr;
std::unordered_map<int, std::wstring>::const_iterator nodemaptable_itr;
int wordid;
vt_node_out_edge_indices.resize(info.numnodes);
for (size_t j = 0; j < info.numedges; j++)
{
            // an edge with !NULL pointing to something other than <s>
// this code make sure if you always start from <s> in the sampled path.
            // this code makes sure you always start from <s> in the sampled path.
// mask here: we delay the processing in EMBRsamplepaths controlled by flag: enforceValidPathEMBR
// if (edges2[j].S == 0 && nodes[edges2[j].E].wid != 1) continue;
vt_node_out_edge_indices[edges2[j].S].push_back(j);
}
is_special_words.resize(info.numnodes);
for (size_t i = 0; i < info.numnodes; i++)
{
if (specialwordids.find(int(nodes[i].wid)) != specialwordids.end()) is_special_words[i] = true;
else is_special_words[i] = false;
if (specialwordids.find(int(nodes[i].wid)) != specialwordids.end())
is_special_words[i] = true;
else
is_special_words[i] = false;
if (!id2wordmapping.empty())
{
wordid = int(nodes[i].wid);
maptable_itr = id2wordmapping.find(wordid);
if (maptable_itr != id2wordmapping.end())
{
if (id2wordmap4node.find(wordid) == id2wordmap4node.end())
{
id2wordmap4node.insert(std::pair<int, std::wstring>(maptable_itr->first, maptable_itr->second));
}
}
else //in theory, never happens
{
fprintf(stderr, "no mapping id2word for %d \n", wordid);
id2wordmap4node.insert(std::pair<int, std::wstring>(wordid, std::to_wstring(wordid)));
}
}
}
ProcessV2Lattice(spunit, info, uniquededgedatatokens, idmap);
ProcessV2Lattice(spunit, info, uniquededgedatatokens, idmap);
}
// parallel versions (defined in parallelforwardbackward.cpp)
class parallelstate
{
@ -1263,13 +1296,13 @@ public:
// Note: logLLs and posteriors may be the same matrix (aliased).
double forwardbackward(parallelstate& parallelstate, const class msra::math::ssematrixbase& logLLs, const class msra::asr::simplesenonehmm& hmms,
class msra::math::ssematrixbase& result, class msra::math::ssematrixbase& errorsignalbuf,
const float lmf, const float wp, const float amf, const float boostingfactor, const bool sMBRmode, const bool EMBR, const std::string EMBRUnit, const size_t numPathsEMBR, const bool enforceValidPathEMBR, const std::string getPathMethodEMBR, const std::string showWERMode,
const float lmf, const float wp, const float amf, const float boostingfactor, const bool sMBRmode, const bool EMBR, const std::string EMBRUnit, const size_t numPathsEMBR, const bool enforceValidPathEMBR, const std::string getPathMethodEMBR, const std::string showWERMode,
const bool excludeSpecialWords, const bool wordNbest, const bool useAccInNbest, const float accWeightInNbest, const size_t numRawPathsEMBR,
array_ref<size_t> uids, std::vector<size_t> wids, const_array_ref<size_t> bounds = const_array_ref<size_t>(),
array_ref<size_t> uids, std::vector<size_t> wids, const_array_ref<size_t> bounds = const_array_ref<size_t>(),
const_array_ref<htkmlfwordsequence::word> transcript = const_array_ref<htkmlfwordsequence::word>(), const std::vector<float>& transcriptunigrams = std::vector<float>()) const;
void EMBRerrorsignal(parallelstate &parallelstate,
const edgealignments &thisedgealignments, std::vector<double>& edge_weights, msra::math::ssematrixbase &errorsignal) const;
void EMBRerrorsignal(parallelstate& parallelstate,
const edgealignments& thisedgealignments, std::vector<double>& edge_weights, msra::math::ssematrixbase& errorsignal) const;
std::wstring key; // (keep our own name (key) so we can identify ourselves for diagnostics messages)
const wchar_t* getkey() const
{
@ -1294,14 +1327,14 @@ public:
// set of phoneme mappings
typedef std::vector<unsigned int> symbolidmapping;
template <class SYMMAP>
static void GetSymList(symbolidmapping& idmap, const std::wstring& symlistpath, const SYMMAP& symmap)
static void GetSymList(symbolidmapping& idmap, const std::wstring& symlistpath, const SYMMAP& symmap)
{
std::vector<char> textbuffer;
auto lines = msra::files::fgetfilelines(symlistpath, textbuffer);
// establish mapping of each entry to the corresponding id in 'symmap'; this should fail if the symbol is not found
idmap.reserve(lines.size() + 1); // last entry is a fake entry to return the /sp/ unit
std::string symstring, tosymstring;
foreach_index(i, lines)
foreach_index (i, lines)
{
char* sym = lines[i];
// parse out a mapping (log SPC phys)
@ -1317,17 +1350,20 @@ public:
}
else
{
if ((size_t)i != idmap.size()) // non-mappings must come first (this is to ensure compatibility with pre-mapping files)
if ((size_t) i != idmap.size()) // non-mappings must come first (this is to ensure compatibility with pre-mapping files)
RuntimeError("GetSymList: mixed up symlist file");
symstring = sym; // (reusing existing object to avoid malloc)
idmap.push_back((unsigned int)getid(symmap, symstring));
idmap.push_back((unsigned int) getid(symmap, symstring));
}
}
// append a fixed-position entry: last entry means /sp/
idmap.push_back((unsigned int)getid(symmap, "sp"));
idmap.push_back((unsigned int) getid(symmap, "sp"));
}
private:
//linquan: id2word mapping table used for eMBR CER/WER calculation
//std::unordered_map<int, std::wstring> id2wordmapping;
const std::unordered_map<std::string, size_t>& modelsymmap; // [triphone name] -> index used in model
// set of lattice archive files referenced
// Note that .toc files can be concatenated, i.e. one .toc file can reference multiple archive files.
@ -1342,7 +1378,7 @@ private:
archivepaths.push_back(path);
return i;
}
mutable std::vector<symbolidmapping> symmaps; // [archiveindex][unit] -> global unit map
template <class SYMMAP>
static size_t getid(const SYMMAP& symmap, const std::string& key)
@ -1364,7 +1400,6 @@ private:
if (verbosity > 0)
fprintf(stderr, "getcachedidmap: reading '%S'\n", symlistpath.c_str());
archive::GetSymList(idmap, symlistpath, symmap);
}
return idmap;
}
@ -1451,7 +1486,7 @@ public:
char c;
uint64_t offset;
#ifdef _WIN32
if (sscanf_s(q, "[%I64u]%c", &offset, &c, (unsigned int)sizeof(c)) != 1)
if (sscanf_s(q, "[%I64u]%c", &offset, &c, (unsigned int) sizeof(c)) != 1)
#else
if (sscanf(q, "[%" PRIu64 "]%c", &offset, &c) != 1)
@ -1492,7 +1527,8 @@ public:
// Lattices will have unit ids updated according to the modelsymmap.
// V1 lattices will be converted. 'spsenoneid' is used in the conversion for optimizing storing 0-frame /sp/ aligns.
void getlattice(const std::wstring& key, lattice& L,
std::set<int>& specialwordids, size_t expectedframes = SIZE_MAX) const
std::unordered_map<int, std::wstring>& id2wordmapping, std::set<int>& specialwordids,
size_t expectedframes = SIZE_MAX) const
{
auto iter = toc.find(key);
if (iter == toc.end())
@ -1519,7 +1555,7 @@ public:
// seek to start
fsetpos(f, offset);
// get it
L.fread(f, idmap, spunit, specialwordids);
L.fread(f, idmap, spunit, id2wordmapping, specialwordids);
L.setverbosity(verbosity);
#ifdef HACK_IN_SILENCE // hack to simulate DEL in the lattice
const size_t silunit = getid(modelsymmap, "sil");
@ -1553,8 +1589,7 @@ public:
// - dump to stdout
// - merge two lattices (for merging numer into denom lattices)
static void convert(const std::wstring& intocpath, const std::wstring& intocpath2, const std::wstring& outpath,
const msra::asr::simplesenonehmm& hset, std::set<int>& specialwordids);
const msra::asr::simplesenonehmm& hset, std::unordered_map<int, std::wstring>& id2wordmapping, std::set<int>& specialwordids);
};
};
};
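For reference, the per-node lookup that ProcessV2EMBRLattice performs above reduces to a map lookup with a numeric-text fallback. A minimal sketch of that logic in isolation (the helper name is hypothetical and not part of the header):

    #include <string>
    #include <unordered_map>

    // Resolve a lattice word id to its surface form; fall back to the id rendered as text
    // when no mapping exists (mirrors the fallback branch in ProcessV2EMBRLattice).
    static std::wstring ResolveWord(int wordid, const std::unordered_map<int, std::wstring>& id2word)
    {
        auto it = id2word.find(wordid);
        return it != id2word.end() ? it->second : std::to_wstring(wordid);
    }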

View file

@ -36,12 +36,16 @@ class latticesource
{
const msra::lattices::archive numlattices, denlattices;
int verbosity;
//linquan
std::unordered_map<int, std::wstring> id2wordmapping;
public:
typedef msra::dbn::latticepair latticepair;
latticesource(std::pair<std::vector<std::wstring>, std::vector<std::wstring>> latticetocs, const std::unordered_map<std::string, size_t>& modelsymmap, std::wstring RootPathInToc)
: numlattices(latticetocs.first, modelsymmap, RootPathInToc), denlattices(latticetocs.second, modelsymmap, RootPathInToc), verbosity(0)
{
id2wordmapping.insert(std::pair<int, std::wstring>(0, L"0"));
}
bool empty() const
@ -62,10 +66,11 @@ public:
#endif
}
void getlattices(const std::wstring& key, std::shared_ptr<const latticepair>& L, size_t expectedframes, std::set<int>& specialwordids) const
void getlattices(const std::wstring& key, std::shared_ptr<const latticepair>& L, size_t expectedframes,
std::unordered_map<int, std::wstring>& id2wordmapping1, std::set<int> & specialwordids) const
{
std::shared_ptr<latticepair> LP(new latticepair);
denlattices.getlattice(key, LP->second, specialwordids, expectedframes); // this loads the lattice from disk, using the existing L.second object
denlattices.getlattice(key, LP->second, id2wordmapping1, specialwordids, expectedframes); // this loads the lattice from disk, using the existing L.second object
L = LP;
}
@ -75,5 +80,11 @@ public:
numlattices.setverbosity(veb);
denlattices.setverbosity(veb);
}
void setid2wordmapping(std::unordered_map<int, std::wstring>& mapping)
{
this->id2wordmapping.clear();
this->id2wordmapping = mapping;
}
};
} }
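A minimal usage sketch of the extended interface above, assuming the enclosing msra::dbn namespace and already-built TOC paths, model symbol map, and id-to-word table; every variable here is a placeholder for whatever the reader sets up, and this fragment is illustrative rather than compilable on its own:

    // Sketch only: wire an id->word table into a latticesource and fetch one denominator lattice.
    msra::dbn::latticesource lattices(latticetocs, modelsymmap, RootPathInToc);
    lattices.setid2wordmapping(id2wordmapping);       // table used later for EMBR WER/CER reporting
    std::shared_ptr<const msra::dbn::latticepair> L;
    lattices.getlattices(key, L, expectedframes, id2wordmapping, specialwordids);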

View file

@ -83,6 +83,7 @@
<ItemDefinitionGroup Condition="$(GpuBuild)">
<ClCompile>
<AdditionalIncludeDirectories>%(AdditionalIncludeDirectories);$(CudaInclude)</AdditionalIncludeDirectories>
<TreatWarningAsError Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</TreatWarningAsError>
</ClCompile>
<Link>
<AdditionalLibraryDirectories>%(AdditionalLibraryDirectories);$(CudaLibPath)</AdditionalLibraryDirectories>

View file

@ -841,7 +841,7 @@ public:
latticePair->second.ReadFromBuffer(buffer, m_idmap, m_idmap.back());
assert((currentLabelSeq.tEnd - currentLabelSeq.tBegin) == latticePair->second.info.numframes);
//assert((currentLabelSeq.tEnd - currentLabelSeq.tBegin) == latticePair->second.info.numframes);
// The size of the vector is small -- the number of sequences in the minibatch.
// Iteration likely will be faster than the overhead with unordered_map
for (size_t pos = 0; pos < labelSequencesMap.size();pos++)

View file

@ -44,18 +44,30 @@ typedef unsigned int UNINT32;
int msra::numa::node_override = -1; // for numahelpers.h
#endif
namespace msra { namespace lm {
namespace msra
{
namespace lm
{
/*static*/ const mgram_map::index_t mgram_map::nindex = (mgram_map::index_t) -1; // invalid index
}
}
namespace msra { namespace asr {
/*static*/ std::unordered_map<std::wstring, unsigned int> htkfeatreader::parsedpath::archivePathStringMap;
/*static*/ std::vector<std::wstring> htkfeatreader::parsedpath::archivePathStringVector;
}}
namespace msra
{
namespace asr
{
/*static*/ std::unordered_map<std::wstring, unsigned int> htkfeatreader::parsedpath::archivePathStringMap;
/*static*/ std::vector<std::wstring> htkfeatreader::parsedpath::archivePathStringVector;
}
}
namespace Microsoft { namespace MSR { namespace CNTK {
namespace Microsoft
{
namespace MSR
{
namespace CNTK
{
// Create a Data Reader
//DATAREADER_API IDataReader* DataReaderFactory(void)
@ -101,9 +113,9 @@ void HTKMLFReader<ElemType>::InitFromConfig(const ConfigRecordType& readerConfig
}
}
void readwordidmap(const std::wstring &pathname, std::unordered_map<std::string, int>& wordidmap, int start_id)
void readwordidmap(const std::wstring& pathname, std::unordered_map<std::string, int>& wordidmap, int start_id)
{
std::unordered_map<std::string, int>::iterator mp_itr;
std::unordered_map<std::wstring, int>::iterator mp_itr;
auto_file_ptr f(fopenOrDie(pathname, L"rbS"));
fprintf(stderr, "readwordidmap: reading %ls \n", pathname.c_str());
char buf[1024];
@ -119,13 +131,81 @@ void readwordidmap(const std::wstring &pathname, std::unordered_map<std::string,
}
if (wordidmap.find(std::string(word)) == wordidmap.end())
{
wordidmap.insert(pair<std::string, int>(string(word),start_id++));
wordidmap.insert(pair<std::string, int>(string(word), start_id++));
}
}
fclose(f);
}
void readwordidmap2(const std::wstring& pathname, std::unordered_map<std::wstring, std::wstring>& wordidmap)
{
std::unordered_map<std::wstring, int>::iterator mp_itr;
auto_file_ptr f(fopenOrDie(pathname, L"rtS, ccs=UTF-8"));
fprintf(stderr, "readwordidmap: reading %ls \n", pathname.c_str());
std::wstring buffer;
std::wstring wordid;
std::wstring word;
while (!feof(f))
{
buffer = fgetlinew(f);
/* size_t posstart = buffer.find_first_of('\t');
if (posstart == std::wstring::npos)
{
posstart = buffer.find_first_of(' ');
}
size_t posend = buffer.find_last_of('\t');
if (posend == std::wstring::npos)
{
posend = buffer.find_last_of(' ');
}
wordid = buffer.substr(0, posstart);
word = buffer.substr(posend + 1, buffer.length());*/
size_t pos = buffer.find(' ');
wordid = buffer.substr(0, pos);
word = buffer.substr(pos + 1, buffer.length());
if (wordid == L"" || word == L"")
continue;
//if (wordidmap.find(std::wstring(wordid)) == wordidmap.end())
{
wordidmap.insert(std::pair<std::wstring, std::wstring>(std::wstring(wordid), std::wstring(word)));
}
buffer.clear();
}
fclose(f);
}
std::unordered_map<int, std::wstring> CombineMappingTable(const std::unordered_map<std::string, int> trainwordidmap, const std::unordered_map<std::wstring, std::wstring>& wordidmap)
{
std::unordered_map<int, std::wstring> combinedmapping;
std::unordered_map<std::string, int>::const_iterator idmap_itr;
std::unordered_map<std::wstring, std::wstring>::const_iterator maptable_itr;
std::wstring id;
std::wstring wid;
std::wstring word;
for (idmap_itr = trainwordidmap.begin(); idmap_itr != trainwordidmap.end(); ++idmap_itr)
{
maptable_itr = wordidmap.find(s2ws(idmap_itr->first));
if (maptable_itr != wordidmap.end())
combinedmapping.insert(std::pair<int, std::wstring>(idmap_itr->second, maptable_itr->second));
else
{
//fprintf(stderr, "no mapping id for %ls %d \n", s2ws(idmap_itr->first).c_str(), (int) idmap_itr->second);
combinedmapping.insert(std::pair<int, std::wstring>(idmap_itr->second, s2ws(idmap_itr->first)));
}
}
return combinedmapping;
}
// Load all input and output data.
// Note that the term 'features' implies real-valued quantities and
// 'labels' implies categorical quantities, irrespective of whether they
@ -350,9 +430,12 @@ void HTKMLFReader<ElemType>::PrepareForTrainingOrTesting(const ConfigRecordType&
if (readerConfig.Exists(L"randomize"))
{
wstring randomizeString = readerConfig.CanBeString(L"randomize") ? readerConfig(L"randomize") : wstring();
if (EqualCI(randomizeString, L"none")) randomize = randomizeNone;
else if (EqualCI(randomizeString, L"auto")) randomize = randomizeAuto;
else randomize = readerConfig(L"randomize"); // TODO: could this not just be randomizeString?
if (EqualCI(randomizeString, L"none"))
randomize = randomizeNone;
else if (EqualCI(randomizeString, L"auto"))
randomize = randomizeAuto;
else
randomize = readerConfig(L"randomize"); // TODO: could this not just be randomizeString?
}
m_frameMode = readerConfig(L"frameMode", true);
@ -373,7 +456,7 @@ void HTKMLFReader<ElemType>::PrepareForTrainingOrTesting(const ConfigRecordType&
if (readMethod == L"blockRandomize" && randomize == randomizeNone)
InvalidArgument("'randomize' cannot be 'none' when 'readMethod' is 'blockRandomize'.");
if (readMethod == L"rollingWindow" && numExpandToUtt>0)
if (readMethod == L"rollingWindow" && numExpandToUtt > 0)
RuntimeError("rollingWindow reader does not support expandToUtt. Change to blockRandomize.");
// read all input files (from multiple inputs)
@ -391,7 +474,7 @@ void HTKMLFReader<ElemType>::PrepareForTrainingOrTesting(const ConfigRecordType&
n++;
}
fprintf(stderr, " %lu entries\n", (unsigned long)n);
fprintf(stderr, " %lu entries\n", (unsigned long) n);
if (i == 0)
numFiles = n;
@ -410,7 +493,7 @@ void HTKMLFReader<ElemType>::PrepareForTrainingOrTesting(const ConfigRecordType&
// second, remove trailing slash if there is any
// TODO: when gcc -v is 4.9 or greater, this should be: std::regex_replace(rootpath, L"\\/+$", wstring());
int stringPos = 0;
for (stringPos = (int) (rootpath.length() - 1); stringPos >= 0; stringPos--)
for (stringPos = (int) (rootpath.length() - 1); stringPos >= 0; stringPos--)
{
if (rootpath[stringPos] != L'/')
{
@ -472,40 +555,40 @@ void HTKMLFReader<ElemType>::PrepareForTrainingOrTesting(const ConfigRecordType&
unigrampath = (const wstring&) readerConfig(L"unigram");
if (readerConfig.Exists(L"wordidmap"))
wordidmappath = (const wstring&)readerConfig(L"wordidmap");
wordidmappath = (const wstring&) readerConfig(L"wordidmap");
// load a unigram if needed (this is used for MMI training)
msra::lm::CSymbolSet unigramsymbols;
std::set<int> specialwordids;
std::vector<string> specialwords;
std::unordered_map<std::string, int> wordidmap;
std::unordered_map<std::wstring, std::wstring> wordidmap2;
std::unordered_map<int, std::wstring> id2wordmapping;
std::unordered_map<std::string, int>::iterator wordidmap_itr;
std::unique_ptr<msra::lm::CMGramLM> unigram;
size_t silencewordid = SIZE_MAX;
size_t startwordid = SIZE_MAX;
size_t endwordid = SIZE_MAX;
if (unigrampath != L"")
{
unigram.reset(new msra::lm::CMGramLM());
unigramsymbols["!NULL"];
unigramsymbols["<s>"];
unigramsymbols["</s>"];
unigramsymbols["!sent_start"];
unigramsymbols["!sent_end"];
unigramsymbols["!silence"];
unigram->read(unigrampath, unigramsymbols, false /*filterVocabulary--false will build the symbol map*/, 1 /*maxM--unigram only*/);
silencewordid = unigramsymbols["!silence"]; // give this an id (even if not in the LM vocabulary)
startwordid = unigramsymbols["<s>"];
endwordid = unigramsymbols["</s>"];
specialwordids.clear();
specialwordids.clear();
specialwordids.insert(unigramsymbols["<s>"]);
specialwordids.insert(unigramsymbols["</s>"]);
@ -533,12 +616,10 @@ void HTKMLFReader<ElemType>::PrepareForTrainingOrTesting(const ConfigRecordType&
        // this is to exclude the unknown words introduced into the lattice when merging the numerator lattice into the denominator lattice.
specialwordids.insert(0xfffff);
}
else if (wordidmappath != L"")
// if(true)
// if(true)
{
wordidmap.insert(pair<std::string, int>("!NULL", 0));
wordidmap.insert(pair<std::string, int>("<s>", 1));
@ -552,12 +633,19 @@ void HTKMLFReader<ElemType>::PrepareForTrainingOrTesting(const ConfigRecordType&
endwordid = 2;
int start_id = 6;
/*changed by Linquan*/
readwordidmap(wordidmappath, wordidmap, start_id);
const std::wstring mappingTable = L"D:\\Development\\TAMER\\eMBR\\CERCriteria\\Data2\\word2wid.dict.mapping.dict";
readwordidmap2(mappingTable, wordidmap2);
        // id -> word string mapping
id2wordmapping = CombineMappingTable(wordidmap, wordidmap2);
//temp code for debug
//id2wordmapping.insert(std::pair<int, std::wstring>(0, L"0"));
specialwordids.clear();
specialwords.clear();
specialwords.push_back("<s>");
specialwords.push_back("<s>");
specialwords.push_back("</s>");
specialwords.push_back("!NULL");
@ -581,7 +669,7 @@ void HTKMLFReader<ElemType>::PrepareForTrainingOrTesting(const ConfigRecordType&
specialwords.push_back("[SPN/]");
specialwords.push_back("[UNKNOWN/]");
specialwords.push_back(".]");
for (size_t i = 0; i < specialwords.size(); i++)
{
wordidmap_itr = wordidmap.find(specialwords[i]);
@ -590,7 +678,6 @@ void HTKMLFReader<ElemType>::PrepareForTrainingOrTesting(const ConfigRecordType&
        // this is to exclude the unknown words introduced into the lattice when merging the numerator lattice into the denominator lattice.
specialwordids.insert(0xfffff);
}
if (!unigram && latticetocs.second.size() > 0)
@ -605,10 +692,10 @@ void HTKMLFReader<ElemType>::PrepareForTrainingOrTesting(const ConfigRecordType&
msra::asr::htkfeatreader::parsedpath ppath(infilesmulti[0][i]);
const wstring ppathStr = (wstring) ppath;
// delete extension (or not if none)
// TODO: when gcc -v is 4.9 or greater, this should be: regex_replace((wstring)ppath, wregex(L"\\.[^\\.\\\\/:]*$"), wstring());
// delete extension (or not if none)
// TODO: when gcc -v is 4.9 or greater, this should be: regex_replace((wstring)ppath, wregex(L"\\.[^\\.\\\\/:]*$"), wstring());
int stringPos = 0;
for (stringPos = (int) ppathStr.length() - 1; stringPos >= 0; stringPos--)
for (stringPos = (int) ppathStr.length() - 1; stringPos >= 0; stringPos--)
{
if (ppathStr[stringPos] == L'.' || ppathStr[stringPos] == L'\\' || ppathStr[stringPos] == L'/' || ppathStr[stringPos] == L':')
{
@ -616,10 +703,11 @@ void HTKMLFReader<ElemType>::PrepareForTrainingOrTesting(const ConfigRecordType&
}
}
if (ppathStr[stringPos] == L'.') {
if (stringPos >= 0 && ppathStr[stringPos] == L'.')
{
restrictmlftokeys.insert(ppathStr.substr(0, stringPos));
}
else
else
{
restrictmlftokeys.insert(ppathStr);
}
@ -637,7 +725,7 @@ void HTKMLFReader<ElemType>::PrepareForTrainingOrTesting(const ConfigRecordType&
foreach_index (i, mlfpathsmulti)
{
msra::asr::htkmlfreader<msra::asr::htkmlfentry, msra::lattices::lattice::htkmlfwordsequence>
labels(mlfpathsmulti[i], restrictmlftokeys, statelistpaths[i], wordidmap, htktimetoframe); // label MLF
labels(mlfpathsmulti[i], restrictmlftokeys, statelistpaths[i], wordidmap, htktimetoframe); // label MLF
// get the temp file name for the page file
// Make sure 'msra::asr::htkmlfreader' type has a move constructor
@ -656,11 +744,13 @@ void HTKMLFReader<ElemType>::PrepareForTrainingOrTesting(const ConfigRecordType&
// now get the frame source. This has better randomization and doesn't create temp files
bool useMersenneTwisterRand = readerConfig(L"useMersenneTwisterRand", false);
m_frameSource.reset(new msra::dbn::minibatchutterancesourcemulti(useMersenneTwisterRand, infilesmulti, labelsmulti, specialwordids, m_featDims, m_labelDims,
numContextLeft, numContextRight, randomize,
*m_lattices, m_latticeMap, m_frameMode,
m_frameSource.reset(new msra::dbn::minibatchutterancesourcemulti(useMersenneTwisterRand, infilesmulti, labelsmulti, id2wordmapping, specialwordids,
m_featDims, m_labelDims, numContextLeft, numContextRight, randomize,
*m_lattices, m_latticeMap, m_frameMode,
m_expandToUtt, m_maxUtteranceLength, m_truncated));
m_frameSource->setverbosity(m_verbosity);
m_lattices->setid2wordmapping(id2wordmapping);
}
else if (EqualCI(readMethod, L"rollingWindow"))
{
@ -725,8 +815,8 @@ void HTKMLFReader<ElemType>::PrepareForTrainingOrTesting(const ConfigRecordType&
const bool mayhavenoframe = false;
int addEnergy = 0;
m_frameSource.reset(new msra::dbn::minibatchframesourcemulti(infilesmulti, labelsmulti, m_featDims, m_labelDims,
numContextLeft, numContextRight, randomize,
m_frameSource.reset(new msra::dbn::minibatchframesourcemulti(infilesmulti, labelsmulti, m_featDims, m_labelDims,
numContextLeft, numContextRight, randomize,
pagePaths, mayhavenoframe, addEnergy));
m_frameSource->setverbosity(m_verbosity);
}
@ -773,7 +863,7 @@ void HTKMLFReader<ElemType>::PrepareForWriting(const ConfigRecordType& readerCon
size_t windowFrames = contextWindow[0];
if (windowFrames % 2 == 0)
{
RuntimeError("augmentationextent: neighbor expansion of input features to %d not symmetrical", (int)windowFrames);
RuntimeError("augmentationextent: neighbor expansion of input features to %d not symmetrical", (int) windowFrames);
}
size_t context = windowFrames / 2; // extend each side by this
@ -1030,7 +1120,7 @@ void HTKMLFReader<ElemType>::StartMinibatchLoopToWrite(size_t mbSize, size_t /*e
template <class ElemType>
bool HTKMLFReader<ElemType>::GetMinibatch4SE(std::vector<shared_ptr<const msra::dbn::latticepair>>& latticeinput,
vector<size_t>& uids, vector<size_t>& wids, vector<short>& nws, vector<size_t>& boundaries, vector<size_t>& extrauttmap)
vector<size_t>& uids, vector<size_t>& wids, vector<short>& nws, vector<size_t>& boundaries, vector<size_t>& extrauttmap)
{
if (m_trainOrTest)
{
@ -1043,8 +1133,8 @@ bool HTKMLFReader<ElemType>::GetMinibatch4SE(std::vector<shared_ptr<const msra::
}
template <class ElemType>
bool HTKMLFReader<ElemType>::GetMinibatch4SEToTrainOrTest(std::vector<shared_ptr<const msra::dbn::latticepair>>& latticeinput,
std::vector<size_t>& uids, std::vector<size_t>& wids, std::vector<short>& nws, std::vector<size_t>& boundaries, std::vector<size_t>& extrauttmap)
std::vector<size_t>& uids, std::vector<size_t>& wids, std::vector<short>& nws, std::vector<size_t>& boundaries, std::vector<size_t>& extrauttmap)
{
latticeinput.clear();
uids.clear();
@ -1245,7 +1335,7 @@ bool HTKMLFReader<ElemType>::GetMinibatchToTrainOrTest(StreamMinibatchInputs& ma
for (size_t des = 0; des < m_numSeqsPerMB; des++) // try to found a slot
{
if (framenum + m_numValidFrames[des] < m_mbNumTimeSteps)
{
{
// found !
m_extraSeqsPerMB.push_back(des);
if (m_latticeBufferMultiUtt[src] != nullptr)
@ -1338,8 +1428,8 @@ bool HTKMLFReader<ElemType>::GetMinibatchToTrainOrTest(StreamMinibatchInputs& ma
// add utterance to MBLayout
assert(m_numFramesToProcess[i] > startFr || (m_noData && m_numFramesToProcess[i] == startFr));
if (m_numFramesToProcess[i] > startFr)
{ // in an edge case (m_noData), startFr is at end
m_pMBLayout->AddSequence(NEW_SEQUENCE_ID, i, -(ptrdiff_t)startFr, m_numFramesToProcess[i] - startFr);
{ // in an edge case (m_noData), startFr is at end
m_pMBLayout->AddSequence(NEW_SEQUENCE_ID, i, -(ptrdiff_t) startFr, m_numFramesToProcess[i] - startFr);
}
if (startFr + m_mbNumTimeSteps < m_numFramesToProcess[i]) // end of this minibatch does not reach until end of utterance
@ -1380,9 +1470,9 @@ bool HTKMLFReader<ElemType>::GetMinibatchToTrainOrTest(StreamMinibatchInputs& ma
for (size_t j = startFr, k = 0; j < endFr; j++, k++) // column major, so iterate columns
{
// copy over the entire column at once, need to do this because SSEMatrix may have gaps at the end of the columns
memcpy_s(&m_featuresBufferMultiIO[id].get()[(k * m_numSeqsPerMB + i) * dim],
sizeof(ElemType) * dim,
&m_featuresBufferMultiUtt[i].get()[j * dim + m_featuresStartIndexMultiUtt[id + i * numOfFea]],
memcpy_s(&m_featuresBufferMultiIO[id].get()[(k * m_numSeqsPerMB + i) * dim],
sizeof(ElemType) * dim,
&m_featuresBufferMultiUtt[i].get()[j * dim + m_featuresStartIndexMultiUtt[id + i * numOfFea]],
sizeof(ElemType) * dim);
}
}
@ -1392,7 +1482,7 @@ bool HTKMLFReader<ElemType>::GetMinibatchToTrainOrTest(StreamMinibatchInputs& ma
{
for (int d = 0; d < dim; d++)
{
m_featuresBufferMultiIO[id].get()[(k * m_numSeqsPerMB + i) * dim + d] =
m_featuresBufferMultiIO[id].get()[(k * m_numSeqsPerMB + i) * dim + d] =
m_featuresBufferMultiUtt[i].get()[j * dim + d + m_featuresStartIndexMultiUtt[id + i * numOfFea]];
}
}
@ -1413,7 +1503,7 @@ bool HTKMLFReader<ElemType>::GetMinibatchToTrainOrTest(StreamMinibatchInputs& ma
{
for (int d = 0; d < dim; d++)
{
m_labelsBufferMultiIO[id].get()[(k * m_numSeqsPerMB + i) * dim + d] =
m_labelsBufferMultiIO[id].get()[(k * m_numSeqsPerMB + i) * dim + d] =
m_labelsBufferMultiUtt[i].get()[j * dim + d + m_labelsStartIndexMultiUtt[id + i * numOfLabel]];
}
}
@ -1453,9 +1543,9 @@ bool HTKMLFReader<ElemType>::GetMinibatchToTrainOrTest(StreamMinibatchInputs& ma
for (size_t j = startFr, k = 0; j < endFr; j++, k++) // column major, so iterate columns
{
// copy over the entire column at once, need to do this because SSEMatrix may have gaps at the end of the columns
memcpy_s(&m_featuresBufferMultiIO[id].get()[(k * m_numSeqsPerMB + i) * dim],
sizeof(ElemType) * dim,
&m_featuresBufferMultiUtt[i].get()[j * dim + m_featuresStartIndexMultiUtt[id + i * numOfFea]],
memcpy_s(&m_featuresBufferMultiIO[id].get()[(k * m_numSeqsPerMB + i) * dim],
sizeof(ElemType) * dim,
&m_featuresBufferMultiUtt[i].get()[j * dim + m_featuresStartIndexMultiUtt[id + i * numOfFea]],
sizeof(ElemType) * dim);
}
}
@ -1465,7 +1555,7 @@ bool HTKMLFReader<ElemType>::GetMinibatchToTrainOrTest(StreamMinibatchInputs& ma
{
for (int d = 0; d < dim; d++)
{
m_featuresBufferMultiIO[id].get()[(k * m_numSeqsPerMB + i) * dim + d] =
m_featuresBufferMultiIO[id].get()[(k * m_numSeqsPerMB + i) * dim + d] =
m_featuresBufferMultiUtt[i].get()[j * dim + d + m_featuresStartIndexMultiUtt[id + i * numOfFea]];
}
}
@ -1486,7 +1576,7 @@ bool HTKMLFReader<ElemType>::GetMinibatchToTrainOrTest(StreamMinibatchInputs& ma
{
for (int d = 0; d < dim; d++)
{
m_labelsBufferMultiIO[id].get()[(k * m_numSeqsPerMB + i) * dim + d] =
m_labelsBufferMultiIO[id].get()[(k * m_numSeqsPerMB + i) * dim + d] =
m_labelsBufferMultiUtt[i].get()[j * dim + d + m_labelsStartIndexMultiUtt[id + i * numOfLabel]];
}
}
@ -1530,9 +1620,9 @@ bool HTKMLFReader<ElemType>::GetMinibatchToTrainOrTest(StreamMinibatchInputs& ma
for (size_t t = startT, fr = 0; t < endT; t++, fr++) // column major, so iterate columns
{
// copy over the entire column at once, need to do this because SSEMatrix may have gaps at the end of the columns (for SSE alignment)
memcpy_s(&m_featuresBufferMultiIO[id].get()[(t * m_numSeqsPerMB + i) * dim],
sizeof(ElemType) * dim,
&m_featuresBufferMultiUtt[i].get()[fr * dim + m_featuresStartIndexMultiUtt[id + i * numOfFea]],
memcpy_s(&m_featuresBufferMultiIO[id].get()[(t * m_numSeqsPerMB + i) * dim],
sizeof(ElemType) * dim,
&m_featuresBufferMultiUtt[i].get()[fr * dim + m_featuresStartIndexMultiUtt[id + i * numOfFea]],
sizeof(ElemType) * dim);
}
}
@ -1542,7 +1632,7 @@ bool HTKMLFReader<ElemType>::GetMinibatchToTrainOrTest(StreamMinibatchInputs& ma
{
for (int d = 0; d < dim; d++)
{
m_featuresBufferMultiIO[id].get()[(t * m_numSeqsPerMB + i) * dim + d] =
m_featuresBufferMultiIO[id].get()[(t * m_numSeqsPerMB + i) * dim + d] =
m_featuresBufferMultiUtt[i].get()[fr * dim + d + m_featuresStartIndexMultiUtt[id + i * numOfFea]];
}
}
@ -1556,7 +1646,7 @@ bool HTKMLFReader<ElemType>::GetMinibatchToTrainOrTest(StreamMinibatchInputs& ma
{
for (int d = 0; d < dim; d++)
{
m_labelsBufferMultiIO[id].get()[(t * m_numSeqsPerMB + i) * dim + d] =
m_labelsBufferMultiIO[id].get()[(t * m_numSeqsPerMB + i) * dim + d] =
m_labelsBufferMultiUtt[i].get()[fr * dim + d + m_labelsStartIndexMultiUtt[id + i * numOfLabel]];
}
}
@ -1684,7 +1774,7 @@ void HTKMLFReader<ElemType>::fillOneUttDataforParallelmode(StreamMinibatchInputs
{
for (int d = 0; d < dim; d++)
{
m_labelsBufferMultiIO[id].get()[(k * m_numSeqsPerMB + channelIndex) * dim + d] =
m_labelsBufferMultiIO[id].get()[(k * m_numSeqsPerMB + channelIndex) * dim + d] =
m_labelsBufferMultiUtt[sourceChannelIndex].get()[j * dim + d + m_labelsStartIndexMultiUtt[id + sourceChannelIndex * numOfLabel]];
}
}
@ -1731,8 +1821,7 @@ bool HTKMLFReader<ElemType>::GetMinibatchToWrite(StreamMinibatchInputs& matrices
msra::dbn::matrix feat;
string featkind;
unsigned int sampperiod;
msra::util::attempt(5, [&]()
{
msra::util::attempt(5, [&]() {
reader.read(path, featkind, sampperiod, feat); // whole file read as columns of feature vectors
});
@ -1741,7 +1830,7 @@ bool HTKMLFReader<ElemType>::GetMinibatchToWrite(StreamMinibatchInputs& matrices
nfr = feat.cols();
}
else if (feat.cols() == 1 && nfr > 1)
{
{
// This broadcasts a vector to be multiple columns, as needed for i-vector support
msra::dbn::matrix feat_col(feat);
feat.resize(feat.rows(), nfr);
@ -1965,7 +2054,6 @@ bool HTKMLFReader<ElemType>::ReNewBufferForMultiIO(size_t i)
m_nwsBufferMultiUtt[i].clear();
m_nwsBufferMultiUtt[i] = m_mbiter->nwords();
}
m_processedFrame[i] = 0;
@ -2186,7 +2274,7 @@ unique_ptr<CUDAPageLockedMemAllocator>& HTKMLFReader<ElemType>::GetCUDAAllocator
if (m_cudaAllocator == nullptr)
{
m_cudaAllocator.reset(new CUDAPageLockedMemAllocator(deviceID));
}
}
return m_cudaAllocator;
}
@ -2197,26 +2285,23 @@ std::shared_ptr<ElemType> HTKMLFReader<ElemType>::AllocateIntermediateBuffer(int
{
// Use pinned memory for GPU devices for better copy performance
size_t totalSize = sizeof(ElemType) * numElements;
return std::shared_ptr<ElemType>((ElemType*) GetCUDAAllocator(deviceID)->Malloc(totalSize),
[this, deviceID](ElemType* p)
{
return std::shared_ptr<ElemType>((ElemType*) GetCUDAAllocator(deviceID)->Malloc(totalSize),
[this, deviceID](ElemType* p) {
this->GetCUDAAllocator(deviceID)->Free((char*) p);
});
}
else
{
return std::shared_ptr<ElemType>(new ElemType[numElements],
[](ElemType* p)
{
return std::shared_ptr<ElemType>(new ElemType[numElements],
[](ElemType* p) {
delete[] p;
});
}
}
template class HTKMLFReader<float>;
template class HTKMLFReader<double>;
} } }
}
}
}
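As a concrete illustration of the mapping plumbing added above: readwordidmap assigns integer ids to training words, readwordidmap2 loads an external word-to-surface-form table, and CombineMappingTable joins the two into an id-to-surface-form map, falling back to the word itself when no entry exists. A self-contained sketch of that join with toy data, simplified to wide-string keys (the real function takes narrow training words and converts them with s2ws):

    #include <string>
    #include <unordered_map>

    // Toy stand-in for CombineMappingTable: join word->id with word->surface form into id->surface form.
    static std::unordered_map<int, std::wstring> Combine(
        const std::unordered_map<std::wstring, int>& word2id,               // from the training word list
        const std::unordered_map<std::wstring, std::wstring>& word2surface) // from the external mapping file
    {
        std::unordered_map<int, std::wstring> id2surface;
        for (const auto& kv : word2id)
        {
            auto it = word2surface.find(kv.first);
            // fall back to the word itself when the external table has no entry
            id2surface.emplace(kv.second, it != word2surface.end() ? it->second : kv.first);
        }
        return id2surface;
    }

    int main()
    {
        std::unordered_map<std::wstring, int> word2id = { { L"HELLO", 6 }, { L"WORLD", 7 } };
        std::unordered_map<std::wstring, std::wstring> word2surface = { { L"HELLO", L"hello" } };
        auto id2surface = Combine(word2id, word2surface); // 6 -> L"hello", 7 -> L"WORLD" (fallback)
        return (int) id2surface.size();
    }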

View file

@ -91,7 +91,7 @@
<PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;_USRDLL;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>true</SDLCheck>
<AdditionalOptions>/d2Zi+ %(AdditionalOptions)</AdditionalOptions>
<TreatWarningAsError>true</TreatWarningAsError>
<TreatWarningAsError>false</TreatWarningAsError>
<AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..\..\common\include;..\..\Math</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release_NoOpt|x64'">..\..\common\include;..\..\Math</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release_CpuOnly|x64'">..\..\common\include;..\..\Math</AdditionalIncludeDirectories>

View file

@ -1117,20 +1117,16 @@ class htkmlfreader : public map<wstring, std::pair<vector<ENTRY>, vector<unsigne
{
// convert word to uppercase
if (strcmp(toks[6], "<s>") != 0
&& strcmp(toks[6], "</s>") != 0
&& strcmp(toks[6], "!sent_start") != 0
&& strcmp(toks[6], "!sent_end") != 0
&& strcmp(toks[6], "!silence") != 0)
{
for(size_t j = 0; j < strlen(toks[6]); j++)
{
if(toks[6][j] >= 'a' && toks[6][j] <= 'z')
{
toks[6][j] = toks[6][j] + 'A' - 'a';
}
}
}
if (strcmp(toks[6], "<s>") != 0 && strcmp(toks[6], "</s>") != 0 && strcmp(toks[6], "!sent_start") != 0 && strcmp(toks[6], "!sent_end") != 0 && strcmp(toks[6], "!silence") != 0)
{
for (size_t j = 0; j < strlen(toks[6]); j++)
{
if (toks[6][j] >= 'a' && toks[6][j] <= 'z')
{
toks[6][j] = toks[6][j] + 'A' - 'a';
}
}
}
const char* w = toks[6]; // the word name
// For some alignment MLF the sentence start and end are both represented by <s>, we change sentence end <s> to be </s>
if (i > s && strcmp(w, "<s>") == 0)
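The per-character uppercase loop reflowed above could equally be written with the standard library; a minimal equivalent sketch for ASCII tokens, shown only as an illustration and not as the reader's code:

    #include <algorithm>
    #include <cctype>
    #include <string>

    // Uppercase an ASCII token in place, equivalent to the explicit 'a'..'z' loop above.
    static void UppercaseAscii(std::string& word)
    {
        std::transform(word.begin(), word.end(), word.begin(),
                       [](unsigned char c) { return (char) std::toupper(c); });
    }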

View file

@ -405,8 +405,9 @@ void lattice::dedup()
// - empty ("") -> don't output, just check the format
// - dash ("-") -> dump lattice to stdout instead
/*static*/ void archive::convert(const std::wstring &intocpath, const std::wstring &intocpath2, const std::wstring &outpath,
const msra::asr::simplesenonehmm &hset, std::set<int>& specialwordids)
{
const msra::asr::simplesenonehmm &hset,
std::unordered_map<int, std::wstring> &id2wordmapping, std::set<int> &specialwordids)
{
const auto &modelsymmap = hset.getsymmap();
const std::wstring tocpath = outpath + L".toc";
@ -457,7 +458,7 @@ void lattice::dedup()
// fetch lattice --this performs any necessary format conversions already
lattice L;
archive.getlattice(key, L, specialwordids);
archive.getlattice(key, L, id2wordmapping, specialwordids);
lattice L2;
if (mergemode)
{
@ -467,7 +468,7 @@ void lattice::dedup()
skippedmerges++;
continue;
}
archive2.getlattice(key, L2, specialwordids);
archive2.getlattice(key, L2, id2wordmapping, specialwordids);
// merge it in
// This will connect each node with matching 1-phone context conditions; aimed at merging numer lattices.
L.removefinalnull(); // get rid of that final !NULL headache
@ -508,20 +509,19 @@ void lattice::dedup()
invmodelsymmap[iter->second] = iter->first.c_str();
}
L.rebuildedges(false);
L.dump(stdout, [&](size_t i)
{
return invmodelsymmap[i];
});
L.dump(stdout, [&](size_t i) {
return invmodelsymmap[i];
});
}
} // end for (toclines)
if (skippedmerges > 0)
fprintf(stderr, "convert: %lu out of %lu merge operations skipped due to secondary lattice missing\n", (unsigned long)skippedmerges, (unsigned long)toclines.size());
fprintf(stderr, "convert: %lu out of %lu merge operations skipped due to secondary lattice missing\n", (unsigned long) skippedmerges, (unsigned long) toclines.size());
// write out the updated unit map
if (f && ftoc)
writeunitmap(symlistpath, modelsymmap);
fprintf(stderr, "converted %lu lattices\n", (unsigned long)toclines.size());
fprintf(stderr, "converted %lu lattices\n", (unsigned long) toclines.size());
}
// ---------------------------------------------------------------------------

View file

@ -44,6 +44,9 @@ class minibatchutterancesourcemulti : public minibatchsource
// const std::vector<std::unique_ptr<latticesource>> &lattices;
const latticesource &lattices;
//linquan
    std::unordered_map<int, std::wstring> id2wordmapping; // keeps the mapping from id to the real word/character
// Flag indicating whether to use Mersenne Twister random generator.
bool m_useMersenneTwister;
std::mt19937_64 m_rng;
@ -178,7 +181,8 @@ class minibatchutterancesourcemulti : public minibatchsource
}
// page in data for this chunk
// We pass in the feature info variables by ref which will be filled lazily upon first read
void requiredata(std::string &featkind, size_t &featdim, unsigned int &sampperiod, const latticesource &latticesource, std::set<int>& specialwordids, int verbosity = 0) const
void requiredata(std::string &featkind, size_t &featdim, unsigned int &sampperiod, const latticesource &latticesource,
std::unordered_map<int, std::wstring>& id2wordmapping, std::set<int> &specialwordids, int verbosity = 0) const
{
if (numutterances() == 0)
@ -209,7 +213,7 @@ class minibatchutterancesourcemulti : public minibatchsource
            reader.read(utteranceset[i].parsedpath, (const std::string &)featkind, sampperiod, uttframes, utteranceset[i].needsExpansion); // note: file info here used for checking only
// page in lattice data
if (!latticesource.empty())
latticesource.getlattices(utteranceset[i].key(), lattices[i], uttframes.cols(), specialwordids);
latticesource.getlattices(utteranceset[i].key(), lattices[i], uttframes.cols(), id2wordmapping, specialwordids);
}
if (verbosity)
{
@ -936,14 +940,14 @@ public:
// Pass empty labels to denote unsupervised training (so getbatch() will not return uids).
// This mode requires utterances with time stamps.
minibatchutterancesourcemulti(bool useMersenneTwister, const std::vector<std::vector<std::wstring>> &infiles, const std::vector<std::map<std::wstring, std::pair<std::vector<msra::asr::htkmlfentry>, std::vector<unsigned int>>>> &labels,
std::set<int>& specialwordids,
                                  std::unordered_map<int, std::wstring> &id2wordmapping, std::set<int> &specialwordids, /*deal with WER/CER specifically*/
std::vector<size_t> vdim, std::vector<size_t> udim, std::vector<size_t> leftcontext, std::vector<size_t> rightcontext, size_t randomizationrange,
const latticesource &lattices, const std::map<std::wstring, msra::lattices::lattice::htkmlfwordsequence> &allwordtranscripts, const bool framemode, std::vector<bool> expandToUtt,
const size_t maxUtteranceLength, const bool truncated)
: vdim(vdim), leftcontext(leftcontext), rightcontext(rightcontext), sampperiod(0), featdim(0), randomizationrange(randomizationrange), currentsweep(SIZE_MAX),
lattices(lattices), allwordtranscripts(allwordtranscripts), framemode(framemode), chunksinram(0), timegetbatch(0), verbosity(2), m_generatePhoneBoundaries(!lattices.empty()),
m_frameRandomizer(randomizedchunks, useMersenneTwister), expandToUtt(expandToUtt), m_useMersenneTwister(useMersenneTwister), maxUtteranceLength(maxUtteranceLength), truncated(truncated)
, specialwordids(specialwordids)
m_frameRandomizer(randomizedchunks, useMersenneTwister), expandToUtt(expandToUtt), m_useMersenneTwister(useMersenneTwister), maxUtteranceLength(maxUtteranceLength), truncated(truncated),
specialwordids(specialwordids), id2wordmapping(id2wordmapping)
// [v-hansu] change framemode (lattices.empty()) into framemode (false) to run utterance mode without lattice
// you also need to change another line, search : [v-hansu] comment out to run utterance mode without lattice
{
@ -1595,7 +1599,7 @@ private:
fprintf(stderr, "feature set %d: requirerandomizedchunk: paging in randomized chunk %d (frame range [%d..%d]), %d resident in RAM\n", m, (int) chunkindex, (int) chunk.globalts, (int) (chunk.globalte() - 1), (int) (chunksinram + 1));
msra::util::attempt(5, [&]() // (reading from network)
{
chunkdata.requiredata(featkind[m], featdim[m], sampperiod[m], this->lattices, specialwordids, verbosity);
chunkdata.requiredata(featkind[m], featdim[m], sampperiod[m], this->lattices, id2wordmapping, specialwordids, verbosity);
});
}
chunksinram++;
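The requiredata call above is wrapped in msra::util::attempt(5, ...) to tolerate transient failures when reading from a network share. A minimal sketch of that retry-with-attempts pattern, assuming nothing beyond the standard library; the repository's helper may differ in details such as back-off and logging:

    #include <exception>
    #include <functional>

    // Retry 'body' up to 'retries' times, rethrowing the last std::exception (sketch of the attempt() idiom).
    static void attempt_sketch(int retries, const std::function<void()>& body)
    {
        for (int i = 1; ; i++)
        {
            try { body(); return; }
            catch (const std::exception&)
            {
                if (i >= retries)
                    throw; // give up after the last attempt
            }
        }
    }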

View file

@ -816,19 +816,14 @@ float compute_wer(vector<size_t> &ref, vector<size_t> &rec)
double lattice::nbestlatticeEMBR(const std::vector<float> &edgeacscores, parallelstate &parallelstate, std::vector<NBestToken> &tokenlattice, const size_t numtokens, const bool enforceValidPathEMBR, const bool excludeSpecialWords,
const float lmf, const float wp, const float amf, const bool wordNbest, const bool useAccInNbest, const float accWeightInNbest, const size_t numPathsEMBR, std::vector<size_t> wids) const
const float lmf, const float wp, const float amf, const bool wordNbest, const bool useAccInNbest, const float accWeightInNbest, const size_t numPathsEMBR, std::vector<size_t> wids
/*, std::unordered_map<int, std::wstring> wordipmap /*added by linquan*/) const
{ // ^^ TODO: remove this
// --- hand off to parallelized (CUDA) implementation if available
std::map<double, std::vector<PrevTokenInfo>>::iterator mp_itr;
size_t numtokens2keep;
// TODO: support parallel state
parallelstate;
@ -943,7 +938,38 @@ double lattice::nbestlatticeEMBR(const std::vector<float> &edgeacscores, paralle
if (!is_special_words[edges[path[k]].E]) path_ids.push_back(nodes[edges[path[k]].E].wid);
}
float wer = compute_wer(wids, path_ids);
/*Linquan added
//revert character
std::vector<std::wstring> refwords;
std::vector<std::wstring> regwords;
float wer;
if (wordipmap.size() >0 )
{
for (std::vector<size_t>::const_iterator it = wids.begin(); it != wids.end(); ++it)
{
std::unordered_map<int, std::wstring>::const_iterator maptable_itr = wordipmap.find(*it);
if (maptable_itr != wordipmap.end())
refwords.push_back(maptable_itr->second);
else
refwords.push_back(std::to_wstring(*it));
}
for (std::vector<size_t>::const_iterator it = path_ids.begin(); it != path_ids.end(); ++it)
{
std::unordered_map<int, std::wstring>::const_iterator maptable_itr = wordipmap.find(*it);
if (maptable_itr != wordipmap.end())
refwords.push_back(maptable_itr->second);
else
refwords.push_back(std::to_wstring(*it));
}
wer = compute_wer(wids, path_ids);
}
else
wer = compute_wer(wids, path_ids);
*/
float wer = compute_wer(wids, path_ids);
// will favor the path with better WER
pathscore -= double(accWeightInNbest*wer);
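For context on the accWeightInNbest term above: compute_wer scores a hypothesis id sequence against the reference word ids, and the n-best path score is penalized in proportion to that error rate. A minimal sketch of a Levenshtein-based word error rate over id sequences, under the assumption that this is what compute_wer does (its implementation is not shown in this hunk and may differ):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Edit distance (substitutions, insertions, deletions) divided by the reference length.
    static float wer_sketch(const std::vector<size_t>& ref, const std::vector<size_t>& rec)
    {
        const size_t n = ref.size(), m = rec.size();
        std::vector<size_t> prev(m + 1), cur(m + 1);
        for (size_t j = 0; j <= m; j++) prev[j] = j;
        for (size_t i = 1; i <= n; i++)
        {
            cur[0] = i;
            for (size_t j = 1; j <= m; j++)
            {
                size_t sub = prev[j - 1] + (ref[i - 1] == rec[j - 1] ? 0 : 1);
                cur[j] = std::min({ sub, prev[j] + 1, cur[j - 1] + 1 });
            }
            std::swap(prev, cur);
        }
        return n ? (float) prev[m] / (float) n : 0.0f;
    }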

View file

@ -35,9 +35,9 @@
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="15.7.0" />
<PackageReference Include="MSTest.TestAdapter" Version="1.2.1" />
<PackageReference Include="MSTest.TestFramework" Version="1.2.1" />
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="15.9.0" />
<PackageReference Include="MSTest.TestAdapter" Version="1.4.0" />
<PackageReference Include="MSTest.TestFramework" Version="1.4.0" />
<ProjectReference Include="..\..\..\bindings\csharp\CNTKLibraryManagedDll\CNTKLibraryManagedDll.csproj" />
</ItemGroup>