Packing matrices in V1 Aggregation
Pack aggregated matrices into a contiguous buffer so that fewer MPI operations are needed.

Usage: introduce a configurable threshold size (in KB) and only pack gradients whose size is below the threshold into the contiguous buffer, to avoid possible memory overflow. The default threshold size is 32 KB. To change it, specify "packThresholdSizeInKB=value" (e.g. 2048 for 2 MB) in the train/eval config file.

Notes:
- Acquire the contiguous buffer only if NCCL is not supported.
- Fall back to normal execution if not enough extra contiguous memory is available.
- Change the blocking allreduce to a non-blocking allreduce.
- SimpleDistGradAggregator: broadcast the aggregated header to all nodes.
- Configurable threshold size for packing gradients into a buffer.
- SimpleDistGradAggregator: pack a gradient into the contiguous buffer if its size is less than the pack threshold size.
- Add all constant definitions to a header file in Common.
This commit is contained in:
Parent 9375555343
Commit a6e13ce969
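For orientation before the diff: packing is decided per gradient. Gradients no larger than the threshold are copied into one contiguous buffer and aggregated with a single non-blocking allreduce; larger gradients keep their own allreduce; and if the extra buffer cannot be allocated, everything falls back to per-gradient aggregation. The threshold is read from the SGD section of the train/eval config, e.g. packThresholdSizeInKB = 2048 for a 2 MB limit. Below is a minimal standalone sketch of just that decision logic, assuming nothing beyond the behavior described above; PackingPlan, PlanPacking, and their parameter names are illustrative only and are not part of the commit (the real implementation is in the SimpleDistGradAggregator changes further down).

#include <cstddef>
#include <vector>

// Split gradients into "small" ones (packed into one contiguous buffer and sent
// with a single MPI allreduce) and "large" ones (aggregated one by one).
struct PackingPlan
{
    std::vector<size_t> packedIndex;    // gradients copied into the contiguous buffer
    std::vector<size_t> separateIndex;  // gradients aggregated individually
    size_t packedElements = 0;          // total element count of the contiguous buffer
};

PackingPlan PlanPacking(const std::vector<size_t>& gradientElementCounts,
                        size_t elemSizeInBytes,
                        size_t packThresholdSizeInBytes, // default: 32 * 1024 bytes
                        bool packedBufferAllocated)      // false if the extra buffer could not be allocated
{
    PackingPlan plan;
    for (size_t i = 0; i < gradientElementCounts.size(); i++)
    {
        if (elemSizeInBytes * gradientElementCounts[i] <= packThresholdSizeInBytes)
        {
            plan.packedElements += gradientElementCounts[i];
            plan.packedIndex.push_back(i);
        }
        else
        {
            plan.separateIndex.push_back(i);
        }
    }

    // Fallback to normal execution: without the contiguous buffer,
    // every gradient is aggregated with its own allreduce.
    if (plan.packedElements == 0 || !packedBufferAllocated)
    {
        plan.packedIndex.clear();
        plan.separateIndex.clear();
        plan.packedElements = 0;
        for (size_t i = 0; i < gradientElementCounts.size(); i++)
            plan.separateIndex.push_back(i);
    }
    return plan;
}

The trade-off behind the threshold: each packed gradient costs one extra copy into the buffer, but the whole packed set is aggregated with a single MPI request instead of one request per gradient.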
@@ -0,0 +1,13 @@
+// Constants.h -- the constants used by CNTK
+//
+
+#pragma once
+
+#ifndef _CONSTANTS_H_
+#define _CONSTANTS_H_
+
+// Constants used in aggregation
+const size_t DEFAULT_PACK_THRESHOLD_SIZE_IN_KB = 32;
+const size_t DEFAULT_PACK_THRESHOLD_SIZE_IN_BYTES = DEFAULT_PACK_THRESHOLD_SIZE_IN_KB * 1024;
+
+#endif
@@ -6,6 +6,7 @@
 #include "Basics.h"
 #include "ComputationNode.h"
+#include "Constants.h"
 #include "Matrix.h"
 #include "TensorView.h"
 #include <unordered_set>
@@ -1448,7 +1449,8 @@ void AggregateAccumulatorValuesAndUpdateEvaluation(
     shared_ptr<ComputationNetwork> net,
     set<shared_ptr<ComputationNodeBase>> evalNodesWhichAccumulateResult,
     shared_ptr<DistGradHeader> gradHeader,
-    shared_ptr<MPIWrapper> mpi);
+    shared_ptr<MPIWrapper> mpi,
+    size_t packThresholdSizeInBytes = (size_t)DEFAULT_PACK_THRESHOLD_SIZE_IN_BYTES);

 // -----------------------------------------------------------------------
 // EpochAccumulatorNode calculates mean values of all samples used in forward pass.
@@ -1499,7 +1501,8 @@ protected:
     shared_ptr<ComputationNetwork> net,
     set<shared_ptr<ComputationNodeBase>> evalNodesWhichAccumulateResult,
     shared_ptr<DistGradHeader> gradHeader,
-    shared_ptr<MPIWrapper> mpi);
+    shared_ptr<MPIWrapper> mpi,
+    size_t packThresholdSize);

     void Reset();
@@ -93,6 +93,23 @@ void NcclComm::AllReduceImpl(void* buffer, size_t count, DataType dtype)
         RuntimeError("NcclComm ncclAllReduce failed: %s", ncclGetErrorString(res));
 }

+void NcclComm::BroadcastImpl(void* buffer, size_t count, MPI_Datatype dtype, int root)
+{
+    ncclResult_t res;
+    if (dtype == MPI_CHAR)
+    {
+        res = ncclBcast(buffer, count, ncclChar, root, m_ncclComm, m_stream);
+    }
+    else
+    {
+        RuntimeError("NcclComm Broadcast supports Char type only");
+    }
+    if (res != ncclSuccess)
+    {
+        RuntimeError("NcclComm ncclBcast failed: %s", ncclGetErrorString(res));
+    }
+}
+
 void NcclComm::Sync()
 {
     cudaStreamSynchronize(m_stream) || "NcclComm: cudaStreamSynchronize failed";
@@ -23,6 +23,7 @@ class NcclComm
 private:
     enum class DataType : int {FLOAT, DOUBLE};
     void AllReduceImpl(void* buffer, size_t count, DataType dtype);
+    void BroadcastImpl(void* buffer, size_t count, MPI_Datatype dtype, int root);
     cudaStream_t m_stream;
     ncclComm_t m_ncclComm;
 #endif
@@ -53,6 +54,20 @@ public:
         RuntimeError("NcclComm: CNTK was built without NCCL support.");
 #endif
     }

+#pragma warning( push )
+#pragma warning ( disable : 4100 ) // Disable warning 4100 in Broadcast function
+
+    void Broadcast(void* buffer, size_t count, MPI_Datatype dtype, int root)
+    {
+#ifdef USE_NCCL
+        BroadcastImpl(buffer, count, dtype, root);
+#else
+        RuntimeError("NcclComm: CNTK was built without NCCL support.");
+#endif
+    }
 };
+
+#pragma warning( pop )

 }}}
@@ -26,7 +26,8 @@ void AggregateAccumulatorValuesAndUpdateEvaluation(
     std::shared_ptr<ComputationNetwork> net,
     std::set<std::shared_ptr<ComputationNodeBase>> evalNodesWhichAccumulateResult,
     std::shared_ptr<DistGradHeader> gradHeader,
-    std::shared_ptr<MPIWrapper> mpi)
+    std::shared_ptr<MPIWrapper> mpi,
+    size_t packThresholdSizeInBytes)
 {
     // Accumulator stores mean value and number of samples. Aggregation performs simple summation of values,
     // so we transfer sum instead of mean, and calculate mean after aggregation is finished.
@@ -58,7 +59,8 @@ void AggregateAccumulatorValuesAndUpdateEvaluation(
         mpi,
         false /*useAsyncAggregation*/,
         net->GetDeviceId(),
-        0 /*syncStatsTrace*/);
+        0 /*syncStatsTrace*/,
+        packThresholdSizeInBytes);

     // Prepare header.
     const size_t c_evalNodes = 1;
@@ -127,10 +129,11 @@ void AggregateAccumulatorValuesAndUpdateEpochEvaluation(
     std::vector<EpochCriterion>& epochEvalErrors,
     const std::vector<ComputationNodeBasePtr>& evaluationNodes,
     CriterionAccumulator<ElemType> localEpochEvalErrors,
-    std::function<bool(ComputationNodeBasePtr)> containsAccumulatedResult)
+    std::function<bool(ComputationNodeBasePtr)> containsAccumulatedResult,
+    size_t packThresholdSizeInBytes = DEFAULT_PACK_THRESHOLD_SIZE_IN_BYTES)
 {
     // Each node contains accumulated values for part of the data set, we have to aggregate accumulated values.
-    AggregateAccumulatorValuesAndUpdateEvaluation<ElemType>(net, evalNodesWhichAccumulateResult, gradHeader, mpi);
+    AggregateAccumulatorValuesAndUpdateEvaluation<ElemType>(net, evalNodesWhichAccumulateResult, gradHeader, mpi, packThresholdSizeInBytes);

     // After values of accumulators have been aggregated accross nodes, we have to update evaluation results for
     // evaluation nodes that accumulate results.
@@ -1511,7 +1511,7 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
             // and recalculate evaluation errors based on accumulators.
             AggregateAccumulatorValuesAndUpdateEpochEvaluation<ElemType>(
                 net, evaluationNodesWhichAccumulateResult, m_gradHeader, m_mpi, epochEvalErrors, evaluationNodes,
-                localEpochEvalErrors, ContainsAccumulatedResult);
+                localEpochEvalErrors, ContainsAccumulatedResult, m_packThresholdSizeInBytes);
         }

     return totalEpochSamples;
@@ -2111,7 +2111,7 @@ void SGD<ElemType>::InitDistGradAgg(int numEvalNodes, int numGradientBits, int d
         if (Globals::UseV2Aggregator()) // Currently used to check V2 against baselines.
             m_distGradAgg = std::make_shared<V2SimpleDistGradAggregator<ElemType>>(m_mpi, m_bufferedAsyncGradientAggregation, deviceId, m_syncStatsTrace, ::CNTK::MPICommunicator());
         else
-            m_distGradAgg = std::make_shared<SimpleDistGradAggregator<ElemType>>(m_mpi, m_bufferedAsyncGradientAggregation, deviceId, m_syncStatsTrace);
+            m_distGradAgg = std::make_shared<SimpleDistGradAggregator<ElemType>>(m_mpi, m_bufferedAsyncGradientAggregation, deviceId, m_syncStatsTrace, m_packThresholdSizeInBytes);
     }

     m_gradHeader.reset(DistGradHeader::Create(numEvalNodes), [](DistGradHeader* ptr) { DistGradHeader::Destroy(ptr); });
@@ -2701,6 +2701,8 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
     m_maxSamplesInRAM = configSGD(L"maxSamplesInRAM", (size_t) SIZE_MAX);
     m_numSubminiBatches = configSGD(L"numSubminibatches", (size_t) 1);

+    m_packThresholdSizeInBytes = configSGD(L"packThresholdSizeInKB", DEFAULT_PACK_THRESHOLD_SIZE_IN_KB) * 1024;
+
     if (configAALR.Exists(L"numMiniBatch4LRSearch"))
     {
         LOGPRINTF(stderr, "WARNING: 'numMiniBatch4LRSearch' is deprecated, please remove it and use 'numSamples4Search' instead.\n");
@@ -200,6 +200,9 @@ protected:
     intargvector m_numSamples4Search;
     size_t m_numBestSearchEpoch;

+    // Threshold size in bytes for single gradient to do packing
+    size_t m_packThresholdSizeInBytes;
+
     LearningRateSearchAlgorithm m_autoLearnRateSearchType;

     AdaptationRegType m_adaptationRegType;
@@ -6,6 +6,7 @@

 #pragma once

+#include "Constants.h"
 #include "IDistGradAggregator.h"
 #include "CUDAPageLockedMemAllocator.h"
 #include "NcclComm.h"
@@ -22,8 +23,9 @@ class SimpleDistGradAggregator : public IDistGradAggregator<ElemType>
     UsingIDistGradAggregatorMembers;

 public:
-    SimpleDistGradAggregator(const MPIWrapperPtr& mpi, bool useAsyncAggregation, int deviceId, int syncStatsTrace)
-        : IDistGradAggregator<ElemType>(mpi), m_useAsyncAggregation(useAsyncAggregation), m_initialized(false), m_bufferedGradHeader(nullptr), m_syncStatsTrace(syncStatsTrace), m_iterationCount(0), m_nccl(deviceId, mpi)
+    SimpleDistGradAggregator(const MPIWrapperPtr& mpi, bool useAsyncAggregation, int deviceId, int syncStatsTrace, size_t packThresholdSizeInBytes = DEFAULT_PACK_THRESHOLD_SIZE_IN_BYTES)
+        : IDistGradAggregator<ElemType>(mpi), m_useAsyncAggregation(useAsyncAggregation), m_initialized(false), m_bufferedGradHeader(nullptr), m_syncStatsTrace(syncStatsTrace),
+          m_iterationCount(0), m_nccl(deviceId, mpi), m_packThresholdSizeInBytes(packThresholdSizeInBytes)
     {}

     ~SimpleDistGradAggregator()
@@ -144,25 +146,65 @@ private:
         m_initialized = true;
         int deviceId = gradients[0]->GetDeviceId();

-        if (!m_nccl.IsSupported() && deviceId != CPUDEVICE)
+        if (!m_nccl.IsSupported() && (deviceId != CPUDEVICE))
             m_allocator.reset(new CUDAPageLockedMemAllocator(deviceId));

+        size_t packedGradientsSizeInElements = 0;
         for (size_t i = 0; i < gradients.size(); i++)
         {
+            if (!m_useAsyncAggregation && sizeof(ElemType) * gradients[i]->GetNumElements() <= m_packThresholdSizeInBytes)
+            {
+                packedGradientsSizeInElements += gradients[i]->GetNumElements();
+                m_packedGradientsIndex.push_back(i);
+            }
+            else
+            {
+                m_gradientIndexToAggregate.push_back(i);
+            }
+
             // Make sure none of the gradient matrixes are sparse - we currently do not support aggregation of sparse gradient matrices
             if (gradients[i]->GetMatrixType() != DENSE)
                 RuntimeError("Gradient aggregation for sparse gradient matrices is currently unsupported!");

-            if (!m_nccl.IsSupported() && deviceId != CPUDEVICE)
-            {
-                m_gpuDataTransferers.push_back(std::make_unique<GPUDataTransferer>(deviceId, m_useAsyncAggregation));
-                m_intermediateCPUBuffers.push_back(AllocateIntermediateBuffer(deviceId, gradients[i]->GetNumElements()));
-            }
-
             if (m_useAsyncAggregation)
                 m_bufferedGradients[gradients[i]].reset(new Matrix<ElemType>(gradients[i]->GetNumRows(), gradients[i]->GetNumCols(), deviceId));
         }

+        // Packing matrices into continous buffer if not doing async aggregation
+        m_aggregationBuffer.reset();
+        if (packedGradientsSizeInElements > 0)
+        {
+            m_aggregationBuffer.reset(new (std::nothrow) Matrix<ElemType>(1, packedGradientsSizeInElements, deviceId));
+        }
+        // If no extra continous buffer allocated or using async aggregation
+        if (m_aggregationBuffer == nullptr)
+        {
+            m_gradientIndexToAggregate.clear();
+            m_packedGradientsIndex.clear();
+            packedGradientsSizeInElements = 0;
+            // Reuse "@param m_gradientIndexToAggregate" for following code, if no continous buffer allocated
+            for (size_t i = 0; i < gradients.size(); i++)
+            {
+                m_gradientIndexToAggregate.push_back(i);
+            }
+        }
+        else
+        {
+            // First element is reserved for continous buffer
+            m_gradientIndexToAggregate.insert(m_gradientIndexToAggregate.begin(), 1, (size_t)-1);
+        }
+
+        // If running on GPU and NCCL not supported, initialize GPU and CPU data transfer
+        if (!m_nccl.IsSupported() && (deviceId != CPUDEVICE))
+        {
+            for (size_t i : m_gradientIndexToAggregate)
+            {
+                m_gpuDataTransferers.push_back(std::make_unique<GPUDataTransferer>(deviceId, m_useAsyncAggregation));
+                m_intermediateCPUBuffers.push_back(AllocateIntermediateBuffer(deviceId,
+                    (i == -1) ? packedGradientsSizeInElements : gradients[i]->GetNumElements()));
+            }
+        }
+
         if (m_useAsyncAggregation)
         {
             m_bufferedGradHeader = DistGradHeader::Create(numEvalNodes);
@@ -223,11 +265,33 @@ private:
             }
         }

-        // Initiate transfer of the gradient matrices to the CPU if needed
-        if (!m_nccl.IsSupported() && deviceId >= 0)
+        // Copy all gradient data into a single contiguous buffer, if additional continous buffer allocated
+        size_t offset = 0;
+        for (size_t i : m_packedGradientsIndex)
         {
-            for (size_t i = 0; i < numGradMatrices; ++i)
-                m_gpuDataTransferers[i]->CopyGPUToCPUAsync(gradients[i]->Data(), gradients[i]->GetNumElements(), m_intermediateCPUBuffers[i].get());
+            m_aggregationBuffer->ColumnSlice(offset, gradients[i]->GetNumElements()).AssignValuesOf(gradients[i]->Reshaped(1, gradients[i]->GetNumElements()));
+            offset += gradients[i]->GetNumElements();
+        }
+
+        // Initiate transfer of the bufferred data to the CPU if needed
+        if (!m_nccl.IsSupported() && deviceId != CPUDEVICE)
+        {
+            size_t gpuDataTransfersIdx = 0;
+            Matrix<ElemType>* gpuCopyBuffer = m_aggregationBuffer.get();
+            for (size_t i : m_gradientIndexToAggregate)
+            {
+                if (i != -1)
+                {
+                    gpuCopyBuffer = gradients[i];
+                }
+                else
+                {
+                    // i == -1, first element is for packed gradients, which should not be with AsyncAggregation
+                    assert(m_useAsyncAggregation == false);
+                }
+                m_gpuDataTransferers[gpuDataTransfersIdx]->CopyGPUToCPUAsync(gpuCopyBuffer->Data(), gpuCopyBuffer->GetNumElements(), m_intermediateCPUBuffers[gpuDataTransfersIdx].get());
+                gpuDataTransfersIdx++;
+            }
         }

         // Initiate receive of the header on the main node
@@ -248,26 +312,35 @@ private:
         m_mpi->Isend(headerCPU, headerCPU->Size(), MPI_CHAR, m_mpi->MainNodeRank(), numGradMatrices, &sendHeaderRequest) || MpiFail("MPI_Isend");

         // Perform async allreduce on the gradient data
-        std::vector<MPI_Request> allReduceRequests(numGradMatrices);
+        std::vector<MPI_Request> allReduceRequests;
         if (!m_nccl.IsSupported())
        {
-            for (size_t i = 0; i < numGradMatrices; ++i)
+            size_t allReduceIndex = 0;
+            ElemType* reductionBuffer;
+            for (size_t i : m_gradientIndexToAggregate)
             {
-                ElemType* reductionBuffer = gradients[i]->Data();
-                if (deviceId >= 0)
+                allReduceRequests.push_back(MPI_Request());
+                reductionBuffer = (i == -1)? m_aggregationBuffer->Data() : gradients[i]->Data();
+                if (deviceId != CPUDEVICE)
                 {
-                    m_gpuDataTransferers[i]->WaitForCopyGPUToCPUAsync();
-                    reductionBuffer = m_intermediateCPUBuffers[i].get();
+                    m_gpuDataTransferers[allReduceIndex]->WaitForCopyGPUToCPUAsync();
+                    reductionBuffer = m_intermediateCPUBuffers[allReduceIndex].get();
                 }

-                // On Windows this async MPI_Iallreduce call requires MS MPI v7 or higher to be installed
-                m_mpi->Iallreduce(MPI_IN_PLACE, reductionBuffer, gradients[i]->GetNumElements(),
-                                  MPIWrapper::GetDataType(reductionBuffer), MPI_SUM,
-                                  &allReduceRequests[i]) || MpiFail("MPI_Iallreduce");
+                m_mpi->Iallreduce(MPI_IN_PLACE, reductionBuffer, (i == -1) ? m_aggregationBuffer->GetNumElements() : gradients[i]->GetNumElements(),
+                                  MPIWrapper::GetDataType(reductionBuffer), MPI_SUM, &allReduceRequests.back()) || MpiFail("MPI_Iallreduce");
+                allReduceIndex++;
             }
         }
         else
-            m_nccl.AllReduce(gradients);
+        {
+            std::vector<Matrix<ElemType>*> ncclReduceGradients;
+            for (size_t i : m_gradientIndexToAggregate)
+            {
+                ncclReduceGradients.push_back((i == -1) ? m_aggregationBuffer.get() : gradients[i]);
+            }
+            m_nccl.AllReduce(ncclReduceGradients);
+        }

         // On the main node wait for the headers to arrive and aggregate
         if (m_mpi->IsMainNode())
@@ -290,52 +363,48 @@ private:
             assert(numNodesHeadersReceivedFrom == (NumProc() - 1));
         }

-        // Initiate receive of the aggregate header
-        MPI_Request recvAggHeaderRequest;
-        if (!m_mpi->IsMainNode())
-            m_mpi->Irecv(headerCPU, headerCPU->Size(), MPI_CHAR, m_mpi->MainNodeRank(), numGradMatrices + 1 + numGradMatrices, &recvAggHeaderRequest) || MpiFail("MPI_Irecv");
-
-        // Intiate send of the aggregate header from main node
-        std::vector<MPI_Request> sendAggHeaderRequests(NumProc() - 1);
-        if (m_mpi->IsMainNode())
-        {
-            for (size_t j = 0; j < NumProc() - 1; ++j)
-            {
-                int dest = (j >= MyRank()) ? (j + 1) : j;
-                // TODO: Should we use MPI_Bcast instead for better performance
-                m_mpi->Isend(headerCPU, headerCPU->Size(), MPI_CHAR, dest, numGradMatrices + 1 + numGradMatrices, &(sendAggHeaderRequests[j])) || MpiFail("MPI_Isend");
-            }
-        }
-
-        // Wait for the allreduce operations to finish and initiate transfer back to the GPU if needed
-        if (!m_nccl.IsSupported())
-        {
-            for (size_t i = 0; i < numGradMatrices; ++i)
-            {
-                m_mpi->Wait(&allReduceRequests[i], MPI_STATUSES_IGNORE) || MpiFail("MPI_Wait");
-                if (deviceId >= 0)
-                    m_gpuDataTransferers[i]->CopyCPUToGPUAsync(m_intermediateCPUBuffers[i].get(), gradients[i]->GetNumElements(), gradients[i]->Data());
-            }
-        }
-
-        // Wait to receive aggregate header
-        if (!m_mpi->IsMainNode())
-            m_mpi->Wait(&recvAggHeaderRequest, MPI_STATUSES_IGNORE) || MpiFail("MPI_Wait");
-
-        // Wait for all the transfers to finish
+        // Broadcast the aggregated header to all nodes
+        m_mpi->Bcast(headerCPU, headerCPU->Size(), MPI_CHAR, m_mpi->MainNodeRank());
+
         if (m_nccl.IsSupported())
-            m_nccl.Sync();
-        else if (deviceId >= 0)
         {
-            for (size_t i = 0; i < numGradMatrices; ++i)
-                m_gpuDataTransferers[i]->WaitForCopyCPUToGPUAsync();
+            m_nccl.Sync();
+        }
+        else
+        {
+            // Wait for the allreduce operations to finish and initiate transfer back to the GPU if needed
+            size_t gpuDataTransfersIdx = 0; // Index of allReduceRequest for each un-packed gradient
+            for (size_t i : m_gradientIndexToAggregate)
+            {
+                m_mpi->Wait(&allReduceRequests[gpuDataTransfersIdx], MPI_STATUSES_IGNORE) || MpiFail("MPI_Wait");
+                if (deviceId != CPUDEVICE)
+                {
+                    m_gpuDataTransferers[gpuDataTransfersIdx]->CopyCPUToGPUAsync(m_intermediateCPUBuffers[gpuDataTransfersIdx].get(),
+                        (i == -1) ? m_aggregationBuffer->GetNumElements() : gradients[i]->GetNumElements(),
+                        (i == -1) ? m_aggregationBuffer->Data() : gradients[i]->Data());
+                }
+                gpuDataTransfersIdx++;
+            }
+
+            // Wait for copy data from CPU to GPU, if not running on CPU and not NCCL enabled
+            if (deviceId != CPUDEVICE)
+            {
+                for (size_t i = 0; i < m_gradientIndexToAggregate.size(); i++)
+                    m_gpuDataTransferers[i]->WaitForCopyCPUToGPUAsync();
+            }
+        }
+
+        // Copy data back to the packed gradients from the continous buffer
+        offset = 0;
+        for (size_t i : m_packedGradientsIndex)
+        {
+            gradients[i]->AssignValuesOf(m_aggregationBuffer->ColumnSlice(offset, gradients[i]->GetNumElements()).Reshaped(gradients[i]->GetNumRows(), gradients[i]->GetNumCols()));
+            offset += gradients[i]->GetNumElements();
         }

         // Wait for completion of the async send requests
         if (!m_mpi->IsMainNode())
             m_mpi->Wait(&sendHeaderRequest, MPI_STATUSES_IGNORE) || MpiFail("MPI_Wait");
-        else
-            m_mpi->Waitall(sendAggHeaderRequests.size(), sendAggHeaderRequests.data(), MPI_STATUSES_IGNORE) || MpiFail("MPI_Waitall");

         if (showSyncPerfStats)
         {
@@ -347,8 +416,8 @@ private:
 private:
     std::unique_ptr<CUDAPageLockedMemAllocator> m_allocator;
-    std::vector<std::shared_ptr<ElemType>> m_intermediateCPUBuffers;

+    std::vector<std::shared_ptr<ElemType>> m_intermediateCPUBuffers;
     std::vector<std::unique_ptr<GPUDataTransferer>> m_gpuDataTransferers;

     std::vector<DistGradHeader*> m_recvHeaders;
@@ -363,6 +432,13 @@ private:
     std::unordered_map<Matrix<ElemType>*, std::unique_ptr<Matrix<ElemType>>> m_bufferedGradients;
     DistGradHeader* m_bufferedGradHeader;

+    // Packing small gradients (size not larger than threshold size) into a continous buffer to reduce MPI calls.
+    // Threshold size to pack a gradient into the continous buffer, default 32KB (tunable by define "packThresholdSizeInKB=[value]")
+    const size_t m_packThresholdSizeInBytes;
+    std::unique_ptr<Matrix<ElemType>> m_aggregationBuffer;
+    std::vector<size_t> m_packedGradientsIndex;
+    std::vector<size_t> m_gradientIndexToAggregate;
+
     int m_syncStatsTrace;

     // Only used for controlling frequency of measuring/showing gradient aggregation perf stats