No aggregation or NCCL initialization if only one node is used

undef _SCL_SECURE_NO_WARNINGS for training
Junjie Qian 2018-03-14 11:39:44 -07:00
Parent 542406d130
Commit 10c327e7f6
1 changed file with 19 additions and 12 deletions


@@ -6,7 +6,9 @@
 #pragma once
+#undef _SCL_SECURE_NO_WARNINGS
 #include "Constants.h"
+#include "CNTKLibrary.h"
 #include "IDistGradAggregator.h"
 #include "CUDAPageLockedMemAllocator.h"
 #include "NcclComm.h"
@@ -25,7 +27,7 @@ class SimpleDistGradAggregator : public IDistGradAggregator<ElemType>
 public:
     SimpleDistGradAggregator(const MPIWrapperPtr& mpi, bool useAsyncAggregation, int deviceId, int syncStatsTrace, size_t packThresholdSizeInBytes = DEFAULT_PACK_THRESHOLD_SIZE_IN_BYTES)
         : IDistGradAggregator<ElemType>(mpi), m_useAsyncAggregation(useAsyncAggregation), m_initialized(false), m_bufferedGradHeader(nullptr), m_syncStatsTrace(syncStatsTrace),
-        m_iterationCount(0), m_nccl(deviceId, mpi), m_packThresholdSizeInBytes(packThresholdSizeInBytes)
+        m_iterationCount(0), m_packThresholdSizeInBytes(packThresholdSizeInBytes)
     {}
     ~SimpleDistGradAggregator()
@@ -40,6 +42,14 @@ public:
     // Aggregate the gradient matrices across all nodes
     bool AggregateGradients(const std::vector<Matrix<ElemType>*>& gradients, DistGradHeader* headerCPU, bool resetState) override
     {
+        if (m_mpi->NumNodesInUse() == 1) // No need to aggregate anything.
+            return (headerCPU->numSamples != 0);
+        // Initialize NCCL
+        if (m_nccl == nullptr)
+            m_nccl.reset(new NcclComm(::CNTK::DeviceDescriptor::UseDefaultDevice().Id(), m_mpi));
         ResetState(gradients, headerCPU->numEvalNode, resetState);
         bool showSyncPerfStats = (m_syncStatsTrace > 0) && ((m_iterationCount % m_syncStatsTrace) == 0);
         m_iterationCount++;
@@ -145,7 +155,7 @@ private:
             return false;
         // Do not copy if NCCL is supported or GPUDirect RDMA is used
-        if (m_nccl.IsSupported() || m_mpi->UseGpuGdr() == true)
+        if (m_nccl->IsSupported() || m_mpi->UseGpuGdr() == true)
             return false;
         return true;
@@ -251,9 +261,6 @@ private:
     void AggregateGradientsImpl(const std::vector<Matrix<ElemType>*>& gradients, DistGradHeader* headerCPU, bool showSyncPerfStats)
     {
-        if (m_mpi->NumNodesInUse() == 1) // No need to aggregate anything.
-            return;
         Timer aggregationTimer;
         int deviceId = gradients[0]->GetDeviceId();
         if (showSyncPerfStats)
@@ -319,7 +326,7 @@ private:
         if (numGradientIndex > 0)
         {
             // non-GDR && GPU && non-NCCL: need to copy data from GPU to CPU
-            if ((m_mpi->UseGpuGdr() == 0) && (deviceId != CPUDEVICE) && !m_nccl.IsSupported())
+            if ((m_mpi->UseGpuGdr() == 0) && (deviceId != CPUDEVICE) && !m_nccl->IsSupported())
             {
                 Matrix<ElemType>* gpuCopyBuffer = m_aggregationBuffer.get();
@@ -380,7 +387,7 @@ private:
                 }
             }
             // non-NCCL, using CPU, using GDR
-            else if (!m_nccl.IsSupported())
+            else if (!m_nccl->IsSupported())
             {
                 ElemType* reductionBuffer;
                 for (size_t i : m_gradientIndexToAggregate)
@@ -401,14 +408,14 @@ private:
                     }
                 }
             }
-            else if (m_nccl.IsSupported())
+            else if (m_nccl->IsSupported())
             {
                 std::vector<Matrix<ElemType>*> ncclReduceGradients;
                 for (size_t i : m_gradientIndexToAggregate)
                 {
                     ncclReduceGradients.push_back((i == -1) ? m_aggregationBuffer.get() : gradients[i]);
                 }
-                m_nccl.AllReduce(ncclReduceGradients);
+                m_nccl->AllReduce(ncclReduceGradients);
             }
         }
@@ -436,9 +443,9 @@ private:
         // Broadcast the aggregated header to all nodes
         m_mpi->Bcast(headerCPU, headerCPU->Size(), MPI_CHAR, m_mpi->MainNodeRank());
-        if (m_nccl.IsSupported())
+        if (m_nccl->IsSupported())
         {
-            m_nccl.Sync();
+            m_nccl->Sync();
         }
         // Non-GDR && GPU
         else if ((m_mpi->UseGpuGdr() == 0) && (deviceId != CPUDEVICE))
@@ -509,6 +516,6 @@ private:
     bool m_initialized;
-    NcclComm m_nccl;
+    std::unique_ptr<NcclComm> m_nccl;
 };
 } } }
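
The net effect of the diff above: the NcclComm member is now held behind a std::unique_ptr and is constructed lazily inside AggregateGradients, after the single-node early return, so a one-node job never initializes NCCL and multi-node jobs create the communicator on first use (hence every m_nccl access switching from . to ->). The sketch below illustrates that lazy-initialization pattern in isolation; it is not CNTK code, and Comm and Aggregator are hypothetical stand-ins for NcclComm and SimpleDistGradAggregator.

// Minimal illustration (not CNTK code) of the pattern applied in this commit:
// hold the communicator behind std::unique_ptr, return early when only one
// node participates, and construct the communicator on first multi-node use.
#include <cstdio>
#include <memory>
#include <vector>

// Hypothetical stand-in for the NCCL wrapper (NcclComm in CNTK).
class Comm
{
public:
    explicit Comm(int deviceId) : m_deviceId(deviceId)
    {
        std::printf("Comm initialized on device %d\n", m_deviceId);
    }

    void AllReduce(std::vector<float>& values) const
    {
        // A real implementation would launch an NCCL all-reduce here.
        std::printf("AllReduce over %zu values\n", values.size());
    }

private:
    int m_deviceId;
};

// Hypothetical stand-in for the gradient aggregator.
class Aggregator
{
public:
    explicit Aggregator(int numNodes) : m_numNodes(numNodes) {}

    // Mirrors the AggregateGradients contract: returns whether there were samples.
    bool AggregateGradients(std::vector<float>& gradients, size_t numSamples, int deviceId)
    {
        if (m_numNodes == 1) // Single node: nothing to aggregate, NCCL is never touched.
            return numSamples != 0;

        if (m_comm == nullptr) // First multi-node call: build the communicator now.
            m_comm.reset(new Comm(deviceId));

        m_comm->AllReduce(gradients);
        return numSamples != 0;
    }

private:
    int m_numNodes;
    std::unique_ptr<Comm> m_comm; // Stays empty until aggregation actually happens.
};

int main()
{
    std::vector<float> gradients{0.1f, 0.2f, 0.3f};

    Aggregator singleNode(1);
    singleNode.AggregateGradients(gradients, 128, 0); // No Comm is ever constructed.

    Aggregator multiNode(4);
    multiNode.AggregateGradients(gradients, 128, 0);  // Comm built lazily on first use.
    multiNode.AggregateGradients(gradients, 128, 0);  // Reused on later iterations.
    return 0;
}

Deferring construction this way also removes the need to know the device id at construction time, which is why the m_nccl(deviceId, mpi) initializer disappears from the constructor's member initializer list.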