CNTK/Source/Math/GPUDataTransferer.cpp

#include "stdafx.h"
#include "Basics.h"
#include "GPUDataTransferer.h"
#include "GPUMatrix.h"
#pragma comment (lib, "cudart.lib")
#pragma warning (disable: 4267) // conversion from 'size_t' to 'unsigned int'; happens in CUDA <<<a,b>>> syntax if a and b are size_t
#pragma warning (disable: 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this
#pragma warning (disable: 4702) // unreachable code; triggered for unknown reasons
namespace Microsoft { namespace MSR { namespace CNTK {
// CUDA failed
// Since the outer code sometimes does not recover properly, as an option we log and die right away.
// This is needed for our GCD farm which has intermittent CUDA errors that sometimes cause the DBN tool, when running with MPI, to hang instead of terminating.
static void cudafail(const char * msg)
{
    // TODO: get from an env variable
    bool dieoncudafailure = false;
    if (!dieoncudafailure)
    {
        RuntimeError("%s", msg);
    }
    fprintf(stderr, "%s\n", msg);
    fprintf(stderr, "cudafail: terminating\n"), fflush(stderr);
#ifdef WIN32
    TerminateProcess(GetCurrentProcess(), EXIT_FAILURE); // fail the hard way to ensure it won't hang elsewhere
#else
    exit(1);
#endif
}

// allows to write cudaFunction() || "error" (CUDA runtime)
static
#ifdef WIN32
__declspec(noinline)
#endif
void operator|| (cudaError_t rc, const char * msg)
{
    if (rc != cudaSuccess)
    {
        char buf[1000];
        sprintf_s(buf, 1000, "%s: %s (cuda error %d)", msg, cudaGetErrorString(rc), rc);
        cudafail(buf);
    }
}

// wait until the given CUDA event has completed (returns immediately if it already has)
template<class ElemType>
void GPUDataTransferer<ElemType>::SyncEvent(cudaEvent_t ev)
{
    auto rc = cudaEventQuery(ev);
    if (rc != cudaErrorNotReady)
    {
        // if the event is already complete then there is no need to wait
        rc || "cudaEventQuery failed";
        return;
    }
    // we must wait
    cudaEventSynchronize(ev) || "cudaEventSynchronize failed";
}

// streams (static members, shared across all instances)
template<class ElemType>
cudaStream_t GPUDataTransferer<ElemType>::m_fetchStream = NULL;

template<class ElemType>
cudaStream_t GPUDataTransferer<ElemType>::m_assignStream = NULL;

template<class ElemType>
cudaStream_t GPUDataTransferer<ElemType>::GetFetchStream()
{
    return m_fetchStream;
}

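// Construct a transferer bound to one GPU: creates the two completion events and,
// if useConcurrentStreams is set and the shared streams do not exist yet, creates
// the non-blocking fetch/assign streams used by all instances.
//
// Minimal usage sketch (illustrative only, not part of the original file; deviceId,
// gpuBuffer, cpuBuffer, and numElements are assumed to be set up by the caller, and
// cudaMemcpyAsync only overlaps with other work when the host buffer is page-locked):
//
//     GPUDataTransferer<float> transferer(deviceId, /*useConcurrentStreams=*/true);
//     transferer.CopyGPUToCPUAsync(gpuBuffer, numElements, cpuBuffer);
//     /* ...overlap other GPU/CPU work here... */
//     transferer.WaitForCopyGPUToCPUAsync(); // cpuBuffer is now safe to read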
template<class ElemType>
GPUDataTransferer<ElemType>::GPUDataTransferer(int deviceId, bool useConcurrentStreams)
    : m_deviceId(deviceId)
{
    PrepareDevice(m_deviceId);

    // events
    // Note: Do NOT use cudaEventBlockingSync (which supposedly yields the process)--it will totally break cudaEventSynchronize(), causing it to take 50 or 100 ms randomly.
    cudaEventCreateWithFlags(&m_fetchCompleteEvent, cudaEventDisableTiming) || "cudaEventCreateWithFlags failed";
    cudaEventCreateWithFlags(&m_assignCompleteEvent, cudaEventDisableTiming) || "cudaEventCreateWithFlags failed";

#pragma warning (disable: 4127)
    if (useConcurrentStreams && (m_fetchStream == NULL))
    {
        cudaStreamCreateWithFlags(&m_fetchStream, cudaStreamNonBlocking) || "cudaStreamCreateWithFlags failed";
        cudaStreamCreateWithFlags(&m_assignStream, cudaStreamNonBlocking) || "cudaStreamCreateWithFlags failed";
    }
}

template<class ElemType>
GPUDataTransferer<ElemType>::~GPUDataTransferer()
{
    // BUGBUG: we don't destroy our streams (they are static variables); we need a static destructor, I am too lazy now
    cudaEventDestroy(m_assignCompleteEvent);
    cudaEventDestroy(m_fetchCompleteEvent);
}

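// Schedule an asynchronous device-to-host copy on the fetch stream and record an
// event that WaitForCopyGPUToCPUAsync() can later block on.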
template<class ElemType>
void GPUDataTransferer<ElemType>::CopyGPUToCPUAsync(ElemType* gpuBuffer, size_t numElements, ElemType* cpuBuffer)
{
    PrepareDevice(m_deviceId);

    cudaMemcpyAsync(cpuBuffer, gpuBuffer, numElements * sizeof(ElemType), cudaMemcpyDeviceToHost, m_fetchStream) || "cudaMemcpyAsync failed";
    cudaEventRecord(m_fetchCompleteEvent, m_fetchStream) || "cudaEventRecord failed";
}

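// Block the calling thread until the copy scheduled by CopyGPUToCPUAsync() has completed.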
template<class ElemType>
void GPUDataTransferer<ElemType>::WaitForCopyGPUToCPUAsync()
{
    PrepareDevice(m_deviceId);

    SyncEvent(m_fetchCompleteEvent);
}

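// Schedule an asynchronous host-to-device copy on the assign stream and record an
// event that WaitForCopyCPUToGPUAsync() can later block on.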
template<class ElemType>
void GPUDataTransferer<ElemType>::CopyCPUToGPUAsync(ElemType* cpuBuffer, size_t numElements, ElemType* gpuBuffer)
{
    PrepareDevice(m_deviceId);

    cudaMemcpyAsync(gpuBuffer, cpuBuffer, numElements * sizeof(ElemType), cudaMemcpyHostToDevice, m_assignStream) || "cudaMemcpyAsync failed";
    cudaEventRecord(m_assignCompleteEvent, m_assignStream) || "cudaEventRecord failed";
}

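// Block the calling thread until the copy scheduled by CopyCPUToGPUAsync() has completed.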
template<class ElemType>
void GPUDataTransferer<ElemType>::WaitForCopyCPUToGPUAsync()
{
    PrepareDevice(m_deviceId);

    SyncEvent(m_assignCompleteEvent);
}

// explicit instantiations
template class GPUDataTransferer<float>;
template class GPUDataTransferer<double>;
}}}