2016-01-18 11:35:54 +03:00
// cudalib.cpp -- all CUDA calls (but not cublas) are encapsulated here
// All actual CUDA API calls go here, to keep the header out of our other headers.
//
// F. Seide, V-hansu
2016-01-18 11:36:14 +03:00
# define _CRT_SECURE_NO_WARNINGS 1 // so we can use getenv()...
2016-01-18 11:35:54 +03:00
# include "Basics.h"
2016-01-18 11:36:14 +03:00
# include <cuda_runtime_api.h> // for CUDA API
# include <cuda.h> // for device API
2016-01-18 11:35:54 +03:00
# include "cudalib.h"
# include "cudadevice.h"
# include <string>
# include <assert.h>
# include <cublas_v2.h>
2016-01-18 11:36:14 +03:00
# undef NOMULTIDEVICE // define this to disable any context/driver stuff
2016-01-18 11:35:54 +03:00
# ifndef NOMULTIDEVICE
2016-01-18 11:36:14 +03:00
# pragma comment(lib, "cuda.lib") // link CUDA device API
2016-01-18 11:35:54 +03:00
# endif
2016-01-18 11:36:14 +03:00
# pragma comment(lib, "cudart.lib") // link CUDA runtime
# pragma comment(lib, "cublas.lib")
2016-01-18 11:35:54 +03:00
namespace msra { namespace cuda {
2016-01-18 11:36:14 +03:00
// File-local counter of allocated devices; the sentinel value -1 means
// "not initialized". (It is not read or written in the visible part of this file.)
static int devicesallocated = -1;
2016-01-18 11:35:54 +03:00
// allows to write cudaFunction() || "error" (CUDA runtime)
2016-01-18 11:36:14 +03:00
static void operator | | ( cudaError_t rc , const char * msg )
2016-01-18 11:35:54 +03:00
{
if ( rc ! = cudaSuccess )
2016-01-18 11:36:14 +03:00
RuntimeError ( " %s: %s (cuda error %d) " , msg , cudaGetErrorString ( rc ) , ( int ) rc ) ;
2016-01-18 11:35:54 +03:00
}
2016-01-18 11:36:14 +03:00
// Returns the stream that callers should issue work on. This implementation
// always hands back the same constant value, cudaStreamDefault.
// NOTE(review): the CUDA headers document cudaStreamDefault as a stream-creation
// flag; here it is converted to a stream handle -- confirm that yielding the
// default (null) stream this way is the intent.
cudaStream_t GetCurrentStream()
{
    cudaStream_t current = cudaStreamDefault;
    return current;
}
2016-01-18 11:35:54 +03:00
// synchronize with ongoing thread
2016-01-18 11:36:14 +03:00
void join ( )
{
2016-01-18 11:35:54 +03:00
cudaDeviceSynchronize ( ) | | " cudaDeviceSynchronize failed " ;
2016-01-18 11:36:14 +03:00
}
2016-01-18 11:35:54 +03:00
// Fixed-capacity stack that records the devices that have been pushed.
//  - stackSize:   capacity of the stack (number of entries)
//  - curStack:    current stack position, starts at 0
//                 (presumably the index of the next free slot -- the push/pop
//                 code is not in the visible part of this file; verify there)
//  - deviceStack: the storage itself, zero-initialized
const int stackSize = 20;
static int curStack = 0;
static size_t deviceStack[stackSize] = {0};
// memory allocation
2016-01-18 11:36:14 +03:00
void * mallocbytes ( size_t nelem , size_t sz )
2016-01-18 11:35:54 +03:00
{
2016-01-18 11:36:14 +03:00
for ( size_t retry = 0 ; ; retry + + )
2016-01-18 11:35:54 +03:00
{
try
{
2016-01-23 00:58:47 +03:00
// fprintf (stderr, "mallocbytes: allocating %d elements of size %d, %d bytes\n", (int) nelem, (int) sz, (int) (nelem * sz)); // comment out by [v-hansu] to get rid out annoying output
2016-01-18 11:36:14 +03:00
void * p ;
cudaMalloc ( & p , nelem * sz ) | | " cudaMalloc failed " ;
2016-01-18 11:35:54 +03:00
return p ;
}
2016-01-18 11:36:14 +03:00
catch ( const std : : exception & e )
2016-01-18 11:35:54 +03:00
{
2016-01-18 11:36:14 +03:00
fprintf ( stderr , " mallocbytes: failed with error %s \n " , e . what ( ) ) ;
2016-01-18 11:35:54 +03:00
if ( retry > = 5 )
throw ;
}
}
}
2016-01-18 11:36:14 +03:00
void freebytes ( void * p )
2016-01-18 11:35:54 +03:00
{
2016-01-18 11:36:14 +03:00
cudaFree ( p ) | | " cudaFree failed " ;
2016-01-18 11:35:54 +03:00
}
2016-01-18 11:36:14 +03:00
void memcpyh2d ( void * dst , size_t byteoffset , const void * src , size_t nbytes )
2016-01-18 11:35:54 +03:00
{
2016-01-18 11:36:14 +03:00
cudaMemcpy ( byteoffset + ( char * ) dst , src , nbytes , cudaMemcpyHostToDevice ) | | " cudaMemcpy failed " ;
2016-01-18 11:35:54 +03:00
}
2016-01-18 11:36:14 +03:00
void memcpyd2h ( void * dst , const void * src , size_t byteoffset , size_t nbytes )
{
cudaMemcpy ( dst , byteoffset + ( const char * ) src , nbytes , cudaMemcpyDeviceToHost ) | | " cudaMemcpy failed " ;
}
} ;
} ;