CNTK/Source/Math/cudabasetypes.h

// cudabasetypes.h -- basic types used on both CUDA and PC side
//
// F. Seide, V-hansu
#pragma once
#ifdef __CUDA_ARCH__ // we are compiling under CUDA
#define ON_CUDA 1
#else
#define ON_CUDA 0 // TODO: this does not work for some combination--fix this
#endif
// under nvcc the CUDA headers define __device__, so the qualifiers are passed
// through; under a plain host compiler they expand to nothing
#ifdef __device__
#define cudacode __device__
#define cudasharedcode __device__ __host__
#else
#define cudacode
#define cudasharedcode
#endif
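// minimal usage sketch (hypothetical function, not part of this header): under
// nvcc the function below compiles for both host and device; under a plain host
// compiler the qualifier expands to nothing and it is an ordinary function
//
//     cudasharedcode float axpy (float a, float x, float y) { return a * x + y; }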
#include <assert.h>
namespace msra { namespace cuda {
typedef size_t cuda_size_t; // TODO: verify if this is consistent across CPU/CUDA, or use uint32 or so
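// one possible compile-time guard for the TODO above (sketch; assumes a 64-bit
// build, so the host and CUDA passes must both see an 8-byte size_t):
//
//     static_assert (sizeof (cuda_size_t) == 8, "cuda_size_t width differs between host and CUDA");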
// we wrap CUDA pointers so that we don't accidentally use them in CPU code
template <typename T>
class cuda_ptr
{
    T* p; // CUDA pointers are the same as host (e.g. Win32 is restricted to 32-bit CUDA pointers)

public:
    void swap(cuda_ptr& other)
    {
        T* tmp = p;
        p = other.p;
        other.p = tmp;
    }
    // element access is device-only when compiling under CUDA; host code goes through get()
    cudacode T& operator[](size_t index)
    {
        return p[index];
    }
    cudacode const T& operator[](size_t index) const
    {
        return p[index];
    }
    // pointer arithmetic is available on both host and device
    cudasharedcode cuda_ptr operator+(size_t index) const
    {
        return cuda_ptr(p + index);
    }
    cudasharedcode cuda_ptr operator-(size_t index) const
    {
        return cuda_ptr(p - index);
    }
    cuda_ptr(T* pp)
        : p(pp)
    {
    }
    T* get() const
    {
        return p;
    }
};
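// usage sketch (hypothetical; assumes 'devp' is a float* into device memory):
// host code can hold and offset a cuda_ptr, but reaching the raw pointer takes
// an explicit get(), e.g. for a cudaMemcpy back to the host:
//
//     cuda_ptr<float> dp (devp);
//     cuda_ptr<float> elem = dp + 16; // offsetting is fine on both sides
//     float v;
//     cudaMemcpy (&v, elem.get(), sizeof (v), cudaMemcpyDeviceToHost);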
// reference to a vector (without allocation) that lives in CUDA RAM
// This can be directly passed by value to CUDA functions.
template <typename T>
class vectorref
{
    cuda_ptr<T> p; // pointer in CUDA space of this device
    cuda_size_t n; // number of elements

public:
    cudasharedcode size_t size() const throw()
    {
        return n;
    }
    cudacode T& operator[](size_t i)
    {
        return p[i];
    }
    cudacode const T& operator[](size_t i) const
    {
        return p[i];
    }
    cuda_ptr<T> get() const throw()
    {
        return p;
    }
    // swap in a new buffer; returns the previous pointer, e.g. so the caller can free it
    cuda_ptr<T> reset(cuda_ptr<T> pp, size_t nn) throw()
    {
        p.swap(pp);
        n = nn;
        return pp;
    }
    vectorref(cuda_ptr<T> pp, size_t nn)
        : p(pp), n(nn)
    {
    }
    vectorref()
        : p(0), n(0)
    {
    }
};
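// usage sketch (hypothetical kernel, assuming nvcc compilation): since a
// vectorref is just a device pointer plus an element count, it can be passed
// by value as a kernel argument and indexed on the device like a plain array:
//
//     __global__ void setto (msra::cuda::vectorref<float> v, float val)
//     {
//         size_t i = blockIdx.x * blockDim.x + threadIdx.x;
//         if (i < v.size())
//             v[i] = val;
//     }
//
// launched from the host, e.g.: setto<<<(unsigned) ((v.size() + 255) / 256), 256>>> (v, 0.0f);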
// reference to a matrix
template <typename T>
class matrixref
{
protected:
    cuda_ptr<T> p;    // pointer in CUDA space of this device
    size_t numrows;   // rows()
    size_t numcols;   // cols()
    size_t colstride; // height of column = rows() rounded to multiples of 4
    // matrix is in column-major storage: element (i,j) lives at p[j * colstride + i]
    cudasharedcode size_t locate(size_t i, size_t j) const
    {
        return j * colstride + i;
    }
    matrixref()
        : p(0), numrows(0), numcols(0), colstride(0)
    {
    }

public:
    matrixref(T* p, size_t numRows, size_t numCols, size_t colStride)
        : p(p), numrows(numRows), numcols(numCols), colstride(colStride)
    {
    }
    cuda_ptr<T> get() const throw()
    {
        return p;
    }
    cudasharedcode size_t rows() const throw()
    {
        return numrows;
    }
    cudasharedcode size_t cols() const throw()
    {
        return numcols;
    }
    // reinterpret the dimensions; the total number of elements must not change
    cudasharedcode void reshape(const size_t newrows, const size_t newcols)
    {
        assert(rows() * cols() == newrows * newcols);
        numrows = newrows;
        numcols = newcols;
    }
    cudasharedcode size_t getcolstride() const throw()
    {
        return colstride;
    }
    cudacode T& operator()(size_t i, size_t j)
    {
        return p[locate(i, j)];
    }
    cudacode const T& operator()(size_t i, size_t j) const
    {
        return p[locate(i, j)];
    }
};
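// addressing example (illustrative numbers): for a matrix with rows() == 5,
// colstride is rounded up to 8, so element (i,j) = (2,3) sits at offset
// locate(2,3) = 3 * 8 + 2 = 26 elements from the base pointer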
} }