Fix for GPU Sparse ColumnSlice

bmitra 2015-11-28 17:16:38 -08:00
Parent 925f28e689
Commit d5387bd49b
6 changed files with 112 additions and 94 deletions
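The fix in a nutshell: ColumnSlice on a GPU sparse CSC matrix used to hand back a view that kept the parent's nonzero count and an unadjusted value pointer, so the slice exposed the wrong elements. This commit derives the slice's m_nz from the compressed column index and makes NzValues() honor the slice offset. A minimal host-side sketch of the CSC arithmetic the fix relies on (cscColPtr, values, startColumn, and numCols are illustrative names, not CNTK API):

#include <cstddef>

// In CSC storage the compressed column index has numCols + 1 entries; column j's
// nonzeros occupy positions cscColPtr[j] .. cscColPtr[j + 1] - 1 of the value
// buffer, so a column range's nonzero count is a difference of two offsets.
int SliceNzCount(const int* cscColPtr, std::size_t startColumn, std::size_t numCols)
{
    return cscColPtr[startColumn + numCols] - cscColPtr[startColumn];
}

// The range's first value sits cscColPtr[startColumn] elements into the buffer.
const float* SliceValues(const float* values, const int* cscColPtr, std::size_t startColumn)
{
    return values + cscColPtr[startColumn];
}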

View file

@ -54,6 +54,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t BufferSize() const {return m_elemSizeAllocated*sizeof(ElemType);}
ElemType* BufferPointer() const;
inline size_t GetNumElemAllocated() const { return m_elemSizeAllocated; }
CPUSparseMatrix<ElemType> ColumnSlice(size_t startColumn, size_t numCols) const;
CPUMatrix<ElemType> CopyColumnSliceToDense(size_t startColumn, size_t numCols) const;

View file

@ -1256,6 +1256,7 @@ __global__ void _tensorShuffleScaleAndAddRowSparse(
int start = aColCSCIndex[col];
int end = aColCSCIndex[col + 1];
int current = start;
for (size_t nc = 0; nc < N; nc++)
{
// recover the 5 indices from the loop counter
@ -1278,6 +1279,9 @@ __global__ void _tensorShuffleScaleAndAddRowSparse(
}
}
}
cColCSCIndex[col] = start;
cColCSCIndex[col + 1] = end;
}
template<class ElemType>

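The two stores added at the end of _tensorShuffleScaleAndAddRowSparse also populate the target's compressed column index, so the kernel's output is a self-consistent CSC matrix rather than a bare value array. A host-side analogue of what each per-column thread writes, using the kernel's parameter names:

// A valid shuffle preserves each column's nonzero count, so the source
// column's start/end offsets carry over to the target unchanged.
void CopyColumnOffsets(const int* aColCSCIndex, int* cColCSCIndex, int numCols)
{
    for (int col = 0; col < numCols; col++)
    {
        cColCSCIndex[col]     = aColCSCIndex[col];
        cColCSCIndex[col + 1] = aColCSCIndex[col + 1];
    }
}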
View file

@ -128,14 +128,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ChangeDeviceTo(deepCopy.m_computeDevice);
deepCopy.PrepareDevice();
Resize(deepCopy.m_numRows, deepCopy.m_numCols, deepCopy.m_nz, deepCopy.m_format, true, false);
Resize(deepCopy.m_numRows, deepCopy.m_numCols, deepCopy.m_elemSizeAllocated, deepCopy.m_format, true, false);
m_nz = deepCopy.m_nz;
CUDA_CALL(cudaMemcpy(NzValues(), deepCopy.NzValues(), NzSize(), cudaMemcpyDeviceToDevice));
m_sliceViewOffset = deepCopy.m_sliceViewOffset;
CUDA_CALL(cudaMemcpy(BufferPointer(), deepCopy.BufferPointer(), GetSizeElemAllocated(), cudaMemcpyDeviceToDevice));
CUDA_CALL(cudaMemcpy(MajorIndexLocation(), deepCopy.MajorIndexLocation(), MajorIndexSize(), cudaMemcpyDeviceToDevice));
CUDA_CALL(cudaMemcpy(SecondaryIndexLocation(), deepCopy.SecondaryIndexLocation(), SecondaryIndexSize(), cudaMemcpyDeviceToDevice));
m_externalBuffer = false;
m_sliceViewOffset = 0;
SetMatrixName(deepCopy.m_matrixName);
//TODO: to copy other variables used only for class based LM
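Deep-copying now mirrors the whole allocation rather than just the nonzeros in use: a slice view's values can begin mid-buffer, so copying NzSize() bytes from NzValues() would desynchronize the value buffer from the index arrays, and the copied m_sliceViewOffset is what re-establishes the view on the destination. The shape of that change as a hedged sketch (the parameter names are illustrative, not the CNTK members):

#include <cuda_runtime.h>
#include <cstddef>

template <class ElemType>
void CopyValueBuffer(ElemType* dstBuffer, const ElemType* srcBuffer,
                     std::size_t srcElemAllocated,
                     std::size_t& dstSliceViewOffset, std::size_t srcSliceViewOffset)
{
    // Copy the full allocation so values stay aligned with the index arrays...
    cudaMemcpy(dstBuffer, srcBuffer, srcElemAllocated * sizeof(ElemType),
               cudaMemcpyDeviceToDevice);
    // ...and carry the column offset that says where the view begins.
    dstSliceViewOffset = srcSliceViewOffset;
}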
@ -165,12 +166,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (deepCopy.GetFormat() == matrixFormatSparseCSR)
{
SetMatrixFromCSRFormat(deepCopy.RowLocation(), deepCopy.ColLocation(), deepCopy.NzValues(), deepCopy.NzCount(), deepCopy.GetNumRows(), deepCopy.GetNumCols());
SetMatrixFromCSRFormat(deepCopy.RowLocation(), deepCopy.ColLocation(), deepCopy.BufferPointer(), deepCopy.GetNumElemAllocated(), deepCopy.GetNumRows(), deepCopy.GetNumCols());
}
else if (deepCopy.GetFormat() == matrixFormatSparseCSC)
{
SetMatrixFromCSCFormat(deepCopy.ColLocation(), deepCopy.RowLocation(), deepCopy.NzValues(), deepCopy.NzCount(), deepCopy.GetNumRows(), deepCopy.GetNumCols());
SetMatrixFromCSCFormat(deepCopy.ColLocation(), deepCopy.RowLocation(), deepCopy.BufferPointer(), deepCopy.GetNumElemAllocated(), deepCopy.GetNumRows(), deepCopy.GetNumCols());
}
else
NOT_IMPLEMENTED;
@ -189,7 +189,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (this->GetFormat() == matrixFormatSparseCSR)
{
//we need to do conversion because CPUSparseMatrix uses size_t for indexes while GPUSparseMatrix uses int
cpuSparseMatrix.Resize(GetNumRows(), GetNumCols(), GetNumNZElements(), true, false);
cpuSparseMatrix.Resize(GetNumRows(), GetNumCols(), GetNumElemAllocated(), true, false);
cpuSparseMatrix.SetNzCount(GetNumNZElements());
PrepareDevice();
@ -210,7 +210,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CopyBuffer(cpuSparseMatrix.ColLocation(), h_Col, MajorIndexCount());
}
CUDA_CALL(cudaMemcpy(cpuSparseMatrix.NzValues(), NzValues(), NzSize(), cudaMemcpyDeviceToHost));
CUDA_CALL(cudaMemcpy(cpuSparseMatrix.BufferPointer(), BufferPointer(), GetSizeElemAllocated(), cudaMemcpyDeviceToHost));
}
else if (this->GetFormat() == matrixFormatSparseCSC)
@ -236,7 +236,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CopyBuffer(cpuSparseMatrix.RowLocation(), h_Row, MajorIndexCount());
}
CUDA_CALL(cudaMemcpy(cpuSparseMatrix.NzValues(), NzValues(), NzSize(), cudaMemcpyDeviceToHost));
CUDA_CALL(cudaMemcpy(cpuSparseMatrix.BufferPointer(), BufferPointer(), GetSizeElemAllocated(), cudaMemcpyDeviceToHost));
}
else
NOT_IMPLEMENTED;
@ -269,22 +269,22 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
if (sizeof(ElemType) == sizeof(float))
{
CUSPARSE_CALL(cusparseScsr2dense(cusparseHandle, int(m_numRows), int(m_numCols), descr, (float*)NzValues(), RowLocation(), ColLocation(), (float*)denseMatrix.BufferPointer(), int(m_numRows)));
CUSPARSE_CALL(cusparseScsr2dense(cusparseHandle, int(m_numRows), int(m_numCols), descr, (float*)BufferPointer(), RowLocation(), ColLocation(), (float*)denseMatrix.BufferPointer(), int(m_numRows)));
}
else
{
CUSPARSE_CALL(cusparseDcsr2dense(cusparseHandle, int(m_numRows), int(m_numCols), descr, (double*)NzValues(), RowLocation(), ColLocation(), (double*)denseMatrix.BufferPointer(), int(m_numRows)));
CUSPARSE_CALL(cusparseDcsr2dense(cusparseHandle, int(m_numRows), int(m_numCols), descr, (double*)BufferPointer(), RowLocation(), ColLocation(), (double*)denseMatrix.BufferPointer(), int(m_numRows)));
}
}
else if (m_format == MatrixFormat::matrixFormatSparseCSC)
{
if (sizeof(ElemType) == sizeof(float))
{
CUSPARSE_CALL(cusparseScsc2dense(cusparseHandle, int(m_numRows), int(m_numCols), descr, (float*)NzValues(), RowLocation(), ColLocation(), (float*)denseMatrix.BufferPointer(), int(m_numRows)));
CUSPARSE_CALL(cusparseScsc2dense(cusparseHandle, int(m_numRows), int(m_numCols), descr, (float*)BufferPointer(), RowLocation(), ColLocation(), (float*)denseMatrix.BufferPointer(), int(m_numRows)));
}
else
{
CUSPARSE_CALL(cusparseDcsc2dense(cusparseHandle, int(m_numRows), int(m_numCols), descr, (double*)NzValues(), RowLocation(), ColLocation(), (double*)denseMatrix.BufferPointer(), int(m_numRows)));
CUSPARSE_CALL(cusparseDcsc2dense(cusparseHandle, int(m_numRows), int(m_numCols), descr, (double*)BufferPointer(), RowLocation(), ColLocation(), (double*)denseMatrix.BufferPointer(), int(m_numRows)));
}
}
else
@ -333,14 +333,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
if (sizeof(ElemType) == sizeof(float))
{
CUSPARSE_CALL(cusparseScsr2csc(cusparseHandle, int(m_numRows), int(m_numCols), int(m_nz),
(float*)NzValues(), RowLocation(), ColLocation(), (float*)outMatrix.NzValues(),
CUSPARSE_CALL(cusparseScsr2csc(cusparseHandle, int(m_numRows), int(m_numCols), int(m_elemSizeAllocated),
(float*)BufferPointer(), RowLocation(), ColLocation(), (float*)outMatrix.BufferPointer(),
outMatrix.RowLocation(), outMatrix.ColLocation(), CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO));
}
else
{
CUSPARSE_CALL(cusparseDcsr2csc(cusparseHandle, int(m_numRows), int(m_numCols), int(m_nz),
(double*)NzValues(), RowLocation(), ColLocation(), (double*)outMatrix.NzValues(),
CUSPARSE_CALL(cusparseDcsr2csc(cusparseHandle, int(m_numRows), int(m_numCols), int(m_elemSizeAllocated),
(double*)BufferPointer(), RowLocation(), ColLocation(), (double*)outMatrix.BufferPointer(),
outMatrix.RowLocation(), outMatrix.ColLocation(), CUSPARSE_ACTION_NUMERIC, CUSPARSE_INDEX_BASE_ZERO));
}
}
@ -499,12 +499,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (sizeof(ElemType) == sizeof(float))
{
CUSPARSE_CALL(cusparseSdense2csr(cusparseHandle, (int)m_numRows, (int)m_numCols, descr, reinterpret_cast<float*>(denseMatrix.BufferPointer()),
(int)m_numRows, nnzPerRowOrCol, reinterpret_cast<float*>(NzValues()), RowLocation(), ColLocation()));
(int)m_numRows, nnzPerRowOrCol, reinterpret_cast<float*>(BufferPointer()), RowLocation(), ColLocation()));
}
else
{
CUSPARSE_CALL(cusparseDdense2csr(cusparseHandle, (int)m_numRows, (int)m_numCols, descr, reinterpret_cast<double*>(denseMatrix.BufferPointer()),
(int)m_numRows, nnzPerRowOrCol, reinterpret_cast<double*>(NzValues()), RowLocation(), ColLocation()));
(int)m_numRows, nnzPerRowOrCol, reinterpret_cast<double*>(BufferPointer()), RowLocation(), ColLocation()));
}
}
else if (m_format == MatrixFormat::matrixFormatSparseCSC)
@ -512,12 +512,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (sizeof(ElemType) == sizeof(float))
{
CUSPARSE_CALL(cusparseSdense2csc(cusparseHandle, (int)m_numRows, (int)m_numCols, descr, reinterpret_cast<float*>(denseMatrix.BufferPointer()),
(int)m_numRows, nnzPerRowOrCol, reinterpret_cast<float*>(NzValues()), RowLocation(), ColLocation()));
(int)m_numRows, nnzPerRowOrCol, reinterpret_cast<float*>(BufferPointer()), RowLocation(), ColLocation()));
}
else
{
CUSPARSE_CALL(cusparseDdense2csc(cusparseHandle, (int)m_numRows, (int)m_numCols, descr, reinterpret_cast<double*>(denseMatrix.BufferPointer()),
(int)m_numRows, nnzPerRowOrCol, reinterpret_cast<double*>(NzValues()), RowLocation(), ColLocation()));
(int)m_numRows, nnzPerRowOrCol, reinterpret_cast<double*>(BufferPointer()), RowLocation(), ColLocation()));
}
}
if (do_sync) CUDA_CALL(cudaEventRecord(done));
@ -529,11 +529,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::operator=(const GPUSparseMatrix<ElemType>& deepCopy)
{
Clear();
if (this != &deepCopy)
{
SetValue(deepCopy);
}
return *this;
SetMatrixName(deepCopy.m_matrixName);
return *this;
}
template<class ElemType>
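The reworked assignment operator is the canonical self-assignment-guarded form: when source and destination alias, the deep copy is skipped entirely, and the old up-front Clear() — which in that case would have destroyed the source's data too — is gone. The pattern in isolation (Widget and DeepCopyFrom are illustrative stand-ins for the matrix class and SetValue):

struct Widget
{
    Widget& operator=(const Widget& rhs)
    {
        if (this != &rhs)       // x = x must be a harmless no-op
            DeepCopyFrom(rhs);  // stand-in for SetValue(deepCopy)
        return *this;
    }
    void DeepCopyFrom(const Widget&) { /* copy buffers, name, flags */ }
};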
@ -566,7 +567,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (this != &moveFrom)
{
if (OwnBuffer())
Clear(); //always delete the data pointer since we will use the pointer from moveFrom
ReleaseMemory(); //always delete the data pointer since we will use the pointer from moveFrom
m_computeDevice = moveFrom.m_computeDevice;
m_numRows = moveFrom.m_numRows;
m_numCols = moveFrom.m_numCols;
@ -595,13 +596,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template<class ElemType>
GPUSparseMatrix<ElemType>::~GPUSparseMatrix()
{
Clear();
ReleaseMemory();
}
template<class ElemType>
void GPUSparseMatrix<ElemType>::Clear()
void GPUSparseMatrix<ElemType>::ReleaseMemory()
{
// If m_externalBuffer is true then this matrix
// If OwnBuffer() is false then this matrix
// is simply a view over another matrix. In that
// case we shouldn't free anything.
if (OwnBuffer())
@ -617,9 +618,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CUDA_CALL(cudaFree(m_rowToId));
m_rowToId = nullptr;
ZeroInit(m_format, m_computeDevice);
}
ZeroInit(m_format, m_computeDevice);
}
//ResizeAsAndCopyIndexFrom - Resize this sparse matrix to have the same element structure as the passed matrix
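Renaming Clear() to ReleaseMemory() also splits its two jobs: freeing device storage, which is only legal when the matrix owns its buffer, and resetting the object's bookkeeping, which the relocated ZeroInit call now performs for views and owners alike. As a sketch (the member names besides OwnBuffer's role are illustrative):

struct SparseBufferSketch
{
    bool ownBuffer;
    void ReleaseMemory()
    {
        if (ownBuffer)
            FreeDeviceAllocations();  // cudaFree each owned pointer
        ResetBookkeeping();           // corresponds to the relocated ZeroInit
    }
    void FreeDeviceAllocations() { /* cudaFree(m_pArray), cudaFree(m_rowToId), ... */ }
    void ResetBookkeeping()      { /* zero sizes, counts, offsets */ }
};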
@ -662,7 +663,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (m_pArray != nullptr)
{
CUDA_CALL(cudaMemcpy(pArray, NzValues(), NzSize(), cudaMemcpyDeviceToDevice));
CUDA_CALL(cudaMemcpy(pArray, BufferPointer(), GetSizeElemAllocated(), cudaMemcpyDeviceToDevice));
GPUSPARSE_INDEX_TYPE* majorIndexInNewBuffer = (GPUSPARSE_INDEX_TYPE*)(pArray + m_elemSizeAllocated);
GPUSPARSE_INDEX_TYPE* secondaryIndexInNewBuffer = majorIndexInNewBuffer + MajorIndexCount(numRows, numCols, m_elemSizeAllocated, m_format);
@ -734,7 +735,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (m_nz > numNZElemToReserve || m_totalBufferSizeAllocated > bufferSizeNeeded)
LogicError("Resize: To keep values m_nz should <= numNZElemToReserve.");
CUDA_CALL(cudaMemcpy(pArray, NzValues(), NzSize(), cudaMemcpyDeviceToDevice));
CUDA_CALL(cudaMemcpy(pArray, BufferPointer(), GetSizeElemAllocated(), cudaMemcpyDeviceToDevice));
GPUSPARSE_INDEX_TYPE* majorIndexInNewBuffer = (GPUSPARSE_INDEX_TYPE*)(pArray + numNZElemToReserve);
@ -795,7 +796,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
SetNzCount(nz);
cudaMemcpyKind kind = IsOnDevice ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice;
CUDA_CALL(cudaMemcpy(NzValues(), h_Val, NzSize(), kind));
CUDA_CALL(cudaMemcpy(BufferPointer(), h_Val, NzSize(), kind));
if (sizeof(CPUSPARSE_INDEX_TYPE) == sizeof(GPUSPARSE_INDEX_TYPE))
{
@ -817,7 +818,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// this function will allocate memory while the caller needs to release it
template<class ElemType>
void GPUSparseMatrix<ElemType>::GetMatrixFromCSRFormat(CPUSPARSE_INDEX_TYPE*& h_CSRRow, CPUSPARSE_INDEX_TYPE*& h_Col, ElemType*& h_Val, size_t &nz, size_t &numRows, size_t &numCols) const
void GPUSparseMatrix<ElemType>::GetMatrixFromCSRFormat(CPUSPARSE_INDEX_TYPE*& h_CSRRow, CPUSPARSE_INDEX_TYPE*& h_Col, ElemType*& h_Val, size_t &numElemAllocated, size_t &nz, size_t &numRows, size_t &numCols) const
{
if (!OwnBuffer())
LogicError("Cannot Set since the buffer is managed externally.");
@ -825,6 +826,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (h_CSRRow != nullptr || h_Col != nullptr || h_Val != nullptr)
LogicError("GetMatrixFromCSRFormat: Passed pointers must be nullptr");
numElemAllocated = GetNumElemAllocated();
nz = GetNumNZElements();
numRows = GetNumRows();
numCols = GetNumCols();
@ -833,12 +835,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return;
else
{
h_Val = new ElemType[nz];
h_Val = new ElemType[numElemAllocated];
h_CSRRow = new CPUSPARSE_INDEX_TYPE[m_numRows + 1];
h_Col = new CPUSPARSE_INDEX_TYPE[nz];
PrepareDevice();
CUDA_CALL(cudaMemcpy(h_Val, NzValues(), NzSize(), cudaMemcpyDeviceToHost));
CUDA_CALL(cudaMemcpy(h_Val, BufferPointer(), GetSizeElemAllocated(), cudaMemcpyDeviceToHost));
if (sizeof(CPUSPARSE_INDEX_TYPE) == sizeof(GPUSPARSE_INDEX_TYPE))
{
@ -875,7 +877,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
SetNzCount(nz);
cudaMemcpyKind kind = IsOnDevice ? cudaMemcpyDeviceToDevice : cudaMemcpyHostToDevice;
CUDA_CALL(cudaMemcpy(NzValues(), h_Val, NzSize(), kind));
CUDA_CALL(cudaMemcpy(BufferPointer(), h_Val, NzSize(), kind));
if (sizeof(CPUSPARSE_INDEX_TYPE) == sizeof(GPUSPARSE_INDEX_TYPE))
{
@ -897,11 +899,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// this function will allocate memory while the caller needs to release it
template<class ElemType>
void GPUSparseMatrix<ElemType>::GetMatrixFromCSCFormat(GPUSPARSE_INDEX_TYPE*& h_CSCCol, GPUSPARSE_INDEX_TYPE*& h_Row, ElemType*& h_Val, size_t &nz, size_t &numRows, size_t &numCols) const
void GPUSparseMatrix<ElemType>::GetMatrixFromCSCFormat(GPUSPARSE_INDEX_TYPE*& h_CSCCol, GPUSPARSE_INDEX_TYPE*& h_Row, ElemType*& h_Val, size_t &numElemAllocated, size_t &nz, size_t &numRows, size_t &numCols) const
{
if (h_CSCCol != nullptr || h_Row != nullptr || h_Val != nullptr)
LogicError("GetMatrixFromCSCFormat: Passed pointers must be nullptr");
numElemAllocated = GetNumElemAllocated();
nz = GetNumNZElements();
numRows = GetNumRows();
numCols = GetNumCols();
@ -910,12 +913,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return;
else
{
h_Val = new ElemType[nz];
h_Val = new ElemType[numElemAllocated];
h_CSCCol = new GPUSPARSE_INDEX_TYPE[m_numRows + 1];
h_Row = new GPUSPARSE_INDEX_TYPE[nz];
PrepareDevice();
CUDA_CALL(cudaMemcpy(h_Val, NzValues(), NzSize(), cudaMemcpyDeviceToHost));
CUDA_CALL(cudaMemcpy(h_Val, BufferPointer(), GetSizeElemAllocated(), cudaMemcpyDeviceToHost));
if (sizeof(CPUSPARSE_INDEX_TYPE) == sizeof(GPUSPARSE_INDEX_TYPE))
{
@ -1038,7 +1041,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
alpha,
reinterpret_cast<const ElemType*>(lhs.BufferPointer()), //dense
transposeA,
reinterpret_cast<const ElemType*>(rhs.NzValues()), //sparse nz values
reinterpret_cast<const ElemType*>(rhs.BufferPointer()), //sparse nz values
rhs.RowLocation(),
rhs.ColLocation(),
beta,
@ -1072,7 +1075,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
alpha,
reinterpret_cast<const ElemType*>(lhs.BufferPointer()), //dense
transposeA,
reinterpret_cast<const ElemType*>(rhs.NzValues()), //sparse nz values
reinterpret_cast<const ElemType*>(rhs.BufferPointer()), //sparse nz values
rhs.RowLocation(),
rhs.ColLocation(),
reinterpret_cast<ElemType*> (c.BufferPointer()) //dense target
@ -1121,10 +1124,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CUDA_LONG N = (CUDA_LONG)c.GetNumCols();
int blocksPerGrid = (int)ceil(1.0*N / threadsPerBlock);
_tensorShuffleScaleAndAddRowSparse<ElemType> << <blocksPerGrid, threadsPerBlock, 0, t_stream >> >(
reinterpret_cast<const ElemType*>(a.NzValues()), // source nz values
reinterpret_cast<const ElemType*>(a.BufferPointer()), // source nz values
a.RowLocation(),
a.ColLocation(),
reinterpret_cast<ElemType*>(c.NzValues()), // target nz values
reinterpret_cast<ElemType*>(c.BufferPointer()), // target nz values
c.RowLocation(),
c.ColLocation(),
D, S, M, K, T);
@ -1204,7 +1207,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t nnz = m*c.m_blockSize;
c.Resize(m, n, nnz, true, true); //we need to keep the col2blockid and blockid2col info when resizing.
c.m_nz = nnz;
CUDA_CALL(cudaMemset(c.NzValues(), 0, sizeof(ElemType)*(c.m_nz)));
CUDA_CALL(cudaMemset(c.BufferPointer(), 0, sizeof(ElemType)*(c.m_elemSizeAllocated)));
LONG64 N = (LONG64)lhs.GetNumElements(); //here we process for each row in lhs and each column in rhs (==columns in lhs)
blocksPerGrid = (int)ceil(((double)N) / threadsPerBlock);
@ -1213,11 +1216,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
lhs.BufferPointer(),
m,
l,
rhs.NzValues(),
rhs.BufferPointer(),
rhs.RowLocation(),
rhs.ColLocation(),
c.ColOrRow2BlockId(),
c.NzValues());
c.BufferPointer());
}
else
{
@ -1225,7 +1228,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t nnz = m*c.m_blockSize;
c.Resize(m, n, nnz, true, false);
c.m_nz = nnz;
CUDA_CALL(cudaMemset(c.NzValues(), 0, sizeof(ElemType)*(c.m_nz)));
CUDA_CALL(cudaMemset(c.BufferPointer(), 0, sizeof(ElemType)*(c.m_elemSizeAllocated)));
CUDA_CALL(cudaMemset(c.BlockId2ColOrRow(), 0, sizeof(GPUSPARSE_INDEX_TYPE)*(c.m_blockSize)));
LONG64 N = (LONG64)lhs.GetNumElements(); //here we process for each row in lhs and each column in rhs (==columns in lhs)
@ -1235,11 +1238,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
lhs.BufferPointer(),
m,
l,
rhs.NzValues(),
rhs.BufferPointer(),
rhs.RowLocation(),
rhs.ColLocation(),
rhs.m_rowToId,
c.NzValues(),
c.BufferPointer(),
c.BlockId2ColOrRow());
}
@ -1307,7 +1310,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
lhs.GetNumRows(),
lhs.GetNumCols(),
lhs.m_blockSize,
lhs.NzValues(),
lhs.BufferPointer(),
lhs.BlockId2ColOrRow(),
rhs.BufferPointer());
@ -1388,7 +1391,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
GetNumRows(),
GetNumCols(),
m_blockSize,
NzValues(),
BufferPointer(),
BlockId2ColOrRow(),
c.BufferPointer());
@ -1435,7 +1438,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
int blocksPerGrid = (m_nz + threadsPerBlock - 1) / threadsPerBlock;
bool colMajor = (m_format == MatrixFormat::matrixFormatSparseBlockCol ? true : false);
size_t len = colMajor ? GetNumRows() : GetNumCols();
_adagrad4BlockSparse<ElemType> << <blocksPerGrid, threadsPerBlock >> >(c.GetArray(), c.GetNumRows(), NzValues(), BlockId2ColOrRow(), multipliers, colMajor, len, m_nz);
_adagrad4BlockSparse<ElemType> << <blocksPerGrid, threadsPerBlock >> >(c.GetArray(), c.GetNumRows(), BufferPointer(), BlockId2ColOrRow(), multipliers, colMajor, len, m_nz);
}
else
NOT_IMPLEMENTED;
@ -1494,13 +1497,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
if (sizeof(ElemType)==sizeof(float))
{
CUSPARSE_CALL(cusparseScsrmm(cusparseHandle,oper,m,n,k,(int)a.GetNumNZElements(),reinterpret_cast <float*>(&alpha),descr,reinterpret_cast <const float*>(a.NzValues()),
CUSPARSE_CALL(cusparseScsrmm(cusparseHandle,oper,m,n,k,(int)a.GetNumElemAllocated(),reinterpret_cast <float*>(&alpha),descr,reinterpret_cast <const float*>(a.BufferPointer()),
a.RowLocation(), a.ColLocation(), reinterpret_cast <float*>(b.BufferPointer()),
(int)b.GetNumRows(),reinterpret_cast <float*>(&beta),reinterpret_cast <float*>(c.BufferPointer()),(int)c.GetNumRows()));
}
else
{
CUSPARSE_CALL(cusparseDcsrmm(cusparseHandle,oper,m,n,k,(int)a.GetNumNZElements(),reinterpret_cast <double*>(&alpha),descr,reinterpret_cast <const double*>(a.NzValues()),
CUSPARSE_CALL(cusparseDcsrmm(cusparseHandle, oper, m, n, k, (int)a.GetNumElemAllocated(), reinterpret_cast <double*>(&alpha), descr, reinterpret_cast <const double*>(a.BufferPointer()),
a.RowLocation(), a.ColLocation(), reinterpret_cast <double*>(b.BufferPointer()),
(int)b.GetNumRows(),reinterpret_cast <double*>(&beta),reinterpret_cast <double*>(c.BufferPointer()),(int)c.GetNumRows()));
}
@ -1586,9 +1589,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
bool allocatedBuffer = false;
// do we have enough memory to store just the row buffer?
if (cSize >= rowBufferRequired && c.NzValues() != nullptr && canReuseBuffer)
if (cSize >= rowBufferRequired && c.BufferPointer() != nullptr && canReuseBuffer)
{
csrRowPtrC = (GPUSPARSE_INDEX_TYPE*)c.NzValues();
csrRowPtrC = (GPUSPARSE_INDEX_TYPE*)c.BufferPointer();
}
else
{
@ -1665,15 +1668,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//Step 2
if (sizeof(float)==sizeof(ElemType))
{
CUSPARSE_CALL(cusparseScsrgemm(cusparseHandle,operA,operB,m,n,k,descrA,nnzA,(const float*)S1.NzValues(),S1.RowLocation(),S1.ColLocation(),
descrB,nnzB,(const float*)S2.NzValues(),S2.RowLocation(),S2.ColLocation(),
descrC,(float*)c.NzValues(),c.RowLocation(),c.ColLocation()));
CUSPARSE_CALL(cusparseScsrgemm(cusparseHandle, operA, operB, m, n, k, descrA, nnzA, (const float*)S1.BufferPointer(), S1.RowLocation(), S1.ColLocation(),
descrB, nnzB, (const float*)S2.BufferPointer(), S2.RowLocation(), S2.ColLocation(),
descrC, (float*)c.BufferPointer(), c.RowLocation(), c.ColLocation()));
}
else
{
CUSPARSE_CALL(cusparseDcsrgemm(cusparseHandle,operA,operB,m,n,k,descrA,nnzA,(const double*)S1.NzValues(),S1.RowLocation(),S1.ColLocation(),
descrB,nnzB,(const double*)S2.NzValues(),S2.RowLocation(),S2.ColLocation(),
descrC,(double*)c.NzValues(),c.RowLocation(),c.ColLocation()));
CUSPARSE_CALL(cusparseDcsrgemm(cusparseHandle, operA, operB, m, n, k, descrA, nnzA, (const double*)S1.BufferPointer(), S1.RowLocation(), S1.ColLocation(),
descrB, nnzB, (const double*)S2.BufferPointer(), S2.RowLocation(), S2.ColLocation(),
descrC, (double*)c.BufferPointer(), c.RowLocation(), c.ColLocation()));
}
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
@ -1731,13 +1734,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
//Step 2
if (sizeof(ElemType)==sizeof(float))
{
CUSPARSE_CALL(cusparseScsrgeam(cusparseHandle,m,n,reinterpret_cast <const float*>(&alpha),descrA,nnzA,reinterpret_cast <const float*>(a.NzValues()),a.RowLocation(),a.ColLocation(),
reinterpret_cast <const float*>(&beta),descrB,nnzB,reinterpret_cast <const float*>(b.NzValues()),b.RowLocation(),b.ColLocation(),descrC,reinterpret_cast <float*>(c.NzValues()),c.RowLocation(),c.ColLocation()));
CUSPARSE_CALL(cusparseScsrgeam(cusparseHandle, m, n, reinterpret_cast <const float*>(&alpha), descrA, nnzA, reinterpret_cast <const float*>(a.BufferPointer()), a.RowLocation(), a.ColLocation(),
reinterpret_cast <const float*>(&beta), descrB, nnzB, reinterpret_cast <const float*>(b.BufferPointer()), b.RowLocation(), b.ColLocation(), descrC, reinterpret_cast <float*>(c.BufferPointer()), c.RowLocation(), c.ColLocation()));
}
else
{
CUSPARSE_CALL(cusparseDcsrgeam(cusparseHandle,m,n,reinterpret_cast <const double*>(&alpha),descrA,nnzA,reinterpret_cast <const double*>(a.NzValues()),a.RowLocation(),a.ColLocation(),
reinterpret_cast <const double*>(&beta),descrB,nnzB,reinterpret_cast <const double*>(b.NzValues()),b.RowLocation(),b.ColLocation(),descrC,reinterpret_cast <double*>(c.NzValues()),c.RowLocation(),c.ColLocation()));
CUSPARSE_CALL(cusparseDcsrgeam(cusparseHandle, m, n, reinterpret_cast <const double*>(&alpha), descrA, nnzA, reinterpret_cast <const double*>(a.BufferPointer()), a.RowLocation(), a.ColLocation(),
reinterpret_cast <const double*>(&beta), descrB, nnzB, reinterpret_cast <const double*>(b.BufferPointer()), b.RowLocation(), b.ColLocation(), descrC, reinterpret_cast <double*>(c.BufferPointer()), c.RowLocation(), c.ColLocation()));
}
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
@ -1766,7 +1769,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
CUDA_LONG M=(CUDA_LONG)a.GetNumRows();
int blocksPerGrid =(int)ceil(1.0*M/threadsPerBlock);
_sparseCSRPlusDense<ElemType><<<blocksPerGrid,threadsPerBlock>>>(alpha,a.NzValues(),a.RowLocation(),a.ColLocation(),c.BufferPointer(),M);
_sparseCSRPlusDense<ElemType><<<blocksPerGrid,threadsPerBlock>>>(alpha,a.BufferPointer(),a.RowLocation(),a.ColLocation(),c.BufferPointer(),M);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -1859,11 +1862,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
if (sizeof(ElemType) == sizeof(float))
{
CUSPARSE_CALL(cusparseScsr2csc(cusparseHandle, m, n, nnz, reinterpret_cast<const float*>(a.NzValues()), a.RowLocation(), a.ColLocation(), reinterpret_cast<float*>(cscValA), cscRowIndA, cscColPtrA, cpVals, idxBase));
CUSPARSE_CALL(cusparseScsr2csc(cusparseHandle, m, n, nnz, reinterpret_cast<const float*>(a.BufferPointer()), a.RowLocation(), a.ColLocation(), reinterpret_cast<float*>(cscValA), cscRowIndA, cscColPtrA, cpVals, idxBase));
}
else
{
CUSPARSE_CALL(cusparseDcsr2csc(cusparseHandle, m, n, nnz, reinterpret_cast<const double*>(a.NzValues()), a.RowLocation(), a.ColLocation(), reinterpret_cast<double*>(cscValA), cscRowIndA, cscColPtrA, cpVals, idxBase));
CUSPARSE_CALL(cusparseDcsr2csc(cusparseHandle, m, n, nnz, reinterpret_cast<const double*>(a.BufferPointer()), a.RowLocation(), a.ColLocation(), reinterpret_cast<double*>(cscValA), cscRowIndA, cscColPtrA, cpVals, idxBase));
}
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
@ -1871,7 +1874,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
else if (a.m_format == matrixFormatSparseCSC)
{
cscValA = (ElemType*)a.NzValues();
cscValA = (ElemType*)a.BufferPointer();
cscRowIndA = a.RowLocation();
cscColPtrA = a.ColLocation();
}
@ -2023,7 +2026,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (do_sync) CUDA_CALL(cudaEventCreate(&done));
CUDA_LONG M=(CUDA_LONG)a.GetNumRows();
int blocksPerGrid =(int)ceil(1.0*M/threadsPerBlock);
_sparseCSRElemMulDense<ElemType><<<blocksPerGrid,threadsPerBlock>>>(a.NzValues(),a.RowLocation(),a.ColLocation(),b.BufferPointer(),c.BufferPointer(),M);
_sparseCSRElemMulDense<ElemType> << <blocksPerGrid, threadsPerBlock >> >(a.BufferPointer(), a.RowLocation(), a.ColLocation(), b.BufferPointer(), c.BufferPointer(), M);
if (do_sync) CUDA_CALL(cudaEventRecord(done));
if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@ -2120,26 +2123,26 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
if (sizeof(ElemType) == sizeof(float))
{
CUSPARSE_CALL(cusparseScsr2csc(cusparseHandle, m, n, nnz, reinterpret_cast<const float*>(this->NzValues()), this->RowLocation(), this->ColLocation(),
reinterpret_cast<float*>(c.NzValues()), c.ColLocation(), c.RowLocation(), cpVals, idxBase));
CUSPARSE_CALL(cusparseScsr2csc(cusparseHandle, m, n, nnz, reinterpret_cast<const float*>(this->BufferPointer()), this->RowLocation(), this->ColLocation(),
reinterpret_cast<float*>(c.BufferPointer()), c.ColLocation(), c.RowLocation(), cpVals, idxBase));
}
else
{
CUSPARSE_CALL(cusparseDcsr2csc(cusparseHandle, m, n, nnz, reinterpret_cast<const double*>(this->NzValues()), this->RowLocation(), this->ColLocation(),
reinterpret_cast<double*>(c.NzValues()), c.ColLocation(), c.RowLocation(), cpVals, idxBase));
CUSPARSE_CALL(cusparseDcsr2csc(cusparseHandle, m, n, nnz, reinterpret_cast<const double*>(this->BufferPointer()), this->RowLocation(), this->ColLocation(),
reinterpret_cast<double*>(c.BufferPointer()), c.ColLocation(), c.RowLocation(), cpVals, idxBase));
}
}
else if (m_format == matrixFormatSparseCSC)
{
if (sizeof(ElemType) == sizeof(float))
{
CUSPARSE_CALL(cusparseScsr2csc(cusparseHandle, m, n, nnz, reinterpret_cast<const float*>(this->NzValues()), this->ColLocation(), this->RowLocation(),
reinterpret_cast<float*>(c.NzValues()), c.RowLocation(), c.ColLocation(), cpVals, idxBase));
CUSPARSE_CALL(cusparseScsr2csc(cusparseHandle, m, n, nnz, reinterpret_cast<const float*>(this->BufferPointer()), this->ColLocation(), this->RowLocation(),
reinterpret_cast<float*>(c.BufferPointer()), c.RowLocation(), c.ColLocation(), cpVals, idxBase));
}
else
{
CUSPARSE_CALL(cusparseDcsr2csc(cusparseHandle, m, n, nnz, reinterpret_cast<const double*>(this->NzValues()), this->ColLocation(), this->RowLocation(),
reinterpret_cast<double*>(c.NzValues()), c.RowLocation(), c.ColLocation(), cpVals, idxBase));
CUSPARSE_CALL(cusparseDcsr2csc(cusparseHandle, m, n, nnz, reinterpret_cast<const double*>(this->BufferPointer()), this->ColLocation(), this->RowLocation(),
reinterpret_cast<double*>(c.BufferPointer()), c.RowLocation(), c.ColLocation(), cpVals, idxBase));
}
}
else
@ -2191,7 +2194,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
slice.m_computeDevice = m_computeDevice;
slice.m_numRows = m_numRows;
slice.m_numCols = numCols;
slice.m_nz = m_nz;
slice.m_nz = SecondaryIndexValueAt(startColumn + numCols) - SecondaryIndexValueAt(startColumn);
slice.m_elemSizeAllocated = m_elemSizeAllocated;
slice.m_totalBufferSizeAllocated = m_totalBufferSizeAllocated;
slice.m_pArray = m_pArray;
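A worked instance of the new m_nz computation: for a 4-column CSC matrix with compressed column index {0, 2, 5, 9, 10}, a 2-column slice starting at column 1 now reports 9 - 2 = 7 nonzeros, where the old code reported the parent's full count. Checking that arithmetic in plain host code (the names are illustrative):

#include <cassert>
#include <cstddef>

int main()
{
    const int colPtr[] = { 0, 2, 5, 9, 10 };  // CSC compressed column index
    const std::size_t startColumn = 1, numCols = 2;
    // Mirrors slice.m_nz = SecondaryIndexValueAt(startColumn + numCols)
    //                    - SecondaryIndexValueAt(startColumn) above.
    const int sliceNz = colPtr[startColumn + numCols] - colPtr[startColumn];
    assert(sliceNz == 7);  // columns 1 and 2 hold 3 + 4 nonzeros
    return 0;
}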
@ -2237,11 +2240,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CUSPARSE_CALL(cusparseSetStream(cusparseHandle, t_stream));
if (sizeof(ElemType) == sizeof(float))
{
CUSPARSE_CALL(cusparseScsc2dense(cusparseHandle, m, numCols, descr, (float*)NzValues(), RowLocation(), ColLocation() + startColumn, (float*)slice.BufferPointer(), m));
CUSPARSE_CALL(cusparseScsc2dense(cusparseHandle, m, numCols, descr, (float*)BufferPointer(), RowLocation(), ColLocation() + startColumn, (float*)slice.BufferPointer(), m));
}
else
{
CUSPARSE_CALL(cusparseDcsc2dense(cusparseHandle, m, numCols, descr, (double*)NzValues(), RowLocation(), ColLocation() + startColumn, (double*)slice.BufferPointer(), m));
CUSPARSE_CALL(cusparseDcsc2dense(cusparseHandle, m, numCols, descr, (double*)BufferPointer(), RowLocation(), ColLocation() + startColumn, (double*)slice.BufferPointer(), m));
}
if (do_sync) CUDA_CALL(cudaEventRecord(done));
@ -2787,7 +2790,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
stream<<us.GetMatrixName();
}
size_t nz = us.GetNumNZElements(), numRows=us.GetNumRows(), numCols=us.GetNumCols();
size_t nz = us.GetNumNZElements(), numElemAllocated = us.GetNumElemAllocated(), numRows = us.GetNumRows(), numCols = us.GetNumCols();
size_t compressedSize = us.SecondaryIndexCount();
int format = us.GetFormat();
@ -2800,9 +2803,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
CPUSPARSE_INDEX_TYPE* unCompressedIndex = nullptr;
if (us.m_format == matrixFormatSparseCSC)
us.GetMatrixFromCSCFormat(compressedIndex, unCompressedIndex, dataBuffer, nz, numRows, numCols);
us.GetMatrixFromCSCFormat(compressedIndex, unCompressedIndex, dataBuffer, numElemAllocated, nz, numRows, numCols);
else if (us.m_format == matrixFormatSparseCSR)
us.GetMatrixFromCSRFormat(compressedIndex, unCompressedIndex, dataBuffer, nz, numRows, numCols);
us.GetMatrixFromCSRFormat(compressedIndex, unCompressedIndex, dataBuffer, numElemAllocated, nz, numRows, numCols);
else
NOT_IMPLEMENTED;

View file

@ -69,8 +69,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// in memory format is always in the following order:
// Non-zero data elements, Full index locations, compressed index locations
// In CSR row data is compressed, in CSC col data is compressed
inline const ElemType* NzValues() const {return m_pArray;}
inline ElemType* NzValues() {return m_pArray;}
inline const ElemType* NzValues() const { return m_format != matrixFormatSparseCSC ? m_pArray : m_pArray + SecondaryIndexValueAt(m_sliceViewOffset); }
inline ElemType* NzValues() { return m_format != matrixFormatSparseCSC ? m_pArray : m_pArray + SecondaryIndexValueAt(m_sliceViewOffset); }
inline size_t NzSize() const {return sizeof(ElemType)*m_nz;} // actual number of element bytes in use
GPUSPARSE_INDEX_TYPE* MajorIndexLocation() const //row/col ids in CSC/CSR format, blockId2col/blockId2row in BlockCol/BlockRow format
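For a CSC slice view, m_sliceViewOffset is a column position, so NzValues() now translates it into an element position through the compressed column index; SecondaryIndexValueAt (added further down) performs that translation as a synchronous single-index device-to-host copy. The two steps fused into one hedged standalone helper (the names are illustrative, not CNTK API):

#include <cuda_runtime.h>
#include <cstddef>

template <class ElemType>
const ElemType* CscSliceNzValues(const ElemType* pArray,
                                 const int* deviceColPtr, std::size_t sliceViewOffset)
{
    // One 4-byte device-to-host read of the slice's starting offset...
    int startOffset = 0;
    cudaMemcpy(&startOffset, deviceColPtr + sliceViewOffset, sizeof(int),
               cudaMemcpyDeviceToHost);
    // ...then advance the value pointer past the preceding columns' nonzeros.
    return pArray + startOffset;
}

Since each call costs a blocking transfer, callers that need the pointer repeatedly are better off reading it once and reusing it.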
@ -134,6 +134,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
inline size_t BufferSizeAllocated() const { return m_totalBufferSizeAllocated; }
inline ElemType* BufferPointer() const { return m_pArray; }
inline size_t GetNumElemAllocated() const { return m_elemSizeAllocated; }
inline size_t GetSizeElemAllocated() const { return sizeof(ElemType)*m_elemSizeAllocated; }
// the column and row locations will swap based on what format we are in. Full index always follows the data array
GPUSPARSE_INDEX_TYPE* RowLocation() const
@ -163,7 +165,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
assert(m_format == matrixFormatSparseCSC || m_format == matrixFormatSparseCSR);
return (m_format&matrixFormatRowMajor) ? MajorIndexSize() : SecondaryIndexSize();
}
}
GPUSPARSE_INDEX_TYPE SecondaryIndexValueAt(size_t idx) const
{
GPUSPARSE_INDEX_TYPE value;
CUDA_CALL(cudaMemcpy(&value, SecondaryIndexLocation() + idx, sizeof(GPUSPARSE_INDEX_TYPE), cudaMemcpyDeviceToHost));
return value;
}
GPUSPARSE_INDEX_TYPE* BlockId2ColOrRow() const
{
//not a valid function for other formats
@ -225,9 +234,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
const size_t nz, const size_t numRows, const size_t numCols, const bool IsOnDevice = false, const DEVICEID_TYPE devId = -1);
//Gets sparse matrix in CSR format. this acts as deep copy. All passed pointers must be NULL. the function will allocate memory itself.
void GetMatrixFromCSRFormat(CPUSPARSE_INDEX_TYPE*& h_CSRRow, CPUSPARSE_INDEX_TYPE*& h_Col, ElemType*& h_Val, size_t &nz, size_t &numRows, size_t &numCols) const;
void GetMatrixFromCSRFormat(CPUSPARSE_INDEX_TYPE*& h_CSRRow, CPUSPARSE_INDEX_TYPE*& h_Col, ElemType*& h_Val, size_t &numElemAllocated, size_t &nz, size_t &numRows, size_t &numCols) const;
void GetMatrixFromCSCFormat(CPUSPARSE_INDEX_TYPE*& h_CSCCol, CPUSPARSE_INDEX_TYPE*& h_Row, ElemType*& h_Val, size_t &nz, size_t &numRows, size_t &numCols) const;
void GetMatrixFromCSCFormat(CPUSPARSE_INDEX_TYPE*& h_CSCCol, CPUSPARSE_INDEX_TYPE*& h_Row, ElemType*& h_Val, size_t &numElemAllocated, size_t &nz, size_t &numRows, size_t &numCols) const;
void ConvertToSparseFormat(MatrixFormat newFormat);
void ConvertToSparseFormat(MatrixFormat newFormat, GPUSparseMatrix<ElemType>& outMatrix) const;
@ -326,7 +335,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
private:
void performInplaceFunction(const int kind);
void DeepCopy(const GPUSparseMatrix<ElemType>& deepCopyFrom);
void Clear();
void ReleaseMemory();
void PrepareBuffer(const size_t numRows, const size_t numCols, const bool canReuseBuffer, std::function<size_t(GPUSPARSE_INDEX_TYPE* csrRowPtrC)> func);
size_t ElemCountFromBufferSize(const size_t numRows, const size_t numCols, const MatrixFormat format, const size_t totalBufferSize) const;

View file

@ -222,8 +222,8 @@ namespace Microsoft
float *arr = nullptr;
int *ii = nullptr;
int *jj = nullptr;
size_t nz, nr, nc;
poweredMatrix.GetMatrixFromCSRFormat(ii, jj, arr, nz, nr, nc);
size_t ea, nz, nr, nc;
poweredMatrix.GetMatrixFromCSRFormat(ii, jj, arr, ea, nz, nr, nc);
for (int index = 0; index < c_size; ++index)
{

View file

@ -180,6 +180,7 @@ namespace Microsoft
bool transposeA = false, transposeB = false;
float alpha = 2.4f;
float beta = 0.0f;
Matrix<float>::MultiplyAndWeightedAdd(alpha, mAdense, transposeA, mBdense, transposeB, beta, mCdense);
Matrix<float>::MultiplyAndWeightedAdd(alpha, mAsparse, transposeA, mBsparse, transposeB, beta, mCsparse);
mCsparse.SwitchToMatrixType(MatrixType::DENSE, matrixFormatDense, true);
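The added dense multiply gives the test an oracle: the same MultiplyAndWeightedAdd runs through both the dense and sparse code paths, and once mCsparse is switched back to DENSE the two results can be compared elementwise. A plausible closing assertion, hedged (the assertion macro and tolerance are assumptions about the surrounding test framework; IsEqualTo with a threshold is the usual Matrix comparison idiom):

// Assumed check: sparse-path result matches the dense reference within 1e-4.
Assert::IsTrue(mCsparse.IsEqualTo(mCdense, 0.0001f));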