Merge m_blockVal into m_pArray in GPUSparseMatrix.
Change ClassBasedCrossEntropyWithSoftmaxNode to use TypeName() instead of string literals when determining the node types.
This commit is contained in:
Parent
b7fb0ddc87
Commit
f3dfe81034
|
@ -1143,8 +1143,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
||||||
if (m_children.size() != 3)
|
if (m_children.size() != 3)
|
||||||
throw std::logic_error("ClassBasedCrossEntropyWithSoftmaxNode criterion requires three inputs.");
|
throw std::logic_error("ClassBasedCrossEntropyWithSoftmaxNode criterion requires three inputs.");
|
||||||
|
|
||||||
if (Inputs(0)->OperationName() != L"SparseInputValue"
|
if (Inputs(0)->OperationName() != SparseInputValue<ElemType>::TypeName()
|
||||||
&& Inputs(0)->OperationName() != L"InputValue")
|
&& Inputs(0)->OperationName() != InputValue<ElemType>::TypeName())
|
||||||
throw std::logic_error("ClassBasedCrossEntropyWithSoftmaxNode criterion requires the first input to be the label.");
|
throw std::logic_error("ClassBasedCrossEntropyWithSoftmaxNode criterion requires the first input to be the label.");
|
||||||
|
|
||||||
if (!(Inputs(1)->FunctionValues().GetNumRows() == Inputs(2)->FunctionValues().GetNumCols() && // input and matrix can be timed
|
if (!(Inputs(1)->FunctionValues().GetNumRows() == Inputs(2)->FunctionValues().GetNumCols() && // input and matrix can be timed
|
||||||
|
|
|
@ -141,7 +141,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
||||||
|
|
||||||
public:
|
public:
|
||||||
const ElemType* NzValues() const { return m_pArray; }
|
const ElemType* NzValues() const { return m_pArray; }
|
||||||
ElemType* NzValues() { return m_pArray; }
|
inline ElemType* NzValues() { return m_pArray; }
|
||||||
size_t NzSize() const { return sizeof(ElemType)*m_nz; } // actual number of element bytes in use
|
size_t NzSize() const { return sizeof(ElemType)*m_nz; } // actual number of element bytes in use
|
||||||
|
|
||||||
CPUSPARSE_INDEX_TYPE* MajorIndexLocation() const { return m_unCompIndex; } //this is the major index, row/col ids in CSC/CSR format
|
CPUSPARSE_INDEX_TYPE* MajorIndexLocation() const { return m_unCompIndex; } //this is the major index, row/col ids in CSC/CSR format
|
||||||
|
|
|
@ -2290,11 +2290,11 @@ __global__ void _denseMultSparseCSCAndWeightedAddToDense(
|
||||||
//assume resultValues are 0-initialized
|
//assume resultValues are 0-initialized
|
||||||
template<class ElemType>
|
template<class ElemType>
|
||||||
__global__ void _denseMulSparseCSCTransposeToSparseBlockCol(
|
__global__ void _denseMulSparseCSCTransposeToSparseBlockCol(
|
||||||
ElemType alpha,
|
const ElemType alpha,
|
||||||
ElemType* lhsValues,
|
const ElemType* lhsValues,
|
||||||
size_t numRowsLhs,
|
const size_t numRowsLhs,
|
||||||
size_t numColsRhs,
|
const size_t numColsRhs,
|
||||||
ElemType* rhsNZValues,
|
const ElemType* rhsNZValues,
|
||||||
const GPUSPARSE_INDEX_TYPE* rhsRows,
|
const GPUSPARSE_INDEX_TYPE* rhsRows,
|
||||||
const GPUSPARSE_INDEX_TYPE* rhsCols,
|
const GPUSPARSE_INDEX_TYPE* rhsCols,
|
||||||
const size_t* rhsRowIdx,
|
const size_t* rhsRowIdx,
|
||||||
|
|
|
@ -79,7 +79,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
||||||
m_matrixName=nullptr;
|
m_matrixName=nullptr;
|
||||||
|
|
||||||
m_blockSize = 0;
|
m_blockSize = 0;
|
||||||
m_blockVal = nullptr;
|
|
||||||
m_blockIds = nullptr;
|
m_blockIds = nullptr;
|
||||||
|
|
||||||
m_expandedSize = 0;
|
m_expandedSize = 0;
|
||||||
|
@ -241,7 +240,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
||||||
CopyBuffer(cpuSparseMatrix.ColLocation(), h_Col, MajorIndexCount());
|
CopyBuffer(cpuSparseMatrix.ColLocation(), h_Col, MajorIndexCount());
|
||||||
}
|
}
|
||||||
|
|
||||||
CUDACALL(cudaMemcpy(cpuSparseMatrix.BufferPointer(), NzValues(), NzSize(), cudaMemcpyDeviceToHost));
|
CUDACALL(cudaMemcpy(cpuSparseMatrix.NzValues(), NzValues(), NzSize(), cudaMemcpyDeviceToHost));
|
||||||
|
|
||||||
}
|
}
|
||||||
else if (this->GetFormat() == matrixFormatSparseCSC)
|
else if (this->GetFormat() == matrixFormatSparseCSC)
|
||||||
|
@ -267,7 +266,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
||||||
CopyBuffer(cpuSparseMatrix.RowLocation(), h_Row, MajorIndexCount());
|
CopyBuffer(cpuSparseMatrix.RowLocation(), h_Row, MajorIndexCount());
|
||||||
}
|
}
|
||||||
|
|
||||||
CUDACALL(cudaMemcpy(cpuSparseMatrix.BufferPointer(), NzValues(), NzSize(), cudaMemcpyDeviceToHost));
|
CUDACALL(cudaMemcpy(cpuSparseMatrix.NzValues(), NzValues(), NzSize(), cudaMemcpyDeviceToHost));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
NOT_IMPLEMENTED;
|
NOT_IMPLEMENTED;
|
||||||
|
@ -571,7 +570,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
||||||
m_matrixName=moveFrom.m_matrixName;
|
m_matrixName=moveFrom.m_matrixName;
|
||||||
|
|
||||||
m_blockSize = moveFrom.m_blockSize;
|
m_blockSize = moveFrom.m_blockSize;
|
||||||
m_blockVal = moveFrom.m_blockVal;
|
|
||||||
m_blockIds = moveFrom.m_blockIds;
|
m_blockIds = moveFrom.m_blockIds;
|
||||||
|
|
||||||
m_expandedSize = moveFrom.m_expandedSize;
|
m_expandedSize = moveFrom.m_expandedSize;
|
||||||
|
@ -602,7 +600,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
||||||
m_matrixName=moveFrom.m_matrixName;
|
m_matrixName=moveFrom.m_matrixName;
|
||||||
|
|
||||||
m_blockSize = moveFrom.m_blockSize;
|
m_blockSize = moveFrom.m_blockSize;
|
||||||
m_blockVal = moveFrom.m_blockVal;
|
|
||||||
m_blockIds = moveFrom.m_blockIds;
|
m_blockIds = moveFrom.m_blockIds;
|
||||||
|
|
||||||
m_expandedSize = moveFrom.m_expandedSize;
|
m_expandedSize = moveFrom.m_expandedSize;
|
||||||
|
@ -636,8 +633,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
||||||
if(m_pArray != nullptr)
|
if(m_pArray != nullptr)
|
||||||
CUDACALL(cudaFree(m_pArray));
|
CUDACALL(cudaFree(m_pArray));
|
||||||
|
|
||||||
if(m_blockVal != nullptr)
|
|
||||||
CUDACALL(cudaFree(m_blockVal));
|
|
||||||
if(m_blockIds != nullptr)
|
if(m_blockIds != nullptr)
|
||||||
CUDACALL(cudaFree(m_blockIds));
|
CUDACALL(cudaFree(m_blockIds));
|
||||||
if (m_rowToId != nullptr)
|
if (m_rowToId != nullptr)
|
||||||
|
@ -669,22 +664,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
||||||
//-------------------------------------------------------------------------
|
//-------------------------------------------------------------------------
|
||||||
// Start of new GPU Sparse Matrix code
|
// Start of new GPU Sparse Matrix code
|
||||||
//-------------------------------------------------------------------------
|
//-------------------------------------------------------------------------
|
||||||
|
|
||||||
template<class ElemType>
|
|
||||||
ElemType* GPUSparseMatrix<ElemType>::BufferPointer() const
|
|
||||||
{
|
|
||||||
if(m_format == matrixFormatSparseCSC || m_format == matrixFormatSparseCSR)
|
|
||||||
{
|
|
||||||
return m_pArray;
|
|
||||||
}
|
|
||||||
else if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
|
|
||||||
{
|
|
||||||
return m_blockVal;
|
|
||||||
}
|
|
||||||
else
|
|
||||||
NOT_IMPLEMENTED;
|
|
||||||
}
|
|
||||||
|
|
||||||
template<class ElemType>
|
template<class ElemType>
|
||||||
void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const bool growOnly)
|
void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const bool growOnly)
|
||||||
{
|
{
|
||||||
|
@ -728,13 +707,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
||||||
m_totalBufferSizeAllocated = bufferSizeNeeded;
|
m_totalBufferSizeAllocated = bufferSizeNeeded;
|
||||||
m_elemSizeAllocated = numNZElemToReserve;
|
m_elemSizeAllocated = numNZElemToReserve;
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
m_elemSizeAllocated = ElemCountFromBufferSize();
|
||||||
|
}
|
||||||
}
|
}
|
||||||
else if (matrixFormat == MatrixFormat::matrixFormatSparseBlockCol || matrixFormat == MatrixFormat::matrixFormatSparseBlockRow)
|
else if (matrixFormat == MatrixFormat::matrixFormatSparseBlockCol || matrixFormat == MatrixFormat::matrixFormatSparseBlockRow)
|
||||||
{
|
{
|
||||||
if (m_elemSizeAllocated < numNZElemToReserve || (m_elemSizeAllocated > numNZElemToReserve && !growOnly))
|
if (m_elemSizeAllocated < numNZElemToReserve || (m_elemSizeAllocated > numNZElemToReserve && !growOnly))
|
||||||
{
|
{
|
||||||
if (m_blockVal != nullptr)
|
if (m_pArray != nullptr)
|
||||||
CUDACALL(cudaFree(m_blockVal));
|
CUDACALL(cudaFree(m_pArray));
|
||||||
if (m_blockIds != nullptr)
|
if (m_blockIds != nullptr)
|
||||||
CUDACALL(cudaFree(m_blockIds));
|
CUDACALL(cudaFree(m_blockIds));
|
||||||
if (m_block2UniqId != nullptr)
|
if (m_block2UniqId != nullptr)
|
||||||
|
@ -742,7 +725,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
||||||
|
|
||||||
PrepareDevice();
|
PrepareDevice();
|
||||||
size_t newCompIndexSize = max(numRows, numCols) + 1;
|
size_t newCompIndexSize = max(numRows, numCols) + 1;
|
||||||
CUDACALL(cudaMalloc((void **)&m_blockVal, sizeof(ElemType)*numNZElemToReserve));
|
CUDACALL(cudaMalloc((void **)&m_pArray, sizeof(ElemType)*numNZElemToReserve));
|
||||||
CUDACALL(cudaMalloc((void **)&m_blockIds, sizeof(size_t)*newCompIndexSize));
|
CUDACALL(cudaMalloc((void **)&m_blockIds, sizeof(size_t)*newCompIndexSize));
|
||||||
CUDACALL(cudaMalloc((void **)&m_block2UniqId, sizeof(size_t)*newCompIndexSize));
|
CUDACALL(cudaMalloc((void **)&m_block2UniqId, sizeof(size_t)*newCompIndexSize));
|
||||||
|
|
||||||
|
@ -997,7 +980,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
||||||
c.m_blockSize = rhs.m_blockSize;
|
c.m_blockSize = rhs.m_blockSize;
|
||||||
c.m_nz = m*c.m_blockSize;
|
c.m_nz = m*c.m_blockSize;
|
||||||
c.Resize(m, n, c.m_nz);
|
c.Resize(m, n, c.m_nz);
|
||||||
CUDACALL(cudaMemset(c.m_blockVal, 0, sizeof(ElemType)*(c.m_nz)));
|
CUDACALL(cudaMemset(c.NzValues(), 0, sizeof(ElemType)*(c.m_nz)));
|
||||||
CUDACALL(cudaMemset(c.m_blockIds, 0, sizeof(size_t)*(c.m_blockSize)));
|
CUDACALL(cudaMemset(c.m_blockIds, 0, sizeof(size_t)*(c.m_blockSize)));
|
||||||
|
|
||||||
LONG64 N = (LONG64)lhs.GetNumElements(); //here we process for each row in lhs and each column in rhs (==columns in lhs)
|
LONG64 N = (LONG64)lhs.GetNumElements(); //here we process for each row in lhs and each column in rhs (==columns in lhs)
|
||||||
|
@ -1009,11 +992,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
||||||
lhs.BufferPointer(),
|
lhs.BufferPointer(),
|
||||||
m,
|
m,
|
||||||
l,
|
l,
|
||||||
rhs.BufferPointer(),
|
rhs.NzValues(),
|
||||||
rhs.RowLocation(),
|
rhs.RowLocation(),
|
||||||
rhs.ColLocation(),
|
rhs.ColLocation(),
|
||||||
rhs.m_rowToId,
|
rhs.m_rowToId,
|
||||||
c.BufferPointer(),
|
c.NzValues(),
|
||||||
c.m_blockIds);
|
c.m_blockIds);
|
||||||
|
|
||||||
if (do_sync) CUDACALL(cudaEventRecord(done));
|
if (do_sync) CUDACALL(cudaEventRecord(done));
|
||||||
|
@ -1054,7 +1037,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
||||||
lhs.GetNumRows(),
|
lhs.GetNumRows(),
|
||||||
lhs.GetNumCols(),
|
lhs.GetNumCols(),
|
||||||
lhs.m_blockSize,
|
lhs.m_blockSize,
|
||||||
lhs.m_blockVal,
|
lhs.NzValues(),
|
||||||
lhs.m_blockIds,
|
lhs.m_blockIds,
|
||||||
rhs.BufferPointer());
|
rhs.BufferPointer());
|
||||||
|
|
||||||
|
@ -1115,7 +1098,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
||||||
label.m_block2Id,
|
label.m_block2Id,
|
||||||
cls.BufferPointer(),
|
cls.BufferPointer(),
|
||||||
idx2cls.BufferPointer(),
|
idx2cls.BufferPointer(),
|
||||||
etp.m_pArray,
|
etp.NzValues(),
|
||||||
etp.MajorIndexLocation(),
|
etp.MajorIndexLocation(),
|
||||||
etp.SecondaryIndexLocation());
|
etp.SecondaryIndexLocation());
|
||||||
|
|
||||||
|
@ -1195,7 +1178,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
||||||
}
|
}
|
||||||
grd.m_blockSize = label.m_blockSize;
|
grd.m_blockSize = label.m_blockSize;
|
||||||
grd.m_nz = nz;
|
grd.m_nz = nz;
|
||||||
CUDACALL(cudaMemset(grd.m_blockVal,0,sizeof(ElemType)*(grd.m_nz)));
|
CUDACALL(cudaMemset(grd.BufferPointer(),0,sizeof(ElemType)*(grd.m_nz)));
|
||||||
CUDACALL(cudaMemset(grd.m_blockIds,0,sizeof(size_t)*(grd.m_blockSize)));
|
CUDACALL(cudaMemset(grd.m_blockIds,0,sizeof(size_t)*(grd.m_blockSize)));
|
||||||
|
|
||||||
cudaEvent_t done = nullptr;
|
cudaEvent_t done = nullptr;
|
||||||
|
@ -1214,7 +1197,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
||||||
idx2cls.BufferPointer(),
|
idx2cls.BufferPointer(),
|
||||||
input.BufferPointer(),
|
input.BufferPointer(),
|
||||||
input.GetNumRows(),
|
input.GetNumRows(),
|
||||||
grd.m_blockVal,
|
grd.BufferPointer(),
|
||||||
grd.m_blockIds);
|
grd.m_blockIds);
|
||||||
if (do_sync) CUDACALL(cudaEventRecord(done));
|
if (do_sync) CUDACALL(cudaEventRecord(done));
|
||||||
if (do_sync) CUDACALL(cudaEventSynchronize(done));
|
if (do_sync) CUDACALL(cudaEventSynchronize(done));
|
||||||
|
@ -1232,8 +1215,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
||||||
cudaEvent_t done = nullptr;
|
cudaEvent_t done = nullptr;
|
||||||
if (do_sync) CUDACALL(cudaEventCreate(&done));
|
if (do_sync) CUDACALL(cudaEventCreate(&done));
|
||||||
ElemType * values = NzValues();
|
ElemType * values = NzValues();
|
||||||
if (m_format == matrixFormatSparseBlockCol || m_format == matrixFormatSparseBlockRow)
|
|
||||||
values = m_blockVal;
|
|
||||||
_inplaceTruncate<ElemType><<<blocksPerGrid,threadsPerBlock>>>(values,threshold,N);
|
_inplaceTruncate<ElemType><<<blocksPerGrid,threadsPerBlock>>>(values,threshold,N);
|
||||||
if (do_sync) CUDACALL(cudaEventRecord(done));
|
if (do_sync) CUDACALL(cudaEventRecord(done));
|
||||||
if (do_sync) CUDACALL(cudaEventSynchronize(done));
|
if (do_sync) CUDACALL(cudaEventSynchronize(done));
|
||||||
|
@ -1270,7 +1251,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
||||||
GetNumRows(),
|
GetNumRows(),
|
||||||
GetNumCols(),
|
GetNumCols(),
|
||||||
m_blockSize,
|
m_blockSize,
|
||||||
BufferPointer(),
|
NzValues(),
|
||||||
m_blockIds,
|
m_blockIds,
|
||||||
c.BufferPointer());
|
c.BufferPointer());
|
||||||
|
|
||||||
|
|
|
@ -49,9 +49,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
||||||
// in memory format is always in the following order:
|
// in memory format is always in the following order:
|
||||||
// Non-zero data elements, Full index locations, compressed index locations
|
// Non-zero data elements, Full index locations, compressed index locations
|
||||||
// In CSR row data is compressed, in CSC col data is compressed
|
// In CSR row data is compressed, in CSC col data is compressed
|
||||||
const ElemType* NzValues() const {return m_pArray;}
|
inline const ElemType* NzValues() const {return m_pArray;}
|
||||||
ElemType* NzValues() {return m_pArray;}
|
inline ElemType* NzValues() {return m_pArray;}
|
||||||
size_t NzSize() const {return sizeof(ElemType)*m_nz;} // actual number of element bytes in use
|
inline size_t NzSize() const {return sizeof(ElemType)*m_nz;} // actual number of element bytes in use
|
||||||
|
|
||||||
GPUSPARSE_INDEX_TYPE* MajorIndexLocation() const { return (GPUSPARSE_INDEX_TYPE*)(m_pArray + m_elemSizeAllocated); } //this is the major index, row/col ids in CSC/CSR format
|
GPUSPARSE_INDEX_TYPE* MajorIndexLocation() const { return (GPUSPARSE_INDEX_TYPE*)(m_pArray + m_elemSizeAllocated); } //this is the major index, row/col ids in CSC/CSR format
|
||||||
size_t MajorIndexCount() const { return m_nz; }
|
size_t MajorIndexCount() const { return m_nz; }
|
||||||
|
@ -82,8 +82,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
||||||
size_t BufferSizeNeeded(const size_t numNZ) const
|
size_t BufferSizeNeeded(const size_t numNZ) const
|
||||||
{ return sizeof(ElemType)*numNZ + sizeof(GPUSPARSE_INDEX_TYPE)*(numNZ + SecondaryIndexCount(numNZ)); }
|
{ return sizeof(ElemType)*numNZ + sizeof(GPUSPARSE_INDEX_TYPE)*(numNZ + SecondaryIndexCount(numNZ)); }
|
||||||
|
|
||||||
size_t BufferSizeAllocated() const { return m_totalBufferSizeAllocated; }
|
inline size_t BufferSizeAllocated() const { return m_totalBufferSizeAllocated; }
|
||||||
ElemType* BufferPointer() const;
|
inline ElemType* BufferPointer() const { return m_pArray; }
|
||||||
|
|
||||||
// the column and row locations will swap based on what format we are in. Full index always follows the data array
|
// the column and row locations will swap based on what format we are in. Full index always follows the data array
|
||||||
GPUSPARSE_INDEX_TYPE* RowLocation() const { return (m_format&matrixFormatRowMajor) ? SecondaryIndexLocation() : MajorIndexLocation(); }
|
GPUSPARSE_INDEX_TYPE* RowLocation() const { return (m_format&matrixFormatRowMajor) ? SecondaryIndexLocation() : MajorIndexLocation(); }
|
||||||
|
@ -125,7 +125,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
||||||
bool IsEqualTo(const GPUMatrix<ElemType>& a, const ElemType threshold = 1e-8) const;
|
bool IsEqualTo(const GPUMatrix<ElemType>& a, const ElemType threshold = 1e-8) const;
|
||||||
public:
|
public:
|
||||||
virtual DEVICEID_TYPE GetComputeDeviceId(void) const;
|
virtual DEVICEID_TYPE GetComputeDeviceId(void) const;
|
||||||
size_t GetNumNZElements() const {return m_nz;}
|
inline size_t GetNumNZElements() const {return m_nz;}
|
||||||
|
|
||||||
//Sets sparse matrix in CSR format. this acts as deep copy
|
//Sets sparse matrix in CSR format. this acts as deep copy
|
||||||
void SetMatrixFromCSRFormat(const GPUSPARSE_INDEX_TYPE *h_CSRRow, const GPUSPARSE_INDEX_TYPE *h_Col, const ElemType *h_Val,
|
void SetMatrixFromCSRFormat(const GPUSPARSE_INDEX_TYPE *h_CSRRow, const GPUSPARSE_INDEX_TYPE *h_Col, const ElemType *h_Val,
|
||||||
|
@ -249,7 +249,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
|
||||||
size_t m_totalBufferSizeAllocated;
|
size_t m_totalBufferSizeAllocated;
|
||||||
|
|
||||||
size_t m_blockSize; //block size
|
size_t m_blockSize; //block size
|
||||||
ElemType *m_blockVal; //block values
|
|
||||||
size_t *m_blockIds; //block ids
|
size_t *m_blockIds; //block ids
|
||||||
size_t *m_rowToId; //the id showing the order row number is observed in the nnz values.
|
size_t *m_rowToId; //the id showing the order row number is observed in the nnz values.
|
||||||
|
|
||||||
|
|
Loading…
Reference in new issue