diff --git a/MachineLearning/cn/TrainingCriterionNode.h b/MachineLearning/cn/TrainingCriterionNode.h
index 16709ec71..39ca907b9 100644
--- a/MachineLearning/cn/TrainingCriterionNode.h
+++ b/MachineLearning/cn/TrainingCriterionNode.h
@@ -1143,8 +1143,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             if (m_children.size() != 3)
                 throw std::logic_error("ClassBasedCrossEntropyWithSoftmaxNode criterion requires three inputs.");
 
-            if (Inputs(0)->OperationName() != L"SparseInputValue"
-                && Inputs(0)->OperationName() != L"InputValue")
+            if (Inputs(0)->OperationName() != SparseInputValue<ElemType>::TypeName()
+                && Inputs(0)->OperationName() != InputValue<ElemType>::TypeName())
                 throw std::logic_error("ClassBasedCrossEntropyWithSoftmaxNode criterion requires the first input to be the label.");
 
             if (!(Inputs(1)->FunctionValues().GetNumRows() == Inputs(2)->FunctionValues().GetNumCols() &&  // input and matrix can be timed
diff --git a/Math/Math/CPUSparseMatrix.h b/Math/Math/CPUSparseMatrix.h
index 56831159a..aac886ec1 100644
--- a/Math/Math/CPUSparseMatrix.h
+++ b/Math/Math/CPUSparseMatrix.h
@@ -141,7 +141,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 
     public:
         const ElemType* NzValues() const { return m_pArray; }
-        ElemType* NzValues() { return m_pArray; }
+        inline ElemType* NzValues() { return m_pArray; }
         size_t NzSize() const { return sizeof(ElemType)*m_nz; } // actual number of element bytes in use
 
         CPUSPARSE_INDEX_TYPE* MajorIndexLocation() const { return m_unCompIndex; } //this is the major index, row/col ids in CSC/CSR format
diff --git a/Math/Math/GPUMatrixCUDAKernels.cu b/Math/Math/GPUMatrixCUDAKernels.cu
index 7351819fb..9030c7c15 100644
--- a/Math/Math/GPUMatrixCUDAKernels.cu
+++ b/Math/Math/GPUMatrixCUDAKernels.cu
@@ -2290,11 +2290,11 @@ __global__ void _denseMultSparseCSCAndWeightedAddToDense(
 //assume resultValues are 0-initialized
 template<class ElemType>
 __global__ void _denseMulSparseCSCTransposeToSparseBlockCol(
-    ElemType alpha,
-    ElemType* lhsValues,
-    size_t numRowsLhs,
-    size_t numColsRhs,
-    ElemType* rhsNZValues,
+    const ElemType alpha,
+    const ElemType* lhsValues,
+    const size_t numRowsLhs,
+    const size_t numColsRhs,
+    const ElemType* rhsNZValues,
     const GPUSPARSE_INDEX_TYPE* rhsRows,
     const GPUSPARSE_INDEX_TYPE* rhsCols,
     const size_t* rhsRowIdx,
diff --git a/Math/Math/GPUSparseMatrix.cu b/Math/Math/GPUSparseMatrix.cu
index 970e4ca35..74fda4f6b 100644
--- a/Math/Math/GPUSparseMatrix.cu
+++ b/Math/Math/GPUSparseMatrix.cu
@@ -79,7 +79,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         m_matrixName=nullptr;
 
         m_blockSize = 0;
-        m_blockVal = nullptr;
         m_blockIds = nullptr;
 
         m_expandedSize = 0;
@@ -241,7 +240,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                 CopyBuffer(cpuSparseMatrix.ColLocation(), h_Col, MajorIndexCount());
             }
 
-            CUDACALL(cudaMemcpy(cpuSparseMatrix.BufferPointer(), NzValues(), NzSize(), cudaMemcpyDeviceToHost));
+            CUDACALL(cudaMemcpy(cpuSparseMatrix.NzValues(), NzValues(), NzSize(), cudaMemcpyDeviceToHost));
         }
         else if (this->GetFormat() == matrixFormatSparseCSC)
         {
@@ -267,7 +266,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                 CopyBuffer(cpuSparseMatrix.RowLocation(), h_Row, MajorIndexCount());
             }
 
-            CUDACALL(cudaMemcpy(cpuSparseMatrix.BufferPointer(), NzValues(), NzSize(), cudaMemcpyDeviceToHost));
+            CUDACALL(cudaMemcpy(cpuSparseMatrix.NzValues(), NzValues(), NzSize(), cudaMemcpyDeviceToHost));
         }
         else
             NOT_IMPLEMENTED;
@@ -571,7 +570,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         m_matrixName=moveFrom.m_matrixName;
 
         m_blockSize = moveFrom.m_blockSize;
-        m_blockVal = moveFrom.m_blockVal;
         m_blockIds = moveFrom.m_blockIds;
 
         m_expandedSize = moveFrom.m_expandedSize;
@@ -602,7 +600,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             m_matrixName=moveFrom.m_matrixName;
 
             m_blockSize = moveFrom.m_blockSize;
-            m_blockVal = moveFrom.m_blockVal;
             m_blockIds = moveFrom.m_blockIds;
 
             m_expandedSize = moveFrom.m_expandedSize;
@@ -636,8 +633,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             if(m_pArray != nullptr)
                 CUDACALL(cudaFree(m_pArray));
-            if(m_blockVal != nullptr)
-                CUDACALL(cudaFree(m_blockVal));
             if(m_blockIds != nullptr)
                 CUDACALL(cudaFree(m_blockIds));
             if (m_rowToId != nullptr)
@@ -669,22 +664,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     //-------------------------------------------------------------------------
     // Start of new GPU Sparse Matrix code
     //-------------------------------------------------------------------------
-
-        template<class ElemType>
-        ElemType* GPUSparseMatrix<ElemType>::BufferPointer() const
-        {
-            if(m_format == matrixFormatSparseCSC || m_format == matrixFormatSparseCSR)
-            {
-                return m_pArray;
-            }
-            else if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
-            {
-                return m_blockVal;
-            }
-            else
-                NOT_IMPLEMENTED;
-        }
-
         template<class ElemType>
         void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const bool growOnly)
         {
@@ -728,13 +707,17 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                     m_totalBufferSizeAllocated = bufferSizeNeeded;
                     m_elemSizeAllocated = numNZElemToReserve;
                 }
+                else
+                {
+                    m_elemSizeAllocated = ElemCountFromBufferSize();
+                }
             }
             else if (matrixFormat == MatrixFormat::matrixFormatSparseBlockCol || matrixFormat == MatrixFormat::matrixFormatSparseBlockRow)
             {
                 if (m_elemSizeAllocated < numNZElemToReserve || (m_elemSizeAllocated > numNZElemToReserve && !growOnly))
                 {
-                    if (m_blockVal != nullptr)
-                        CUDACALL(cudaFree(m_blockVal));
+                    if (m_pArray != nullptr)
+                        CUDACALL(cudaFree(m_pArray));
                     if (m_blockIds != nullptr)
                         CUDACALL(cudaFree(m_blockIds));
                     if (m_block2UniqId != nullptr)
@@ -742,7 +725,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                     PrepareDevice();
                     size_t newCompIndexSize = max(numRows, numCols) + 1;
 
-                    CUDACALL(cudaMalloc((void **)&m_blockVal, sizeof(ElemType)*numNZElemToReserve));
+                    CUDACALL(cudaMalloc((void **)&m_pArray, sizeof(ElemType)*numNZElemToReserve));
                     CUDACALL(cudaMalloc((void **)&m_blockIds, sizeof(size_t)*newCompIndexSize));
                     CUDACALL(cudaMalloc((void **)&m_block2UniqId, sizeof(size_t)*newCompIndexSize));
@@ -997,7 +980,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             c.m_blockSize = rhs.m_blockSize;
             c.m_nz = m*c.m_blockSize;
             c.Resize(m, n, c.m_nz);
-            CUDACALL(cudaMemset(c.m_blockVal, 0, sizeof(ElemType)*(c.m_nz)));
+            CUDACALL(cudaMemset(c.NzValues(), 0, sizeof(ElemType)*(c.m_nz)));
             CUDACALL(cudaMemset(c.m_blockIds, 0, sizeof(size_t)*(c.m_blockSize)));
 
             LONG64 N = (LONG64)lhs.GetNumElements();  //here we process for each row in lhs and each column in rhs (==columns in lhs)
@@ -1009,11 +992,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                 lhs.BufferPointer(),
                 m,
                 l,
-                rhs.BufferPointer(),
+                rhs.NzValues(),
                 rhs.RowLocation(),
                 rhs.ColLocation(),
                 rhs.m_rowToId,
-                c.BufferPointer(),
+                c.NzValues(),
                 c.m_blockIds);
 
             if (do_sync)    CUDACALL(cudaEventRecord(done));
@@ -1054,7 +1037,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                 lhs.GetNumRows(),
                 lhs.GetNumCols(),
                 lhs.m_blockSize,
-                lhs.m_blockVal,
+                lhs.NzValues(),
                 lhs.m_blockIds,
                 rhs.BufferPointer());
 
@@ -1115,7 +1098,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                 label.m_block2Id,
                 cls.BufferPointer(),
                 idx2cls.BufferPointer(),
-                etp.m_pArray,
+                etp.NzValues(),
                 etp.MajorIndexLocation(),
                 etp.SecondaryIndexLocation());
 
@@ -1195,7 +1178,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             }
             grd.m_blockSize = label.m_blockSize;
             grd.m_nz = nz;
-            CUDACALL(cudaMemset(grd.m_blockVal,0,sizeof(ElemType)*(grd.m_nz)));
+            CUDACALL(cudaMemset(grd.BufferPointer(),0,sizeof(ElemType)*(grd.m_nz)));
             CUDACALL(cudaMemset(grd.m_blockIds,0,sizeof(size_t)*(grd.m_blockSize)));
 
             cudaEvent_t done = nullptr;
@@ -1214,7 +1197,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                 idx2cls.BufferPointer(),
                 input.BufferPointer(),
                 input.GetNumRows(),
-                grd.m_blockVal,
+                grd.BufferPointer(),
                 grd.m_blockIds);
             if (do_sync)    CUDACALL(cudaEventRecord(done));
             if (do_sync)    CUDACALL(cudaEventSynchronize(done));
@@ -1232,8 +1215,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
             cudaEvent_t done = nullptr;
             if (do_sync)    CUDACALL(cudaEventCreate(&done));
             ElemType * values = NzValues();
-            if (m_format == matrixFormatSparseBlockCol || m_format == matrixFormatSparseBlockRow)
-                values = m_blockVal;
             _inplaceTruncate<<<blocksPerGrid,threadsPerBlock>>>(values,threshold,N);
             if (do_sync)    CUDACALL(cudaEventRecord(done));
             if (do_sync)    CUDACALL(cudaEventSynchronize(done));
@@ -1270,7 +1251,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                 GetNumRows(),
                 GetNumCols(),
                 m_blockSize,
-                BufferPointer(),
+                NzValues(),
                 m_blockIds,
                 c.BufferPointer());
 
diff --git a/Math/Math/GPUSparseMatrix.h b/Math/Math/GPUSparseMatrix.h
index d7b6a2269..b68ef9783 100644
--- a/Math/Math/GPUSparseMatrix.h
+++ b/Math/Math/GPUSparseMatrix.h
@@ -49,9 +49,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         // in memory format is always in the following order:
         // Non-zero data elements, Full index locations, compressed index locations
         // In CSR row data is compressed, in CSC col data is compressed
-        const ElemType* NzValues() const {return m_pArray;}
-        ElemType* NzValues() {return m_pArray;}
-        size_t NzSize() const {return sizeof(ElemType)*m_nz;} // actual number of element bytes in use
+        inline const ElemType* NzValues() const {return m_pArray;}
+        inline ElemType* NzValues() {return m_pArray;}
+        inline size_t NzSize() const {return sizeof(ElemType)*m_nz;} // actual number of element bytes in use
 
         GPUSPARSE_INDEX_TYPE* MajorIndexLocation() const { return (GPUSPARSE_INDEX_TYPE*)(m_pArray + m_elemSizeAllocated); } //this is the major index, row/col ids in CSC/CSR format
         size_t MajorIndexCount() const { return m_nz; }
@@ -82,8 +82,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         size_t BufferSizeNeeded(const size_t numNZ) const
         { return sizeof(ElemType)*numNZ + sizeof(GPUSPARSE_INDEX_TYPE)*(numNZ + SecondaryIndexCount(numNZ)); }
 
-        size_t BufferSizeAllocated() const { return m_totalBufferSizeAllocated; }
-        ElemType* BufferPointer() const;
+        inline size_t BufferSizeAllocated() const { return m_totalBufferSizeAllocated; }
+        inline ElemType* BufferPointer() const { return m_pArray; }
 
         // the column and row locations will swap based on what format we are in. Full index always follows the data array
         GPUSPARSE_INDEX_TYPE* RowLocation() const { return (m_format&matrixFormatRowMajor) ? SecondaryIndexLocation() : MajorIndexLocation(); }
@@ -125,7 +125,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         bool IsEqualTo(const GPUMatrix<ElemType>& a, const ElemType threshold = 1e-8) const;
     public:
         virtual DEVICEID_TYPE GetComputeDeviceId(void) const;
-        size_t GetNumNZElements() const {return m_nz;}
+        inline size_t GetNumNZElements() const {return m_nz;}
 
         //Sets sparse matrix in CSR format. this acts as deep copy
         void SetMatrixFromCSRFormat(const GPUSPARSE_INDEX_TYPE *h_CSRRow, const GPUSPARSE_INDEX_TYPE *h_Col, const ElemType *h_Val,
@@ -249,7 +249,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         size_t m_totalBufferSizeAllocated;
 
         size_t m_blockSize; //block size
-        ElemType *m_blockVal; //block values
        size_t *m_blockIds; //block ids
        size_t *m_rowToId; //the id showing the order row number is observed in the nnz values.
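
The net effect of this change: block-sparse matrices no longer keep their values in a separate m_blockVal buffer. Every format stores its non-zero values in m_pArray, so BufferPointer() and NzValues() reduce to trivial inline accessors, and call sites that previously branched on the format (for example the m_blockVal fallback in InplaceTruncate) can use NzValues() unconditionally. A minimal sketch of the resulting accessor pattern follows; ToySparseMatrix is an illustrative stand-in for GPUSparseMatrix<ElemType>, not the real interface.

    // Sketch only: one value buffer backs CSR/CSC and the block-sparse
    // formats alike, so the accessors no longer branch on the format.
    #include <cstddef>

    template <class ElemType>
    class ToySparseMatrix            // illustrative stand-in, not the real class
    {
    public:
        inline ElemType*       BufferPointer() const { return m_pArray; }              // raw value buffer
        inline const ElemType* NzValues() const      { return m_pArray; }              // same buffer for all formats
        inline ElemType*       NzValues()            { return m_pArray; }
        inline size_t          NzSize() const        { return sizeof(ElemType)*m_nz; } // bytes of values in use

    private:
        ElemType* m_pArray   = nullptr; // non-zero values (all sparse formats)
        size_t    m_nz       = 0;       // number of stored non-zeros
        size_t*   m_blockIds = nullptr; // block ids for block formats (unchanged here)
    };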