From 7793b7be939b79fc8bae13f32d951f8fd62c9bb8 Mon Sep 17 00:00:00 2001 From: Frank Seide Date: Tue, 1 Mar 2016 10:43:50 -0800 Subject: [PATCH] some clean-up of sparse matrices; disabled meaningless elementwise sparse ops such as log(); sparse matrice slices now keep track of their original matrix (not used yet); bug fix: ~SyncGuard() should not swallow errors since CUDA errors may get discovered here; bug fix: HTKMLFReader test 16 must use consistent precision --- Source/Math/CPUSparseMatrix.cpp | 216 +++++++++--------- Source/Math/CPUSparseMatrix.h | 34 +-- Source/Math/CommonMatrix.h | 41 +++- Source/Math/GPUMatrix.h | 27 ++- Source/Math/GPUSparseMatrix.cu | 209 +++++++++-------- Source/Math/GPUSparseMatrix.h | 45 ++-- Source/Math/Matrix.h | 2 +- .../HTKMLFReaderSimpleDataLoop16_Config.cntk | 2 +- 8 files changed, 323 insertions(+), 253 deletions(-) diff --git a/Source/Math/CPUSparseMatrix.cpp b/Source/Math/CPUSparseMatrix.cpp index f8d00c25a..627efb0f1 100644 --- a/Source/Math/CPUSparseMatrix.cpp +++ b/Source/Math/CPUSparseMatrix.cpp @@ -59,16 +59,13 @@ #define BLAS_COLMAJOR (int) MatrixOrder::ColMajor, #endif -#define SWAP(a, b) \ - { \ - (a) ^= (b); \ - (b) ^= (a); \ - (a) ^= (b); \ - } +// TODO: Move to CommonMatrix.h #define IDX2C(i, j, ld) (((j) * (ld)) + (i)) // 0 based indexing + namespace Microsoft { namespace MSR { namespace CNTK { #pragma region Helpful Enum Definitions + enum class MatrixOrder { RowMajor = 101, // row-major arrays @@ -77,56 +74,55 @@ enum class MatrixOrder enum class MatrixTranspose : char { - NoTrans = 'N', // trans='N' - Trans = 'T', // trans='T' - ConjTrans = 'C' // trans='C' + NoTrans = 'N', // trans='N' + Trans = 'T', // trans='T' + ConjTrans = 'C' // trans='C' }; enum class SymMatrixType : char { - Up = 'U', // symmetric matrix is stored in the upper part - Low = 'L', // symmetric matrix is stored in thelower part - Full = 'F', // full populated - NotSymmetric = 'N' // not a symmetric matrix + Up = 'U', // symmetric matrix is stored in the upper part + Low = 'L', // symmetric matrix is stored in thelower part + Full = 'F', // full populated + NotSymmetric = 'N' // not a symmetric matrix }; enum class MatrixOpSide : char { - Left = 'L', // left multiply + Left = 'L', // left multiply Right = 'R', // right multiply }; + #pragma endregion Helpful Enum Definitions #pragma region Constructors and Destructor -//should only be used by constructors. -template -void CPUSparseMatrix::ZeroInit() -{ - m_numRows = 0; - m_numCols = 0; - m_elemSizeAllocated = 0; - m_compIndexSize = 0; - m_externalBuffer = false; - m_computeDevice = CPUDEVICE; - m_nz = 0; - m_matrixName = NULL; +//------------------------------------------------------------------------- +// construction and conversion +//------------------------------------------------------------------------- +// should only be used by constructors. +template +/*private*/ void CPUSparseMatrix::ZeroInit() +{ + Base::ZeroInit(); + m_computeDevice = CPUDEVICE; + + m_sliceOf = nullptr; + m_compIndexSize = 0; // if(m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR) { - m_colIdx = -1; - m_pArray = NULL; - m_unCompIndex = NULL; - m_compIndex = NULL; + m_colIdx = -1; + m_unCompIndex = nullptr; + m_compIndex = nullptr; } // else if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow) { - m_blockSize = 0; + m_blockSize = 0; m_blockIdShift = 0; - m_pArray = NULL; - m_blockIds = NULL; + m_blockIds = nullptr; } - m_nzValues = NULL; + m_nzValues = nullptr; } //should only be used by constructors. @@ -144,19 +140,17 @@ void CPUSparseMatrix::CheckInit(const MatrixFormat format) template CPUSparseMatrix::CPUSparseMatrix(const MatrixFormat format) { - CheckInit(format); } template CPUSparseMatrix::CPUSparseMatrix(const MatrixFormat format, const size_t numRows, const size_t numCols, const size_t size) { - CheckInit(format); Resize(numRows, numCols, size, true, false); } -//copy constructor, deep copy +// copy constructor, deep copy template CPUSparseMatrix::CPUSparseMatrix(const CPUSparseMatrix& deepCopyFrom) { @@ -166,7 +160,7 @@ CPUSparseMatrix::CPUSparseMatrix(const CPUSparseMatrix& deep SetMatrixName(deepCopyFrom.m_matrixName); } -//assignment operator, deep copy +// assignment operator, deep copy template CPUSparseMatrix& CPUSparseMatrix::operator=(const CPUSparseMatrix& deepCopyFrom) { @@ -177,29 +171,23 @@ CPUSparseMatrix& CPUSparseMatrix::operator=(const CPUSparseM return *this; } -//move constructor, shallow copy +// move constructor, shallow copy template CPUSparseMatrix::CPUSparseMatrix(CPUSparseMatrix&& moveFrom) { - m_format = moveFrom.m_format; - m_numRows = moveFrom.m_numRows; - m_numCols = moveFrom.m_numCols; - m_elemSizeAllocated = moveFrom.m_elemSizeAllocated; + Base::ShallowCopyFrom(moveFrom); + // BUGBUG: This did not use to copy m_sliceViewOffset, I presume it should be copied? It is now. + m_compIndexSize = moveFrom.m_compIndexSize; - m_externalBuffer = moveFrom.m_externalBuffer; - m_computeDevice = moveFrom.m_computeDevice; - m_nz = moveFrom.m_nz; - m_matrixName = moveFrom.m_matrixName; - m_colIdx = moveFrom.m_colIdx; - m_pArray = moveFrom.m_pArray; - m_nzValues = moveFrom.m_nzValues; + m_colIdx = moveFrom.m_colIdx; + m_nzValues = moveFrom.m_nzValues; m_unCompIndex = moveFrom.m_unCompIndex; - m_compIndex = moveFrom.m_compIndex; + m_compIndex = moveFrom.m_compIndex; - m_blockSize = moveFrom.m_blockSize; + m_blockSize = moveFrom.m_blockSize; m_blockIdShift = moveFrom.m_blockIdShift; - m_blockIds = moveFrom.m_blockIds; + m_blockIds = moveFrom.m_blockIds; // release the pointer from the source object so that the destructor won't release it twice moveFrom.ZeroInit(); @@ -213,26 +201,19 @@ CPUSparseMatrix& CPUSparseMatrix::operator=(CPUSparseMatrix< { if (OwnBuffer()) ReleaseMemory(); // always delete the data pointer since we will use the pointer from moveFrom + Base::ShallowCopyFrom(moveFrom); + // BUGBUG: This did not use to copy m_sliceViewOffset, I presume it should be copied? It is now. - m_format = moveFrom.m_format; - m_numRows = moveFrom.m_numRows; - m_numCols = moveFrom.m_numCols; - m_elemSizeAllocated = moveFrom.m_elemSizeAllocated; m_compIndexSize = moveFrom.m_compIndexSize; - m_externalBuffer = moveFrom.m_externalBuffer; - m_computeDevice = moveFrom.m_computeDevice; - m_nz = moveFrom.m_nz; - m_matrixName = moveFrom.m_matrixName; - m_colIdx = moveFrom.m_colIdx; - m_pArray = moveFrom.m_pArray; - m_nzValues = moveFrom.m_nzValues; + m_colIdx = moveFrom.m_colIdx; + m_nzValues = moveFrom.m_nzValues; m_unCompIndex = moveFrom.m_unCompIndex; - m_compIndex = moveFrom.m_compIndex; + m_compIndex = moveFrom.m_compIndex; - m_blockSize = moveFrom.m_blockSize; + m_blockSize = moveFrom.m_blockSize; m_blockIdShift = moveFrom.m_blockIdShift; - m_blockIds = moveFrom.m_blockIds; + m_blockIds = moveFrom.m_blockIds; // release the pointer from the source object so that the destructor won't release it twice moveFrom.ZeroInit(); @@ -249,9 +230,8 @@ CPUSparseMatrix::~CPUSparseMatrix() template void CPUSparseMatrix::ReleaseMemory() { - // If m_externalBuffer is true then this matrix - // is simply a view over another matrix. In that - // case we shouldn't free anything. + // If m_externalBuffer is true then this matrix is simply a view over another matrix. + // In that case we shouldn't free anything. if (!m_externalBuffer) { delete[] m_matrixName; @@ -335,24 +315,26 @@ void CPUSparseMatrix::SetValue(const size_t row, const size_t col, con m_nz++; } -//make sure call order in colume wise for CSC and row wise for CSR +// make sure call order in colume wise for CSC and row wise for CSR template void CPUSparseMatrix::SetValue(const CPUSparseMatrix& v) { - if (!OwnBuffer()) + if (!OwnBuffer()) // TODO: GPU version allows to overwrite a view with a fresh non-view LogicError("Cannot modify since the buffer is managed externally."); - this->Reset(); - m_format = v.GetFormat(); + Reset(); + m_format = v.GetFormat(); + m_externalBuffer = false; + m_sliceOf = nullptr; - this->Resize(v.GetNumRows(), v.GetNumCols(), v.NzSize()); + Resize(v.GetNumRows(), v.GetNumCols(), v.NzSize()); m_nz = v.NzCount(); if (m_nz > 0) { - memcpy(this->NzValues(), v.NzValues(), v.NzSize()); - memcpy(this->RowLocation(), v.RowLocation(), v.RowSize()); - memcpy(this->ColLocation(), v.ColLocation(), v.ColSize()); + memcpy(NzValues(), v.NzValues(), v.NzSize()); + memcpy(RowLocation(), v.RowLocation(), v.RowSize()); + memcpy(ColLocation(), v.ColLocation(), v.ColSize()); } } @@ -402,17 +384,22 @@ CPUSparseMatrix CPUSparseMatrix::ColumnSlice(size_t startCol CPUSparseMatrix slice(m_format); slice.m_numRows = m_numRows; slice.m_numCols = numCols; + // BUGBUG: m_matrixName? + // BUGBUG: m_sliceViewOffset? + slice.m_externalBuffer = true; + slice.m_sliceOf = const_cast*>(this); // BUGBUG: ColumnSlice() returns a reference to a mutable matrix, even if itself is 'const'; should not be. if (m_format == MatrixFormat::matrixFormatSparseCSC) { - slice.m_pArray = m_pArray; - slice.m_nzValues = m_pArray + m_compIndex[startColumn]; // note: m_compIndex is always against m_pArray - slice.m_unCompIndex = m_unCompIndex; - slice.m_compIndex = m_compIndex + startColumn; // Just shift the compressed index location to the new startColumn - that's it! - slice.m_externalBuffer = true; - slice.m_nz = m_compIndex[startColumn + numCols] - m_compIndex[startColumn]; + slice.m_pArray = m_pArray; + + slice.m_nzValues = m_pArray + m_compIndex[startColumn]; // note: m_compIndex is always against m_pArray + slice.m_unCompIndex = m_unCompIndex; + slice.m_compIndex = m_compIndex + startColumn; // Just shift the compressed index location to the new startColumn - that's it! + slice.m_compIndexSize = numCols + 1; + + slice.m_nz = m_compIndex[startColumn + numCols] - m_compIndex[startColumn]; slice.m_elemSizeAllocated = slice.m_nz; - slice.m_compIndexSize = numCols + 1; } else if (m_format == MatrixFormat::matrixFormatSparseBlockCol) { @@ -446,13 +433,15 @@ CPUSparseMatrix CPUSparseMatrix::ColumnSlice(size_t startCol endColBlock = (long long) m_blockSize; } - slice.m_pArray = m_pArray + startColBlock * m_numRows; - slice.m_nzValues = slice.m_pArray; - slice.m_blockIds = m_blockIds + startColBlock; // the value stored in the block id is based on the original column numbers - slice.m_blockSize = (size_t) max((long long) 0, endColBlock - startColBlock); + // BUGBUG: m_elemSizeAllocated? + slice.m_pArray = m_pArray + startColBlock * m_numRows; + + slice.m_nzValues = slice.m_pArray; + slice.m_blockIds = m_blockIds + startColBlock; // the value stored in the block id is based on the original column numbers + slice.m_blockSize = (size_t) max((long long) 0, endColBlock - startColBlock); slice.m_blockIdShift = m_blockIdShift + startColumn; - slice.m_externalBuffer = true; - slice.m_nz = slice.m_blockSize * m_numRows; + + slice.m_nz = slice.m_blockSize * m_numRows; } return slice; @@ -461,9 +450,6 @@ CPUSparseMatrix CPUSparseMatrix::ColumnSlice(size_t startCol template CPUMatrix CPUSparseMatrix::CopyColumnSliceToDense(size_t startColumn, size_t numCols) const { - // if (numCols == 0) - // LogicError("The slice cannot have 0 columns."); - if (startColumn + numCols > m_numCols) InvalidArgument("The slice (%d+%d) is out of range of the source matrix (%d).", (int) startColumn, (int) numCols, (int) m_numCols); @@ -614,17 +600,21 @@ void CPUSparseMatrix::Resize(const size_t numRows, const size_t numCol } } -//Reset matrix so it can be reused +// Reset matrix to 0. template void CPUSparseMatrix::Reset() { + if (!OwnBuffer()) + LogicError("Cannot Reset since the buffer is managed externally."); + m_nz = 0; m_colIdx = -1; m_blockSize = 0; m_blockIdShift = 0; } -//c = alpha*op(lhs) * op(rhs) + beta*c +// c = alpha*op(lhs) * op(rhs) + beta*c +// dense x sparse = dense template void CPUSparseMatrix::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix& lhs, const bool transposeA, const CPUSparseMatrix& rhs, const bool transposeB, ElemType beta, CPUMatrix& c) @@ -711,7 +701,8 @@ void CPUSparseMatrix::MultiplyAndWeightedAdd(ElemType alpha, const CPU } } -//c = alpha * op(lhs) * op(rhs) +// dense x sparse = sparse +// c = alpha * op(lhs) * op(rhs) template void CPUSparseMatrix::MultiplyAndAdd(ElemType alpha, const CPUMatrix& lhs, const bool transposeA, const CPUSparseMatrix& rhs, const bool transposeB, CPUSparseMatrix& c) @@ -807,6 +798,7 @@ void CPUSparseMatrix::MultiplyAndAdd(ElemType alpha, const CPUMatrix void CPUSparseMatrix::ScaleAndAdd(const ElemType alpha, const CPUSparseMatrix& lhs, CPUMatrix& rhs) { @@ -861,7 +853,7 @@ void CPUSparseMatrix::ScaleAndAdd(const ElemType alpha, const CPUSpars } template -bool CPUSparseMatrix::AreEqual(const CPUSparseMatrix& a, const CPUSparseMatrix& b, const ElemType threshold) +/*static*/ bool CPUSparseMatrix::AreEqual(const CPUSparseMatrix& a, const CPUSparseMatrix& b, const ElemType threshold) { if (a.IsEmpty() || b.IsEmpty()) LogicError("AreEqual: one of the input matrices is empty."); @@ -894,6 +886,7 @@ void CPUSparseMatrix::NormalGrad(CPUMatrix& c, const ElemTyp c.Resize(GetNumRows(), GetNumCols()); c.SetValue(0.0); } + // BUGBUG: dimension/ownbuffer check? if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow) { @@ -927,6 +920,7 @@ ElemType CPUSparseMatrix::Adagrad(CPUMatrix& c, const bool n c.Resize(GetNumRows(), GetNumCols()); c.SetValue(0.0); } + // BUGBUG: dimension/ownbuffer check? ElemType aveMultiplier = 0; @@ -1156,12 +1150,12 @@ CPUSparseMatrix& CPUSparseMatrix::InplaceSoftThreshold(const template ElemType CPUSparseMatrix::FrobeniusNorm() const { - if (this->IsEmpty()) - LogicError("FrobeniusNorm: Matrix is empty."); + if (IsEmpty()) + return 0; - ElemType v = 0; + ElemType v = 0; // TODO: do this in 'double'? - long m = (long) this->NzCount(); + long m = (long) NzCount(); const ElemType* nzValues = NzValues(); //four-way unrolling @@ -1183,8 +1177,8 @@ ElemType CPUSparseMatrix::FrobeniusNorm() const template ElemType CPUSparseMatrix::SumOfAbsElements() const { - if (this->IsEmpty()) - LogicError("SumOfAbsElements: Matrix is empty."); + if (IsEmpty()) + return 0; if (sizeof(ElemType) == sizeof(double)) { @@ -1209,12 +1203,12 @@ ElemType CPUSparseMatrix::SumOfAbsElements() const template ElemType CPUSparseMatrix::SumOfElements() const { - if (this->IsEmpty()) - LogicError("SumOfElements: Matrix is empty."); + if (IsEmpty()) + return 0; - ElemType sum = 0; + ElemType sum = 0; // TODO: Do this in 'double'? - long m = (long) this->NzCount(); + long m = (long) NzCount(); const ElemType* nzValues = NzValues(); //four-way unrolling @@ -1235,6 +1229,9 @@ ElemType CPUSparseMatrix::SumOfElements() const template MATH_API File& operator>>(File& stream, CPUSparseMatrix& us) { + if (!us.OwnBuffer()) + LogicError("Cannot read into a managed external matrix"); + stream.GetMarker(fileMarkerBeginSection, std::wstring(L"BMAT")); size_t elsize; stream >> elsize; @@ -1351,4 +1348,5 @@ template CPUSparseMatrix::~CPUSparseMatrix(); template CPUSparseMatrix CPUSparseMatrix::ColumnSlice(size_t startColumn, size_t numCols) const; template CPUMatrix CPUSparseMatrix::CopyColumnSliceToDense(size_t startColumn, size_t numCols) const; template CPUSparseMatrix& CPUSparseMatrix::operator=(const CPUSparseMatrix& deepCopyFrom); -} } } + +}}} diff --git a/Source/Math/CPUSparseMatrix.h b/Source/Math/CPUSparseMatrix.h index 25c0818e5..f160372a3 100644 --- a/Source/Math/CPUSparseMatrix.h +++ b/Source/Math/CPUSparseMatrix.h @@ -22,21 +22,23 @@ namespace Microsoft { namespace MSR { namespace CNTK { template class MATH_API CPUSparseMatrix : public BaseMatrix { - typedef BaseMatrix B; - using B::m_elemSizeAllocated; - using B::m_computeDevice; - using B::m_externalBuffer; - using B::m_format; - using B::m_matrixName; - using B::m_numCols; - using B::m_numRows; - using B::m_nz; - using B::m_pArray; // without this, base members would require to use thi-> in GCC - using B::OwnBuffer; - using B::Clear; + typedef BaseMatrix Base; + using Base::m_elemSizeAllocated; + using Base::m_computeDevice; + using Base::m_externalBuffer; + using Base::m_format; + using Base::m_matrixName; + using Base::m_numCols; + using Base::m_numRows; + using Base::m_nz; + using Base::m_pArray; // without this, base members would require to use thi-> in GCC + using Base::Clear; + using Base::NzCount; public: - using B::SetMatrixName; + using Base::OwnBuffer; + using Base::IsEmpty; + using Base::SetMatrixName; private: void ZeroInit(); @@ -53,8 +55,8 @@ public: ~CPUSparseMatrix(); public: - using B::GetNumCols; - using B::GetNumRows; + using Base::GetNumCols; + using Base::GetNumRows; void SetValue(const size_t row, const size_t col, ElemType val); void SetValue(const CPUSparseMatrix& /*val*/); @@ -249,6 +251,8 @@ private: size_t m_blockSize; // block size size_t* m_blockIds; // block ids size_t m_blockIdShift; // used to get efficient slice, actual col = blockIds[j] - m_blockIdShift + + CPUSparseMatrix* m_sliceOf; // if this is a slice, then this points to the owning matrix object that we sliced from }; typedef CPUSparseMatrix CPUSingleSparseMatrix; diff --git a/Source/Math/CommonMatrix.h b/Source/Math/CommonMatrix.h index 0aa90983b..233bb4358 100644 --- a/Source/Math/CommonMatrix.h +++ b/Source/Math/CommonMatrix.h @@ -158,7 +158,7 @@ enum ElementWiseOperator Macro(ElementwiseProductWithLinearRectifierDerivativeFromOutput); \ Macro(ElementwiseProductWithLogDerivativeFromOutput); \ Macro(ElementwiseProductWithCosDerivative); \ -//Macro(Index); + //Macro(Index); #define ForAllTernaryOps(Macro) \ Macro(Cond); \ @@ -170,6 +170,7 @@ enum ElementWiseOperator enum MatrixFlagBitPosition { + // TODO: remove all formats that are actually not supported bitPosRowMajor = 0, // row major matrix bitPosSparse = 1, // sparse matrix (COO if uncompressed) bitPosCompressed = 2, // a compressed sparse format (CSC/CSR) @@ -179,6 +180,7 @@ enum MatrixFlagBitPosition enum MatrixFormat { + // TODO: remove all formats that are actually not supported matrixFormatDense = 0, // default is dense matrixFormatColMajor = 0, // default is column major matrixFormatRowMajor = 1 << bitPosRowMajor, // row major matrix @@ -314,17 +316,46 @@ protected: m_matrixName = nullptr; } + void ZeroInit() + { + m_numRows = 0; + m_numCols = 0; + m_elemSizeAllocated = 0; + m_sliceViewOffset = 0; + m_externalBuffer = false; + m_pArray = nullptr; + m_nz = 0; + m_matrixName = nullptr; + } + + // copy all metadata (but not content taht pArray points to) + void ShallowCopyFrom(const BaseMatrix& other) + { + m_format = other.m_format; + m_computeDevice = other.m_computeDevice; + + m_numRows = other.m_numRows; + m_numCols = other.m_numCols; + m_elemSizeAllocated = other.m_elemSizeAllocated; + m_sliceViewOffset = other.m_sliceViewOffset; + m_externalBuffer = other.m_externalBuffer; + m_pArray = other.m_pArray; + m_nz = other.m_nz; + m_matrixName = other.m_matrixName; + } + protected: + MatrixFormat m_format; + mutable DEVICEID_TYPE m_computeDevice; // current GPU device Id or CPUDEVICE + size_t m_numRows; size_t m_numCols; size_t m_elemSizeAllocated; - size_t m_sliceViewOffset; // this is used to get a column slice view of a matrix in the Sparse CSC format - MatrixFormat m_format; + size_t m_sliceViewOffset; // this is used to get a column slice view of a matrix in the Sparse CSC format --TODO: move to sparse matrix implementations? Or common sparse base class? bool m_externalBuffer; // is the buffer used by this matrix, ElemType* m_pArray; - mutable DEVICEID_TYPE m_computeDevice; // current GPU device Id or CPUDEVICE size_t m_nz; // Number of non-zero elements for sparse matrices (unused in other formats) wchar_t* m_matrixName; // TODO: Use std::wstring? }; -} } } +}}} diff --git a/Source/Math/GPUMatrix.h b/Source/Math/GPUMatrix.h index 4925da113..22d6601f0 100644 --- a/Source/Math/GPUMatrix.h +++ b/Source/Math/GPUMatrix.h @@ -579,7 +579,7 @@ static void CudaCall(ERRTYPE retCode, const char* exprString, const char* libNam class SyncGuard { - bool DoSync() + static bool DoSync() { #ifdef NO_SYNC // this strange way of writing it allows modifying this variable at runtime in the debugger static bool do_sync = false; @@ -588,27 +588,30 @@ class SyncGuard #endif return do_sync; } - cudaEvent_t done; + cudaEvent_t m_done; public: SyncGuard() { - done = nullptr; + m_done = nullptr; if (DoSync()) - CUDA_CALL(cudaEventCreate(&done)); + CUDA_CALL(cudaEventCreate(&m_done)); } ~SyncGuard() { if (DoSync()) { - try + // The regular use of this destructor is to synchronize the GPU, but also + // to check for errors. So this destructor is where CUDA errors would be thrown. + // If this destructor runs during stack unwinding, then a different error has + // already happened that should be reported; so we only clean up the resource. + if (std::uncaught_exception()) + cudaEventDestroy(m_done); + else { - CUDA_CALL(cudaEventRecord(done)); - CUDA_CALL(cudaEventSynchronize(done)); - CUDA_CALL(cudaEventDestroy(done)); - } - catch (const std::exception& e) // can't throw in destructors! - { - std::cerr << "SyncGuard: Destructor swallowing CUDA failure: " << e.what() << std::endl; + // failures in a prior launch might be reported here + CUDA_CALL(cudaEventRecord(m_done)); + CUDA_CALL(cudaEventSynchronize(m_done)); + CUDA_CALL(cudaEventDestroy(m_done)); } } } diff --git a/Source/Math/GPUSparseMatrix.cu b/Source/Math/GPUSparseMatrix.cu index 1389de7e8..7f70864ae 100644 --- a/Source/Math/GPUSparseMatrix.cu +++ b/Source/Math/GPUSparseMatrix.cu @@ -52,6 +52,10 @@ GPUSPARSE_INDEX_TYPE GPUSparseMatrix::SecondaryIndexValueAt(size_t idx return value; } +//------------------------------------------------------------------------- +// construction and conversion +//------------------------------------------------------------------------- + template void GPUSparseMatrix::ZeroInit(const MatrixFormat matrixFormat, const DEVICEID_TYPE computeDevice) { @@ -59,25 +63,18 @@ void GPUSparseMatrix::ZeroInit(const MatrixFormat matrixFormat, const matrixFormat != MatrixFormat::matrixFormatSparseBlockCol && matrixFormat != MatrixFormat::matrixFormatSparseBlockRow) { LogicError("GPUSparseMatrix: unsupported sparse matrix format"); + // BUGBUG: Then why even define others? } + Base::ZeroInit(); + m_computeDevice = computeDevice; // current GPU device Id + m_format = matrixFormat; - m_computeDevice = computeDevice; // current GPU device Id - m_numRows = 0; - m_numCols = 0; - m_elemSizeAllocated = m_nz = 0; // Number of non-zero elements + m_sliceOf = nullptr; m_totalBufferSizeAllocated = 0; - m_sliceViewOffset = 0; - m_format = matrixFormat; - m_externalBuffer = false; - m_pArray = nullptr; - m_matrixName = nullptr; - - m_blockSize = 0; - - m_rowToId = nullptr; - - m_tempHostBuffer = nullptr; - m_tempHostBufferSize = 0; + m_blockSize = 0; + m_rowToId = nullptr; + m_tempHostBuffer = nullptr; + m_tempHostBufferSize = 0; } template @@ -123,13 +120,15 @@ DEVICEID_TYPE GPUSparseMatrix::PrepareDevice(DEVICEID_TYPE deviceId /* } template -void GPUSparseMatrix::DeepCopy(const GPUSparseMatrix& deepCopy) +/*private*/ void GPUSparseMatrix::DeepCopy(const GPUSparseMatrix& deepCopy) { ChangeDeviceTo(deepCopy.m_computeDevice); deepCopy.PrepareDevice(); Resize(deepCopy.m_numRows, deepCopy.m_numCols, deepCopy.GetNumNZElements(), deepCopy.m_format, true, false); - m_nz = deepCopy.m_nz; + m_externalBuffer = false; + m_sliceOf = nullptr; + m_nz = deepCopy.m_nz; m_sliceViewOffset = 0; // reset to zero as we only start copying the indices starting from the offset in the source matrix CUDA_CALL(cudaMemcpy(BufferPointer(), deepCopy.NzValues(), NzSize(), cudaMemcpyDeviceToDevice)); @@ -146,7 +145,6 @@ void GPUSparseMatrix::DeepCopy(const GPUSparseMatrix& deepCo GetNumNZElements()); } - m_externalBuffer = false; SetMatrixName(deepCopy.m_matrixName); // TODO: to copy other varibles used only for class based LM @@ -156,7 +154,7 @@ template void GPUSparseMatrix::SetValue(const GPUSparseMatrix& deepCopy) { if (!OwnBuffer()) - LogicError("Cannot SetValue on Managed external matrix"); + LogicError("Cannot SetValue on managed external matrix"); DeepCopy(deepCopy); } @@ -190,6 +188,9 @@ void GPUSparseMatrix::SetValue(const CPUSparseMatrix& deepCo template void GPUSparseMatrix::CopyToCPUSparseMatrix(CPUSparseMatrix& cpuSparseMatrix) const { + if (!cpuSparseMatrix.OwnBuffer()) + LogicError("Cannot CopyToCPUSparseMatrix on managed external matrix"); + cpuSparseMatrix.SetFormat(GetFormat()); if (IsEmpty()) { @@ -307,6 +308,9 @@ void GPUSparseMatrix::CopyToDenseMatrix(GPUMatrix& denseMatr template void GPUSparseMatrix::ConvertToSparseFormat(MatrixFormat newFormat, GPUSparseMatrix& outMatrix) const { + if (!outMatrix.OwnBuffer()) + LogicError("Cannot ConvertToSparseFormat to managed external matrix"); + if (IsEmpty()) { outMatrix.ZeroInit(newFormat, GetComputeDeviceId()); @@ -528,26 +532,15 @@ GPUSparseMatrix& GPUSparseMatrix::operator=(const GPUSparseM template GPUSparseMatrix::GPUSparseMatrix(GPUSparseMatrix&& moveFrom) { - m_computeDevice = moveFrom.m_computeDevice; - m_numRows = moveFrom.m_numRows; - m_numCols = moveFrom.m_numCols; - m_nz = moveFrom.m_nz; - m_elemSizeAllocated = moveFrom.m_elemSizeAllocated; + Base::ShallowCopyFrom(moveFrom); + // TODO: implement this using operator= or a shared function m_totalBufferSizeAllocated = moveFrom.m_totalBufferSizeAllocated; - m_pArray = moveFrom.m_pArray; - m_sliceViewOffset = moveFrom.m_sliceViewOffset; - m_format = moveFrom.m_format; - m_externalBuffer = moveFrom.m_externalBuffer; - m_matrixName = moveFrom.m_matrixName; - - m_blockSize = moveFrom.m_blockSize; - - m_rowToId = moveFrom.m_rowToId; - - m_tempHostBuffer = moveFrom.m_tempHostBuffer; - m_tempHostBufferSize = moveFrom.m_tempHostBufferSize; - - moveFrom.ZeroInit(moveFrom.m_format, moveFrom.m_computeDevice); // so that memory in moveFrom is not freeed + m_sliceOf = moveFrom.m_sliceOf; + m_blockSize = moveFrom.m_blockSize; + m_rowToId = moveFrom.m_rowToId; + m_tempHostBuffer = moveFrom.m_tempHostBuffer; + m_tempHostBufferSize = moveFrom.m_tempHostBufferSize; + moveFrom.ZeroInit(moveFrom.m_format, moveFrom.m_computeDevice); // so that memory in moveFrom is not freed } template @@ -557,26 +550,13 @@ GPUSparseMatrix& GPUSparseMatrix::operator=(GPUSparseMatrix< { if (OwnBuffer()) ReleaseMemory(); // always delete the data pointer since we will use the pointer from moveFrom - m_computeDevice = moveFrom.m_computeDevice; - m_numRows = moveFrom.m_numRows; - m_numCols = moveFrom.m_numCols; - m_nz = moveFrom.m_nz; - m_elemSizeAllocated = moveFrom.m_elemSizeAllocated; + Base::ShallowCopyFrom(moveFrom); m_totalBufferSizeAllocated = moveFrom.m_totalBufferSizeAllocated; - m_pArray = moveFrom.m_pArray; - m_sliceViewOffset = moveFrom.m_sliceViewOffset; - m_format = moveFrom.m_format; - m_externalBuffer = moveFrom.m_externalBuffer; - - m_matrixName = moveFrom.m_matrixName; - - m_blockSize = moveFrom.m_blockSize; - - m_rowToId = moveFrom.m_rowToId; - - m_tempHostBuffer = moveFrom.m_tempHostBuffer; - m_tempHostBufferSize = moveFrom.m_tempHostBufferSize; - + m_sliceOf = moveFrom.m_sliceOf; + m_blockSize = moveFrom.m_blockSize; + m_rowToId = moveFrom.m_rowToId; + m_tempHostBuffer = moveFrom.m_tempHostBuffer; + m_tempHostBufferSize = moveFrom.m_tempHostBufferSize; moveFrom.ZeroInit(moveFrom.m_format, moveFrom.m_computeDevice); } @@ -590,11 +570,10 @@ GPUSparseMatrix::~GPUSparseMatrix() } template -void GPUSparseMatrix::ReleaseMemory() +/*private*/ void GPUSparseMatrix::ReleaseMemory() { - // If OwnBuffer() is false then this matrix - // is simply a view over another matrix. In that - // case we shouldn't free anything. + // If OwnBuffer() is false then this matrix is a view over another matrix. + // In that case we shouldn't free anything. if (OwnBuffer()) { delete[] m_matrixName; @@ -609,7 +588,6 @@ void GPUSparseMatrix::ReleaseMemory() TracingGPUMemoryAllocator::Free(m_computeDevice, m_rowToId); m_rowToId = nullptr; } - ZeroInit(m_format, m_computeDevice); } @@ -627,17 +605,18 @@ void GPUSparseMatrix::ResizeAsAndCopyIndexFrom(const GPUSparseMatrix void GPUSparseMatrix::Reshape(const size_t numRows, const size_t numCols) { - if (!OwnBuffer()) - LogicError("GPUSparseMatrix::Reshape: Cannot Reshape since the buffer is managed externally."); - if (m_numRows == numRows && m_numCols == numCols) return; + if (!OwnBuffer()) + LogicError("GPUSparseMatrix::Reshape: Cannot Reshape since the buffer is managed externally."); + if (m_format != MatrixFormat::matrixFormatSparseCSC) NOT_IMPLEMENTED; @@ -688,8 +667,8 @@ void GPUSparseMatrix::Resize(const size_t numRows, const size_t numCol Resize(numRows, numCols, numNZElemToReserve, GetFormat(), growOnly, keepExistingValues); } -//WARNING: When memory is reallocated existing information will be lost, workaround is to allocte enough memory from start. -//TODO: add keepExistingValues (default to true) argument so that the existing values are kept even after reallocation +// WARNING: When memory is reallocated, existing information will be lost. +// TODO: add keepExistingValues (default to true) argument so that the existing values are kept even after reallocation template void GPUSparseMatrix::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const MatrixFormat matrixFormat, const bool growOnly /*= true*/, bool keepExistingValues /*=true*/) { @@ -749,13 +728,17 @@ void GPUSparseMatrix::Resize(const size_t numRows, const size_t numCol m_format = matrixFormat; } -//Reset matrix so it can be reused +// Reset matrix to 0. template void GPUSparseMatrix::Reset() { + if (!OwnBuffer()) + LogicError("Cannot Reset since the buffer is managed externally."); + m_nz = 0; m_blockSize = 0; } + // copy features to GPU template void GPUSparseMatrix::SetMatrixFromCSRFormat(const GPUSPARSE_INDEX_TYPE* h_CSRRow, const GPUSPARSE_INDEX_TYPE* h_Col, const ElemType* h_Val, @@ -966,9 +949,11 @@ void GPUSparseMatrix::MultiplyAndWeightedAdd(ElemType alpha, const GPU } } +// dense X sparse = dense template void GPUSparseMatrix::ConvolveAndWeightedAdd(ElemType alpha, const GPUMatrix& lhs, const bool transposeA, - const GPUSparseMatrix& rhs, const bool transposeB, ElemType beta, GPUMatrix& c, size_t numChannels, size_t horizontalSubsample, bool padding, bool channelwise) + const GPUSparseMatrix& rhs, const bool transposeB, ElemType beta, + GPUMatrix& c, size_t numChannels, size_t horizontalSubsample, bool padding, bool channelwise) { if (lhs.GetComputeDeviceId() != rhs.GetComputeDeviceId() || (lhs.GetComputeDeviceId() != c.GetComputeDeviceId())) RuntimeError("GPUSparseMatrix::ConvolveAndWeightedAdd: All matrices must be on the same GPU"); @@ -1222,7 +1207,7 @@ void GPUSparseMatrix::MultiplyAndAdd(ElemType alpha, const GPUMatrix size_t GPUSparseMatrix::IdentifyRowsWithValues() const { @@ -1404,11 +1389,7 @@ ElemType GPUSparseMatrix::Adagrad(GPUMatrix& c, const bool n } } -//------------------------------------------------------------------------- -// End of new GPU Sparse Matrix code -//------------------------------------------------------------------------- - -//sparse X dense = dense +// sparse X dense = dense template void GPUSparseMatrix::MultiplyAndWeightedAdd(ElemType alpha, const GPUSparseMatrix& a, const bool transposeA, const GPUMatrix& b, const bool transposeD, ElemType beta, GPUMatrix& c) @@ -1758,6 +1739,7 @@ void GPUSparseMatrix::ElementWisePower(ElemType alpha, const GPUSparse } } +// sparse x dense = scalar template ElemType GPUSparseMatrix::InnerProductOfMatrices(const GPUSparseMatrix& a, const GPUMatrix& b) { @@ -1878,8 +1860,8 @@ bool GPUSparseMatrix::IsValid() const } template -bool GPUSparseMatrix::AreEqual(const GPUSparseMatrix& a, const GPUSparseMatrix& b, - const ElemType threshold) +/*static*/ bool GPUSparseMatrix::AreEqual(const GPUSparseMatrix& a, const GPUSparseMatrix& b, + const ElemType threshold) { if (a.GetNumNZElements() != b.GetNumNZElements() || a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols()) return false; @@ -1908,8 +1890,8 @@ bool GPUSparseMatrix::AreEqual(const GPUSparseMatrix& a, con } template -bool GPUSparseMatrix::AreEqual(const GPUMatrix& a, const GPUSparseMatrix& b, - const ElemType threshold) +/*static*/ bool GPUSparseMatrix::AreEqual(const GPUMatrix& a, const GPUSparseMatrix& b, + const ElemType threshold) { if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols()) return false; @@ -1919,8 +1901,8 @@ bool GPUSparseMatrix::AreEqual(const GPUMatrix& a, const GPU } template -bool GPUSparseMatrix::AreEqual(const GPUSparseMatrix& a, const GPUMatrix& b, - const ElemType threshold) +/*static*/ bool GPUSparseMatrix::AreEqual(const GPUSparseMatrix& a, const GPUMatrix& b, + const ElemType threshold) { if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols()) return false; @@ -1940,6 +1922,7 @@ bool GPUSparseMatrix::IsEqualTo(const GPUMatrix& a, const El { return AreEqual(*this, a, threshold); } + #pragma endregion Static BLAS Functions #pragma region Member BLAS Functions @@ -1958,6 +1941,7 @@ DEVICEID_TYPE GPUSparseMatrix::GetComputeDeviceId() const return m_computeDevice; } +// sparse x dense = dense template GPUMatrix GPUSparseMatrix::ElementProductOf(const GPUSparseMatrix& a, const GPUMatrix& b) { @@ -1980,6 +1964,7 @@ GPUMatrix GPUSparseMatrix::ElementProductOf(const GPUSparseM return c; } +// sparse x dense = dense template GPUMatrix GPUSparseMatrix::ElementProductOf(const GPUMatrix& a, const GPUSparseMatrix& b) { @@ -2002,6 +1987,7 @@ GPUSparseMatrix GPUSparseMatrix::operator-(const GPUSparseMa return res; } +// TODO: This is an unusual use of this operator. Remove this. template GPUSparseMatrix& GPUSparseMatrix::operator^=(ElemType alpha) { @@ -2010,6 +1996,7 @@ GPUSparseMatrix& GPUSparseMatrix::operator^=(ElemType alpha) return us; } +// TODO: This is an unusual use of this operator. Remove this. template GPUSparseMatrix GPUSparseMatrix::operator^(ElemType alpha) const { @@ -2154,6 +2141,7 @@ GPUSparseMatrix GPUSparseMatrix::ColumnSlice(size_t startCol slice.m_pArray = m_pArray; slice.m_format = m_format; slice.m_externalBuffer = true; + slice.m_sliceOf = const_cast*>(this); // BUGBUG: ColumnSlice() returns a reference to a mutable matrix, even if itself is 'const'; should not be. slice.m_matrixName = m_matrixName; slice.m_blockSize = m_blockSize; slice.m_rowToId = m_rowToId; @@ -2231,7 +2219,7 @@ template ElemType GPUSparseMatrix::SumOfAbsElements() const { if (IsEmpty()) - LogicError("SumOfAbsElements: Matrix is empty"); + return 0; cublasHandle_t cuHandle = GPUMatrix::GetCublasHandle(GetComputeDeviceId()); if (sizeof(ElemType) == sizeof(float)) @@ -2264,11 +2252,12 @@ ElemType GPUSparseMatrix::SumOfElements() const return h_sum; } +// sqrt(sum all elements^2) template ElemType GPUSparseMatrix::FrobeniusNorm() const { if (IsEmpty()) - LogicError("FrobeniusNorm: Matrix is empty."); + return 0; ElemType* d_sum = TracingGPUMemoryAllocator::Allocate(m_computeDevice, 1); ElemType h_sum = 0; @@ -2287,7 +2276,7 @@ template ElemType GPUSparseMatrix::MatrixNormInf() const { if (IsEmpty()) - LogicError("MatrixNorm1: Matrix is empty."); + return 0; ElemType* d_maxAbs = TracingGPUMemoryAllocator::Allocate(m_computeDevice, 1); ElemType h_maxAbs = 0; @@ -2305,8 +2294,6 @@ ElemType GPUSparseMatrix::MatrixNormInf() const template ElemType GPUSparseMatrix::MatrixNorm1() const { - if (IsEmpty()) - LogicError("MatrixNorm1: Matrix is empty."); return SumOfAbsElements(); } @@ -2317,6 +2304,10 @@ ElemType GPUSparseMatrix::MatrixNorm1() const template GPUSparseMatrix& GPUSparseMatrix::ElementInverse() { +#if 1 + // Note: This makes no sense because sparse matrices are defined by having lots of zeroes. + NOT_IMPLEMENTED; +#else if (!OwnBuffer()) LogicError("Cannot modify since the buffer is managed externally."); @@ -2328,29 +2319,45 @@ GPUSparseMatrix& GPUSparseMatrix::ElementInverse() SyncGuard syncGuard; _elemInverse<<>>(NzValues(), N); return *this; +#endif } template GPUSparseMatrix& GPUSparseMatrix::AssignElementInverseOf(const GPUSparseMatrix& a) { +#if 1 + // Note: This makes no sense because sparse matrices are defined by having lots of zeroes. + UNUSED(a); NOT_IMPLEMENTED; +#else SetValue(a); return ElementInverse(); +#endif } template GPUSparseMatrix& GPUSparseMatrix::InplaceSigmoid() { +#if 1 + // Note: This makes no sense because sigmoid(0) != 0. + NOT_IMPLEMENTED; +#else performElementWiseFunction(ElementWiseOperator::opSigmoid, *this); return *this; +#endif } template GPUSparseMatrix& GPUSparseMatrix::AssignSigmoidOf(const GPUSparseMatrix& a) { +#if 1 + // Note: This makes no sense because sigmoid(0) != 0. + UNUSED(a); NOT_IMPLEMENTED; +#else if (this != &a) Resize(a.GetNumRows(), a.GetNumCols()); performElementWiseFunction(ElementWiseOperator::opSigmoid, a); return *this; +#endif } template @@ -2404,33 +2411,53 @@ GPUSparseMatrix& GPUSparseMatrix::AssignSqrtOf(const GPUSpar template GPUSparseMatrix& GPUSparseMatrix::InplaceExp() { +#if 1 + // Note: This makes no sense because exp(0) != 0. + NOT_IMPLEMENTED; +#else performElementWiseFunction(ElementWiseOperator::opExp, *this); return *this; +#endif } template GPUSparseMatrix& GPUSparseMatrix::AssignExpOf(const GPUSparseMatrix& a) { +#if 1 + // Note: This makes no sense because exp(0) != 0. + UNUSED(a); NOT_IMPLEMENTED; +#else if (this != &a) Resize(a.GetNumRows(), a.GetNumCols()); performElementWiseFunction(ElementWiseOperator::opExp, a); return *this; +#endif } template GPUSparseMatrix& GPUSparseMatrix::InplaceLog() { +#if 1 + // Note: This makes no sense because log(0) != 0. + NOT_IMPLEMENTED; +#else performElementWiseFunction(ElementWiseOperator::opLog, *this); return *this; +#endif } template GPUSparseMatrix& GPUSparseMatrix::AssignLogOf(const GPUSparseMatrix& a) { +#if 1 + // Note: This makes no sense because log(0) != 0. + UNUSED(a); NOT_IMPLEMENTED; +#else if (this != &a) Resize(a.GetNumRows(), a.GetNumCols()); performElementWiseFunction(ElementWiseOperator::opLog, a); return *this; +#endif } template @@ -2449,6 +2476,7 @@ GPUSparseMatrix& GPUSparseMatrix::AssignAbsOf(const GPUSpars return *this; } +// TODO: Check whether these functions always map 0 to 0. template GPUSparseMatrix& GPUSparseMatrix::InplaceTruncateBottom(const ElemType threshold) { @@ -2543,7 +2571,7 @@ GPUSparseMatrix& GPUSparseMatrix::SetToZeroIfAbsLessThan(con //outBuffer should be allocated to be >= size by the caller template template -void GPUSparseMatrix::CopyBuffer(OutType* outBuffer, const InType* inBuffer, const size_t size) +/*private*/ void GPUSparseMatrix::CopyBuffer(OutType* outBuffer, const InType* inBuffer, const size_t size) { #pragma omp parallel for for (size_t i = 0; i < (size & ~3); i += 4) @@ -2629,6 +2657,9 @@ template GPUSparseMatrix& GPUSparseMatrix::operator=(GPUSparseMatrix template MATH_API File& operator>>(File& stream, GPUSparseMatrix& us) { + if (!us.OwnBuffer()) + LogicError("Cannot read into a managed external matrix"); + stream.GetMarker(fileMarkerBeginSection, std::wstring(L"BMAT")); size_t elsize; stream >> elsize; diff --git a/Source/Math/GPUSparseMatrix.h b/Source/Math/GPUSparseMatrix.h index 74fc2d62c..0b823e0fa 100644 --- a/Source/Math/GPUSparseMatrix.h +++ b/Source/Math/GPUSparseMatrix.h @@ -23,29 +23,30 @@ template class MATH_API GPUSparseMatrix : public BaseMatrix { public: - typedef BaseMatrix B; - using B::m_numRows; - using B::m_numCols; - using B::m_pArray; - using B::m_elemSizeAllocated; - using B::m_sliceViewOffset; - using B::m_nz; - using B::m_format; - using B::m_computeDevice; - using B::m_externalBuffer; - using B::m_matrixName; - using B::OwnBuffer; - using B::GetFormat; - using B::SetFormat; - using B::GetNumRows; - using B::GetNumCols; - using B::IsEmpty; - using B::SetComputeDeviceId; - using B::SetMatrixName; - using B::SetNzCount; - using B::Clear; + typedef BaseMatrix Base; + using Base::m_numRows; + using Base::m_numCols; + using Base::m_pArray; + using Base::m_elemSizeAllocated; + using Base::m_sliceViewOffset; + using Base::m_nz; + using Base::m_format; + using Base::m_computeDevice; + using Base::m_externalBuffer; + using Base::m_matrixName; + using Base::OwnBuffer; + using Base::GetFormat; + using Base::SetFormat; + using Base::GetNumRows; + using Base::GetNumCols; + using Base::SetComputeDeviceId; + using Base::SetMatrixName; + using Base::SetNzCount; + using Base::Clear; // without this, base members would require to use thi-> in GCC public: + using Base::IsEmpty; + GPUSparseMatrix(const size_t numRows, const size_t numCols, const size_t numNZ, DEVICEID_TYPE computeDevice, const MatrixFormat matrixFormat = MatrixFormat::matrixFormatSparseCSR); explicit GPUSparseMatrix(DEVICEID_TYPE computeDevice, const MatrixFormat matrixFormat = MatrixFormat::matrixFormatSparseCSR); @@ -390,6 +391,8 @@ private: mutable void* m_tempHostBuffer; // used to copy values. mutable size_t m_tempHostBufferSize; + + GPUSparseMatrix* m_sliceOf; // if this is a slice, then this points to the owning matrix object that we sliced from }; }}} diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h index 3ee8e5069..68de0c538 100644 --- a/Source/Math/Matrix.h +++ b/Source/Math/Matrix.h @@ -169,7 +169,7 @@ public: // REVIEW alexeyk: GPU version copies from device to host only, implement all versions (device <-> host). void CopySection(size_t numRows, size_t numCols, ElemType* dst, size_t colStride) const; - Matrix ColumnSlice(size_t startColumn, size_t numCols) const; + Matrix ColumnSlice(size_t startColumn, size_t numCols) const; // note: 'const' is misleading here, as the returned matrix is a mutable reference // difference between AssignColumnSlice and SetColumnSlice // AssignColumnSlice : this(:, startColumn:startColumn+numCols-1) = fromMatrix(:, startColumn: startColumn+numCols-1) diff --git a/Tests/UnitTests/ReaderTests/Config/HTKMLFReaderSimpleDataLoop16_Config.cntk b/Tests/UnitTests/ReaderTests/Config/HTKMLFReaderSimpleDataLoop16_Config.cntk index af377b79d..0ac8be307 100644 --- a/Tests/UnitTests/ReaderTests/Config/HTKMLFReaderSimpleDataLoop16_Config.cntk +++ b/Tests/UnitTests/ReaderTests/Config/HTKMLFReaderSimpleDataLoop16_Config.cntk @@ -4,7 +4,7 @@ DataDir = $RootDir$ # deviceId = -1 for CPU, >= 0 for GPU devices deviceId = -1 -precision = "float" +precision = "double" Simple_Test = [ reader = [