some clean-up of sparse matrices;

disabled meaningless elementwise sparse ops such as log();
sparse matrix slices now keep track of their original matrix (not used yet);
bug fix: ~SyncGuard() should not swallow errors since CUDA errors may get discovered here;
bug fix: HTKMLFReader test 16 must use consistent precision
Frank Seide 2016-03-01 10:43:50 -08:00
Parent d0a5eb527d
Commit 7793b7be93
8 changed files with 323 additions and 253 deletions

View file

@ -59,16 +59,13 @@
#define BLAS_COLMAJOR (int) MatrixOrder::ColMajor,
#endif
#define SWAP(a, b) \
{ \
(a) ^= (b); \
(b) ^= (a); \
(a) ^= (b); \
}
// TODO: Move to CommonMatrix.h
#define IDX2C(i, j, ld) (((j) * (ld)) + (i)) // 0 based indexing
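Two quick standalone sketches of the helper macros above (local copies added for illustration, not part of the commit). IDX2C maps a 0-based (row, column) pair to its linear offset in a column-major buffer with leading dimension ld, as BLAS and cuBLAS expect; the XOR-based SWAP avoids a temporary but is only valid for integer lvalues and silently zeroes the value when both arguments alias the same object.
#include <cassert>
#define SWAP(a, b) { (a) ^= (b); (b) ^= (a); (a) ^= (b); }
#define IDX2C(i, j, ld) (((j) * (ld)) + (i))
int main()
{
    // Column-major 3x2 matrix; its columns are {1,2,3} and {4,5,6}.
    float m[6] = {1, 2, 3, 4, 5, 6};
    assert(m[IDX2C(1, 1, 3)] == 5); // row 1, col 1 -> offset 1*3 + 1 = 4
    int a = 3, b = 5;
    SWAP(a, b);                     // fine: a == 5, b == 3
    assert(a == 5 && b == 3);
    SWAP(a, a);                     // aliasing: a ^ a == 0, the value is lost
    assert(a == 0);
    return 0;
}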
namespace Microsoft { namespace MSR { namespace CNTK {
#pragma region Helpful Enum Definitions
enum class MatrixOrder
{
RowMajor = 101, // row-major arrays
@ -77,56 +74,55 @@ enum class MatrixOrder
enum class MatrixTranspose : char
{
NoTrans = 'N', // trans='N'
Trans = 'T', // trans='T'
ConjTrans = 'C' // trans='C'
NoTrans = 'N', // trans='N'
Trans = 'T', // trans='T'
ConjTrans = 'C' // trans='C'
};
enum class SymMatrixType : char
{
Up = 'U', // symmetric matrix is stored in the upper part
Low = 'L', // symmetric matrix is stored in the lower part
Full = 'F', // fully populated
NotSymmetric = 'N' // not a symmetric matrix
Up = 'U', // symmetric matrix is stored in the upper part
Low = 'L', // symmetric matrix is stored in the lower part
Full = 'F', // fully populated
NotSymmetric = 'N' // not a symmetric matrix
};
enum class MatrixOpSide : char
{
Left = 'L', // left multiply
Left = 'L', // left multiply
Right = 'R', // right multiply
};
#pragma endregion Helpful Enum Definitions
#pragma region Constructors and Destructor
//should only be used by constructors.
template <class ElemType>
void CPUSparseMatrix<ElemType>::ZeroInit()
{
m_numRows = 0;
m_numCols = 0;
m_elemSizeAllocated = 0;
m_compIndexSize = 0;
m_externalBuffer = false;
m_computeDevice = CPUDEVICE;
m_nz = 0;
m_matrixName = NULL;
//-------------------------------------------------------------------------
// construction and conversion
//-------------------------------------------------------------------------
// should only be used by constructors.
template <class ElemType>
/*private*/ void CPUSparseMatrix<ElemType>::ZeroInit()
{
Base::ZeroInit();
m_computeDevice = CPUDEVICE;
m_sliceOf = nullptr;
m_compIndexSize = 0;
// if(m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR)
{
m_colIdx = -1;
m_pArray = NULL;
m_unCompIndex = NULL;
m_compIndex = NULL;
m_colIdx = -1;
m_unCompIndex = nullptr;
m_compIndex = nullptr;
}
// else if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
{
m_blockSize = 0;
m_blockSize = 0;
m_blockIdShift = 0;
m_pArray = NULL;
m_blockIds = NULL;
m_blockIds = nullptr;
}
m_nzValues = NULL;
m_nzValues = nullptr;
}
//should only be used by constructors.
@ -144,19 +140,17 @@ void CPUSparseMatrix<ElemType>::CheckInit(const MatrixFormat format)
template <class ElemType>
CPUSparseMatrix<ElemType>::CPUSparseMatrix(const MatrixFormat format)
{
CheckInit(format);
}
template <class ElemType>
CPUSparseMatrix<ElemType>::CPUSparseMatrix(const MatrixFormat format, const size_t numRows, const size_t numCols, const size_t size)
{
CheckInit(format);
Resize(numRows, numCols, size, true, false);
}
//copy constructor, deep copy
// copy constructor, deep copy
template <class ElemType>
CPUSparseMatrix<ElemType>::CPUSparseMatrix(const CPUSparseMatrix<ElemType>& deepCopyFrom)
{
@ -166,7 +160,7 @@ CPUSparseMatrix<ElemType>::CPUSparseMatrix(const CPUSparseMatrix<ElemType>& deep
SetMatrixName(deepCopyFrom.m_matrixName);
}
//assignment operator, deep copy
// assignment operator, deep copy
template <class ElemType>
CPUSparseMatrix<ElemType>& CPUSparseMatrix<ElemType>::operator=(const CPUSparseMatrix<ElemType>& deepCopyFrom)
{
@ -177,29 +171,23 @@ CPUSparseMatrix<ElemType>& CPUSparseMatrix<ElemType>::operator=(const CPUSparseM
return *this;
}
//move constructor, shallow copy
// move constructor, shallow copy
template <class ElemType>
CPUSparseMatrix<ElemType>::CPUSparseMatrix(CPUSparseMatrix<ElemType>&& moveFrom)
{
m_format = moveFrom.m_format;
m_numRows = moveFrom.m_numRows;
m_numCols = moveFrom.m_numCols;
m_elemSizeAllocated = moveFrom.m_elemSizeAllocated;
Base::ShallowCopyFrom(moveFrom);
// BUGBUG: This did not use to copy m_sliceViewOffset, I presume it should be copied? It is now.
m_compIndexSize = moveFrom.m_compIndexSize;
m_externalBuffer = moveFrom.m_externalBuffer;
m_computeDevice = moveFrom.m_computeDevice;
m_nz = moveFrom.m_nz;
m_matrixName = moveFrom.m_matrixName;
m_colIdx = moveFrom.m_colIdx;
m_pArray = moveFrom.m_pArray;
m_nzValues = moveFrom.m_nzValues;
m_colIdx = moveFrom.m_colIdx;
m_nzValues = moveFrom.m_nzValues;
m_unCompIndex = moveFrom.m_unCompIndex;
m_compIndex = moveFrom.m_compIndex;
m_compIndex = moveFrom.m_compIndex;
m_blockSize = moveFrom.m_blockSize;
m_blockSize = moveFrom.m_blockSize;
m_blockIdShift = moveFrom.m_blockIdShift;
m_blockIds = moveFrom.m_blockIds;
m_blockIds = moveFrom.m_blockIds;
// release the pointer from the source object so that the destructor won't release it twice
moveFrom.ZeroInit();
@ -213,26 +201,19 @@ CPUSparseMatrix<ElemType>& CPUSparseMatrix<ElemType>::operator=(CPUSparseMatrix<
{
if (OwnBuffer())
ReleaseMemory(); // always delete the data pointer since we will use the pointer from moveFrom
Base::ShallowCopyFrom(moveFrom);
// BUGBUG: This did not use to copy m_sliceViewOffset, I presume it should be copied? It is now.
m_format = moveFrom.m_format;
m_numRows = moveFrom.m_numRows;
m_numCols = moveFrom.m_numCols;
m_elemSizeAllocated = moveFrom.m_elemSizeAllocated;
m_compIndexSize = moveFrom.m_compIndexSize;
m_externalBuffer = moveFrom.m_externalBuffer;
m_computeDevice = moveFrom.m_computeDevice;
m_nz = moveFrom.m_nz;
m_matrixName = moveFrom.m_matrixName;
m_colIdx = moveFrom.m_colIdx;
m_pArray = moveFrom.m_pArray;
m_nzValues = moveFrom.m_nzValues;
m_colIdx = moveFrom.m_colIdx;
m_nzValues = moveFrom.m_nzValues;
m_unCompIndex = moveFrom.m_unCompIndex;
m_compIndex = moveFrom.m_compIndex;
m_compIndex = moveFrom.m_compIndex;
m_blockSize = moveFrom.m_blockSize;
m_blockSize = moveFrom.m_blockSize;
m_blockIdShift = moveFrom.m_blockIdShift;
m_blockIds = moveFrom.m_blockIds;
m_blockIds = moveFrom.m_blockIds;
// release the pointer from the source object so that the destructor won't release it twice
moveFrom.ZeroInit();
@ -249,9 +230,8 @@ CPUSparseMatrix<ElemType>::~CPUSparseMatrix()
template <class ElemType>
void CPUSparseMatrix<ElemType>::ReleaseMemory()
{
// If m_externalBuffer is true then this matrix
// is simply a view over another matrix. In that
// case we shouldn't free anything.
// If m_externalBuffer is true then this matrix is simply a view over another matrix.
// In that case we shouldn't free anything.
if (!m_externalBuffer)
{
delete[] m_matrixName;
@ -335,24 +315,26 @@ void CPUSparseMatrix<ElemType>::SetValue(const size_t row, const size_t col, con
m_nz++;
}
//make sure call order in colume wise for CSC and row wise for CSR
// make sure the call order is column-wise for CSC and row-wise for CSR
template <class ElemType>
void CPUSparseMatrix<ElemType>::SetValue(const CPUSparseMatrix<ElemType>& v)
{
if (!OwnBuffer())
if (!OwnBuffer()) // TODO: GPU version allows to overwrite a view with a fresh non-view
LogicError("Cannot modify since the buffer is managed externally.");
this->Reset();
m_format = v.GetFormat();
Reset();
m_format = v.GetFormat();
m_externalBuffer = false;
m_sliceOf = nullptr;
this->Resize(v.GetNumRows(), v.GetNumCols(), v.NzSize());
Resize(v.GetNumRows(), v.GetNumCols(), v.NzSize());
m_nz = v.NzCount();
if (m_nz > 0)
{
memcpy(this->NzValues(), v.NzValues(), v.NzSize());
memcpy(this->RowLocation(), v.RowLocation(), v.RowSize());
memcpy(this->ColLocation(), v.ColLocation(), v.ColSize());
memcpy(NzValues(), v.NzValues(), v.NzSize());
memcpy(RowLocation(), v.RowLocation(), v.RowSize());
memcpy(ColLocation(), v.ColLocation(), v.ColSize());
}
}
@ -402,17 +384,22 @@ CPUSparseMatrix<ElemType> CPUSparseMatrix<ElemType>::ColumnSlice(size_t startCol
CPUSparseMatrix<ElemType> slice(m_format);
slice.m_numRows = m_numRows;
slice.m_numCols = numCols;
// BUGBUG: m_matrixName?
// BUGBUG: m_sliceViewOffset?
slice.m_externalBuffer = true;
slice.m_sliceOf = const_cast<CPUSparseMatrix<ElemType>*>(this); // BUGBUG: ColumnSlice() returns a reference to a mutable matrix, even if itself is 'const'; should not be.
if (m_format == MatrixFormat::matrixFormatSparseCSC)
{
slice.m_pArray = m_pArray;
slice.m_nzValues = m_pArray + m_compIndex[startColumn]; // note: m_compIndex is always against m_pArray
slice.m_unCompIndex = m_unCompIndex;
slice.m_compIndex = m_compIndex + startColumn; // Just shift the compressed index location to the new startColumn - that's it!
slice.m_externalBuffer = true;
slice.m_nz = m_compIndex[startColumn + numCols] - m_compIndex[startColumn];
slice.m_pArray = m_pArray;
slice.m_nzValues = m_pArray + m_compIndex[startColumn]; // note: m_compIndex is always against m_pArray
slice.m_unCompIndex = m_unCompIndex;
slice.m_compIndex = m_compIndex + startColumn; // Just shift the compressed index location to the new startColumn - that's it!
slice.m_compIndexSize = numCols + 1;
slice.m_nz = m_compIndex[startColumn + numCols] - m_compIndex[startColumn];
slice.m_elemSizeAllocated = slice.m_nz;
slice.m_compIndexSize = numCols + 1;
}
else if (m_format == MatrixFormat::matrixFormatSparseBlockCol)
{
@ -446,13 +433,15 @@ CPUSparseMatrix<ElemType> CPUSparseMatrix<ElemType>::ColumnSlice(size_t startCol
endColBlock = (long long) m_blockSize;
}
slice.m_pArray = m_pArray + startColBlock * m_numRows;
slice.m_nzValues = slice.m_pArray;
slice.m_blockIds = m_blockIds + startColBlock; // the value stored in the block id is based on the original column numbers
slice.m_blockSize = (size_t) max((long long) 0, endColBlock - startColBlock);
// BUGBUG: m_elemSizeAllocated?
slice.m_pArray = m_pArray + startColBlock * m_numRows;
slice.m_nzValues = slice.m_pArray;
slice.m_blockIds = m_blockIds + startColBlock; // the value stored in the block id is based on the original column numbers
slice.m_blockSize = (size_t) max((long long) 0, endColBlock - startColBlock);
slice.m_blockIdShift = m_blockIdShift + startColumn;
slice.m_externalBuffer = true;
slice.m_nz = slice.m_blockSize * m_numRows;
slice.m_nz = slice.m_blockSize * m_numRows;
}
return slice;
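The CSC branch above works because m_compIndex stores, for each column j, the offset of that column's first nonzero within m_pArray, so a slice can simply point at m_compIndex + startColumn and reuse the owner's value and row-index buffers unchanged; all offsets stay relative to the full buffer. A toy sketch of the arithmetic with plain arrays (hypothetical local names, not the CNTK API):
#include <cassert>
int main()
{
    // 3x4 CSC matrix with 5 nonzeros; compIndex has numCols + 1 entries and
    // column j owns the value range [compIndex[j], compIndex[j+1]).
    float vals[5]      = {7, 8, 9, 1, 2};
    int   compIndex[5] = {0, 2, 3, 3, 5};
    // Slice of columns [1, 3): just shift the compressed index.
    int    startColumn    = 1, numCols = 2;
    int*   sliceCompIndex = compIndex + startColumn;
    float* sliceNzValues  = vals + compIndex[startColumn];
    int    sliceNz        = compIndex[startColumn + numCols] - compIndex[startColumn];
    assert(sliceNz == 1);           // column 1 has one nonzero, column 2 none
    assert(sliceNzValues[0] == 9);  // the single nonzero of column 1
    assert(sliceCompIndex[0] == 2); // offsets remain relative to the full buffer
    return 0;
}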
@ -461,9 +450,6 @@ CPUSparseMatrix<ElemType> CPUSparseMatrix<ElemType>::ColumnSlice(size_t startCol
template <class ElemType>
CPUMatrix<ElemType> CPUSparseMatrix<ElemType>::CopyColumnSliceToDense(size_t startColumn, size_t numCols) const
{
// if (numCols == 0)
// LogicError("The slice cannot have 0 columns.");
if (startColumn + numCols > m_numCols)
InvalidArgument("The slice (%d+%d) is out of range of the source matrix (%d).", (int) startColumn, (int) numCols, (int) m_numCols);
@ -614,17 +600,21 @@ void CPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCol
}
}
//Reset matrix so it can be reused
// Reset matrix to 0.
template <class ElemType>
void CPUSparseMatrix<ElemType>::Reset()
{
if (!OwnBuffer())
LogicError("Cannot Reset since the buffer is managed externally.");
m_nz = 0;
m_colIdx = -1;
m_blockSize = 0;
m_blockIdShift = 0;
}
//c = alpha*op(lhs) * op(rhs) + beta*c
// c = alpha*op(lhs) * op(rhs) + beta*c
// dense x sparse = dense
template <class ElemType>
void CPUSparseMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix<ElemType>& lhs, const bool transposeA,
const CPUSparseMatrix<ElemType>& rhs, const bool transposeB, ElemType beta, CPUMatrix<ElemType>& c)
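For intuition, c = alpha*op(lhs)*op(rhs) + beta*c is the usual GEMM update contract. A dense row-major reference of the semantics (a sketch for clarity only, not the sparse code path):
#include <cstddef>
// A is m x k, B is k x n, C is m x n, all row-major.
void GemmReference(float alpha, const float* A, const float* B, float beta,
                   float* C, size_t m, size_t k, size_t n)
{
    for (size_t i = 0; i < m; i++)
        for (size_t j = 0; j < n; j++)
        {
            float dot = 0;
            for (size_t t = 0; t < k; t++)
                dot += A[i * k + t] * B[t * n + j];
            C[i * n + j] = alpha * dot + beta * C[i * n + j];
        }
}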
@ -711,7 +701,8 @@ void CPUSparseMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPU
}
}
//c = alpha * op(lhs) * op(rhs)
// dense x sparse = sparse
// c = alpha * op(lhs) * op(rhs)
template <class ElemType>
void CPUSparseMatrix<ElemType>::MultiplyAndAdd(ElemType alpha, const CPUMatrix<ElemType>& lhs, const bool transposeA,
const CPUSparseMatrix<ElemType>& rhs, const bool transposeB, CPUSparseMatrix<ElemType>& c)
@ -807,6 +798,7 @@ void CPUSparseMatrix<ElemType>::MultiplyAndAdd(ElemType alpha, const CPUMatrix<E
}
}
// dense += sparse
template <class ElemType>
void CPUSparseMatrix<ElemType>::ScaleAndAdd(const ElemType alpha, const CPUSparseMatrix<ElemType>& lhs, CPUMatrix<ElemType>& rhs)
{
@ -861,7 +853,7 @@ void CPUSparseMatrix<ElemType>::ScaleAndAdd(const ElemType alpha, const CPUSpars
}
template <class ElemType>
bool CPUSparseMatrix<ElemType>::AreEqual(const CPUSparseMatrix<ElemType>& a, const CPUSparseMatrix<ElemType>& b, const ElemType threshold)
/*static*/ bool CPUSparseMatrix<ElemType>::AreEqual(const CPUSparseMatrix<ElemType>& a, const CPUSparseMatrix<ElemType>& b, const ElemType threshold)
{
if (a.IsEmpty() || b.IsEmpty())
LogicError("AreEqual: one of the input matrices is empty.");
@ -894,6 +886,7 @@ void CPUSparseMatrix<ElemType>::NormalGrad(CPUMatrix<ElemType>& c, const ElemTyp
c.Resize(GetNumRows(), GetNumCols());
c.SetValue(0.0);
}
// BUGBUG: dimension/ownbuffer check?
if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
{
@ -927,6 +920,7 @@ ElemType CPUSparseMatrix<ElemType>::Adagrad(CPUMatrix<ElemType>& c, const bool n
c.Resize(GetNumRows(), GetNumCols());
c.SetValue(0.0);
}
// BUGBUG: dimension/ownbuffer check?
ElemType aveMultiplier = 0;
@ -1156,12 +1150,12 @@ CPUSparseMatrix<ElemType>& CPUSparseMatrix<ElemType>::InplaceSoftThreshold(const
template <class ElemType>
ElemType CPUSparseMatrix<ElemType>::FrobeniusNorm() const
{
if (this->IsEmpty())
LogicError("FrobeniusNorm: Matrix is empty.");
if (IsEmpty())
return 0;
ElemType v = 0;
ElemType v = 0; // TODO: do this in 'double'?
long m = (long) this->NzCount();
long m = (long) NzCount();
const ElemType* nzValues = NzValues();
//four-way unrolling
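The "// TODO: do this in 'double'" remark above is about accumulation error: summing many float squares in a float accumulator loses low-order bits. A hedged sketch of the suggested change (an assumption, not the committed code), shown without the four-way unrolling:
#include <cmath>
#include <cstddef>
template <class ElemType>
ElemType FrobeniusNormOf(const ElemType* nzValues, size_t nz)
{
    double v = 0; // accumulate in double regardless of ElemType
    for (size_t i = 0; i < nz; i++)
        v += (double) nzValues[i] * nzValues[i];
    return (ElemType) std::sqrt(v);
}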
@ -1183,8 +1177,8 @@ ElemType CPUSparseMatrix<ElemType>::FrobeniusNorm() const
template <class ElemType>
ElemType CPUSparseMatrix<ElemType>::SumOfAbsElements() const
{
if (this->IsEmpty())
LogicError("SumOfAbsElements: Matrix is empty.");
if (IsEmpty())
return 0;
if (sizeof(ElemType) == sizeof(double))
{
@ -1209,12 +1203,12 @@ ElemType CPUSparseMatrix<ElemType>::SumOfAbsElements() const
template <class ElemType>
ElemType CPUSparseMatrix<ElemType>::SumOfElements() const
{
if (this->IsEmpty())
LogicError("SumOfElements: Matrix is empty.");
if (IsEmpty())
return 0;
ElemType sum = 0;
ElemType sum = 0; // TODO: Do this in 'double'?
long m = (long) this->NzCount();
long m = (long) NzCount();
const ElemType* nzValues = NzValues();
//four-way unrolling
@ -1235,6 +1229,9 @@ ElemType CPUSparseMatrix<ElemType>::SumOfElements() const
template <typename ElemType>
MATH_API File& operator>>(File& stream, CPUSparseMatrix<ElemType>& us)
{
if (!us.OwnBuffer())
LogicError("Cannot read into a managed external matrix");
stream.GetMarker(fileMarkerBeginSection, std::wstring(L"BMAT"));
size_t elsize;
stream >> elsize;
@ -1351,4 +1348,5 @@ template CPUSparseMatrix<char>::~CPUSparseMatrix();
template CPUSparseMatrix<char> CPUSparseMatrix<char>::ColumnSlice(size_t startColumn, size_t numCols) const;
template CPUMatrix<char> CPUSparseMatrix<char>::CopyColumnSliceToDense(size_t startColumn, size_t numCols) const;
template CPUSparseMatrix<char>& CPUSparseMatrix<char>::operator=(const CPUSparseMatrix<char>& deepCopyFrom);
} } }
}}}

View file

@ -22,21 +22,23 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
class MATH_API CPUSparseMatrix : public BaseMatrix<ElemType>
{
typedef BaseMatrix<ElemType> B;
using B::m_elemSizeAllocated;
using B::m_computeDevice;
using B::m_externalBuffer;
using B::m_format;
using B::m_matrixName;
using B::m_numCols;
using B::m_numRows;
using B::m_nz;
using B::m_pArray; // without this, base members would require this-> qualification in GCC
using B::OwnBuffer;
using B::Clear;
typedef BaseMatrix<ElemType> Base;
using Base::m_elemSizeAllocated;
using Base::m_computeDevice;
using Base::m_externalBuffer;
using Base::m_format;
using Base::m_matrixName;
using Base::m_numCols;
using Base::m_numRows;
using Base::m_nz;
using Base::m_pArray; // without this, base members would require this-> qualification in GCC
using Base::Clear;
using Base::NzCount;
public:
using B::SetMatrixName;
using Base::OwnBuffer;
using Base::IsEmpty;
using Base::SetMatrixName;
private:
void ZeroInit();
@ -53,8 +55,8 @@ public:
~CPUSparseMatrix();
public:
using B::GetNumCols;
using B::GetNumRows;
using Base::GetNumCols;
using Base::GetNumRows;
void SetValue(const size_t row, const size_t col, ElemType val);
void SetValue(const CPUSparseMatrix<ElemType>& /*val*/);
@ -249,6 +251,8 @@ private:
size_t m_blockSize; // block size
size_t* m_blockIds; // block ids
size_t m_blockIdShift; // used to get efficient slice, actual col = blockIds[j] - m_blockIdShift
CPUSparseMatrix* m_sliceOf; // if this is a slice, then this points to the owning matrix object that we sliced from
};
typedef CPUSparseMatrix<float> CPUSingleSparseMatrix;
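Per the commit message, m_sliceOf is recorded but not consumed yet. One plausible future use (purely an assumption) is staleness checking: a slice that remembers its owner could detect that the owner's buffer was reallocated out from under it before reading or writing through its borrowed pointers.
// Hypothetical sketch, not in the commit.
struct SliceInfo
{
    const void* owner;       // role of m_sliceOf: the matrix we sliced from
    const void* ownerBuffer; // the owner's m_pArray at slice-creation time
};
inline bool SliceStillValid(const SliceInfo& s, const void* currentOwnerBuffer)
{
    // If the owner resized or reallocated, the slice's pointers are stale.
    return s.ownerBuffer == currentOwnerBuffer;
}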

View file

@ -158,7 +158,7 @@ enum ElementWiseOperator
Macro(ElementwiseProductWithLinearRectifierDerivativeFromOutput); \
Macro(ElementwiseProductWithLogDerivativeFromOutput); \
Macro(ElementwiseProductWithCosDerivative); \
//Macro(Index);
//Macro(Index);
#define ForAllTernaryOps(Macro) \
Macro(Cond); \
@ -170,6 +170,7 @@ enum ElementWiseOperator
enum MatrixFlagBitPosition
{
// TODO: remove all formats that are actually not supported
bitPosRowMajor = 0, // row major matrix
bitPosSparse = 1, // sparse matrix (COO if uncompressed)
bitPosCompressed = 2, // a compressed sparse format (CSC/CSR)
@ -179,6 +180,7 @@ enum MatrixFlagBitPosition
enum MatrixFormat
{
// TODO: remove all formats that are actually not supported
matrixFormatDense = 0, // default is dense
matrixFormatColMajor = 0, // default is column major
matrixFormatRowMajor = 1 << bitPosRowMajor, // row major matrix
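The format enumerators are composed from the bit positions above; presumably a compressed sparse column-major format sets the sparse and compressed bits and leaves the row-major bit clear. The exact enumerator values are truncated out of this hunk, so the following composition is an assumption:
// Assumed composition of the format flags from the listed bit positions.
enum
{
    fmtRowMajor   = 1 << 0, // bitPosRowMajor
    fmtSparse     = 1 << 1, // bitPosSparse
    fmtCompressed = 1 << 2, // bitPosCompressed
};
const int fmtSparseCSC = fmtSparse | fmtCompressed;               // column-major: row-major bit clear
const int fmtSparseCSR = fmtSparse | fmtCompressed | fmtRowMajor; // row-major variant
static_assert((fmtSparseCSC & fmtRowMajor) == 0, "CSC is column-major");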
@ -314,17 +316,46 @@ protected:
m_matrixName = nullptr;
}
void ZeroInit()
{
m_numRows = 0;
m_numCols = 0;
m_elemSizeAllocated = 0;
m_sliceViewOffset = 0;
m_externalBuffer = false;
m_pArray = nullptr;
m_nz = 0;
m_matrixName = nullptr;
}
// copy all metadata (but not the content that pArray points to)
void ShallowCopyFrom(const BaseMatrix& other)
{
m_format = other.m_format;
m_computeDevice = other.m_computeDevice;
m_numRows = other.m_numRows;
m_numCols = other.m_numCols;
m_elemSizeAllocated = other.m_elemSizeAllocated;
m_sliceViewOffset = other.m_sliceViewOffset;
m_externalBuffer = other.m_externalBuffer;
m_pArray = other.m_pArray;
m_nz = other.m_nz;
m_matrixName = other.m_matrixName;
}
protected:
MatrixFormat m_format;
mutable DEVICEID_TYPE m_computeDevice; // current GPU device Id or CPUDEVICE
size_t m_numRows;
size_t m_numCols;
size_t m_elemSizeAllocated;
size_t m_sliceViewOffset; // this is used to get a column slice view of a matrix in the Sparse CSC format
MatrixFormat m_format;
size_t m_sliceViewOffset; // this is used to get a column slice view of a matrix in the Sparse CSC format --TODO: move to sparse matrix implementations? Or common sparse base class?
bool m_externalBuffer; // if true, the buffer is a view into another matrix and must not be freed here
ElemType* m_pArray;
mutable DEVICEID_TYPE m_computeDevice; // current GPU device Id or CPUDEVICE
size_t m_nz; // Number of non-zero elements for sparse matrices (unused in other formats)
wchar_t* m_matrixName; // TODO: Use std::wstring?
};
} } }
}}}
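ShallowCopyFrom copies m_pArray itself, so immediately after a move both objects alias one buffer; the moveFrom.ZeroInit() calls in the sparse-matrix move operations are what prevent a double free. A simplified, hypothetical stand-in for the pattern:
#include <cstddef>
struct MiniMatrix // hypothetical stand-in for a BaseMatrix-derived type
{
    float* m_pArray = nullptr;
    size_t m_numRows = 0, m_numCols = 0;
    void ShallowCopyFrom(const MiniMatrix& other) // metadata incl. the raw pointer
    {
        m_pArray  = other.m_pArray;
        m_numRows = other.m_numRows;
        m_numCols = other.m_numCols;
    }
    void ZeroInit() { m_pArray = nullptr; m_numRows = m_numCols = 0; }
    MiniMatrix() = default;
    MiniMatrix(MiniMatrix&& moveFrom)
    {
        ShallowCopyFrom(moveFrom); // both objects now alias m_pArray
        moveFrom.ZeroInit();       // source forgets the buffer, so only we free it
    }
    ~MiniMatrix() { delete[] m_pArray; }
};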

View file

@ -579,7 +579,7 @@ static void CudaCall(ERRTYPE retCode, const char* exprString, const char* libNam
class SyncGuard
{
bool DoSync()
static bool DoSync()
{
#ifdef NO_SYNC // this strange way of writing it allows modifying this variable at runtime in the debugger
static bool do_sync = false;
@ -588,27 +588,30 @@ class SyncGuard
#endif
return do_sync;
}
cudaEvent_t done;
cudaEvent_t m_done;
public:
SyncGuard()
{
done = nullptr;
m_done = nullptr;
if (DoSync())
CUDA_CALL(cudaEventCreate(&done));
CUDA_CALL(cudaEventCreate(&m_done));
}
~SyncGuard()
{
if (DoSync())
{
try
// The regular use of this destructor is to synchronize the GPU, but also
// to check for errors. So this destructor is where CUDA errors would be thrown.
// If this destructor runs during stack unwinding, then a different error has
// already happened that should be reported; so we only clean up the resource.
if (std::uncaught_exception())
cudaEventDestroy(m_done);
else
{
CUDA_CALL(cudaEventRecord(done));
CUDA_CALL(cudaEventSynchronize(done));
CUDA_CALL(cudaEventDestroy(done));
}
catch (const std::exception& e) // can't throw in destructors!
{
std::cerr << "SyncGuard: Destructor swallowing CUDA failure: " << e.what() << std::endl;
// failures in a prior launch might be reported here
CUDA_CALL(cudaEventRecord(m_done));
CUDA_CALL(cudaEventSynchronize(m_done));
CUDA_CALL(cudaEventDestroy(m_done));
}
}
}
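The fix above makes the destructor the place where deferred CUDA launch failures surface, while staying silent during stack unwinding, where throwing would call std::terminate. A minimal CUDA-free sketch of the pattern; note that a throwing destructor must be declared noexcept(false) since C++11, and that std::uncaught_exception() is the pre-C++17 spelling (newer code would use std::uncaught_exceptions()):
#include <exception>
struct Guard // hypothetical stand-in for SyncGuard
{
    ~Guard() noexcept(false) // may throw, but only when no exception is in flight
    {
        if (std::uncaught_exception())
            cleanupOnly();    // unwinding: a prior error is already being reported
        else
            checkForErrors(); // normal path: deferred failures are thrown from here
    }
    void cleanupOnly() noexcept {}
    void checkForErrors() {} // stands in for the cudaEventRecord/Synchronize/Destroy calls
};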

View file

@ -52,6 +52,10 @@ GPUSPARSE_INDEX_TYPE GPUSparseMatrix<ElemType>::SecondaryIndexValueAt(size_t idx
return value;
}
//-------------------------------------------------------------------------
// construction and conversion
//-------------------------------------------------------------------------
template <class ElemType>
void GPUSparseMatrix<ElemType>::ZeroInit(const MatrixFormat matrixFormat, const DEVICEID_TYPE computeDevice)
{
@ -59,25 +63,18 @@ void GPUSparseMatrix<ElemType>::ZeroInit(const MatrixFormat matrixFormat, const
matrixFormat != MatrixFormat::matrixFormatSparseBlockCol && matrixFormat != MatrixFormat::matrixFormatSparseBlockRow)
{
LogicError("GPUSparseMatrix: unsupported sparse matrix format");
// BUGBUG: Then why even define others?
}
Base::ZeroInit();
m_computeDevice = computeDevice; // current GPU device Id
m_format = matrixFormat;
m_computeDevice = computeDevice; // current GPU device Id
m_numRows = 0;
m_numCols = 0;
m_elemSizeAllocated = m_nz = 0; // Number of non-zero elements
m_sliceOf = nullptr;
m_totalBufferSizeAllocated = 0;
m_sliceViewOffset = 0;
m_format = matrixFormat;
m_externalBuffer = false;
m_pArray = nullptr;
m_matrixName = nullptr;
m_blockSize = 0;
m_rowToId = nullptr;
m_tempHostBuffer = nullptr;
m_tempHostBufferSize = 0;
m_blockSize = 0;
m_rowToId = nullptr;
m_tempHostBuffer = nullptr;
m_tempHostBufferSize = 0;
}
template <class ElemType>
@ -123,13 +120,15 @@ DEVICEID_TYPE GPUSparseMatrix<ElemType>::PrepareDevice(DEVICEID_TYPE deviceId /*
}
template <class ElemType>
void GPUSparseMatrix<ElemType>::DeepCopy(const GPUSparseMatrix<ElemType>& deepCopy)
/*private*/ void GPUSparseMatrix<ElemType>::DeepCopy(const GPUSparseMatrix<ElemType>& deepCopy)
{
ChangeDeviceTo(deepCopy.m_computeDevice);
deepCopy.PrepareDevice();
Resize(deepCopy.m_numRows, deepCopy.m_numCols, deepCopy.GetNumNZElements(), deepCopy.m_format, true, false);
m_nz = deepCopy.m_nz;
m_externalBuffer = false;
m_sliceOf = nullptr;
m_nz = deepCopy.m_nz;
m_sliceViewOffset = 0; // reset to zero as we only start copying the indices starting from the offset in the source matrix
CUDA_CALL(cudaMemcpy(BufferPointer(), deepCopy.NzValues(), NzSize(), cudaMemcpyDeviceToDevice));
@ -146,7 +145,6 @@ void GPUSparseMatrix<ElemType>::DeepCopy(const GPUSparseMatrix<ElemType>& deepCo
GetNumNZElements());
}
m_externalBuffer = false;
SetMatrixName(deepCopy.m_matrixName);
// TODO: copy other variables used only for the class-based LM
@ -156,7 +154,7 @@ template <class ElemType>
void GPUSparseMatrix<ElemType>::SetValue(const GPUSparseMatrix<ElemType>& deepCopy)
{
if (!OwnBuffer())
LogicError("Cannot SetValue on Managed external matrix");
LogicError("Cannot SetValue on managed external matrix");
DeepCopy(deepCopy);
}
@ -190,6 +188,9 @@ void GPUSparseMatrix<ElemType>::SetValue(const CPUSparseMatrix<ElemType>& deepCo
template <class ElemType>
void GPUSparseMatrix<ElemType>::CopyToCPUSparseMatrix(CPUSparseMatrix<ElemType>& cpuSparseMatrix) const
{
if (!cpuSparseMatrix.OwnBuffer())
LogicError("Cannot CopyToCPUSparseMatrix on managed external matrix");
cpuSparseMatrix.SetFormat(GetFormat());
if (IsEmpty())
{
@ -307,6 +308,9 @@ void GPUSparseMatrix<ElemType>::CopyToDenseMatrix(GPUMatrix<ElemType>& denseMatr
template <class ElemType>
void GPUSparseMatrix<ElemType>::ConvertToSparseFormat(MatrixFormat newFormat, GPUSparseMatrix<ElemType>& outMatrix) const
{
if (!outMatrix.OwnBuffer())
LogicError("Cannot ConvertToSparseFormat to managed external matrix");
if (IsEmpty())
{
outMatrix.ZeroInit(newFormat, GetComputeDeviceId());
@ -528,26 +532,15 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::operator=(const GPUSparseM
template <class ElemType>
GPUSparseMatrix<ElemType>::GPUSparseMatrix(GPUSparseMatrix<ElemType>&& moveFrom)
{
m_computeDevice = moveFrom.m_computeDevice;
m_numRows = moveFrom.m_numRows;
m_numCols = moveFrom.m_numCols;
m_nz = moveFrom.m_nz;
m_elemSizeAllocated = moveFrom.m_elemSizeAllocated;
Base::ShallowCopyFrom(moveFrom);
// TODO: implement this using operator= or a shared function
m_totalBufferSizeAllocated = moveFrom.m_totalBufferSizeAllocated;
m_pArray = moveFrom.m_pArray;
m_sliceViewOffset = moveFrom.m_sliceViewOffset;
m_format = moveFrom.m_format;
m_externalBuffer = moveFrom.m_externalBuffer;
m_matrixName = moveFrom.m_matrixName;
m_blockSize = moveFrom.m_blockSize;
m_rowToId = moveFrom.m_rowToId;
m_tempHostBuffer = moveFrom.m_tempHostBuffer;
m_tempHostBufferSize = moveFrom.m_tempHostBufferSize;
moveFrom.ZeroInit(moveFrom.m_format, moveFrom.m_computeDevice); // so that memory in moveFrom is not freeed
m_sliceOf = moveFrom.m_sliceOf;
m_blockSize = moveFrom.m_blockSize;
m_rowToId = moveFrom.m_rowToId;
m_tempHostBuffer = moveFrom.m_tempHostBuffer;
m_tempHostBufferSize = moveFrom.m_tempHostBufferSize;
moveFrom.ZeroInit(moveFrom.m_format, moveFrom.m_computeDevice); // so that memory in moveFrom is not freed
}
template <class ElemType>
@ -557,26 +550,13 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::operator=(GPUSparseMatrix<
{
if (OwnBuffer())
ReleaseMemory(); // always delete the data pointer since we will use the pointer from moveFrom
m_computeDevice = moveFrom.m_computeDevice;
m_numRows = moveFrom.m_numRows;
m_numCols = moveFrom.m_numCols;
m_nz = moveFrom.m_nz;
m_elemSizeAllocated = moveFrom.m_elemSizeAllocated;
Base::ShallowCopyFrom(moveFrom);
m_totalBufferSizeAllocated = moveFrom.m_totalBufferSizeAllocated;
m_pArray = moveFrom.m_pArray;
m_sliceViewOffset = moveFrom.m_sliceViewOffset;
m_format = moveFrom.m_format;
m_externalBuffer = moveFrom.m_externalBuffer;
m_matrixName = moveFrom.m_matrixName;
m_blockSize = moveFrom.m_blockSize;
m_rowToId = moveFrom.m_rowToId;
m_tempHostBuffer = moveFrom.m_tempHostBuffer;
m_tempHostBufferSize = moveFrom.m_tempHostBufferSize;
m_sliceOf = moveFrom.m_sliceOf;
m_blockSize = moveFrom.m_blockSize;
m_rowToId = moveFrom.m_rowToId;
m_tempHostBuffer = moveFrom.m_tempHostBuffer;
m_tempHostBufferSize = moveFrom.m_tempHostBufferSize;
moveFrom.ZeroInit(moveFrom.m_format, moveFrom.m_computeDevice);
}
@ -590,11 +570,10 @@ GPUSparseMatrix<ElemType>::~GPUSparseMatrix()
}
template <class ElemType>
void GPUSparseMatrix<ElemType>::ReleaseMemory()
/*private*/ void GPUSparseMatrix<ElemType>::ReleaseMemory()
{
// If OwnBuffer() is false then this matrix
// is simply a view over another matrix. In that
// case we shouldn't free anything.
// If OwnBuffer() is false then this matrix is a view over another matrix.
// In that case we shouldn't free anything.
if (OwnBuffer())
{
delete[] m_matrixName;
@ -609,7 +588,6 @@ void GPUSparseMatrix<ElemType>::ReleaseMemory()
TracingGPUMemoryAllocator::Free<GPUSPARSE_INDEX_TYPE>(m_computeDevice, m_rowToId);
m_rowToId = nullptr;
}
ZeroInit(m_format, m_computeDevice);
}
@ -627,17 +605,18 @@ void GPUSparseMatrix<ElemType>::ResizeAsAndCopyIndexFrom(const GPUSparseMatrix<E
}
//-------------------------------------------------------------------------
// Start of new GPU Sparse Matrix code
// main operations
//-------------------------------------------------------------------------
template <class ElemType>
void GPUSparseMatrix<ElemType>::Reshape(const size_t numRows, const size_t numCols)
{
if (!OwnBuffer())
LogicError("GPUSparseMatrix::Reshape: Cannot Reshape since the buffer is managed externally.");
if (m_numRows == numRows && m_numCols == numCols)
return;
if (!OwnBuffer())
LogicError("GPUSparseMatrix::Reshape: Cannot Reshape since the buffer is managed externally.");
if (m_format != MatrixFormat::matrixFormatSparseCSC)
NOT_IMPLEMENTED;
@ -688,8 +667,8 @@ void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCol
Resize(numRows, numCols, numNZElemToReserve, GetFormat(), growOnly, keepExistingValues);
}
//WARNING: When memory is reallocated existing information will be lost, workaround is to allocte enough memory from start.
//TODO: add keepExistingValues (default to true) argument so that the existing values are kept even after reallocation
// WARNING: When memory is reallocated, existing information will be lost.
// TODO: add keepExistingValues (default to true) argument so that the existing values are kept even after reallocation
template <class ElemType>
void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const MatrixFormat matrixFormat, const bool growOnly /*= true*/, bool keepExistingValues /*=true*/)
{
@ -749,13 +728,17 @@ void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCol
m_format = matrixFormat;
}
//Reset matrix so it can be reused
// Reset matrix to 0.
template <class ElemType>
void GPUSparseMatrix<ElemType>::Reset()
{
if (!OwnBuffer())
LogicError("Cannot Reset since the buffer is managed externally.");
m_nz = 0;
m_blockSize = 0;
}
// copy features to GPU
template <class ElemType>
void GPUSparseMatrix<ElemType>::SetMatrixFromCSRFormat(const GPUSPARSE_INDEX_TYPE* h_CSRRow, const GPUSPARSE_INDEX_TYPE* h_Col, const ElemType* h_Val,
@ -966,9 +949,11 @@ void GPUSparseMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const GPU
}
}
// dense X sparse = dense
template <class ElemType>
void GPUSparseMatrix<ElemType>::ConvolveAndWeightedAdd(ElemType alpha, const GPUMatrix<ElemType>& lhs, const bool transposeA,
const GPUSparseMatrix<ElemType>& rhs, const bool transposeB, ElemType beta, GPUMatrix<ElemType>& c, size_t numChannels, size_t horizontalSubsample, bool padding, bool channelwise)
const GPUSparseMatrix<ElemType>& rhs, const bool transposeB, ElemType beta,
GPUMatrix<ElemType>& c, size_t numChannels, size_t horizontalSubsample, bool padding, bool channelwise)
{
if (lhs.GetComputeDeviceId() != rhs.GetComputeDeviceId() || (lhs.GetComputeDeviceId() != c.GetComputeDeviceId()))
RuntimeError("GPUSparseMatrix<ElemType>::ConvolveAndWeightedAdd: All matrices must be on the same GPU");
@ -1222,7 +1207,7 @@ void GPUSparseMatrix<ElemType>::MultiplyAndAdd(ElemType alpha, const GPUMatrix<E
}
}
//find the rows of rhs with values
// find the rows of rhs with values
template <class ElemType>
size_t GPUSparseMatrix<ElemType>::IdentifyRowsWithValues() const
{
@ -1404,11 +1389,7 @@ ElemType GPUSparseMatrix<ElemType>::Adagrad(GPUMatrix<ElemType>& c, const bool n
}
}
//-------------------------------------------------------------------------
// End of new GPU Sparse Matrix code
//-------------------------------------------------------------------------
//sparse X dense = dense
// sparse X dense = dense
template <class ElemType>
void GPUSparseMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const GPUSparseMatrix<ElemType>& a, const bool transposeA,
const GPUMatrix<ElemType>& b, const bool transposeD, ElemType beta, GPUMatrix<ElemType>& c)
@ -1758,6 +1739,7 @@ void GPUSparseMatrix<ElemType>::ElementWisePower(ElemType alpha, const GPUSparse
}
}
// sparse x dense = scalar
template <class ElemType>
ElemType GPUSparseMatrix<ElemType>::InnerProductOfMatrices(const GPUSparseMatrix<ElemType>& a, const GPUMatrix<ElemType>& b)
{
@ -1878,8 +1860,8 @@ bool GPUSparseMatrix<ElemType>::IsValid() const
}
template <class ElemType>
bool GPUSparseMatrix<ElemType>::AreEqual(const GPUSparseMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b,
const ElemType threshold)
/*static*/ bool GPUSparseMatrix<ElemType>::AreEqual(const GPUSparseMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b,
const ElemType threshold)
{
if (a.GetNumNZElements() != b.GetNumNZElements() || a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols())
return false;
@ -1908,8 +1890,8 @@ bool GPUSparseMatrix<ElemType>::AreEqual(const GPUSparseMatrix<ElemType>& a, con
}
template <class ElemType>
bool GPUSparseMatrix<ElemType>::AreEqual(const GPUMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b,
const ElemType threshold)
/*static*/ bool GPUSparseMatrix<ElemType>::AreEqual(const GPUMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b,
const ElemType threshold)
{
if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols())
return false;
@ -1919,8 +1901,8 @@ bool GPUSparseMatrix<ElemType>::AreEqual(const GPUMatrix<ElemType>& a, const GPU
}
template <class ElemType>
bool GPUSparseMatrix<ElemType>::AreEqual(const GPUSparseMatrix<ElemType>& a, const GPUMatrix<ElemType>& b,
const ElemType threshold)
/*static*/ bool GPUSparseMatrix<ElemType>::AreEqual(const GPUSparseMatrix<ElemType>& a, const GPUMatrix<ElemType>& b,
const ElemType threshold)
{
if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols())
return false;
@ -1940,6 +1922,7 @@ bool GPUSparseMatrix<ElemType>::IsEqualTo(const GPUMatrix<ElemType>& a, const El
{
return AreEqual(*this, a, threshold);
}
#pragma endregion Static BLAS Functions
#pragma region Member BLAS Functions
@ -1958,6 +1941,7 @@ DEVICEID_TYPE GPUSparseMatrix<ElemType>::GetComputeDeviceId() const
return m_computeDevice;
}
// sparse x dense = dense
template <class ElemType>
GPUMatrix<ElemType> GPUSparseMatrix<ElemType>::ElementProductOf(const GPUSparseMatrix<ElemType>& a, const GPUMatrix<ElemType>& b)
{
@ -1980,6 +1964,7 @@ GPUMatrix<ElemType> GPUSparseMatrix<ElemType>::ElementProductOf(const GPUSparseM
return c;
}
// sparse x dense = dense
template <class ElemType>
GPUMatrix<ElemType> GPUSparseMatrix<ElemType>::ElementProductOf(const GPUMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b)
{
@ -2002,6 +1987,7 @@ GPUSparseMatrix<ElemType> GPUSparseMatrix<ElemType>::operator-(const GPUSparseMa
return res;
}
// TODO: This is an unusual use of this operator. Remove this.
template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::operator^=(ElemType alpha)
{
@ -2010,6 +1996,7 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::operator^=(ElemType alpha)
return us;
}
// TODO: This is an unusual use of this operator. Remove this.
template <class ElemType>
GPUSparseMatrix<ElemType> GPUSparseMatrix<ElemType>::operator^(ElemType alpha) const
{
@ -2154,6 +2141,7 @@ GPUSparseMatrix<ElemType> GPUSparseMatrix<ElemType>::ColumnSlice(size_t startCol
slice.m_pArray = m_pArray;
slice.m_format = m_format;
slice.m_externalBuffer = true;
slice.m_sliceOf = const_cast<GPUSparseMatrix<ElemType>*>(this); // BUGBUG: ColumnSlice() returns a reference to a mutable matrix, even if itself is 'const'; should not be.
slice.m_matrixName = m_matrixName;
slice.m_blockSize = m_blockSize;
slice.m_rowToId = m_rowToId;
@ -2231,7 +2219,7 @@ template <class ElemType>
ElemType GPUSparseMatrix<ElemType>::SumOfAbsElements() const
{
if (IsEmpty())
LogicError("SumOfAbsElements: Matrix is empty");
return 0;
cublasHandle_t cuHandle = GPUMatrix<ElemType>::GetCublasHandle(GetComputeDeviceId());
if (sizeof(ElemType) == sizeof(float))
@ -2264,11 +2252,12 @@ ElemType GPUSparseMatrix<ElemType>::SumOfElements() const
return h_sum;
}
// sqrt(sum all elements^2)
template <class ElemType>
ElemType GPUSparseMatrix<ElemType>::FrobeniusNorm() const
{
if (IsEmpty())
LogicError("FrobeniusNorm: Matrix is empty.");
return 0;
ElemType* d_sum = TracingGPUMemoryAllocator::Allocate<ElemType>(m_computeDevice, 1);
ElemType h_sum = 0;
@ -2287,7 +2276,7 @@ template <class ElemType>
ElemType GPUSparseMatrix<ElemType>::MatrixNormInf() const
{
if (IsEmpty())
LogicError("MatrixNorm1: Matrix is empty.");
return 0;
ElemType* d_maxAbs = TracingGPUMemoryAllocator::Allocate<ElemType>(m_computeDevice, 1);
ElemType h_maxAbs = 0;
@ -2305,8 +2294,6 @@ ElemType GPUSparseMatrix<ElemType>::MatrixNormInf() const
template <class ElemType>
ElemType GPUSparseMatrix<ElemType>::MatrixNorm1() const
{
if (IsEmpty())
LogicError("MatrixNorm1: Matrix is empty.");
return SumOfAbsElements();
}
@ -2317,6 +2304,10 @@ ElemType GPUSparseMatrix<ElemType>::MatrixNorm1() const
template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::ElementInverse()
{
#if 1
// Note: This makes no sense because sparse matrices are defined by having lots of zeroes.
NOT_IMPLEMENTED;
#else
if (!OwnBuffer())
LogicError("Cannot modify since the buffer is managed externally.");
@ -2328,29 +2319,45 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::ElementInverse()
SyncGuard syncGuard;
_elemInverse<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(NzValues(), N);
return *this;
#endif
}
template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignElementInverseOf(const GPUSparseMatrix<ElemType>& a)
{
#if 1
// Note: This makes no sense because sparse matrices are defined by having lots of zeroes.
UNUSED(a); NOT_IMPLEMENTED;
#else
SetValue(a);
return ElementInverse();
#endif
}
template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceSigmoid()
{
#if 1
// Note: This makes no sense because sigmoid(0) != 0.
NOT_IMPLEMENTED;
#else
performElementWiseFunction(ElementWiseOperator::opSigmoid, *this);
return *this;
#endif
}
template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignSigmoidOf(const GPUSparseMatrix<ElemType>& a)
{
#if 1
// Note: This makes no sense because sigmoid(0) != 0.
UNUSED(a); NOT_IMPLEMENTED;
#else
if (this != &a)
Resize(a.GetNumRows(), a.GetNumCols());
performElementWiseFunction(ElementWiseOperator::opSigmoid, a);
return *this;
#endif
}
template <class ElemType>
@ -2404,33 +2411,53 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignSqrtOf(const GPUSpar
template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceExp()
{
#if 1
// Note: This makes no sense because exp(0) != 0.
NOT_IMPLEMENTED;
#else
performElementWiseFunction(ElementWiseOperator::opExp, *this);
return *this;
#endif
}
template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignExpOf(const GPUSparseMatrix<ElemType>& a)
{
#if 1
// Note: This makes no sense because exp(0) != 0.
UNUSED(a); NOT_IMPLEMENTED;
#else
if (this != &a)
Resize(a.GetNumRows(), a.GetNumCols());
performElementWiseFunction(ElementWiseOperator::opExp, a);
return *this;
#endif
}
template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceLog()
{
#if 1
// Note: This makes no sense because log(0) != 0.
NOT_IMPLEMENTED;
#else
performElementWiseFunction(ElementWiseOperator::opLog, *this);
return *this;
#endif
}
template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignLogOf(const GPUSparseMatrix<ElemType>& a)
{
#if 1
// Note: This makes no sense because log(0) != 0.
UNUSED(a); NOT_IMPLEMENTED;
#else
if (this != &a)
Resize(a.GetNumRows(), a.GetNumCols());
performElementWiseFunction(ElementWiseOperator::opLog, a);
return *this;
#endif
}
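These #if 1 blocks implement the "disabled meaningless elementwise sparse ops" item from the commit message: sparse storage only materializes the nonzeros, so any f with f(0) != 0 (exp, log, sigmoid, 1/x) would have to turn every implicit zero into a stored value, which densifies the matrix. A toy demonstration of the failure mode (plain arrays, not the CNTK API):
#include <cmath>
int main()
{
    // A 2x2 matrix with two stored nonzeros; the other two entries are
    // implicit zeros that no buffer represents.
    float nzValues[2] = {1.0f, 2.0f};
    for (float& v : nzValues) // naive "in-place exp" over stored values only
        v = std::exp(v);
    // The two unstored entries should now equal exp(0) == 1, but they remain
    // implicit zeros, so the result is wrong unless the matrix is densified.
    return 0;
}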
template <class ElemType>
@ -2449,6 +2476,7 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignAbsOf(const GPUSpars
return *this;
}
// TODO: Check whether these functions always map 0 to 0.
template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceTruncateBottom(const ElemType threshold)
{
@ -2543,7 +2571,7 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::SetToZeroIfAbsLessThan(con
//outBuffer should be allocated to be >= size by the caller
template <class ElemType>
template <class OutType, class InType>
void GPUSparseMatrix<ElemType>::CopyBuffer(OutType* outBuffer, const InType* inBuffer, const size_t size)
/*private*/ void GPUSparseMatrix<ElemType>::CopyBuffer(OutType* outBuffer, const InType* inBuffer, const size_t size)
{
#pragma omp parallel for
for (size_t i = 0; i < (size & ~3); i += 4)
@ -2629,6 +2657,9 @@ template GPUSparseMatrix<char>& GPUSparseMatrix<char>::operator=(GPUSparseMatrix
template <class ElemType>
MATH_API File& operator>>(File& stream, GPUSparseMatrix<ElemType>& us)
{
if (!us.OwnBuffer())
LogicError("Cannot read into a managed external matrix");
stream.GetMarker(fileMarkerBeginSection, std::wstring(L"BMAT"));
size_t elsize;
stream >> elsize;

View file

@ -23,29 +23,30 @@ template <class ElemType>
class MATH_API GPUSparseMatrix : public BaseMatrix<ElemType>
{
public:
typedef BaseMatrix<ElemType> B;
using B::m_numRows;
using B::m_numCols;
using B::m_pArray;
using B::m_elemSizeAllocated;
using B::m_sliceViewOffset;
using B::m_nz;
using B::m_format;
using B::m_computeDevice;
using B::m_externalBuffer;
using B::m_matrixName;
using B::OwnBuffer;
using B::GetFormat;
using B::SetFormat;
using B::GetNumRows;
using B::GetNumCols;
using B::IsEmpty;
using B::SetComputeDeviceId;
using B::SetMatrixName;
using B::SetNzCount;
using B::Clear;
typedef BaseMatrix<ElemType> Base;
using Base::m_numRows;
using Base::m_numCols;
using Base::m_pArray;
using Base::m_elemSizeAllocated;
using Base::m_sliceViewOffset;
using Base::m_nz;
using Base::m_format;
using Base::m_computeDevice;
using Base::m_externalBuffer;
using Base::m_matrixName;
using Base::OwnBuffer;
using Base::GetFormat;
using Base::SetFormat;
using Base::GetNumRows;
using Base::GetNumCols;
using Base::SetComputeDeviceId;
using Base::SetMatrixName;
using Base::SetNzCount;
using Base::Clear;
// without this, base members would require this-> qualification in GCC
public:
using Base::IsEmpty;
GPUSparseMatrix(const size_t numRows, const size_t numCols, const size_t numNZ, DEVICEID_TYPE computeDevice, const MatrixFormat matrixFormat = MatrixFormat::matrixFormatSparseCSR);
explicit GPUSparseMatrix(DEVICEID_TYPE computeDevice, const MatrixFormat matrixFormat = MatrixFormat::matrixFormatSparseCSR);
@ -390,6 +391,8 @@ private:
mutable void* m_tempHostBuffer; // used to copy values.
mutable size_t m_tempHostBufferSize;
GPUSparseMatrix* m_sliceOf; // if this is a slice, then this points to the owning matrix object that we sliced from
};
}}}

View file

@ -169,7 +169,7 @@ public:
// REVIEW alexeyk: GPU version copies from device to host only, implement all versions (device <-> host).
void CopySection(size_t numRows, size_t numCols, ElemType* dst, size_t colStride) const;
Matrix<ElemType> ColumnSlice(size_t startColumn, size_t numCols) const;
Matrix<ElemType> ColumnSlice(size_t startColumn, size_t numCols) const; // note: 'const' is misleading here, as the returned matrix is a mutable reference
// difference between AssignColumnSlice and SetColumnSlice
// AssignColumnSlice : this(:, startColumn:startColumn+numCols-1) = fromMatrix(:, startColumn: startColumn+numCols-1)

View file

@ -4,7 +4,7 @@ DataDir = $RootDir$
# deviceId = -1 for CPU, >= 0 for GPU devices
deviceId = -1
precision = "float"
precision = "double"
Simple_Test = [
reader = [