some clean-up of sparse matrices; disabled meaningless elementwise sparse ops such as log(); sparse matrix slices now keep track of their original matrix (not used yet); bug fix: ~SyncGuard() should not swallow errors, since CUDA errors may get discovered here; bug fix: HTKMLFReader test 16 must use consistent precision
This commit is contained in:
Parent: d0a5eb527d
Commit: 7793b7be93
@@ -59,16 +59,13 @@
 #define BLAS_COLMAJOR (int) MatrixOrder::ColMajor,
 #endif

 #define SWAP(a, b)  \
     {               \
         (a) ^= (b); \
         (b) ^= (a); \
         (a) ^= (b); \
     }
 // TODO: Move to CommonMatrix.h
 #define IDX2C(i, j, ld) (((j) * (ld)) + (i)) // 0 based indexing

 namespace Microsoft { namespace MSR { namespace CNTK {

 #pragma region Helpful Enum Definitions

 enum class MatrixOrder
 {
     RowMajor = 101, // row-major arrays
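A side note on the two macros above: SWAP uses the XOR trick, which only works for integer types and silently zeroes both arguments if they alias the same object, and IDX2C maps a (row, column) pair to a linear offset in column-major storage. A minimal standalone illustration (not part of the diff):

#include <cassert>

#define IDX2C(i, j, ld) (((j) * (ld)) + (i)) // 0-based, column-major: offset = j * ld + i

int main()
{
    const int rows = 3, cols = 2;     // ld (leading dimension) == rows for column-major
    float a[rows * cols] = {0};
    a[IDX2C(2, 1, rows)] = 42.0f;     // element (row 2, column 1)
    assert(a[1 * rows + 2] == 42.0f); // same offset written out by hand
    return 0;
}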
@@ -77,56 +74,55 @@ enum class MatrixOrder

 enum class MatrixTranspose : char
 {
     NoTrans = 'N',  // trans='N'
     Trans = 'T',    // trans='T'
     ConjTrans = 'C' // trans='C'
 };

 enum class SymMatrixType : char
 {
     Up = 'U',          // symmetric matrix is stored in the upper part
     Low = 'L',         // symmetric matrix is stored in the lower part
     Full = 'F',        // fully populated
     NotSymmetric = 'N' // not a symmetric matrix
 };

 enum class MatrixOpSide : char
 {
     Left = 'L',  // left multiply
     Right = 'R', // right multiply
 };

 #pragma endregion Helpful Enum Definitions

 #pragma region Constructors and Destructor

-//should only be used by constructors.
-template <class ElemType>
-void CPUSparseMatrix<ElemType>::ZeroInit()
-{
-    m_numRows = 0;
-    m_numCols = 0;
-    m_elemSizeAllocated = 0;
-    m_compIndexSize = 0;
-    m_externalBuffer = false;
-    m_computeDevice = CPUDEVICE;
-    m_nz = 0;
-    m_matrixName = NULL;
+//-------------------------------------------------------------------------
+// construction and conversion
+//-------------------------------------------------------------------------
+
+// should only be used by constructors.
+template <class ElemType>
+/*private*/ void CPUSparseMatrix<ElemType>::ZeroInit()
+{
+    Base::ZeroInit();
+    m_computeDevice = CPUDEVICE;
+
+    m_sliceOf = nullptr;
+    m_compIndexSize = 0;
     // if (m_format == MatrixFormat::matrixFormatSparseCSC || m_format == MatrixFormat::matrixFormatSparseCSR)
     {
         m_colIdx = -1;
-        m_pArray = NULL;
-        m_unCompIndex = NULL;
-        m_compIndex = NULL;
+        m_unCompIndex = nullptr;
+        m_compIndex = nullptr;
     }
     // else if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
     {
         m_blockSize = 0;
-        m_pArray = NULL;
-        m_blockIds = NULL;
+        m_blockIdShift = 0;
+        m_blockIds = nullptr;
     }
-    m_nzValues = NULL;
+    m_nzValues = nullptr;
 }

 //should only be used by constructors.
@@ -144,19 +140,17 @@ void CPUSparseMatrix<ElemType>::CheckInit(const MatrixFormat format)
 template <class ElemType>
 CPUSparseMatrix<ElemType>::CPUSparseMatrix(const MatrixFormat format)
 {
-
     CheckInit(format);
 }

 template <class ElemType>
 CPUSparseMatrix<ElemType>::CPUSparseMatrix(const MatrixFormat format, const size_t numRows, const size_t numCols, const size_t size)
 {
-
     CheckInit(format);
     Resize(numRows, numCols, size, true, false);
 }

-//copy constructor, deep copy
+// copy constructor, deep copy
 template <class ElemType>
 CPUSparseMatrix<ElemType>::CPUSparseMatrix(const CPUSparseMatrix<ElemType>& deepCopyFrom)
 {

@@ -166,7 +160,7 @@ CPUSparseMatrix<ElemType>::CPUSparseMatrix(const CPUSparseMatrix<ElemType>& deepCopyFrom)
     SetMatrixName(deepCopyFrom.m_matrixName);
 }

-//assignment operator, deep copy
+// assignment operator, deep copy
 template <class ElemType>
 CPUSparseMatrix<ElemType>& CPUSparseMatrix<ElemType>::operator=(const CPUSparseMatrix<ElemType>& deepCopyFrom)
 {
@@ -177,29 +171,23 @@ CPUSparseMatrix<ElemType>& CPUSparseMatrix<ElemType>::operator=(const CPUSparseMatrix<ElemType>& deepCopyFrom)
     return *this;
 }

-//move constructor, shallow copy
+// move constructor, shallow copy
 template <class ElemType>
 CPUSparseMatrix<ElemType>::CPUSparseMatrix(CPUSparseMatrix<ElemType>&& moveFrom)
 {
-    m_format = moveFrom.m_format;
-    m_numRows = moveFrom.m_numRows;
-    m_numCols = moveFrom.m_numCols;
-    m_elemSizeAllocated = moveFrom.m_elemSizeAllocated;
+    Base::ShallowCopyFrom(moveFrom);
+    // BUGBUG: This did not use to copy m_sliceViewOffset, I presume it should be copied? It is now.
+
     m_compIndexSize = moveFrom.m_compIndexSize;
-    m_externalBuffer = moveFrom.m_externalBuffer;
-    m_computeDevice = moveFrom.m_computeDevice;
-    m_nz = moveFrom.m_nz;
-    m_matrixName = moveFrom.m_matrixName;

     m_colIdx = moveFrom.m_colIdx;
-    m_pArray = moveFrom.m_pArray;
     m_nzValues = moveFrom.m_nzValues;
     m_unCompIndex = moveFrom.m_unCompIndex;
     m_compIndex = moveFrom.m_compIndex;

     m_blockSize = moveFrom.m_blockSize;
     m_blockIdShift = moveFrom.m_blockIdShift;
     m_blockIds = moveFrom.m_blockIds;

     // release the pointer from the source object so that the destructor won't release it twice
     moveFrom.ZeroInit();
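The move constructor and move assignment above share one pattern: shallow-copy every member, then reset the source so its destructor does not free the stolen buffer a second time. A minimal sketch of that pattern with a hypothetical class (not CNTK code):

#include <cstddef>

struct Buffer
{
    float* data = nullptr;
    std::size_t size = 0;

    Buffer() = default;
    Buffer(Buffer&& moveFrom) noexcept
        : data(moveFrom.data), size(moveFrom.size) // the Base::ShallowCopyFrom(moveFrom) step
    {
        moveFrom.data = nullptr;                   // the moveFrom.ZeroInit() step
        moveFrom.size = 0;
    }
    ~Buffer() { delete[] data; }                   // safe: the moved-from object now owns nothing
};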
@@ -213,26 +201,19 @@ CPUSparseMatrix<ElemType>& CPUSparseMatrix<ElemType>::operator=(CPUSparseMatrix<ElemType>&& moveFrom)
 {
     if (OwnBuffer())
         ReleaseMemory(); // always delete the data pointer since we will use the pointer from moveFrom
-    m_format = moveFrom.m_format;
-    m_numRows = moveFrom.m_numRows;
-    m_numCols = moveFrom.m_numCols;
-    m_elemSizeAllocated = moveFrom.m_elemSizeAllocated;
+    Base::ShallowCopyFrom(moveFrom);
+    // BUGBUG: This did not use to copy m_sliceViewOffset, I presume it should be copied? It is now.
+
     m_compIndexSize = moveFrom.m_compIndexSize;
-    m_externalBuffer = moveFrom.m_externalBuffer;
-    m_computeDevice = moveFrom.m_computeDevice;
-    m_nz = moveFrom.m_nz;
-    m_matrixName = moveFrom.m_matrixName;

     m_colIdx = moveFrom.m_colIdx;
-    m_pArray = moveFrom.m_pArray;
     m_nzValues = moveFrom.m_nzValues;
     m_unCompIndex = moveFrom.m_unCompIndex;
     m_compIndex = moveFrom.m_compIndex;

     m_blockSize = moveFrom.m_blockSize;
     m_blockIdShift = moveFrom.m_blockIdShift;
     m_blockIds = moveFrom.m_blockIds;

     // release the pointer from the source object so that the destructor won't release it twice
     moveFrom.ZeroInit();

@@ -249,9 +230,8 @@ CPUSparseMatrix<ElemType>::~CPUSparseMatrix()
 template <class ElemType>
 void CPUSparseMatrix<ElemType>::ReleaseMemory()
 {
-    // If m_externalBuffer is true then this matrix
-    // is simply a view over another matrix. In that
-    // case we shouldn't free anything.
+    // If m_externalBuffer is true then this matrix is simply a view over another matrix.
+    // In that case we shouldn't free anything.
     if (!m_externalBuffer)
     {
         delete[] m_matrixName;
@@ -335,24 +315,26 @@ void CPUSparseMatrix<ElemType>::SetValue(const size_t row, const size_t col, const ElemType v)
         m_nz++;
 }

-//make sure call order in colume wise for CSC and row wise for CSR
+// make sure the call order is column-wise for CSC and row-wise for CSR
 template <class ElemType>
 void CPUSparseMatrix<ElemType>::SetValue(const CPUSparseMatrix<ElemType>& v)
 {
-    if (!OwnBuffer())
+    if (!OwnBuffer()) // TODO: GPU version allows to overwrite a view with a fresh non-view
         LogicError("Cannot modify since the buffer is managed externally.");

-    this->Reset();
-    m_format = v.GetFormat();
+    Reset();
+    m_format = v.GetFormat();
+    m_externalBuffer = false;
+    m_sliceOf = nullptr;

-    this->Resize(v.GetNumRows(), v.GetNumCols(), v.NzSize());
+    Resize(v.GetNumRows(), v.GetNumCols(), v.NzSize());
     m_nz = v.NzCount();

     if (m_nz > 0)
     {
-        memcpy(this->NzValues(), v.NzValues(), v.NzSize());
-        memcpy(this->RowLocation(), v.RowLocation(), v.RowSize());
-        memcpy(this->ColLocation(), v.ColLocation(), v.ColSize());
+        memcpy(NzValues(), v.NzValues(), v.NzSize());
+        memcpy(RowLocation(), v.RowLocation(), v.RowSize());
+        memcpy(ColLocation(), v.ColLocation(), v.ColSize());
     }
 }
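SetValue() copies the three arrays of the compressed sparse column (CSC) representation. For orientation, a small worked example (assumed member/accessor correspondence, simplified):

// The 3x3 matrix  [1 0 2]
//                 [0 0 3]
//                 [4 0 0]
// is stored in CSC form as:
//   nzValues = {1, 4, 2, 3}  // non-zeros, column by column (NzValues())
//   rowIndex = {0, 2, 0, 1}  // row of each non-zero (RowLocation(), i.e. m_unCompIndex)
//   colStart = {0, 2, 2, 4}  // offset of each column's first non-zero (ColLocation(), i.e. m_compIndex)
// NzSize(), RowSize() and ColSize() are the byte sizes of these three arrays.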
@@ -402,17 +384,22 @@ CPUSparseMatrix<ElemType> CPUSparseMatrix<ElemType>::ColumnSlice(size_t startColumn, size_t numCols) const
     CPUSparseMatrix<ElemType> slice(m_format);
     slice.m_numRows = m_numRows;
     slice.m_numCols = numCols;
+    // BUGBUG: m_matrixName?
+    // BUGBUG: m_sliceViewOffset?
+    slice.m_externalBuffer = true;
+    slice.m_sliceOf = const_cast<CPUSparseMatrix<ElemType>*>(this); // BUGBUG: ColumnSlice() returns a reference to a mutable matrix, even if itself is 'const'; should not be.

     if (m_format == MatrixFormat::matrixFormatSparseCSC)
     {
         slice.m_pArray = m_pArray;
+
         slice.m_nzValues = m_pArray + m_compIndex[startColumn]; // note: m_compIndex is always against m_pArray
         slice.m_unCompIndex = m_unCompIndex;
         slice.m_compIndex = m_compIndex + startColumn; // Just shift the compressed index location to the new startColumn - that's it!
-        slice.m_externalBuffer = true;
+        slice.m_compIndexSize = numCols + 1;
+
         slice.m_nz = m_compIndex[startColumn + numCols] - m_compIndex[startColumn];
         slice.m_elemSizeAllocated = slice.m_nz;
-        slice.m_compIndexSize = numCols + 1;
     }
     else if (m_format == MatrixFormat::matrixFormatSparseBlockCol)
     {
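Why shifting m_compIndex yields a valid slice can be seen with the CSC arrays from the example above (illustration only):

// With colStart = {0, 2, 2, 4} for 3 columns, a slice of numCols = 2 starting at
// column 1 reuses the parent's arrays unchanged and only repositions the index:
//   slice.m_compIndex --> &colStart[1], i.e. {2, 2, 4}        // numCols + 1 entries
//   slice.m_nz        =   colStart[1 + 2] - colStart[1] = 2
// The entries remain offsets into the parent's m_pArray, which is why
// slice.m_nzValues is set to m_pArray + m_compIndex[startColumn].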
@@ -446,13 +433,15 @@ CPUSparseMatrix<ElemType> CPUSparseMatrix<ElemType>::ColumnSlice(size_t startColumn, size_t numCols) const
             endColBlock = (long long) m_blockSize;
         }

+        // BUGBUG: m_elemSizeAllocated?
         slice.m_pArray = m_pArray + startColBlock * m_numRows;
+
         slice.m_nzValues = slice.m_pArray;
         slice.m_blockIds = m_blockIds + startColBlock; // the value stored in the block id is based on the original column numbers
         slice.m_blockSize = (size_t) max((long long) 0, endColBlock - startColBlock);
         slice.m_blockIdShift = m_blockIdShift + startColumn;
-        slice.m_externalBuffer = true;
+
         slice.m_nz = slice.m_blockSize * m_numRows;
     }

     return slice;

@@ -461,9 +450,6 @@ CPUSparseMatrix<ElemType> CPUSparseMatrix<ElemType>::ColumnSlice(size_t startColumn, size_t numCols) const
 template <class ElemType>
 CPUMatrix<ElemType> CPUSparseMatrix<ElemType>::CopyColumnSliceToDense(size_t startColumn, size_t numCols) const
 {
-    // if (numCols == 0)
-    //     LogicError("The slice cannot have 0 columns.");
-
     if (startColumn + numCols > m_numCols)
         InvalidArgument("The slice (%d+%d) is out of range of the source matrix (%d).", (int) startColumn, (int) numCols, (int) m_numCols);
@@ -614,17 +600,21 @@ void CPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const bool growOnly, bool keepExistingValues)
     }
 }

-//Reset matrix so it can be reused
+// Reset matrix to 0.
 template <class ElemType>
 void CPUSparseMatrix<ElemType>::Reset()
 {
+    if (!OwnBuffer())
+        LogicError("Cannot Reset since the buffer is managed externally.");
+
     m_nz = 0;
     m_colIdx = -1;
     m_blockSize = 0;
     m_blockIdShift = 0;
 }

-//c = alpha*op(lhs) * op(rhs) + beta*c
+// c = alpha*op(lhs) * op(rhs) + beta*c
+// dense x sparse = dense
 template <class ElemType>
 void CPUSparseMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix<ElemType>& lhs, const bool transposeA,
                                                        const CPUSparseMatrix<ElemType>& rhs, const bool transposeB, ElemType beta, CPUMatrix<ElemType>& c)
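For readers new to the format, a dense x sparse(CSC) product accumulates one scaled dense column per stored non-zero. A simplified sketch (no transpose handling, k implied by the CSC arrays, assumed 0-based indices as in the example earlier; not the diff's actual implementation):

#include <cstddef>

// c = alpha * lhs * rhs + beta * c, with lhs (m x k) and c (m x n) dense
// column-major and rhs (k x n) in CSC form.
void DenseTimesCsc(float alpha, const float* lhs, std::size_t m,
                   const float* nzValues, const int* rowIndex, const int* colStart,
                   std::size_t n, float beta, float* c)
{
    for (std::size_t j = 0; j < n; j++)
    {
        for (std::size_t r = 0; r < m; r++)
            c[j * m + r] *= beta;                            // scale c(:, j)
        for (int p = colStart[j]; p < colStart[j + 1]; p++)
        {
            const std::size_t i = (std::size_t) rowIndex[p]; // rhs(i, j) is a stored non-zero
            const float v = alpha * nzValues[p];
            for (std::size_t r = 0; r < m; r++)
                c[j * m + r] += v * lhs[i * m + r];          // c(:, j) += v * lhs(:, i)
        }
    }
}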
@@ -711,7 +701,8 @@ void CPUSparseMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPUMatrix<ElemType>& lhs, const bool transposeA,
     }
 }

-//c = alpha * op(lhs) * op(rhs)
+// dense x sparse = sparse
+// c = alpha * op(lhs) * op(rhs)
 template <class ElemType>
 void CPUSparseMatrix<ElemType>::MultiplyAndAdd(ElemType alpha, const CPUMatrix<ElemType>& lhs, const bool transposeA,
                                                const CPUSparseMatrix<ElemType>& rhs, const bool transposeB, CPUSparseMatrix<ElemType>& c)

@@ -807,6 +798,7 @@ void CPUSparseMatrix<ElemType>::MultiplyAndAdd(ElemType alpha, const CPUMatrix<ElemType>& lhs, const bool transposeA,
     }
 }

+// dense += sparse
 template <class ElemType>
 void CPUSparseMatrix<ElemType>::ScaleAndAdd(const ElemType alpha, const CPUSparseMatrix<ElemType>& lhs, CPUMatrix<ElemType>& rhs)
 {

@@ -861,7 +853,7 @@ void CPUSparseMatrix<ElemType>::ScaleAndAdd(const ElemType alpha, const CPUSparseMatrix<ElemType>& lhs, CPUMatrix<ElemType>& rhs)
 }

 template <class ElemType>
-bool CPUSparseMatrix<ElemType>::AreEqual(const CPUSparseMatrix<ElemType>& a, const CPUSparseMatrix<ElemType>& b, const ElemType threshold)
+/*static*/ bool CPUSparseMatrix<ElemType>::AreEqual(const CPUSparseMatrix<ElemType>& a, const CPUSparseMatrix<ElemType>& b, const ElemType threshold)
 {
     if (a.IsEmpty() || b.IsEmpty())
         LogicError("AreEqual: one of the input matrices is empty.");

@@ -894,6 +886,7 @@ void CPUSparseMatrix<ElemType>::NormalGrad(CPUMatrix<ElemType>& c, const ElemType momentum)
         c.Resize(GetNumRows(), GetNumCols());
         c.SetValue(0.0);
     }
+    // BUGBUG: dimension/ownbuffer check?

     if (m_format == MatrixFormat::matrixFormatSparseBlockCol || m_format == MatrixFormat::matrixFormatSparseBlockRow)
     {

@@ -927,6 +920,7 @@ ElemType CPUSparseMatrix<ElemType>::Adagrad(CPUMatrix<ElemType>& c, const bool needAveMultiplier)
         c.Resize(GetNumRows(), GetNumCols());
         c.SetValue(0.0);
     }
+    // BUGBUG: dimension/ownbuffer check?

     ElemType aveMultiplier = 0;

@@ -1156,12 +1150,12 @@ CPUSparseMatrix<ElemType>& CPUSparseMatrix<ElemType>::InplaceSoftThreshold(const ElemType threshold)
 template <class ElemType>
 ElemType CPUSparseMatrix<ElemType>::FrobeniusNorm() const
 {
-    if (this->IsEmpty())
-        LogicError("FrobeniusNorm: Matrix is empty.");
+    if (IsEmpty())
+        return 0;

-    ElemType v = 0;
+    ElemType v = 0; // TODO: do this in 'double'?

-    long m = (long) this->NzCount();
+    long m = (long) NzCount();
     const ElemType* nzValues = NzValues();

     //four-way unrolling

@@ -1183,8 +1177,8 @@ ElemType CPUSparseMatrix<ElemType>::FrobeniusNorm() const
 template <class ElemType>
 ElemType CPUSparseMatrix<ElemType>::SumOfAbsElements() const
 {
-    if (this->IsEmpty())
-        LogicError("SumOfAbsElements: Matrix is empty.");
+    if (IsEmpty())
+        return 0;

     if (sizeof(ElemType) == sizeof(double))
     {

@@ -1209,12 +1203,12 @@ ElemType CPUSparseMatrix<ElemType>::SumOfAbsElements() const
 template <class ElemType>
 ElemType CPUSparseMatrix<ElemType>::SumOfElements() const
 {
-    if (this->IsEmpty())
-        LogicError("SumOfElements: Matrix is empty.");
+    if (IsEmpty())
+        return 0;

-    ElemType sum = 0;
+    ElemType sum = 0; // TODO: Do this in 'double'?

-    long m = (long) this->NzCount();
+    long m = (long) NzCount();
     const ElemType* nzValues = NzValues();

     //four-way unrolling

@@ -1235,6 +1229,9 @@ ElemType CPUSparseMatrix<ElemType>::SumOfElements() const
 template <typename ElemType>
 MATH_API File& operator>>(File& stream, CPUSparseMatrix<ElemType>& us)
 {
+    if (!us.OwnBuffer())
+        LogicError("Cannot read into a managed external matrix");
+
     stream.GetMarker(fileMarkerBeginSection, std::wstring(L"BMAT"));
     size_t elsize;
     stream >> elsize;

@@ -1351,4 +1348,5 @@ template CPUSparseMatrix<char>::~CPUSparseMatrix();
 template CPUSparseMatrix<char> CPUSparseMatrix<char>::ColumnSlice(size_t startColumn, size_t numCols) const;
 template CPUMatrix<char> CPUSparseMatrix<char>::CopyColumnSliceToDense(size_t startColumn, size_t numCols) const;
+template CPUSparseMatrix<char>& CPUSparseMatrix<char>::operator=(const CPUSparseMatrix<char>& deepCopyFrom);

-} } }
+}}}
@@ -22,21 +22,23 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 template <class ElemType>
 class MATH_API CPUSparseMatrix : public BaseMatrix<ElemType>
 {
-    typedef BaseMatrix<ElemType> B;
-    using B::m_elemSizeAllocated;
-    using B::m_computeDevice;
-    using B::m_externalBuffer;
-    using B::m_format;
-    using B::m_matrixName;
-    using B::m_numCols;
-    using B::m_numRows;
-    using B::m_nz;
-    using B::m_pArray; // without this, base members would require to use thi-> in GCC
-    using B::OwnBuffer;
-    using B::Clear;
+    typedef BaseMatrix<ElemType> Base;
+    using Base::m_elemSizeAllocated;
+    using Base::m_computeDevice;
+    using Base::m_externalBuffer;
+    using Base::m_format;
+    using Base::m_matrixName;
+    using Base::m_numCols;
+    using Base::m_numRows;
+    using Base::m_nz;
+    using Base::m_pArray; // without this, base members would require to use this-> in GCC
+    using Base::Clear;
+    using Base::NzCount;

 public:
-    using B::SetMatrixName;
+    using Base::OwnBuffer;
+    using Base::IsEmpty;
+    using Base::SetMatrixName;

 private:
     void ZeroInit();
@@ -53,8 +55,8 @@ public:
     ~CPUSparseMatrix();

 public:
-    using B::GetNumCols;
-    using B::GetNumRows;
+    using Base::GetNumCols;
+    using Base::GetNumRows;

     void SetValue(const size_t row, const size_t col, ElemType val);
     void SetValue(const CPUSparseMatrix<ElemType>& /*val*/);

@@ -249,6 +251,8 @@ private:
     size_t m_blockSize;    // block size
     size_t* m_blockIds;    // block ids
     size_t m_blockIdShift; // used to get efficient slice, actual col = blockIds[j] - m_blockIdShift
+
+    CPUSparseMatrix* m_sliceOf; // if this is a slice, then this points to the owning matrix object that we sliced from
 };

 typedef CPUSparseMatrix<float> CPUSingleSparseMatrix;
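The new m_sliceOf member records the parent of a slice but, per the commit message, is "not used yet". A hypothetical future use, sketched only to show the intent (m_sliceOf is private, so this would need friend access or an accessor):

// e.g. a safety check before an operation that would invalidate live views:
// if (m_sliceOf != nullptr)
//     LogicError("cannot modify: this matrix is a view into another matrix");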
@@ -158,7 +158,7 @@ enum ElementWiseOperator
     Macro(ElementwiseProductWithLinearRectifierDerivativeFromOutput); \
     Macro(ElementwiseProductWithLogDerivativeFromOutput);             \
     Macro(ElementwiseProductWithCosDerivative);                       \
     //Macro(Index);

 #define ForAllTernaryOps(Macro) \
     Macro(Cond);                \
@@ -170,6 +170,7 @@ enum ElementWiseOperator

 enum MatrixFlagBitPosition
 {
+    // TODO: remove all formats that are actually not supported
     bitPosRowMajor = 0,   // row major matrix
     bitPosSparse = 1,     // sparse matrix (COO if uncompressed)
     bitPosCompressed = 2, // a compressed sparse format (CSC/CSR)

@@ -179,6 +180,7 @@ enum MatrixFlagBitPosition

 enum MatrixFormat
 {
+    // TODO: remove all formats that are actually not supported
     matrixFormatDense = 0,                      // default is dense
     matrixFormatColMajor = 0,                   // default is column major
     matrixFormatRowMajor = 1 << bitPosRowMajor, // row major matrix
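The concrete formats are composed from these bit positions; an illustration consistent with the positions shown above (the numeric values are assumptions, not quoted from the diff):

// matrixFormatSparse     == 1 << bitPosSparse     == 2
// matrixFormatCompressed == 1 << bitPosCompressed == 4
// A CSC matrix is sparse + compressed + column-major, i.e. presumably
// matrixFormatSparseCSC  == matrixFormatSparse | matrixFormatCompressed | matrixFormatColMajor
//                        == 6, since matrixFormatColMajor == 0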
@@ -314,17 +316,46 @@ protected:
         m_matrixName = nullptr;
     }

+    void ZeroInit()
+    {
+        m_numRows = 0;
+        m_numCols = 0;
+        m_elemSizeAllocated = 0;
+        m_sliceViewOffset = 0;
+        m_externalBuffer = false;
+        m_pArray = nullptr;
+        m_nz = 0;
+        m_matrixName = nullptr;
+    }
+
+    // copy all metadata (but not the content that pArray points to)
+    void ShallowCopyFrom(const BaseMatrix& other)
+    {
+        m_format = other.m_format;
+        m_computeDevice = other.m_computeDevice;
+
+        m_numRows = other.m_numRows;
+        m_numCols = other.m_numCols;
+        m_elemSizeAllocated = other.m_elemSizeAllocated;
+        m_sliceViewOffset = other.m_sliceViewOffset;
+        m_externalBuffer = other.m_externalBuffer;
+        m_pArray = other.m_pArray;
+        m_nz = other.m_nz;
+        m_matrixName = other.m_matrixName;
+    }
+
 protected:
+    MatrixFormat m_format;
+    mutable DEVICEID_TYPE m_computeDevice; // current GPU device Id or CPUDEVICE
+
     size_t m_numRows;
     size_t m_numCols;
     size_t m_elemSizeAllocated;
-    size_t m_sliceViewOffset; // this is used to get a column slice view of a matrix in the Sparse CSC format
-    MatrixFormat m_format;
+    size_t m_sliceViewOffset; // this is used to get a column slice view of a matrix in the Sparse CSC format --TODO: move to sparse matrix implementations? Or common sparse base class?
     bool m_externalBuffer; // is the buffer used by this matrix,
     ElemType* m_pArray;
-    mutable DEVICEID_TYPE m_computeDevice; // current GPU device Id or CPUDEVICE
     size_t m_nz;           // Number of non-zero elements for sparse matrices (unused in other formats)
     wchar_t* m_matrixName;  // TODO: Use std::wstring?
 };

-} } }
+}}}
@@ -579,7 +579,7 @@ static void CudaCall(ERRTYPE retCode, const char* exprString, const char* libName, ...)

 class SyncGuard
 {
-    bool DoSync()
+    static bool DoSync()
     {
 #ifdef NO_SYNC // this strange way of writing it allows modifying this variable at runtime in the debugger
         static bool do_sync = false;
@@ -588,27 +588,30 @@ class SyncGuard
 #endif
         return do_sync;
     }
-    cudaEvent_t done;
+    cudaEvent_t m_done;

 public:
     SyncGuard()
     {
-        done = nullptr;
+        m_done = nullptr;
         if (DoSync())
-            CUDA_CALL(cudaEventCreate(&done));
+            CUDA_CALL(cudaEventCreate(&m_done));
     }
     ~SyncGuard()
     {
         if (DoSync())
         {
-            try
-            {
-                CUDA_CALL(cudaEventRecord(done));
-                CUDA_CALL(cudaEventSynchronize(done));
-                CUDA_CALL(cudaEventDestroy(done));
-            }
-            catch (const std::exception& e) // can't throw in destructors!
-            {
-                std::cerr << "SyncGuard: Destructor swallowing CUDA failure: " << e.what() << std::endl;
-            }
+            // The regular use of this destructor is to synchronize the GPU, but also
+            // to check for errors. So this destructor is where CUDA errors would be thrown.
+            // If this destructor runs during stack unwinding, then a different error has
+            // already happened that should be reported; so we only clean up the resource.
+            if (std::uncaught_exception())
+                cudaEventDestroy(m_done);
+            else
+            {
+                // failures in a prior launch might be reported here
+                CUDA_CALL(cudaEventRecord(m_done));
+                CUDA_CALL(cudaEventSynchronize(m_done));
+                CUDA_CALL(cudaEventDestroy(m_done));
+            }
         }
     }
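This hunk is the "~SyncGuard() should not swallow errors" fix from the commit message: instead of catching and logging, the destructor now lets CUDA_CALL throw, except while the stack is already unwinding from another exception. A standalone sketch of the pattern (no CUDA; std::uncaught_exception() is the pre-C++17 spelling used in the diff):

#include <exception>
#include <iostream>
#include <stdexcept>

struct Guard
{
    ~Guard() noexcept(false) // destructors are noexcept by default; opt out explicitly
    {
        if (std::uncaught_exception())
            std::cerr << "unwinding from another error: clean up only, don't throw\n";
        else
            throw std::runtime_error("deferred failure surfaces here"); // like CUDA_CALL above
    }
};

int main()
{
    try { Guard g; } // destructor throws; the caller sees the deferred error
    catch (const std::exception& e) { std::cerr << e.what() << '\n'; }
    return 0;
}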
@@ -52,6 +52,10 @@ GPUSPARSE_INDEX_TYPE GPUSparseMatrix<ElemType>::SecondaryIndexValueAt(size_t idx) const
     return value;
 }

+//-------------------------------------------------------------------------
+// construction and conversion
+//-------------------------------------------------------------------------
+
 template <class ElemType>
 void GPUSparseMatrix<ElemType>::ZeroInit(const MatrixFormat matrixFormat, const DEVICEID_TYPE computeDevice)
 {

@@ -59,25 +63,18 @@ void GPUSparseMatrix<ElemType>::ZeroInit(const MatrixFormat matrixFormat, const DEVICEID_TYPE computeDevice)
         matrixFormat != MatrixFormat::matrixFormatSparseBlockCol && matrixFormat != MatrixFormat::matrixFormatSparseBlockRow)
     {
         LogicError("GPUSparseMatrix: unsupported sparse matrix format");
+        // BUGBUG: Then why even define others?
     }
+    Base::ZeroInit();
+    m_computeDevice = computeDevice; // current GPU device Id
+    m_format = matrixFormat;

-    m_computeDevice = computeDevice; // current GPU device Id
-    m_numRows = 0;
-    m_numCols = 0;
-    m_elemSizeAllocated = m_nz = 0; // Number of non-zero elements
+    m_sliceOf = nullptr;
     m_totalBufferSizeAllocated = 0;
-    m_sliceViewOffset = 0;
-    m_format = matrixFormat;
-    m_externalBuffer = false;
-    m_pArray = nullptr;
-    m_matrixName = nullptr;
-
-    m_blockSize = 0;
-
-    m_rowToId = nullptr;
-
-    m_tempHostBuffer = nullptr;
-    m_tempHostBufferSize = 0;
+    m_blockSize = 0;
+    m_rowToId = nullptr;
+    m_tempHostBuffer = nullptr;
+    m_tempHostBufferSize = 0;
 }

 template <class ElemType>
@@ -123,13 +120,15 @@ DEVICEID_TYPE GPUSparseMatrix<ElemType>::PrepareDevice(DEVICEID_TYPE deviceId /*= -1*/) const
 }

 template <class ElemType>
-void GPUSparseMatrix<ElemType>::DeepCopy(const GPUSparseMatrix<ElemType>& deepCopy)
+/*private*/ void GPUSparseMatrix<ElemType>::DeepCopy(const GPUSparseMatrix<ElemType>& deepCopy)
 {
     ChangeDeviceTo(deepCopy.m_computeDevice);
     deepCopy.PrepareDevice();

     Resize(deepCopy.m_numRows, deepCopy.m_numCols, deepCopy.GetNumNZElements(), deepCopy.m_format, true, false);
-    m_nz = deepCopy.m_nz;
+    m_externalBuffer = false;
+    m_sliceOf = nullptr;
+
+    m_nz = deepCopy.m_nz;
     m_sliceViewOffset = 0; // reset to zero as we only start copying the indices starting from the offset in the source matrix

     CUDA_CALL(cudaMemcpy(BufferPointer(), deepCopy.NzValues(), NzSize(), cudaMemcpyDeviceToDevice));

@@ -146,7 +145,6 @@ void GPUSparseMatrix<ElemType>::DeepCopy(const GPUSparseMatrix<ElemType>& deepCopy)
         GetNumNZElements());
     }

-    m_externalBuffer = false;
     SetMatrixName(deepCopy.m_matrixName);

     // TODO: to copy other variables used only for class based LM

@@ -156,7 +154,7 @@ template <class ElemType>
 void GPUSparseMatrix<ElemType>::SetValue(const GPUSparseMatrix<ElemType>& deepCopy)
 {
     if (!OwnBuffer())
-        LogicError("Cannot SetValue on Managed external matrix");
+        LogicError("Cannot SetValue on managed external matrix");

     DeepCopy(deepCopy);
 }

@@ -190,6 +188,9 @@ void GPUSparseMatrix<ElemType>::SetValue(const CPUSparseMatrix<ElemType>& deepCopy)
 template <class ElemType>
 void GPUSparseMatrix<ElemType>::CopyToCPUSparseMatrix(CPUSparseMatrix<ElemType>& cpuSparseMatrix) const
 {
+    if (!cpuSparseMatrix.OwnBuffer())
+        LogicError("Cannot CopyToCPUSparseMatrix on managed external matrix");
+
     cpuSparseMatrix.SetFormat(GetFormat());
     if (IsEmpty())
     {

@@ -307,6 +308,9 @@ void GPUSparseMatrix<ElemType>::CopyToDenseMatrix(GPUMatrix<ElemType>& denseMatrix) const
 template <class ElemType>
 void GPUSparseMatrix<ElemType>::ConvertToSparseFormat(MatrixFormat newFormat, GPUSparseMatrix<ElemType>& outMatrix) const
 {
+    if (!outMatrix.OwnBuffer())
+        LogicError("Cannot ConvertToSparseFormat to managed external matrix");
+
     if (IsEmpty())
     {
         outMatrix.ZeroInit(newFormat, GetComputeDeviceId());
@@ -528,26 +532,15 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::operator=(const GPUSparseMatrix<ElemType>& deepCopyFrom)
 template <class ElemType>
 GPUSparseMatrix<ElemType>::GPUSparseMatrix(GPUSparseMatrix<ElemType>&& moveFrom)
 {
-    m_computeDevice = moveFrom.m_computeDevice;
-    m_numRows = moveFrom.m_numRows;
-    m_numCols = moveFrom.m_numCols;
-    m_nz = moveFrom.m_nz;
-    m_elemSizeAllocated = moveFrom.m_elemSizeAllocated;
+    Base::ShallowCopyFrom(moveFrom);
+    // TODO: implement this using operator= or a shared function
     m_totalBufferSizeAllocated = moveFrom.m_totalBufferSizeAllocated;
-    m_pArray = moveFrom.m_pArray;
-    m_sliceViewOffset = moveFrom.m_sliceViewOffset;
-    m_format = moveFrom.m_format;
-    m_externalBuffer = moveFrom.m_externalBuffer;
-    m_matrixName = moveFrom.m_matrixName;
-
-    m_blockSize = moveFrom.m_blockSize;
-
-    m_rowToId = moveFrom.m_rowToId;
-
-    m_tempHostBuffer = moveFrom.m_tempHostBuffer;
-    m_tempHostBufferSize = moveFrom.m_tempHostBufferSize;
-
-    moveFrom.ZeroInit(moveFrom.m_format, moveFrom.m_computeDevice); // so that memory in moveFrom is not freeed
+    m_sliceOf = moveFrom.m_sliceOf;
+    m_blockSize = moveFrom.m_blockSize;
+    m_rowToId = moveFrom.m_rowToId;
+    m_tempHostBuffer = moveFrom.m_tempHostBuffer;
+    m_tempHostBufferSize = moveFrom.m_tempHostBufferSize;
+    moveFrom.ZeroInit(moveFrom.m_format, moveFrom.m_computeDevice); // so that memory in moveFrom is not freed
 }

 template <class ElemType>

@@ -557,26 +550,13 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::operator=(GPUSparseMatrix<ElemType>&& moveFrom)
 {
     if (OwnBuffer())
         ReleaseMemory(); // always delete the data pointer since we will use the pointer from moveFrom
-    m_computeDevice = moveFrom.m_computeDevice;
-    m_numRows = moveFrom.m_numRows;
-    m_numCols = moveFrom.m_numCols;
-    m_nz = moveFrom.m_nz;
-    m_elemSizeAllocated = moveFrom.m_elemSizeAllocated;
+    Base::ShallowCopyFrom(moveFrom);
     m_totalBufferSizeAllocated = moveFrom.m_totalBufferSizeAllocated;
-    m_pArray = moveFrom.m_pArray;
-    m_sliceViewOffset = moveFrom.m_sliceViewOffset;
-    m_format = moveFrom.m_format;
-    m_externalBuffer = moveFrom.m_externalBuffer;
-
-    m_matrixName = moveFrom.m_matrixName;
-
-    m_blockSize = moveFrom.m_blockSize;
-
-    m_rowToId = moveFrom.m_rowToId;
-
-    m_tempHostBuffer = moveFrom.m_tempHostBuffer;
-    m_tempHostBufferSize = moveFrom.m_tempHostBufferSize;
-
+    m_sliceOf = moveFrom.m_sliceOf;
+    m_blockSize = moveFrom.m_blockSize;
+    m_rowToId = moveFrom.m_rowToId;
+    m_tempHostBuffer = moveFrom.m_tempHostBuffer;
+    m_tempHostBufferSize = moveFrom.m_tempHostBufferSize;
     moveFrom.ZeroInit(moveFrom.m_format, moveFrom.m_computeDevice);
 }
@@ -590,11 +570,10 @@ GPUSparseMatrix<ElemType>::~GPUSparseMatrix()
 }

 template <class ElemType>
-void GPUSparseMatrix<ElemType>::ReleaseMemory()
+/*private*/ void GPUSparseMatrix<ElemType>::ReleaseMemory()
 {
-    // If OwnBuffer() is false then this matrix
-    // is simply a view over another matrix. In that
-    // case we shouldn't free anything.
+    // If OwnBuffer() is false then this matrix is a view over another matrix.
+    // In that case we shouldn't free anything.
     if (OwnBuffer())
     {
         delete[] m_matrixName;

@@ -609,7 +588,6 @@ void GPUSparseMatrix<ElemType>::ReleaseMemory()
         TracingGPUMemoryAllocator::Free<GPUSPARSE_INDEX_TYPE>(m_computeDevice, m_rowToId);
         m_rowToId = nullptr;
     }
-
     ZeroInit(m_format, m_computeDevice);
 }
@@ -627,17 +605,18 @@ void GPUSparseMatrix<ElemType>::ResizeAsAndCopyIndexFrom(const GPUSparseMatrix<ElemType>& a)
 }

 //-------------------------------------------------------------------------
-// Start of new GPU Sparse Matrix code
+// main operations
 //-------------------------------------------------------------------------

 template <class ElemType>
 void GPUSparseMatrix<ElemType>::Reshape(const size_t numRows, const size_t numCols)
 {
-    if (!OwnBuffer())
-        LogicError("GPUSparseMatrix::Reshape: Cannot Reshape since the buffer is managed externally.");
-
     if (m_numRows == numRows && m_numCols == numCols)
         return;

+    if (!OwnBuffer())
+        LogicError("GPUSparseMatrix::Reshape: Cannot Reshape since the buffer is managed externally.");
+
     if (m_format != MatrixFormat::matrixFormatSparseCSC)
         NOT_IMPLEMENTED;
@@ -688,8 +667,8 @@ void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const bool growOnly, bool keepExistingValues)
     Resize(numRows, numCols, numNZElemToReserve, GetFormat(), growOnly, keepExistingValues);
 }

-//WARNING: When memory is reallocated existing information will be lost, workaround is to allocte enough memory from start.
-//TODO: add keepExistingValues (default to true) argument so that the existing values are kept even after reallocation
+// WARNING: When memory is reallocated, existing information will be lost.
+// TODO: add keepExistingValues (default to true) argument so that the existing values are kept even after reallocation
 template <class ElemType>
 void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const MatrixFormat matrixFormat, const bool growOnly /*= true*/, bool keepExistingValues /*=true*/)
 {

@@ -749,13 +728,17 @@ void GPUSparseMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const MatrixFormat matrixFormat, const bool growOnly, bool keepExistingValues)
     m_format = matrixFormat;
 }

-//Reset matrix so it can be reused
+// Reset matrix to 0.
 template <class ElemType>
 void GPUSparseMatrix<ElemType>::Reset()
 {
+    if (!OwnBuffer())
+        LogicError("Cannot Reset since the buffer is managed externally.");
+
     m_nz = 0;
     m_blockSize = 0;
 }

 // copy features to GPU
 template <class ElemType>
 void GPUSparseMatrix<ElemType>::SetMatrixFromCSRFormat(const GPUSPARSE_INDEX_TYPE* h_CSRRow, const GPUSPARSE_INDEX_TYPE* h_Col, const ElemType* h_Val,
@@ -966,9 +949,11 @@ void GPUSparseMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const GPUMatrix<ElemType>& lhs, const bool transposeA,
     }
 }

+// dense X sparse = dense
 template <class ElemType>
 void GPUSparseMatrix<ElemType>::ConvolveAndWeightedAdd(ElemType alpha, const GPUMatrix<ElemType>& lhs, const bool transposeA,
-                                                       const GPUSparseMatrix<ElemType>& rhs, const bool transposeB, ElemType beta, GPUMatrix<ElemType>& c, size_t numChannels, size_t horizontalSubsample, bool padding, bool channelwise)
+                                                       const GPUSparseMatrix<ElemType>& rhs, const bool transposeB, ElemType beta,
+                                                       GPUMatrix<ElemType>& c, size_t numChannels, size_t horizontalSubsample, bool padding, bool channelwise)
 {
     if (lhs.GetComputeDeviceId() != rhs.GetComputeDeviceId() || (lhs.GetComputeDeviceId() != c.GetComputeDeviceId()))
         RuntimeError("GPUSparseMatrix<ElemType>::ConvolveAndWeightedAdd: All matrices must be on the same GPU");

@@ -1222,7 +1207,7 @@ void GPUSparseMatrix<ElemType>::MultiplyAndAdd(ElemType alpha, const GPUMatrix<ElemType>& lhs, const bool transposeA,
     }
 }

-//find the rows of rhs with values
+// find the rows of rhs with values
 template <class ElemType>
 size_t GPUSparseMatrix<ElemType>::IdentifyRowsWithValues() const
 {

@@ -1404,11 +1389,7 @@ ElemType GPUSparseMatrix<ElemType>::Adagrad(GPUMatrix<ElemType>& c, const bool needAveMultiplier)
     }
 }

-//-------------------------------------------------------------------------
-// End of new GPU Sparse Matrix code
-//-------------------------------------------------------------------------
-
-//sparse X dense = dense
+// sparse X dense = dense
 template <class ElemType>
 void GPUSparseMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const GPUSparseMatrix<ElemType>& a, const bool transposeA,
                                                        const GPUMatrix<ElemType>& b, const bool transposeD, ElemType beta, GPUMatrix<ElemType>& c)

@@ -1758,6 +1739,7 @@ void GPUSparseMatrix<ElemType>::ElementWisePower(ElemType alpha, const GPUSparseMatrix<ElemType>& a, GPUSparseMatrix<ElemType>& c)
     }
 }

+// sparse x dense = scalar
 template <class ElemType>
 ElemType GPUSparseMatrix<ElemType>::InnerProductOfMatrices(const GPUSparseMatrix<ElemType>& a, const GPUMatrix<ElemType>& b)
 {
@@ -1878,8 +1860,8 @@ bool GPUSparseMatrix<ElemType>::IsValid() const
 }

 template <class ElemType>
-bool GPUSparseMatrix<ElemType>::AreEqual(const GPUSparseMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b,
-                                         const ElemType threshold)
+/*static*/ bool GPUSparseMatrix<ElemType>::AreEqual(const GPUSparseMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b,
+                                                    const ElemType threshold)
 {
     if (a.GetNumNZElements() != b.GetNumNZElements() || a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols())
         return false;

@@ -1908,8 +1890,8 @@ bool GPUSparseMatrix<ElemType>::AreEqual(const GPUSparseMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b, const ElemType threshold)
 }

 template <class ElemType>
-bool GPUSparseMatrix<ElemType>::AreEqual(const GPUMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b,
-                                         const ElemType threshold)
+/*static*/ bool GPUSparseMatrix<ElemType>::AreEqual(const GPUMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b,
+                                                    const ElemType threshold)
 {
     if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols())
         return false;

@@ -1919,8 +1901,8 @@ bool GPUSparseMatrix<ElemType>::AreEqual(const GPUMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b, const ElemType threshold)
 }

 template <class ElemType>
-bool GPUSparseMatrix<ElemType>::AreEqual(const GPUSparseMatrix<ElemType>& a, const GPUMatrix<ElemType>& b,
-                                         const ElemType threshold)
+/*static*/ bool GPUSparseMatrix<ElemType>::AreEqual(const GPUSparseMatrix<ElemType>& a, const GPUMatrix<ElemType>& b,
+                                                    const ElemType threshold)
 {
     if (a.GetNumRows() != b.GetNumRows() || a.GetNumCols() != b.GetNumCols())
         return false;

@@ -1940,6 +1922,7 @@ bool GPUSparseMatrix<ElemType>::IsEqualTo(const GPUMatrix<ElemType>& a, const ElemType threshold) const
 {
     return AreEqual(*this, a, threshold);
 }
+
 #pragma endregion Static BLAS Functions

 #pragma region Member BLAS Functions
@@ -1958,6 +1941,7 @@ DEVICEID_TYPE GPUSparseMatrix<ElemType>::GetComputeDeviceId() const
     return m_computeDevice;
 }

+// sparse x dense = dense
 template <class ElemType>
 GPUMatrix<ElemType> GPUSparseMatrix<ElemType>::ElementProductOf(const GPUSparseMatrix<ElemType>& a, const GPUMatrix<ElemType>& b)
 {

@@ -1980,6 +1964,7 @@ GPUMatrix<ElemType> GPUSparseMatrix<ElemType>::ElementProductOf(const GPUSparseMatrix<ElemType>& a, const GPUMatrix<ElemType>& b)
     return c;
 }

+// sparse x dense = dense
 template <class ElemType>
 GPUMatrix<ElemType> GPUSparseMatrix<ElemType>::ElementProductOf(const GPUMatrix<ElemType>& a, const GPUSparseMatrix<ElemType>& b)
 {

@@ -2002,6 +1987,7 @@ GPUSparseMatrix<ElemType> GPUSparseMatrix<ElemType>::operator-(const GPUSparseMatrix<ElemType>& a) const
     return res;
 }

+// TODO: This is an unusual use of this operator. Remove this.
 template <class ElemType>
 GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::operator^=(ElemType alpha)
 {

@@ -2010,6 +1996,7 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::operator^=(ElemType alpha)
     return us;
 }

+// TODO: This is an unusual use of this operator. Remove this.
 template <class ElemType>
 GPUSparseMatrix<ElemType> GPUSparseMatrix<ElemType>::operator^(ElemType alpha) const
 {

@@ -2154,6 +2141,7 @@ GPUSparseMatrix<ElemType> GPUSparseMatrix<ElemType>::ColumnSlice(size_t startColumn, size_t numCols) const
     slice.m_pArray = m_pArray;
     slice.m_format = m_format;
     slice.m_externalBuffer = true;
+    slice.m_sliceOf = const_cast<GPUSparseMatrix<ElemType>*>(this); // BUGBUG: ColumnSlice() returns a reference to a mutable matrix, even if itself is 'const'; should not be.
     slice.m_matrixName = m_matrixName;
     slice.m_blockSize = m_blockSize;
     slice.m_rowToId = m_rowToId;

@@ -2231,7 +2219,7 @@ template <class ElemType>
 ElemType GPUSparseMatrix<ElemType>::SumOfAbsElements() const
 {
     if (IsEmpty())
-        LogicError("SumOfAbsElements: Matrix is empty");
+        return 0;

     cublasHandle_t cuHandle = GPUMatrix<ElemType>::GetCublasHandle(GetComputeDeviceId());
     if (sizeof(ElemType) == sizeof(float))

@@ -2264,11 +2252,12 @@ ElemType GPUSparseMatrix<ElemType>::SumOfElements() const
     return h_sum;
 }

+// sqrt(sum all elements^2)
 template <class ElemType>
 ElemType GPUSparseMatrix<ElemType>::FrobeniusNorm() const
 {
     if (IsEmpty())
-        LogicError("FrobeniusNorm: Matrix is empty.");
+        return 0;

     ElemType* d_sum = TracingGPUMemoryAllocator::Allocate<ElemType>(m_computeDevice, 1);
     ElemType h_sum = 0;

@@ -2287,7 +2276,7 @@ template <class ElemType>
 ElemType GPUSparseMatrix<ElemType>::MatrixNormInf() const
 {
     if (IsEmpty())
-        LogicError("MatrixNorm1: Matrix is empty.");
+        return 0;

     ElemType* d_maxAbs = TracingGPUMemoryAllocator::Allocate<ElemType>(m_computeDevice, 1);
     ElemType h_maxAbs = 0;

@@ -2305,8 +2294,6 @@ ElemType GPUSparseMatrix<ElemType>::MatrixNormInf() const
 template <class ElemType>
 ElemType GPUSparseMatrix<ElemType>::MatrixNorm1() const
 {
-    if (IsEmpty())
-        LogicError("MatrixNorm1: Matrix is empty.");
     return SumOfAbsElements();
 }
|
@ -2317,6 +2304,10 @@ ElemType GPUSparseMatrix<ElemType>::MatrixNorm1() const
|
|||
template <class ElemType>
|
||||
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::ElementInverse()
|
||||
{
|
||||
#if 1
|
||||
// Note: This makes no sense because sparse matrices are defined by having lots of zeroes.
|
||||
NOT_IMPLEMENTED;
|
||||
#else
|
||||
if (!OwnBuffer())
|
||||
LogicError("Cannot modify since the buffer is managed externally.");
|
||||
|
||||
|
@ -2328,29 +2319,45 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::ElementInverse()
|
|||
SyncGuard syncGuard;
|
||||
_elemInverse<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(NzValues(), N);
|
||||
return *this;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignElementInverseOf(const GPUSparseMatrix<ElemType>& a)
|
||||
{
|
||||
#if 1
|
||||
// Note: This makes no sense because sparse matrices are defined by having lots of zeroes.
|
||||
UNUSED(a); NOT_IMPLEMENTED;
|
||||
#else
|
||||
SetValue(a);
|
||||
return ElementInverse();
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceSigmoid()
|
||||
{
|
||||
#if 1
|
||||
// Note: This makes no sense because sigmoid(0) != 0.
|
||||
NOT_IMPLEMENTED;
|
||||
#else
|
||||
performElementWiseFunction(ElementWiseOperator::opSigmoid, *this);
|
||||
return *this;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignSigmoidOf(const GPUSparseMatrix<ElemType>& a)
|
||||
{
|
||||
#if 1
|
||||
// Note: This makes no sense because sigmoid(0) != 0.
|
||||
UNUSED(a); NOT_IMPLEMENTED;
|
||||
#else
|
||||
if (this != &a)
|
||||
Resize(a.GetNumRows(), a.GetNumCols());
|
||||
performElementWiseFunction(ElementWiseOperator::opSigmoid, a);
|
||||
return *this;
|
||||
#endif
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
|
@@ -2404,33 +2411,53 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignSqrtOf(const GPUSparseMatrix<ElemType>& a)
 template <class ElemType>
 GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceExp()
 {
+#if 1
+    // Note: This makes no sense because exp(0) != 0.
+    NOT_IMPLEMENTED;
+#else
     performElementWiseFunction(ElementWiseOperator::opExp, *this);
     return *this;
+#endif
 }

 template <class ElemType>
 GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignExpOf(const GPUSparseMatrix<ElemType>& a)
 {
+#if 1
+    // Note: This makes no sense because exp(0) != 0.
+    UNUSED(a); NOT_IMPLEMENTED;
+#else
     if (this != &a)
         Resize(a.GetNumRows(), a.GetNumCols());
     performElementWiseFunction(ElementWiseOperator::opExp, a);
     return *this;
+#endif
 }

 template <class ElemType>
 GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceLog()
 {
+#if 1
+    // Note: This makes no sense because log(0) != 0.
+    NOT_IMPLEMENTED;
+#else
     performElementWiseFunction(ElementWiseOperator::opLog, *this);
     return *this;
+#endif
 }

 template <class ElemType>
 GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignLogOf(const GPUSparseMatrix<ElemType>& a)
 {
+#if 1
+    // Note: This makes no sense because log(0) != 0.
+    UNUSED(a); NOT_IMPLEMENTED;
+#else
     if (this != &a)
         Resize(a.GetNumRows(), a.GetNumCols());
     performElementWiseFunction(ElementWiseOperator::opLog, a);
     return *this;
+#endif
 }

 template <class ElemType>
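These #if 1 blocks implement the "disabled meaningless elementwise sparse ops" part of the commit message: a sparse container stores only the non-zeros, but exp, log and sigmoid map the implicit zeros to non-zero values, so the true result would be dense. A tiny standalone illustration:

#include <cmath>
#include <cstdio>

int main()
{
    std::printf("exp(0)     = %f\n", std::exp(0.0));                // 1.0, not 0
    std::printf("sigmoid(0) = %f\n", 1.0 / (1.0 + std::exp(-0.0))); // 0.5, not 0
    std::printf("log(0)     = %f\n", std::log(0.0));                // -inf, not 0
    return 0;
}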
@@ -2449,6 +2476,7 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::AssignAbsOf(const GPUSparseMatrix<ElemType>& a)
     return *this;
 }

+// TODO: Check whether these functions always map 0 to 0.
 template <class ElemType>
 GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::InplaceTruncateBottom(const ElemType threshold)
 {

@@ -2543,7 +2571,7 @@ GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::SetToZeroIfAbsLessThan(const ElemType threshold)
 //outBuffer should be allocated to be >= size by the caller
 template <class ElemType>
 template <class OutType, class InType>
-void GPUSparseMatrix<ElemType>::CopyBuffer(OutType* outBuffer, const InType* inBuffer, const size_t size)
+/*private*/ void GPUSparseMatrix<ElemType>::CopyBuffer(OutType* outBuffer, const InType* inBuffer, const size_t size)
 {
 #pragma omp parallel for
     for (size_t i = 0; i < (size & ~3); i += 4)

@@ -2629,6 +2657,9 @@ template GPUSparseMatrix<char>& GPUSparseMatrix<char>::operator=(GPUSparseMatrix<char>&& moveFrom);
 template <class ElemType>
 MATH_API File& operator>>(File& stream, GPUSparseMatrix<ElemType>& us)
 {
+    if (!us.OwnBuffer())
+        LogicError("Cannot read into a managed external matrix");
+
     stream.GetMarker(fileMarkerBeginSection, std::wstring(L"BMAT"));
     size_t elsize;
     stream >> elsize;
@@ -23,29 +23,30 @@ template <class ElemType>
 class MATH_API GPUSparseMatrix : public BaseMatrix<ElemType>
 {
 public:
-    typedef BaseMatrix<ElemType> B;
-    using B::m_numRows;
-    using B::m_numCols;
-    using B::m_pArray;
-    using B::m_elemSizeAllocated;
-    using B::m_sliceViewOffset;
-    using B::m_nz;
-    using B::m_format;
-    using B::m_computeDevice;
-    using B::m_externalBuffer;
-    using B::m_matrixName;
-    using B::OwnBuffer;
-    using B::GetFormat;
-    using B::SetFormat;
-    using B::GetNumRows;
-    using B::GetNumCols;
-    using B::IsEmpty;
-    using B::SetComputeDeviceId;
-    using B::SetMatrixName;
-    using B::SetNzCount;
-    using B::Clear;
+    typedef BaseMatrix<ElemType> Base;
+    using Base::m_numRows;
+    using Base::m_numCols;
+    using Base::m_pArray;
+    using Base::m_elemSizeAllocated;
+    using Base::m_sliceViewOffset;
+    using Base::m_nz;
+    using Base::m_format;
+    using Base::m_computeDevice;
+    using Base::m_externalBuffer;
+    using Base::m_matrixName;
+    using Base::OwnBuffer;
+    using Base::GetFormat;
+    using Base::SetFormat;
+    using Base::GetNumRows;
+    using Base::GetNumCols;
+    using Base::SetComputeDeviceId;
+    using Base::SetMatrixName;
+    using Base::SetNzCount;
+    using Base::Clear;
+    // without this, base members would require to use this-> in GCC
+
+public:
+    using Base::IsEmpty;

     GPUSparseMatrix(const size_t numRows, const size_t numCols, const size_t numNZ, DEVICEID_TYPE computeDevice, const MatrixFormat matrixFormat = MatrixFormat::matrixFormatSparseCSR);

     explicit GPUSparseMatrix(DEVICEID_TYPE computeDevice, const MatrixFormat matrixFormat = MatrixFormat::matrixFormatSparseCSR);
@@ -390,6 +391,8 @@ private:

     mutable void* m_tempHostBuffer; // used to copy values.
     mutable size_t m_tempHostBufferSize;
+
+    GPUSparseMatrix* m_sliceOf; // if this is a slice, then this points to the owning matrix object that we sliced from
 };

 }}}
@@ -169,7 +169,7 @@ public:
     // REVIEW alexeyk: GPU version copies from device to host only, implement all versions (device <-> host).
     void CopySection(size_t numRows, size_t numCols, ElemType* dst, size_t colStride) const;

-    Matrix<ElemType> ColumnSlice(size_t startColumn, size_t numCols) const;
+    Matrix<ElemType> ColumnSlice(size_t startColumn, size_t numCols) const; // note: 'const' is misleading here, as the returned matrix is a mutable reference

     // difference between AssignColumnSlice and SetColumnSlice
     // AssignColumnSlice : this(:, startColumn:startColumn+numCols-1) = fromMatrix(:, startColumn: startColumn+numCols-1)
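The new comment flags a real sharp edge: the returned slice aliases the parent's buffer, so a 'const' matrix can be mutated through it. An illustration (hypothetical usage; the exact constructor and setter signatures are assumed):

// Matrix<float> m = ...;
// const Matrix<float>& cm = m;
// Matrix<float> s = cm.ColumnSlice(0, 1); // compiles: ColumnSlice() is declared const
// s.SetValue(42);                         // ...but this writes into m's storage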
@@ -4,7 +4,7 @@ DataDir = $RootDir$
 # deviceId = -1 for CPU, >= 0 for GPU devices
 deviceId = -1

-precision = "float"
+precision = "double"

 Simple_Test = [
     reader = [