From 59a2c26d7d1c4bf0d577baa6b166fe5c17639ab6 Mon Sep 17 00:00:00 2001 From: KeDengMS Date: Fri, 31 Mar 2017 11:07:11 -0700 Subject: [PATCH] Make sparse label CE backprop faster When minibatch size is big (like 10000), the diagonal matrix in times node for gradient could be big. This change implements a ColumnwiseScaleAndWeightedAdd to reduce the cost in that case. --- .../LinearAlgebraNodes.h | 6 +-- Source/Math/CPUMatrix.cpp | 23 +++++++++ Source/Math/CPUMatrix.h | 2 + Source/Math/CPUSparseMatrix.cpp | 39 +++++++++++++++ Source/Math/CPUSparseMatrix.h | 2 + Source/Math/GPUMatrix.cu | 16 ++++++ Source/Math/GPUMatrix.h | 2 + Source/Math/GPUMatrixCUDAKernels.cuh | 49 +++++++++++++++++++ Source/Math/GPUSparseMatrix.cu | 31 ++++++++++++ Source/Math/GPUSparseMatrix.h | 3 ++ Source/Math/Matrix.cpp | 19 +++++++ Source/Math/Matrix.h | 2 + Source/Math/NoGPU.cpp | 10 ++++ .../UnitTests/MathTests/baseline.txt | 7 ++- Tests/UnitTests/MathTests/MatrixBlasTests.cpp | 31 ++++++++++++ 15 files changed, 236 insertions(+), 6 deletions(-) diff --git a/Source/ComputationNetworkLib/LinearAlgebraNodes.h b/Source/ComputationNetworkLib/LinearAlgebraNodes.h index 846a1021e..69a1d6c47 100755 --- a/Source/ComputationNetworkLib/LinearAlgebraNodes.h +++ b/Source/ComputationNetworkLib/LinearAlgebraNodes.h @@ -591,10 +591,8 @@ public: Matrix gradient = GradientFor(fr); Matrix inputValue = InputRef(1 - inputIndex).ValueFor(fr); Matrix inputGradient = InputRef(inputIndex).GradientFor(fr); - Matrix gradientDiagonal(gradient.GetNumCols(), gradient.GetNumCols(), gradient.GetDeviceId()); - gradientDiagonal.SetDiagonalValue(gradient); - Matrix::MultiplyAndWeightedAdd( - (ElemType)1.0, inputValue, false, gradientDiagonal, true, + Matrix::ColumnwiseScaleAndWeightedAdd( + (ElemType)1.0, inputValue, gradient, Input(inputIndex)->ParentOverwritesGradient() ? (ElemType)0.0 : (ElemType)1.0, inputGradient); // TODO: better move this special-casing into TensorView::AssignElementwiseProductOf() diff --git a/Source/Math/CPUMatrix.cpp b/Source/Math/CPUMatrix.cpp index e8684fdf1..273a512c4 100755 --- a/Source/Math/CPUMatrix.cpp +++ b/Source/Math/CPUMatrix.cpp @@ -4836,6 +4836,29 @@ void CPUMatrix::Multiply1x1AndWeightedAdd(ElemType alpha, const CPUMat c(i, j) = b(i, j) * f + c(i, j) * beta; } +template +void CPUMatrix::ColumnwiseScaleAndWeightedAdd(ElemType alpha, const CPUMatrix& a, const CPUMatrix& v, ElemType beta, CPUMatrix& c) +{ + if (v.GetNumRows() != 1 && v.GetNumCols() != 1) + InvalidArgument("the argument v must be a vector"); // v is a vector + + if (beta == 0) + c.RequireSize(a.GetNumRows(), a.GetNumCols()); + else + c.VerifySize(a.GetNumRows(), a.GetNumCols()); // Can't resize if beta != 0 + + const ElemType* vd = v.Data(); + + if (beta == 0) // don't even read the memory if beta is 0 +#pragma omp parallel for + foreach_coord(i, j, c) + c(i, j) = alpha * a(i, j) * vd[j]; + else +#pragma omp parallel for + foreach_coord(i, j, c) + c(i, j) = alpha * a(i, j) * vd[j] + c(i, j) * beta; +} + /* compute singular value decomposition as A = U*SIGMA*VT W is used as temp working memory diff --git a/Source/Math/CPUMatrix.h b/Source/Math/CPUMatrix.h index 549a933cd..4566c01ab 100755 --- a/Source/Math/CPUMatrix.h +++ b/Source/Math/CPUMatrix.h @@ -412,6 +412,8 @@ public: static void Multiply(const CPUMatrix& a, const CPUMatrix& b, CPUMatrix& c); static void Multiply1x1AndWeightedAdd(ElemType alpha, const CPUMatrix& a, const CPUMatrix& b, ElemType beta, CPUMatrix& c); + static void ColumnwiseScaleAndWeightedAdd(ElemType alpha, const CPUMatrix& a, const CPUMatrix& v, ElemType beta, CPUMatrix& c); + static void ScaleAndAdd(ElemType alpha, const CPUMatrix& a, CPUMatrix& c); static void AddScaledDifference(const ElemType alpha, const CPUMatrix& a, const CPUMatrix& b, CPUMatrix& c); static void AssignScaledDifference(const ElemType alpha, const CPUMatrix& a, const CPUMatrix& b, CPUMatrix& c); diff --git a/Source/Math/CPUSparseMatrix.cpp b/Source/Math/CPUSparseMatrix.cpp index 9ac662a97..286ac240e 100644 --- a/Source/Math/CPUSparseMatrix.cpp +++ b/Source/Math/CPUSparseMatrix.cpp @@ -1116,6 +1116,45 @@ void CPUSparseMatrix::MultiplyAndAdd(ElemType alpha, const CPUMatrix +void CPUSparseMatrix::ColumnwiseScaleAndWeightedAdd(ElemType alpha, const CPUSparseMatrix& a, const CPUMatrix& v, ElemType beta, CPUMatrix& c) +{ + if (v.GetNumRows() != 1 && v.GetNumCols() != 1) + InvalidArgument("the argument v must be a vector"); // v is a vector + + if (a.GetFormat() != matrixFormatSparseCSC) + NOT_IMPLEMENTED; + + if (beta == 0) + { + c.RequireSize(a.GetNumRows(), a.GetNumCols()); + c.SetValue((ElemType)0); + } + else + c.VerifySize(a.GetNumRows(), a.GetNumCols()); // Can't resize if beta != 0 + + const ElemType* vd = v.Data(); + +#pragma omp parallel for + for (long col = 0; col < (long)a.GetNumCols(); col++) + { + auto start = a.SecondaryIndexLocation()[col]; + auto end = a.SecondaryIndexLocation()[col + 1]; + + for (auto p = start; p < end; p++) + { + auto row = a.MajorIndexLocation()[p]; + ElemType val = a.Buffer()[p]; + + if (beta == 0) // don't even read the memory if beta is 0 + c(row, col) = alpha * vd[col] * val; + else + c(row, col) = alpha * vd[col] * val + beta * c(row, col); + } + } +} + // dense += sparse template void CPUSparseMatrix::ScaleAndAdd(const ElemType alpha, const CPUSparseMatrix& lhs, CPUMatrix& rhs) diff --git a/Source/Math/CPUSparseMatrix.h b/Source/Math/CPUSparseMatrix.h index 6dce95b97..490418749 100644 --- a/Source/Math/CPUSparseMatrix.h +++ b/Source/Math/CPUSparseMatrix.h @@ -132,6 +132,8 @@ public: static void MultiplyAndAdd(ElemType alpha, const CPUMatrix& lhs, const bool transposeA, const CPUSparseMatrix& rhs, const bool transposeB, CPUSparseMatrix& c); + static void ColumnwiseScaleAndWeightedAdd(ElemType alpha, const CPUSparseMatrix& a, const CPUMatrix& v, ElemType beta, CPUMatrix& c); + static void ScaleAndAdd(const ElemType alpha, const CPUSparseMatrix& lhs, CPUMatrix& c); static bool AreEqual(const CPUSparseMatrix& a, const CPUSparseMatrix& b, const ElemType threshold = 1e-8); diff --git a/Source/Math/GPUMatrix.cu b/Source/Math/GPUMatrix.cu index ae272337d..59279f809 100755 --- a/Source/Math/GPUMatrix.cu +++ b/Source/Math/GPUMatrix.cu @@ -3487,6 +3487,22 @@ void GPUMatrix::Multiply(const GPUMatrix& a, const GPUMatrix return GPUMatrix::MultiplyAndWeightedAdd(1, a, false, b, false, 0, c); } +template +void GPUMatrix::ColumnwiseScaleAndWeightedAdd(ElemType alpha, const GPUMatrix& a, const GPUMatrix& v, ElemType beta, GPUMatrix& c) +{ + if (v.GetNumRows() != 1 && v.GetNumCols() != 1) + InvalidArgument("the argument v must be a vector"); // v is a vector + + if (beta == 0) + c.RequireSize(a.GetNumRows(), a.GetNumCols()); + else + c.VerifySize(a.GetNumRows(), a.GetNumCols()); // Can't resize if beta != 0 + + int blocksPerGrid = (int)ceil(1.0 * c.GetNumElements() / GridDim::maxThreadsPerBlock); + SyncGuard syncGuard; + _columnwiseScaleAndWeightedAdd<<>>(alpha, a.Data(), v.Data(), beta, c.Data(), a.GetNumRows(), a.GetNumCols()); +} + /// Matrix-scalar multiply with col-major matrices: c = alpha * a + c /// if a is a column vector, add to all columns of c /// if a is a row vector, add to all rows of c diff --git a/Source/Math/GPUMatrix.h b/Source/Math/GPUMatrix.h index 2b788c550..60bb442f1 100755 --- a/Source/Math/GPUMatrix.h +++ b/Source/Math/GPUMatrix.h @@ -521,6 +521,8 @@ public: static void Multiply(const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c); static void Multiply1x1AndWeightedAdd(ElemType alpha, const GPUMatrix& a, const GPUMatrix& b, ElemType beta, GPUMatrix& c); + static void ColumnwiseScaleAndWeightedAdd(ElemType alpha, const GPUMatrix& a, const GPUMatrix& v, ElemType beta, GPUMatrix& c); + static void ScaleAndAdd(ElemType alpha, const GPUMatrix& a, GPUMatrix& c); static void ScaleAndAdd(ElemType alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c); static void AddScaledDifference(const ElemType alpha, const GPUMatrix& a, const GPUMatrix& b, GPUMatrix& c); diff --git a/Source/Math/GPUMatrixCUDAKernels.cuh b/Source/Math/GPUMatrixCUDAKernels.cuh index 46809562c..03269387f 100755 --- a/Source/Math/GPUMatrixCUDAKernels.cuh +++ b/Source/Math/GPUMatrixCUDAKernels.cuh @@ -3136,6 +3136,55 @@ __global__ void _dense1DConvMultSparseCSCTransposeAndAddToDense( } } +template +__global__ void _columnwiseScaleAndWeightedAdd( + ElemType alpha, + const ElemType* aData, + const ElemType* vData, + ElemType beta, + ElemType* cData, + int m, int n) +{ + CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x; + if (id >= m * n) + return; + + CUDA_LONG col = id / m; + + if (beta == 0) // don't even read the memory if beta is 0 + cData[id] = alpha * vData[col] * aData[id]; + else + cData[id] = alpha * vData[col] * aData[id] + beta * cData[id]; +} + +template +__global__ void _columnwiseScaleAndWeightedAdd4CSC( + ElemType alpha, + const ElemType* aData, const GPUSPARSE_INDEX_TYPE* aSecondaryIndices, const GPUSPARSE_INDEX_TYPE* aMajorIndices, + const ElemType* vData, + ElemType beta, + ElemType* cData, + int m, int n) +{ + CUDA_LONG col = blockDim.x * blockIdx.x + threadIdx.x; + if (col >= n) + return; + + GPUSPARSE_INDEX_TYPE start = aSecondaryIndices[col]; + GPUSPARSE_INDEX_TYPE end = aSecondaryIndices[col + 1]; + + for (GPUSPARSE_INDEX_TYPE p = start; p < end; p++) + { + GPUSPARSE_INDEX_TYPE row = aMajorIndices[p]; + ElemType val = aData[p]; + + if (beta == 0) // don't even read the memory if beta is 0 + cData[IDX2C(row, col, m)] = alpha * vData[col] * val; + else + cData[IDX2C(row, col, m)] = alpha * vData[col] * val + beta * cData[IDX2C(row, col, m)]; + } +} + template __global__ void _reshape( const int oldNumRows, // old row count diff --git a/Source/Math/GPUSparseMatrix.cu b/Source/Math/GPUSparseMatrix.cu index 28eb04145..a5ba531aa 100755 --- a/Source/Math/GPUSparseMatrix.cu +++ b/Source/Math/GPUSparseMatrix.cu @@ -1189,6 +1189,35 @@ void GPUSparseMatrix::ConvolveAndWeightedAdd(ElemType alpha, const GPU } } +// c[:,j] = alpha * v[j] * a[:,j] + beta * c[:,j] +template +void GPUSparseMatrix::ColumnwiseScaleAndWeightedAdd(ElemType alpha, const GPUSparseMatrix& a, const GPUMatrix& v, ElemType beta, GPUMatrix& c) +{ + if (v.GetNumRows() != 1 && v.GetNumCols() != 1) + InvalidArgument("the argument v must be a vector"); // v is a vector + + if (a.GetFormat() != matrixFormatSparseCSC) + NOT_IMPLEMENTED; + + if (beta == 0) + { + c.RequireSize(a.GetNumRows(), a.GetNumCols()); + c.SetValue((ElemType)0); + } + else + c.VerifySize(a.GetNumRows(), a.GetNumCols()); // Can't resize if beta != 0 + + int blocksPerGrid = (int)ceil(1.0 * a.GetNumCols() / GridDim::maxThreadsPerBlock); + SyncGuard syncGuard; + _columnwiseScaleAndWeightedAdd4CSC<<>>( + alpha, + a.Data(), a.SecondaryIndexLocation(), a.MajorIndexLocation(), + v.Data(), + beta, + c.Data(), + a.GetNumRows(), a.GetNumCols()); +} + template void GPUSparseMatrix::TensorShuffleScaleAndAdd(ElemType keepWeight, const GPUSparseMatrix& a, size_t D, size_t S, size_t M, size_t K, size_t T, ElemType scaleFactor, const GPUSparseMatrix& b, GPUSparseMatrix& c) @@ -3060,6 +3089,7 @@ template GPUMatrix GPUSparseMatrix::CopyColumnSliceToDense(size_t, s template GPUSparseMatrix& GPUSparseMatrix::operator=(GPUSparseMatrix&&); template void GPUSparseMatrix::Reshape(const size_t, const size_t); template void GPUSparseMatrix::ScaleAndAdd(char, GPUSparseMatrix const &, GPUMatrix &); +template void GPUSparseMatrix::ColumnwiseScaleAndWeightedAdd(char, const GPUSparseMatrix&, const GPUMatrix&, char, GPUMatrix&); // Support template GPUSparseMatrix::GPUSparseMatrix(DEVICEID_TYPE, const MatrixFormat); @@ -3084,6 +3114,7 @@ template GPUMatrix GPUSparseMatrix::CopyColumnSliceToDense(size_t, template GPUSparseMatrix& GPUSparseMatrix::operator=(GPUSparseMatrix&&); template void GPUSparseMatrix::Reshape(const size_t, const size_t); template void GPUSparseMatrix::ScaleAndAdd(short, GPUSparseMatrix const &, GPUMatrix &); +template void GPUSparseMatrix::ColumnwiseScaleAndWeightedAdd(short, const GPUSparseMatrix&, const GPUMatrix&, short, GPUMatrix&); template GPUSparseMatrix::GPUSparseMatrix(DEVICEID_TYPE, const MatrixFormat); template GPUSparseMatrix::~GPUSparseMatrix(); diff --git a/Source/Math/GPUSparseMatrix.h b/Source/Math/GPUSparseMatrix.h index b1ce0472c..7e38855da 100755 --- a/Source/Math/GPUSparseMatrix.h +++ b/Source/Math/GPUSparseMatrix.h @@ -409,6 +409,9 @@ public: const bool transposeD, ElemType beta, GPUMatrix& C); static void MultiplyAndAdd(ElemType alpha, const GPUMatrix& lhs, const bool transposeA, const GPUSparseMatrix& rhs, const bool transposeB, GPUSparseMatrix& c); + + static void ColumnwiseScaleAndWeightedAdd(ElemType alpha, const GPUSparseMatrix& a, const GPUMatrix& v, ElemType beta, GPUMatrix& c); + static void ScaleAndAdd(const ElemType alpha, const GPUSparseMatrix& lhs, GPUMatrix& c); static void ConvolveAndWeightedAdd(ElemType alpha, const GPUMatrix& lhs, const bool transposeA, const GPUSparseMatrix& rhs, const bool transposeB, ElemType beta, GPUMatrix& c, size_t numChannels, size_t horizontalSubsample, bool padding, bool channelwise); diff --git a/Source/Math/Matrix.cpp b/Source/Math/Matrix.cpp index 6f7572c51..163cc712e 100755 --- a/Source/Math/Matrix.cpp +++ b/Source/Math/Matrix.cpp @@ -4954,6 +4954,25 @@ void Matrix::ConvolveAndWeightedAdd(ElemType alpha, const MatrixColumnwise scale with col-major matrix and accumulate. +/// Scalar +/// Input matrix +/// Input scale vector for each column of a +/// Scalar +/// Resulting matrix, the same shape as a +template +void Matrix::ColumnwiseScaleAndWeightedAdd(ElemType alpha, const Matrix& a, const Matrix& v, ElemType beta, Matrix& c) +{ + DecideAndMoveToRightDevice(a, v, c); + + DISPATCH_MATRIX_ON_FLAG(&a, + nullptr, + CPUMatrix::ColumnwiseScaleAndWeightedAdd(alpha, *a.m_CPUMatrix, *v.m_CPUMatrix, beta, *c.m_CPUMatrix), + GPUMatrix::ColumnwiseScaleAndWeightedAdd(alpha, *a.m_GPUMatrix, *v.m_GPUMatrix, beta, *c.m_GPUMatrix), + CPUSparseMatrix::ColumnwiseScaleAndWeightedAdd(alpha, *a.m_CPUSparseMatrix, *v.m_CPUMatrix, beta, *c.m_CPUMatrix), + GPUSparseMatrix::ColumnwiseScaleAndWeightedAdd(alpha, *a.m_GPUSparseMatrix, *v.m_GPUMatrix, beta, *c.m_GPUMatrix)); +} + /// Matrix-scalar multiply with col-major matrices: c = alpha * a + c /// if a is a column vector, add to all columns of c /// if a is a row vector, add to all rows of c diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h index 8065f197c..118719df6 100755 --- a/Source/Math/Matrix.h +++ b/Source/Math/Matrix.h @@ -568,6 +568,8 @@ public: static void Multiply1x1AndWeightedAdd(ElemType alpha, const Matrix& a, const Matrix& b, ElemType beta, Matrix& c); static void ConvolveAndWeightedAdd(ElemType alpha, const Matrix& a, const bool transposeA, const Matrix& b, const bool transposeB, ElemType beta, Matrix& c, size_t numChannels, size_t horizontalSubsample, bool padding, bool channelwise); + static void ColumnwiseScaleAndWeightedAdd(ElemType alpha, const Matrix& a, const Matrix& v, ElemType beta, Matrix& c); + static void ScaleAndAdd(ElemType alpha, const Matrix& a, Matrix& c); static void ScaleAndAdd(ElemType alpha, const Matrix& a, ElemType beta, Matrix& c); static void AddScaledDifference(const ElemType alpha, const Matrix& a, const Matrix& b, Matrix& c); diff --git a/Source/Math/NoGPU.cpp b/Source/Math/NoGPU.cpp index a92e0c6e1..72bdbfe17 100755 --- a/Source/Math/NoGPU.cpp +++ b/Source/Math/NoGPU.cpp @@ -233,6 +233,11 @@ void GPUSparseMatrix::MultiplyAndAdd(ElemType alpha, const GPUMatrix +void GPUSparseMatrix::ColumnwiseScaleAndWeightedAdd(ElemType alpha, const GPUSparseMatrix& a, const GPUMatrix& v, ElemType beta, GPUMatrix& c) +{ +} + // used for gradients udpate template void GPUSparseMatrix::ScaleAndAdd(const ElemType alpha, const GPUSparseMatrix& lhs, GPUMatrix& rhs) @@ -1952,6 +1957,11 @@ void GPUMatrix::Multiply(const GPUMatrix& /*a*/, const GPUMa { } +template +void GPUMatrix::ColumnwiseScaleAndWeightedAdd(ElemType alpha, const GPUMatrix& a, const GPUMatrix& v, ElemType beta, GPUMatrix& c) +{ +} + /// Matrix-scalar multiply with col-major matrices: c = alpha * a + c /// if a is a column vector, add to all columns of c /// if a is a row vector, add to all rows of c diff --git a/Tests/EndToEndTests/UnitTests/MathTests/baseline.txt b/Tests/EndToEndTests/UnitTests/MathTests/baseline.txt index 3b61735aa..0ee2f737d 100644 --- a/Tests/EndToEndTests/UnitTests/MathTests/baseline.txt +++ b/Tests/EndToEndTests/UnitTests/MathTests/baseline.txt @@ -234,8 +234,8 @@ Test module "MathTests" has passed with: 770 assertions out of 770 passed Test suite "CPUMatrixSuite" has passed with: - 35 test cases out of 35 passed - 5248724 assertions out of 5248724 passed + 36 test cases out of 36 passed + 5248732 assertions out of 5248732 passed Test case "CPUMatrixSuite/CPUMatrixConstructorNoFlags" has passed with: 8 assertions out of 8 passed @@ -318,6 +318,9 @@ Test module "MathTests" has passed with: Test case "CPUMatrixSuite/MatrixMultiplyAndPlusAndMinus" has passed with: 1050120 assertions out of 1050120 passed + Test case "CPUMatrixSuite/MatrixColumnwiseScaleAndWeightedAdd" has passed with: + 8 assertions out of 8 passed + Test case "CPUMatrixSuite/MatrixScaleAndAdd" has passed with: 1572864 assertions out of 1572864 passed diff --git a/Tests/UnitTests/MathTests/MatrixBlasTests.cpp b/Tests/UnitTests/MathTests/MatrixBlasTests.cpp index d732a4d19..285059477 100644 --- a/Tests/UnitTests/MathTests/MatrixBlasTests.cpp +++ b/Tests/UnitTests/MathTests/MatrixBlasTests.cpp @@ -119,6 +119,37 @@ BOOST_FIXTURE_TEST_CASE(MatrixMultiplyAndPlusAndMinus, RandomSeedFixture) } } +BOOST_FIXTURE_TEST_CASE(MatrixColumnwiseScaleAndWeightedAdd, RandomSeedFixture) +{ + size_t m = 256; + size_t n = 64; + for(int deviceId : {-1, 0}) + { + SingleMatrix singleMatrixA(deviceId); + singleMatrixA.AssignTruncateBottomOf(SingleMatrix::RandomUniform(m, n, deviceId, -200, 1, IncrementCounter()), 0); + const SingleMatrix singleMatrixB = SingleMatrix::RandomUniform(n, 1, deviceId, 0, 1, IncrementCounter()); + SingleMatrix singleMatrixAcsc = singleMatrixA.DeepClone(); + singleMatrixAcsc.SwitchToMatrixType(SPARSE, matrixFormatSparseCSC, true); + + SingleMatrix singleMatrixCexpected = SingleMatrix::RandomUniform(m, n, deviceId, 0, 1, IncrementCounter()); + SingleMatrix singleMatrixCdense = singleMatrixCexpected.DeepClone(); + SingleMatrix singleMatrixCsparse = singleMatrixCexpected.DeepClone(); + + SingleMatrix singleMatrixBdiag(n, n, deviceId); + singleMatrixBdiag.SetValue(0); + singleMatrixBdiag.SetDiagonalValue(singleMatrixB); + for(float beta : {0.0f, 1.0f}) + { + SingleMatrix::MultiplyAndWeightedAdd(1, singleMatrixA, false, singleMatrixBdiag, false, beta, singleMatrixCexpected); + SingleMatrix::ColumnwiseScaleAndWeightedAdd(1, singleMatrixA, singleMatrixB, beta, singleMatrixCdense); + SingleMatrix::ColumnwiseScaleAndWeightedAdd(1, singleMatrixAcsc, singleMatrixB, beta, singleMatrixCsparse); + + BOOST_CHECK(singleMatrixCexpected.IsEqualTo(singleMatrixCdense, c_epsilonFloatE4)); + BOOST_CHECK(singleMatrixCexpected.IsEqualTo(singleMatrixCsparse, c_epsilonFloatE4)); + } + } +} + BOOST_FIXTURE_TEST_CASE(MatrixScaleAndAdd, RandomSeedFixture) { std::mt19937 rng(0);