[CNTK core] sparse GPU impl of FSAdagrad and RmsProp

Also fix a bug in dense GPU MultiplyAndWeightedAdd where garbage data could get in via resize
KeDengMS 2017-01-12 21:26:56 -08:00
Parent 3063238829
Commit 4505c48fd0
11 changed files with 389 additions and 17 deletions

View file

@@ -1186,6 +1186,7 @@ UNITTEST_MATH_SRC = \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixQuantizerTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixSparseDenseInteractionsTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixLearnerTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/stdafx.cpp \
UNITTEST_MATH_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_MATH_SRC))

View file

@@ -3370,15 +3370,16 @@ void GPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const GPUMatrix
int k = int(transposeA ? a.m_numRows : a.m_numCols);
int l = int(transposeB ? b.m_numCols : b.m_numRows);
-c.RequireSize(m, n);
+if (beta == 0)
+    c.RequireSize(m, n);
+else
+    c.VerifySize(m, n); // Can't resize if beta != 0
if (!(m > 0 && k > 0 && l > 0 && n > 0))
RuntimeError("!(m>0 && k>0 && l>0 && n>0)"); // converting from size_t to int may cause overflow
if (k != l)
RuntimeError("matrix dim mismatch in MultiplyAndWeightedAdd");
CUBLAS_CALL(cublas_gemm(cuHandle, transA, transB, m, n, k, &alpha, a.Data(), (int) a.m_numRows, b.Data(), (int) b.m_numRows, &beta, c.Data(), (int) c.m_numRows));
-c.m_numRows = m;
-c.m_numCols = n;
}
template <class ElemType>

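Why the guard matters: cublas_gemm computes C = alpha * op(A) * op(B) + beta * C, so C is read as an input whenever beta != 0; resizing it first may reallocate and hand uninitialized device memory to the accumulation. A minimal sketch of the corrected pre-condition (hypothetical standalone helper, not part of the CNTK API):

template <class MatrixType>
void PrepareGemmOutput(MatrixType& c, size_t m, size_t n, double beta)
{
    if (beta == 0)
        c.RequireSize(m, n); // C is write-only: (re)allocating is safe
    else
        c.VerifySize(m, n);  // C is read-modify-write: a resize could feed garbage
                             // into the beta * C term, the bug this commit fixes
}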
View file

@@ -41,6 +41,10 @@
#define CUDA_LONG int32_t
#endif
+// special markers in BlockId2ColOrRow()/ColOrRow2BlockId()
+static const GPUSPARSE_INDEX_TYPE Id_NotAssigned = -1;
+static const GPUSPARSE_INDEX_TYPE Id_Pending = INT_MAX;
#define IDX2C(i, j, ld) (((j) * (ld)) + (i)) // 0 based indexing
// On older GPUs, CUDA atomicAdd() only exists for 'float'. This is the 'double' version.
@@ -1454,6 +1458,68 @@ __global__ void _fsadagrad(CUDA_LONG size, ElemType* grad, ElemType* smoothAda,
}
}
template<class ElemType>
inline __device__ ElemType _getvalue4BlockSparseCol(ElemType* v, const GPUSPARSE_INDEX_TYPE* colOrRow2blockId, const size_t len, CUDA_LONG idx)
{
CUDA_LONG col = idx / len;
CUDA_LONG row = idx - col * len;
CUDA_LONG blockid = colOrRow2blockId[col];
return (blockid == Id_NotAssigned) ? 0 : v[blockid * len + row];
}
template<class ElemType>
inline __device__ void _scalevalue4BlockSparseCol(ElemType* v, const GPUSPARSE_INDEX_TYPE* colOrRow2blockId, const size_t len, CUDA_LONG idx, ElemType s)
{
CUDA_LONG col = idx / len;
CUDA_LONG row = idx - col * len;
CUDA_LONG blockid = colOrRow2blockId[col];
if (blockid != Id_NotAssigned)
{
v[blockid * len + row] *= s;
}
}
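// Worked example of the SparseBlockCol index math above (hypothetical 3-column matrix,
// len = 2 rows, only columns 0 and 2 populated): the format stores just the nonzero
// columns as dense blocks, and colOrRow2blockId maps a logical column to its block slot.
//     colOrRow2blockId = { 0, Id_NotAssigned, 1 }
//     v = { a0, a1,  c0, c1 }   // block 0 holds column 0, block 1 holds column 2
// Element (row = 1, col = 2) has dense index idx = 2 * len + 1, so col = idx / len = 2,
// blockid = 1, and the value read is v[1 * len + 1] = c1; any idx in column 1 yields 0.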
template <class ElemType>
__global__ void _fsadagrad4BlockSparseCol(CUDA_LONG size,
ElemType* grad_bsc, const GPUSPARSE_INDEX_TYPE* colOrRow2blockId, const size_t len,
ElemType* smoothAda, ElemType* smoothMom, ElemType* val,
ElemType lr, ElemType mom, ElemType adaWeight, ElemType adaMul)
{
CUDA_LONG idx = blockIdx.x * blockDim.x + threadIdx.x;
CUDA_LONG stride = blockDim.x * gridDim.x;
for (; idx < size; idx += stride)
{
ElemType g = _getvalue4BlockSparseCol(grad_bsc, colOrRow2blockId, len, idx);
ElemType adaSqr = adaWeight * smoothAda[idx] + (1.0f - adaWeight) * g * g;
smoothAda[idx] = adaSqr;
if (adaSqr != 0.0f)
{
ElemType w;
if (sizeof(ElemType) == sizeof(double))
{
w = adaMul * rsqrt(adaSqr);
}
else
{
w = adaMul * rsqrtf(adaSqr);
}
if (w > 10.0f)
w = 10.0f;
g *= w;
}
if (mom > 0.0f)
{
g = mom * smoothMom[idx] + (1.0f - mom) * g;
smoothMom[idx] = g;
}
g *= lr;
val[idx] -= g;
}
}
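// Per element i, with g gathered through the block-sparse mapping, this kernel applies
// the same FSAdagrad recurrences as the dense _fsadagrad kernel above it:
//     smoothAda[i] <- adaWeight * smoothAda[i] + (1 - adaWeight) * g^2
//     g            <- g * min(adaMul / sqrt(smoothAda[i]), 10)   (skipped when smoothAda[i] == 0)
//     smoothMom[i] <- mom * smoothMom[i] + (1 - mom) * g         (only when mom > 0)
//     val[i]       <- val[i] - lr * g
// Only the gradient read is sparse; the state (smoothAda, smoothMom) and val stay dense.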
template <class ElemType>
__global__ void _rmsprop_init(
ElemType* avars, ElemType* signs, ElemType* steps,
@@ -1470,6 +1536,23 @@ __global__ void _rmsprop_init(
steps[i] = ElemType(0.02);
}
template <class ElemType>
__global__ void _rmsprop_init4BlockSparseCol(
ElemType* avars, ElemType* signs, ElemType* steps,
ElemType* curr_grad, const GPUSPARSE_INDEX_TYPE* colOrRow2blockId, const size_t len,
const CUDA_LONG N)
{
CUDA_LONG i = blockDim.x * blockIdx.x + threadIdx.x;
if (i >= N)
return;
ElemType tmp = _getvalue4BlockSparseCol(curr_grad, colOrRow2blockId, len, i);
avars[i] = tmp * tmp;
signs[i] = ElemType(0.0);
steps[i] = ElemType(0.02);
}
template <class ElemType>
__global__ void _rmsprop(
ElemType* avars, ElemType* signs, ElemType* steps,
@@ -1523,6 +1606,61 @@ __global__ void _rmsprop(
multipliers[i] = temp;
}
template <class ElemType>
__global__ void _rmsprop4BlockSparseCol(
ElemType* avars, ElemType* signs, ElemType* steps,
ElemType* grad_bsc, const GPUSPARSE_INDEX_TYPE* colOrRow2blockId, const size_t len,
const CUDA_LONG N,
ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN,
ElemType floor,
ElemType* upd_gpu,
ElemType* multipliers)
{
CUDA_LONG i = blockDim.x * blockIdx.x + threadIdx.x;
if (i >= N)
return;
ElemType g = _getvalue4BlockSparseCol(grad_bsc, colOrRow2blockId, len, i);
avars[i] = RMS_GAMMA * avars[i] + (ElemType(1.0) - RMS_GAMMA) * (g * g);
// // grad sign base 3: 0->neg, 1->zero, 2->pos
// const int grad_sign = 1 + (ElemType(0) < curr_grad[i]) - (curr_grad[i] < ElemType(0));
// // signs[i] contains three consecutive grad_sign
// signs[i] = 3*(int(signs[i]) % 9) + grad_sign;
// // update according to the following table:
// // (!pos,!pos,!pos) or (!neg,!neg,!neg): RMS_WGT_INC
// // (!neg,!neg,neg) or (!pos,!pos,pos): RMS_WGT_DEC
// // otherwise: no action
// switch(int(upd_gpu[int(signs[i])]))
// {
// case 0:
// steps[i] = max(steps[i] * RMS_WGT_DEC, RMS_WGT_MIN);
// break;
// case 2:
// steps[i] = min(steps[i] * RMS_WGT_INC, RMS_WGT_MAX);
// break;
// }
// curr_grad[i] *= steps[i] / sqrt(avars[i] + floor);
const int grad_sign = (ElemType(0) < g) - (g < ElemType(0));
if (signs[i] * grad_sign > 0)
steps[i] = min(steps[i] * RMS_WGT_INC, RMS_WGT_MAX);
else
steps[i] = max(steps[i] * RMS_WGT_DEC, RMS_WGT_MIN);
ElemType temp = steps[i] / sqrt(avars[i] + floor);
_scalevalue4BlockSparseCol(grad_bsc, colOrRow2blockId, len, i, temp);
signs[i] = grad_sign;
if (multipliers != nullptr)
multipliers[i] = temp;
}
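// The live path above is a sign-agreement (Rprop-style) rule: each weight keeps its own
// step size, grown when the gradient sign repeats and shrunk when it flips. The commented
// block preserves an older scheme that encoded three consecutive gradient signs in base 3
// and consulted the upd_gpu lookup table, mirroring the dense _rmsprop kernel. In formulas:
//     avars[i] <- RMS_GAMMA * avars[i] + (1 - RMS_GAMMA) * g^2
//     steps[i] <- min(steps[i] * RMS_WGT_INC, RMS_WGT_MAX)   if sign(g) * signs[i] > 0
//                 max(steps[i] * RMS_WGT_DEC, RMS_WGT_MIN)   otherwise
//     g        <- g * steps[i] / sqrt(avars[i] + floor)      (scaled in place in grad_bsc)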
template <class ElemType>
__global__ void _rescaleToRange(
ElemType* a,
@@ -3009,10 +3147,6 @@ __global__ void _reshape(
newColumnIndex[newNumCols] = oldColumnIndex[oldNumCols]; // set end pointer
}
-// special markers in BlockId2ColOrRow()/ColOrRow2BlockId()
-static const GPUSPARSE_INDEX_TYPE Id_NotAssigned = -1;
-static const GPUSPARSE_INDEX_TYPE Id_Pending = INT_MAX;
//called before _determineBlockIds and _denseMulSparseCSCTransposeToSparseBlockCol to determine which columns have values and
//what's the mapping from the column id in the resulted SparseBlockCol format to the column id in the dense format
//input: rowIndexes: the row indexes of the CSC sparse matrix to be multiplied with

View file

@@ -1505,6 +1505,130 @@ ElemType GPUSparseMatrix<ElemType>::Adagrad(GPUMatrix<ElemType>& c, const bool n
}
}
template <class ElemType>
void GPUSparseMatrix<ElemType>::FSAdagrad(
GPUMatrix<ElemType>& c,
GPUMatrix<ElemType>& functionValues,
ElemType learnRatePerSample,
ElemType momentum,
ElemType adaWeight,
ElemType adaMul)
{
if (GetFormat() != MatrixFormat::matrixFormatSparseBlockCol)
{
NOT_IMPLEMENTED;
}
size_t numColsNeeded = 2 * GetNumCols();
if (c.IsEmpty() || (c.GetNumCols() < numColsNeeded))
{
c.RequireSize(GetNumRows(), numColsNeeded);
c.SetValue(0.0);
}
assert((c.GetNumRows() == GetNumRows()) && (c.GetNumCols() == numColsNeeded));
size_t n = GetNumElements();
int blocksPerGrid = (n + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock;
_fsadagrad4BlockSparseCol<ElemType> << <blocksPerGrid, GridDim::maxThreadsPerBlock >> >(
n, Data(), ColOrRow2BlockId(), GetNumRows(),
c.Data(), c.Data() + n, functionValues.Data(),
learnRatePerSample, momentum, adaWeight, adaMul);
}
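// Layout of the smoothed-gradient buffer c prepared above, with n = GetNumElements():
//     [c.Data(), c.Data() + n)       smoothAda - running average of the squared gradient
//     [c.Data() + n, c.Data() + 2n)  smoothMom - momentum-smoothed gradient
// This is why numColsNeeded = 2 * GetNumCols(): both state tensors share one dense matrix
// of the parameter's shape, regardless of the gradient's sparsity.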
template <class ElemType>
ElemType GPUSparseMatrix<ElemType>::RmsProp(GPUMatrix<ElemType>& c,
ElemType RMS_GAMMA,
ElemType RMS_WGT_INC,
ElemType RMS_WGT_MAX,
ElemType RMS_WGT_DEC,
ElemType RMS_WGT_MIN,
const bool needAveMultiplier)
{
if (GetFormat() != MatrixFormat::matrixFormatSparseBlockCol)
{
NOT_IMPLEMENTED;
}
const ElemType floor = 1e-6f;
static ElemType* upd_gpu = (ElemType*)0;
size_t n = GetNumElements();
int blocksPerGrid = (c.GetNumElements() + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock;
size_t numColsNeeded = GetNumCols() * 3;
if (needAveMultiplier)
numColsNeeded += GetNumCols();
if (c.IsEmpty() || c.GetNumCols() < numColsNeeded)
{
c.RequireSize(GetNumRows(), numColsNeeded);
c.SetValue(0.0);
ElemType* avars = c.Data(); // accumulated variances for RMS scaling
ElemType* signs = c.Data() + n; // sign of previous gradient
ElemType* steps = c.Data() + 2 * n; // current step size
// Data()+3*n is temp memory used to store multipliers, no need to initialize
_rmsprop_init4BlockSparseCol<ElemType> << <blocksPerGrid, GridDim::maxThreadsPerBlock >> >(
avars, signs, steps,
Data(), ColOrRow2BlockId(), GetNumRows(),
n);
}
assert(c.GetNumRows() == GetNumRows() && c.GetNumCols() == numColsNeeded);
ElemType* avars = c.Data(); // accumulated variances for RMS scaling
ElemType* signs = c.Data() + n; // sign of previous gradient
ElemType* steps = c.Data() + 2 * n; // current step size
ElemType* multipliers = nullptr;
if (needAveMultiplier)
multipliers = c.Data() + 3 * n; // temp memory used to store multipliers,
if (!upd_gpu)
{
const ElemType upd[] = {
2, 2, 0,
2, 2, 0,
1, 1, 1,
2, 2, 0,
1, 2, 1,
0, 2, 2,
1, 1, 1,
0, 2, 2,
0, 2, 2,
};
upd_gpu = TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), 27);
CUDA_CALL(cudaMemcpy(upd_gpu, upd, sizeof(ElemType) * _countof(upd), cudaMemcpyHostToDevice));
}
_rmsprop4BlockSparseCol<ElemType> << <blocksPerGrid, GridDim::maxThreadsPerBlock >> >(
avars, signs, steps,
Data(), ColOrRow2BlockId(), GetNumRows(),
n,
RMS_GAMMA, RMS_WGT_INC, RMS_WGT_MAX, RMS_WGT_DEC, RMS_WGT_MIN,
floor, upd_gpu, multipliers);
if (!needAveMultiplier)
return 1;
cublasHandle_t cuHandle = GPUMatrix<ElemType>::GetCublasHandle(GetComputeDeviceId());
if (sizeof(ElemType) == sizeof(float))
{
float aveMultiplier = 0;
CUBLAS_CALL(cublasSasum(cuHandle, (CUDA_LONG)n, reinterpret_cast<float*>(multipliers), 1, &aveMultiplier));
return aveMultiplier / n;
}
else
{
double aveMultiplier = 0;
CUBLAS_CALL(cublasDasum(cuHandle, (CUDA_LONG)n, reinterpret_cast<double*>(multipliers), 1, &aveMultiplier));
return (ElemType)aveMultiplier / n;
}
}
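// Layout of c in RmsProp above, with n = GetNumElements():
//     [0, n)   avars - running average of the squared gradient
//     [n, 2n)  signs - sign of the previous gradient per weight
//     [2n, 3n) steps - per-weight adaptive step sizes
//     [3n, 4n) multipliers - steps / sqrt(avars + floor), allocated only when
//              needAveMultiplier is set
// The cublas{S,D}asum call sums |multipliers| (all non-negative here), so the return
// value is their mean, as in the dense GPUMatrix::RmsProp.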
// sparse X dense = dense
template <class ElemType>
void GPUSparseMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const GPUSparseMatrix<ElemType>& a, const bool transposeA,

View file

@@ -410,6 +410,8 @@ public:
void NormalGrad(GPUMatrix<ElemType>& c, const ElemType momentum);
ElemType Adagrad(GPUMatrix<ElemType>& c, const bool needAveMultiplier);
void FSAdagrad(GPUMatrix<ElemType>& c, GPUMatrix<ElemType>& functionValues, ElemType learnRatePerSample, ElemType momentum, ElemType adaWeight, ElemType adaMul);
ElemType RmsProp(GPUMatrix<ElemType>& c, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN, const bool needAveMultiplier);
static void Multiply(const GPUSparseMatrix<ElemType>& S, const GPUMatrix<ElemType>& D, GPUMatrix<ElemType>& C);
static void Multiply(const GPUMatrix<ElemType>& D, const GPUSparseMatrix<ElemType>& S, GPUMatrix<ElemType>& C);

View file

@@ -1585,11 +1585,13 @@ void Matrix<ElemType>::FSAdagradUpdate(size_t mbSize,
// - makes up for general scaling (targetAdagradAvDenom, a constant chosen by the user that should resemble the typical value range of gradients)
// - sqrt(1/#samples accumulated) to turn the sqr sum into an average
let targetAdagradAvDenom_x_sqrtAdagradSqrFrames = (ElemType)(targetAdagradAvDenom * sqrt(smoothedCount));
DISPATCH_MATRIX_ON_FLAG(&gradients, &gradients,
{ m_CPUMatrix->FSAdagrad(*gradients.m_CPUMatrix, *functionValues.m_CPUMatrix, (ElemType)learnRatePerSample, (ElemType)meanMomentum, (ElemType)varMomentum, targetAdagradAvDenom_x_sqrtAdagradSqrFrames); SetDataLocation(CPU); },
{ m_GPUMatrix->FSAdagrad(*gradients.m_GPUMatrix, *functionValues.m_GPUMatrix, (ElemType)learnRatePerSample, (ElemType)meanMomentum, (ElemType)varMomentum, targetAdagradAvDenom_x_sqrtAdagradSqrFrames); SetDataLocation(GPU); },
{ NOT_IMPLEMENTED; },
-{ NOT_IMPLEMENTED; });
+{ gradients.m_GPUSparseMatrix->FSAdagrad(*m_GPUMatrix, *functionValues.m_GPUMatrix, (ElemType)learnRatePerSample, (ElemType)meanMomentum, (ElemType)varMomentum, targetAdagradAvDenom_x_sqrtAdagradSqrFrames); SetDataLocation(GPU); });
// Note: Since both 'this' and gradients are changed, we must call SetDataLocation() on 'this' as well.
}
@@ -1604,11 +1606,11 @@ ElemType Matrix<ElemType>::RmsProp(Matrix<ElemType>& gradients,
{
DecideAndMoveToRightDevice(*this, gradients);
-DISPATCH_MATRIX_ON_FLAG(this, &gradients,
+DISPATCH_MATRIX_ON_FLAG(&gradients, &gradients,
{ return m_CPUMatrix->RmsProp(*gradients.m_CPUMatrix, RMS_GAMMA, RMS_WGT_INC, RMS_WGT_MAX, RMS_WGT_DEC, RMS_WGT_MIN, needAveMultiplier); SetDataLocation(CPU); },
{ return m_GPUMatrix->RmsProp(*gradients.m_GPUMatrix, RMS_GAMMA, RMS_WGT_INC, RMS_WGT_MAX, RMS_WGT_DEC, RMS_WGT_MIN, needAveMultiplier); SetDataLocation(GPU); },
{ NOT_IMPLEMENTED; },
-{ NOT_IMPLEMENTED; });
+{ return gradients.m_GPUSparseMatrix->RmsProp(*m_GPUMatrix, RMS_GAMMA, RMS_WGT_INC, RMS_WGT_MAX, RMS_WGT_DEC, RMS_WGT_MIN, needAveMultiplier); SetDataLocation(GPU); });
// Note: Since both 'this' and gradients are changed, we must call SetDataLocation() on 'this' as well.
}
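The key change in both dispatches is the first argument of DISPATCH_MATRIX_ON_FLAG: keying on &gradients instead of this routes the update by the gradient's storage type, so a block-sparse-column gradient now reaches the new GPUSparseMatrix implementations while the smoothed state and the parameters stay dense. A hedged sketch of a call that now works (shapes and hyperparameters are made up; argument names are my reading of the call site):

Matrix<float> smoothed(0); // dense smoothed-gradient state, GPU device 0
Matrix<float> params(0);   // dense model parameters
Matrix<float> grad(0);     // gradient, switched to block-sparse-column storage
grad.SwitchToMatrixType(MatrixType::SPARSE, matrixFormatSparseBlockCol, false);
double smoothedCount = 1000;
smoothed.FSAdagradUpdate(/*mbSize=*/128, grad, params, smoothedCount,
                         /*learnRatePerSample=*/0.0001, /*targetAdagradAvDenom=*/1.0,
                         /*meanMomentum=*/0.9, /*varMomentum=*/0.9);
// before this commit the SPARSE/GPU dispatch slot was NOT_IMPLEMENTED (SGD fell back to
// Adagrad for sparse gradients); now it routes to gradients.m_GPUSparseMatrix->FSAdagrad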

View file

@@ -255,8 +255,17 @@ ElemType GPUSparseMatrix<ElemType>::Adagrad(GPUMatrix<ElemType>& c, const bool n
{
return 1;
}
template<class ElemType>
void GPUSparseMatrix<ElemType>::FSAdagrad(GPUMatrix<ElemType>&, GPUMatrix<ElemType>&, ElemType, ElemType, ElemType, ElemType)
{
}
template<class ElemType>
ElemType GPUSparseMatrix<ElemType>::RmsProp(GPUMatrix<ElemType>&, ElemType, ElemType, ElemType, ElemType, ElemType, const bool)
{
return 1;
}
template <class ElemType>
void GPUSparseMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const GPUSparseMatrix<ElemType>& a, const bool transposeA,
@@ -1057,6 +1066,7 @@ ElemType GPUMatrix<ElemType>::Adagrad(GPUMatrix<ElemType>& gradients, const bool
{
return 0;
}
template <class ElemType>
void GPUMatrix<ElemType>::FSAdagrad(GPUMatrix<ElemType>& gradients, GPUMatrix<ElemType>& functionValues, ElemType learnRatePerSample, ElemType momentum, ElemType adaWeight, ElemType adaMul)
{

View file

@@ -2197,12 +2197,8 @@ void SGD<ElemType>::UpdateWeights(Matrix<ElemType>& functionValues, Matrix<ElemT
smoothedGradient.NormalGrad(gradientValues, functionValues,
(ElemType) learnRatePerSample, (ElemType) momentum, useNesterovMomentum);
}
-else if (adpType == GradientsUpdateType::AdaGrad ||
-    (adpType == GradientsUpdateType::RmsProp && gradientValues.GetMatrixType() == MatrixType::SPARSE) ||
-    (adpType == GradientsUpdateType::FSAdaGrad && gradientValues.GetMatrixType() == MatrixType::SPARSE))
+else if (adpType == GradientsUpdateType::AdaGrad)
{
-    // rmsprop for sparse is not implemented yet, delegate it with adagrad
double aveMultiplier = smoothedGradient.Adagrad(gradientValues, needAveMultiplier);
Matrix<ElemType>::ScaleAndAdd((ElemType)(-learnRatePerSample / aveMultiplier), gradientValues, functionValues);
}

View file

@@ -6318,6 +6318,16 @@ Test module "MathTests" has passed with:
Test case "GPUMatrixSuite/MatrixSparseElementWisePower" has passed with:
1 assertion out of 1 passed
Test suite "MatrixLearnerSuite" has passed with:
2 test cases out of 2 passed
4 assertions out of 4 passed
Test case "MatrixLearnerSuite/FSAdagradSparse" has passed with:
2 assertions out of 2 passed
Test case "MatrixLearnerSuite/RmsPropSparse" has passed with:
2 assertions out of 2 passed
Test suite "MatrixUnitTests" has passed with:
26 test cases out of 26 passed
4048755 assertions out of 4048755 passed

View file

@@ -138,6 +138,7 @@
<ClCompile Include="GPUMatrixCudaBlasTests.cpp" />
<ClCompile Include="GPUMatrixTests.cpp" />
<ClCompile Include="GPUSparseMatrixTests.cpp" />
<ClCompile Include="MatrixLearnerTests.cpp" />
<ClCompile Include="MatrixBlasTests.cpp" />
<ClCompile Include="MatrixDataSynchronizationTests.cpp" />
<ClCompile Include="MatrixFileWriteReadTests.cpp" />

View file

@@ -0,0 +1,91 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#include <math.h>
#ifdef _WIN32
#include <crtdefs.h>
#endif
#include "../../../Source/Math/Matrix.h"
#include "../../../Source/Math/CPUMatrix.h"
using namespace Microsoft::MSR::CNTK;
class MatrixLearnerFixture : public RandomSeedFixture
{
public:
static const size_t dim1 = 256;
static const size_t dim2 = 128;
static const size_t dim3 = 2048;
SingleMatrix matSG;
SingleMatrix matSGsparse;
SingleMatrix matM;
SingleMatrix matMsparse;
SingleMatrix matG;
SingleMatrix matGsparseBSC;
MatrixLearnerFixture() :
matSG(c_deviceIdZero),
matSGsparse(c_deviceIdZero),
matM(c_deviceIdZero),
matMsparse(c_deviceIdZero),
matG(c_deviceIdZero),
matGsparseBSC(c_deviceIdZero)
{
// smoothed gradient
matSG = SingleMatrix::RandomGaussian(dim1, dim2, c_deviceIdZero, -1.0f, 1.0f, IncrementCounter());
matSGsparse = SingleMatrix(matSG.DeepClone());
// model
matM = SingleMatrix::RandomGaussian(dim1, dim2, c_deviceIdZero, -1.0f, 1.0f, IncrementCounter());
matMsparse = SingleMatrix(matM.DeepClone());
// gradients, dense and block-sparse-column
SingleMatrix matG1(c_deviceIdZero);
matG1.AssignTruncateBottomOf(Matrix<float>::RandomUniform(dim2, dim3, c_deviceIdZero, -300.0f, 0.1f, IncrementCounter()), 0);
SingleMatrix matG1sparseCSC(matG1.DeepClone());
matG1sparseCSC.SwitchToMatrixType(MatrixType::SPARSE, matrixFormatSparseCSC, true);
SingleMatrix matG2 = SingleMatrix::RandomGaussian(dim1, dim3, c_deviceIdZero, -1.0f, 1.0f, IncrementCounter());
SingleMatrix::MultiplyAndWeightedAdd(1, matG2, false, matG1, true, 0, matG);
matGsparseBSC.SwitchToMatrixType(MatrixType::SPARSE, matrixFormatSparseBlockCol, false);
SingleMatrix::MultiplyAndAdd(matG2, false, matG1sparseCSC, true, matGsparseBSC);
}
};
namespace Microsoft { namespace MSR { namespace CNTK { namespace Test {
BOOST_AUTO_TEST_SUITE(MatrixLearnerSuite)
// tests FSAdagrad sparse vs. dense
BOOST_FIXTURE_TEST_CASE(FSAdagradSparse, MatrixLearnerFixture)
{
// run learner
double smoothedCount = 1000;
matSG.FSAdagradUpdate(dim2, matG, matM, smoothedCount, 0.0001, 1.0, 0.9, 0.9);
smoothedCount = 1000;
matSGsparse.FSAdagradUpdate(dim2, matGsparseBSC, matMsparse, smoothedCount, 0.0001, 1.0, 0.9, 0.9);
BOOST_CHECK(matSG.IsEqualTo(matSGsparse, c_epsilonFloatE5));
BOOST_CHECK(matM.IsEqualTo(matMsparse, c_epsilonFloatE5));
}
// tests RmsProp sparse vs. dense
BOOST_FIXTURE_TEST_CASE(RmsPropSparse, MatrixLearnerFixture)
{
// run learner
float avg = matSG.RmsProp(matG, 0.99f, 1.2f, 10.0f, 0.75f, 0.1f, true);
float avgSparse = matSGsparse.RmsProp(matGsparseBSC, 0.99f, 1.2f, 10.0f, 0.75f, 0.1f, true);
BOOST_CHECK(matSG.IsEqualTo(matSGsparse, c_epsilonFloatE4));
BOOST_CHECK(fabsf(avg - avgSparse) < c_epsilonFloatE5);
}
BOOST_AUTO_TEST_SUITE_END()
}}}}