[CNTK core] sparse GPU impl of FSAdagrad and RmsProp
Also fix a bug in dense GPU MultiplyAndWeightedAdd where garbage data could get in via resize
This commit is contained in:
Родитель
3063238829
Коммит
4505c48fd0
1
Makefile
1
Makefile
|
@ -1186,6 +1186,7 @@ UNITTEST_MATH_SRC = \
|
|||
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixQuantizerTests.cpp \
|
||||
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixSparseDenseInteractionsTests.cpp \
|
||||
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixTests.cpp \
|
||||
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixLearnerTests.cpp \
|
||||
$(SOURCEDIR)/../Tests/UnitTests/MathTests/stdafx.cpp \
|
||||
|
||||
UNITTEST_MATH_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_MATH_SRC))
|
||||
|
|
|
@ -3370,15 +3370,16 @@ void GPUMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const GPUMatrix
|
|||
int k = int(transposeA ? a.m_numRows : a.m_numCols);
|
||||
int l = int(transposeB ? b.m_numCols : b.m_numRows);
|
||||
|
||||
c.RequireSize(m, n);
|
||||
if (beta == 0)
|
||||
c.RequireSize(m, n);
|
||||
else
|
||||
c.VerifySize(m, n); // Can't resize if beta != 0
|
||||
|
||||
if (!(m > 0 && k > 0 && l > 0 && n > 0))
|
||||
RuntimeError("!(m>0 && k>0 && l>0 && n>0)"); // converting from size_t to int may cause overflow
|
||||
if (k != l)
|
||||
RuntimeError("matrix dim mismatch in MultiplyAndWeightedAdd");
|
||||
CUBLAS_CALL(cublas_gemm(cuHandle, transA, transB, m, n, k, &alpha, a.Data(), (int) a.m_numRows, b.Data(), (int) b.m_numRows, &beta, c.Data(), (int) c.m_numRows));
|
||||
c.m_numRows = m;
|
||||
c.m_numCols = n;
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
|
|
|
@ -41,6 +41,10 @@
|
|||
#define CUDA_LONG int32_t
|
||||
#endif
|
||||
|
||||
// special markers in BlockId2ColOrRow()/ColOrRow2BlockId()
|
||||
static const GPUSPARSE_INDEX_TYPE Id_NotAssigned = -1;
|
||||
static const GPUSPARSE_INDEX_TYPE Id_Pending = INT_MAX;
|
||||
|
||||
#define IDX2C(i, j, ld) (((j) * (ld)) + (i)) // 0 based indexing
|
||||
|
||||
// On older GPUs, CUDA atomicAdd() only exists for 'float'. This is the 'double' version.
|
||||
|
@ -1454,6 +1458,68 @@ __global__ void _fsadagrad(CUDA_LONG size, ElemType* grad, ElemType* smoothAda,
|
|||
}
|
||||
}
|
||||
|
||||
// Fetches element 'idx' (flat column-major index into a len x numCols view)
// from block-sparse-column storage. colOrRow2blockId maps each column to its
// compacted block id, or Id_NotAssigned when the column holds no values;
// elements of unassigned columns read as zero.
template<class ElemType>
inline __device__ ElemType _getvalue4BlockSparseCol(ElemType* v, const GPUSPARSE_INDEX_TYPE* colOrRow2blockId, const size_t len, CUDA_LONG idx)
{
    const CUDA_LONG col = idx / len;
    const CUDA_LONG row = idx - col * len;
    const CUDA_LONG blockId = colOrRow2blockId[col];
    return (blockId == Id_NotAssigned) ? ElemType(0) : v[blockId * len + row];
}
|
||||
|
||||
// Multiplies element 'idx' of a block-sparse-column matrix by s, in place.
// Elements in columns with no block (Id_NotAssigned) are implicitly zero,
// so scaling them is a no-op and they are left untouched.
template<class ElemType>
inline __device__ void _scalevalue4BlockSparseCol(ElemType* v, const GPUSPARSE_INDEX_TYPE* colOrRow2blockId, const size_t len, CUDA_LONG idx, ElemType s)
{
    const CUDA_LONG col = idx / len;
    const CUDA_LONG row = idx - col * len;
    const CUDA_LONG blockId = colOrRow2blockId[col];
    if (blockId == Id_NotAssigned)
        return;
    v[blockId * len + row] *= s;
}
|
||||
|
||||
// FSAdagrad update where the gradient is stored in block-sparse-column form.
// Each logical gradient element is read through _getvalue4BlockSparseCol
// (missing columns act as 0); the dense accumulators smoothAda/smoothMom and
// the model values 'val' are updated in place. Grid-stride loop, so any
// launch configuration covers all 'size' elements.
template <class ElemType>
__global__ void _fsadagrad4BlockSparseCol(CUDA_LONG size,
                                          ElemType* grad_bsc, const GPUSPARSE_INDEX_TYPE* colOrRow2blockId, const size_t len,
                                          ElemType* smoothAda, ElemType* smoothMom, ElemType* val,
                                          ElemType lr, ElemType mom, ElemType adaWeight, ElemType adaMul)
{
    const CUDA_LONG step = blockDim.x * gridDim.x;
    for (CUDA_LONG i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += step)
    {
        ElemType g = _getvalue4BlockSparseCol(grad_bsc, colOrRow2blockId, len, i);

        // exponentially-smoothed accumulation of the squared gradient
        const ElemType adaSqr = adaWeight * smoothAda[i] + (1.0f - adaWeight) * g * g;
        smoothAda[i] = adaSqr;

        if (adaSqr != 0.0f)
        {
            // adaptive scale adaMul / sqrt(adaSqr); the explicit branch keeps
            // the precision-matched reciprocal-sqrt intrinsic per ElemType
            ElemType w;
            if (sizeof(ElemType) == sizeof(double))
            {
                w = adaMul * rsqrt(adaSqr);
            }
            else
            {
                w = adaMul * rsqrtf(adaSqr);
            }

            // clip the scale so near-zero accumulators cannot blow up the step
            if (w > 10.0f)
                w = 10.0f;
            g *= w;
        }

        // momentum smoothing of the (already scaled) gradient
        if (mom > 0.0f)
        {
            g = mom * smoothMom[i] + (1.0f - mom) * g;
            smoothMom[i] = g;
        }

        // apply the learning-rate-scaled step
        val[i] -= g * lr;
    }
}
|
||||
|
||||
template <class ElemType>
|
||||
__global__ void _rmsprop_init(
|
||||
ElemType* avars, ElemType* signs, ElemType* steps,
|
||||
|
@ -1470,6 +1536,23 @@ __global__ void _rmsprop_init(
|
|||
steps[i] = ElemType(0.02);
|
||||
}
|
||||
|
||||
// Initializes the RmsProp state arrays from a block-sparse-column gradient:
// variance is seeded with the squared current gradient, the previous-sign
// array with 0, and the per-element step size with 0.02 (same constants as
// the dense _rmsprop_init kernel). One thread per element, bounds-guarded.
template <class ElemType>
__global__ void _rmsprop_init4BlockSparseCol(
    ElemType* avars, ElemType* signs, ElemType* steps,
    ElemType* curr_grad, const GPUSPARSE_INDEX_TYPE* colOrRow2blockId, const size_t len,
    const CUDA_LONG N)
{
    const CUDA_LONG i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= N)
        return;

    const ElemType g = _getvalue4BlockSparseCol(curr_grad, colOrRow2blockId, len, i);

    avars[i] = g * g;           // seed variance estimate
    signs[i] = ElemType(0.0);   // no previous gradient sign yet
    steps[i] = ElemType(0.02);  // initial step size
}
|
||||
|
||||
template <class ElemType>
|
||||
__global__ void _rmsprop(
|
||||
ElemType* avars, ElemType* signs, ElemType* steps,
|
||||
|
@ -1523,6 +1606,61 @@ __global__ void _rmsprop(
|
|||
multipliers[i] = temp;
|
||||
}
|
||||
|
||||
// RmsProp update where the gradient lives in a block-sparse-column matrix.
// Mirrors the dense _rmsprop kernel: maintains a running average of the
// squared gradient, adapts a per-element step size based on whether the
// gradient kept its sign, and rescales the sparse gradient in place.
// One thread per element, bounds-guarded by N.
// NOTE: an older base-3 sign-history scheme driven by the upd_gpu table is
// intentionally disabled (see the dense kernel's history); upd_gpu is kept
// in the signature for parity with the dense version.
template <class ElemType>
__global__ void _rmsprop4BlockSparseCol(
    ElemType* avars, ElemType* signs, ElemType* steps,
    ElemType* grad_bsc, const GPUSPARSE_INDEX_TYPE* colOrRow2blockId, const size_t len,
    const CUDA_LONG N,
    ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN,
    ElemType floor,
    ElemType* upd_gpu,
    ElemType* multipliers)
{
    const CUDA_LONG i = blockDim.x * blockIdx.x + threadIdx.x;
    if (i >= N)
        return;

    const ElemType g = _getvalue4BlockSparseCol(grad_bsc, colOrRow2blockId, len, i);

    // running average of the squared gradient
    avars[i] = RMS_GAMMA * avars[i] + (ElemType(1.0) - RMS_GAMMA) * (g * g);

    // -1, 0, or +1 depending on the sign of the current gradient
    const int grad_sign = (ElemType(0) < g) - (g < ElemType(0));

    // grow the step while the gradient keeps its sign, shrink it otherwise
    steps[i] = (signs[i] * grad_sign > 0)
                   ? min(steps[i] * RMS_WGT_INC, RMS_WGT_MAX)
                   : max(steps[i] * RMS_WGT_DEC, RMS_WGT_MIN);

    // scale the sparse gradient in place by step / RMS norm ('floor' avoids /0)
    const ElemType mult = steps[i] / sqrt(avars[i] + floor);
    _scalevalue4BlockSparseCol(grad_bsc, colOrRow2blockId, len, i, mult);
    signs[i] = ElemType(grad_sign);

    if (multipliers != nullptr)
        multipliers[i] = mult; // stashed so the host can compute the average multiplier
}
|
||||
|
||||
template <class ElemType>
|
||||
__global__ void _rescaleToRange(
|
||||
ElemType* a,
|
||||
|
@ -3009,10 +3147,6 @@ __global__ void _reshape(
|
|||
newColumnIndex[newNumCols] = oldColumnIndex[oldNumCols]; // set end pointer
|
||||
}
|
||||
|
||||
// special markers in BlockId2ColOrRow()/ColOrRow2BlockId()
|
||||
static const GPUSPARSE_INDEX_TYPE Id_NotAssigned = -1;
|
||||
static const GPUSPARSE_INDEX_TYPE Id_Pending = INT_MAX;
|
||||
|
||||
//called before _determineBlockIds and _denseMulSparseCSCTransposeToSparseBlockCol to determine which columns have values and
|
||||
//what's the mapping from the column id in the resulted SparseBlockCol format to the column id in the dense format
|
||||
//input: rowIndexes: the row indexes of the CSC sparse matrix to be multiplied with
|
||||
|
|
|
@ -1505,6 +1505,130 @@ ElemType GPUSparseMatrix<ElemType>::Adagrad(GPUMatrix<ElemType>& c, const bool n
|
|||
}
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
void GPUSparseMatrix<ElemType>::FSAdagrad(
|
||||
GPUMatrix<ElemType>& c,
|
||||
GPUMatrix<ElemType>& functionValues,
|
||||
ElemType learnRatePerSample,
|
||||
ElemType momentum,
|
||||
ElemType adaWeight,
|
||||
ElemType adaMul)
|
||||
{
|
||||
if (GetFormat() != MatrixFormat::matrixFormatSparseBlockCol)
|
||||
{
|
||||
NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
size_t numColsNeeded = 2 * GetNumCols();
|
||||
|
||||
if (c.IsEmpty() || (c.GetNumCols() < numColsNeeded))
|
||||
{
|
||||
c.RequireSize(GetNumRows(), numColsNeeded);
|
||||
c.SetValue(0.0);
|
||||
}
|
||||
|
||||
assert((c.GetNumRows() == GetNumRows()) && (c.GetNumCols() == numColsNeeded));
|
||||
|
||||
size_t n = GetNumElements();
|
||||
int blocksPerGrid = (n + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock;
|
||||
_fsadagrad4BlockSparseCol<ElemType> << <blocksPerGrid, GridDim::maxThreadsPerBlock >> >(
|
||||
n, Data(), ColOrRow2BlockId(), GetNumRows(),
|
||||
c.Data(), c.Data() + n, functionValues.Data(),
|
||||
learnRatePerSample, momentum, adaWeight, adaMul);
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
ElemType GPUSparseMatrix<ElemType>::RmsProp(GPUMatrix<ElemType>& c,
|
||||
ElemType RMS_GAMMA,
|
||||
ElemType RMS_WGT_INC,
|
||||
ElemType RMS_WGT_MAX,
|
||||
ElemType RMS_WGT_DEC,
|
||||
ElemType RMS_WGT_MIN,
|
||||
const bool needAveMultiplier)
|
||||
{
|
||||
if (GetFormat() != MatrixFormat::matrixFormatSparseBlockCol)
|
||||
{
|
||||
NOT_IMPLEMENTED;
|
||||
}
|
||||
|
||||
const ElemType floor = 1e-6f;
|
||||
static ElemType* upd_gpu = (ElemType*)0;
|
||||
|
||||
size_t n = GetNumElements();
|
||||
int blocksPerGrid = (c.GetNumElements() + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock;
|
||||
|
||||
size_t numColsNeeded = GetNumCols() * 3;
|
||||
if (needAveMultiplier)
|
||||
numColsNeeded += GetNumCols();
|
||||
|
||||
if (c.IsEmpty() || c.GetNumCols() < numColsNeeded)
|
||||
{
|
||||
c.RequireSize(GetNumRows(), numColsNeeded);
|
||||
c.SetValue(0.0);
|
||||
|
||||
ElemType* avars = c.Data(); // accumulated variances for RMS scaling
|
||||
ElemType* signs = c.Data() + n; // sign of previous gradient
|
||||
ElemType* steps = c.Data() + 2 * n; // current step size
|
||||
// Data()+3*n is temp memory used to store multipliers, no need to initialize
|
||||
|
||||
_rmsprop_init4BlockSparseCol<ElemType> << <blocksPerGrid, GridDim::maxThreadsPerBlock >> >(
|
||||
avars, signs, steps,
|
||||
Data(), ColOrRow2BlockId(), GetNumRows(),
|
||||
n);
|
||||
}
|
||||
assert(c.GetNumRows() == GetNumRows() && c.GetNumCols() == numColsNeeded);
|
||||
|
||||
ElemType* avars = c.Data(); // accumulated variances for RMS scaling
|
||||
ElemType* signs = c.Data() + n; // sign of previous gradient
|
||||
ElemType* steps = c.Data() + 2 * n; // current step size
|
||||
|
||||
ElemType* multipliers = nullptr;
|
||||
if (needAveMultiplier)
|
||||
multipliers = c.Data() + 3 * n; // temp memory used to store multipliers,
|
||||
|
||||
if (!upd_gpu)
|
||||
{
|
||||
const ElemType upd[] = {
|
||||
2, 2, 0,
|
||||
2, 2, 0,
|
||||
1, 1, 1,
|
||||
2, 2, 0,
|
||||
1, 2, 1,
|
||||
0, 2, 2,
|
||||
1, 1, 1,
|
||||
0, 2, 2,
|
||||
0, 2, 2,
|
||||
};
|
||||
|
||||
upd_gpu = TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), 27);
|
||||
CUDA_CALL(cudaMemcpy(upd_gpu, upd, sizeof(ElemType) * _countof(upd), cudaMemcpyHostToDevice));
|
||||
}
|
||||
|
||||
_rmsprop4BlockSparseCol<ElemType> << <blocksPerGrid, GridDim::maxThreadsPerBlock >> >(
|
||||
avars, signs, steps,
|
||||
Data(), ColOrRow2BlockId(), GetNumRows(),
|
||||
n,
|
||||
RMS_GAMMA, RMS_WGT_INC, RMS_WGT_MAX, RMS_WGT_DEC, RMS_WGT_MIN,
|
||||
floor, upd_gpu, multipliers);
|
||||
|
||||
if (!needAveMultiplier)
|
||||
return 1;
|
||||
|
||||
cublasHandle_t cuHandle = GPUMatrix<ElemType>::GetCublasHandle(GetComputeDeviceId());
|
||||
if (sizeof(ElemType) == sizeof(float))
|
||||
{
|
||||
float aveMultiplier = 0;
|
||||
CUBLAS_CALL(cublasSasum(cuHandle, (CUDA_LONG)n, reinterpret_cast<float*>(multipliers), 1, &aveMultiplier));
|
||||
return aveMultiplier / n;
|
||||
}
|
||||
else
|
||||
{
|
||||
double aveMultiplier = 0;
|
||||
CUBLAS_CALL(cublasDasum(cuHandle, (CUDA_LONG)n, reinterpret_cast<double*>(multipliers), 1, &aveMultiplier));
|
||||
return (ElemType)aveMultiplier / n;
|
||||
}
|
||||
}
|
||||
|
||||
// sparse X dense = dense
|
||||
template <class ElemType>
|
||||
void GPUSparseMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const GPUSparseMatrix<ElemType>& a, const bool transposeA,
|
||||
|
|
|
@ -410,6 +410,8 @@ public:
|
|||
|
||||
void NormalGrad(GPUMatrix<ElemType>& c, const ElemType momentum);
|
||||
ElemType Adagrad(GPUMatrix<ElemType>& c, const bool needAveMultiplier);
|
||||
void FSAdagrad(GPUMatrix<ElemType>& c, GPUMatrix<ElemType>& functionValues, ElemType learnRatePerSample, ElemType momentum, ElemType adaWeight, ElemType adaMul);
|
||||
ElemType RmsProp(GPUMatrix<ElemType>& c, ElemType RMS_GAMMA, ElemType RMS_WGT_INC, ElemType RMS_WGT_MAX, ElemType RMS_WGT_DEC, ElemType RMS_WGT_MIN, const bool needAveMultiplier);
|
||||
|
||||
static void Multiply(const GPUSparseMatrix<ElemType>& S, const GPUMatrix<ElemType>& D, GPUMatrix<ElemType>& C);
|
||||
static void Multiply(const GPUMatrix<ElemType>& D, const GPUSparseMatrix<ElemType>& S, GPUMatrix<ElemType>& C);
|
||||
|
|
|
@ -1585,11 +1585,13 @@ void Matrix<ElemType>::FSAdagradUpdate(size_t mbSize,
|
|||
// - makes up for general scaling (targetAdagradAvDenom, a constant chosen by the user that should resemble the typical value range of gradients)
|
||||
// - sqrt(1/#samples accumulated) to turn the sqr sum into an average
|
||||
let targetAdagradAvDenom_x_sqrtAdagradSqrFrames = (ElemType)(targetAdagradAvDenom * sqrt(smoothedCount));
|
||||
|
||||
DISPATCH_MATRIX_ON_FLAG(&gradients, &gradients,
|
||||
{ m_CPUMatrix->FSAdagrad(*gradients.m_CPUMatrix, *functionValues.m_CPUMatrix, (ElemType)learnRatePerSample, (ElemType)meanMomentum, (ElemType)varMomentum, targetAdagradAvDenom_x_sqrtAdagradSqrFrames); SetDataLocation(CPU); },
|
||||
{ m_GPUMatrix->FSAdagrad(*gradients.m_GPUMatrix, *functionValues.m_GPUMatrix, (ElemType)learnRatePerSample, (ElemType)meanMomentum, (ElemType)varMomentum, targetAdagradAvDenom_x_sqrtAdagradSqrFrames); SetDataLocation(GPU); },
|
||||
{ NOT_IMPLEMENTED; },
|
||||
{ NOT_IMPLEMENTED; });
|
||||
{ gradients.m_GPUSparseMatrix->FSAdagrad(*m_GPUMatrix, *functionValues.m_GPUMatrix, (ElemType)learnRatePerSample, (ElemType)meanMomentum, (ElemType)varMomentum, targetAdagradAvDenom_x_sqrtAdagradSqrFrames); SetDataLocation(GPU); });
|
||||
|
||||
// Note: Since both 'this' and gradients are changed, we must call SetDataLocation() on 'this' as well.
|
||||
}
|
||||
|
||||
|
@ -1604,11 +1606,11 @@ ElemType Matrix<ElemType>::RmsProp(Matrix<ElemType>& gradients,
|
|||
{
|
||||
DecideAndMoveToRightDevice(*this, gradients);
|
||||
|
||||
DISPATCH_MATRIX_ON_FLAG(this, &gradients,
|
||||
DISPATCH_MATRIX_ON_FLAG(&gradients, &gradients,
|
||||
{ return m_CPUMatrix->RmsProp(*gradients.m_CPUMatrix, RMS_GAMMA, RMS_WGT_INC, RMS_WGT_MAX, RMS_WGT_DEC, RMS_WGT_MIN, needAveMultiplier); SetDataLocation(CPU); },
|
||||
{ return m_GPUMatrix->RmsProp(*gradients.m_GPUMatrix, RMS_GAMMA, RMS_WGT_INC, RMS_WGT_MAX, RMS_WGT_DEC, RMS_WGT_MIN, needAveMultiplier); SetDataLocation(GPU); },
|
||||
{ NOT_IMPLEMENTED; },
|
||||
{ NOT_IMPLEMENTED; });
|
||||
{ return gradients.m_GPUSparseMatrix->RmsProp(*m_GPUMatrix, RMS_GAMMA, RMS_WGT_INC, RMS_WGT_MAX, RMS_WGT_DEC, RMS_WGT_MIN, needAveMultiplier); SetDataLocation(GPU); });
|
||||
// Note: Since both 'this' and gradients are changed, we must call SetDataLocation() on 'this' as well.
|
||||
}
|
||||
|
||||
|
|
|
@ -255,8 +255,17 @@ ElemType GPUSparseMatrix<ElemType>::Adagrad(GPUMatrix<ElemType>& c, const bool n
|
|||
{
|
||||
return 1;
|
||||
}
|
||||
//template<class ElemType>
|
||||
//void GPUSparseMatrix<ElemType>::FSAdagrad(CPUMatrix<ElemType>& gradients, CPUMatrix<ElemType>&, ElemType, ElemType, ElemType, ElemType) { }
|
||||
|
||||
// CPU-only build stub: sparse FSAdagrad requires CUDA, so this is a no-op.
template<class ElemType>
void GPUSparseMatrix<ElemType>::FSAdagrad(GPUMatrix<ElemType>&, GPUMatrix<ElemType>&, ElemType, ElemType, ElemType, ElemType)
{
}
|
||||
|
||||
// CPU-only build stub: sparse RmsProp requires CUDA; returns 1, the neutral
// average multiplier, so callers dividing the learning rate by it are unaffected.
template<class ElemType>
ElemType GPUSparseMatrix<ElemType>::RmsProp(GPUMatrix<ElemType>&, ElemType, ElemType, ElemType, ElemType, ElemType, const bool)
{
    return 1;
}
|
||||
|
||||
template <class ElemType>
|
||||
void GPUSparseMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const GPUSparseMatrix<ElemType>& a, const bool transposeA,
|
||||
|
@ -1057,6 +1066,7 @@ ElemType GPUMatrix<ElemType>::Adagrad(GPUMatrix<ElemType>& gradients, const bool
|
|||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
void GPUMatrix<ElemType>::FSAdagrad(GPUMatrix<ElemType>& gradients, GPUMatrix<ElemType>& functionValues, ElemType learnRatePerSample, ElemType momentum, ElemType adaWeight, ElemType adaMul)
|
||||
{
|
||||
|
|
|
@ -2197,12 +2197,8 @@ void SGD<ElemType>::UpdateWeights(Matrix<ElemType>& functionValues, Matrix<ElemT
|
|||
smoothedGradient.NormalGrad(gradientValues, functionValues,
|
||||
(ElemType) learnRatePerSample, (ElemType) momentum, useNesterovMomentum);
|
||||
}
|
||||
else if (adpType == GradientsUpdateType::AdaGrad ||
|
||||
(adpType == GradientsUpdateType::RmsProp && gradientValues.GetMatrixType() == MatrixType::SPARSE) ||
|
||||
(adpType == GradientsUpdateType::FSAdaGrad && gradientValues.GetMatrixType() == MatrixType::SPARSE))
|
||||
else if (adpType == GradientsUpdateType::AdaGrad)
|
||||
{
|
||||
// rmsprop for sparse is not implemented yet, delegate it with adagrad
|
||||
|
||||
double aveMultiplier = smoothedGradient.Adagrad(gradientValues, needAveMultiplier);
|
||||
Matrix<ElemType>::ScaleAndAdd((ElemType)(-learnRatePerSample / aveMultiplier), gradientValues, functionValues);
|
||||
}
|
||||
|
|
|
@ -6318,6 +6318,16 @@ Test module "MathTests" has passed with:
|
|||
Test case "GPUMatrixSuite/MatrixSparseElementWisePower" has passed with:
|
||||
1 assertion out of 1 passed
|
||||
|
||||
Test suite "MatrixLearnerSuite" has passed with:
|
||||
2 test case out of 2 passed
|
||||
4 assertions out of 4 passed
|
||||
|
||||
Test case "MatrixLearnerSuite/FSAdagradSparse" has passed with:
|
||||
2 assertions out of 2 passed
|
||||
|
||||
Test case "MatrixLearnerSuite/RmsPropSparse" has passed with:
|
||||
2 assertions out of 2 passed
|
||||
|
||||
Test suite "MatrixUnitTests" has passed with:
|
||||
26 test cases out of 26 passed
|
||||
4048755 assertions out of 4048755 passed
|
||||
|
|
|
@ -138,6 +138,7 @@
|
|||
<ClCompile Include="GPUMatrixCudaBlasTests.cpp" />
|
||||
<ClCompile Include="GPUMatrixTests.cpp" />
|
||||
<ClCompile Include="GPUSparseMatrixTests.cpp" />
|
||||
<ClCompile Include="MatrixLearnerTests.cpp" />
|
||||
<ClCompile Include="MatrixBlasTests.cpp" />
|
||||
<ClCompile Include="MatrixDataSynchronizationTests.cpp" />
|
||||
<ClCompile Include="MatrixFileWriteReadTests.cpp" />
|
||||
|
|
|
@ -0,0 +1,91 @@
|
|||
//
|
||||
// Copyright (c) Microsoft. All rights reserved.
|
||||
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
|
||||
//
|
||||
#include "stdafx.h"
|
||||
#include <math.h>
|
||||
#ifdef _WIN32
|
||||
#include <crtdefs.h>
|
||||
#endif
|
||||
#include "../../../Source/Math/Matrix.h"
|
||||
#include "../../../Source/Math/CPUMatrix.h"
|
||||
|
||||
using namespace Microsoft::MSR::CNTK;
|
||||
|
||||
class MatrixLearnerFixture : public RandomSeedFixture
|
||||
{
|
||||
public:
|
||||
static const size_t dim1 = 256;
|
||||
static const size_t dim2 = 128;
|
||||
static const size_t dim3 = 2048;
|
||||
|
||||
SingleMatrix matSG;
|
||||
SingleMatrix matSGsparse;
|
||||
SingleMatrix matM;
|
||||
SingleMatrix matMsparse;
|
||||
SingleMatrix matG;
|
||||
SingleMatrix matGsparseBSC;
|
||||
|
||||
MatrixLearnerFixture() :
|
||||
matSG(c_deviceIdZero),
|
||||
matSGsparse(c_deviceIdZero),
|
||||
matM(c_deviceIdZero),
|
||||
matMsparse(c_deviceIdZero),
|
||||
matG(c_deviceIdZero),
|
||||
matGsparseBSC(c_deviceIdZero)
|
||||
{
|
||||
// smoothed gradient
|
||||
matSG = SingleMatrix::RandomGaussian(dim1, dim2, c_deviceIdZero, -1.0f, 1.0f, IncrementCounter());
|
||||
matSGsparse = SingleMatrix(matSG.DeepClone());
|
||||
|
||||
// model
|
||||
matM = SingleMatrix::RandomGaussian(dim1, dim2, c_deviceIdZero, -1.0f, 1.0f, IncrementCounter());
|
||||
matMsparse = SingleMatrix(matM.DeepClone());
|
||||
|
||||
// generates gradient
|
||||
SingleMatrix matG1(c_deviceIdZero);
|
||||
matG1.AssignTruncateBottomOf(Matrix<float>::RandomUniform(dim2, dim3, c_deviceIdZero, -300.0f, 0.1f, IncrementCounter()), 0);
|
||||
|
||||
SingleMatrix matG1sparseCSC(matG1.DeepClone());
|
||||
matG1sparseCSC.SwitchToMatrixType(MatrixType::SPARSE, matrixFormatSparseCSC, true);
|
||||
|
||||
SingleMatrix matG2 = SingleMatrix::RandomGaussian(dim1, dim3, c_deviceIdZero, -1.0f, 1.0f, IncrementCounter());
|
||||
|
||||
SingleMatrix::MultiplyAndWeightedAdd(1, matG2, false, matG1, true, 0, matG);
|
||||
|
||||
matGsparseBSC.SwitchToMatrixType(MatrixType::SPARSE, matrixFormatSparseBlockCol, false);
|
||||
SingleMatrix::MultiplyAndAdd(matG2, false, matG1sparseCSC, true, matGsparseBSC);
|
||||
}
|
||||
};
|
||||
|
||||
namespace Microsoft { namespace MSR { namespace CNTK { namespace Test {
|
||||
|
||||
BOOST_AUTO_TEST_SUITE(MatrixLearnerSuite)
|
||||
|
||||
// tests FSAdagrad sparse vs. dense
|
||||
BOOST_FIXTURE_TEST_CASE(FSAdagradSparse, MatrixLearnerFixture)
|
||||
{
|
||||
// run learner
|
||||
double smoothedCount = 1000;
|
||||
matSG.FSAdagradUpdate(dim2, matG, matM, smoothedCount, 0.0001, 1.0, 0.9, 0.9);
|
||||
|
||||
smoothedCount = 1000;
|
||||
matSGsparse.FSAdagradUpdate(dim2, matGsparseBSC, matMsparse, smoothedCount, 0.0001, 1.0, 0.9, 0.9);
|
||||
|
||||
BOOST_CHECK(matSG.IsEqualTo(matSGsparse, c_epsilonFloatE5));
|
||||
BOOST_CHECK(matM.IsEqualTo(matMsparse, c_epsilonFloatE5));
|
||||
}
|
||||
|
||||
// tests RmsProp sparse vs. dense
|
||||
BOOST_FIXTURE_TEST_CASE(RmsPropSparse, MatrixLearnerFixture)
|
||||
{
|
||||
// run learner
|
||||
float avg = matSG.RmsProp(matG, 0.99f, 1.2f, 10.0f, 0.75f, 0.1f, true);
|
||||
float avgSparse = matSGsparse.RmsProp(matGsparseBSC, 0.99f, 1.2f, 10.0f, 0.75f, 0.1f, true);
|
||||
|
||||
BOOST_CHECK(matSG.IsEqualTo(matSGsparse, c_epsilonFloatE4));
|
||||
BOOST_CHECK(fabsf(avg - avgSparse) < c_epsilonFloatE5);
|
||||
}
|
||||
|
||||
BOOST_AUTO_TEST_SUITE_END()
|
||||
}}}}
|
Загрузка…
Ссылка в новой задаче