From 7ad6f69f5e644b65d8fee426e5657eeb693a166a Mon Sep 17 00:00:00 2001
From: Qiwei Ye
Date: Wed, 1 Jun 2016 15:44:47 +0800
Subject: [PATCH] adding debug info

---
 Source/SGDLib/SGD.cpp | 34 +++++++++++++++++++++++++++++++---
 1 file changed, 31 insertions(+), 3 deletions(-)

diff --git a/Source/SGDLib/SGD.cpp b/Source/SGDLib/SGD.cpp
index f8e5f7192..5fd484d55 100644
--- a/Source/SGDLib/SGD.cpp
+++ b/Source/SGDLib/SGD.cpp
@@ -20,6 +20,7 @@
 
 #include
 #include
+#include <algorithm>
 
 namespace Microsoft { namespace MSR { namespace CNTK {
 
@@ -1950,6 +1951,8 @@ template <class ElemType>
     Matrix<ElemType> sgdUpdateNoise((DEVICEID_TYPE) functionValues.GetDeviceId());
     if (noiseStd > 0)
     {
+        fprintf(stderr, "noisestd \n");
+        fflush(stderr);
         // get the gradient structure since gradient is sparse
         sgdUpdateNoise.SetValue(gradientValues);
 
@@ -1960,31 +1963,53 @@ template <class ElemType>
 
     // L2 regularizer
     if (L2RegWeight > 0)
-    {
+        fprintf(stderr, "l2 \n");
+        fflush(stderr);
+    {
         // multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample
         Matrix<ElemType>::ScaleAndAdd((ElemType)(L2RegWeight * actualMBSize), functionValues, gradientValues);
     }
 
     if (adpType == GradientsUpdateType::None)
     {
+        ElemType* debugParametersBefore = nullptr;
+        if (gradientValues.GetMatrixType() == MatrixType::SPARSE)
+        {
+            debugParametersBefore = functionValues.CopyToArray();
+        }
         smoothedGradient.NormalGrad(gradientValues, functionValues,
                                     (ElemType) learnRatePerSample, (ElemType) momentum, useNesterovMomentum);
+
+        if (gradientValues.GetMatrixType() == MatrixType::SPARSE)
+        {
+            ElemType* debugParametersAfter = functionValues.CopyToArray();
+            size_t sizeofElement = functionValues.GetNumElements();
+            std::transform(debugParametersBefore, debugParametersBefore + sizeofElement, debugParametersAfter, debugParametersAfter, std::minus<ElemType>());
+            size_t zeroNum = std::count(debugParametersAfter, debugParametersAfter + sizeofElement, 0.0f);
+            fprintf(stderr, "function value after update : %d zero number \n", (int)zeroNum);
+            fflush(stderr);
+        }
     }
     else if (adpType == GradientsUpdateType::AdaGrad ||
              (adpType == GradientsUpdateType::RmsProp && gradientValues.GetMatrixType() == MatrixType::SPARSE) ||
              (adpType == GradientsUpdateType::FSAdaGrad && gradientValues.GetMatrixType() == MatrixType::SPARSE))
     {
         // rmsprop for sparse is not implemented yet, delegate it with adagrad
-
+        fprintf(stderr, "adagrad \n");
+        fflush(stderr);
         double aveMultiplier = smoothedGradient.Adagrad(gradientValues, needAveMultiplier);
         Matrix<ElemType>::ScaleAndAdd((ElemType)(-learnRatePerSample / aveMultiplier), gradientValues, functionValues);
     }
     else if (adpType == GradientsUpdateType::FSAdaGrad)
     {
+        fprintf(stderr, "fsadagrad \n");
+        fflush(stderr);
         smoothedGradient.FSAdagrad(actualMBSize, gradientValues, functionValues, (ElemType) learnRatePerSample, (ElemType) momentum);
     }
     else if (adpType == GradientsUpdateType::RmsProp)
     {
+        fprintf(stderr, "rmsprop \n");
+        fflush(stderr);
         double aveMultiplier = smoothedGradient.RmsProp(gradientValues, (ElemType) sgd->m_rpi.gamma,
                                                         (ElemType) sgd->m_rpi.inc, (ElemType) sgd->m_rpi.max,
                                                         (ElemType) sgd->m_rpi.dec, (ElemType) sgd->m_rpi.min, needAveMultiplier);
@@ -1993,12 +2018,15 @@ template <class ElemType>
 
     if (noiseStd > 0)
     {
+        fprintf(stderr, "noisestd\n");
+        fflush(stderr);
         Matrix<ElemType>::ScaleAndAdd(1.0, sgdUpdateNoise, functionValues);
     }
 
     // L1 regularizer with proximal gradient descent method
     if (L1RegWeight > 0)
-    {
+    {   fprintf(stderr, "adagrad \n");
+        fflush(stderr);
         // multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample
         functionValues.InplaceSoftThreshold((ElemType)(learnRatePerSample * L1RegWeight * actualMBSize));
     }
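
The heart of the patch is the before/after snapshot taken around the sparse NormalGrad update: copy the parameters, apply the update, subtract the two copies element by element, and count how many entries are exactly zero, i.e. were never written. Below is a minimal, self-contained sketch of that same check outside CNTK, assuming plain std::vector buffers; the names countUnchanged, parameters, and before are illustrative only and are not part of the CNTK API.

#include <algorithm>
#include <cstdio>
#include <functional>
#include <vector>

// Count how many parameters an update step left untouched.
// Mirrors the debug check in the patch: snapshot the parameters before the
// update, subtract the snapshot from the post-update values, and count the
// zeros in the difference.
static size_t countUnchanged(const std::vector<float>& before,
                             const std::vector<float>& after)
{
    std::vector<float> diff(after.size());
    // diff[i] = after[i] - before[i]
    std::transform(after.begin(), after.end(), before.begin(),
                   diff.begin(), std::minus<float>());
    return static_cast<size_t>(std::count(diff.begin(), diff.end(), 0.0f));
}

int main()
{
    // Hypothetical parameter vector and a "sparse" update that only touches
    // two entries (stand-ins for functionValues and the sparse gradient).
    std::vector<float> parameters = {0.5f, -1.0f, 0.25f, 2.0f, 0.0f};
    std::vector<float> before = parameters;   // snapshot before the update
    parameters[1] += 0.1f;                    // update touches entry 1
    parameters[3] -= 0.2f;                    // update touches entry 3

    size_t zeroNum = countUnchanged(before, parameters);
    fprintf(stderr, "function value after update : %d zero number \n",
            static_cast<int>(zeroNum));
    fflush(stderr);
    return 0;
}

Counting exact floating-point zeros in the difference only flags entries the update never wrote (or changed by exactly zero), which is what a sanity check on a sparse update is after: most of the count should come from rows the sparse gradient did not touch.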