From e1a9cabbde633a42945963be09d94e132a8405a2 Mon Sep 17 00:00:00 2001 From: Mark Hillebrand Date: Tue, 23 Aug 2016 17:57:00 +0200 Subject: [PATCH] Address CR comments --- Examples/Image/MNIST/Config/Macros.ndl | 8 +- Examples/Image/MNIST/Config/Shared.bs | 22 ++-- .../Image/Miscellaneous/CIFAR-10/Macros.ndl | 16 +-- .../Miscellaneous/ImageNet/ResNet/Macros.ndl | 4 +- .../Miscellaneous/ImageNet/VGG/Macros.ndl | 8 +- Source/ComputationNetworkLib/TrainingNodes.h | 104 ++++++++++-------- Source/Math/BatchNormalizationEngine.cpp | 52 +++++---- Source/Math/BatchNormalizationEngine.h | 4 +- Source/Math/CPUMatrix.cpp | 9 +- Source/Math/CPUMatrix.h | 6 +- Source/Math/CntkBatchNormalization.cuh | 44 ++++---- Source/Math/CuDnnBatchNormalization.cu | 28 ++--- Source/Math/CuDnnCommon.h | 2 +- Source/Math/GPUMatrix.cu | 59 +++++----- Source/Math/GPUMatrix.h | 4 +- Source/Math/MathCUDA.vcxproj | 2 +- Source/Math/Matrix.cpp | 12 +- Source/Math/Matrix.h | 4 +- Source/Math/NoGPU.cpp | 2 +- .../NonSpatial/01_OneHidden.cntk | 12 +- .../NonSpatial/01_OneHidden.ndl | 4 +- .../Spatial/02_BatchNormConv.cntk | 4 + .../Spatial/02_BatchNormConv.ndl | 7 +- .../Spatial/CNTK/testcases.yml | 8 +- .../Spatial/CuDNN/testcases.yml | 8 +- .../BatchNormalization/Spatial/Macros.ndl | 16 +-- .../CIFAR-10/02_BatchNormConv/testcases.yml | 2 +- .../BatchNormalizationEngineTests.cpp | 5 +- Tests/UnitTests/V2LibraryTests/Image.h | 4 +- 29 files changed, 242 insertions(+), 218 deletions(-) diff --git a/Examples/Image/MNIST/Config/Macros.ndl b/Examples/Image/MNIST/Config/Macros.ndl index cfa1e2baa..3786f979a 100644 --- a/Examples/Image/MNIST/Config/Macros.ndl +++ b/Examples/Image/MNIST/Config/Macros.ndl @@ -26,9 +26,9 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst) = [ b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue) sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue) m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) - var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + v = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) t = Times(W, x) - bn = BatchNormalization(t, sc, b, m, var, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst) + bn = BatchNormalization(t, sc, b, m, v, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst) y = RectifiedLinear(bn) ] @@ -72,10 +72,10 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeCo b = LearnableParameter(outMap, 1, init=fixedValue, value=bValue) sc = LearnableParameter(outMap, 1, init=fixedValue, value=scValue) m = LearnableParameter(outMap, 1, init=fixedValue, value=0, learningRateMultiplier=0) - var = LearnableParameter(outMap, 1, init=fixedValue, value=0, learningRateMultiplier=0) + v = LearnableParameter(outMap, 1, init=fixedValue, value=0, learningRateMultiplier=0) c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding=true, imageLayout=$imageLayout$) - y = BatchNormalization(c, sc, b, m, var, eval=false, spatial=true, normalizationTimeConstant=bnTimeConst, imageLayout=$imageLayout$) + y = BatchNormalization(c, sc, b, m, v, eval=false, spatial=true, normalizationTimeConstant=bnTimeConst, imageLayout=$imageLayout$) ] ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst) = [ diff --git a/Examples/Image/MNIST/Config/Shared.bs b/Examples/Image/MNIST/Config/Shared.bs index 
2354465df..ff05527c1 100644 --- a/Examples/Image/MNIST/Config/Shared.bs +++ b/Examples/Image/MNIST/Config/Shared.bs @@ -22,13 +22,13 @@ DNNImageSigmoidLayer (inW, inH, inC, outDim, x, parmScale) = [ # ReLU layer with batch normalization # TODO: rename to DNN- DnnBNReLULayer (inDim, outDim, x, wScale, bValue, scValue, bnTimeConst) = [ - W = Parameter (outDim, inDim, init = "gaussian", initValueScale = wScale, initOnCPUOnly=true) - b = Parameter (outDim, 1, init = "fixedValue", value = bValue) - sc = Parameter (outDim, 1, init = "fixedValue", value = scValue) - m = Parameter (outDim, 1, init = "fixedValue", value = 0, learningRateMultiplier = 0) - var = Parameter (outDim, 1, init = "fixedValue", value = 0, learningRateMultiplier = 0) + W = Parameter (outDim, inDim, init = "gaussian", initValueScale = wScale, initOnCPUOnly=true) + b = Parameter (outDim, 1, init = "fixedValue", value = bValue) + sc = Parameter (outDim, 1, init = "fixedValue", value = scValue) + m = Parameter (outDim, 1, init = "fixedValue", value = 0, learningRateMultiplier = 0) + v = Parameter (outDim, 1, init = "fixedValue", value = 0, learningRateMultiplier = 0) t = Times(W, x) # TODO: W * x - bn = BatchNormalization(t, sc, b, m, var, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst) + bn = BatchNormalization(t, sc, b, m, v, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst) y = RectifiedLinear(bn) ].y @@ -58,13 +58,13 @@ ConvNDReLULayer (inp, kW, kH, inMap, inWCount, outMap, hStride, vStride, wScale, ].out ConvBNLayerW (W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst) = [ # TODO: delete if not needed - b = Parameter(outMap, 1, init="fixedValue", value=bValue) - sc = Parameter(outMap, 1, init="fixedValue", value=scValue) - m = Parameter(outMap, 1, init="fixedValue", value=0, learningRateMultiplier=0) - var = Parameter(outMap, 1, init="fixedValue", value=0, learningRateMultiplier=0) + b = Parameter(outMap, 1, init="fixedValue", value=bValue) + sc = Parameter(outMap, 1, init="fixedValue", value=scValue) + m = Parameter(outMap, 1, init="fixedValue", value=0, learningRateMultiplier=0) + v = Parameter(outMap, 1, init="fixedValue", value=0, learningRateMultiplier=0) c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding=true /* , imageLayout=$imageLayout$*/) - y = BatchNormalization(c, sc, b, m, var, eval=false, spatial=true, normalizationTimeConstant=bnTimeConst /* , imageLayout=$imageLayout$*/) + y = BatchNormalization(c, sc, b, m, v, eval=false, spatial=true, normalizationTimeConstant=bnTimeConst /* , imageLayout=$imageLayout$*/) ].y ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst) = [ diff --git a/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl b/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl index 552f51951..4fa20f065 100644 --- a/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl +++ b/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl @@ -21,10 +21,10 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeCo b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue) sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue) m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) - var = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + v = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) c = Convolution(W, inp, kW, kH, outMap, hStride, 
vStride, zeroPadding = true, imageLayout = $imageLayout$) - y = BatchNormalization(c, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$) + y = BatchNormalization(c, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$) ] ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst) @@ -44,10 +44,10 @@ ProjLayer(W, inp, outMap, hStride, vStride, bValue, scValue, bnTimeConst) b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue) sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue) m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) - var = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + v = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) c = Convolution(W, inp, 1, 1, outMap, hStride, vStride, zeroPadding = false, imageLayout = $imageLayout$) - y = BatchNormalization(c, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$) + y = BatchNormalization(c, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$) ] ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue, bnTimeConst) @@ -113,9 +113,9 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst) b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue) sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue) m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) - var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + v = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) t = Times(W, x) - bn = BatchNormalization(t, sc, b, m, var, spatial = false, normalizationTimeConstant = bnTimeConst) + bn = BatchNormalization(t, sc, b, m, v, spatial = false, normalizationTimeConstant = bnTimeConst) y = RectifiedLinear(bn) ] @@ -125,9 +125,9 @@ DnnImageBNReLULayer(inW, inH, inC, outDim, x, wScale, bValue, scValue, bnTimeCon b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue) sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue) m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) - var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + v = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) t = Times(W, x) - bn = BatchNormalization(t, sc, b, m, var, spatial = false, normalizationTimeConstant = bnTimeConst) + bn = BatchNormalization(t, sc, b, m, v, spatial = false, normalizationTimeConstant = bnTimeConst) y = RectifiedLinear(bn) ] diff --git a/Examples/Image/Miscellaneous/ImageNet/ResNet/Macros.ndl b/Examples/Image/Miscellaneous/ImageNet/ResNet/Macros.ndl index ca1585526..b546fcfaf 100644 --- a/Examples/Image/Miscellaneous/ImageNet/ResNet/Macros.ndl +++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/Macros.ndl @@ -8,9 +8,9 @@ BN(inp, mapCount, bValue, scValue, bnTimeConst) b = Parameter(mapCount, 1, init = fixedValue, value = bValue) sc = Parameter(mapCount, 1, init = fixedValue, value = scValue) m = Parameter(mapCount, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) - var = Parameter(mapCount, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + v = 
Parameter(mapCount, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) - y = BatchNormalization(inp, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, epsilon = 0.000000001, imageLayout = "cudnn") + y = BatchNormalization(inp, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, epsilon = 0.000000001, imageLayout = "cudnn") ] ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst) diff --git a/Examples/Image/Miscellaneous/ImageNet/VGG/Macros.ndl b/Examples/Image/Miscellaneous/ImageNet/VGG/Macros.ndl index 82d3b8582..949bf29e5 100644 --- a/Examples/Image/Miscellaneous/ImageNet/VGG/Macros.ndl +++ b/Examples/Image/Miscellaneous/ImageNet/VGG/Macros.ndl @@ -15,9 +15,9 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue) b = Parameter(outDim, 1, init = fixedValue, value = bValue) sc = Parameter(outDim, 1, init = Gaussian, initValueScale = 0.01) m = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) - var = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + v = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) t = Times(W, x) - bn = BatchNormalization(t, sc, b, m, var, spatial = false) + bn = BatchNormalization(t, sc, b, m, v, spatial = false) y = RectifiedLinear(bn) ] @@ -47,9 +47,9 @@ ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, b = Parameter(outMap, 1, init = fixedValue, value = bValue) sc = Parameter(outMap, 1, init = Gaussian, initValueScale = scValue) m = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) - var = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + v = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn") - bn = BatchNormalization(c, sc, b, m, var, spatial = true, imageLayout = "cudnn") + bn = BatchNormalization(c, sc, b, m, v, spatial = true, imageLayout = "cudnn") y = RectifiedLinear(bn); ] diff --git a/Source/ComputationNetworkLib/TrainingNodes.h b/Source/ComputationNetworkLib/TrainingNodes.h index ba0d28ae2..e4390ded0 100644 --- a/Source/ComputationNetworkLib/TrainingNodes.h +++ b/Source/ComputationNetworkLib/TrainingNodes.h @@ -1547,7 +1547,7 @@ template class DropoutNode; // // m = mean(input) // var = variance(input) -// input_norm = (input - mean) / sqrt(var) +// input_norm = (input - mean) / sqrt(epsilon + var) // output = gamma * input_norm + beta // // where gamma and beta are trainable parameters(represented as LearnableParameter). @@ -1570,7 +1570,7 @@ template class DropoutNode; // * blendTimeConstant is the time constant which allows to specify how much of running mean / var should be "blended" into mean / var of the current minibatch. // Value 0 (default) means no blending will happen and only the current minibatch statistics will be used. // Value 1#INF (infinity) means only running mean / var will be used(this is used, for example, in evaluation phase). -// * epsilon is a conditioner constant used in computing inverted standard deviation +// * epsilon is a conditioner constant used in computing inverse standard deviation // * useCntkEngine is a Boolean flag that specifies which batch normalization implementation to use: CNTK or cuDNN-based. // * imageLayout is the image layout. Only cudnn is supported at present. 
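// (For reference: a minimal, self-contained sketch of the computation documented above, written
// against plain std::vector rather than CNTK's Matrix type. Parameter names mirror the
// documentation (expAvgFactor, blendFactor, epsilon), but the function itself is an illustrative
// assumption, not the node's or the engine's actual code.)
#include <cmath>
#include <cstddef>
#include <vector>

// Normalizes one feature across a minibatch (assumed non-empty). Minibatch statistics are blended
// with the running statistics (weight = blendFactor), and the running estimates are updated with
// weight expAvgFactor (0 = no update, as in inference).
inline void BatchNormForwardSketch(const std::vector<double>& x, double gamma, double beta,
                                   double expAvgFactor, double blendFactor, double epsilon,
                                   double& runMean, double& runVariance,
                                   std::vector<double>& out)
{
    const double n = static_cast<double>(x.size());
    double mean = 0.0, var = 0.0;
    for (double v : x) mean += v;
    mean /= n;
    for (double v : x) var += (v - mean) * (v - mean);
    var /= n;

    // Interpolate minibatch statistics with the long-term running statistics.
    const double useMean = blendFactor * runMean     + (1.0 - blendFactor) * mean;
    const double useVar  = blendFactor * runVariance + (1.0 - blendFactor) * var;

    // Update the running estimates; the previous values keep weight (1 - expAvgFactor).
    runMean     = expAvgFactor * mean + (1.0 - expAvgFactor) * runMean;
    runVariance = expAvgFactor * var  + (1.0 - expAvgFactor) * runVariance;

    // input_norm = (input - mean) / sqrt(epsilon + var); output = gamma * input_norm + beta
    out.resize(x.size());
    const double invStdDev = 1.0 / std::sqrt(useVar + epsilon);
    for (std::size_t i = 0; i < x.size(); ++i)
        out[i] = gamma * (x[i] - useMean) * invStdDev + beta;
}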
// ----------------------------------------------------------------------- @@ -1584,14 +1584,14 @@ public: BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name) : Base(deviceId, name), m_spatial(false), m_normTimeConst(0), m_blendTimeConst(0), m_epsilon(0), m_useCntkEngine(true), m_samplesSeen(0), m_imageLayoutKind(ImageLayoutKind::CHW), - m_convertRunningVariance(false) + m_convertRunningVariancePending(false) { } BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name, bool spatial, double normalizationTimeConstant, double blendTimeConstant, double epsilon, bool useCntkEngine, ImageLayoutKind imageLayoutKind) : Base(deviceId, name), m_spatial(spatial), m_normTimeConst(normalizationTimeConstant), m_blendTimeConst(blendTimeConstant), m_epsilon(epsilon), m_useCntkEngine(useCntkEngine), m_imageLayoutKind(imageLayoutKind), m_samplesSeen(0), - m_convertRunningVariance(false) + m_convertRunningVariancePending(false) { } BatchNormalizationNode(const ScriptableObjects::IConfigRecordPtr configp) : @@ -1627,7 +1627,9 @@ public: fstream >> m_normTimeConst; fstream >> m_blendTimeConst; fstream >> m_imageLayoutKind; +#ifdef _DEBUG fprintf(stderr, "INFO: %ls: initialized samplesSeen from mbCount when loading pre-CuDNNv5 model\n", NodeName().c_str()); +#endif fstream >> mbCount; m_samplesSeen = mbCount; fstream >> m_epsilon; @@ -1666,7 +1668,9 @@ public: { size_t mbCount; fstream >> m_imageLayoutKind; +#ifdef _DEBUG fprintf(stderr, "INFO: %ls: initialized samplesSeen from mbCount when loading pre-CuDNNv5 model\n", NodeName().c_str()); +#endif fstream >> mbCount; m_samplesSeen = mbCount; } @@ -1681,8 +1685,9 @@ public: { // Prior to CNTK_MODEL_VERSION_12, running inverse standard // deviation was stored in Input 4. Now variance is used. - // We (approximately) convert it during validation later. - m_convertRunningVariance = true; + // We (approximately) convert it during validation later + // (and then clear the flag). + m_convertRunningVariancePending = true; } } @@ -1713,15 +1718,13 @@ private: // time-constant conversions // in inference mode, only use long-term mean and do not update running estimates if (!Environment().IsTraining()) { - assert(0 < m_samplesSeen); // something must have been trained + if (m_samplesSeen == 0) + RuntimeError("%ls: inference mode is used, but nothing has been trained.", NodeName().c_str()); return 0; // (m_normTimeConst == infinity) no new contribution from current minibatch } // Initialization case: only use current minibatch. - if (m_samplesSeen == 0) - { - return 1.0; - } + if (m_samplesSeen == 0) return 1.0; double numSamples = (double)GetMBLayout()->GetActualNumSamples(); @@ -1747,15 +1750,13 @@ private: // time-constant conversions // in inference mode, only use long-term mean and do not update running estimates if (!Environment().IsTraining()) { - assert(0 < m_samplesSeen); // something must have been trained - return 1.0; // (m_blendTimeConst == infinity) estimate is taken 100% from the long-term running estimate + if (m_samplesSeen == 0) + RuntimeError("%ls: inference mode is used, but nothing has been trained.", NodeName().c_str()); + return 1.0; // (m_blendTimeConst == infinity) estimate is taken 100% from the long-term running estimate } // Initialization case: only use current minibatch. - if (m_samplesSeen == 0) - { - return 0; - } + if (m_samplesSeen == 0) return 0; // convert to blend factor (= weight for running stats) // The code below special-cases two boundary cases, but those are just the limit cases of the main formula. 
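// (For reference: a rough sketch of how the two time constants could map to the per-minibatch
// factors computed above. The exponential and rational forms below are assumptions chosen only to
// illustrate the limit behavior (0, 1, infinity) discussed in the comments; they are not quoted
// from ComputeExpAvgFactor / ComputeBlendFactor.)
#include <cmath>

// Weight given to the current minibatch when updating the running statistics.
inline double ExpAvgFactorSketch(double normTimeConst, double numSamples)
{
    if (std::isinf(normTimeConst)) return 0.0; // infinite time constant: never update the running estimates
    if (normTimeConst <= 0)        return 1.0; // no smoothing: use only the current minibatch
    return 1.0 - std::exp(-numSamples / normTimeConst); // first-order low-pass weight (assumed form)
}

// Weight given to the running statistics when normalizing the current minibatch.
inline double BlendFactorSketch(double blendTimeConst, double numSamples)
{
    if (std::isinf(blendTimeConst)) return 1.0; // use only the long-term running estimate
    if (blendTimeConst <= 0)        return 0.0; // use only the current minibatch statistics
    return blendTimeConst / (blendTimeConst + numSamples); // assumed interpolation form
}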
@@ -1771,7 +1772,8 @@ public: virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override { - assert(!m_convertRunningVariance); + if (m_convertRunningVariancePending) + LogicError("%ls: running variance was not converted before forward prop", NodeName().c_str()); FrameRange fr(Input(0)->GetMBLayout()); Matrix sliceInputValue = Input(0)->ValueFor(fr); @@ -1792,12 +1794,16 @@ public: double expAvgFactor = ComputeExpAvgFactor(); // weight for the new MB statistics in the running estimate. The previous value of the running statistics is kept with weight (1-this) double blendFactor = ComputeBlendFactor(); // interpolation weight for the running statistics (the current MB statistics are weighted with 1-this) - m_bnEng->Forward(/*in=*/ sliceInputValue, scale, bias, // (in) - expAvgFactor, blendFactor, - runMean, runVariance, // (in/out) running estimates, updated from the current MB mean/variance - /*out=*/ sliceOutputValue, // (out) batch-normalized output value + // In inference-only mode, m_savedMean and m_savedInvStdDev will not be + // produced and BackpropToNonLooping() may not be called. In + // non-inference (training) mode, saved statistics must be produced. + bool inferenceOnly = !Environment().IsTraining(); + m_bnEng->Forward(/*in=*/ sliceInputValue, scale, bias, // (in) + inferenceOnly, expAvgFactor, blendFactor, + runMean, runVariance, // (in/out) running estimates, updated from the current MB mean/variance + /*out=*/ sliceOutputValue, // (out) batch-normalized output value m_epsilon, - *m_saveMean, *m_saveInvStdDev); // (out) actual interpolated mean/stddev values. Note: unused/empty for blendFactor==1 for CNTK engine + *m_savedMean, *m_savedInvStdDev); // (out) actual interpolated mean/stddev values. Note: unused/empty for blendFactor==1 for CNTK engine } // Note: This function assumes that inputIndex=0 is called before the others. @@ -1805,6 +1811,16 @@ public: // BUGBUG: If the input has no learnables (e.g. using BN instead of corpus mean/var norm), this will not be called for inputIndex=0 at all. virtual void BackpropToNonLooping(size_t inputIndex) override { + // Must be in training mode. + if (!Environment().IsTraining()) + LogicError("%ls: BackpropToNonLooping() cannot be called in inference mode", NodeName().c_str()); + // In non-inference mode, the batch normalization engine must provide + // saved statistics, m_savedMean and m_savedInvStdDev. + if (m_savedMean->IsEmpty()) + LogicError("%ls: m_savedMean cannot be empty", NodeName().c_str()); + if (m_savedInvStdDev->IsEmpty()) + LogicError("%ls: m_savedInvStdDev cannot be empty", NodeName().c_str()); + FrameRange fr(Input(0)->GetMBLayout()); if (inputIndex == 0) // derivative with respect to the input. @@ -1813,16 +1829,8 @@ public: auto sliceInputValue = Input(0)->ValueFor(fr); const Matrix& scale = Input(1)->Value(); const Matrix& bias = Input(2)->Value(); - const Matrix& runMean = Input(3)->Value(); - const Matrix& runVariance = Input(4)->Value(); auto sliceInputGrad = Input(0)->GradientFor(fr); - // The mean used in Forward() are either saveMean or runMean. - // This is decided by the engine, which communicates back the decision by returning - // an empty saveMean in case runMean should be used. Likewise for variance / inverted standard deviation. - let& actualMean = !m_saveMean->IsEmpty() ? 
*m_saveMean : runMean; // empty if only the running mean is used - if (m_saveInvStdDev->IsEmpty()) RuntimeError("TODO m_saveInvStdDev <-> runVariance not the same:"); - let& actualInvStdDev = !m_saveInvStdDev->IsEmpty() ? *m_saveInvStdDev : runVariance; m_dScale->Resize(scale); // gradients for scale and bias get stored here m_dBias->Resize(bias); @@ -1833,7 +1841,7 @@ public: sliceInputGrad, // (out) gradient for data input goes here scale, // (in) out of scale and bias, only scale is needed in gradient propagation blendFactor, // (in) smoothing weight for running stats (1=use only running stats) - actualMean, actualInvStdDev, // (in) actual mean/stddev values used in ForwardProp() + *m_savedMean, *m_savedInvStdDev, // (in) saved mean/invstddev values used in ForwardProp() *m_dScale, *m_dBias); // (out) gradients for scale and bias } else if (inputIndex == 1) // derivative with respect to the scale @@ -1855,6 +1863,7 @@ public: virtual void EndBackprop() override { + // TODO don't update in locked mode auto numSamples = GetMBLayout()->GetActualNumSamples(); m_samplesSeen += numSamples; Base::EndBackprop(); @@ -1892,19 +1901,22 @@ public: if (isFinalValidationPass) { - if (m_convertRunningVariance) + if (m_convertRunningVariancePending) { - // Input 4 is still inverse standard deviation. We convert it to variance, approximately, - // and output a warning. - fprintf(stderr, "WARNING: %ls: approximately converting inverse standard deviation into variance when loading pre-CuDNNv5 model\n", + // Prior to CNTK CuDNN v5 support (and the CNTK engine of the same time), mean and inverse standard deviation + // statistics were computed and stored. With CuDNN v5 (and the corresponding CNTK engine update), this was changed + // to mean and variance. + // To load an old model for further training or inference, Input(4) (which is inverse standard deviation) needs to + // be converted to variance, via v = 1/(isd^2) + epsilon, where 'v' is variance and 'isd' is inverse standard deviation. + // Since this is an approximation, we output a warning. + fprintf(stderr, "WARNING: %ls: loading pre-CuDNNv5 model and approximately converting variance statistics format\n", NodeName().c_str()); Matrix& runInvStdDev = Input(4)->Value(); runInvStdDev.AssignElementPowerOf(runInvStdDev, 2); runInvStdDev.ElementInverse(); runInvStdDev += (float) m_epsilon; - fprintf(stderr, "--- %ls converted runVariance after loading\n", NodeName().c_str()); runInvStdDev.Print(); - m_convertRunningVariance = false; + m_convertRunningVariancePending = false; } // check inputs @@ -1949,8 +1961,8 @@ public: void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override { Base::RequestMatricesBeforeForwardProp(matrixPool); - RequestMatrixFromPool(m_saveMean, matrixPool); - RequestMatrixFromPool(m_saveInvStdDev, matrixPool); + RequestMatrixFromPool(m_savedMean, matrixPool); + RequestMatrixFromPool(m_savedInvStdDev, matrixPool); } void RequestMatricesBeforeBackprop(MatrixPool& matrixPool) override @@ -1963,8 +1975,8 @@ public: void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) override { Base::ReleaseMatricesAfterBackprop(matrixPool); - ReleaseMatrixToPool(m_saveMean, matrixPool); - ReleaseMatrixToPool(m_saveInvStdDev, matrixPool); + ReleaseMatrixToPool(m_savedMean, matrixPool); + ReleaseMatrixToPool(m_savedInvStdDev, matrixPool); ReleaseMatrixToPool(m_dScale, matrixPool); ReleaseMatrixToPool(m_dBias, matrixPool); } @@ -2035,7 +2047,7 @@ private: // REVIEW alexeyk: if this works, document it properly in Wiki. 
double m_blendTimeConst; - // Epsilon used to compute inverse standard deviation (m_saveInvStdDev). + // Epsilon used to compute inverse standard deviation (m_savedInvStdDev). double m_epsilon; // Whether to use CNTK or cuDNN BN implementation. bool m_useCntkEngine; @@ -2048,8 +2060,8 @@ private: size_t m_samplesSeen; // Interpolated actual mean/inverse stddev values. Pre-computed on forward pass, also used in gradient computation. - shared_ptr> m_saveMean; - shared_ptr> m_saveInvStdDev; + shared_ptr> m_savedMean; + shared_ptr> m_savedInvStdDev; // Temp buffer for scale and bias derivatives. Only used in BackpropTo(), carrying info from first call to subsequent calls. // Not used for blendFactor=1 in CNTK engine. shared_ptr> m_dScale; @@ -2057,7 +2069,7 @@ private: std::unique_ptr> m_bnEng; - bool m_convertRunningVariance; + bool m_convertRunningVariancePending; }; template class BatchNormalizationNode; diff --git a/Source/Math/BatchNormalizationEngine.cpp b/Source/Math/BatchNormalizationEngine.cpp index c254c8c64..6664d2f31 100644 --- a/Source/Math/BatchNormalizationEngine.cpp +++ b/Source/Math/BatchNormalizationEngine.cpp @@ -10,14 +10,16 @@ namespace Microsoft { namespace MSR { namespace CNTK { template -void BatchNormEngine::Forward(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance, - Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) +void BatchNormEngine::Forward(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance, - Mat& out, double epsilon, Mat& savedMean, Mat& savedInvStdDev) { assert(in.GetNumRows() == m_inOutT.GetNumElements()); assert(out.GetNumRows() == m_inOutT.GetNumElements()); assert(in.GetNumCols() == out.GetNumCols()); assert(std::isfinite(expAvgFactor) && (0 <= expAvgFactor && expAvgFactor <= 1)); assert(std::isfinite(blendFactor) && (0 <= blendFactor && blendFactor <= 1)); + // In inference mode, must only use running statistics + assert(!inferenceOnly || ((expAvgFactor == 0.0) && (blendFactor == 1.0))); assert(std::isfinite(epsilon) && epsilon > 0); if (!m_spatial) { @@ -39,28 +41,35 @@ void BatchNormEngine::Forward(const Mat& in, const Mat& scale, const M assert(runVariance.GetNumCols() == 1); EnsureCompatible(); - ForwardCore(in, scale, bias, expAvgFactor, blendFactor, runMean, runVariance, out, epsilon, saveMean, saveInvStdDev); + ForwardCore(in, scale, bias, inferenceOnly, expAvgFactor, blendFactor, runMean, runVariance, out, epsilon, savedMean, savedInvStdDev); - if (!m_spatial) + if (!inferenceOnly) { - assert(saveMean.GetNumElements() == 0 || m_inOutT.GetNumElements() == saveMean.GetNumRows()); - assert(saveInvStdDev.GetNumElements() == 0 || m_inOutT.GetNumElements() == saveInvStdDev.GetNumRows()); + assert(!savedMean.IsEmpty()); + assert(!savedInvStdDev.IsEmpty()); + if (!m_spatial) + { + assert(m_inOutT.GetNumElements() == savedMean.GetNumRows()); + assert(m_inOutT.GetNumElements() == savedInvStdDev.GetNumRows()); + } + else + { + assert((m_inOutT.GetNumElements() % savedMean.GetNumRows()) == 0); + assert((m_inOutT.GetNumElements() % savedInvStdDev.GetNumRows()) == 0); + } + assert(savedMean.GetNumCols() == 1); + assert(savedInvStdDev.GetNumCols() == 1); } - else - { - assert(saveMean.GetNumElements() == 0 || (m_inOutT.GetNumElements() % saveMean.GetNumRows()) == 0); - assert(saveInvStdDev.GetNumElements() == 0 || (m_inOutT.GetNumElements() % saveInvStdDev.GetNumRows()) == 0); - } - 
assert(saveMean.GetNumElements() == 0 || saveMean.GetNumCols() == 1); - assert(saveInvStdDev.GetNumElements() == 0 || saveInvStdDev.GetNumCols() == 1); } template void BatchNormEngine::Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, - const Mat& saveMean, const Mat& saveInvStdDev, Mat& scaleGrad, Mat& biasGrad) + const Mat& savedMean, const Mat& savedInvStdDev, Mat& scaleGrad, Mat& biasGrad) { + assert(!savedMean.IsEmpty()); + assert(!savedInvStdDev.IsEmpty()); EnsureCompatible(); - BackwardCore(in, srcGrad, grad, scale, blendFactor, saveMean, saveInvStdDev, scaleGrad, biasGrad); + BackwardCore(in, srcGrad, grad, scale, blendFactor, savedMean, savedInvStdDev, scaleGrad, biasGrad); } template @@ -89,24 +98,23 @@ protected: InvalidArgument("CNTK batch normalization supports only cudnn(CHW) layout."); } - void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance, - Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) override + void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance, + Mat& out, double epsilon, Mat& savedMean, Mat& savedInvStdDev) override { - in.BatchNormalizationForward(scale, bias, expAvgFactor, blendFactor, runMean, runVariance, out, epsilon, saveMean, saveInvStdDev); + in.BatchNormalizationForward(scale, bias, inferenceOnly, expAvgFactor, blendFactor, runMean, runVariance, out, epsilon, savedMean, savedInvStdDev); } - void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev, + void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& savedMean, const Mat& savedInvStdDev, Mat& scaleGrad, Mat& biasGrad) override { - srcGrad.BatchNormalizationBackward(in, grad, scale, blendFactor, saveMean, saveInvStdDev, scaleGrad, biasGrad); + srcGrad.BatchNormalizationBackward(in, grad, scale, blendFactor, savedMean, savedInvStdDev, scaleGrad, biasGrad); } }; template class CntkBatchNormEngine; template class CntkBatchNormEngine; -template -bool HasFlag(T src, T testFlag) +template bool HasFlag(T src, T testFlag) { return ((int)src & (int)testFlag) != 0; } diff --git a/Source/Math/BatchNormalizationEngine.h b/Source/Math/BatchNormalizationEngine.h index d4beb82ae..0f594d83c 100644 --- a/Source/Math/BatchNormalizationEngine.h +++ b/Source/Math/BatchNormalizationEngine.h @@ -34,7 +34,7 @@ public: public: virtual ~BatchNormEngine() = default; - void Forward(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance, + void Forward(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance, Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev); void Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev, @@ -56,7 +56,7 @@ protected: virtual void EnsureCompatible() = 0; // saveMean/saveInvStdDev return the actual mean/stddev used for normalization, except for blendFactor=1, these are unused and untouched - virtual void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance, + virtual void ForwardCore(const 
Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance, Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) = 0; virtual void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev, diff --git a/Source/Math/CPUMatrix.cpp b/Source/Math/CPUMatrix.cpp index 24dd8439f..1b8813445 100644 --- a/Source/Math/CPUMatrix.cpp +++ b/Source/Math/CPUMatrix.cpp @@ -4372,20 +4372,19 @@ void CPUMatrix::AveragePoolingBackward(const CPUMatrix& mpRowCol, } template -void CPUMatrix::BatchNormalizationForward(const CPUMatrix& scale, const CPUMatrix& bias, double expAvgFactor, double blendFactor, +void CPUMatrix::BatchNormalizationForward(const CPUMatrix& scale, const CPUMatrix& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, CPUMatrix& runMean, CPUMatrix& runVariance, CPUMatrix& out, double epsilon, CPUMatrix& saveMean, CPUMatrix& saveInvStdDev) const { assert((GetNumRows() % scale.GetNumRows()) == 0); - if (expAvgFactor != 0 || blendFactor != 1) + if (!inferenceOnly || expAvgFactor != 0 || blendFactor != 1) RuntimeError("Batch normalization training on CPU is not yet implemented."); saveMean.Resize(0, 0); // only doing inference: these two are not produced saveInvStdDev.Resize(0, 0); bool spatial = GetNumRows() != scale.GetNumRows(); - size_t batchSize = GetNumCols(); if (spatial) { size_t spatialSize = GetNumRows() / scale.GetNumRows(); @@ -4395,7 +4394,7 @@ void CPUMatrix::BatchNormalizationForward(const CPUMatrix& s for (long irow = 0; irow < out.GetNumRows(); irow++) { size_t imap = irow / spatialSize; - ElemType stdDev = sqrt(runVariance(imap, 0) * (batchSize - 1) / batchSize + epsilon); + ElemType stdDev = sqrt(runVariance(imap, 0) + epsilon); out(irow, icol) = scale(imap, 0) * ((*this)(irow, icol) - runMean(imap, 0)) / stdDev + bias(imap, 0); } } @@ -4407,7 +4406,7 @@ void CPUMatrix::BatchNormalizationForward(const CPUMatrix& s { for (long irow = 0; irow < out.GetNumRows(); irow++) { - ElemType stdDev = sqrt(runVariance(irow, 0) * (batchSize - 1) / batchSize + epsilon); + ElemType stdDev = sqrt(runVariance(irow, 0) + epsilon); out(irow, icol) = scale(irow, 0) * ((*this)(irow, icol) - runMean(irow, 0)) / stdDev + bias(irow, 0); } } diff --git a/Source/Math/CPUMatrix.h b/Source/Math/CPUMatrix.h index 357d2b7e6..782b31576 100644 --- a/Source/Math/CPUMatrix.h +++ b/Source/Math/CPUMatrix.h @@ -27,8 +27,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { double logadd(double x, double y); -//To comply with BLAS libraries matrices are stored in ColMajor. However, by default C/C++/C# use RowMajor -//conversion is need when passing data between CPUMatrix and C++ matrices +// To comply with BLAS libraries matrices are stored in ColMajor. 
However, by default C/C++/C# use RowMajor; +// conversion is needed when passing data between CPUMatrix and C++ matrices template class MATH_API CPUMatrix : public BaseMatrix { @@ -375,7 +375,7 @@ public: void AveragePoolingBackward(const CPUMatrix& mpRowCol, const CPUMatrix& mpRowIndices, const CPUMatrix& indices, CPUMatrix& grad) const; - void BatchNormalizationForward(const CPUMatrix& scale, const CPUMatrix& bias, double expAvgFactor, double blendFactor, CPUMatrix& runMean, CPUMatrix& runStdDev, + void BatchNormalizationForward(const CPUMatrix& scale, const CPUMatrix& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, CPUMatrix& runMean, CPUMatrix& runVariance, CPUMatrix& out, double epsilon, CPUMatrix& saveMean, CPUMatrix& saveInvStdDev) const; void BatchNormalizationBackward(const CPUMatrix& in, CPUMatrix& grad, const CPUMatrix& scale, double blendFactor, const CPUMatrix& saveMean, const CPUMatrix& saveInvStdDev, CPUMatrix& scaleGrad, CPUMatrix& biasGrad) const; diff --git a/Source/Math/CntkBatchNormalization.cuh b/Source/Math/CntkBatchNormalization.cuh index ae1f77221..e6b375084 100644 --- a/Source/Math/CntkBatchNormalization.cuh +++ b/Source/Math/CntkBatchNormalization.cuh @@ -203,7 +203,7 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, assert(::isfinite(epsilon) && epsilon > 0); assert(::isfinite(expAvgFactor) && 0 <= expAvgFactor && expAvgFactor <= 1); assert(::isfinite(blendFactor) && 0 <= blendFactor && blendFactor <= 1); - assert(expAvgFactor != 0 || blendFactor != 1); // otherwise no need call (no update) + // TODO optimize for (expAvgFactor == 0) && (blendFactor == 1) int irowSrcBase = (blockIdx.x * BlockDimX + threadIdx.x) * U; if (irowSrcBase >= vectorSize) @@ -346,7 +346,7 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, im2[k] = Operations::RSqrt(static_cast(m2[k] / batchSize + epsilon)); if (blendFactor != 0) { - ElemType runInvStdDev = Operations::RSqrt(static_cast(run[k] * (batchSize - 1) / batchSize + epsilon)); + ElemType runInvStdDev = Operations::RSqrt(static_cast(run[k] + epsilon)); im2[k] = blendFactor * runInvStdDev + (1.0 - blendFactor) * im2[k]; } } @@ -376,7 +376,7 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial assert((vectorSize % spatialSize) == 0); assert(::isfinite(expAvgFactor) && 0 <= expAvgFactor && expAvgFactor <= 1); assert(::isfinite(blendFactor) && 0 <= blendFactor && blendFactor <= 1); - assert(expAvgFactor != 0 || blendFactor != 1); // otherwise no need call (no update) + // TODO optimize for (expAvgFactor == 0) && (blendFactor == 1) assert(::isfinite(epsilon) && epsilon > 0); int irowSrcBase = blockIdx.x * spatialSize + threadIdx.x * U; @@ -506,7 +506,7 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial xInvStdDev[blockIdx.x] = Operations::RSqrt(static_cast(m2[0] / (batchSize * spatialSize) + epsilon)); if (blendFactor != 0) { - ElemType runInvStdDev = Operations::RSqrt(static_cast(runVariance[blockIdx.x] * (batchSize - 1) / batchSize + epsilon)); + ElemType runInvStdDev = Operations::RSqrt(static_cast(runVariance[blockIdx.x] + epsilon)); xInvStdDev[blockIdx.x] = blendFactor * runInvStdDev + (1.0 - blendFactor) * xInvStdDev[blockIdx.x]; } } @@ -729,7 +729,7 @@ struct NormalizeBatchTraining template __global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, const ElemType* x, const ElemType* dy, ElemType* dScale, ElemType* dBias, - const ElemType* saveMean, const ElemType* 
saveInvStdDev) + const ElemType* savedMean, const ElemType* savedInvStdDev) { static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32)."); static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32)."); @@ -754,8 +754,8 @@ __global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, con // Read mean and inv std dev. if (threadIdx.y == 0) { - LoadValues(saveMean + irowSrcBase, mean); - LoadValues(saveInvStdDev + irowSrcBase, invStdDev); + LoadValues(savedMean + irowSrcBase, mean); + LoadValues(savedInvStdDev + irowSrcBase, invStdDev); StoreValues(mean, &meanS[threadIdx.x * U]); StoreValues(invStdDev, &invStdDevS[threadIdx.x * U]); } @@ -833,7 +833,7 @@ __global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, con template __global__ void kComputeSpatialScaleAndBiasGradients(int vectorSize, int spatialSize, int batchSize, const ElemType* x, const ElemType* dy, - ElemType* dScale, ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev) + ElemType* dScale, ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev) { static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32)."); static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32)."); @@ -859,8 +859,8 @@ __global__ void kComputeSpatialScaleAndBiasGradients(int vectorSize, int spatial // Read mean and inv std dev. if (tid == 0) { - meanS = saveMean[blockIdx.x]; - invStdDevS = saveInvStdDev[blockIdx.x]; + meanS = savedMean[blockIdx.x]; + invStdDevS = savedInvStdDev[blockIdx.x]; } __syncthreads(); if (tid != 0) @@ -922,7 +922,7 @@ struct ComputeScaleAndBiasGradients { template static void Call(size_t vectorSize, size_t batchSize, const ElemType* x, const ElemType* dy, - ElemType* dScale, ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream) + ElemType* dScale, ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev, cudaStream_t stream) { assert((vectorSize % U) == 0); const int BlockDimX = 32 / U; @@ -931,7 +931,7 @@ struct ComputeScaleAndBiasGradients // Create a grid that has uses striding in y-dimension to cover whole minibatch. auto gdim = dim3(static_cast(RoundUpToMultiple(vectorSize, BlockDimX * U))); kComputeScaleAndBiasGradients<<>>( - static_cast(vectorSize), static_cast(batchSize), x, dy, dScale, dBias, saveMean, saveInvStdDev); + static_cast(vectorSize), static_cast(batchSize), x, dy, dScale, dBias, savedMean, savedInvStdDev); } }; @@ -940,7 +940,7 @@ struct ComputeSpatialScaleAndBiasGradients { template static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, const ElemType* x, const ElemType* dy, - ElemType* dScale, ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream) + ElemType* dScale, ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev, cudaStream_t stream) { assert((spatialSize % U) == 0); assert((vectorSize % spatialSize) == 0); @@ -951,7 +951,7 @@ struct ComputeSpatialScaleAndBiasGradients // Create a grid that has uses striding in y-dimension to cover whole minibatch. 
auto gdim = dim3(static_cast(vectorSize / spatialSize)); kComputeSpatialScaleAndBiasGradients<<>>( - static_cast(vectorSize), static_cast(spatialSize), static_cast(batchSize), x, dy, dScale, dBias, saveMean, saveInvStdDev); + static_cast(vectorSize), static_cast(spatialSize), static_cast(batchSize), x, dy, dScale, dBias, savedMean, savedInvStdDev); } }; @@ -959,7 +959,7 @@ struct ComputeSpatialScaleAndBiasGradients template __global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize, int batchSize, const ElemType* x, const ElemType* dy, ElemType* dx, const ElemType* bnScale, ElemType mbStatsWeight, const ElemType* dScale, const ElemType* dBias, - const ElemType* saveMean, const ElemType* saveInvStdDev) + const ElemType* savedMean, const ElemType* savedInvStdDev) { static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32)."); static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32)."); @@ -991,8 +991,8 @@ __global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize scale[k] = bnScale[imap]; ds[k] = dScale[imap]; db[k] = dBias[imap]; - mean[k] = saveMean[imap]; - invStdDev[k] = saveInvStdDev[imap]; + mean[k] = savedMean[imap]; + invStdDev[k] = savedInvStdDev[imap]; } } else @@ -1000,8 +1000,8 @@ __global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize LoadValues(bnScale + irowBase, scale); LoadValues(dScale + irowBase, ds); LoadValues(dBias + irowBase, db); - LoadValues(saveMean + irowBase, mean); - LoadValues(saveInvStdDev + irowBase, invStdDev); + LoadValues(savedMean + irowBase, mean); + LoadValues(savedInvStdDev + irowBase, invStdDev); } int icol = blockIdx.y * BlockDimY + threadIdx.y; @@ -1053,7 +1053,7 @@ struct BackpropagateBatchNormGradients template static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, bool spatial, const ElemType* x, const ElemType* dy, ElemType* dx, const ElemType* bnScale, ElemType mbStatsWeight, const ElemType* dScale, - const ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream) + const ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev, cudaStream_t stream) { assert((vectorSize % U) == 0); const int BlockDimX = 32 / U; @@ -1064,12 +1064,12 @@ struct BackpropagateBatchNormGradients if (spatial) { kBackpropagateBatchNormGradients<<>>( - static_cast(vectorSize), static_cast(spatialSize), static_cast(batchSize), x, dy, dx, bnScale, mbStatsWeight, dScale, dBias, saveMean, saveInvStdDev); + static_cast(vectorSize), static_cast(spatialSize), static_cast(batchSize), x, dy, dx, bnScale, mbStatsWeight, dScale, dBias, savedMean, savedInvStdDev); } else { kBackpropagateBatchNormGradients<<>>( - static_cast(vectorSize), static_cast(spatialSize), static_cast(batchSize), x, dy, dx, bnScale, mbStatsWeight, dScale, dBias, saveMean, saveInvStdDev); + static_cast(vectorSize), static_cast(spatialSize), static_cast(batchSize), x, dy, dx, bnScale, mbStatsWeight, dScale, dBias, savedMean, savedInvStdDev); } } }; diff --git a/Source/Math/CuDnnBatchNormalization.cu b/Source/Math/CuDnnBatchNormalization.cu index 3dac26fa4..584c2943b 100644 --- a/Source/Math/CuDnnBatchNormalization.cu +++ b/Source/Math/CuDnnBatchNormalization.cu @@ -42,9 +42,11 @@ protected: InvalidArgument("cuDNN batch normalization supports tensors of max 4 dimensions."); } - void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double 
blendFactor, Mat& runMean, Mat& runStdDev, - Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) override + void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance, + Mat& out, double epsilon, Mat& savedMean, Mat& savedInvStdDev) override { + UNUSED(inferenceOnly); // TODO + // REVIEW alexeyk: there might be a way to do this in cuDNN. if (blendFactor != 0 && (blendFactor != 1 || expAvgFactor > 0)) InvalidArgument("cuDNN batch normalization engine currently supports blendTimeConstant of 0 or 1 only."); @@ -53,25 +55,25 @@ protected: cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION; // cuDNN will fail with BAD_PARAM if epsilon < CUDNN_BN_MIN_EPSILON. epsilon = max(epsilon, CUDNN_BN_MIN_EPSILON); - // expAvgFactor == 0 && blendFactor == 1 means we are in inference mode. - if (expAvgFactor == 0 && blendFactor == 1) + if (inferenceOnly) { - saveMean.Resize(0, 0); // (these are not produced in this case) - saveInvStdDev.Resize(0, 0); + assert(expAvgFactor == 0 && blendFactor == 1); + savedMean.Resize(0, 0); // (these are not produced in this case) + savedInvStdDev.Resize(0, 0); CUDNN_CALL(cudnnBatchNormalizationForwardInference(*m_cudnn, mode, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(out), - m_scaleBiasCuDnnT, ptr(scale), ptr(bias), ptr(runMean), ptr(runStdDev), epsilon)); + m_scaleBiasCuDnnT, ptr(scale), ptr(bias), ptr(runMean), ptr(runVariance), epsilon)); } else { - saveMean.Resize(runMean); - saveInvStdDev.Resize(runMean); + savedMean.Resize(runMean); + savedInvStdDev.Resize(runMean); CUDNN_CALL(cudnnBatchNormalizationForwardTraining(*m_cudnn, mode, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in), - m_inOutCuDnnT, ptr(out), m_scaleBiasCuDnnT, ptr(scale), ptr(bias), expAvgFactor, ptr(runMean), ptr(runStdDev), - epsilon, ptr(saveMean), ptr(saveInvStdDev))); + m_inOutCuDnnT, ptr(out), m_scaleBiasCuDnnT, ptr(scale), ptr(bias), expAvgFactor, ptr(runMean), ptr(runVariance), + epsilon, ptr(savedMean), ptr(savedInvStdDev))); } } - void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev, + void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& savedMean, const Mat& savedInvStdDev, Mat& scaleGrad, Mat& biasGrad) override { UNUSED(blendFactor); // BUGBUG: It should be used. @@ -79,7 +81,7 @@ protected: cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION; // REVIEW alexeyk: change betaParamDiff to 1 and update CNTK BN engine. CUDNN_CALL(cudnnBatchNormalizationBackward(*m_cudnn, mode, &C::One, &C::One, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(srcGrad), m_inOutCuDnnT, ptr(grad), - m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev))); + m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(savedMean), ptr(savedInvStdDev))); } private: diff --git a/Source/Math/CuDnnCommon.h b/Source/Math/CuDnnCommon.h index 95d48aeba..743f5a86e 100644 --- a/Source/Math/CuDnnCommon.h +++ b/Source/Math/CuDnnCommon.h @@ -9,7 +9,7 @@ #include "TensorShape.h" #include #if CUDNN_MAJOR < 5 -#error CNTK needs CuDNN version 5.0 or higher, cf. 
https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#cudnn or https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Linux#cudnn. +#error CNTK requires the NVIDIA cuDNN library 5.0 or higher to build, cf. https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#cudnn or https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Linux#cudnn for installation instructions. #endif #include diff --git a/Source/Math/GPUMatrix.cu b/Source/Math/GPUMatrix.cu index 4bd8d028b..d3d40ac83 100644 --- a/Source/Math/GPUMatrix.cu +++ b/Source/Math/GPUMatrix.cu @@ -67,7 +67,7 @@ cudaStream_t MATH_API GetStream() template \ GPUMatrix& GPUMatrix::Inplace##f() \ { \ - performElementWiseFunction(ElementWiseOperator::op##f, Data()); \ + performElementWiseFunction(ElementWiseOperator::op##f, Data()); \ return *this; \ } #define DEF_ELEMWISE_ASSIGN_FUNC(f) \ @@ -77,8 +77,8 @@ cudaStream_t MATH_API GetStream() if (a.IsEmpty()) \ LogicError("Assign##f##Of: Matrix a is empty."); \ if (this != &a) \ - RequireSize(a.GetNumRows(), a.GetNumCols()); \ - performElementWiseFunction(ElementWiseOperator::op##f, a.Data()); \ + RequireSize(a.GetNumRows(), a.GetNumCols()); \ + performElementWiseFunction(ElementWiseOperator::op##f, a.Data()); \ return *this; \ } @@ -3160,12 +3160,13 @@ void GPUMatrix::AveragePoolingBackward(const GPUMatrix& mpRowCol, Data(), (int)GetNumRows(), grad.Data(), (int)grad.GetNumRows()); } -// returns saveMean/saveInvStdDev which are the actual values used to perform the normalization, except for blendFactor 1, in which case they are unused and set to empty +// returns savedMean/savedInvStdDev which are the actual values used to perform the normalization, except for blendFactor 1, in which case they are unused and set to empty template -void GPUMatrix::BatchNormalizationForward(const GPUMatrix& scale, const GPUMatrix& bias, double expAvgFactor, double blendFactor, +void GPUMatrix::BatchNormalizationForward(const GPUMatrix& scale, const GPUMatrix& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, GPUMatrix& runMean, GPUMatrix& runVariance, GPUMatrix& out, double epsilon, - GPUMatrix& saveMean, GPUMatrix& saveInvStdDev) const + GPUMatrix& savedMean, GPUMatrix& savedInvStdDev) const { + UNUSED(inferenceOnly); // TODO assert((GetNumRows() % scale.GetNumRows()) == 0); bool spatial = GetNumRows() != scale.GetNumRows(); @@ -3178,52 +3179,54 @@ void GPUMatrix::BatchNormalizationForward(const GPUMatrix& s assert(0 < batchSize && batchSize <= std::numeric_limits::max()); SyncGuard syncGuard; - if (expAvgFactor > 0 || blendFactor < 1) + //if (expAvgFactor > 0 || blendFactor < 1) + if (inferenceOnly) { - // Compute data mean and inverse standard deviation (into saveMean and - // saveInvStdDev), and update running mean and variance. + // Pick running statistics for normalizing. No update required, and + // saved statistics do not need to be produced. + assert(expAvgFactor == 0 && blendFactor == 1); + normalizeRunningStats = true; + savedMean.RequireSize(0, 0); + savedInvStdDev.RequireSize(0, 0); + } + else + { + // Compute data mean and inverse standard deviation (into savedMean and + // savedInvStdDev), and update running mean and variance. + // TODO expAvgFactor == 0 && blendFactor == 1 can be optimized (no need for update). 
normalizeRunningStats = false; - saveMean.RequireSize(runMean); - saveInvStdDev.RequireSize(runMean); + savedMean.RequireSize(runMean); + savedInvStdDev.RequireSize(runMean); if (spatial) { Call(spatialSize, vectorSize, spatialSize, batchSize, Data(), expAvgFactor, blendFactor, runMean.Data(), runVariance.Data(), epsilon, - saveMean.Data(), saveInvStdDev.Data(), GetStream()); + savedMean.Data(), savedInvStdDev.Data(), GetStream()); } else { Call(vectorSize, vectorSize, batchSize, Data(), expAvgFactor, blendFactor, runMean.Data(), runVariance.Data(), epsilon, - saveMean.Data(), saveInvStdDev.Data(), GetStream()); + savedMean.Data(), savedInvStdDev.Data(), GetStream()); } } - else - { - // With expAvgFactor == 0 and blendFactor == 1 the running statistics - // do not need to be updated. CNTK engine in this case returns saveMean - // and saveInvStdDev empty, but cuDNN engine does not. - normalizeRunningStats = true; - saveMean.RequireSize(0, 0); - saveInvStdDev.RequireSize(0, 0); - } Call(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize, spatial, normalizeRunningStats, epsilon, Data(), out.Data(), scale.Data(), bias.Data(), runMean.Data(), runVariance.Data(), - saveMean.Data(), saveInvStdDev.Data(), + savedMean.Data(), savedInvStdDev.Data(), GetStream()); } -// saveMean/saveInvStdDev are the interpolated mean/inverse standard deviation as used in ForwardProp(). +// savedMean/savedInvStdDev are the interpolated mean/inverse standard deviation as used in ForwardProp(). // For blendFactor=1, they are not used and can be uninitialized or empty. template void GPUMatrix::BatchNormalizationBackward(const GPUMatrix& in, GPUMatrix& grad, const GPUMatrix& scale, double blendFactor, - const GPUMatrix& saveMean, const GPUMatrix& saveInvStdDev, + const GPUMatrix& savedMean, const GPUMatrix& savedInvStdDev, GPUMatrix& scaleGrad, GPUMatrix& biasGrad) const { assert((GetNumRows() % scale.GetNumRows()) == 0); @@ -3240,16 +3243,16 @@ void GPUMatrix::BatchNormalizationBackward(const GPUMatrix& if (spatial) { Call(spatialSize, vectorSize, spatialSize, batchSize, in.Data(), Data(), scaleGrad.Data(), biasGrad.Data(), - saveMean.Data(), saveInvStdDev.Data(), GetStream()); + savedMean.Data(), savedInvStdDev.Data(), GetStream()); } else { Call(vectorSize, vectorSize, batchSize, in.Data(), Data(), scaleGrad.Data(), biasGrad.Data(), - saveMean.Data(), saveInvStdDev.Data(), GetStream()); + savedMean.Data(), savedInvStdDev.Data(), GetStream()); } ElemType mbStatsWeight = (ElemType)(1 - blendFactor); // weight for contribution from actual MB stats (0 if none, e.g. locked BN node) Call(spatial ? 
spatialSize : vectorSize, vectorSize, spatialSize, batchSize, spatial, - in.Data(), Data(), grad.Data(), scale.Data(), mbStatsWeight, scaleGrad.Data(), biasGrad.Data(), saveMean.Data(), saveInvStdDev.Data(), GetStream()); + in.Data(), Data(), grad.Data(), scale.Data(), mbStatsWeight, scaleGrad.Data(), biasGrad.Data(), savedMean.Data(), savedInvStdDev.Data(), GetStream()); } #pragma region Static BLAS Functions diff --git a/Source/Math/GPUMatrix.h b/Source/Math/GPUMatrix.h index 8930af586..85a3753db 100644 --- a/Source/Math/GPUMatrix.h +++ b/Source/Math/GPUMatrix.h @@ -467,8 +467,8 @@ public: void AveragePoolingForward(const GPUMatrix& mpRowCol, const GPUMatrix& mpRowIndices, const GPUMatrix& indices, GPUMatrix& output) const; void AveragePoolingBackward(const GPUMatrix& mpRowCol, const GPUMatrix& mpRowIndices, const GPUMatrix& indices, GPUMatrix& grad) const; - void BatchNormalizationForward(const GPUMatrix& scale, const GPUMatrix& bias, double expAvgFactor, double blendFactor, - GPUMatrix& runMean, GPUMatrix& runStdDev, GPUMatrix& out, double epsilon, + void BatchNormalizationForward(const GPUMatrix& scale, const GPUMatrix& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, + GPUMatrix& runMean, GPUMatrix& runVariance, GPUMatrix& out, double epsilon, GPUMatrix& saveMean, GPUMatrix& saveInvStdDev) const; void BatchNormalizationBackward(const GPUMatrix& in, GPUMatrix& grad, const GPUMatrix& scale, double blendFactor, const GPUMatrix& saveMean, const GPUMatrix& saveInvStdDev, diff --git a/Source/Math/MathCUDA.vcxproj b/Source/Math/MathCUDA.vcxproj index 401704daa..a88a253b6 100644 --- a/Source/Math/MathCUDA.vcxproj +++ b/Source/Math/MathCUDA.vcxproj @@ -197,6 +197,6 @@ if exist "$(CuDnnDll)" xcopy /D /Y "$(CuDnnDll)" "$(OutputPath)" - + diff --git a/Source/Math/Matrix.cpp b/Source/Math/Matrix.cpp index 49a3accae..12dc05eb8 100644 --- a/Source/Math/Matrix.cpp +++ b/Source/Math/Matrix.cpp @@ -4274,8 +4274,8 @@ void Matrix::AveragePoolingBackward(const Matrix& mpRowCol, const } template -void Matrix::BatchNormalizationForward(const Matrix& scale, const Matrix& bias, double expAvgFactor, double blendFactor, - Matrix& runMean, Matrix& runStdDev, Matrix& out, double epsilon, +void Matrix::BatchNormalizationForward(const Matrix& scale, const Matrix& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, + Matrix& runMean, Matrix& runVariance, Matrix& out, double epsilon, Matrix& saveMean, Matrix& saveInvStdDev) const { DecideAndMoveToRightDevice(*this, out); @@ -4283,11 +4283,11 @@ void Matrix::BatchNormalizationForward(const Matrix& scale, // REVIEW alexeyk: add sparse version. 
 
 #pragma region Static BLAS Functions
diff --git a/Source/Math/GPUMatrix.h b/Source/Math/GPUMatrix.h
index 8930af586..85a3753db 100644
--- a/Source/Math/GPUMatrix.h
+++ b/Source/Math/GPUMatrix.h
@@ -467,8 +467,8 @@ public:
     void AveragePoolingForward(const GPUMatrix& mpRowCol, const GPUMatrix& mpRowIndices, const GPUMatrix& indices, GPUMatrix& output) const;
     void AveragePoolingBackward(const GPUMatrix& mpRowCol, const GPUMatrix& mpRowIndices, const GPUMatrix& indices, GPUMatrix& grad) const;
 
-    void BatchNormalizationForward(const GPUMatrix& scale, const GPUMatrix& bias, double expAvgFactor, double blendFactor,
-                                   GPUMatrix& runMean, GPUMatrix& runStdDev, GPUMatrix& out, double epsilon,
+    void BatchNormalizationForward(const GPUMatrix& scale, const GPUMatrix& bias, bool inferenceOnly, double expAvgFactor, double blendFactor,
+                                   GPUMatrix& runMean, GPUMatrix& runVariance, GPUMatrix& out, double epsilon,
                                    GPUMatrix& saveMean, GPUMatrix& saveInvStdDev) const;
     void BatchNormalizationBackward(const GPUMatrix& in, GPUMatrix& grad, const GPUMatrix& scale, double blendFactor,
                                     const GPUMatrix& saveMean, const GPUMatrix& saveInvStdDev,
diff --git a/Source/Math/MathCUDA.vcxproj b/Source/Math/MathCUDA.vcxproj
index 401704daa..a88a253b6 100644
--- a/Source/Math/MathCUDA.vcxproj
+++ b/Source/Math/MathCUDA.vcxproj
@@ -197,6 +197,6 @@
     if exist "$(CuDnnDll)" xcopy /D /Y "$(CuDnnDll)" "$(OutputPath)"
-
+
diff --git a/Source/Math/Matrix.cpp b/Source/Math/Matrix.cpp
index 49a3accae..12dc05eb8 100644
--- a/Source/Math/Matrix.cpp
+++ b/Source/Math/Matrix.cpp
@@ -4274,8 +4274,8 @@ void Matrix::AveragePoolingBackward(const Matrix& mpRowCol, const
 }
 
 template 
-void Matrix::BatchNormalizationForward(const Matrix& scale, const Matrix& bias, double expAvgFactor, double blendFactor,
-                                       Matrix& runMean, Matrix& runStdDev, Matrix& out, double epsilon,
+void Matrix::BatchNormalizationForward(const Matrix& scale, const Matrix& bias, bool inferenceOnly, double expAvgFactor, double blendFactor,
+                                       Matrix& runMean, Matrix& runVariance, Matrix& out, double epsilon,
                                        Matrix& saveMean, Matrix& saveInvStdDev) const
 {
     DecideAndMoveToRightDevice(*this, out);
@@ -4283,11 +4283,11 @@ void Matrix::BatchNormalizationForward(const Matrix& scale,
     // REVIEW alexeyk: add sparse version.
     DISPATCH_MATRIX_ON_FLAG(this,
                             this,
-                            m_CPUMatrix->BatchNormalizationForward(*(scale.m_CPUMatrix), *(bias.m_CPUMatrix), expAvgFactor, blendFactor,
-                                                                   *(runMean.m_CPUMatrix), *(runStdDev.m_CPUMatrix),
+                            m_CPUMatrix->BatchNormalizationForward(*(scale.m_CPUMatrix), *(bias.m_CPUMatrix), inferenceOnly, expAvgFactor, blendFactor,
+                                                                   *(runMean.m_CPUMatrix), *(runVariance.m_CPUMatrix),
                                                                    *(out.m_CPUMatrix), epsilon, *(saveMean.m_CPUMatrix), *(saveInvStdDev.m_CPUMatrix)),
-                            m_GPUMatrix->BatchNormalizationForward(*(scale.m_GPUMatrix), *(bias.m_GPUMatrix), expAvgFactor, blendFactor,
-                                                                   *(runMean.m_GPUMatrix), *(runStdDev.m_GPUMatrix),
+                            m_GPUMatrix->BatchNormalizationForward(*(scale.m_GPUMatrix), *(bias.m_GPUMatrix), inferenceOnly, expAvgFactor, blendFactor,
+                                                                   *(runMean.m_GPUMatrix), *(runVariance.m_GPUMatrix),
                                                                    *(out.m_GPUMatrix), epsilon, *(saveMean.m_GPUMatrix), *(saveInvStdDev.m_GPUMatrix)),
                             NOT_IMPLEMENTED,
                             NOT_IMPLEMENTED);
diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h
index 150ef865e..28a4c1f40 100644
--- a/Source/Math/Matrix.h
+++ b/Source/Math/Matrix.h
@@ -500,8 +500,8 @@ public:
     void AveragePoolingForward(const Matrix& mpRowCol, const Matrix& mpRowIndices, const Matrix& indices, Matrix& output) const;
     void AveragePoolingBackward(const Matrix& mpRowCol, const Matrix& mpRowIndices, const Matrix& indices, Matrix& grad) const;
 
-    void BatchNormalizationForward(const Matrix& scale, const Matrix& bias, double expAvgFactor, double blendFactor,
-                                   Matrix& runMean, Matrix& runStdDev, Matrix& out, double epsilon,
+    void BatchNormalizationForward(const Matrix& scale, const Matrix& bias, bool inferenceOnly, double expAvgFactor, double blendFactor,
+                                   Matrix& runMean, Matrix& runVariance, Matrix& out, double epsilon,
                                    Matrix& saveMean, Matrix& saveInvStdDev) const;
     void BatchNormalizationBackward(const Matrix& in, Matrix& grad, const Matrix& scale, double blendFactor, const Matrix& saveMean, const Matrix& saveInvStdDev,
                                     Matrix& scaleGrad, Matrix& biasGrad) const;
diff --git a/Source/Math/NoGPU.cpp b/Source/Math/NoGPU.cpp
index 55596854c..7f21caa56 100644
--- a/Source/Math/NoGPU.cpp
+++ b/Source/Math/NoGPU.cpp
@@ -1827,7 +1827,7 @@ void GPUMatrix::AveragePoolingBackward(const GPUMatrix& mpRowCol,
 template 
 void GPUMatrix::BatchNormalizationForward(const GPUMatrix& scale, const GPUMatrix& bias, double expAvgFactor, double blendFactor,
-                                          GPUMatrix& runMean, GPUMatrix& runStdDev, GPUMatrix& out, double epsilon,
+                                          GPUMatrix& runMean, GPUMatrix& runVariance, GPUMatrix& out, double epsilon,
                                           GPUMatrix& saveMean, GPUMatrix& saveInvStdDev) const
 {
 }
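Aside (illustration only, not code from this patch): the new inferenceOnly parameter threaded through these signatures separates the training path, where minibatch statistics are computed and the running ones possibly updated, from pure evaluation, where the frozen running statistics are the only ones used (this corresponds to expAvgFactor == 0 and blendFactor == 1 in the training-path code). A minimal stand-alone sketch of the evaluation-time computation for one feature; the function name is invented and the real implementations live in the CPU/GPU matrix classes and the cuDNN engine:

#include <cmath>
#include <cstddef>
#include <vector>

// Inference-only batch normalization: no statistics are computed or updated,
// runMean/runVariance are read-only inputs.
std::vector<double> InferenceOnlyForward(const std::vector<double>& x,
                                         double scale, double bias,
                                         double runMean, double runVariance, double epsilon)
{
    const double invStdDev = 1.0 / std::sqrt(runVariance + epsilon);
    std::vector<double> y(x.size());
    for (std::size_t i = 0; i < x.size(); ++i)
        y[i] = scale * (x[i] - runMean) * invStdDev + bias;
    return y;
}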
diff --git a/Tests/EndToEndTests/BatchNormalization/NonSpatial/01_OneHidden.cntk b/Tests/EndToEndTests/BatchNormalization/NonSpatial/01_OneHidden.cntk
index c3df1a844..2fd994497 100644
--- a/Tests/EndToEndTests/BatchNormalization/NonSpatial/01_OneHidden.cntk
+++ b/Tests/EndToEndTests/BatchNormalization/NonSpatial/01_OneHidden.cntk
@@ -1,3 +1,7 @@
+# Synthetic test for non-spatial batch normalization, (almost) in isolation of
+# other nodes (in particular Convolution). This is not a general example and
+# does not document best practices. The network was based on MNIST/01_OneHidden.
+
 rootDir = ".."
 
 configDir = "$rootDir$/Config"
@@ -17,10 +21,6 @@
 traceLevel = 1
 batchNormalizationEngine = "testMustOverrideBatchNormalizationEngine"
 
-#######################################
-#           TRAINING CONFIG           #
-#######################################
-
 train = [
     action = "train"
@@ -54,10 +54,6 @@ train = [
     ]
 ]
 
-#######################################
-#             TEST CONFIG             #
-#######################################
-
 test = [
     action = "test"
     minibatchSize = 1024    # reduce this if you run out of memory
diff --git a/Tests/EndToEndTests/BatchNormalization/NonSpatial/01_OneHidden.ndl b/Tests/EndToEndTests/BatchNormalization/NonSpatial/01_OneHidden.ndl
index 6a51e0aca..aa717aec1 100644
--- a/Tests/EndToEndTests/BatchNormalization/NonSpatial/01_OneHidden.ndl
+++ b/Tests/EndToEndTests/BatchNormalization/NonSpatial/01_OneHidden.ndl
@@ -22,8 +22,8 @@ DNN = [
     b = LearnableParameter(hiddenDim, 1, init = fixedValue, value = 0)
     sc = LearnableParameter(hiddenDim, 1, init = fixedValue, value = 1)
     m = LearnableParameter(hiddenDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
-    var = LearnableParameter(hiddenDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
-    y = BatchNormalization(h1, sc, b, m, var, eval=false, spatial=false, normalizationTimeConstant=64, imageLayout=cudnn, engine=$batchNormalizationEngine$)
+    v = LearnableParameter(hiddenDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    y = BatchNormalization(h1, sc, b, m, v, eval=false, spatial=false, normalizationTimeConstant=64, imageLayout=cudnn, engine=$batchNormalizationEngine$)
 
     ol = DNNLayer(hiddenDim, labelDim, y, 1)
diff --git a/Tests/EndToEndTests/BatchNormalization/Spatial/02_BatchNormConv.cntk b/Tests/EndToEndTests/BatchNormalization/Spatial/02_BatchNormConv.cntk
index baab959f7..1ddd4590b 100644
--- a/Tests/EndToEndTests/BatchNormalization/Spatial/02_BatchNormConv.cntk
+++ b/Tests/EndToEndTests/BatchNormalization/Spatial/02_BatchNormConv.cntk
@@ -1,3 +1,7 @@
+# Synthetic test for spatial batch normalization, (almost) in isolation of
+# other nodes (in particular Convolution). This is not a general example and
+# does not document best practices. The network was based on CIFAR-10/02_BatchNormConv.
+
 RootDir = "."
 
 ConfigDir = "$RootDir$"
diff --git a/Tests/EndToEndTests/BatchNormalization/Spatial/02_BatchNormConv.ndl b/Tests/EndToEndTests/BatchNormalization/Spatial/02_BatchNormConv.ndl
index 38e3a4e79..eb04ba04f 100644
--- a/Tests/EndToEndTests/BatchNormalization/Spatial/02_BatchNormConv.ndl
+++ b/Tests/EndToEndTests/BatchNormalization/Spatial/02_BatchNormConv.ndl
@@ -22,8 +22,7 @@ ndlMnistMacros = [
     scValue = 1
 
     # Batch normalization time constant.
-    #bnTimeConst = 4096
-    bnTimeConst = 2048
+    bnTimeConst = 0
 
     fc1WScale = 12
     fc1BValue = 0
@@ -42,9 +41,9 @@ DNN=[
     b = LearnableParameter(cMap1, 1, init = fixedValue, value = conv1BValue)
     sc = LearnableParameter(cMap1, 1, init = fixedValue, value = scValue)
     m = LearnableParameter(cMap1, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
-    var = LearnableParameter(cMap1, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    v = LearnableParameter(cMap1, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
 
-    y = BatchNormalization(featScaled, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$, engine=$batchNormalizationEngine$)
+    y = BatchNormalization(featScaled, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$, engine=$batchNormalizationEngine$)
     conv1 = RectifiedLinear(y)
 
     # pool1
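Aside on the bnTimeConst = 0 change above (illustration only, not code from this patch): as I understand the BatchNormalization node, normalizationTimeConstant is the time constant, in samples, of a first-order low-pass filter over the minibatch statistics, and it is translated into the expAvgFactor used by the forward pass; a value of 0 makes the running statistics track the most recent minibatch exactly, which keeps this synthetic test deterministic. A sketch of that mapping, stated as an assumption rather than as the node's exact code:

#include <cmath>
#include <cstddef>
#include <limits>

// Hypothetical illustration of mapping a normalization time constant (in samples) to a
// per-minibatch expAvgFactor; the authoritative mapping lives in the BatchNormalization
// node (TrainingNodes.h), not here.
double ExpAvgFactorFromTimeConstant(double timeConstant, std::size_t minibatchSize)
{
    if (timeConstant == 0)                                       // 0: use only the current minibatch
        return 1.0;
    if (timeConstant == std::numeric_limits<double>::infinity()) // infinity: never update running stats
        return 0.0;
    return 1.0 - std::exp(-static_cast<double>(minibatchSize) / timeConstant); // low-pass filter
}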
diff --git a/Tests/EndToEndTests/BatchNormalization/Spatial/CNTK/testcases.yml b/Tests/EndToEndTests/BatchNormalization/Spatial/CNTK/testcases.yml
index 459fd20b5..bacd1cc83 100644
--- a/Tests/EndToEndTests/BatchNormalization/Spatial/CNTK/testcases.yml
+++ b/Tests/EndToEndTests/BatchNormalization/Spatial/CNTK/testcases.yml
@@ -19,13 +19,13 @@ testCases:
   Epochs must be finished with expected results:
     patterns:
       - Finished Epoch
-      - CE = {{float,tolerance=2.0%}} * {{integer}}
-      - Err = {{float,tolerance=2.0%}} * {{integer}}
+      - CE = {{float,tolerance=0.1%}} * {{integer}}
+      - Err = {{float,tolerance=0.1%}} * {{integer}}
       - totalSamplesSeen = {{integer}}
       - learningRatePerSample = {{float,tolerance=0.001%}}
 
   Final test results must match:
     patterns:
       - "Final Results: Minibatch[{{integer}}-{{integer}}]"
-      - Err = {{float,tolerance=2.0%}} * {{integer}}
-      - CE = {{float,tolerance=2.0%}} * {{integer}}
+      - Err = {{float,tolerance=0.1%}} * {{integer}}
+      - CE = {{float,tolerance=0.1%}} * {{integer}}
diff --git a/Tests/EndToEndTests/BatchNormalization/Spatial/CuDNN/testcases.yml b/Tests/EndToEndTests/BatchNormalization/Spatial/CuDNN/testcases.yml
index 459fd20b5..bacd1cc83 100644
--- a/Tests/EndToEndTests/BatchNormalization/Spatial/CuDNN/testcases.yml
+++ b/Tests/EndToEndTests/BatchNormalization/Spatial/CuDNN/testcases.yml
@@ -19,13 +19,13 @@ testCases:
   Epochs must be finished with expected results:
     patterns:
       - Finished Epoch
-      - CE = {{float,tolerance=2.0%}} * {{integer}}
-      - Err = {{float,tolerance=2.0%}} * {{integer}}
+      - CE = {{float,tolerance=0.1%}} * {{integer}}
+      - Err = {{float,tolerance=0.1%}} * {{integer}}
       - totalSamplesSeen = {{integer}}
       - learningRatePerSample = {{float,tolerance=0.001%}}
 
   Final test results must match:
     patterns:
       - "Final Results: Minibatch[{{integer}}-{{integer}}]"
-      - Err = {{float,tolerance=2.0%}} * {{integer}}
-      - CE = {{float,tolerance=2.0%}} * {{integer}}
+      - Err = {{float,tolerance=0.1%}} * {{integer}}
+      - CE = {{float,tolerance=0.1%}} * {{integer}}
diff --git a/Tests/EndToEndTests/BatchNormalization/Spatial/Macros.ndl b/Tests/EndToEndTests/BatchNormalization/Spatial/Macros.ndl
index 6c783b860..731432b1a 100644
--- a/Tests/EndToEndTests/BatchNormalization/Spatial/Macros.ndl
+++ b/Tests/EndToEndTests/BatchNormalization/Spatial/Macros.ndl
@@ -21,10 +21,10 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeCo
     b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue)
     sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue)
     m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
-    var = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    v = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
 
     c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = $imageLayout$)
-    y = BatchNormalization(c, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$, engine=cudnn)
+    y = BatchNormalization(c, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$, engine=cudnn)
 ]
 
 ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
@@ -44,10 +44,10 @@ ProjLayer(W, inp, outMap, hStride, vStride, bValue, scValue, bnTimeConst)
     b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue)
     sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue)
     m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
-    var = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    v = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
 
     c = Convolution(W, inp, 1, 1, outMap, hStride, vStride, zeroPadding = false, imageLayout = $imageLayout$)
-    y = BatchNormalization(c, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$, engine=cudnn)
+    y = BatchNormalization(c, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$, engine=cudnn)
 ]
 
 ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue, bnTimeConst)
@@ -113,9 +113,9 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst)
     b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
     sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
     m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
-    var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    v = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
     t = Times(W, x)
-    bn = BatchNormalization(t, sc, b, m, var, spatial = false, normalizationTimeConstant = bnTimeConst, engine=cudnn)
+    bn = BatchNormalization(t, sc, b, m, v, spatial = false, normalizationTimeConstant = bnTimeConst, engine=cudnn)
     y = RectifiedLinear(bn)
 ]
 
@@ -125,9 +125,9 @@ DnnImageBNReLULayer(inW, inH, inC, outDim, x, wScale, bValue, scValue, bnTimeCon
     b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
     sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
     m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
-    var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    v = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
     t = Times(W, x)
-    bn = BatchNormalization(t, sc, b, m, var, spatial = false, normalizationTimeConstant = bnTimeConst, engine=cudnn)
+    bn = BatchNormalization(t, sc, b, m, v, spatial = false, normalizationTimeConstant = bnTimeConst, engine=cudnn)
     y = RectifiedLinear(bn)
 ]
diff --git a/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv/testcases.yml b/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv/testcases.yml
index 292267353..d389f5af2 100644
--- a/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv/testcases.yml
+++ b/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv/testcases.yml
@@ -32,4 +32,4 @@ testCases:
       - "Final Results: Minibatch[{{integer}}-{{integer}}]"
       # this part is flaky, needs to be investigated (VSO item #414)
       #- Err = {{float,tolerance=2.0%}} * {{integer}}
-      - CE = {{float,tolerance=2.0%}} * {{integer}}
+      - CE = {{float,tolerance=4.0%}} * {{integer}}
diff --git a/Tests/UnitTests/MathTests/BatchNormalizationEngineTests.cpp b/Tests/UnitTests/MathTests/BatchNormalizationEngineTests.cpp
index ae5d2b992..9fe87cf7a 100644
--- a/Tests/UnitTests/MathTests/BatchNormalizationEngineTests.cpp
+++ b/Tests/UnitTests/MathTests/BatchNormalizationEngineTests.cpp
@@ -100,6 +100,7 @@ BOOST_AUTO_TEST_CASE(BatchNormalizationForward)
     double expAvg = std::get<3>(cfg);
     double blendFactor = 0; // cuDNN supports blendFactor == 0 (train) or 1 (eval) only.
     double eps = 1e-5; // CUDNN_BN_MIN_EPSILON
+    bool inferenceOnly = false;
 
     auto engCudnn = BNEng::Create(baseDeviceId, inOutT, spatial, ImageLayoutKind::CHW, BatchNormEngineKind::CuDnn);
     auto engCntk = BNEng::Create(deviceId, inOutT, spatial, ImageLayoutKind::CHW, BatchNormEngineKind::Cntk);
@@ -142,12 +143,12 @@ BOOST_AUTO_TEST_CASE(BatchNormalizationForward)
 
     CudaTimer time1;
     time1.Start();
-    engCntk->Forward(in, scale, bias, expAvg, blendFactor, runMean, runInvStdDev, out, eps, saveMean, saveInvStdDev);
+    engCntk->Forward(in, scale, bias, inferenceOnly, expAvg, blendFactor, runMean, runInvStdDev, out, eps, saveMean, saveInvStdDev);
     time1.Stop();
 
     CudaTimer time2;
     time2.Start();
-    engCudnn->Forward(inB, scaleB, biasB, expAvg, blendFactor, runMeanB, runInvStdDevB, outB, eps, saveMeanB, saveInvStdDevB);
+    engCudnn->Forward(inB, scaleB, biasB, inferenceOnly, expAvg, blendFactor, runMeanB, runInvStdDevB, outB, eps, saveMeanB, saveInvStdDevB);
     time2.Stop();
 
     std::stringstream tmsg;
diff --git a/Tests/UnitTests/V2LibraryTests/Image.h b/Tests/UnitTests/V2LibraryTests/Image.h
index a2c164e1c..7b0efdd49 100644
--- a/Tests/UnitTests/V2LibraryTests/Image.h
+++ b/Tests/UnitTests/V2LibraryTests/Image.h
@@ -28,12 +28,12 @@ inline FunctionPtr ProjLayer(Variable wProj, Variable input, size_t hStride, siz
     auto b = Parameter({ outFeatureMapCount }, (float)bValue, device);
     auto sc = Parameter({ outFeatureMapCount }, (float)scValue, device);
     auto m = Constant({ outFeatureMapCount }, 0.0f, device);
-    auto var = Constant({ outFeatureMapCount }, 0.0f, device);
+    auto v = Constant({ outFeatureMapCount }, 0.0f, device);
 
     size_t numInputChannels = input.Shape()[input.Shape().NumAxes() - 1];
 
     auto c = Convolution(wProj, input, { hStride, vStride, numInputChannels }, { true }, { false });
-    return BatchNormalization(c, sc, b, m, var, true /*spatial*/, (double)bnTimeConst);
+    return BatchNormalization(c, sc, b, m, v, true /*spatial*/, (double)bnTimeConst);
 }
 
 inline FunctionPtr ResNetNode2(Variable input, size_t outFeatureMapCount, size_t kernelWidth, size_t kernelHeight, double wScale, double bValue, double scValue, size_t bnTimeConst, const DeviceDescriptor& device)