Address CR comments
Parent: 6aea378f90
Commit: e1a9cabbde
@@ -26,9 +26,9 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst) = [
     b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
     sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
     m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
-    var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    v = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
     t = Times(W, x)
-    bn = BatchNormalization(t, sc, b, m, var, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst)
+    bn = BatchNormalization(t, sc, b, m, v, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst)
     y = RectifiedLinear(bn)
 ]
@@ -72,10 +72,10 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeCo
     b = LearnableParameter(outMap, 1, init=fixedValue, value=bValue)
     sc = LearnableParameter(outMap, 1, init=fixedValue, value=scValue)
     m = LearnableParameter(outMap, 1, init=fixedValue, value=0, learningRateMultiplier=0)
-    var = LearnableParameter(outMap, 1, init=fixedValue, value=0, learningRateMultiplier=0)
+    v = LearnableParameter(outMap, 1, init=fixedValue, value=0, learningRateMultiplier=0)

     c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding=true, imageLayout=$imageLayout$)
-    y = BatchNormalization(c, sc, b, m, var, eval=false, spatial=true, normalizationTimeConstant=bnTimeConst, imageLayout=$imageLayout$)
+    y = BatchNormalization(c, sc, b, m, v, eval=false, spatial=true, normalizationTimeConstant=bnTimeConst, imageLayout=$imageLayout$)
 ]

 ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst) = [

@@ -26,9 +26,9 @@ DnnBNReLULayer (inDim, outDim, x, wScale, bValue, scValue, bnTimeConst) = [
     b = Parameter (outDim, 1, init = "fixedValue", value = bValue)
     sc = Parameter (outDim, 1, init = "fixedValue", value = scValue)
     m = Parameter (outDim, 1, init = "fixedValue", value = 0, learningRateMultiplier = 0)
-    var = Parameter (outDim, 1, init = "fixedValue", value = 0, learningRateMultiplier = 0)
+    v = Parameter (outDim, 1, init = "fixedValue", value = 0, learningRateMultiplier = 0)
     t = Times(W, x) # TODO: W * x
-    bn = BatchNormalization(t, sc, b, m, var, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst)
+    bn = BatchNormalization(t, sc, b, m, v, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst)
     y = RectifiedLinear(bn)
 ].y
@@ -61,10 +61,10 @@ ConvBNLayerW (W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeC
     b = Parameter(outMap, 1, init="fixedValue", value=bValue)
     sc = Parameter(outMap, 1, init="fixedValue", value=scValue)
     m = Parameter(outMap, 1, init="fixedValue", value=0, learningRateMultiplier=0)
-    var = Parameter(outMap, 1, init="fixedValue", value=0, learningRateMultiplier=0)
+    v = Parameter(outMap, 1, init="fixedValue", value=0, learningRateMultiplier=0)

     c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding=true /* , imageLayout=$imageLayout$*/)
-    y = BatchNormalization(c, sc, b, m, var, eval=false, spatial=true, normalizationTimeConstant=bnTimeConst /* , imageLayout=$imageLayout$*/)
+    y = BatchNormalization(c, sc, b, m, v, eval=false, spatial=true, normalizationTimeConstant=bnTimeConst /* , imageLayout=$imageLayout$*/)
 ].y

 ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst) = [

@@ -21,10 +21,10 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeCo
     b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue)
     sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue)
     m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
-    var = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    v = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

     c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = $imageLayout$)
-    y = BatchNormalization(c, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
+    y = BatchNormalization(c, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
 ]

 ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
@@ -44,10 +44,10 @@ ProjLayer(W, inp, outMap, hStride, vStride, bValue, scValue, bnTimeConst)
     b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue)
     sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue)
     m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
-    var = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    v = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

     c = Convolution(W, inp, 1, 1, outMap, hStride, vStride, zeroPadding = false, imageLayout = $imageLayout$)
-    y = BatchNormalization(c, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
+    y = BatchNormalization(c, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
 ]

 ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue, bnTimeConst)
@@ -113,9 +113,9 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst)
     b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
     sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
     m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
-    var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    v = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
     t = Times(W, x)
-    bn = BatchNormalization(t, sc, b, m, var, spatial = false, normalizationTimeConstant = bnTimeConst)
+    bn = BatchNormalization(t, sc, b, m, v, spatial = false, normalizationTimeConstant = bnTimeConst)
     y = RectifiedLinear(bn)
 ]
@@ -125,9 +125,9 @@ DnnImageBNReLULayer(inW, inH, inC, outDim, x, wScale, bValue, scValue, bnTimeCon
     b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
     sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
     m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
-    var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    v = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
     t = Times(W, x)
-    bn = BatchNormalization(t, sc, b, m, var, spatial = false, normalizationTimeConstant = bnTimeConst)
+    bn = BatchNormalization(t, sc, b, m, v, spatial = false, normalizationTimeConstant = bnTimeConst)
     y = RectifiedLinear(bn)
 ]

@@ -8,9 +8,9 @@ BN(inp, mapCount, bValue, scValue, bnTimeConst)
     b = Parameter(mapCount, 1, init = fixedValue, value = bValue)
     sc = Parameter(mapCount, 1, init = fixedValue, value = scValue)
     m = Parameter(mapCount, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
-    var = Parameter(mapCount, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    v = Parameter(mapCount, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

-    y = BatchNormalization(inp, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, epsilon = 0.000000001, imageLayout = "cudnn")
+    y = BatchNormalization(inp, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, epsilon = 0.000000001, imageLayout = "cudnn")
 ]

 ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst)

@@ -15,9 +15,9 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue)
     b = Parameter(outDim, 1, init = fixedValue, value = bValue)
     sc = Parameter(outDim, 1, init = Gaussian, initValueScale = 0.01)
     m = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
-    var = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    v = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
     t = Times(W, x)
-    bn = BatchNormalization(t, sc, b, m, var, spatial = false)
+    bn = BatchNormalization(t, sc, b, m, v, spatial = false)
     y = RectifiedLinear(bn)
 ]
@@ -47,9 +47,9 @@ ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue,
     b = Parameter(outMap, 1, init = fixedValue, value = bValue)
     sc = Parameter(outMap, 1, init = Gaussian, initValueScale = scValue)
     m = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
-    var = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    v = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

     c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
-    bn = BatchNormalization(c, sc, b, m, var, spatial = true, imageLayout = "cudnn")
+    bn = BatchNormalization(c, sc, b, m, v, spatial = true, imageLayout = "cudnn")
     y = RectifiedLinear(bn);
 ]

@@ -1547,7 +1547,7 @@ template class DropoutNode<double>;
 //
 // m = mean(input)
 // var = variance(input)
-// input_norm = (input - mean) / sqrt(var)
+// input_norm = (input - mean) / sqrt(epsilon + var)
 // output = gamma * input_norm + beta
 //
 // where gamma and beta are trainable parameters(represented as LearnableParameter).
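A minimal editorial sketch (not part of the commit) of the per-element computation the comment block above describes, written for one feature with a scalar mean/variance; the function and parameter names here are ad hoc and only illustrate the formula.

#include <cmath>
#include <cstddef>
#include <vector>

// output[i] = gamma * (input[i] - mean) / sqrt(epsilon + variance) + beta
std::vector<double> BatchNormForwardOneFeature(const std::vector<double>& input,
                                               double mean, double variance, double epsilon,
                                               double gamma, double beta)
{
    std::vector<double> output(input.size());
    for (std::size_t i = 0; i < input.size(); ++i)
    {
        double inputNorm = (input[i] - mean) / std::sqrt(epsilon + variance); // normalized input
        output[i] = gamma * inputNorm + beta;                                 // scale and shift
    }
    return output;
}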
@@ -1570,7 +1570,7 @@ template class DropoutNode<double>;
 // * blendTimeConstant is the time constant which allows to specify how much of running mean / var should be "blended" into mean / var of the current minibatch.
 // Value 0 (default) means no blending will happen and only the current minibatch statistics will be used.
 // Value 1#INF (infinity) means only running mean / var will be used(this is used, for example, in evaluation phase).
-// * epsilon is a conditioner constant used in computing inverted standard deviation
+// * epsilon is a conditioner constant used in computing inverse standard deviation
 // * useCntkEngine is a Boolean flag that specifies which batch normalization implementation to use: CNTK or cuDNN-based.
 // * imageLayout is the image layout. Only cudnn is supported at present.
 // -----------------------------------------------------------------------
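For reference, an editorial illustration (not part of the commit) of how a blend factor derived from blendTimeConstant mixes the running statistic into the current-minibatch statistic; it mirrors the interpolation the CUDA kernels further down in this diff apply to the inverse standard deviation.

// blendFactor == 0: use only the current minibatch statistic (the default).
// blendFactor == 1: use only the long-term running statistic (evaluation).
double BlendStatistic(double runningStat, double minibatchStat, double blendFactor)
{
    return blendFactor * runningStat + (1.0 - blendFactor) * minibatchStat;
}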
@@ -1584,14 +1584,14 @@ public:
     BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name) :
         Base(deviceId, name), m_spatial(false), m_normTimeConst(0), m_blendTimeConst(0), m_epsilon(0), m_useCntkEngine(true),
         m_samplesSeen(0), m_imageLayoutKind(ImageLayoutKind::CHW),
-        m_convertRunningVariance(false)
+        m_convertRunningVariancePending(false)
     {
     }
     BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name, bool spatial, double normalizationTimeConstant, double blendTimeConstant,
                           double epsilon, bool useCntkEngine, ImageLayoutKind imageLayoutKind) :
         Base(deviceId, name), m_spatial(spatial), m_normTimeConst(normalizationTimeConstant), m_blendTimeConst(blendTimeConstant),
         m_epsilon(epsilon), m_useCntkEngine(useCntkEngine), m_imageLayoutKind(imageLayoutKind), m_samplesSeen(0),
-        m_convertRunningVariance(false)
+        m_convertRunningVariancePending(false)
     {
     }
     BatchNormalizationNode(const ScriptableObjects::IConfigRecordPtr configp) :
@@ -1627,7 +1627,9 @@ public:
         fstream >> m_normTimeConst;
         fstream >> m_blendTimeConst;
         fstream >> m_imageLayoutKind;
+#ifdef _DEBUG
         fprintf(stderr, "INFO: %ls: initialized samplesSeen from mbCount when loading pre-CuDNNv5 model\n", NodeName().c_str());
+#endif
         fstream >> mbCount;
         m_samplesSeen = mbCount;
         fstream >> m_epsilon;
@@ -1666,7 +1668,9 @@ public:
         {
             size_t mbCount;
             fstream >> m_imageLayoutKind;
+#ifdef _DEBUG
             fprintf(stderr, "INFO: %ls: initialized samplesSeen from mbCount when loading pre-CuDNNv5 model\n", NodeName().c_str());
+#endif
             fstream >> mbCount;
             m_samplesSeen = mbCount;
         }
@@ -1681,8 +1685,9 @@ public:
         {
             // Prior to CNTK_MODEL_VERSION_12, running inverse standard
             // deviation was stored in Input 4. Now variance is used.
-            // We (approximately) convert it during validation later.
-            m_convertRunningVariance = true;
+            // We (approximately) convert it during validation later
+            // (and then clear the flag).
+            m_convertRunningVariancePending = true;
         }
     }

@@ -1713,15 +1718,13 @@ private: // time-constant conversions
         // in inference mode, only use long-term mean and do not update running estimates
         if (!Environment().IsTraining())
         {
-            assert(0 < m_samplesSeen); // something must have been trained
+            if (m_samplesSeen == 0)
+                RuntimeError("%ls: inference mode is used, but nothing has been trained.", NodeName().c_str());
             return 0; // (m_normTimeConst == infinity) no new contribution from current minibatch
         }

         // Initialization case: only use current minibatch.
-        if (m_samplesSeen == 0)
-        {
-            return 1.0;
-        }
+        if (m_samplesSeen == 0) return 1.0;

         double numSamples = (double)GetMBLayout()->GetActualNumSamples();
@@ -1747,15 +1750,13 @@ private: // time-constant conversions
         // in inference mode, only use long-term mean and do not update running estimates
         if (!Environment().IsTraining())
         {
-            assert(0 < m_samplesSeen); // something must have been trained
+            if (m_samplesSeen == 0)
+                RuntimeError("%ls: inference mode is used, but nothing has been trained.", NodeName().c_str());
             return 1.0; // (m_blendTimeConst == infinity) estimate is taken 100% from the long-term running estimate
         }

         // Initialization case: only use current minibatch.
-        if (m_samplesSeen == 0)
-        {
-            return 0;
-        }
+        if (m_samplesSeen == 0) return 0;

         // convert to blend factor (= weight for running stats)
         // The code below special-cases two boundary cases, but those are just the limit cases of the main formula.
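As an editorial aside (not part of the commit), the two helpers patched above reduce to the following boundary behavior. This sketch only restates the special cases visible in the hunks and takes the general-case value as a parameter, because the full time-constant formulas live in unchanged code outside this diff.

#include <cstddef>

// Weight of the current minibatch statistics in the running estimate.
double ExpAvgFactorBoundaryCases(bool isTraining, std::size_t samplesSeen, double generalCaseValue)
{
    if (!isTraining)
        return 0.0;          // inference: running estimates stay frozen (the node errors out if samplesSeen == 0)
    if (samplesSeen == 0)
        return 1.0;          // first minibatch: the running estimate becomes exactly the minibatch statistics
    return generalCaseValue; // otherwise: derived from normalizationTimeConstant (not shown in the hunk)
}

// Weight of the running statistics when normalizing the current minibatch.
double BlendFactorBoundaryCases(bool isTraining, std::size_t samplesSeen, double generalCaseValue)
{
    if (!isTraining)
        return 1.0;          // inference: normalize purely with the long-term running estimates
    if (samplesSeen == 0)
        return 0.0;          // first minibatch: normalize purely with the minibatch statistics
    return generalCaseValue; // otherwise: derived from blendTimeConstant (not shown in the hunk)
}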
@@ -1771,7 +1772,8 @@ public:

     virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override
     {
-        assert(!m_convertRunningVariance);
+        if (m_convertRunningVariancePending)
+            LogicError("%ls: Failed to convert running variance until forward prop", NodeName().c_str());
         FrameRange fr(Input(0)->GetMBLayout());

         Matrix<ElemType> sliceInputValue = Input(0)->ValueFor(fr);
@@ -1792,12 +1794,16 @@ public:
         double expAvgFactor = ComputeExpAvgFactor(); // weight for the new MB statistics in the running estimate. The previous value of the running statistics is kept with weight (1-this)
         double blendFactor = ComputeBlendFactor(); // interpolation weight for the running statistics (the current MB statistics are weighted with 1-this)

+        // In inference-only mode, m_savedMean and m_savedInvStdDev will not be
+        // produced and BackpropToNonLooping() may not be called. In
+        // non-inference (training) mode, saved statistics must be produced.
+        bool inferenceOnly = !Environment().IsTraining();
         m_bnEng->Forward(/*in=*/ sliceInputValue, scale, bias, // (in)
-                         expAvgFactor, blendFactor,
+                         inferenceOnly, expAvgFactor, blendFactor,
                          runMean, runVariance, // (in/out) running estimates, updated from the current MB mean/variance
                          /*out=*/ sliceOutputValue, // (out) batch-normalized output value
                          m_epsilon,
-                         *m_saveMean, *m_saveInvStdDev); // (out) actual interpolated mean/stddev values. Note: unused/empty for blendFactor==1 for CNTK engine
+                         *m_savedMean, *m_savedInvStdDev); // (out) actual interpolated mean/stddev values. Note: unused/empty for blendFactor==1 for CNTK engine
     }

     // Note: This function assumes that inputIndex=0 is called before the others.
@@ -1805,6 +1811,16 @@ public:
     // BUGBUG: If the input has no learnables (e.g. using BN instead of corpus mean/var norm), this will not be called for inputIndex=0 at all.
     virtual void BackpropToNonLooping(size_t inputIndex) override
     {
+        // Must be in training mode.
+        if (!Environment().IsTraining())
+            LogicError("%ls: BackpropToNonLooping() cannot be called in inference mode", NodeName().c_str());
+        // In non-inference mode, the batch normalization engine must provide
+        // saved statistics, m_savedMean and m_savedInvStdDev
+        if (m_savedMean->IsEmpty())
+            LogicError("%ls: m_savedMean cannot be empty", NodeName().c_str());
+        if (m_savedInvStdDev->IsEmpty())
+            LogicError("%ls: m_savedInvStdDev cannot be empty", NodeName().c_str());
+
         FrameRange fr(Input(0)->GetMBLayout());

         if (inputIndex == 0) // derivative with respect to the input.
@@ -1813,16 +1829,8 @@ public:
         auto sliceInputValue = Input(0)->ValueFor(fr);
         const Matrix<ElemType>& scale = Input(1)->Value();
         const Matrix<ElemType>& bias = Input(2)->Value();
-        const Matrix<ElemType>& runMean = Input(3)->Value();
-        const Matrix<ElemType>& runVariance = Input(4)->Value();

         auto sliceInputGrad = Input(0)->GradientFor(fr);
-        // The mean used in Forward() are either saveMean or runMean.
-        // This is decided by the engine, which communicates back the decision by returning
-        // an empty saveMean in case runMean should be used. Likewise for variance / inverted standard deviation.
-        let& actualMean = !m_saveMean->IsEmpty() ? *m_saveMean : runMean; // empty if only the running mean is used
-        if (m_saveInvStdDev->IsEmpty()) RuntimeError("TODO m_saveInvStdDev <-> runVariance not the same:");
-        let& actualInvStdDev = !m_saveInvStdDev->IsEmpty() ? *m_saveInvStdDev : runVariance;
         m_dScale->Resize(scale); // gradients for scale and bias get stored here
         m_dBias->Resize(bias);

@@ -1833,7 +1841,7 @@ public:
                           sliceInputGrad, // (out) gradient for data input goes here
                           scale, // (in) out of scale and bias, only scale is needed in gradient propagation
                           blendFactor, // (in) smoothing weight for running stats (1=use only running stats)
-                          actualMean, actualInvStdDev, // (in) actual mean/stddev values used in ForwardProp()
+                          *m_savedMean, *m_savedInvStdDev, // (in) saved mean/invstddev values used in ForwardProp()
                           *m_dScale, *m_dBias); // (out) gradients for scale and bias
         }
         else if (inputIndex == 1) // derivative with respect to the scale
@@ -1855,6 +1863,7 @@ public:

     virtual void EndBackprop() override
     {
+        // TODO don't update in locked mode
         auto numSamples = GetMBLayout()->GetActualNumSamples();
         m_samplesSeen += numSamples;
         Base::EndBackprop();
@@ -1892,19 +1901,22 @@ public:

         if (isFinalValidationPass)
         {
-            if (m_convertRunningVariance)
+            if (m_convertRunningVariancePending)
             {
-                // Input 4 is still inverse standard deviation. We convert it to variance, approximately,
-                // and output a warning.
-                fprintf(stderr, "WARNING: %ls: approximately converting inverse standard deviation into variance when loading pre-CuDNNv5 model\n",
+                // Prior to CNTK CuDNN v5 support (and the CNTK engine of the same time), mean and inverse standard deviation
+                // statistics were computed and stored. With CuDNN v5 (and the corresponding CNTK engine update), this was changed
+                // to mean and variance.
+                // To load an old model for further training or inference, Input(4) (which is inverse standard deviation) needs to
+                // be converted to variance, via v = 1/(isd^2) + epsilon, where 'v' is variance and 'isd' is inverse standard deviation.
+                // Since this is an approximation, we output a warning.
+                fprintf(stderr, "WARNING: %ls: loading pre-CuDNNv5 model and approximately converting variance statistics format\n",
                         NodeName().c_str());
                 Matrix<ElemType>& runInvStdDev = Input(4)->Value();
                 runInvStdDev.AssignElementPowerOf(runInvStdDev, 2);
                 runInvStdDev.ElementInverse();
                 runInvStdDev += (float) m_epsilon;
                 fprintf(stderr, "--- %ls converted runVariance after loading\n", NodeName().c_str());
                 runInvStdDev.Print();
-                m_convertRunningVariance = false;
+                m_convertRunningVariancePending = false;
             }

             // check inputs
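An editorial illustration (not part of the commit) of the same element-wise conversion on a plain vector: the AssignElementPowerOf / ElementInverse / += epsilon sequence above turns each stored inverse standard deviation into an approximate variance.

#include <vector>

void ConvertRunningInvStdDevToVariance(std::vector<double>& runningInvStdDev, double epsilon)
{
    for (double& isd : runningInvStdDev)
        isd = 1.0 / (isd * isd) + epsilon; // v = 1/(isd^2) + epsilon, approximately recovering var from 1/sqrt(var + epsilon)
}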
@@ -1949,8 +1961,8 @@ public:
     void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override
     {
         Base::RequestMatricesBeforeForwardProp(matrixPool);
-        RequestMatrixFromPool(m_saveMean, matrixPool);
-        RequestMatrixFromPool(m_saveInvStdDev, matrixPool);
+        RequestMatrixFromPool(m_savedMean, matrixPool);
+        RequestMatrixFromPool(m_savedInvStdDev, matrixPool);
     }

     void RequestMatricesBeforeBackprop(MatrixPool& matrixPool) override
@@ -1963,8 +1975,8 @@ public:
     void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) override
     {
         Base::ReleaseMatricesAfterBackprop(matrixPool);
-        ReleaseMatrixToPool(m_saveMean, matrixPool);
-        ReleaseMatrixToPool(m_saveInvStdDev, matrixPool);
+        ReleaseMatrixToPool(m_savedMean, matrixPool);
+        ReleaseMatrixToPool(m_savedInvStdDev, matrixPool);
         ReleaseMatrixToPool(m_dScale, matrixPool);
         ReleaseMatrixToPool(m_dBias, matrixPool);
     }
@@ -2035,7 +2047,7 @@ private:
     // REVIEW alexeyk: if this works, document it properly in Wiki.
     double m_blendTimeConst;

-    // Epsilon used to compute inverse standard deviation (m_saveInvStdDev).
+    // Epsilon used to compute inverse standard deviation (m_savedInvStdDev).
     double m_epsilon;
     // Whether to use CNTK or cuDNN BN implementation.
     bool m_useCntkEngine;
@@ -2048,8 +2060,8 @@ private:
     size_t m_samplesSeen;

     // Interpolated actual mean/inverse stddev values. Pre-computed on forward pass, also used in gradient computation.
-    shared_ptr<Matrix<ElemType>> m_saveMean;
-    shared_ptr<Matrix<ElemType>> m_saveInvStdDev;
+    shared_ptr<Matrix<ElemType>> m_savedMean;
+    shared_ptr<Matrix<ElemType>> m_savedInvStdDev;
     // Temp buffer for scale and bias derivatives. Only used in BackpropTo(), carrying info from first call to subsequent calls.
     // Not used for blendFactor=1 in CNTK engine.
     shared_ptr<Matrix<ElemType>> m_dScale;
@@ -2057,7 +2069,7 @@ private:

     std::unique_ptr<BatchNormEngine<ElemType>> m_bnEng;

-    bool m_convertRunningVariance;
+    bool m_convertRunningVariancePending;
 };

 template class BatchNormalizationNode<float>;

@@ -10,14 +10,16 @@
 namespace Microsoft { namespace MSR { namespace CNTK {

 template <class ElemType>
-void BatchNormEngine<ElemType>::Forward(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
-                                        Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev)
+void BatchNormEngine<ElemType>::Forward(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
+                                        Mat& out, double epsilon, Mat& savedMean, Mat& savedInvStdDev)
 {
     assert(in.GetNumRows() == m_inOutT.GetNumElements());
     assert(out.GetNumRows() == m_inOutT.GetNumElements());
     assert(in.GetNumCols() == out.GetNumCols());
     assert(std::isfinite(expAvgFactor) && (0 <= expAvgFactor && expAvgFactor <= 1));
     assert(std::isfinite(blendFactor) && (0 <= blendFactor && blendFactor <= 1));
+    // In inference mode, must only use running statistics
+    assert(!inferenceOnly || ((expAvgFactor == 0.0) && (blendFactor == 1.0)));
     assert(std::isfinite(epsilon) && epsilon > 0);
     if (!m_spatial)
     {
@@ -39,28 +41,35 @@ void BatchNormEngine<ElemType>::Forward(const Mat& in, const Mat& scale, const M
     assert(runVariance.GetNumCols() == 1);

     EnsureCompatible();
-    ForwardCore(in, scale, bias, expAvgFactor, blendFactor, runMean, runVariance, out, epsilon, saveMean, saveInvStdDev);
+    ForwardCore(in, scale, bias, inferenceOnly, expAvgFactor, blendFactor, runMean, runVariance, out, epsilon, savedMean, savedInvStdDev);
+
+    if (!inferenceOnly)
+    {
+        assert(!savedMean.IsEmpty());
+        assert(!savedInvStdDev.IsEmpty());
         if (!m_spatial)
         {
-            assert(saveMean.GetNumElements() == 0 || m_inOutT.GetNumElements() == saveMean.GetNumRows());
-            assert(saveInvStdDev.GetNumElements() == 0 || m_inOutT.GetNumElements() == saveInvStdDev.GetNumRows());
+            assert(m_inOutT.GetNumElements() == savedMean.GetNumRows());
+            assert(m_inOutT.GetNumElements() == savedInvStdDev.GetNumRows());
         }
         else
         {
-            assert(saveMean.GetNumElements() == 0 || (m_inOutT.GetNumElements() % saveMean.GetNumRows()) == 0);
-            assert(saveInvStdDev.GetNumElements() == 0 || (m_inOutT.GetNumElements() % saveInvStdDev.GetNumRows()) == 0);
+            assert((m_inOutT.GetNumElements() % savedMean.GetNumRows()) == 0);
+            assert((m_inOutT.GetNumElements() % savedInvStdDev.GetNumRows()) == 0);
         }
+        assert(savedMean.GetNumCols() == 1);
+        assert(savedInvStdDev.GetNumCols() == 1);
+    }
-    assert(saveMean.GetNumElements() == 0 || saveMean.GetNumCols() == 1);
-    assert(saveInvStdDev.GetNumElements() == 0 || saveInvStdDev.GetNumCols() == 1);
 }

 template <class ElemType>
 void BatchNormEngine<ElemType>::Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor,
-                                         const Mat& saveMean, const Mat& saveInvStdDev, Mat& scaleGrad, Mat& biasGrad)
+                                         const Mat& savedMean, const Mat& savedInvStdDev, Mat& scaleGrad, Mat& biasGrad)
 {
+    assert(!savedMean.IsEmpty());
+    assert(!savedInvStdDev.IsEmpty());
     EnsureCompatible();
-    BackwardCore(in, srcGrad, grad, scale, blendFactor, saveMean, saveInvStdDev, scaleGrad, biasGrad);
+    BackwardCore(in, srcGrad, grad, scale, blendFactor, savedMean, savedInvStdDev, scaleGrad, biasGrad);
 }

 template <class ElemType>
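An editorial note (not part of the commit), restating the shape rule behind the asserts above: per-activation (non-spatial) batch normalization keeps one saved mean/inverse-stddev entry per input element, while spatial batch normalization keeps one per feature map, so the input element count must be an exact multiple of the statistics row count.

#include <cstddef>

// Expected number of rows in savedMean / savedInvStdDev (illustrative helper, ad hoc names).
std::size_t ExpectedStatisticsRows(bool spatial, std::size_t inOutElements, std::size_t mapCount)
{
    return spatial ? mapCount       // inOutElements % mapCount == 0 must hold
                   : inOutElements; // one entry per input element
}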
@@ -89,24 +98,23 @@ protected:
             InvalidArgument("CNTK batch normalization supports only cudnn(CHW) layout.");
     }

-    void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
-                     Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) override
+    void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
+                     Mat& out, double epsilon, Mat& savedMean, Mat& savedInvStdDev) override
     {
-        in.BatchNormalizationForward(scale, bias, expAvgFactor, blendFactor, runMean, runVariance, out, epsilon, saveMean, saveInvStdDev);
+        in.BatchNormalizationForward(scale, bias, inferenceOnly, expAvgFactor, blendFactor, runMean, runVariance, out, epsilon, savedMean, savedInvStdDev);
     }

-    void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev,
+    void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& savedMean, const Mat& savedInvStdDev,
                       Mat& scaleGrad, Mat& biasGrad) override
     {
-        srcGrad.BatchNormalizationBackward(in, grad, scale, blendFactor, saveMean, saveInvStdDev, scaleGrad, biasGrad);
+        srcGrad.BatchNormalizationBackward(in, grad, scale, blendFactor, savedMean, savedInvStdDev, scaleGrad, biasGrad);
     }
 };

 template class CntkBatchNormEngine<float>;
 template class CntkBatchNormEngine<double>;

-template <typename T>
-bool HasFlag(T src, T testFlag)
+template <typename T> bool HasFlag(T src, T testFlag)
 {
     return ((int)src & (int)testFlag) != 0;
 }

@@ -34,7 +34,7 @@ public:
 public:
     virtual ~BatchNormEngine() = default;

-    void Forward(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
+    void Forward(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
                  Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev);

     void Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev,
@@ -56,7 +56,7 @@ protected:
     virtual void EnsureCompatible() = 0;

     // saveMean/saveInvStdDev return the actual mean/stddev used for normalization, except for blendFactor=1, these are unused and untouched
-    virtual void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
+    virtual void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
                              Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) = 0;

     virtual void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev,

@@ -4372,20 +4372,19 @@ void CPUMatrix<ElemType>::AveragePoolingBackward(const CPUMatrix<int>& mpRowCol,
 }

 template <class ElemType>
-void CPUMatrix<ElemType>::BatchNormalizationForward(const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor,
+void CPUMatrix<ElemType>::BatchNormalizationForward(const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor,
                                                     CPUMatrix<ElemType>& runMean, CPUMatrix<ElemType>& runVariance, CPUMatrix<ElemType>& out, double epsilon,
                                                     CPUMatrix<ElemType>& saveMean, CPUMatrix<ElemType>& saveInvStdDev) const
 {
     assert((GetNumRows() % scale.GetNumRows()) == 0);

-    if (expAvgFactor != 0 || blendFactor != 1)
+    if (!inferenceOnly || expAvgFactor != 0 || blendFactor != 1)
         RuntimeError("Batch normalization training on CPU is not yet implemented.");

     saveMean.Resize(0, 0); // only doing inference: these two are not produced
     saveInvStdDev.Resize(0, 0);

     bool spatial = GetNumRows() != scale.GetNumRows();
     size_t batchSize = GetNumCols();
     if (spatial)
     {
         size_t spatialSize = GetNumRows() / scale.GetNumRows();
@@ -4395,7 +4394,7 @@ void CPUMatrix<ElemType>::BatchNormalizationForward(const CPUMatrix<ElemType>& s
             for (long irow = 0; irow < out.GetNumRows(); irow++)
             {
                 size_t imap = irow / spatialSize;
-                ElemType stdDev = sqrt(runVariance(imap, 0) * (batchSize - 1) / batchSize + epsilon);
+                ElemType stdDev = sqrt(runVariance(imap, 0) + epsilon);
                 out(irow, icol) = scale(imap, 0) * ((*this)(irow, icol) - runMean(imap, 0)) / stdDev + bias(imap, 0);
             }
         }
@@ -4407,7 +4406,7 @@ void CPUMatrix<ElemType>::BatchNormalizationForward(const CPUMatrix<ElemType>& s
         {
             for (long irow = 0; irow < out.GetNumRows(); irow++)
             {
-                ElemType stdDev = sqrt(runVariance(irow, 0) * (batchSize - 1) / batchSize + epsilon);
+                ElemType stdDev = sqrt(runVariance(irow, 0) + epsilon);
                 out(irow, icol) = scale(irow, 0) * ((*this)(irow, icol) - runMean(irow, 0)) / stdDev + bias(irow, 0);
             }
         }

@@ -27,8 +27,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {

 double logadd(double x, double y);

-//To comply with BLAS libraries matrices are stored in ColMajor. However, by default C/C++/C# use RowMajor
-//conversion is need when passing data between CPUMatrix and C++ matrices
+// To comply with BLAS libraries matrices are stored in ColMajor. However, by default C/C++/C# use RowMajor
+// conversion is need when passing data between CPUMatrix and C++ matrices
 template <class ElemType>
 class MATH_API CPUMatrix : public BaseMatrix<ElemType>
 {
@@ -375,7 +375,7 @@ public:
     void AveragePoolingBackward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices,
                                 CPUMatrix<ElemType>& grad) const;

-    void BatchNormalizationForward(const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor, CPUMatrix<ElemType>& runMean, CPUMatrix<ElemType>& runStdDev,
+    void BatchNormalizationForward(const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, CPUMatrix<ElemType>& runMean, CPUMatrix<ElemType>& runVariance,
                                    CPUMatrix<ElemType>& out, double epsilon, CPUMatrix<ElemType>& saveMean, CPUMatrix<ElemType>& saveInvStdDev) const;
     void BatchNormalizationBackward(const CPUMatrix<ElemType>& in, CPUMatrix<ElemType>& grad, const CPUMatrix<ElemType>& scale, double blendFactor, const CPUMatrix<ElemType>& saveMean, const CPUMatrix<ElemType>& saveInvStdDev,
                                     CPUMatrix<ElemType>& scaleGrad, CPUMatrix<ElemType>& biasGrad) const;

@@ -203,7 +203,7 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
     assert(::isfinite(epsilon) && epsilon > 0);
     assert(::isfinite(expAvgFactor) && 0 <= expAvgFactor && expAvgFactor <= 1);
     assert(::isfinite(blendFactor) && 0 <= blendFactor && blendFactor <= 1);
-    assert(expAvgFactor != 0 || blendFactor != 1); // otherwise no need call (no update)
+    // TODO optimize for (expAvgFactor == 0) && (blendFactor == 1)

     int irowSrcBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
     if (irowSrcBase >= vectorSize)
@@ -346,7 +346,7 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
         im2[k] = Operations::RSqrt(static_cast<ElemType>(m2[k] / batchSize + epsilon));
         if (blendFactor != 0)
         {
-            ElemType runInvStdDev = Operations::RSqrt(static_cast<ElemType>(run[k] * (batchSize - 1) / batchSize + epsilon));
+            ElemType runInvStdDev = Operations::RSqrt(static_cast<ElemType>(run[k] + epsilon));
             im2[k] = blendFactor * runInvStdDev + (1.0 - blendFactor) * im2[k];
         }
     }
@@ -376,7 +376,7 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial
     assert((vectorSize % spatialSize) == 0);
     assert(::isfinite(expAvgFactor) && 0 <= expAvgFactor && expAvgFactor <= 1);
     assert(::isfinite(blendFactor) && 0 <= blendFactor && blendFactor <= 1);
-    assert(expAvgFactor != 0 || blendFactor != 1); // otherwise no need call (no update)
+    // TODO optimize for (expAvgFactor == 0) && (blendFactor == 1)
     assert(::isfinite(epsilon) && epsilon > 0);

     int irowSrcBase = blockIdx.x * spatialSize + threadIdx.x * U;
@@ -506,7 +506,7 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial
         xInvStdDev[blockIdx.x] = Operations::RSqrt(static_cast<ElemType>(m2[0] / (batchSize * spatialSize) + epsilon));
         if (blendFactor != 0)
         {
-            ElemType runInvStdDev = Operations::RSqrt(static_cast<ElemType>(runVariance[blockIdx.x] * (batchSize - 1) / batchSize + epsilon));
+            ElemType runInvStdDev = Operations::RSqrt(static_cast<ElemType>(runVariance[blockIdx.x] + epsilon));
             xInvStdDev[blockIdx.x] = blendFactor * runInvStdDev + (1.0 - blendFactor) * xInvStdDev[blockIdx.x];
         }
     }
@@ -729,7 +729,7 @@ struct NormalizeBatchTraining

 template <int BlockDimX, int BlockDimY, int U, typename ElemType>
 __global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, const ElemType* x, const ElemType* dy, ElemType* dScale, ElemType* dBias,
-                                              const ElemType* saveMean, const ElemType* saveInvStdDev)
+                                              const ElemType* savedMean, const ElemType* savedInvStdDev)
 {
     static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
     static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
@@ -754,8 +754,8 @@ __global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, con
     // Read mean and inv std dev.
     if (threadIdx.y == 0)
     {
-        LoadValues<U>(saveMean + irowSrcBase, mean);
-        LoadValues<U>(saveInvStdDev + irowSrcBase, invStdDev);
+        LoadValues<U>(savedMean + irowSrcBase, mean);
+        LoadValues<U>(savedInvStdDev + irowSrcBase, invStdDev);
         StoreValues<U>(mean, &meanS[threadIdx.x * U]);
         StoreValues<U>(invStdDev, &invStdDevS[threadIdx.x * U]);
     }
@@ -833,7 +833,7 @@ __global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, con

 template <int BlockDimX, int BlockDimY, int U, typename ElemType>
 __global__ void kComputeSpatialScaleAndBiasGradients(int vectorSize, int spatialSize, int batchSize, const ElemType* x, const ElemType* dy,
-                                                     ElemType* dScale, ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev)
+                                                     ElemType* dScale, ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev)
 {
     static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
     static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
@@ -859,8 +859,8 @@ __global__ void kComputeSpatialScaleAndBiasGradients(int vectorSize, int spatial
     // Read mean and inv std dev.
     if (tid == 0)
     {
-        meanS = saveMean[blockIdx.x];
-        invStdDevS = saveInvStdDev[blockIdx.x];
+        meanS = savedMean[blockIdx.x];
+        invStdDevS = savedInvStdDev[blockIdx.x];
     }
     __syncthreads();
     if (tid != 0)
@@ -922,7 +922,7 @@ struct ComputeScaleAndBiasGradients
 {
     template <typename ElemType>
     static void Call(size_t vectorSize, size_t batchSize, const ElemType* x, const ElemType* dy,
-                     ElemType* dScale, ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream)
+                     ElemType* dScale, ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev, cudaStream_t stream)
     {
         assert((vectorSize % U) == 0);
         const int BlockDimX = 32 / U;
@@ -931,7 +931,7 @@ struct ComputeScaleAndBiasGradients
         // Create a grid that has uses striding in y-dimension to cover whole minibatch.
         auto gdim = dim3(static_cast<unsigned int>(RoundUpToMultiple(vectorSize, BlockDimX * U)));
         kComputeScaleAndBiasGradients<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
-            static_cast<int>(vectorSize), static_cast<int>(batchSize), x, dy, dScale, dBias, saveMean, saveInvStdDev);
+            static_cast<int>(vectorSize), static_cast<int>(batchSize), x, dy, dScale, dBias, savedMean, savedInvStdDev);
     }
 };

@@ -940,7 +940,7 @@ struct ComputeSpatialScaleAndBiasGradients
 {
     template <typename ElemType>
     static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, const ElemType* x, const ElemType* dy,
-                     ElemType* dScale, ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream)
+                     ElemType* dScale, ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev, cudaStream_t stream)
     {
         assert((spatialSize % U) == 0);
         assert((vectorSize % spatialSize) == 0);
@@ -951,7 +951,7 @@ struct ComputeSpatialScaleAndBiasGradients
         // Create a grid that has uses striding in y-dimension to cover whole minibatch.
         auto gdim = dim3(static_cast<unsigned int>(vectorSize / spatialSize));
         kComputeSpatialScaleAndBiasGradients<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
-            static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dScale, dBias, saveMean, saveInvStdDev);
+            static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dScale, dBias, savedMean, savedInvStdDev);
     }
 };

@@ -959,7 +959,7 @@ struct ComputeSpatialScaleAndBiasGradients
 template <int BlockDimX, int BlockDimY, bool Spatial, int U, typename ElemType>
 __global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize, int batchSize, const ElemType* x, const ElemType* dy, ElemType* dx,
                                                  const ElemType* bnScale, ElemType mbStatsWeight, const ElemType* dScale, const ElemType* dBias,
-                                                 const ElemType* saveMean, const ElemType* saveInvStdDev)
+                                                 const ElemType* savedMean, const ElemType* savedInvStdDev)
 {
     static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
     static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
@@ -991,8 +991,8 @@ __global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize
             scale[k] = bnScale[imap];
             ds[k] = dScale[imap];
             db[k] = dBias[imap];
-            mean[k] = saveMean[imap];
-            invStdDev[k] = saveInvStdDev[imap];
+            mean[k] = savedMean[imap];
+            invStdDev[k] = savedInvStdDev[imap];
         }
     }
     else
@@ -1000,8 +1000,8 @@ __global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize
         LoadValues<U>(bnScale + irowBase, scale);
         LoadValues<U>(dScale + irowBase, ds);
         LoadValues<U>(dBias + irowBase, db);
-        LoadValues<U>(saveMean + irowBase, mean);
-        LoadValues<U>(saveInvStdDev + irowBase, invStdDev);
+        LoadValues<U>(savedMean + irowBase, mean);
+        LoadValues<U>(savedInvStdDev + irowBase, invStdDev);
     }

     int icol = blockIdx.y * BlockDimY + threadIdx.y;
@@ -1053,7 +1053,7 @@ struct BackpropagateBatchNormGradients
     template <typename ElemType>
     static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, bool spatial, const ElemType* x, const ElemType* dy, ElemType* dx,
                      const ElemType* bnScale, ElemType mbStatsWeight, const ElemType* dScale,
-                     const ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream)
+                     const ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev, cudaStream_t stream)
     {
         assert((vectorSize % U) == 0);
         const int BlockDimX = 32 / U;
@@ -1064,12 +1064,12 @@ struct BackpropagateBatchNormGradients
         if (spatial)
         {
             kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, true/*spatial*/, U><<<gdim, bdim, 0, stream>>>(
-                static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, mbStatsWeight, dScale, dBias, saveMean, saveInvStdDev);
+                static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, mbStatsWeight, dScale, dBias, savedMean, savedInvStdDev);
         }
         else
         {
             kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, false/*not spatial*/, U><<<gdim, bdim, 0, stream>>>(
-                static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, mbStatsWeight, dScale, dBias, saveMean, saveInvStdDev);
+                static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, mbStatsWeight, dScale, dBias, savedMean, savedInvStdDev);
         }
     }
 };

@@ -42,9 +42,11 @@ protected:
             InvalidArgument("cuDNN batch normalization supports tensors of max 4 dimensions.");
     }

-    void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runStdDev,
-                     Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) override
+    void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
+                     Mat& out, double epsilon, Mat& savedMean, Mat& savedInvStdDev) override
     {
+        UNUSED(inferenceOnly); // TODO
+
         // REVIEW alexeyk: there might be a way to do this in cuDNN.
         if (blendFactor != 0 && (blendFactor != 1 || expAvgFactor > 0))
             InvalidArgument("cuDNN batch normalization engine currently supports blendTimeConstant of 0 or 1 only.");
@@ -53,25 +55,25 @@ protected:
         cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
         // cuDNN will fail with BAD_PARAM if epsilon < CUDNN_BN_MIN_EPSILON.
         epsilon = max(epsilon, CUDNN_BN_MIN_EPSILON);
-        // expAvgFactor == 0 && blendFactor == 1 means we are in inference mode.
-        if (expAvgFactor == 0 && blendFactor == 1)
+        if (inferenceOnly)
         {
-            saveMean.Resize(0, 0); // (these are not produced in this case)
-            saveInvStdDev.Resize(0, 0);
+            assert(expAvgFactor == 0 && blendFactor == 1);
+            savedMean.Resize(0, 0); // (these are not produced in this case)
+            savedInvStdDev.Resize(0, 0);
             CUDNN_CALL(cudnnBatchNormalizationForwardInference(*m_cudnn, mode, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(out),
-                                                               m_scaleBiasCuDnnT, ptr(scale), ptr(bias), ptr(runMean), ptr(runStdDev), epsilon));
+                                                               m_scaleBiasCuDnnT, ptr(scale), ptr(bias), ptr(runMean), ptr(runVariance), epsilon));
         }
         else
         {
-            saveMean.Resize(runMean);
-            saveInvStdDev.Resize(runMean);
+            savedMean.Resize(runMean);
+            savedInvStdDev.Resize(runMean);
             CUDNN_CALL(cudnnBatchNormalizationForwardTraining(*m_cudnn, mode, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in),
-                                                              m_inOutCuDnnT, ptr(out), m_scaleBiasCuDnnT, ptr(scale), ptr(bias), expAvgFactor, ptr(runMean), ptr(runStdDev),
-                                                              epsilon, ptr(saveMean), ptr(saveInvStdDev)));
+                                                              m_inOutCuDnnT, ptr(out), m_scaleBiasCuDnnT, ptr(scale), ptr(bias), expAvgFactor, ptr(runMean), ptr(runVariance),
+                                                              epsilon, ptr(savedMean), ptr(savedInvStdDev)));
         }
     }

-    void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev,
+    void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& savedMean, const Mat& savedInvStdDev,
                       Mat& scaleGrad, Mat& biasGrad) override
     {
         UNUSED(blendFactor); // BUGBUG: It should be used.
@@ -79,7 +81,7 @@ protected:
         cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
         // REVIEW alexeyk: change betaParamDiff to 1 and update CNTK BN engine.
         CUDNN_CALL(cudnnBatchNormalizationBackward(*m_cudnn, mode, &C::One, &C::One, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(srcGrad), m_inOutCuDnnT, ptr(grad),
-                                                   m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev)));
+                                                   m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(savedMean), ptr(savedInvStdDev)));
     }

 private:

@@ -9,7 +9,7 @@
 #include "TensorShape.h"
 #include <cudnn.h>
 #if CUDNN_MAJOR < 5
-#error CNTK needs CuDNN version 5.0 or higher, cf. https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#cudnn or https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Linux#cudnn.
+#error CNTK requires the NVIDIA cuDNN library 5.0 or higher to build, cf. https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#cudnn or https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Linux#cudnn for installation instructions.
 #endif
 #include <memory>

@@ -3160,12 +3160,13 @@ void GPUMatrix<ElemType>::AveragePoolingBackward(const GPUMatrix<int>& mpRowCol,
                                                  Data(), (int)GetNumRows(), grad.Data(), (int)grad.GetNumRows());
 }

-// returns saveMean/saveInvStdDev which are the actual values used to perform the normalization, except for blendFactor 1, in which case they are unused and set to empty
+// returns savedMean/savedInvStdDev which are the actual values used to perform the normalization, except for blendFactor 1, in which case they are unused and set to empty
 template <class ElemType>
-void GPUMatrix<ElemType>::BatchNormalizationForward(const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor,
+void GPUMatrix<ElemType>::BatchNormalizationForward(const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor,
                                                     GPUMatrix<ElemType>& runMean, GPUMatrix<ElemType>& runVariance, GPUMatrix<ElemType>& out, double epsilon,
-                                                    GPUMatrix<ElemType>& saveMean, GPUMatrix<ElemType>& saveInvStdDev) const
+                                                    GPUMatrix<ElemType>& savedMean, GPUMatrix<ElemType>& savedInvStdDev) const
 {
+    UNUSED(inferenceOnly); // TODO
     assert((GetNumRows() % scale.GetNumRows()) == 0);

     bool spatial = GetNumRows() != scale.GetNumRows();
@@ -3178,52 +3179,54 @@ void GPUMatrix<ElemType>::BatchNormalizationForward(const GPUMatrix<ElemType>& s
     assert(0 < batchSize && batchSize <= std::numeric_limits<int>::max());

     SyncGuard syncGuard;
-    if (expAvgFactor > 0 || blendFactor < 1)
+    //if (expAvgFactor > 0 || blendFactor < 1)
+    if (inferenceOnly)
     {
-        // Compute data mean and inverse standard deviation (into saveMean and
-        // saveInvStdDev), and update running mean and variance.
+        // Pick running statistics for normalizing. No update required, and
+        // saved statistics do not need to be produced.
+        assert(expAvgFactor == 0 && blendFactor == 1);
+        normalizeRunningStats = true;
+        savedMean.RequireSize(0, 0);
+        savedInvStdDev.RequireSize(0, 0);
+    }
+    else
+    {
+        // Compute data mean and inverse standard deviation (into savedMean and
+        // savedInvStdDev), and update running mean and variance.
+        // TODO expAvgFactor == 0 && blendFactor == 1 can be optimized (no need for update).
+        normalizeRunningStats = false;
-        saveMean.RequireSize(runMean);
-        saveInvStdDev.RequireSize(runMean);
+        savedMean.RequireSize(runMean);
+        savedInvStdDev.RequireSize(runMean);
         if (spatial)
         {
             Call<ComputeSpatialBatchMeanAndInvStdDev, ElemType>(spatialSize, vectorSize, spatialSize, batchSize, Data(),
                                                                 expAvgFactor, blendFactor,
                                                                 runMean.Data(), runVariance.Data(), epsilon,
-                                                                saveMean.Data(), saveInvStdDev.Data(), GetStream());
+                                                                savedMean.Data(), savedInvStdDev.Data(), GetStream());
         }
         else
         {
             Call<ComputeBatchMeanAndInvStdDev, ElemType>(vectorSize, vectorSize, batchSize, Data(),
                                                          expAvgFactor, blendFactor,
                                                          runMean.Data(), runVariance.Data(), epsilon,
-                                                         saveMean.Data(), saveInvStdDev.Data(), GetStream());
+                                                         savedMean.Data(), savedInvStdDev.Data(), GetStream());
         }
     }
-    else
-    {
-        // With expAvgFactor == 0 and blendFactor == 1 the running statistics
-        // do not need to be updated. CNTK engine in this case returns saveMean
-        // and saveInvStdDev empty, but cuDNN engine does not.
-        normalizeRunningStats = true;
-        saveMean.RequireSize(0, 0);
-        saveInvStdDev.RequireSize(0, 0);
-    }

     Call<NormalizeBatchTraining, ElemType>(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize, spatial,
                                            normalizeRunningStats, epsilon,
                                            Data(), out.Data(),
                                            scale.Data(), bias.Data(),
                                            runMean.Data(), runVariance.Data(),
-                                           saveMean.Data(), saveInvStdDev.Data(),
+                                           savedMean.Data(), savedInvStdDev.Data(),
                                            GetStream());
 }

-// saveMean/saveInvStdDev are the interpolated mean/inverse standard deviation as used in ForwardProp().
+// savedMean/savedInvStdDev are the interpolated mean/inverse standard deviation as used in ForwardProp().
 // For blendFactor=1, they are not used and can be uninitialized or empty.
 template <class ElemType>
 void GPUMatrix<ElemType>::BatchNormalizationBackward(const GPUMatrix<ElemType>& in, GPUMatrix<ElemType>& grad, const GPUMatrix<ElemType>& scale, double blendFactor,
-                                                     const GPUMatrix<ElemType>& saveMean, const GPUMatrix<ElemType>& saveInvStdDev,
+                                                     const GPUMatrix<ElemType>& savedMean, const GPUMatrix<ElemType>& savedInvStdDev,
                                                      GPUMatrix<ElemType>& scaleGrad, GPUMatrix<ElemType>& biasGrad) const
 {
     assert((GetNumRows() % scale.GetNumRows()) == 0);
@@ -3240,16 +3243,16 @@ void GPUMatrix<ElemType>::BatchNormalizationBackward(const GPUMatrix<ElemType>&
     if (spatial)
     {
         Call<ComputeSpatialScaleAndBiasGradients, ElemType>(spatialSize, vectorSize, spatialSize, batchSize, in.Data(), Data(), scaleGrad.Data(), biasGrad.Data(),
-                                                            saveMean.Data(), saveInvStdDev.Data(), GetStream());
+                                                            savedMean.Data(), savedInvStdDev.Data(), GetStream());
     }
     else
     {
         Call<ComputeScaleAndBiasGradients, ElemType>(vectorSize, vectorSize, batchSize, in.Data(), Data(), scaleGrad.Data(), biasGrad.Data(),
-                                                     saveMean.Data(), saveInvStdDev.Data(), GetStream());
+                                                     savedMean.Data(), savedInvStdDev.Data(), GetStream());
     }
     ElemType mbStatsWeight = (ElemType)(1 - blendFactor); // weight for contribution from actual MB stats (0 if none, e.g. locked BN node)
     Call<BackpropagateBatchNormGradients, ElemType>(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize, spatial,
-                                                    in.Data(), Data(), grad.Data(), scale.Data(), mbStatsWeight, scaleGrad.Data(), biasGrad.Data(), saveMean.Data(), saveInvStdDev.Data(), GetStream());
+                                                    in.Data(), Data(), grad.Data(), scale.Data(), mbStatsWeight, scaleGrad.Data(), biasGrad.Data(), savedMean.Data(), savedInvStdDev.Data(), GetStream());
 }

 #pragma region Static BLAS Functions

@@ -467,8 +467,8 @@ public:
void AveragePoolingForward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& output) const;
void AveragePoolingBackward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& grad) const;

void BatchNormalizationForward(const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor,
GPUMatrix<ElemType>& runMean, GPUMatrix<ElemType>& runStdDev, GPUMatrix<ElemType>& out, double epsilon,
void BatchNormalizationForward(const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor,
GPUMatrix<ElemType>& runMean, GPUMatrix<ElemType>& runVariance, GPUMatrix<ElemType>& out, double epsilon,
GPUMatrix<ElemType>& saveMean, GPUMatrix<ElemType>& saveInvStdDev) const;
void BatchNormalizationBackward(const GPUMatrix<ElemType>& in, GPUMatrix<ElemType>& grad, const GPUMatrix<ElemType>& scale, double blendFactor,
const GPUMatrix<ElemType>& saveMean, const GPUMatrix<ElemType>& saveInvStdDev,
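Editor's note: the declaration change above renames the running buffer from runStdDev to runVariance and threads the new inferenceOnly flag through the call chain. Purely as an illustration (this helper is not part of the commit), the stored variance relates to the inverse-standard-deviation form that the saveInvStdDev-style buffers hold as follows, with epsilon added for numerical safety as in the kernels:

#include <cmath>

// Illustrative only: convert a running-variance entry into the inverse
// standard deviation used during normalization.
inline double InvStdDevFromVariance(double variance, double epsilon)
{
    return 1.0 / std::sqrt(variance + epsilon);
}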
@@ -197,6 +197,6 @@ if exist "$(CuDnnDll)" xcopy /D /Y "$(CuDnnDll)" "$(OutputPath)"
</ImportGroup>
<Target Name="CheckDependencies">
<Error Condition="!Exists('$(CUB_PATH)')" Text="CNTK requires the NVIDIA CUB library to build. Please see https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#cuda for installation instructions." />
<Error Condition="!Exists('$(CUDNN_PATH)')" Text="CNTK requires the NVIDIA cuDNN library to build. Please see https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#cuda for installation instructions." />
<Error Condition="!Exists('$(CUDNN_PATH)')" Text="CNTK requires the NVIDIA cuDNN library 5.0 or higher to build. Please see https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#cuda for installation instructions." />
</Target>
</Project>
@@ -4274,8 +4274,8 @@ void Matrix<ElemType>::AveragePoolingBackward(const Matrix<int>& mpRowCol, const
}

template <class ElemType>
void Matrix<ElemType>::BatchNormalizationForward(const Matrix<ElemType>& scale, const Matrix<ElemType>& bias, double expAvgFactor, double blendFactor,
Matrix<ElemType>& runMean, Matrix<ElemType>& runStdDev, Matrix<ElemType>& out, double epsilon,
void Matrix<ElemType>::BatchNormalizationForward(const Matrix<ElemType>& scale, const Matrix<ElemType>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor,
Matrix<ElemType>& runMean, Matrix<ElemType>& runVariance, Matrix<ElemType>& out, double epsilon,
Matrix<ElemType>& saveMean, Matrix<ElemType>& saveInvStdDev) const
{
DecideAndMoveToRightDevice(*this, out);

@@ -4283,11 +4283,11 @@ void Matrix<ElemType>::BatchNormalizationForward(const Matrix<ElemType>& scale,
// REVIEW alexeyk: add sparse version.
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->BatchNormalizationForward(*(scale.m_CPUMatrix), *(bias.m_CPUMatrix), expAvgFactor, blendFactor,
*(runMean.m_CPUMatrix), *(runStdDev.m_CPUMatrix),
m_CPUMatrix->BatchNormalizationForward(*(scale.m_CPUMatrix), *(bias.m_CPUMatrix), inferenceOnly, expAvgFactor, blendFactor,
*(runMean.m_CPUMatrix), *(runVariance.m_CPUMatrix),
*(out.m_CPUMatrix), epsilon, *(saveMean.m_CPUMatrix), *(saveInvStdDev.m_CPUMatrix)),
m_GPUMatrix->BatchNormalizationForward(*(scale.m_GPUMatrix), *(bias.m_GPUMatrix), expAvgFactor, blendFactor,
*(runMean.m_GPUMatrix), *(runStdDev.m_GPUMatrix),
m_GPUMatrix->BatchNormalizationForward(*(scale.m_GPUMatrix), *(bias.m_GPUMatrix), inferenceOnly, expAvgFactor, blendFactor,
*(runMean.m_GPUMatrix), *(runVariance.m_GPUMatrix),
*(out.m_GPUMatrix), epsilon, *(saveMean.m_GPUMatrix), *(saveInvStdDev.m_GPUMatrix)),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
@@ -500,8 +500,8 @@ public:
void AveragePoolingForward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& output) const;
void AveragePoolingBackward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& grad) const;

void BatchNormalizationForward(const Matrix<ElemType>& scale, const Matrix<ElemType>& bias, double expAvgFactor, double blendFactor,
Matrix<ElemType>& runMean, Matrix<ElemType>& runStdDev, Matrix<ElemType>& out, double epsilon,
void BatchNormalizationForward(const Matrix<ElemType>& scale, const Matrix<ElemType>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor,
Matrix<ElemType>& runMean, Matrix<ElemType>& runVariance, Matrix<ElemType>& out, double epsilon,
Matrix<ElemType>& saveMean, Matrix<ElemType>& saveInvStdDev) const;
void BatchNormalizationBackward(const Matrix<ElemType>& in, Matrix<ElemType>& grad, const Matrix<ElemType>& scale, double blendFactor, const Matrix<ElemType>& saveMean, const Matrix<ElemType>& saveInvStdDev,
Matrix<ElemType>& scaleGrad, Matrix<ElemType>& biasGrad) const;
@@ -1827,7 +1827,7 @@ void GPUMatrix<ElemType>::AveragePoolingBackward(const GPUMatrix<int>& mpRowCol,

template <class ElemType>
void GPUMatrix<ElemType>::BatchNormalizationForward(const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor,
GPUMatrix<ElemType>& runMean, GPUMatrix<ElemType>& runStdDev, GPUMatrix<ElemType>& out, double epsilon,
GPUMatrix<ElemType>& runMean, GPUMatrix<ElemType>& runVariance, GPUMatrix<ElemType>& out, double epsilon,
GPUMatrix<ElemType>& saveMean, GPUMatrix<ElemType>& saveInvStdDev) const
{
}
@@ -1,3 +1,7 @@
# Synthetic test for non-spatial batch normalization, (almost) in isolation of
# other nodes (in particular Convolution). This is not a general example, nor
# does it document best practices. The network was based on MNIST/01_OneHidden.

rootDir = ".."

configDir = "$rootDir$/Config"
@@ -17,10 +21,6 @@ traceLevel = 1

batchNormalizationEngine = "testMustOverrideBatchNormalizationEngine"

#######################################
#        TRAINING CONFIG              #
#######################################

train = [
action = "train"

@@ -54,10 +54,6 @@ train = [
]
]

#######################################
#          TEST CONFIG                #
#######################################

test = [
action = "test"
minibatchSize = 1024 # reduce this if you run out of memory
@@ -22,8 +22,8 @@ DNN = [
b = LearnableParameter(hiddenDim, 1, init = fixedValue, value = 0)
sc = LearnableParameter(hiddenDim, 1, init = fixedValue, value = 1)
m = LearnableParameter(hiddenDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = LearnableParameter(hiddenDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
y = BatchNormalization(h1, sc, b, m, var, eval=false, spatial=false, normalizationTimeConstant=64, imageLayout=cudnn, engine=$batchNormalizationEngine$)
v = LearnableParameter(hiddenDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
y = BatchNormalization(h1, sc, b, m, v, eval=false, spatial=false, normalizationTimeConstant=64, imageLayout=cudnn, engine=$batchNormalizationEngine$)

ol = DNNLayer(hiddenDim, labelDim, y, 1)
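Editor's note on the synthetic test configs in this commit: batchNormalizationEngine is deliberately set to the placeholder value "testMustOverrideBatchNormalizationEngine" and is consumed through engine=$batchNormalizationEngine$, so the configuration is only valid once the test driver substitutes a real engine name. Assuming the usual CNTK convention of overriding top-level config variables on the command line, an invocation would look roughly like cntk configFile=<thisConfig>.cntk batchNormalizationEngine=cntk (or =cudnn); the exact file name and driver wiring are not part of this diff and are given only as an illustration.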
@@ -1,3 +1,7 @@
# Synthetic test for spatial batch normalization, (almost) in isolation of
# other nodes (in particular Convolution). This is not a general example, nor
# does it document best practices. The network was based on CIFAR-10/02_BatchNormConv.

RootDir = "."

ConfigDir = "$RootDir$"
@@ -22,8 +22,7 @@ ndlMnistMacros = [
scValue = 1

# Batch normalization time constant.
#bnTimeConst = 4096
bnTimeConst = 2048
bnTimeConst = 0

fc1WScale = 12
fc1BValue = 0

@@ -42,9 +41,9 @@ DNN=[
b = LearnableParameter(cMap1, 1, init = fixedValue, value = conv1BValue)
sc = LearnableParameter(cMap1, 1, init = fixedValue, value = scValue)
m = LearnableParameter(cMap1, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = LearnableParameter(cMap1, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
v = LearnableParameter(cMap1, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

y = BatchNormalization(featScaled, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$, engine=$batchNormalizationEngine$)
y = BatchNormalization(featScaled, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$, engine=$batchNormalizationEngine$)
conv1 = RectifiedLinear(y)

# pool1
@@ -19,13 +19,13 @@ testCases:
Epochs must be finished with expected results:
patterns:
- Finished Epoch
- CE = {{float,tolerance=2.0%}} * {{integer}}
- Err = {{float,tolerance=2.0%}} * {{integer}}
- CE = {{float,tolerance=0.1%}} * {{integer}}
- Err = {{float,tolerance=0.1%}} * {{integer}}
- totalSamplesSeen = {{integer}}
- learningRatePerSample = {{float,tolerance=0.001%}}

Final test results must match:
patterns:
- "Final Results: Minibatch[{{integer}}-{{integer}}]"
- Err = {{float,tolerance=2.0%}} * {{integer}}
- CE = {{float,tolerance=2.0%}} * {{integer}}
- Err = {{float,tolerance=0.1%}} * {{integer}}
- CE = {{float,tolerance=0.1%}} * {{integer}}

@@ -19,13 +19,13 @@ testCases:
Epochs must be finished with expected results:
patterns:
- Finished Epoch
- CE = {{float,tolerance=2.0%}} * {{integer}}
- Err = {{float,tolerance=2.0%}} * {{integer}}
- CE = {{float,tolerance=0.1%}} * {{integer}}
- Err = {{float,tolerance=0.1%}} * {{integer}}
- totalSamplesSeen = {{integer}}
- learningRatePerSample = {{float,tolerance=0.001%}}

Final test results must match:
patterns:
- "Final Results: Minibatch[{{integer}}-{{integer}}]"
- Err = {{float,tolerance=2.0%}} * {{integer}}
- CE = {{float,tolerance=2.0%}} * {{integer}}
- Err = {{float,tolerance=0.1%}} * {{integer}}
- CE = {{float,tolerance=0.1%}} * {{integer}}
@@ -21,10 +21,10 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeCo
b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
v = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = $imageLayout$)
y = BatchNormalization(c, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$, engine=cudnn)
y = BatchNormalization(c, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$, engine=cudnn)
]

ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst)

@@ -44,10 +44,10 @@ ProjLayer(W, inp, outMap, hStride, vStride, bValue, scValue, bnTimeConst)
b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
v = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)

c = Convolution(W, inp, 1, 1, outMap, hStride, vStride, zeroPadding = false, imageLayout = $imageLayout$)
y = BatchNormalization(c, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$, engine=cudnn)
y = BatchNormalization(c, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$, engine=cudnn)
]

ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue, bnTimeConst)
@@ -113,9 +113,9 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst)
b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
v = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, var, spatial = false, normalizationTimeConstant = bnTimeConst, engine=cudnn)
bn = BatchNormalization(t, sc, b, m, v, spatial = false, normalizationTimeConstant = bnTimeConst, engine=cudnn)
y = RectifiedLinear(bn)
]

@@ -125,9 +125,9 @@ DnnImageBNReLULayer(inW, inH, inC, outDim, x, wScale, bValue, scValue, bnTimeCon
b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
v = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, var, spatial = false, normalizationTimeConstant = bnTimeConst, engine=cudnn)
bn = BatchNormalization(t, sc, b, m, v, spatial = false, normalizationTimeConstant = bnTimeConst, engine=cudnn)
y = RectifiedLinear(bn)
]
@@ -32,4 +32,4 @@ testCases:
- "Final Results: Minibatch[{{integer}}-{{integer}}]"
# this part is flaky, needs to be investigated (VSO item #414)
#- Err = {{float,tolerance=2.0%}} * {{integer}}
- CE = {{float,tolerance=2.0%}} * {{integer}}
- CE = {{float,tolerance=4.0%}} * {{integer}}
@@ -100,6 +100,7 @@ BOOST_AUTO_TEST_CASE(BatchNormalizationForward)
double expAvg = std::get<3>(cfg);
double blendFactor = 0; // cuDNN supports blendFactor == 0 (train) or 1 (eval) only.
double eps = 1e-5; // CUDNN_BN_MIN_EPSILON
bool inferenceOnly = false;

auto engCudnn = BNEng::Create(baseDeviceId, inOutT, spatial, ImageLayoutKind::CHW, BatchNormEngineKind::CuDnn);
auto engCntk = BNEng::Create(deviceId, inOutT, spatial, ImageLayoutKind::CHW, BatchNormEngineKind::Cntk);

@@ -142,12 +143,12 @@ BOOST_AUTO_TEST_CASE(BatchNormalizationForward)

CudaTimer time1;
time1.Start();
engCntk->Forward(in, scale, bias, expAvg, blendFactor, runMean, runInvStdDev, out, eps, saveMean, saveInvStdDev);
engCntk->Forward(in, scale, bias, inferenceOnly, expAvg, blendFactor, runMean, runInvStdDev, out, eps, saveMean, saveInvStdDev);
time1.Stop();

CudaTimer time2;
time2.Start();
engCudnn->Forward(inB, scaleB, biasB, expAvg, blendFactor, runMeanB, runInvStdDevB, outB, eps, saveMeanB, saveInvStdDevB);
engCudnn->Forward(inB, scaleB, biasB, inferenceOnly, expAvg, blendFactor, runMeanB, runInvStdDevB, outB, eps, saveMeanB, saveInvStdDevB);
time2.Stop();

std::stringstream tmsg;
@@ -28,12 +28,12 @@ inline FunctionPtr ProjLayer(Variable wProj, Variable input, size_t hStride, siz
auto b = Parameter({ outFeatureMapCount }, (float)bValue, device);
auto sc = Parameter({ outFeatureMapCount }, (float)scValue, device);
auto m = Constant({ outFeatureMapCount }, 0.0f, device);
auto var = Constant({ outFeatureMapCount }, 0.0f, device);
auto v = Constant({ outFeatureMapCount }, 0.0f, device);

size_t numInputChannels = input.Shape()[input.Shape().NumAxes() - 1];

auto c = Convolution(wProj, input, { hStride, vStride, numInputChannels }, { true }, { false });
return BatchNormalization(c, sc, b, m, var, true /*spatial*/, (double)bnTimeConst);
return BatchNormalization(c, sc, b, m, v, true /*spatial*/, (double)bnTimeConst);
}

inline FunctionPtr ResNetNode2(Variable input, size_t outFeatureMapCount, size_t kernelWidth, size_t kernelHeight, double wScale, double bValue, double scValue, size_t bnTimeConst, const DeviceDescriptor& device)