From e1a9cabbde633a42945963be09d94e132a8405a2 Mon Sep 17 00:00:00 2001 From: Mark Hillebrand Date: Tue, 23 Aug 2016 17:57:00 +0200 Subject: [PATCH] Address CR comments --- Examples/Image/MNIST/Config/Macros.ndl | 8 +- Examples/Image/MNIST/Config/Shared.bs | 22 ++-- .../Image/Miscellaneous/CIFAR-10/Macros.ndl | 16 +-- .../Miscellaneous/ImageNet/ResNet/Macros.ndl | 4 +- .../Miscellaneous/ImageNet/VGG/Macros.ndl | 8 +- Source/ComputationNetworkLib/TrainingNodes.h | 104 ++++++++++-------- Source/Math/BatchNormalizationEngine.cpp | 52 +++++---- Source/Math/BatchNormalizationEngine.h | 4 +- Source/Math/CPUMatrix.cpp | 9 +- Source/Math/CPUMatrix.h | 6 +- Source/Math/CntkBatchNormalization.cuh | 44 ++++---- Source/Math/CuDnnBatchNormalization.cu | 28 ++--- Source/Math/CuDnnCommon.h | 2 +- Source/Math/GPUMatrix.cu | 59 +++++----- Source/Math/GPUMatrix.h | 4 +- Source/Math/MathCUDA.vcxproj | 2 +- Source/Math/Matrix.cpp | 12 +- Source/Math/Matrix.h | 4 +- Source/Math/NoGPU.cpp | 2 +- .../NonSpatial/01_OneHidden.cntk | 12 +- .../NonSpatial/01_OneHidden.ndl | 4 +- .../Spatial/02_BatchNormConv.cntk | 4 + .../Spatial/02_BatchNormConv.ndl | 7 +- .../Spatial/CNTK/testcases.yml | 8 +- .../Spatial/CuDNN/testcases.yml | 8 +- .../BatchNormalization/Spatial/Macros.ndl | 16 +-- .../CIFAR-10/02_BatchNormConv/testcases.yml | 2 +- .../BatchNormalizationEngineTests.cpp | 5 +- Tests/UnitTests/V2LibraryTests/Image.h | 4 +- 29 files changed, 242 insertions(+), 218 deletions(-) diff --git a/Examples/Image/MNIST/Config/Macros.ndl b/Examples/Image/MNIST/Config/Macros.ndl index cfa1e2baa..3786f979a 100644 --- a/Examples/Image/MNIST/Config/Macros.ndl +++ b/Examples/Image/MNIST/Config/Macros.ndl @@ -26,9 +26,9 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst) = [ b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue) sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue) m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) - var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + v = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) t = Times(W, x) - bn = BatchNormalization(t, sc, b, m, var, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst) + bn = BatchNormalization(t, sc, b, m, v, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst) y = RectifiedLinear(bn) ] @@ -72,10 +72,10 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeCo b = LearnableParameter(outMap, 1, init=fixedValue, value=bValue) sc = LearnableParameter(outMap, 1, init=fixedValue, value=scValue) m = LearnableParameter(outMap, 1, init=fixedValue, value=0, learningRateMultiplier=0) - var = LearnableParameter(outMap, 1, init=fixedValue, value=0, learningRateMultiplier=0) + v = LearnableParameter(outMap, 1, init=fixedValue, value=0, learningRateMultiplier=0) c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding=true, imageLayout=$imageLayout$) - y = BatchNormalization(c, sc, b, m, var, eval=false, spatial=true, normalizationTimeConstant=bnTimeConst, imageLayout=$imageLayout$) + y = BatchNormalization(c, sc, b, m, v, eval=false, spatial=true, normalizationTimeConstant=bnTimeConst, imageLayout=$imageLayout$) ] ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst) = [ diff --git a/Examples/Image/MNIST/Config/Shared.bs b/Examples/Image/MNIST/Config/Shared.bs index 
2354465df..ff05527c1 100644 --- a/Examples/Image/MNIST/Config/Shared.bs +++ b/Examples/Image/MNIST/Config/Shared.bs @@ -22,13 +22,13 @@ DNNImageSigmoidLayer (inW, inH, inC, outDim, x, parmScale) = [ # ReLU layer with batch normalization # TODO: rename to DNN- DnnBNReLULayer (inDim, outDim, x, wScale, bValue, scValue, bnTimeConst) = [ - W = Parameter (outDim, inDim, init = "gaussian", initValueScale = wScale, initOnCPUOnly=true) - b = Parameter (outDim, 1, init = "fixedValue", value = bValue) - sc = Parameter (outDim, 1, init = "fixedValue", value = scValue) - m = Parameter (outDim, 1, init = "fixedValue", value = 0, learningRateMultiplier = 0) - var = Parameter (outDim, 1, init = "fixedValue", value = 0, learningRateMultiplier = 0) + W = Parameter (outDim, inDim, init = "gaussian", initValueScale = wScale, initOnCPUOnly=true) + b = Parameter (outDim, 1, init = "fixedValue", value = bValue) + sc = Parameter (outDim, 1, init = "fixedValue", value = scValue) + m = Parameter (outDim, 1, init = "fixedValue", value = 0, learningRateMultiplier = 0) + v = Parameter (outDim, 1, init = "fixedValue", value = 0, learningRateMultiplier = 0) t = Times(W, x) # TODO: W * x - bn = BatchNormalization(t, sc, b, m, var, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst) + bn = BatchNormalization(t, sc, b, m, v, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst) y = RectifiedLinear(bn) ].y @@ -58,13 +58,13 @@ ConvNDReLULayer (inp, kW, kH, inMap, inWCount, outMap, hStride, vStride, wScale, ].out ConvBNLayerW (W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst) = [ # TODO: delete if not needed - b = Parameter(outMap, 1, init="fixedValue", value=bValue) - sc = Parameter(outMap, 1, init="fixedValue", value=scValue) - m = Parameter(outMap, 1, init="fixedValue", value=0, learningRateMultiplier=0) - var = Parameter(outMap, 1, init="fixedValue", value=0, learningRateMultiplier=0) + b = Parameter(outMap, 1, init="fixedValue", value=bValue) + sc = Parameter(outMap, 1, init="fixedValue", value=scValue) + m = Parameter(outMap, 1, init="fixedValue", value=0, learningRateMultiplier=0) + v = Parameter(outMap, 1, init="fixedValue", value=0, learningRateMultiplier=0) c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding=true /* , imageLayout=$imageLayout$*/) - y = BatchNormalization(c, sc, b, m, var, eval=false, spatial=true, normalizationTimeConstant=bnTimeConst /* , imageLayout=$imageLayout$*/) + y = BatchNormalization(c, sc, b, m, v, eval=false, spatial=true, normalizationTimeConstant=bnTimeConst /* , imageLayout=$imageLayout$*/) ].y ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst) = [ diff --git a/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl b/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl index 552f51951..4fa20f065 100644 --- a/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl +++ b/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl @@ -21,10 +21,10 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeCo b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue) sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue) m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) - var = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + v = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) c = Convolution(W, inp, kW, kH, outMap, hStride, 
vStride, zeroPadding = true, imageLayout = $imageLayout$) - y = BatchNormalization(c, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$) + y = BatchNormalization(c, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$) ] ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst) @@ -44,10 +44,10 @@ ProjLayer(W, inp, outMap, hStride, vStride, bValue, scValue, bnTimeConst) b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue) sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue) m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) - var = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + v = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) c = Convolution(W, inp, 1, 1, outMap, hStride, vStride, zeroPadding = false, imageLayout = $imageLayout$) - y = BatchNormalization(c, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$) + y = BatchNormalization(c, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$) ] ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue, bnTimeConst) @@ -113,9 +113,9 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst) b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue) sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue) m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) - var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + v = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) t = Times(W, x) - bn = BatchNormalization(t, sc, b, m, var, spatial = false, normalizationTimeConstant = bnTimeConst) + bn = BatchNormalization(t, sc, b, m, v, spatial = false, normalizationTimeConstant = bnTimeConst) y = RectifiedLinear(bn) ] @@ -125,9 +125,9 @@ DnnImageBNReLULayer(inW, inH, inC, outDim, x, wScale, bValue, scValue, bnTimeCon b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue) sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue) m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) - var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + v = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) t = Times(W, x) - bn = BatchNormalization(t, sc, b, m, var, spatial = false, normalizationTimeConstant = bnTimeConst) + bn = BatchNormalization(t, sc, b, m, v, spatial = false, normalizationTimeConstant = bnTimeConst) y = RectifiedLinear(bn) ] diff --git a/Examples/Image/Miscellaneous/ImageNet/ResNet/Macros.ndl b/Examples/Image/Miscellaneous/ImageNet/ResNet/Macros.ndl index ca1585526..b546fcfaf 100644 --- a/Examples/Image/Miscellaneous/ImageNet/ResNet/Macros.ndl +++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/Macros.ndl @@ -8,9 +8,9 @@ BN(inp, mapCount, bValue, scValue, bnTimeConst) b = Parameter(mapCount, 1, init = fixedValue, value = bValue) sc = Parameter(mapCount, 1, init = fixedValue, value = scValue) m = Parameter(mapCount, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) - var = Parameter(mapCount, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + v = 
Parameter(mapCount, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) - y = BatchNormalization(inp, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, epsilon = 0.000000001, imageLayout = "cudnn") + y = BatchNormalization(inp, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, epsilon = 0.000000001, imageLayout = "cudnn") ] ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst) diff --git a/Examples/Image/Miscellaneous/ImageNet/VGG/Macros.ndl b/Examples/Image/Miscellaneous/ImageNet/VGG/Macros.ndl index 82d3b8582..949bf29e5 100644 --- a/Examples/Image/Miscellaneous/ImageNet/VGG/Macros.ndl +++ b/Examples/Image/Miscellaneous/ImageNet/VGG/Macros.ndl @@ -15,9 +15,9 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue) b = Parameter(outDim, 1, init = fixedValue, value = bValue) sc = Parameter(outDim, 1, init = Gaussian, initValueScale = 0.01) m = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) - var = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + v = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) t = Times(W, x) - bn = BatchNormalization(t, sc, b, m, var, spatial = false) + bn = BatchNormalization(t, sc, b, m, v, spatial = false) y = RectifiedLinear(bn) ] @@ -47,9 +47,9 @@ ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, b = Parameter(outMap, 1, init = fixedValue, value = bValue) sc = Parameter(outMap, 1, init = Gaussian, initValueScale = scValue) m = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) - var = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) + v = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0) c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn") - bn = BatchNormalization(c, sc, b, m, var, spatial = true, imageLayout = "cudnn") + bn = BatchNormalization(c, sc, b, m, v, spatial = true, imageLayout = "cudnn") y = RectifiedLinear(bn); ] diff --git a/Source/ComputationNetworkLib/TrainingNodes.h b/Source/ComputationNetworkLib/TrainingNodes.h index ba0d28ae2..e4390ded0 100644 --- a/Source/ComputationNetworkLib/TrainingNodes.h +++ b/Source/ComputationNetworkLib/TrainingNodes.h @@ -1547,7 +1547,7 @@ template class DropoutNode; // // m = mean(input) // var = variance(input) -// input_norm = (input - mean) / sqrt(var) +// input_norm = (input - mean) / sqrt(epsilon + var) // output = gamma * input_norm + beta // // where gamma and beta are trainable parameters(represented as LearnableParameter). @@ -1570,7 +1570,7 @@ template class DropoutNode; // * blendTimeConstant is the time constant which allows to specify how much of running mean / var should be "blended" into mean / var of the current minibatch. // Value 0 (default) means no blending will happen and only the current minibatch statistics will be used. // Value 1#INF (infinity) means only running mean / var will be used(this is used, for example, in evaluation phase). -// * epsilon is a conditioner constant used in computing inverted standard deviation +// * epsilon is a conditioner constant used in computing inverse standard deviation // * useCntkEngine is a Boolean flag that specifies which batch normalization implementation to use: CNTK or cuDNN-based. // * imageLayout is the image layout. Only cudnn is supported at present. 
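// (For reference: a minimal, self-contained sketch of the computation documented above, written
// against plain std::vector rather than CNTK's Matrix type. Parameter names mirror the
// documentation (expAvgFactor, blendFactor, epsilon), but the function itself is an illustrative
// assumption, not the node's or the engine's actual code.)
#include <cmath>
#include <cstddef>
#include <vector>

// Normalizes one feature across a minibatch (assumed non-empty). Minibatch statistics are blended
// with the running statistics (weight = blendFactor), and the running estimates are updated with
// weight expAvgFactor (0 = no update, as in inference).
inline void BatchNormForwardSketch(const std::vector<double>& x, double gamma, double beta,
                                   double expAvgFactor, double blendFactor, double epsilon,
                                   double& runMean, double& runVariance,
                                   std::vector<double>& out)
{
    const double n = static_cast<double>(x.size());
    double mean = 0.0, var = 0.0;
    for (double v : x) mean += v;
    mean /= n;
    for (double v : x) var += (v - mean) * (v - mean);
    var /= n;

    // Interpolate minibatch statistics with the long-term running statistics.
    const double useMean = blendFactor * runMean     + (1.0 - blendFactor) * mean;
    const double useVar  = blendFactor * runVariance + (1.0 - blendFactor) * var;

    // Update the running estimates; the previous values keep weight (1 - expAvgFactor).
    runMean     = expAvgFactor * mean + (1.0 - expAvgFactor) * runMean;
    runVariance = expAvgFactor * var  + (1.0 - expAvgFactor) * runVariance;

    // input_norm = (input - mean) / sqrt(epsilon + var); output = gamma * input_norm + beta
    out.resize(x.size());
    const double invStdDev = 1.0 / std::sqrt(useVar + epsilon);
    for (std::size_t i = 0; i < x.size(); ++i)
        out[i] = gamma * (x[i] - useMean) * invStdDev + beta;
}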
// ----------------------------------------------------------------------- @@ -1584,14 +1584,14 @@ public: BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name) : Base(deviceId, name), m_spatial(false), m_normTimeConst(0), m_blendTimeConst(0), m_epsilon(0), m_useCntkEngine(true), m_samplesSeen(0), m_imageLayoutKind(ImageLayoutKind::CHW), - m_convertRunningVariance(false) + m_convertRunningVariancePending(false) { } BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name, bool spatial, double normalizationTimeConstant, double blendTimeConstant, double epsilon, bool useCntkEngine, ImageLayoutKind imageLayoutKind) : Base(deviceId, name), m_spatial(spatial), m_normTimeConst(normalizationTimeConstant), m_blendTimeConst(blendTimeConstant), m_epsilon(epsilon), m_useCntkEngine(useCntkEngine), m_imageLayoutKind(imageLayoutKind), m_samplesSeen(0), - m_convertRunningVariance(false) + m_convertRunningVariancePending(false) { } BatchNormalizationNode(const ScriptableObjects::IConfigRecordPtr configp) : @@ -1627,7 +1627,9 @@ public: fstream >> m_normTimeConst; fstream >> m_blendTimeConst; fstream >> m_imageLayoutKind; +#ifdef _DEBUG fprintf(stderr, "INFO: %ls: initialized samplesSeen from mbCount when loading pre-CuDNNv5 model\n", NodeName().c_str()); +#endif fstream >> mbCount; m_samplesSeen = mbCount; fstream >> m_epsilon; @@ -1666,7 +1668,9 @@ public: { size_t mbCount; fstream >> m_imageLayoutKind; +#ifdef _DEBUG fprintf(stderr, "INFO: %ls: initialized samplesSeen from mbCount when loading pre-CuDNNv5 model\n", NodeName().c_str()); +#endif fstream >> mbCount; m_samplesSeen = mbCount; } @@ -1681,8 +1685,9 @@ public: { // Prior to CNTK_MODEL_VERSION_12, running inverse standard // deviation was stored in Input 4. Now variance is used. - // We (approximately) convert it during validation later. - m_convertRunningVariance = true; + // We (approximately) convert it during validation later + // (and then clear the flag). + m_convertRunningVariancePending = true; } } @@ -1713,15 +1718,13 @@ private: // time-constant conversions // in inference mode, only use long-term mean and do not update running estimates if (!Environment().IsTraining()) { - assert(0 < m_samplesSeen); // something must have been trained + if (m_samplesSeen == 0) + RuntimeError("%ls: inference mode is used, but nothing has been trained.", NodeName().c_str()); return 0; // (m_normTimeConst == infinity) no new contribution from current minibatch } // Initialization case: only use current minibatch. - if (m_samplesSeen == 0) - { - return 1.0; - } + if (m_samplesSeen == 0) return 1.0; double numSamples = (double)GetMBLayout()->GetActualNumSamples(); @@ -1747,15 +1750,13 @@ private: // time-constant conversions // in inference mode, only use long-term mean and do not update running estimates if (!Environment().IsTraining()) { - assert(0 < m_samplesSeen); // something must have been trained - return 1.0; // (m_blendTimeConst == infinity) estimate is taken 100% from the long-term running estimate + if (m_samplesSeen == 0) + RuntimeError("%ls: inference mode is used, but nothing has been trained.", NodeName().c_str()); + return 1.0; // (m_blendTimeConst == infinity) estimate is taken 100% from the long-term running estimate } // Initialization case: only use current minibatch. - if (m_samplesSeen == 0) - { - return 0; - } + if (m_samplesSeen == 0) return 0; // convert to blend factor (= weight for running stats) // The code below special-cases two boundary cases, but those are just the limit cases of the main formula. 
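// (For reference: a rough sketch of how the two time constants could map to the per-minibatch
// factors computed above. The exponential and rational forms below are assumptions chosen only to
// illustrate the limit behavior (0, 1, infinity) discussed in the comments; they are not quoted
// from ComputeExpAvgFactor / ComputeBlendFactor.)
#include <cmath>

// Weight given to the current minibatch when updating the running statistics.
inline double ExpAvgFactorSketch(double normTimeConst, double numSamples)
{
    if (std::isinf(normTimeConst)) return 0.0; // infinite time constant: never update the running estimates
    if (normTimeConst <= 0)        return 1.0; // no smoothing: use only the current minibatch
    return 1.0 - std::exp(-numSamples / normTimeConst); // first-order low-pass weight (assumed form)
}

// Weight given to the running statistics when normalizing the current minibatch.
inline double BlendFactorSketch(double blendTimeConst, double numSamples)
{
    if (std::isinf(blendTimeConst)) return 1.0; // use only the long-term running estimate
    if (blendTimeConst <= 0)        return 0.0; // use only the current minibatch statistics
    return blendTimeConst / (blendTimeConst + numSamples); // assumed interpolation form
}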
@@ -1771,7 +1772,8 @@ public: virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override { - assert(!m_convertRunningVariance); + if (m_convertRunningVariancePending) + LogicError("%ls: running variance was not converted before forward prop", NodeName().c_str()); FrameRange fr(Input(0)->GetMBLayout()); Matrix sliceInputValue = Input(0)->ValueFor(fr); @@ -1792,12 +1794,16 @@ public: double expAvgFactor = ComputeExpAvgFactor(); // weight for the new MB statistics in the running estimate. The previous value of the running statistics is kept with weight (1-this) double blendFactor = ComputeBlendFactor(); // interpolation weight for the running statistics (the current MB statistics are weighted with 1-this) - m_bnEng->Forward(/*in=*/ sliceInputValue, scale, bias, // (in) - expAvgFactor, blendFactor, - runMean, runVariance, // (in/out) running estimates, updated from the current MB mean/variance - /*out=*/ sliceOutputValue, // (out) batch-normalized output value + // In inference-only mode, m_savedMean and m_savedInvStdDev will not be + // produced and BackpropToNonLooping() may not be called. In + // non-inference (training) mode, saved statistics must be produced. + bool inferenceOnly = !Environment().IsTraining(); + m_bnEng->Forward(/*in=*/ sliceInputValue, scale, bias, // (in) + inferenceOnly, expAvgFactor, blendFactor, + runMean, runVariance, // (in/out) running estimates, updated from the current MB mean/variance + /*out=*/ sliceOutputValue, // (out) batch-normalized output value m_epsilon, - *m_saveMean, *m_saveInvStdDev); // (out) actual interpolated mean/stddev values. Note: unused/empty for blendFactor==1 for CNTK engine + *m_savedMean, *m_savedInvStdDev); // (out) actual interpolated mean/stddev values. Note: unused/empty for blendFactor==1 for CNTK engine } // Note: This function assumes that inputIndex=0 is called before the others. @@ -1805,6 +1811,16 @@ public: // BUGBUG: If the input has no learnables (e.g. using BN instead of corpus mean/var norm), this will not be called for inputIndex=0 at all. virtual void BackpropToNonLooping(size_t inputIndex) override { + // Must be in training mode. + if (!Environment().IsTraining()) + LogicError("%ls: BackpropToNonLooping() cannot be called in inference mode", NodeName().c_str()); + // In non-inference mode, the batch normalization engine must provide + // saved statistics, m_savedMean and m_savedInvStdDev. + if (m_savedMean->IsEmpty()) + LogicError("%ls: m_savedMean cannot be empty", NodeName().c_str()); + if (m_savedInvStdDev->IsEmpty()) + LogicError("%ls: m_savedInvStdDev cannot be empty", NodeName().c_str()); + FrameRange fr(Input(0)->GetMBLayout()); if (inputIndex == 0) // derivative with respect to the input. @@ -1813,16 +1829,8 @@ public: auto sliceInputValue = Input(0)->ValueFor(fr); const Matrix& scale = Input(1)->Value(); const Matrix& bias = Input(2)->Value(); - const Matrix& runMean = Input(3)->Value(); - const Matrix& runVariance = Input(4)->Value(); auto sliceInputGrad = Input(0)->GradientFor(fr); - // The mean used in Forward() are either saveMean or runMean. - // This is decided by the engine, which communicates back the decision by returning - // an empty saveMean in case runMean should be used. Likewise for variance / inverted standard deviation. - let& actualMean = !m_saveMean->IsEmpty() ? 
*m_saveMean : runMean; // empty if only the running mean is used - if (m_saveInvStdDev->IsEmpty()) RuntimeError("TODO m_saveInvStdDev <-> runVariance not the same:"); - let& actualInvStdDev = !m_saveInvStdDev->IsEmpty() ? *m_saveInvStdDev : runVariance; m_dScale->Resize(scale); // gradients for scale and bias get stored here m_dBias->Resize(bias); @@ -1833,7 +1841,7 @@ public: sliceInputGrad, // (out) gradient for data input goes here scale, // (in) out of scale and bias, only scale is needed in gradient propagation blendFactor, // (in) smoothing weight for running stats (1=use only running stats) - actualMean, actualInvStdDev, // (in) actual mean/stddev values used in ForwardProp() + *m_savedMean, *m_savedInvStdDev, // (in) saved mean/invstddev values used in ForwardProp() *m_dScale, *m_dBias); // (out) gradients for scale and bias } else if (inputIndex == 1) // derivative with respect to the scale @@ -1855,6 +1863,7 @@ public: virtual void EndBackprop() override { + // TODO don't update in locked mode auto numSamples = GetMBLayout()->GetActualNumSamples(); m_samplesSeen += numSamples; Base::EndBackprop(); @@ -1892,19 +1901,22 @@ public: if (isFinalValidationPass) { - if (m_convertRunningVariance) + if (m_convertRunningVariancePending) { - // Input 4 is still inverse standard deviation. We convert it to variance, approximately, - // and output a warning. - fprintf(stderr, "WARNING: %ls: approximately converting inverse standard deviation into variance when loading pre-CuDNNv5 model\n", + // Prior to CNTK CuDNN v5 support (and the CNTK engine of the same time), mean and inverse standard deviation + // statistics were computed and stored. With CuDNN v5 (and the corresponding CNTK engine update), this was changed + // to mean and variance. + // To load an old model for further training or inference, Input(4) (which is inverse standard deviation) needs to + // be converted to variance, via v = 1/(isd^2) + epsilon, where 'v' is variance and 'isd' is inverse standard deviation. + // Since this is an approximation, we output a warning. + fprintf(stderr, "WARNING: %ls: loading pre-CuDNNv5 model and approximately converting variance statistics format\n", NodeName().c_str()); Matrix& runInvStdDev = Input(4)->Value(); runInvStdDev.AssignElementPowerOf(runInvStdDev, 2); runInvStdDev.ElementInverse(); runInvStdDev += (float) m_epsilon; - fprintf(stderr, "--- %ls converted runVariance after loading\n", NodeName().c_str()); runInvStdDev.Print(); - m_convertRunningVariance = false; + m_convertRunningVariancePending = false; } // check inputs @@ -1949,8 +1961,8 @@ public: void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override { Base::RequestMatricesBeforeForwardProp(matrixPool); - RequestMatrixFromPool(m_saveMean, matrixPool); - RequestMatrixFromPool(m_saveInvStdDev, matrixPool); + RequestMatrixFromPool(m_savedMean, matrixPool); + RequestMatrixFromPool(m_savedInvStdDev, matrixPool); } void RequestMatricesBeforeBackprop(MatrixPool& matrixPool) override @@ -1963,8 +1975,8 @@ public: void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) override { Base::ReleaseMatricesAfterBackprop(matrixPool); - ReleaseMatrixToPool(m_saveMean, matrixPool); - ReleaseMatrixToPool(m_saveInvStdDev, matrixPool); + ReleaseMatrixToPool(m_savedMean, matrixPool); + ReleaseMatrixToPool(m_savedInvStdDev, matrixPool); ReleaseMatrixToPool(m_dScale, matrixPool); ReleaseMatrixToPool(m_dBias, matrixPool); } @@ -2035,7 +2047,7 @@ private: // REVIEW alexeyk: if this works, document it properly in Wiki. 
double m_blendTimeConst; - // Epsilon used to compute inverse standard deviation (m_saveInvStdDev). + // Epsilon used to compute inverse standard deviation (m_savedInvStdDev). double m_epsilon; // Whether to use CNTK or cuDNN BN implementation. bool m_useCntkEngine; @@ -2048,8 +2060,8 @@ private: size_t m_samplesSeen; // Interpolated actual mean/inverse stddev values. Pre-computed on forward pass, also used in gradient computation. - shared_ptr> m_saveMean; - shared_ptr> m_saveInvStdDev; + shared_ptr> m_savedMean; + shared_ptr> m_savedInvStdDev; // Temp buffer for scale and bias derivatives. Only used in BackpropTo(), carrying info from first call to subsequent calls. // Not used for blendFactor=1 in CNTK engine. shared_ptr> m_dScale; @@ -2057,7 +2069,7 @@ private: std::unique_ptr> m_bnEng; - bool m_convertRunningVariance; + bool m_convertRunningVariancePending; }; template class BatchNormalizationNode; diff --git a/Source/Math/BatchNormalizationEngine.cpp b/Source/Math/BatchNormalizationEngine.cpp index c254c8c64..6664d2f31 100644 --- a/Source/Math/BatchNormalizationEngine.cpp +++ b/Source/Math/BatchNormalizationEngine.cpp @@ -10,14 +10,16 @@ namespace Microsoft { namespace MSR { namespace CNTK { template -void BatchNormEngine::Forward(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance, - Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) +void BatchNormEngine::Forward(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance, - Mat& out, double epsilon, Mat& savedMean, Mat& savedInvStdDev) { assert(in.GetNumRows() == m_inOutT.GetNumElements()); assert(out.GetNumRows() == m_inOutT.GetNumElements()); assert(in.GetNumCols() == out.GetNumCols()); assert(std::isfinite(expAvgFactor) && (0 <= expAvgFactor && expAvgFactor <= 1)); assert(std::isfinite(blendFactor) && (0 <= blendFactor && blendFactor <= 1)); + // In inference mode, must only use running statistics + assert(!inferenceOnly || ((expAvgFactor == 0.0) && (blendFactor == 1.0))); assert(std::isfinite(epsilon) && epsilon > 0); if (!m_spatial) { @@ -39,28 +41,35 @@ void BatchNormEngine::Forward(const Mat& in, const Mat& scale, const M assert(runVariance.GetNumCols() == 1); EnsureCompatible(); - ForwardCore(in, scale, bias, expAvgFactor, blendFactor, runMean, runVariance, out, epsilon, saveMean, saveInvStdDev); + ForwardCore(in, scale, bias, inferenceOnly, expAvgFactor, blendFactor, runMean, runVariance, out, epsilon, savedMean, savedInvStdDev); - if (!m_spatial) + if (!inferenceOnly) { - assert(saveMean.GetNumElements() == 0 || m_inOutT.GetNumElements() == saveMean.GetNumRows()); - assert(saveInvStdDev.GetNumElements() == 0 || m_inOutT.GetNumElements() == saveInvStdDev.GetNumRows()); + assert(!savedMean.IsEmpty()); + assert(!savedInvStdDev.IsEmpty()); + if (!m_spatial) + { + assert(m_inOutT.GetNumElements() == savedMean.GetNumRows()); + assert(m_inOutT.GetNumElements() == savedInvStdDev.GetNumRows()); + } + else + { + assert((m_inOutT.GetNumElements() % savedMean.GetNumRows()) == 0); + assert((m_inOutT.GetNumElements() % savedInvStdDev.GetNumRows()) == 0); + } + assert(savedMean.GetNumCols() == 1); + assert(savedInvStdDev.GetNumCols() == 1); } - else - { - assert(saveMean.GetNumElements() == 0 || (m_inOutT.GetNumElements() % saveMean.GetNumRows()) == 0); - assert(saveInvStdDev.GetNumElements() == 0 || (m_inOutT.GetNumElements() % saveInvStdDev.GetNumRows()) == 0); - } - 
assert(saveMean.GetNumElements() == 0 || saveMean.GetNumCols() == 1); - assert(saveInvStdDev.GetNumElements() == 0 || saveInvStdDev.GetNumCols() == 1); } template void BatchNormEngine::Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, - const Mat& saveMean, const Mat& saveInvStdDev, Mat& scaleGrad, Mat& biasGrad) + const Mat& savedMean, const Mat& savedInvStdDev, Mat& scaleGrad, Mat& biasGrad) { + assert(!savedMean.IsEmpty()); + assert(!savedInvStdDev.IsEmpty()); EnsureCompatible(); - BackwardCore(in, srcGrad, grad, scale, blendFactor, saveMean, saveInvStdDev, scaleGrad, biasGrad); + BackwardCore(in, srcGrad, grad, scale, blendFactor, savedMean, savedInvStdDev, scaleGrad, biasGrad); } template @@ -89,24 +98,23 @@ protected: InvalidArgument("CNTK batch normalization supports only cudnn(CHW) layout."); } - void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance, - Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) override + void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance, + Mat& out, double epsilon, Mat& savedMean, Mat& savedInvStdDev) override { - in.BatchNormalizationForward(scale, bias, expAvgFactor, blendFactor, runMean, runVariance, out, epsilon, saveMean, saveInvStdDev); + in.BatchNormalizationForward(scale, bias, inferenceOnly, expAvgFactor, blendFactor, runMean, runVariance, out, epsilon, savedMean, savedInvStdDev); } - void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev, + void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& savedMean, const Mat& savedInvStdDev, Mat& scaleGrad, Mat& biasGrad) override { - srcGrad.BatchNormalizationBackward(in, grad, scale, blendFactor, saveMean, saveInvStdDev, scaleGrad, biasGrad); + srcGrad.BatchNormalizationBackward(in, grad, scale, blendFactor, savedMean, savedInvStdDev, scaleGrad, biasGrad); } }; template class CntkBatchNormEngine; template class CntkBatchNormEngine; -template -bool HasFlag(T src, T testFlag) +template bool HasFlag(T src, T testFlag) { return ((int)src & (int)testFlag) != 0; } diff --git a/Source/Math/BatchNormalizationEngine.h b/Source/Math/BatchNormalizationEngine.h index d4beb82ae..0f594d83c 100644 --- a/Source/Math/BatchNormalizationEngine.h +++ b/Source/Math/BatchNormalizationEngine.h @@ -34,7 +34,7 @@ public: public: virtual ~BatchNormEngine() = default; - void Forward(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance, + void Forward(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance, Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev); void Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev, @@ -56,7 +56,7 @@ protected: virtual void EnsureCompatible() = 0; // saveMean/saveInvStdDev return the actual mean/stddev used for normalization, except for blendFactor=1, these are unused and untouched - virtual void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance, + virtual void ForwardCore(const 
Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance, Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) = 0; virtual void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev, diff --git a/Source/Math/CPUMatrix.cpp b/Source/Math/CPUMatrix.cpp index 24dd8439f..1b8813445 100644 --- a/Source/Math/CPUMatrix.cpp +++ b/Source/Math/CPUMatrix.cpp @@ -4372,20 +4372,19 @@ void CPUMatrix::AveragePoolingBackward(const CPUMatrix& mpRowCol, } template -void CPUMatrix::BatchNormalizationForward(const CPUMatrix& scale, const CPUMatrix& bias, double expAvgFactor, double blendFactor, +void CPUMatrix::BatchNormalizationForward(const CPUMatrix& scale, const CPUMatrix& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, CPUMatrix& runMean, CPUMatrix& runVariance, CPUMatrix& out, double epsilon, CPUMatrix& saveMean, CPUMatrix& saveInvStdDev) const { assert((GetNumRows() % scale.GetNumRows()) == 0); - if (expAvgFactor != 0 || blendFactor != 1) + if (!inferenceOnly || expAvgFactor != 0 || blendFactor != 1) RuntimeError("Batch normalization training on CPU is not yet implemented."); saveMean.Resize(0, 0); // only doing inference: these two are not produced saveInvStdDev.Resize(0, 0); bool spatial = GetNumRows() != scale.GetNumRows(); - size_t batchSize = GetNumCols(); if (spatial) { size_t spatialSize = GetNumRows() / scale.GetNumRows(); @@ -4395,7 +4394,7 @@ void CPUMatrix::BatchNormalizationForward(const CPUMatrix& s for (long irow = 0; irow < out.GetNumRows(); irow++) { size_t imap = irow / spatialSize; - ElemType stdDev = sqrt(runVariance(imap, 0) * (batchSize - 1) / batchSize + epsilon); + ElemType stdDev = sqrt(runVariance(imap, 0) + epsilon); out(irow, icol) = scale(imap, 0) * ((*this)(irow, icol) - runMean(imap, 0)) / stdDev + bias(imap, 0); } } @@ -4407,7 +4406,7 @@ void CPUMatrix::BatchNormalizationForward(const CPUMatrix& s { for (long irow = 0; irow < out.GetNumRows(); irow++) { - ElemType stdDev = sqrt(runVariance(irow, 0) * (batchSize - 1) / batchSize + epsilon); + ElemType stdDev = sqrt(runVariance(irow, 0) + epsilon); out(irow, icol) = scale(irow, 0) * ((*this)(irow, icol) - runMean(irow, 0)) / stdDev + bias(irow, 0); } } diff --git a/Source/Math/CPUMatrix.h b/Source/Math/CPUMatrix.h index 357d2b7e6..782b31576 100644 --- a/Source/Math/CPUMatrix.h +++ b/Source/Math/CPUMatrix.h @@ -27,8 +27,8 @@ namespace Microsoft { namespace MSR { namespace CNTK { double logadd(double x, double y); -//To comply with BLAS libraries matrices are stored in ColMajor. However, by default C/C++/C# use RowMajor -//conversion is need when passing data between CPUMatrix and C++ matrices +// To comply with BLAS libraries matrices are stored in ColMajor. 
However, by default C/C++/C# use RowMajor; +// conversion is needed when passing data between CPUMatrix and C++ matrices template class MATH_API CPUMatrix : public BaseMatrix { @@ -375,7 +375,7 @@ public: void AveragePoolingBackward(const CPUMatrix& mpRowCol, const CPUMatrix& mpRowIndices, const CPUMatrix& indices, CPUMatrix& grad) const; - void BatchNormalizationForward(const CPUMatrix& scale, const CPUMatrix& bias, double expAvgFactor, double blendFactor, CPUMatrix& runMean, CPUMatrix& runStdDev, + void BatchNormalizationForward(const CPUMatrix& scale, const CPUMatrix& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, CPUMatrix& runMean, CPUMatrix& runVariance, CPUMatrix& out, double epsilon, CPUMatrix& saveMean, CPUMatrix& saveInvStdDev) const; void BatchNormalizationBackward(const CPUMatrix& in, CPUMatrix& grad, const CPUMatrix& scale, double blendFactor, const CPUMatrix& saveMean, const CPUMatrix& saveInvStdDev, CPUMatrix& scaleGrad, CPUMatrix& biasGrad) const; diff --git a/Source/Math/CntkBatchNormalization.cuh b/Source/Math/CntkBatchNormalization.cuh index ae1f77221..e6b375084 100644 --- a/Source/Math/CntkBatchNormalization.cuh +++ b/Source/Math/CntkBatchNormalization.cuh @@ -203,7 +203,7 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, assert(::isfinite(epsilon) && epsilon > 0); assert(::isfinite(expAvgFactor) && 0 <= expAvgFactor && expAvgFactor <= 1); assert(::isfinite(blendFactor) && 0 <= blendFactor && blendFactor <= 1); - assert(expAvgFactor != 0 || blendFactor != 1); // otherwise no need call (no update) + // TODO optimize for (expAvgFactor == 0) && (blendFactor == 1) int irowSrcBase = (blockIdx.x * BlockDimX + threadIdx.x) * U; if (irowSrcBase >= vectorSize) @@ -346,7 +346,7 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, im2[k] = Operations::RSqrt(static_cast(m2[k] / batchSize + epsilon)); if (blendFactor != 0) { - ElemType runInvStdDev = Operations::RSqrt(static_cast(run[k] * (batchSize - 1) / batchSize + epsilon)); + ElemType runInvStdDev = Operations::RSqrt(static_cast(run[k] + epsilon)); im2[k] = blendFactor * runInvStdDev + (1.0 - blendFactor) * im2[k]; } } @@ -376,7 +376,7 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial assert((vectorSize % spatialSize) == 0); assert(::isfinite(expAvgFactor) && 0 <= expAvgFactor && expAvgFactor <= 1); assert(::isfinite(blendFactor) && 0 <= blendFactor && blendFactor <= 1); - assert(expAvgFactor != 0 || blendFactor != 1); // otherwise no need call (no update) + // TODO optimize for (expAvgFactor == 0) && (blendFactor == 1) assert(::isfinite(epsilon) && epsilon > 0); int irowSrcBase = blockIdx.x * spatialSize + threadIdx.x * U; @@ -506,7 +506,7 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial xInvStdDev[blockIdx.x] = Operations::RSqrt(static_cast(m2[0] / (batchSize * spatialSize) + epsilon)); if (blendFactor != 0) { - ElemType runInvStdDev = Operations::RSqrt(static_cast(runVariance[blockIdx.x] * (batchSize - 1) / batchSize + epsilon)); + ElemType runInvStdDev = Operations::RSqrt(static_cast(runVariance[blockIdx.x] + epsilon)); xInvStdDev[blockIdx.x] = blendFactor * runInvStdDev + (1.0 - blendFactor) * xInvStdDev[blockIdx.x]; } } @@ -729,7 +729,7 @@ struct NormalizeBatchTraining template __global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, const ElemType* x, const ElemType* dy, ElemType* dScale, ElemType* dBias, - const ElemType* saveMean, const ElemType* 
saveInvStdDev) + const ElemType* savedMean, const ElemType* savedInvStdDev) { static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32)."); static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32)."); @@ -754,8 +754,8 @@ __global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, con // Read mean and inv std dev. if (threadIdx.y == 0) { - LoadValues(saveMean + irowSrcBase, mean); - LoadValues(saveInvStdDev + irowSrcBase, invStdDev); + LoadValues(savedMean + irowSrcBase, mean); + LoadValues(savedInvStdDev + irowSrcBase, invStdDev); StoreValues(mean, &meanS[threadIdx.x * U]); StoreValues(invStdDev, &invStdDevS[threadIdx.x * U]); } @@ -833,7 +833,7 @@ __global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, con template __global__ void kComputeSpatialScaleAndBiasGradients(int vectorSize, int spatialSize, int batchSize, const ElemType* x, const ElemType* dy, - ElemType* dScale, ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev) + ElemType* dScale, ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev) { static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32)."); static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32)."); @@ -859,8 +859,8 @@ __global__ void kComputeSpatialScaleAndBiasGradients(int vectorSize, int spatial // Read mean and inv std dev. if (tid == 0) { - meanS = saveMean[blockIdx.x]; - invStdDevS = saveInvStdDev[blockIdx.x]; + meanS = savedMean[blockIdx.x]; + invStdDevS = savedInvStdDev[blockIdx.x]; } __syncthreads(); if (tid != 0) @@ -922,7 +922,7 @@ struct ComputeScaleAndBiasGradients { template static void Call(size_t vectorSize, size_t batchSize, const ElemType* x, const ElemType* dy, - ElemType* dScale, ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream) + ElemType* dScale, ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev, cudaStream_t stream) { assert((vectorSize % U) == 0); const int BlockDimX = 32 / U; @@ -931,7 +931,7 @@ struct ComputeScaleAndBiasGradients // Create a grid that has uses striding in y-dimension to cover whole minibatch. auto gdim = dim3(static_cast(RoundUpToMultiple(vectorSize, BlockDimX * U))); kComputeScaleAndBiasGradients<<>>( - static_cast(vectorSize), static_cast(batchSize), x, dy, dScale, dBias, saveMean, saveInvStdDev); + static_cast(vectorSize), static_cast(batchSize), x, dy, dScale, dBias, savedMean, savedInvStdDev); } }; @@ -940,7 +940,7 @@ struct ComputeSpatialScaleAndBiasGradients { template static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, const ElemType* x, const ElemType* dy, - ElemType* dScale, ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream) + ElemType* dScale, ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev, cudaStream_t stream) { assert((spatialSize % U) == 0); assert((vectorSize % spatialSize) == 0); @@ -951,7 +951,7 @@ struct ComputeSpatialScaleAndBiasGradients // Create a grid that has uses striding in y-dimension to cover whole minibatch. 
auto gdim = dim3(static_cast(vectorSize / spatialSize)); kComputeSpatialScaleAndBiasGradients<<>>( - static_cast(vectorSize), static_cast(spatialSize), static_cast(batchSize), x, dy, dScale, dBias, saveMean, saveInvStdDev); + static_cast(vectorSize), static_cast(spatialSize), static_cast(batchSize), x, dy, dScale, dBias, savedMean, savedInvStdDev); } }; @@ -959,7 +959,7 @@ struct ComputeSpatialScaleAndBiasGradients template __global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize, int batchSize, const ElemType* x, const ElemType* dy, ElemType* dx, const ElemType* bnScale, ElemType mbStatsWeight, const ElemType* dScale, const ElemType* dBias, - const ElemType* saveMean, const ElemType* saveInvStdDev) + const ElemType* savedMean, const ElemType* savedInvStdDev) { static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32)."); static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32)."); @@ -991,8 +991,8 @@ __global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize scale[k] = bnScale[imap]; ds[k] = dScale[imap]; db[k] = dBias[imap]; - mean[k] = saveMean[imap]; - invStdDev[k] = saveInvStdDev[imap]; + mean[k] = savedMean[imap]; + invStdDev[k] = savedInvStdDev[imap]; } } else @@ -1000,8 +1000,8 @@ __global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize LoadValues(bnScale + irowBase, scale); LoadValues(dScale + irowBase, ds); LoadValues(dBias + irowBase, db); - LoadValues(saveMean + irowBase, mean); - LoadValues(saveInvStdDev + irowBase, invStdDev); + LoadValues(savedMean + irowBase, mean); + LoadValues(savedInvStdDev + irowBase, invStdDev); } int icol = blockIdx.y * BlockDimY + threadIdx.y; @@ -1053,7 +1053,7 @@ struct BackpropagateBatchNormGradients template static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, bool spatial, const ElemType* x, const ElemType* dy, ElemType* dx, const ElemType* bnScale, ElemType mbStatsWeight, const ElemType* dScale, - const ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream) + const ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev, cudaStream_t stream) { assert((vectorSize % U) == 0); const int BlockDimX = 32 / U; @@ -1064,12 +1064,12 @@ struct BackpropagateBatchNormGradients if (spatial) { kBackpropagateBatchNormGradients<<>>( - static_cast(vectorSize), static_cast(spatialSize), static_cast(batchSize), x, dy, dx, bnScale, mbStatsWeight, dScale, dBias, saveMean, saveInvStdDev); + static_cast(vectorSize), static_cast(spatialSize), static_cast(batchSize), x, dy, dx, bnScale, mbStatsWeight, dScale, dBias, savedMean, savedInvStdDev); } else { kBackpropagateBatchNormGradients<<>>( - static_cast(vectorSize), static_cast(spatialSize), static_cast(batchSize), x, dy, dx, bnScale, mbStatsWeight, dScale, dBias, saveMean, saveInvStdDev); + static_cast(vectorSize), static_cast(spatialSize), static_cast(batchSize), x, dy, dx, bnScale, mbStatsWeight, dScale, dBias, savedMean, savedInvStdDev); } } }; diff --git a/Source/Math/CuDnnBatchNormalization.cu b/Source/Math/CuDnnBatchNormalization.cu index 3dac26fa4..584c2943b 100644 --- a/Source/Math/CuDnnBatchNormalization.cu +++ b/Source/Math/CuDnnBatchNormalization.cu @@ -42,9 +42,11 @@ protected: InvalidArgument("cuDNN batch normalization supports tensors of max 4 dimensions."); } - void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double 
blendFactor, Mat& runMean, Mat& runStdDev, - Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) override + void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance, + Mat& out, double epsilon, Mat& savedMean, Mat& savedInvStdDev) override { + UNUSED(inferenceOnly); // TODO + // REVIEW alexeyk: there might be a way to do this in cuDNN. if (blendFactor != 0 && (blendFactor != 1 || expAvgFactor > 0)) InvalidArgument("cuDNN batch normalization engine currently supports blendTimeConstant of 0 or 1 only."); @@ -53,25 +55,25 @@ protected: cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION; // cuDNN will fail with BAD_PARAM if epsilon < CUDNN_BN_MIN_EPSILON. epsilon = max(epsilon, CUDNN_BN_MIN_EPSILON); - // expAvgFactor == 0 && blendFactor == 1 means we are in inference mode. - if (expAvgFactor == 0 && blendFactor == 1) + if (inferenceOnly) { - saveMean.Resize(0, 0); // (these are not produced in this case) - saveInvStdDev.Resize(0, 0); + assert(expAvgFactor == 0 && blendFactor == 1); + savedMean.Resize(0, 0); // (these are not produced in this case) + savedInvStdDev.Resize(0, 0); CUDNN_CALL(cudnnBatchNormalizationForwardInference(*m_cudnn, mode, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(out), - m_scaleBiasCuDnnT, ptr(scale), ptr(bias), ptr(runMean), ptr(runStdDev), epsilon)); + m_scaleBiasCuDnnT, ptr(scale), ptr(bias), ptr(runMean), ptr(runVariance), epsilon)); } else { - saveMean.Resize(runMean); - saveInvStdDev.Resize(runMean); + savedMean.Resize(runMean); + savedInvStdDev.Resize(runMean); CUDNN_CALL(cudnnBatchNormalizationForwardTraining(*m_cudnn, mode, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in), - m_inOutCuDnnT, ptr(out), m_scaleBiasCuDnnT, ptr(scale), ptr(bias), expAvgFactor, ptr(runMean), ptr(runStdDev), - epsilon, ptr(saveMean), ptr(saveInvStdDev))); + m_inOutCuDnnT, ptr(out), m_scaleBiasCuDnnT, ptr(scale), ptr(bias), expAvgFactor, ptr(runMean), ptr(runVariance), + epsilon, ptr(savedMean), ptr(savedInvStdDev))); } } - void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev, + void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& savedMean, const Mat& savedInvStdDev, Mat& scaleGrad, Mat& biasGrad) override { UNUSED(blendFactor); // BUGBUG: It should be used. @@ -79,7 +81,7 @@ protected: cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION; // REVIEW alexeyk: change betaParamDiff to 1 and update CNTK BN engine. CUDNN_CALL(cudnnBatchNormalizationBackward(*m_cudnn, mode, &C::One, &C::One, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(srcGrad), m_inOutCuDnnT, ptr(grad), - m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev))); + m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(savedMean), ptr(savedInvStdDev))); } private: diff --git a/Source/Math/CuDnnCommon.h b/Source/Math/CuDnnCommon.h index 95d48aeba..743f5a86e 100644 --- a/Source/Math/CuDnnCommon.h +++ b/Source/Math/CuDnnCommon.h @@ -9,7 +9,7 @@ #include "TensorShape.h" #include #if CUDNN_MAJOR < 5 -#error CNTK needs CuDNN version 5.0 or higher, cf. 
https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#cudnn or https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Linux#cudnn. +#error CNTK requires the NVIDIA cuDNN library 5.0 or higher to build, cf. https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#cudnn or https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Linux#cudnn for installation instructions. #endif #include diff --git a/Source/Math/GPUMatrix.cu b/Source/Math/GPUMatrix.cu index 4bd8d028b..d3d40ac83 100644 --- a/Source/Math/GPUMatrix.cu +++ b/Source/Math/GPUMatrix.cu @@ -67,7 +67,7 @@ cudaStream_t MATH_API GetStream() template \ GPUMatrix& GPUMatrix::Inplace##f() \ { \ - performElementWiseFunction(ElementWiseOperator::op##f, Data()); \ + performElementWiseFunction(ElementWiseOperator::op##f, Data()); \ return *this; \ } #define DEF_ELEMWISE_ASSIGN_FUNC(f) \ @@ -77,8 +77,8 @@ cudaStream_t MATH_API GetStream() if (a.IsEmpty()) \ LogicError("Assign##f##Of: Matrix a is empty."); \ if (this != &a) \ - RequireSize(a.GetNumRows(), a.GetNumCols()); \ - performElementWiseFunction(ElementWiseOperator::op##f, a.Data()); \ + RequireSize(a.GetNumRows(), a.GetNumCols()); \ + performElementWiseFunction(ElementWiseOperator::op##f, a.Data()); \ return *this; \ } @@ -3160,12 +3160,13 @@ void GPUMatrix::AveragePoolingBackward(const GPUMatrix& mpRowCol, Data(), (int)GetNumRows(), grad.Data(), (int)grad.GetNumRows()); } -// returns saveMean/saveInvStdDev which are the actual values used to perform the normalization, except for blendFactor 1, in which case they are unused and set to empty +// returns savedMean/savedInvStdDev which are the actual values used to perform the normalization, except for blendFactor 1, in which case they are unused and set to empty template -void GPUMatrix::BatchNormalizationForward(const GPUMatrix& scale, const GPUMatrix& bias, double expAvgFactor, double blendFactor, +void GPUMatrix::BatchNormalizationForward(const GPUMatrix& scale, const GPUMatrix& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, GPUMatrix& runMean, GPUMatrix& runVariance, GPUMatrix& out, double epsilon, - GPUMatrix& saveMean, GPUMatrix& saveInvStdDev) const + GPUMatrix& savedMean, GPUMatrix& savedInvStdDev) const { + UNUSED(inferenceOnly); // TODO assert((GetNumRows() % scale.GetNumRows()) == 0); bool spatial = GetNumRows() != scale.GetNumRows(); @@ -3178,52 +3179,54 @@ void GPUMatrix::BatchNormalizationForward(const GPUMatrix& s assert(0 < batchSize && batchSize <= std::numeric_limits::max()); SyncGuard syncGuard; - if (expAvgFactor > 0 || blendFactor < 1) + //if (expAvgFactor > 0 || blendFactor < 1) + if (inferenceOnly) { - // Compute data mean and inverse standard deviation (into saveMean and - // saveInvStdDev), and update running mean and variance. + // Pick running statistics for normalizing. No update required, and + // saved statistics do not need to be produced. + assert(expAvgFactor == 0 && blendFactor == 1); + normalizeRunningStats = true; + savedMean.RequireSize(0, 0); + savedInvStdDev.RequireSize(0, 0); + } + else + { + // Compute data mean and inverse standard deviation (into savedMean and + // savedInvStdDev), and update running mean and variance. + // TODO expAvgFactor == 0 && blendFactor == 1 can be optimized (no need for update). 
normalizeRunningStats = false; - saveMean.RequireSize(runMean); - saveInvStdDev.RequireSize(runMean); + savedMean.RequireSize(runMean); + savedInvStdDev.RequireSize(runMean); if (spatial) { Call(spatialSize, vectorSize, spatialSize, batchSize, Data(), expAvgFactor, blendFactor, runMean.Data(), runVariance.Data(), epsilon, - saveMean.Data(), saveInvStdDev.Data(), GetStream()); + savedMean.Data(), savedInvStdDev.Data(), GetStream()); } else { Call(vectorSize, vectorSize, batchSize, Data(), expAvgFactor, blendFactor, runMean.Data(), runVariance.Data(), epsilon, - saveMean.Data(), saveInvStdDev.Data(), GetStream()); + savedMean.Data(), savedInvStdDev.Data(), GetStream()); } } - else - { - // With expAvgFactor == 0 and blendFactor == 1 the running statistics - // do not need to be updated. CNTK engine in this case returns saveMean - // and saveInvStdDev empty, but cuDNN engine does not. - normalizeRunningStats = true; - saveMean.RequireSize(0, 0); - saveInvStdDev.RequireSize(0, 0); - } Call(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize, spatial, normalizeRunningStats, epsilon, Data(), out.Data(), scale.Data(), bias.Data(), runMean.Data(), runVariance.Data(), - saveMean.Data(), saveInvStdDev.Data(), + savedMean.Data(), savedInvStdDev.Data(), GetStream()); } -// saveMean/saveInvStdDev are the interpolated mean/inverse standard deviation as used in ForwardProp(). +// savedMean/savedInvStdDev are the interpolated mean/inverse standard deviation as used in ForwardProp(). // For blendFactor=1, they are not used and can be uninitialized or empty. template void GPUMatrix::BatchNormalizationBackward(const GPUMatrix& in, GPUMatrix& grad, const GPUMatrix& scale, double blendFactor, - const GPUMatrix& saveMean, const GPUMatrix& saveInvStdDev, + const GPUMatrix& savedMean, const GPUMatrix& savedInvStdDev, GPUMatrix& scaleGrad, GPUMatrix& biasGrad) const { assert((GetNumRows() % scale.GetNumRows()) == 0); @@ -3240,16 +3243,16 @@ void GPUMatrix::BatchNormalizationBackward(const GPUMatrix& if (spatial) { Call(spatialSize, vectorSize, spatialSize, batchSize, in.Data(), Data(), scaleGrad.Data(), biasGrad.Data(), - saveMean.Data(), saveInvStdDev.Data(), GetStream()); + savedMean.Data(), savedInvStdDev.Data(), GetStream()); } else { Call(vectorSize, vectorSize, batchSize, in.Data(), Data(), scaleGrad.Data(), biasGrad.Data(), - saveMean.Data(), saveInvStdDev.Data(), GetStream()); + savedMean.Data(), savedInvStdDev.Data(), GetStream()); } ElemType mbStatsWeight = (ElemType)(1 - blendFactor); // weight for contribution from actual MB stats (0 if none, e.g. locked BN node) Call(spatial ? 
spatialSize : vectorSize, vectorSize, spatialSize, batchSize, spatial, - in.Data(), Data(), grad.Data(), scale.Data(), mbStatsWeight, scaleGrad.Data(), biasGrad.Data(), saveMean.Data(), saveInvStdDev.Data(), GetStream()); + in.Data(), Data(), grad.Data(), scale.Data(), mbStatsWeight, scaleGrad.Data(), biasGrad.Data(), savedMean.Data(), savedInvStdDev.Data(), GetStream()); } #pragma region Static BLAS Functions diff --git a/Source/Math/GPUMatrix.h b/Source/Math/GPUMatrix.h index 8930af586..85a3753db 100644 --- a/Source/Math/GPUMatrix.h +++ b/Source/Math/GPUMatrix.h @@ -467,8 +467,8 @@ public: void AveragePoolingForward(const GPUMatrix& mpRowCol, const GPUMatrix& mpRowIndices, const GPUMatrix& indices, GPUMatrix& output) const; void AveragePoolingBackward(const GPUMatrix& mpRowCol, const GPUMatrix& mpRowIndices, const GPUMatrix& indices, GPUMatrix& grad) const; - void BatchNormalizationForward(const GPUMatrix& scale, const GPUMatrix& bias, double expAvgFactor, double blendFactor, - GPUMatrix& runMean, GPUMatrix& runStdDev, GPUMatrix& out, double epsilon, + void BatchNormalizationForward(const GPUMatrix& scale, const GPUMatrix& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, + GPUMatrix& runMean, GPUMatrix& runVariance, GPUMatrix& out, double epsilon, GPUMatrix& saveMean, GPUMatrix& saveInvStdDev) const; void BatchNormalizationBackward(const GPUMatrix& in, GPUMatrix& grad, const GPUMatrix& scale, double blendFactor, const GPUMatrix& saveMean, const GPUMatrix& saveInvStdDev, diff --git a/Source/Math/MathCUDA.vcxproj b/Source/Math/MathCUDA.vcxproj index 401704daa..a88a253b6 100644 --- a/Source/Math/MathCUDA.vcxproj +++ b/Source/Math/MathCUDA.vcxproj @@ -197,6 +197,6 @@ if exist "$(CuDnnDll)" xcopy /D /Y "$(CuDnnDll)" "$(OutputPath)" - + diff --git a/Source/Math/Matrix.cpp b/Source/Math/Matrix.cpp index 49a3accae..12dc05eb8 100644 --- a/Source/Math/Matrix.cpp +++ b/Source/Math/Matrix.cpp @@ -4274,8 +4274,8 @@ void Matrix::AveragePoolingBackward(const Matrix& mpRowCol, const } template -void Matrix::BatchNormalizationForward(const Matrix& scale, const Matrix& bias, double expAvgFactor, double blendFactor, - Matrix& runMean, Matrix& runStdDev, Matrix& out, double epsilon, +void Matrix::BatchNormalizationForward(const Matrix& scale, const Matrix& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, + Matrix& runMean, Matrix& runVariance, Matrix& out, double epsilon, Matrix& saveMean, Matrix& saveInvStdDev) const { DecideAndMoveToRightDevice(*this, out); @@ -4283,11 +4283,11 @@ void Matrix::BatchNormalizationForward(const Matrix& scale, // REVIEW alexeyk: add sparse version. 
 
 #pragma region Static BLAS Functions
diff --git a/Source/Math/GPUMatrix.h b/Source/Math/GPUMatrix.h
index 8930af586..85a3753db 100644
--- a/Source/Math/GPUMatrix.h
+++ b/Source/Math/GPUMatrix.h
@@ -467,8 +467,8 @@ public:
     void AveragePoolingForward(const GPUMatrix& mpRowCol, const GPUMatrix& mpRowIndices, const GPUMatrix& indices, GPUMatrix& output) const;
     void AveragePoolingBackward(const GPUMatrix& mpRowCol, const GPUMatrix& mpRowIndices, const GPUMatrix& indices, GPUMatrix& grad) const;
 
-    void BatchNormalizationForward(const GPUMatrix& scale, const GPUMatrix& bias, double expAvgFactor, double blendFactor,
-                                   GPUMatrix& runMean, GPUMatrix& runStdDev, GPUMatrix& out, double epsilon,
+    void BatchNormalizationForward(const GPUMatrix& scale, const GPUMatrix& bias, bool inferenceOnly, double expAvgFactor, double blendFactor,
+                                   GPUMatrix& runMean, GPUMatrix& runVariance, GPUMatrix& out, double epsilon,
                                    GPUMatrix& saveMean, GPUMatrix& saveInvStdDev) const;
     void BatchNormalizationBackward(const GPUMatrix& in, GPUMatrix& grad, const GPUMatrix& scale, double blendFactor,
                                     const GPUMatrix& saveMean, const GPUMatrix& saveInvStdDev,
diff --git a/Source/Math/MathCUDA.vcxproj b/Source/Math/MathCUDA.vcxproj
index 401704daa..a88a253b6 100644
--- a/Source/Math/MathCUDA.vcxproj
+++ b/Source/Math/MathCUDA.vcxproj
@@ -197,6 +197,6 @@
     if exist "$(CuDnnDll)" xcopy /D /Y "$(CuDnnDll)" "$(OutputPath)"
-
+
diff --git a/Source/Math/Matrix.cpp b/Source/Math/Matrix.cpp
index 49a3accae..12dc05eb8 100644
--- a/Source/Math/Matrix.cpp
+++ b/Source/Math/Matrix.cpp
@@ -4274,8 +4274,8 @@ void Matrix::AveragePoolingBackward(const Matrix& mpRowCol, const
 }
 
 template 
-void Matrix::BatchNormalizationForward(const Matrix& scale, const Matrix& bias, double expAvgFactor, double blendFactor,
-                                       Matrix& runMean, Matrix& runStdDev, Matrix& out, double epsilon,
+void Matrix::BatchNormalizationForward(const Matrix& scale, const Matrix& bias, bool inferenceOnly, double expAvgFactor, double blendFactor,
+                                       Matrix& runMean, Matrix& runVariance, Matrix& out, double epsilon,
                                        Matrix& saveMean, Matrix& saveInvStdDev) const
 {
     DecideAndMoveToRightDevice(*this, out);
@@ -4283,11 +4283,11 @@ void Matrix::BatchNormalizationForward(const Matrix& scale,
     // REVIEW alexeyk: add sparse version.
     DISPATCH_MATRIX_ON_FLAG(this,
                             this,
-                            m_CPUMatrix->BatchNormalizationForward(*(scale.m_CPUMatrix), *(bias.m_CPUMatrix), expAvgFactor, blendFactor,
-                                                                   *(runMean.m_CPUMatrix), *(runStdDev.m_CPUMatrix),
+                            m_CPUMatrix->BatchNormalizationForward(*(scale.m_CPUMatrix), *(bias.m_CPUMatrix), inferenceOnly, expAvgFactor, blendFactor,
+                                                                   *(runMean.m_CPUMatrix), *(runVariance.m_CPUMatrix),
                                                                    *(out.m_CPUMatrix), epsilon, *(saveMean.m_CPUMatrix), *(saveInvStdDev.m_CPUMatrix)),
-                            m_GPUMatrix->BatchNormalizationForward(*(scale.m_GPUMatrix), *(bias.m_GPUMatrix), expAvgFactor, blendFactor,
-                                                                   *(runMean.m_GPUMatrix), *(runStdDev.m_GPUMatrix),
+                            m_GPUMatrix->BatchNormalizationForward(*(scale.m_GPUMatrix), *(bias.m_GPUMatrix), inferenceOnly, expAvgFactor, blendFactor,
+                                                                   *(runMean.m_GPUMatrix), *(runVariance.m_GPUMatrix),
                                                                    *(out.m_GPUMatrix), epsilon, *(saveMean.m_GPUMatrix), *(saveInvStdDev.m_GPUMatrix)),
                             NOT_IMPLEMENTED,
                             NOT_IMPLEMENTED);
diff --git a/Source/Math/Matrix.h b/Source/Math/Matrix.h
index 150ef865e..28a4c1f40 100644
--- a/Source/Math/Matrix.h
+++ b/Source/Math/Matrix.h
@@ -500,8 +500,8 @@ public:
     void AveragePoolingForward(const Matrix& mpRowCol, const Matrix& mpRowIndices, const Matrix& indices, Matrix& output) const;
     void AveragePoolingBackward(const Matrix& mpRowCol, const Matrix& mpRowIndices, const Matrix& indices, Matrix& grad) const;
 
-    void BatchNormalizationForward(const Matrix& scale, const Matrix& bias, double expAvgFactor, double blendFactor,
-                                   Matrix& runMean, Matrix& runStdDev, Matrix& out, double epsilon,
+    void BatchNormalizationForward(const Matrix& scale, const Matrix& bias, bool inferenceOnly, double expAvgFactor, double blendFactor,
+                                   Matrix& runMean, Matrix& runVariance, Matrix& out, double epsilon,
                                    Matrix& saveMean, Matrix& saveInvStdDev) const;
     void BatchNormalizationBackward(const Matrix& in, Matrix& grad, const Matrix& scale, double blendFactor, const Matrix& saveMean, const Matrix& saveInvStdDev,
                                     Matrix& scaleGrad, Matrix& biasGrad) const;
diff --git a/Source/Math/NoGPU.cpp b/Source/Math/NoGPU.cpp
index 55596854c..7f21caa56 100644
--- a/Source/Math/NoGPU.cpp
+++ b/Source/Math/NoGPU.cpp
@@ -1827,7 +1827,7 @@ void GPUMatrix::AveragePoolingBackward(const GPUMatrix& mpRowCol,
 template 
 void GPUMatrix::BatchNormalizationForward(const GPUMatrix& scale, const GPUMatrix& bias, double expAvgFactor, double blendFactor,
-                                          GPUMatrix& runMean, GPUMatrix& runStdDev, GPUMatrix& out, double epsilon,
+                                          GPUMatrix& runMean, GPUMatrix& runVariance, GPUMatrix& out, double epsilon,
                                           GPUMatrix& saveMean, GPUMatrix& saveInvStdDev) const
 {
 }
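Aside (illustration only, not code from this patch): the new inferenceOnly parameter threaded through these signatures separates the training path, where minibatch statistics are computed and the running ones possibly updated, from pure evaluation, where the frozen running statistics are the only ones used (this corresponds to expAvgFactor == 0 and blendFactor == 1 in the training-path code). A minimal stand-alone sketch of the evaluation-time computation for one feature; the function name is invented and the real implementations live in the CPU/GPU matrix classes and the cuDNN engine:

#include <cmath>
#include <cstddef>
#include <vector>

// Inference-only batch normalization: no statistics are computed or updated,
// runMean/runVariance are read-only inputs.
std::vector<double> InferenceOnlyForward(const std::vector<double>& x,
                                         double scale, double bias,
                                         double runMean, double runVariance, double epsilon)
{
    const double invStdDev = 1.0 / std::sqrt(runVariance + epsilon);
    std::vector<double> y(x.size());
    for (std::size_t i = 0; i < x.size(); ++i)
        y[i] = scale * (x[i] - runMean) * invStdDev + bias;
    return y;
}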
diff --git a/Tests/EndToEndTests/BatchNormalization/NonSpatial/01_OneHidden.cntk b/Tests/EndToEndTests/BatchNormalization/NonSpatial/01_OneHidden.cntk
index c3df1a844..2fd994497 100644
--- a/Tests/EndToEndTests/BatchNormalization/NonSpatial/01_OneHidden.cntk
+++ b/Tests/EndToEndTests/BatchNormalization/NonSpatial/01_OneHidden.cntk
@@ -1,3 +1,7 @@
+# Synthetic test for non-spatial batch normalization, (almost) in isolation of
+# other nodes (in particular Convolution). This is not a general example and
+# does not document best practices. The network was based on MNIST/01_OneHidden.
+
 rootDir = ".."
 
 configDir = "$rootDir$/Config"
@@ -17,10 +21,6 @@
 traceLevel = 1
 batchNormalizationEngine = "testMustOverrideBatchNormalizationEngine"
 
-#######################################
-#           TRAINING CONFIG           #
-#######################################
-
 train = [
     action = "train"
@@ -54,10 +54,6 @@ train = [
     ]
 ]
 
-#######################################
-#             TEST CONFIG             #
-#######################################
-
 test = [
     action = "test"
     minibatchSize = 1024    # reduce this if you run out of memory
diff --git a/Tests/EndToEndTests/BatchNormalization/NonSpatial/01_OneHidden.ndl b/Tests/EndToEndTests/BatchNormalization/NonSpatial/01_OneHidden.ndl
index 6a51e0aca..aa717aec1 100644
--- a/Tests/EndToEndTests/BatchNormalization/NonSpatial/01_OneHidden.ndl
+++ b/Tests/EndToEndTests/BatchNormalization/NonSpatial/01_OneHidden.ndl
@@ -22,8 +22,8 @@ DNN = [
     b = LearnableParameter(hiddenDim, 1, init = fixedValue, value = 0)
     sc = LearnableParameter(hiddenDim, 1, init = fixedValue, value = 1)
     m = LearnableParameter(hiddenDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
-    var = LearnableParameter(hiddenDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
-    y = BatchNormalization(h1, sc, b, m, var, eval=false, spatial=false, normalizationTimeConstant=64, imageLayout=cudnn, engine=$batchNormalizationEngine$)
+    v = LearnableParameter(hiddenDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    y = BatchNormalization(h1, sc, b, m, v, eval=false, spatial=false, normalizationTimeConstant=64, imageLayout=cudnn, engine=$batchNormalizationEngine$)
 
     ol = DNNLayer(hiddenDim, labelDim, y, 1)
diff --git a/Tests/EndToEndTests/BatchNormalization/Spatial/02_BatchNormConv.cntk b/Tests/EndToEndTests/BatchNormalization/Spatial/02_BatchNormConv.cntk
index baab959f7..1ddd4590b 100644
--- a/Tests/EndToEndTests/BatchNormalization/Spatial/02_BatchNormConv.cntk
+++ b/Tests/EndToEndTests/BatchNormalization/Spatial/02_BatchNormConv.cntk
@@ -1,3 +1,7 @@
+# Synthetic test for spatial batch normalization, (almost) in isolation of
+# other nodes (in particular Convolution). This is not a general example and
+# does not document best practices. The network was based on CIFAR-10/02_BatchNormConv.
+
 RootDir = "."
 
 ConfigDir = "$RootDir$"
diff --git a/Tests/EndToEndTests/BatchNormalization/Spatial/02_BatchNormConv.ndl b/Tests/EndToEndTests/BatchNormalization/Spatial/02_BatchNormConv.ndl
index 38e3a4e79..eb04ba04f 100644
--- a/Tests/EndToEndTests/BatchNormalization/Spatial/02_BatchNormConv.ndl
+++ b/Tests/EndToEndTests/BatchNormalization/Spatial/02_BatchNormConv.ndl
@@ -22,8 +22,7 @@ ndlMnistMacros = [
     scValue = 1
 
     # Batch normalization time constant.
-    #bnTimeConst = 4096
-    bnTimeConst = 2048
+    bnTimeConst = 0
 
     fc1WScale = 12
     fc1BValue = 0
@@ -42,9 +41,9 @@ DNN=[
     b = LearnableParameter(cMap1, 1, init = fixedValue, value = conv1BValue)
     sc = LearnableParameter(cMap1, 1, init = fixedValue, value = scValue)
     m = LearnableParameter(cMap1, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
-    var = LearnableParameter(cMap1, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    v = LearnableParameter(cMap1, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
 
-    y = BatchNormalization(featScaled, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$, engine=$batchNormalizationEngine$)
+    y = BatchNormalization(featScaled, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$, engine=$batchNormalizationEngine$)
     conv1 = RectifiedLinear(y)
 
     # pool1
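Aside on the bnTimeConst = 0 change above (illustration only, not code from this patch): as I understand the BatchNormalization node, normalizationTimeConstant is the time constant, in samples, of a first-order low-pass filter over the minibatch statistics, and it is translated into the expAvgFactor used by the forward pass; a value of 0 makes the running statistics track the most recent minibatch exactly, which keeps this synthetic test deterministic. A sketch of that mapping, stated as an assumption rather than as the node's exact code:

#include <cmath>
#include <cstddef>
#include <limits>

// Hypothetical illustration of mapping a normalization time constant (in samples) to a
// per-minibatch expAvgFactor; the authoritative mapping lives in the BatchNormalization
// node (TrainingNodes.h), not here.
double ExpAvgFactorFromTimeConstant(double timeConstant, std::size_t minibatchSize)
{
    if (timeConstant == 0)                                       // 0: use only the current minibatch
        return 1.0;
    if (timeConstant == std::numeric_limits<double>::infinity()) // infinity: never update running stats
        return 0.0;
    return 1.0 - std::exp(-static_cast<double>(minibatchSize) / timeConstant); // low-pass filter
}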
diff --git a/Tests/EndToEndTests/BatchNormalization/Spatial/CNTK/testcases.yml b/Tests/EndToEndTests/BatchNormalization/Spatial/CNTK/testcases.yml
index 459fd20b5..bacd1cc83 100644
--- a/Tests/EndToEndTests/BatchNormalization/Spatial/CNTK/testcases.yml
+++ b/Tests/EndToEndTests/BatchNormalization/Spatial/CNTK/testcases.yml
@@ -19,13 +19,13 @@ testCases:
   Epochs must be finished with expected results:
     patterns:
       - Finished Epoch
-      - CE = {{float,tolerance=2.0%}} * {{integer}}
-      - Err = {{float,tolerance=2.0%}} * {{integer}}
+      - CE = {{float,tolerance=0.1%}} * {{integer}}
+      - Err = {{float,tolerance=0.1%}} * {{integer}}
       - totalSamplesSeen = {{integer}}
       - learningRatePerSample = {{float,tolerance=0.001%}}
 
   Final test results must match:
     patterns:
       - "Final Results: Minibatch[{{integer}}-{{integer}}]"
-      - Err = {{float,tolerance=2.0%}} * {{integer}}
-      - CE = {{float,tolerance=2.0%}} * {{integer}}
+      - Err = {{float,tolerance=0.1%}} * {{integer}}
+      - CE = {{float,tolerance=0.1%}} * {{integer}}
diff --git a/Tests/EndToEndTests/BatchNormalization/Spatial/CuDNN/testcases.yml b/Tests/EndToEndTests/BatchNormalization/Spatial/CuDNN/testcases.yml
index 459fd20b5..bacd1cc83 100644
--- a/Tests/EndToEndTests/BatchNormalization/Spatial/CuDNN/testcases.yml
+++ b/Tests/EndToEndTests/BatchNormalization/Spatial/CuDNN/testcases.yml
@@ -19,13 +19,13 @@ testCases:
   Epochs must be finished with expected results:
     patterns:
       - Finished Epoch
-      - CE = {{float,tolerance=2.0%}} * {{integer}}
-      - Err = {{float,tolerance=2.0%}} * {{integer}}
+      - CE = {{float,tolerance=0.1%}} * {{integer}}
+      - Err = {{float,tolerance=0.1%}} * {{integer}}
       - totalSamplesSeen = {{integer}}
       - learningRatePerSample = {{float,tolerance=0.001%}}
 
   Final test results must match:
     patterns:
       - "Final Results: Minibatch[{{integer}}-{{integer}}]"
-      - Err = {{float,tolerance=2.0%}} * {{integer}}
-      - CE = {{float,tolerance=2.0%}} * {{integer}}
+      - Err = {{float,tolerance=0.1%}} * {{integer}}
+      - CE = {{float,tolerance=0.1%}} * {{integer}}
diff --git a/Tests/EndToEndTests/BatchNormalization/Spatial/Macros.ndl b/Tests/EndToEndTests/BatchNormalization/Spatial/Macros.ndl
index 6c783b860..731432b1a 100644
--- a/Tests/EndToEndTests/BatchNormalization/Spatial/Macros.ndl
+++ b/Tests/EndToEndTests/BatchNormalization/Spatial/Macros.ndl
@@ -21,10 +21,10 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeCo
     b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue)
     sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue)
     m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
-    var = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    v = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
 
     c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = $imageLayout$)
-    y = BatchNormalization(c, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$, engine=cudnn)
+    y = BatchNormalization(c, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$, engine=cudnn)
 ]
 
 ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
@@ -44,10 +44,10 @@ ProjLayer(W, inp, outMap, hStride, vStride, bValue, scValue, bnTimeConst)
     b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue)
     sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue)
     m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
-    var = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    v = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
 
     c = Convolution(W, inp, 1, 1, outMap, hStride, vStride, zeroPadding = false, imageLayout = $imageLayout$)
-    y = BatchNormalization(c, sc, b, m, var, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$, engine=cudnn)
+    y = BatchNormalization(c, sc, b, m, v, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$, engine=cudnn)
 ]
 
 ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue, bnTimeConst)
@@ -113,9 +113,9 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst)
     b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
     sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
     m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
-    var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    v = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
     t = Times(W, x)
-    bn = BatchNormalization(t, sc, b, m, var, spatial = false, normalizationTimeConstant = bnTimeConst, engine=cudnn)
+    bn = BatchNormalization(t, sc, b, m, v, spatial = false, normalizationTimeConstant = bnTimeConst, engine=cudnn)
     y = RectifiedLinear(bn)
 ]
 
@@ -125,9 +125,9 @@ DnnImageBNReLULayer(inW, inH, inC, outDim, x, wScale, bValue, scValue, bnTimeCon
     b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
     sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
     m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
-    var = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
+    v = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
     t = Times(W, x)
-    bn = BatchNormalization(t, sc, b, m, var, spatial = false, normalizationTimeConstant = bnTimeConst, engine=cudnn)
+    bn = BatchNormalization(t, sc, b, m, v, spatial = false, normalizationTimeConstant = bnTimeConst, engine=cudnn)
     y = RectifiedLinear(bn)
 ]
diff --git a/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv/testcases.yml b/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv/testcases.yml
index 292267353..d389f5af2 100644
--- a/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv/testcases.yml
+++ b/Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv/testcases.yml
@@ -32,4 +32,4 @@ testCases:
       - "Final Results: Minibatch[{{integer}}-{{integer}}]"
       # this part is flaky, needs to be investigated (VSO item #414)
       #- Err = {{float,tolerance=2.0%}} * {{integer}}
-      - CE = {{float,tolerance=2.0%}} * {{integer}}
+      - CE = {{float,tolerance=4.0%}} * {{integer}}
diff --git a/Tests/UnitTests/MathTests/BatchNormalizationEngineTests.cpp b/Tests/UnitTests/MathTests/BatchNormalizationEngineTests.cpp
index ae5d2b992..9fe87cf7a 100644
--- a/Tests/UnitTests/MathTests/BatchNormalizationEngineTests.cpp
+++ b/Tests/UnitTests/MathTests/BatchNormalizationEngineTests.cpp
@@ -100,6 +100,7 @@ BOOST_AUTO_TEST_CASE(BatchNormalizationForward)
     double expAvg = std::get<3>(cfg);
     double blendFactor = 0; // cuDNN supports blendFactor == 0 (train) or 1 (eval) only.
     double eps = 1e-5; // CUDNN_BN_MIN_EPSILON
+    bool inferenceOnly = false;
 
     auto engCudnn = BNEng::Create(baseDeviceId, inOutT, spatial, ImageLayoutKind::CHW, BatchNormEngineKind::CuDnn);
     auto engCntk = BNEng::Create(deviceId, inOutT, spatial, ImageLayoutKind::CHW, BatchNormEngineKind::Cntk);
@@ -142,12 +143,12 @@ BOOST_AUTO_TEST_CASE(BatchNormalizationForward)
 
     CudaTimer time1;
     time1.Start();
-    engCntk->Forward(in, scale, bias, expAvg, blendFactor, runMean, runInvStdDev, out, eps, saveMean, saveInvStdDev);
+    engCntk->Forward(in, scale, bias, inferenceOnly, expAvg, blendFactor, runMean, runInvStdDev, out, eps, saveMean, saveInvStdDev);
     time1.Stop();
 
     CudaTimer time2;
     time2.Start();
-    engCudnn->Forward(inB, scaleB, biasB, expAvg, blendFactor, runMeanB, runInvStdDevB, outB, eps, saveMeanB, saveInvStdDevB);
+    engCudnn->Forward(inB, scaleB, biasB, inferenceOnly, expAvg, blendFactor, runMeanB, runInvStdDevB, outB, eps, saveMeanB, saveInvStdDevB);
     time2.Stop();
 
     std::stringstream tmsg;
diff --git a/Tests/UnitTests/V2LibraryTests/Image.h b/Tests/UnitTests/V2LibraryTests/Image.h
index a2c164e1c..7b0efdd49 100644
--- a/Tests/UnitTests/V2LibraryTests/Image.h
+++ b/Tests/UnitTests/V2LibraryTests/Image.h
@@ -28,12 +28,12 @@ inline FunctionPtr ProjLayer(Variable wProj, Variable input, size_t hStride, siz
     auto b = Parameter({ outFeatureMapCount }, (float)bValue, device);
     auto sc = Parameter({ outFeatureMapCount }, (float)scValue, device);
     auto m = Constant({ outFeatureMapCount }, 0.0f, device);
-    auto var = Constant({ outFeatureMapCount }, 0.0f, device);
+    auto v = Constant({ outFeatureMapCount }, 0.0f, device);
 
     size_t numInputChannels = input.Shape()[input.Shape().NumAxes() - 1];
 
     auto c = Convolution(wProj, input, { hStride, vStride, numInputChannels }, { true }, { false });
-    return BatchNormalization(c, sc, b, m, var, true /*spatial*/, (double)bnTimeConst);
+    return BatchNormalization(c, sc, b, m, v, true /*spatial*/, (double)bnTimeConst);
 }
 
 inline FunctionPtr ResNetNode2(Variable input, size_t outFeatureMapCount, size_t kernelWidth, size_t kernelHeight, double wScale, double bValue, double scValue, size_t bnTimeConst, const DeviceDescriptor& device)