This commit is contained in:
Frank Seide 2016-04-03 00:59:10 -07:00
Parent 531f5538b6 916497bf1e
Commit 7784350f29
385 changed files with 102504 additions and 4934969 deletions

View file

@ -8333,9 +8333,9 @@ SquareError
\begin_layout Standard
\begin_inset Formula
\begin{eqnarray}
v\left(\mathbf{X},\mathbf{\mathbf{Y}}\right) & \leftarrow & \frac{1}{2}\mathrm{Tr}\left(\left(\mathbf{X}-\mathbf{Y}\right)\left(\mathbf{X}-\mathbf{Y}\right)^{T}\right)\\
\nabla_{\mathbf{X}}^{J} & \leftarrow & \nabla_{\mathbf{X}}^{J}+\mathbf{\nabla_{n}^{\mathit{J}}}\left(\mathbf{X}-\mathbf{Y}\right)\\
\nabla_{\mathbf{\mathbf{Y}}}^{J} & \leftarrow & \nabla_{\mathbf{\mathbf{Y}}}^{J}-\mathbf{\nabla_{n}^{\mathit{J}}}\left(\mathbf{X}-\mathbf{Y}\right).
v\left(\mathbf{X},\mathbf{Y}\right) & \leftarrow & \mathrm{Tr}\left(\left(\mathbf{X}-\mathbf{Y}\right)\left(\mathbf{X}-\mathbf{Y}\right)^{T}\right)\\
\nabla_{\mathbf{X}}^{J} & \leftarrow & \nabla_{\mathbf{X}}^{J}+2\mathbf{\nabla_{n}^{\mathit{J}}}\left(\mathbf{X}-\mathbf{Y}\right)\\
\nabla_{\mathbf{Y}}^{J} & \leftarrow & \nabla_{\mathbf{Y}}^{J}-2\mathbf{\nabla_{n}^{\mathit{J}}}\left(\mathbf{X}-\mathbf{Y}\right).
\end{eqnarray}
\end_inset
@ -8367,8 +8367,8 @@ Note that
\color none
\begin_inset Formula
\begin{eqnarray}
\frac{\partial v}{\partial\mathbf{X}} & = & \mathbf{X}-\mathbf{Y}\\
\frac{\partial v}{\partial\mathbf{Y}} & = & \mathbf{-\left(X-\mathbf{Y}\right)}.
\frac{\partial v}{\partial\mathbf{X}} & = & +2\left(\mathbf{X}-\mathbf{Y}\right)\\
\frac{\partial v}{\partial\mathbf{Y}} & = & -2\left(\mathbf{X}-\mathbf{Y}\right).
\end{eqnarray}
\end_inset
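
A short derivation sketch of where the factor 2 comes from: writing the criterion elementwise,

\begin{eqnarray*}
v\left(\mathbf{X},\mathbf{Y}\right) & = & \mathrm{Tr}\left(\left(\mathbf{X}-\mathbf{Y}\right)\left(\mathbf{X}-\mathbf{Y}\right)^{T}\right)=\sum_{ij}\left(X_{ij}-Y_{ij}\right)^{2},\\
\frac{\partial v}{\partial X_{ij}} & = & 2\left(X_{ij}-Y_{ij}\right),
\end{eqnarray*}

so $\nabla_{\mathbf{X}}v=2\left(\mathbf{X}-\mathbf{Y}\right)$ and $\nabla_{\mathbf{Y}}v=-2\left(\mathbf{X}-\mathbf{Y}\right)$, which the update rules chain with the incoming gradient $\nabla_{n}^{J}$.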

View file

@ -1,4 +1,4 @@
import urllib
import urllib.request
import gzip
import os
import struct

View file

@ -13,7 +13,6 @@ deviceId = 0
imageLayout = "cudnn"
# override the above as follows when running on CPU:
# deviceId = -1
# imageLayout = "legacy"
command = MNISTtrain:MNISTtest

View file

@ -25,6 +25,7 @@ DNN = [
err = ErrorPrediction(labels, ol)
# Special Nodes
errTop5 = ErrorPrediction(labels, ol, Const(1), tag="eval")
FeatureNodes = (features)
LabelNodes = (labels)
CriterionNodes = (ce)

View file

@ -13,7 +13,6 @@ deviceId = 0
imageLayout = "cudnn"
# override the above as follows when running on CPU:
# deviceId = -1
# imageLayout = "legacy"
command = train:test
@ -42,7 +41,7 @@ train = [
SGD = [
epochSize = 60000
minibatchSize = 32
learningRatesPerMB = 0.5
learningRatesPerMB = 0.1*5:0.3
momentumPerMB = 0*10:0.7
maxEpochs = 15
]

View file

@ -23,16 +23,17 @@ DNN=[
hStride1 = 1
vStride1 = 1
# weight[cMap1, kW1 * kH1 * inputChannels]
# ConvReLULayer is defined in Macros.ndl
conv1_act = ConvReLULayer(featScaled, cMap1, 25, kW1, kH1, hStride1, vStride1, 10, 1)
# Conv2DReLULayer is defined in Macros.ndl
conv1 = Conv2DReLULayer(featScaled, cMap1, 25, kW1, kH1, hStride1, vStride1, 10, 1)
# pool1
pool1W = 2
pool1H = 2
pool1hStride = 2
pool1vStride = 2
pool1 = MaxPooling(conv1_act, pool1W, pool1H, pool1hStride, pool1vStride, imageLayout=$imageLayout$)
# MaxPooling is a standard NDL node.
pool1 = MaxPooling(conv1, pool1W, pool1H, pool1hStride, pool1vStride, imageLayout=$imageLayout$)
# conv2
kW2 = 5
kH2 = 5
@ -40,19 +41,20 @@ DNN=[
hStride2 = 1
vStride2 = 1
# weight[cMap2, kW2 * kH2 * cMap1]
# ConvReLULayer is defined in Macros.ndl
conv2_act = ConvReLULayer(pool1, cMap2, 400, kW2, kH2, hStride2, vStride2, 10, 1)
# ConvNDReLULayer is defined in Macros.ndl
conv2 = ConvNDReLULayer(pool1, kW2, kH2, cMap1, 400, cMap2, hStride2, vStride2, 10, 1)
# pool2
pool2W = 2
pool2H = 2
pool2hStride = 2
pool2vStride = 2
pool2 = MaxPooling(conv2_act, pool2W, pool2H, pool2hStride, pool2vStride, imageLayout=$imageLayout$)
# MaxNDPooling is defined in Macros.ndl
pool2 = MaxNDPooling(conv2, pool2W, pool2H, pool2hStride, pool2vStride, imageLayout=$imageLayout$)
h1Dim = 128
# DNNImageSigmoidLayer and DNNLayer are defined in Macros.ndl
h1 = DNNImageSigmoidLayer(4, 4, cMap2, h1Dim, pool2, 1)
h1 = DNNImageSigmoidLayer(7, 7, cMap2, h1Dim, pool2, 1)
ol = DNNLayer(h1Dim, labelDim, h1, 1)
ce = CrossEntropyWithSoftmax(labels, ol)

View file

@ -13,9 +13,8 @@ deviceId = 0
imageLayout = "cudnn"
# override the above as follows when running on CPU:
# deviceId = -1
# imageLayout = "legacy"
command = train:CreateEvalModel:test
command = train:test
precision = "float"
modelPath = "$ModelDir$/03_ConvBatchNorm"
@ -38,9 +37,11 @@ train = [
SGD = [
epochSize = 60000
minibatchSize = 32
learningRatesPerMB = 0.5
momentumPerMB = 0*10:0.7
learningRatesPerMB = 0.5:0.1
momentumPerMB = 0.9
maxEpochs = 2
#batchNormalizationTimeConstant=0 # Set through NDL
batchNormalizationBlendTimeConstant=0:1#INF
]
reader = [
@ -63,17 +64,6 @@ train = [
]
]
#######################################
# Edit model #
#######################################
CreateEvalModel=[
action=edit
CurModel=$ModelDir$/03_ConvBatchNorm
NewModel=$ModelDir$/03_ConvBatchNorm.Eval
editPath=$ConfigDir$/03_ConvBatchNorm.mel
]
#######################################
# TEST CONFIG #
#######################################
@ -82,7 +72,7 @@ test = [
action = "test"
minibatchSize = 32
modelPath=$ModelDir$/03_ConvBatchNorm.Eval
modelPath=$ModelDir$/03_ConvBatchNorm
NDLNetworkBuilder = [
networkDescription = "$ConfigDir$/03_ConvBatchNorm.ndl"

View file

@ -1,6 +0,0 @@
m=LoadModel($CurModel$, format=cntk)
SetDefaultModel(m)
SetPropertyForSubTree(CE, batchNormEvalMode, true)
SaveModel(m, $NewModel$, format=cntk)

View file

@ -15,7 +15,7 @@ ndlMnistMacros = [
labels = InputValue(labelDim)
scValue = 1
# Batch normalization time constant.
# Batch normalization time constant (normalizationTimeConstant). blendTimeConstant is set through the .cntk file.
bnTimeConst = 1024
convWScale = 10

View file

@ -1,28 +1,28 @@
DNNSigmoidLayer(inDim, outDim, x, parmScale) = [
W = LearnableParameter(outDim, inDim, init="uniform", initValueScale=parmScale)
b = LearnableParameter(outDim, 1, init="uniform", initValueScale=parmScale)
W = LearnableParameter(outDim, inDim, init="uniform", initValueScale=parmScale, initOnCPUOnly=true)
b = LearnableParameter(outDim, 1, init="uniform", initValueScale=parmScale, initOnCPUOnly=true)
t = Times(W, x)
z = Plus(t, b)
y = Sigmoid(z)
]
DNNImageSigmoidLayer(inW, inH, inC, outDim, x, parmScale) = [
W = ImageParameter(outDim, inW, inH, inC, init="uniform", initValueScale=parmScale, imageLayout=$imageLayout$)
b = LearnableParameter(outDim, 1, init="uniform", initValueScale=parmScale)
W = ImageParameter(outDim, inW, inH, inC, init="uniform", initValueScale=parmScale, initOnCPUOnly=true, imageLayout=$imageLayout$)
b = LearnableParameter(outDim, 1, init="uniform", initValueScale=parmScale, initOnCPUOnly=true)
t = Times(W, x)
z = Plus(t, b)
y = Sigmoid(z)
]
DNNLayer(inDim, outDim, x, parmScale) = [
W = LearnableParameter(outDim, inDim, init="uniform", initValueScale=parmScale)
b = LearnableParameter(outDim, 1, init="uniform", initValueScale=parmScale)
W = LearnableParameter(outDim, inDim, init="uniform", initValueScale=parmScale, initOnCPUOnly=true)
b = LearnableParameter(outDim, 1, init="uniform", initValueScale=parmScale, initOnCPUOnly=true)
t = Times(W, x)
z = Plus(t, b)
]
DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst) = [
W = LearnableParameter(outDim, inDim, init = Gaussian, initValueScale = wScale)
W = LearnableParameter(outDim, inDim, init = Gaussian, initValueScale = wScale, initOnCPUOnly=true)
b = LearnableParameter(outDim, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outDim, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
@ -32,12 +32,36 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst) = [
y = RectifiedLinear(bn)
]
ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) = [
convW = LearnableParameter(outMap, inWCount, init="uniform", initValueScale=wScale)
convB = ImageParameter(1, 1, outMap, init="fixedValue", value=bValue, imageLayout=$imageLayout$)
conv = Convolution(convW, inp, kW, kH, outMap, hStride, vStride, zeroPadding=false, imageLayout=$imageLayout$)
convPlusB = Plus(conv, convB);
act = RectifiedLinear(convPlusB);
ConvW(outMap, inWCount, wScale) = [
W = LearnableParameter(outMap, inWCount, init="uniform", initValueScale=wScale, initOnCPUOnly=true)
]
ConvB(outMap, bValue) = [
b = ImageParameter(1, 1, outMap, init="fixedValue", value=bValue, imageLayout=$imageLayout$)
]
Conv2D(w, inp, kW, kH, outMap, hStride, vStride) = [
c = Convolution(w, inp, kW, kH, outMap, hStride, vStride, zeroPadding=true, imageLayout=$imageLayout$)
]
ConvND(w, inp, kW, kH, inMap, outMap, hStride, vStride) = [
c = Convolution(w, inp, {kW, kH, inMap}, mapCount=outMap, stride={hStride, vStride, inMap}, sharing={true, true, true}, autoPadding={true, true, false}, lowerPad=0, upperPad=0, imageLayout=$imageLayout$)
]
Conv2DReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) = [
w = ConvW(outMap, inWCount, wScale)
b = ConvB(outMap, bValue)
c = Conv2D(w, inp, kW, kH, outMap, hStride, vStride)
cpb = Plus(c, b);
out = RectifiedLinear(cpb);
]
ConvNDReLULayer(inp, kW, kH, inMap, inWCount, outMap, hStride, vStride, wScale, bValue) = [
w = ConvW(outMap, inWCount, wScale)
b = ConvB(outMap, bValue)
c = ConvND(w, inp, kW, kH, inMap, outMap, hStride, vStride)
cpb = Plus(c, b);
out = RectifiedLinear(cpb);
]
ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst) = [
@ -51,7 +75,7 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeCo
]
ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst) = [
W = LearnableParameter(outMap, inWCount, init=Gaussian, initValueScale=wScale)
W = LearnableParameter(outMap, inWCount, init=Gaussian, initValueScale=wScale, initOnCPUOnly=true)
c = ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst)
]
@ -59,3 +83,7 @@ ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue,
c = ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
y = RectifiedLinear(c)
]
MaxNDPooling(inp, kW, kH, hStride, vStride) = [
p = Pooling(inp, "max", {kW, kH, 1}, stride={hStride, vStride, 1}, autoPadding={true, true, false}, lowerPad=0, upperPad=0, imageLayout=$imageLayout$)
]

View file

@ -70,7 +70,7 @@ To run the sample, navigate to the Data folder and run the following command:
3. 03_ConvBatchNorm.ndl is almost identical to 02_Convolution.ndl
except that it uses batch normalization for the convolutional and fully connected layers.
As a result, it achieves around 0.92% error after training for just 2 epochs (and less than 30 seconds).
As a result, it achieves around 0.8% error after training for just 2 epochs (and less than 30 seconds).
To run the sample, navigate to the Data folder and run the following command:
`cntk configFile=../Config/03_ConvBatchNorm.cntk`

View file

@ -12,7 +12,6 @@ deviceId = 0
imageLayout = "cudnn"
# override the above as follows when running on CPU:
# deviceId = -1
# imageLayout = "legacy"
prefetch = "true"
@ -45,6 +44,7 @@ Train = [
readerType = "UCIFastReader"
file = "$DataDir$/Train.txt"
randomize = "auto"
minibatchMode="full"
features = [
dim = 3072
start = 1

View file

@ -12,11 +12,10 @@ deviceId = 0
imageLayout = "cudnn"
# override the above as follows when running on CPU:
# deviceId = -1
# imageLayout = "legacy"
prefetch = "true"
command = Train:AddBNEval:Test
command = Train:Test
stderr = "$OutputDir$/02_BatchNormConv"
traceLevel = 1
@ -44,6 +43,7 @@ Train = [
readerType = "UCIFastReader"
file = "$DataDir$/Train.txt"
randomize = "auto"
minibatchMode="full"
features = [
dim = 3072
start = 1
@ -57,16 +57,9 @@ Train = [
]
]
AddBNEval = [
action = "edit"
CurModel = "$ModelDir$/02_BatchNormConv"
NewModel = "$ModelDir$/02_BatchNormConv.Eval"
editPath = "$ConfigDir$/02_BatchNormConv.mel"
]
Test = [
action = "test"
modelPath = "$ModelDir$/02_BatchNormConv.Eval"
modelPath = "$ModelDir$/02_BatchNormConv"
# Set minibatch size for testing.
minibatchSize = 16

View file

@ -1,6 +0,0 @@
m=LoadModel($CurModel$, format=cntk)
SetDefaultModel(m)
SetPropertyForSubTree(CE, batchNormEvalMode, true)
SaveModel(m, $NewModel$, format=cntk)

View file

@ -12,12 +12,11 @@ deviceId = 0
imageLayout = "cudnn"
# override the above as follows when running on CPU:
# deviceId = -1
# imageLayout = "legacy"
prefetch = "true"
parallelTrain = "false"
command = Train:AddBNEval:Test
command = Train:Test
stderr = "$OutputDir$/03_ResNet"
traceLevel = 1
@ -75,16 +74,9 @@ Train = [
]
]
AddBNEval = [
action = "edit"
CurModel = "$ModelDir$/03_ResNet"
NewModel = "$ModelDir$/03_ResNet.Eval"
editPath = "$ConfigDir$/03_ResNet.mel"
]
Test = [
action = "test"
modelPath = "$ModelDir$/03_ResNet.Eval"
modelPath = "$ModelDir$/03_ResNet"
# Set minibatch size for testing.
minibatchSize = 512

View file

@ -1,6 +0,0 @@
m=LoadModel($CurModel$, format=cntk)
SetDefaultModel(m)
SetPropertyForSubTree(CE, batchNormEvalMode, true)
SaveModel(m, $NewModel$, format=cntk)

View file

@ -38,14 +38,14 @@ DNN=[
rn1_3 = ResNetNode2(rn1_2, cMap1, 144, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
cMap2 = 32
rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", needGradient = false)
rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", learningRateMultiplier = 0)
rn2_1 = ResNetNode2Inc(rn1_3, cMap2, 144, 288, kW, kH, convWScale, convBValue, scValue, bnTimeConst, rn2_1_Wproj)
#rn2_1 = ResNetNode2Inc2(rn1_3, cMap1, cMap2, 144, 288, kW, kH, convWScale, 3.5, convBValue, scValue, bnTimeConst)
rn2_2 = ResNetNode2(rn2_1, cMap2, 288, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn2_3 = ResNetNode2(rn2_2, cMap2, 288, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
cMap3 = 64
rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", needGradient = false)
rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", learningRateMultiplier = 0)
rn3_1 = ResNetNode2Inc(rn2_3, cMap3, 288, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst, rn3_1_Wproj)
#rn3_1 = ResNetNode2Inc2(rn2_3, cMap2, cMap3, 288, 576, kW, kH, convWScale, 3.5, convBValue, scValue, bnTimeConst)
rn3_2 = ResNetNode2(rn3_1, cMap3, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst)

View file

@ -13,12 +13,11 @@ deviceId = 0
imageLayout = "cudnn"
# override the above as follows when running on CPU:
# deviceId = -1
# imageLayout = "legacy"
prefetch="true"
parallelTrain="false"
command=Train:AddBNEval:Test
command=Train:Test
stderr="$OutputDir$/04_ResNet_56"
traceLevel=1
@ -76,16 +75,9 @@ Train=[
]
]
AddBNEval=[
action="edit"
CurModel="$ModelDir$/04_ResNet_56"
NewModel="$ModelDir$/04_ResNet_56.Eval"
editPath="$ConfigDir$/03_ResNet.mel"
]
Test=[
action="test"
modelPath="$ModelDir$/04_ResNet_56.Eval"
modelPath="$ModelDir$/04_ResNet_56"
# Set minibatch size for testing.
minibatchSize=512

View file

@ -53,7 +53,7 @@ DNN=[
rn1_18= ResNetNode2(rn1_17, cMap1, 144, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
cMap2 = 32
rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", needGradient = false)
rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", learningRateMultiplier = 0)
rn2_1 = ResNetNode2Inc(rn1_18, cMap2, 144, 288, kW, kH, convWScale, convBValue, scValue, bnTimeConst, rn2_1_Wproj)
#rn2_1 = ResNetNode2Inc2(rn1_18, cMap1, cMap2, 144, 288, kW, kH, convWScale, 3.5, convBValue, scValue, bnTimeConst)
rn2_2 = ResNetNode2(rn2_1, cMap2, 288, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
@ -75,7 +75,7 @@ DNN=[
rn2_18= ResNetNode2(rn2_17, cMap2, 288, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
cMap3 = 64
rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", needGradient = false)
rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", learningRateMultiplier = 0)
rn3_1 = ResNetNode2Inc(rn2_18, cMap3, 288, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst, rn3_1_Wproj)
#rn3_1 = ResNetNode2Inc2(rn2_18, cMap2, cMap3, 288, 576, kW, kH, convWScale, 3.5, convBValue, scValue, bnTimeConst)
rn3_2 = ResNetNode2(rn3_1, cMap3, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst)

View file

@ -0,0 +1,80 @@
RootDir = "."
ConfigDir = "$RootDir$"
DataDir = "$RootDir$"
OutputDir = "$RootDir$/Output"
ModelDir = "$OutputDir$/Models"
ndlMacros = "$ConfigDir$/Macros.ndl"
precision = "float"
deviceId = 0
imageLayout = "cudnn"
# override the above as follows when running on CPU:
# deviceId = -1
prefetch = "true"
command = Train:Test
modelPath = "$ModelDir$/05_ConvLocal"
stderr = "$OutputDir$/05_ConvLocal"
traceLevel = 1
numMBsToShowResult = 50
Train = [
action = "train"
NDLNetworkBuilder = [
networkDescription = "$ConfigDir$/05_ConvLocal.ndl"
]
SGD = [
epochSize = 49984
minibatchSize = 64
learningRatesPerMB = 0.01*10:0.003*10:0.001
momentumPerMB = 0.9*20:0.99
maxEpochs = 30
L2RegWeight = 0.03
]
reader = [
readerType = "UCIFastReader"
file = "$DataDir$/Train.txt"
randomize = "auto"
minibatchMode="full"
features = [
dim = 3072
start = 1
]
labels = [
dim = 1
start = 0
labelDim = 10
labelMappingFile = "$DataDir$/labelsmap.txt"
]
]
]
Test = [
action = "test"
# Set minibatch size for testing.
minibatchSize = 16
reader = [
readerType = "UCIFastReader"
file = "$DataDir$/Test.txt"
randomize = "none"
features = [
dim = 3072
start = 1
]
labels = [
dim = 1
start = 0
labelDim = 10
labelMappingFile = "$DataDir$/labelsmap.txt"
]
]
]

View file

@ -0,0 +1,84 @@
load=ndlMnistMacros
run=DNN
ndlMnistMacros = [
ImageW = 32
ImageH = 32
ImageC = 3
LabelDim = 10
features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = $imageLayout$)
featOffs = Const(128)
featScaled = Minus(features, featOffs)
labels = Input(LabelDim, tag = label)
conv1WScale = 0.0043
conv1BValue = 0
conv2WScale = 1.414
conv2BValue = 0
conv3WScale = 1.414
conv3BValue = 0
conv4WScale = 1.414
conv4BValue = 0
fc1WScale = 1.5
fc1BValue = 0
]
DNN=[
# conv1
kW1 = 5
kH1 = 5
cMap1 = 64
hStride1 = 1
vStride1 = 1
# weight[cMap1, kW1 * kH1 * ImageC]
conv1 = ConvReLULayer(featScaled, cMap1, 75, kW1, kH1, hStride1, vStride1, conv1WScale, conv1BValue)
# pool1
pool1W = 3
pool1H = 3
pool1hStride = 2
pool1vStride = 2
pool1 = MaxPooling(conv1, pool1W, pool1H, pool1hStride, pool1vStride, imageLayout = $imageLayout$)
# conv2
kW2 = 5
kH2 = 5
cMap2 = 64
hStride2 = 1
vStride2 = 1
# weight[cMap2, kW2 * kH2 * cMap1]
conv2 = ConvReLULayer(pool1, cMap2, 1600, kW2, kH2, hStride2, vStride2, conv2WScale, conv2BValue)
# pool2
pool2W = 3
pool2H = 3
pool2hStride = 2
pool2vStride = 2
pool2 = MaxPooling(conv2, pool2W, pool2H, pool2hStride, pool2vStride, imageLayout = $imageLayout$)
# conv_local3
kW3 = 3
kH3 = 3
cMap3 = 64
hStride3 = 1
vStride3 = 1
# weight[cMap3 * pool2OutW * poolOutH, kW3 * kH3 * cMap2]
conv3 = ConvLocalReLULayer(pool2, cMap3, 3136, cMap2, 576, kW3, kH3, hStride3, vStride3, conv3WScale, conv3BValue)
# conv_local4
kW4 = 3
kH4 = 3
cMap4 = 32
hStride4 = 1
vStride4 = 1
# weight[cMap4 * conv3OutW * conv3OutH, kW4 * kH4 * cMap3]
conv4 = ConvLocalReLULayer(conv3, cMap4, 1568, cMap3, 576, kW4, kH4, hStride4, vStride4, conv4WScale, conv4BValue)
ol = DnnImageLastLayer(7, 7, cMap4, labelDim, conv4, fc1WScale, fc1BValue)
CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
Err = ErrorPrediction(labels, ol, tag = Eval)
OutputNodes = ol
]

View file

@ -7,6 +7,15 @@ ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue)
y = RectifiedLinear(p)
]
ConvLocalReLULayer(inp, outMap, outWCount, inMap, inWCount, kW, kH, hStride, vStride, wScale, bValue)
[
W = LearnableParameter(outWCount, inWCount, init = Gaussian, initValueScale = wScale)
b = ImageParameter(1, 1, outMap, init = fixedValue, value = bValue, imageLayout = $imageLayout$)
c = Convolution(W, inp, {kW, kH, inMap}, mapCount = outMap, stride = {hStride, vStride, inMap}, sharing = {false, false, false}, imageLayout = $imageLayout$)
p = Plus(c, b)
y = RectifiedLinear(p)
]
ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst)
[
b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue)
@ -15,7 +24,7 @@ ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeCo
isd = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = $imageLayout$)
y = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
y = BatchNormalization(c, sc, b, m, isd, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
]
ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
@ -30,6 +39,17 @@ ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue,
y = RectifiedLinear(c)
]
ProjLayer(W, inp, outMap, hStride, vStride, bValue, scValue, bnTimeConst)
[
b = LearnableParameter(outMap, 1, init = fixedValue, value = bValue)
sc = LearnableParameter(outMap, 1, init = fixedValue, value = scValue)
m = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = LearnableParameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
c = Convolution(W, inp, 1, 1, outMap, hStride, vStride, zeroPadding = false, imageLayout = $imageLayout$)
y = BatchNormalization(c, sc, b, m, isd, spatial = true, normalizationTimeConstant = bnTimeConst, imageLayout = $imageLayout$)
]
ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue, bnTimeConst)
[
# First convolution layer.
@ -48,7 +68,7 @@ ResNetNode2Inc(inp, outMap, inWCount, wCount, kW, kH, wScale, bValue, scValue, b
c2 = ConvBNLayer(c1, outMap, wCount, kW, kH, 1, 1, wScale, bValue, scValue, bnTimeConst)
# Projection convolution layer.
c_proj = ConvBNLayerW(Wproj, inp, outMap, 1, 1, 2, 2, bValue, scValue, bnTimeConst)
c_proj = ProjLayer(Wproj, inp, outMap, 2, 2, bValue, scValue, bnTimeConst)
#c_proj = Convolution(Wproj, inp, 1, 1, outMap, 2, 2, zeroPadding = false, imageLayout = $imageLayout$)
p = Plus(c2, c_proj)
@ -95,7 +115,7 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue, scValue, bnTimeConst)
m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, isd, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst)
bn = BatchNormalization(t, sc, b, m, isd, spatial = false, normalizationTimeConstant = bnTimeConst)
y = RectifiedLinear(bn)
]
@ -107,7 +127,7 @@ DnnImageBNReLULayer(inW, inH, inC, outDim, x, wScale, bValue, scValue, bnTimeCon
m = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = LearnableParameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, isd, eval = false, spatial = false, normalizationTimeConstant = bnTimeConst)
bn = BatchNormalization(t, sc, b, m, isd, spatial = false, normalizationTimeConstant = bnTimeConst)
y = RectifiedLinear(bn)
]
@ -118,3 +138,11 @@ DnnLastLayer(hiddenDim, labelDim, x, wScale, bValue)
t = Times(W, x)
z = Plus(t, b)
]
DnnImageLastLayer(inW, inH, inC, labelDim, x, wScale, bValue)
[
W = ImageParameter(labelDim, inW, inH, inC, init = Gaussian, initValueScale = wScale, imageLayout=$imageLayout$)
b = LearnableParameter(labelDim, init = fixedValue, value = bValue)
t = Times(W, x)
z = Plus(t, b)
]

View file

@ -6411,4 +6411,4 @@ evalNodeNames are not specified, using all the default evalnodes and training cr
Allocating matrices for forward and/or backward propagation.
Minibatch[1-20]: Samples Seen = 10000 Err: ErrorPrediction/Sample = 0.0819 CE: CrossEntropyWithSoftmax/Sample = 0.35141698
Final Results: Minibatch[1-20]: Samples Seen = 10000 Err: ErrorPrediction/Sample = 0.0819 CE: CrossEntropyWithSoftmax/Sample = 0.35141698 Perplexity = 1.4210798
COMPLETED
__COMPLETED__

View file

@ -9899,4 +9899,4 @@ evalNodeNames are not specified, using all the default evalnodes and training cr
Allocating matrices for forward and/or backward propagation.
Minibatch[1-20]: Samples Seen = 10000 Err: ErrorPrediction/Sample = 0.0644 CE: CrossEntropyWithSoftmax/Sample = 0.3034767
Final Results: Minibatch[1-20]: Samples Seen = 10000 Err: ErrorPrediction/Sample = 0.0644 CE: CrossEntropyWithSoftmax/Sample = 0.3034767 Perplexity = 1.35456
COMPLETED
__COMPLETED__

View file

@ -25,7 +25,7 @@ Then install numpy package by following instruction from: http://www.scipy.org/i
2. Alternatively, install the Python Anaconda distribution, which contains most of the popular Python packages, including numpy:
http://continuum.io/downloads
`-f` parameter is optional and specifies output format of the datasets. `cudnn` option (default) saves dataset in a spatial-major format used by cuDNN, while `legacy` - in CNTK legacy format. Use `cudnn` if CNTK is compiled with cuDNN **and** running on GPU and `legacy` otherwise.
The `-f` parameter is optional and specifies the output format of the datasets. The `cudnn` option (default) saves the dataset in a spatial-major format used by cuDNN, while `legacy` saves it in the CNTK legacy format. Use `cudnn` if CNTK is compiled with cuDNN and `legacy` otherwise.
ResNet samples require converting the CIFAR-10 dataset to actual images. This can be performed by running the following command:
```
@ -54,5 +54,7 @@ cntk configFile=02_BatchNormConv.cntk
3. 03_ResNet.ndl and 04_ResNet_56.ndl are very deep convolutional networks that use the ResNet architecture and have 20 and 56 layers, respectively (http://arxiv.org/abs/1512.03385).
With 03_ResNet.cntk you should get around 8.2% error after training for about 50 minutes. 04_ResNet_56.cntk should produce around 6.4% error after training for about 3 hours (see log files in the Output directory).
4. 05_ConvLocal.cntk uses locally-connected convolution layers (see `conv_local3` and `conv_local4` in `05_ConvLocal.ndl`) and resembles a network described here: https://code.google.com/p/cuda-convnet/source/browse/trunk/example-layers/layers-conv-local-11pct.cfg
For more details, refer to the .ndl and corresponding .cntk files.

View file

@ -66,7 +66,7 @@ Train=[
# Possible values: Center, Random. Default: Center
cropType="Random"
# Horizontal random flip, will be enabled by default if cropType=Random
#hflip=0
#hflip="true"
# Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
cropRatio=0.875
# Crop scale ratio jitter type.

View file

@ -1,9 +1,6 @@
m1=LoadModel($CurModel$, format=cntk)
SetDefaultModel(m1)
# Switch batch normalization to eval mode.
SetPropertyForSubTree(CE, batchNormEvalMode, true)
# Add top-5 error prediction node.
ErrTop5 = ErrorPrediction(labels, OutputNodes.z, Const(5), tag = Eval)

View file

@ -1,18 +1,29 @@
Conv(W, inp, outMap, kW, kH, hStride, vStride)
[
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
]
BN(inp, mapCount, bValue, scValue, bnTimeConst)
[
b = Parameter(mapCount, 1, init = fixedValue, value = bValue)
sc = Parameter(mapCount, 1, init = fixedValue, value = scValue)
m = Parameter(mapCount, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = Parameter(mapCount, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
y = BatchNormalization(inp, sc, b, m, isd, spatial = true, normalizationTimeConstant = bnTimeConst, epsilon = 0.000000001, imageLayout = "cudnn")
]
ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst)
[
b = Parameter(outMap, 1, init = fixedValue, value = bValue)
sc = Parameter(outMap, 1, init = fixedValue, value = scValue)
m = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
y = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true, normalizationTimeConstant = bnTimeConst, epsilon = 0.000000001, imageLayout = "cudnn")
c = Conv(W, inp, outMap, kW, kH, hStride, vStride)
y = BN(c, outMap, bValue, scValue, bnTimeConst)
]
ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
[
W = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
c = ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, bnTimeConst)
c = Conv(W, inp, outMap, kW, kH, hStride, vStride)
y = BN(c, outMap, bValue, scValue, bnTimeConst)
]
ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
@ -21,6 +32,19 @@ ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue,
y = RectifiedLinear(c)
]
Conv1x1(inp, outMap, inMap, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
[
W = Parameter(outMap, inMap, init = Gaussian, initValueScale = wScale)
c = Convolution(W, inp, 1, 1, outMap, hStride, vStride, zeroPadding = false, imageLayout = "cudnn")
y = BN(c, outMap, bValue, scValue, bnTimeConst)
]
Conv1x1ReLU(inp, outMap, inMap, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
[
c = Conv1x1(inp, outMap, inMap, hStride, vStride, wScale, bValue, scValue, bnTimeConst)
y = RectifiedLinear(c)
]
# Standard building block for ResNet with identity shortcut (option A).
ResNetNode2A(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue)
[
@ -48,15 +72,30 @@ ResNetNode2AInc(inp, outMap, inWCount, wCount, kW, kH, wScale, bValue, scValue,
y2 = RectifiedLinear(p)
]
# Standard building block for ResNet with padding (option B).
ResNetNode2BInc(inp, outMap, inMap, inWCount, wCount, kW, kH, wScale, bValue, scValue, bnTimeConst)
[
# First convolution layer.
c1 = ConvBNReLULayer(inp, outMap, inWCount, kW, kH, 2, 2, wScale, bValue, scValue, bnTimeConst)
# Second convolution layer, no ReLU.
c2 = ConvBNLayer(c1, outMap, wCount, kW, kH, 1, 1, wScale, bValue, scValue, bnTimeConst)
# Projection convolution layer.
c_proj = Conv1x1(inp, outMap, inMap, 2, 2, wScale, bValue, scValue, bnTimeConst)
p = Plus(c2, c_proj)
y2 = RectifiedLinear(p)
]
# Bottleneck building block for ResNet.
ResNetNode3A(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue, bnTimeConst)
[
# 1x1 reducing convolution.
c1 = ConvBNReLULayer(inp, convMap, inMap, 1, 1, 1, 1, wScale, bValue, scValue, bnTimeConst)
c1 = Conv1x1ReLU(inp, convMap, inMap, 1, 1, wScale, bValue, scValue, bnTimeConst)
# 3x3 convolution.
c2 = ConvBNReLULayer(c1, convMap, convWCount, 3, 3, 1, 1, wScale, bValue, scValue, bnTimeConst)
# 1x1 expanding convolution, no ReLU.
c3 = ConvBNLayer(c2, outMap, convMap, 1, 1, 1, 1, wScale, bValue, scValue, bnTimeConst)
c3 = Conv1x1(c2, outMap, convMap, 1, 1, wScale, bValue, scValue, bnTimeConst)
p = Plus(c3, inp)
y = RectifiedLinear(p)
@ -65,11 +104,11 @@ ResNetNode3A(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue, b
ResNetNode3AInc(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue, bnTimeConst, wProj, projStride)
[
# 1x1 reducing convolution.
c1 = ConvBNReLULayer(inp, convMap, inMap, 1, 1, projStride, projStride, wScale, bValue, scValue, bnTimeConst)
c1 = Conv1x1ReLU(inp, convMap, inMap, projStride, projStride, wScale, bValue, scValue, bnTimeConst)
# 3x3 convolution.
c2 = ConvBNReLULayer(c1, convMap, convWCount, 3, 3, 1, 1, wScale, bValue, scValue, bnTimeConst)
# 1x1 expanding convolution, no ReLU.
c3 = ConvBNLayer(c2, outMap, convMap, 1, 1, 1, 1, wScale, bValue, scValue, bnTimeConst)
c3 = Conv1x1(c2, outMap, convMap, 1, 1, wScale, bValue, scValue, bnTimeConst)
# Input-to-output mapping convolution.
c_proj = ConvBNLayerW(wProj, inp, outMap, 1, 1, projStride, projStride, wScale, bValue, scValue, bnTimeConst)
@ -80,13 +119,13 @@ ResNetNode3AInc(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue
ResNetNode3BInc(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue, bnTimeConst, projStride)
[
# 1x1 reducing convolution.
c1 = ConvBNReLULayer(inp, convMap, inMap, 1, 1, projStride, projStride, wScale, bValue, scValue, bnTimeConst)
c1 = Conv1x1ReLU(inp, convMap, inMap, projStride, projStride, wScale, bValue, scValue, bnTimeConst)
# 3x3 convolution.
c2 = ConvBNReLULayer(c1, convMap, convWCount, 3, 3, 1, 1, wScale, bValue, scValue, bnTimeConst)
# 1x1 expanding convolution, no ReLU.
c3 = ConvBNLayer(c2, outMap, convMap, 1, 1, 1, 1, wScale, bValue, scValue, bnTimeConst)
c3 = Conv1x1(c2, outMap, convMap, 1, 1, wScale, bValue, scValue, bnTimeConst)
# Input-to-output mapping convolution.
c_proj = ConvBNLayer(inp, outMap, inMap, 1, 1, projStride, projStride, wScale, bValue, scValue, bnTimeConst)
c_proj = Conv1x1(inp, outMap, inMap, projStride, projStride, wScale, bValue, scValue, bnTimeConst)
p = Plus(c3, c_proj)
y = RectifiedLinear(p)
@ -99,3 +138,8 @@ DnnLayer(hiddenDim, labelDim, x, wScale, bValue)
t = Times(W, x)
z = Plus(t, b)
]
MaxNDPooling(inp, kW, kH, hStride, vStride)
[
p = Pooling(inp, "max", {kW, kH, 1}, stride = {hStride, vStride, 1}, autoPadding = {true, true, false}, imageLayout = "cudnn")
]

View file

@ -71,7 +71,7 @@ Train=[
# Possible values: Center, Random. Default: Center
cropType="Random"
# Horizontal random flip, will be enabled by default if cropType=Random
#hflip=0
#hflip="true"
# Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
cropRatio=0.46666:0.875
# Crop scale ratio jitter type.

View file

@ -41,8 +41,8 @@ DNN=[
conv1WScale = 0.6
conv1 = ConvBNReLULayer(features, cMap1, 147, 7, 7, 2, 2, conv1WScale, convBValue, scValue, bnTimeConst)
# Max pooling
pool1W = 2
pool1H = 2
pool1W = 3
pool1H = 3
pool1hs = 2
pool1vs = 2
pool1 = MaxPooling(conv1, pool1W, pool1H, pool1hs, pool1vs, imageLayout = "cudnn")

View file

@ -0,0 +1,115 @@
RootDir = "."
ConfigDir = "$RootDir$"
DataDir = "$RootDir$"
OutputDir = "$RootDir$/Output"
ModelDir = "$OutputDir$/Models"
ndlMacros="$ConfigDir$/Macros.ndl"
precision="float"
deviceId="Auto"
command=Train:CreateEval:Test
parallelTrain="false"
stderr="$OutputDir$/ResNet_18"
traceLevel=1
numMBsToShowResult=500
Train=[
action="train"
modelPath="$ModelDir$/ResNet_18"
NDLNetworkBuilder=[
networkDescription="$ConfigDir$/ResNet_18.ndl"
]
SGD=[
epochSize=0
minibatchSize=256
# Note that learning rates are 10x more than in the paper due to a different
# momentum update rule in CNTK: v{t + 1} = lr*(1 - momentum)*g{t + 1} + momentum*v{t}
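# For example: the classical rule v{t + 1} = lr*g{t + 1} + momentum*v{t} applies an effective
# per-gradient step of lr, while CNTK scales it by (1 - momentum) = 0.1 at momentum 0.9,
# so the learning rates below are 10x the paper's values (e.g. 1.0 instead of 0.1).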
learningRatesPerMB=1.0*35:0.1*35:0.01
momentumPerMB=0.9
maxEpochs=125
gradUpdateType="None"
L2RegWeight=0.0001
dropoutRate=0
ParallelTrain=[
parallelizationMethod="DataParallelSGD"
distributedMBReading="true"
parallelizationStartEpoch=1
DataParallelSGD=[
gradientBits=32
]
]
]
reader=[
readerType="ImageReader"
# Map file which maps images to labels using the following format:
# <full path to image><tab><numerical label (0-based class id)>
# Example:
# C:\Data\ImageNet\2012\train\n01440764\n01440764_10026.JPEG<tab>0
file="$DataDir$/train_map.txt"
# Randomize images before every epoch. Possible values: None, Auto. Default: Auto.
randomize="Auto"
features=[
# Below are the required parameters.
width=224
height=224
channels=3
# Below are the optional parameters.
# Possible values: Center, Random. Default: Center
cropType="Random"
# Horizontal random flip, will be enabled by default if cropType=Random
#hflip="true"
# Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
cropRatio=0.46666:0.875
# Crop scale ratio jitter type.
# Possible values: None, UniRatio, UniLength, UniArea. Default: UniRatio
jitterType="UniRatio"
# Interpolation to use when scaling image to width x height size.
# Possible values: nearest, linear, cubic, lanczos. Default: linear.
interpolations="Linear"
# Stores mean values for each pixel in OpenCV matrix XML format.
meanFile="$ConfigDir$/ImageNet1K_mean.xml"
]
labels=[
labelDim=1000
]
]
]
CreateEval=[
action="edit"
CurModel="$ModelDir$/ResNet_18"
NewModel="$ModelDir$/ResNet_18.Eval"
editPath="$ConfigDir$/CreateEvalModel.mel"
]
Test=[
action="test"
modelPath="$ModelDir$/ResNet_18.Eval"
# Set minibatch size for testing.
minibatchSize=64
reader=[
readerType="ImageReader"
file="$DataDir$/val_map.txt"
randomize="None"
features=[
width=224
height=224
channels=3
cropType="Center"
meanFile="$ConfigDir$/ImageNet1K_mean.xml"
]
labels=[
labelDim=1000
]
]
]

View file

@ -0,0 +1,72 @@
load=ndlMacros
run=DNN
ndlMacros = [
ImageW = 224
ImageH = 224
ImageC = 3
LabelDim = 1000
features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = "cudnn")
labels = Input(LabelDim, tag = label)
# Kernels width and height.
kW = 3
kH = 3
# Kernel stride.
hs = 1
vs = 1
# Initial parameter values.
convWScale = 7.07
convBValue = 0
fcWScale = 1.13
fcBValue = 0
scValue = 1
# Batch normalization time constant.
bnTimeConst = 32768
]
DNN=[
conv1WScale = 0.6
cMap1 = 64
conv1 = ConvBNReLULayer(features, cMap1, 147, 7, 7, 2, 2, conv1WScale, convBValue, scValue, bnTimeConst)
# Max pooling
pool1W = 3
pool1H = 3
pool1hs = 2
pool1vs = 2
pool1 = MaxNDPooling(conv1, pool1W, pool1H, pool1hs, pool1vs)
rn1_1 = ResNetNode2A(pool1, cMap1, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn1_2 = ResNetNode2A(rn1_1, cMap1, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
cMap2 = 128
rn2_1 = ResNetNode2BInc(rn1_2, cMap2, cMap1, 576, 1152, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn2_2 = ResNetNode2A(rn2_1, cMap2, 1152, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
cMap3 = 256
rn3_1 = ResNetNode2BInc(rn2_2, cMap3, cMap2, 1152, 2304, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn3_2 = ResNetNode2A(rn3_1, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
cMap4 = 512
rn4_1 = ResNetNode2BInc(rn3_2, cMap4, cMap3, 2304, 4608, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn4_2 = ResNetNode2A(rn4_1, cMap4, 4608, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn4_3 = ResNetNode2A(rn4_2, cMap4, 4608, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
# Global average pooling
pool2W = 7
pool2H = 7
pool2hs = 1
pool2vs = 1
pool5 = AveragePooling(rn4_3, pool2W, pool2H, pool2hs, pool2vs, imageLayout = "cudnn")
ol = DnnLayer(cMap4, labelDim, pool5, fcWScale, fcBValue)
CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
Err = ErrorPrediction(labels, ol, tag = Eval)
OutputNodes = ol
]

View file

@ -70,7 +70,7 @@ Train=[
# Possible values: Center, Random. Default: Center
cropType="Random"
# Horizontal random flip, will be enabled by default if cropType=Random
#hflip=0
#hflip="true"
# Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
cropRatio=0.46666:0.875
# Crop scale ratio jitter type.

View file

@ -35,26 +35,24 @@ DNN=[
cMap1 = 64
conv1 = ConvBNReLULayer(features, cMap1, 147, 7, 7, 2, 2, conv1WScale, convBValue, scValue, bnTimeConst)
# Max pooling
pool1W = 2
pool1H = 2
pool1W = 3
pool1H = 3
pool1hs = 2
pool1vs = 2
pool1 = MaxPooling(conv1, pool1W, pool1H, pool1hs, pool1vs, imageLayout = "cudnn")
pool1 = MaxNDPooling(conv1, pool1W, pool1H, pool1hs, pool1vs)
rn1_1 = ResNetNode2A(pool1, cMap1, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn1_2 = ResNetNode2A(rn1_1, cMap1, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn1_3 = ResNetNode2A(rn1_2, cMap1, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
cMap2 = 128
rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj64to128Filename$", needGradient = false)
rn2_1 = ResNetNode2AInc(rn1_3, cMap2, 576, 1152, kW, kH, convWScale, convBValue, scValue, bnTimeConst, rn2_1_Wproj)
rn2_1 = ResNetNode2BInc(rn1_3, cMap2, cMap1, 576, 1152, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn2_2 = ResNetNode2A(rn2_1, cMap2, 1152, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn2_3 = ResNetNode2A(rn2_2, cMap2, 1152, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn2_4 = ResNetNode2A(rn2_3, cMap2, 1152, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
cMap3 = 256
rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj128to256Filename$", needGradient = false)
rn3_1 = ResNetNode2AInc(rn2_4, cMap3, 1152, 2304, kW, kH, convWScale, convBValue, scValue, bnTimeConst, rn3_1_Wproj)
rn3_1 = ResNetNode2BInc(rn2_4, cMap3, cMap2, 1152, 2304, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn3_2 = ResNetNode2A(rn3_1, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn3_3 = ResNetNode2A(rn3_2, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn3_4 = ResNetNode2A(rn3_3, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
@ -62,8 +60,7 @@ DNN=[
rn3_6 = ResNetNode2A(rn3_5, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
cMap4 = 512
rn4_1_Wproj = Parameter(cMap4, cMap3, init = fromFile, initFromFilePath = "$Proj256to512Filename$", needGradient = false)
rn4_1 = ResNetNode2AInc(rn3_6, cMap4, 2304, 4608, kW, kH, convWScale, convBValue, scValue, bnTimeConst, rn4_1_Wproj)
rn4_1 = ResNetNode2BInc(rn3_6, cMap4, cMap3, 2304, 4608, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn4_2 = ResNetNode2A(rn4_1, cMap4, 4608, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn4_3 = ResNetNode2A(rn4_2, cMap4, 4608, kW, kH, convWScale, convBValue, scValue, bnTimeConst)

View file

@ -71,7 +71,7 @@ Train=[
# Possible values: Center, Random. Default: Center
cropType="Random"
# Horizontal random flip, will be enabled by default if cropType=Random
#hflip=0
#hflip="true"
# Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
cropRatio=0.46666:0.875
# Crop scale ratio jitter type.

View file

@ -41,11 +41,11 @@ DNN=[
conv1WScale = 0.6
conv1 = ConvBNReLULayer(features, cMap1, 147, 7, 7, 2, 2, conv1WScale, convBValue, scValue, bnTimeConst)
# Max pooling
pool1W = 2
pool1H = 2
pool1W = 3
pool1H = 3
pool1hs = 2
pool1vs = 2
pool1 = MaxPooling(conv1, pool1W, pool1H, pool1hs, pool1vs, imageLayout = "cudnn")
pool1 = MaxNDPooling(conv1, pool1W, pool1H, pool1hs, pool1vs)
rn1_1 = ResNetNode3BInc(pool1, cMap1, cMap1, cMap3, 576, convWScale, convBValue, scValue, bnTimeConst, 1)
rn1_2 = ResNetNode3A(rn1_1, cMap3, cMap1, cMap3, 576, convWScale, convBValue, scValue, bnTimeConst)

View file

@ -1,9 +1,6 @@
m1=LoadModel($CurModel$, format=cntk)
SetDefaultModel(m1)
# Switch batch normalization to eval mode.
SetPropertyForSubTree(CE, batchNormEvalMode, true)
# Add top-5 error prediction node.
ErrTop5 = ErrorPrediction(labels, OutputNodes.z, Const(5), tag = "eval")

View file

@ -17,7 +17,7 @@ DnnBNReLULayer(inDim, outDim, x, wScale, bValue)
m = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
isd = Parameter(outDim, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, isd, eval = false, spatial = false)
bn = BatchNormalization(t, sc, b, m, isd, spatial = false)
y = RectifiedLinear(bn)
]
@ -50,6 +50,6 @@ ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue,
isd = Parameter(outMap, 1, init = fixedValue, value = 0, learningRateMultiplier = 0)
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
bn = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true, imageLayout = "cudnn")
bn = BatchNormalization(c, sc, b, m, isd, spatial = true, imageLayout = "cudnn")
y = RectifiedLinear(bn);
]

View file

@ -56,7 +56,7 @@ Train=[
# Possible values: Center, Random. Default: Center
cropType="Random"
# Horizontal random flip, will be enabled by default if cropType=Random
#hflip=0
#hflip="true"
# Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
cropRatio=0.875
# Crop scale ratio jitter type.

View file

@ -65,7 +65,7 @@ Train=[
# Possible values: Center, Random. Default: Center
cropType="Random"
# Horizontal random flip, will be enabled by default if cropType=Random
#hflip=0
#hflip="true"
# Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
cropRatio=0.875
# Crop scale ratio jitter type.

View file

@ -65,7 +65,7 @@ Train=[
# Possible values: Center, Random. Default: Center
cropType="Random"
# Horizontal random flip, will be enabled by default if cropType=Random
#hflip=0
#hflip="true"
# Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
cropRatio=0.875
# Crop scale ratio jitter type.

View file

@ -31,6 +31,8 @@
# defaults to /usr/local/
# These can be overridden on the command line, e.g. make BUILDTYPE=debug
ARCH=$(shell uname)
ifndef BUILD_TOP
BUILD_TOP=.
endif
@ -211,9 +213,11 @@ CNTKMATH:=cntkmath
BUILDINFO:= $(SOURCEDIR)/CNTK/buildinfo.h
GENBUILD:=Tools/generate_build_info
$(BUILDINFO): $(GENBUILD)
@echo creating $@ for $(ARCH) with build type $(BUILDTYPE)
@$(GENBUILD) $(BUILD_TOP)/Config.make
BUILDINFO_OUTPUT := $(shell $(GENBUILD) $(BUILD_TOP)/Config.make && echo Success)
ifneq ("$(BUILDINFO_OUTPUT)","Success")
$(error Could not generate $(BUILDINFO))
endif
########################################
@ -228,6 +232,9 @@ READER_SRC =\
$(SOURCEDIR)/Readers/ReaderLib/ReaderShim.cpp \
$(SOURCEDIR)/Readers/ReaderLib/ChunkRandomizer.cpp \
$(SOURCEDIR)/Readers/ReaderLib/SequenceRandomizer.cpp \
$(SOURCEDIR)/Readers/ReaderLib/SequencePacker.cpp \
$(SOURCEDIR)/Readers/ReaderLib/BpttPacker.cpp \
$(SOURCEDIR)/Readers/ReaderLib/PackerBase.cpp \
$(SOURCEDIR)/Readers/ReaderLib/SampleModePacker.cpp \
COMMON_SRC =\
@ -250,6 +257,7 @@ MATH_SRC =\
$(SOURCEDIR)/Math/TensorView.cpp \
$(SOURCEDIR)/Math/CUDAPageLockedMemAllocator.cpp \
$(SOURCEDIR)/Math/ConvolutionEngine.cpp \
$(SOURCEDIR)/Math/BatchNormalizationEngine.cpp \
ifdef CUDA_PATH
MATH_SRC +=\
@ -258,7 +266,9 @@ MATH_SRC +=\
$(SOURCEDIR)/Math/GPUSparseMatrix.cu \
$(SOURCEDIR)/Math/GPUWatcher.cu \
$(SOURCEDIR)/Math/MatrixQuantizerGPU.cu \
$(SOURCEDIR)/Math/CuDnnCommon.cu \
$(SOURCEDIR)/Math/CuDnnConvolutionEngine.cu \
$(SOURCEDIR)/Math/CuDnnBatchNormalization.cu \
$(SOURCEDIR)/Math/GPUDataTransferer.cpp \
else
@ -376,6 +386,7 @@ LUSEQUENCEREADER_SRC =\
$(SOURCEDIR)/Readers/LUSequenceReader/DataWriterLocal.cpp \
$(SOURCEDIR)/Readers/LUSequenceReader/LUSequenceParser.cpp \
$(SOURCEDIR)/Readers/LUSequenceReader/LUSequenceReader.cpp \
$(SOURCEDIR)/Readers/LUSequenceReader/LUSequenceWriter.cpp \
LUSEQUENCEREADER_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(LUSEQUENCEREADER_SRC))
@ -595,8 +606,9 @@ CNTK_OBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(patsubst %.cpp, $(OBJDIR)/%.o, $(C
CNTK:=$(BINDIR)/cntk
ALL+=$(CNTK)
SRC+=$(CNTK_SRC)
$(CNTK): $(BUILDINFO) $(CNTK_OBJ) | $(CNTKMATH_LIB)
$(CNTK): $(CNTK_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building output for $(ARCH) with build type $(BUILDTYPE)
@ -638,10 +650,7 @@ $(OBJDIR)/%.o : %.cpp Makefile
@mkdir -p $(dir $@)
$(CXX) -c $< -o $@ $(COMMON_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(INCLUDEPATH:%=-I%) -MD -MP -MF ${@:.o=.d}
.PHONY: force clean buildall all
force: $(BUILDINFO)
.PHONY: clean buildall all
clean:
@echo $(SEPARATOR)

View file

@ -14,6 +14,7 @@
#include "ConvolutionalNodes.h"
#include "NonlinearityNodes.h"
#include "ReshapingNodes.h"
#include "InputAndParamNodes.h"
#include "TensorShape.h"
namespace Microsoft { namespace MSR { namespace CNTK {
@ -288,36 +289,135 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
nodePtr = builder.FutureValue(NULL, defaultHiddenActivity, rows, timeStep, name);
}
}
else if (cnNodeType == OperationNameOf(ConvolutionNode))
else if (cnNodeType == OperationNameOf(ConvolutionNode) || cnNodeType == OperationNameOf(PoolingNode))
{
if (parameter.size() != 7)
RuntimeError("%ls should have 7 fixed parameters[weightNodeName, inputValueNodeName, kernelWidth, kernelHeight, outputChannels,horizontalSubsample, verticalSubsample] and two optional parameters [zeroPadding = [false|yourvalue], maxTempMemSizeInSamples = [0|yourvalue], imageLayout = \"HWC\"|\"cudnn\"].", cnNodeType.c_str());
if (parameter.size() != 3 && parameter.size() != 7)
{
if (cnNodeType == OperationNameOf(ConvolutionNode))
{
RuntimeError("%ls: unexpected parameter count. %ls supports 2 modes: \n"
"1. 2D convolution which takes 7 fixed parameters [weightNodeName, inputValueNodeName, kernelWidth, kernelHeight, outputChannels,horizontalSubsample, verticalSubsample] \n"
"and two optional parameters [zeroPadding = [false|yourvalue], maxTempMemSizeInSamples = [0|yourvalue], imageLayout = \"HWC\"|\"cudnn\"]. \n"
"2. ND convolution which takes 3 fixed parameters [weightNodeName, inputValueNodeName, kernelShape] and \n"
"9 optional parameters [mapCount = [1|yourvalue], stride = [1|yourvalue], sharing = [true|yourvalue], autoPadding = [true|yourvalue], lowerPad = [0|yourvalue], upperPad = [0|yourvalue], maxTempMemSizeInSamples = [0|yourvalue], imageLayout = \"cudnn\"|\"HWC\"]. \n"
"For ND convolution, parameters kernelShape, mapCount, stride, sharing, autoPadding, lowerPad, upperPad can be arrays, e.g. kernelShape={5, 5, 3}",
cnNodeType.c_str(), cnNodeType.c_str());
}
else
{
RuntimeError("%ls: unexpected parameter count. %ls 3 fixed parameters [inputValueNodeName, poolKind, kernelShape] and \n"
"5 optional parameters stride = [1|yourvalue], autoPadding = [true|yourvalue], lowerPad = [0|yourvalue], upperPad = [0|yourvalue], imageLayout = \"cudnn\"|\"HWC\"]. \n"
"Parameters kernelShape, stride, autoPadding, lowerPad, upperPad can be arrays, e.g. kernelShape={5, 5, 3}",
cnNodeType.c_str(), cnNodeType.c_str());
}
}
// setup the parameter position of children so we can hook them up later
nodeParamCount = 2;
nodeParamStart = 0;
nodeParamCount = cnNodeType == OperationNameOf(ConvolutionNode) ? 2 : 1;
if (pass == ndlPassInitial)
{
int id = 2; // skip weightNode and inputValueNode
if (parameter.size() == 3)
{
auto reqParams = node->GetParameters(false);
auto optParams = node->GetParameters(true);
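// paramGetter converts a required parameter (scalar or array) into a TensorShape;
// paramResolver and boolParamResolver do the same for optional named parameters,
// falling back to the supplied default when the parameter is absent
// (boolParamResolver yields a vector<bool> of per-dimension flags).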
auto paramGetter = [reqParams, node](size_t index) -> TensorShape
{
assert(index < reqParams.size());
auto parm = reqParams[index];
if (parm->GetType() != ndlTypeArray)
return TensorShape((size_t)parm->GetScalar());
auto parms = node->GetParentScript()->ParseVariable(parm->GetValue(), false)->GetParameters();
vector<size_t> dims(parms.size());
for (size_t i = 0; i < dims.size(); i++)
dims[i] = parms[i]->GetValue();
return TensorShape(dims);
};
auto paramResolver = [optParams, node](const char* name, size_t defaultVal) -> TensorShape
{
auto res = std::find_if(begin(optParams), end(optParams), [name](const NDLNode<ElemType>* n) { return EqualCI(n->GetName(), name); });
if (res == end(optParams))
return TensorShape(defaultVal);
auto parm = node->GetParentScript()->ParseVariable((*res)->GetValue(), false);
if (parm->GetType() == ndlTypeConstant)
return TensorShape((size_t)parm->GetValue());
auto parms = parm->GetParameters();
vector<size_t> dims(parms.size());
for (size_t i = 0; i < dims.size(); i++)
dims[i] = parms[i]->GetValue();
return TensorShape(dims);
};
auto boolParamResolver = [&optParams, node](const char* name, bool defaultVal) -> vector<bool>
{
auto res = std::find_if(begin(optParams), end(optParams), [name](const NDLNode<ElemType>* n) { return EqualCI(n->GetName(), name); });
if (res == end(optParams))
return vector<bool>{defaultVal};
auto parm = node->GetParentScript()->ParseVariable((*res)->GetValue(), false);
if (parm == nullptr)
return vector<bool>{(*res)->GetValue()};
if (parm->GetType() != ndlTypeArray)
return vector<bool>{parm->GetValue()};
auto parms = parm->GetParameters();
vector<bool> dims(parms.size());
for (size_t i = 0; i < dims.size(); i++)
dims[i] = parms[i]->GetValue();
return dims;
};
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, id, parameter.size() - id, pass);
id = 0; // reset counter because the params array starts at zero
size_t kernelWidth = ((NDLNode<ElemType>*) params[id++])->GetScalar();
size_t kernelHeight = ((NDLNode<ElemType>*) params[id++])->GetScalar();
size_t outputChannels = ((NDLNode<ElemType>*) params[id++])->GetScalar();
size_t horizontalSubsample = ((NDLNode<ElemType>*) params[id++])->GetScalar();
size_t verticalSubsample = ((NDLNode<ElemType>*) params[id++])->GetScalar();
assert(id == 5);
auto kernelShape = paramGetter(reqParams.size() - 1);
auto mapCount = paramResolver("mapCount", 1);
auto stride = paramResolver("stride", 1);
auto sharing = boolParamResolver("sharing", true);
auto autoPad = boolParamResolver("autoPadding", true);
auto lowerPad = paramResolver("lowerPad", 0);
auto upperPad = paramResolver("upperPad", 0);
ImageLayoutKind imageLayout = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "CHW"));
size_t maxTempMemSizeInSamples = node->GetOptionalParameter("maxTempMemSizeInSamples", "0");
// optional
ImageLayoutKind imageLayoutKind = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "HWC"));
bool zeroPadding = node->GetOptionalParameter("zeroPadding", "false");
size_t maxTempMemSizeInSamples = node->GetOptionalParameter("maxTempMemSizeInSamples", "0");
auto pool = PoolKind::None;
if (cnNodeType == OperationNameOf(PoolingNode))
{
auto parm = node->GetParentScript()->ParseVariable(reqParams[1]->GetValue(), false);
pool = PoolKindFrom(wstring(parm->GetValue()));
}
nodePtr = builder.Convolution(NULL, NULL, kernelWidth, kernelHeight, outputChannels,
horizontalSubsample, verticalSubsample, imageLayoutKind, zeroPadding, maxTempMemSizeInSamples, name);
if (pool == PoolKind::None)
{
nodePtr = builder.Convolution(NULL, NULL, kernelShape, mapCount, stride, sharing,
autoPad, lowerPad, upperPad, imageLayout, maxTempMemSizeInSamples, name);
}
else
{
nodePtr = builder.Pooling(NULL, pool, kernelShape, stride, autoPad, lowerPad, upperPad, imageLayout, name);
}
}
else if (parameter.size() == 7)
{
int id = 2; // skip weightNode and inputValueNode
// evaluate only scalar parameters
vector<void*> params = EvaluateParameters(node, baseName, id, parameter.size() - id, pass);
id = 0; // reset counter because the params array starts at zero
size_t kernelWidth = ((NDLNode<ElemType>*) params[id++])->GetScalar();
size_t kernelHeight = ((NDLNode<ElemType>*) params[id++])->GetScalar();
size_t outputChannels = ((NDLNode<ElemType>*) params[id++])->GetScalar();
size_t horizontalSubsample = ((NDLNode<ElemType>*) params[id++])->GetScalar();
size_t verticalSubsample = ((NDLNode<ElemType>*) params[id++])->GetScalar();
assert(id == 5);
// optional
ImageLayoutKind imageLayoutKind = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "HWC"));
bool zeroPadding = node->GetOptionalParameter("zeroPadding", "false");
size_t maxTempMemSizeInSamples = node->GetOptionalParameter("maxTempMemSizeInSamples", "0");
nodePtr = builder.Convolution(NULL, NULL, kernelWidth, kernelHeight, outputChannels,
horizontalSubsample, verticalSubsample, imageLayoutKind, zeroPadding,
maxTempMemSizeInSamples, name);
}
else
assert(false);
}
}
else if (cnNodeType == OperationNameOf(MaxPoolingNode))
@ -392,9 +492,9 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
vector<void*> params = EvaluateParameters(node, baseName, id, parameter.size() - id, pass);
// Optional parameters
bool eval = node->GetOptionalParameter("eval", "false");
bool spatial = node->GetOptionalParameter("spatial", "false");
double normTimeConst = node->GetOptionalParameter("normalizationTimeConstant", "0");
double blendTimeConst = node->GetOptionalParameter("blendTimeConstant", "0");
double epsilon = node->GetOptionalParameter("epsilon", "0.00001");
std::wstring bnEngineS = node->GetOptionalParameter("engine", "cntk");
bool useCntkEngine;
@ -406,7 +506,7 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
InvalidArgument("Unsupported batch normalization engine, choose either \"cntk\"(default) or \"cudnn\".");
ImageLayoutKind imageLayoutKind = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "CHW"));
nodePtr = builder.BatchNormalization(nullptr, nullptr, nullptr, nullptr, nullptr, eval, spatial, normTimeConst, epsilon, useCntkEngine, imageLayoutKind, name);
nodePtr = builder.BatchNormalization(nullptr, nullptr, nullptr, nullptr, nullptr, spatial, normTimeConst, blendTimeConst, epsilon, useCntkEngine, imageLayoutKind, name);
}
}
else

View file

@ -157,6 +157,7 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable)
#endif
else if (EqualInsensitive(nodeType, OperationNameOf(ClassBasedCrossEntropyWithSoftmaxNode), L"CBCEWithSM")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(ConvolutionNode), L"Convolve")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(PoolingNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(CosDistanceNode), L"CosDist")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(CosDistanceWithNegativeSamplesNode), L"CosWithNegSamples")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(CosineNode), L"Cos")) ret = true;

View file

@ -79,14 +79,15 @@ Logistic(label, probability, tag='') = new ComputationNode [ operation = 'Logist
WeightedLogistic(label, probability, instanceWeight, tag='') = new ComputationNode [ operation = 'Logistic' ; inputs = (label : probability : instanceWeight) /*plus the function args*/ ]
ReconcileMBLayout(dataInput, layoutInput, tag='') = new ComputationNode [ operation = 'ReconcileMBLayout' ; inputs = (dataInput : layoutInput) /*plus the function args*/ ]
CastAs (type, data) = ReconcileMBLayout (data, type) # read as CastAs<type>(data) where the cast may consist of rearranging the data w.r.t. MBLayout or broadcasting across sequence items
Convolution(weightNode, inputValueNode, kernelWidth, kernelHeight, outputChannels, horizontalSubsample, verticalSubsample, zeroPadding = false, maxTempMemSizeInSamples = 0, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'Convolution' ; inputs = (weightNode : inputValueNode) /*plus the function args*/ ]
Convolution(weightNode, inputValueNode, kernelDims, mapDims = 1, stride = 1, sharing = true, autoPadding = true, lowerPad = 0, upperPad = 0, imageLayout='CHW', maxTempMemSizeInSamples = 0, tag='') = new ComputationNode [ operation = 'Convolution' ; inputs = (weightNode : inputValueNode); kernelShape = new TensorShape [ dims = kernelDims ] ; mapCount = new TensorShape [ dims = mapDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimSharing = new BoolVector [ items = sharing ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
Pooling(input, poolKind/*'max'|'average'*/, kernelDims, stride=1, autoPadding = true, lowerPad = 0, upperPad = 0, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'Pooling' ; inputs = (input); pool = poolKind ; kernelShape = new TensorShape [ dims = kernelDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
MaxPooling(input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'MaxPooling' ; inputs = input /*plus the function args*/ ]
AveragePooling(input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'AveragePooling' ; inputs = input /*plus the function args*/ ]
ColumnwiseCrossProduct = KhatriRaoProduct // deprecated
ClassificationError = ErrorPrediction
Delay = PastValue
BatchNormalization(input, scale, bias, runMean, runInvStdDev, eval, spatial, normalizationTimeConstant = 0, epsilon = 0.00001, useCntkEngine = true, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'BatchNormalization' ; inputs = (input : scale : bias : runMean : runInvStdDev) /*plus the function args*/ ]
BatchNormalization(input, scale, bias, runMean, runInvStdDev, spatial, normalizationTimeConstant = 0, blendTimeConstant = 0, epsilon = 0.00001, useCntkEngine = true, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'BatchNormalization' ; inputs = (input : scale : bias : runMean : runInvStdDev) /*plus the function args*/ ]
Abs(x, tag='') = new ComputationNode [ operation = 'Abs' ; inputs = x /*plus the function args*/ ]
ClassBasedCrossEntropyWithSoftmax(labelClassDescriptorVectorSequence, mainInputInfo, mainWeight, classLogProbsBeforeSoftmax, tag='') = new ComputationNode [ operation = 'ClassBasedCrossEntropyWithSoftmax' ; inputs = (labelClassDescriptorVectorSequence : mainInputInfo : mainWeight : classLogProbsBeforeSoftmax) /*plus the function args*/ ]
ColumnElementTimes(aVectorSequence, anotherVectorSequence, tag='') = new ComputationNode [ operation = 'ColumnElementTimes' ; inputs = (aVectorSequence : anotherVectorSequence) /*plus the function args*/ ]

View file

@ -70,7 +70,7 @@ void TestCn(const ConfigParameters& config);
void RedirectStdErr(wstring logpath)
{
fprintf(stderr, "Redirecting stderr to file %S\n", logpath.c_str());
LOGPRINTF(stderr, "Redirecting stderr to file %S\n", logpath.c_str());
auto f = make_shared<File>(logpath.c_str(), fileOptionsWrite | fileOptionsText);
if (dup2(fileno(*f), 2) == -1)
{
@ -165,7 +165,7 @@ void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mp
if (numCPUThreads > 0)
{
std::cerr << "Using " << numCPUThreads << " CPU threads." << endl;
LOGPRINTF(stderr, "Using %d CPU threads.\n", numCPUThreads);
}
bool progressTracing = config(L"progressTracing", false);
@ -187,14 +187,14 @@ void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mp
if (action[j] == "train" || action[j] == "trainRNN")
{
wstring modelPath = commandParams("modelPath");
std::wcerr << "CNTKModelPath: " << modelPath << endl;
LOGPRINTF(stderr, "CNTKModelPath: %ls\n", modelPath.c_str());
size_t maxEpochs = GetMaxEpochs(commandParams);
std::cerr << "CNTKCommandTrainInfo: " + command[i] << " : " << maxEpochs << endl;
LOGPRINTF(stderr, "CNTKCommandTrainInfo: %s : %d\n", command[i].c_str(), (int) maxEpochs);
fullTotalMaxEpochs += maxEpochs;
}
}
}
std::cerr << "CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : " << fullTotalMaxEpochs << endl;
LOGPRINTF(stderr, "CNTKCommandTrainInfo: CNTKNoMoreCommands_Total : %d\n", (int) fullTotalMaxEpochs);
// set up progress tracing for compute cluster management
if (progressTracing && (!mpi || mpi->IsMainNode()))
@ -225,19 +225,20 @@ void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mp
// print a banner to visually separate each action in the log
const char* delim = "##############################################################################";
const char* prefix = "Action ";
fprintf(stderr, "\n%s\n", delim);
fprintf(stderr, "#%*s#\n", (int)(strlen(delim) - 2), "");
fprintf(stderr, "# %s\"%s\"%*s #\n", prefix, thisAction.c_str(), (int)(strlen(delim) - strlen(prefix) - thisAction.size() - 6), "");
fprintf(stderr, "#%*s#\n", (int)(strlen(delim) - 2), "");
fprintf(stderr, "%s\n\n", delim);
fprintf(stderr, "\n");
LOGPRINTF(stderr, "%s\n", delim);
LOGPRINTF(stderr, "#%*s#\n", (int)(strlen(delim) - 2), "");
LOGPRINTF(stderr, "# %s\"%s\"%*s #\n", prefix, thisAction.c_str(), (int)(strlen(delim) - strlen(prefix) - thisAction.size() - 6), "");
LOGPRINTF(stderr, "#%*s#\n", (int)(strlen(delim) - 2), "");
LOGPRINTF(stderr, "%s\n\n", delim);
if ((mpi == nullptr) || (commandstoRunOnAllRanks.find(thisAction) != commandstoRunOnAllRanks.end()) || mpi->IsMainNode())
{
if (thisAction == "train" || thisAction == "trainRNN")
{
std::cerr << "CNTKCommandTrainBegin: " + command[i] << endl;
LOGPRINTF(stderr, "CNTKCommandTrainBegin: %s\n", command[i].c_str());
DoTrain<ConfigParameters, ElemType>(commandParams);
std::cerr << "CNTKCommandTrainEnd: " + command[i] << endl;
LOGPRINTF(stderr, "CNTKCommandTrainEnd: %s\n", command[i].c_str());
fullEpochsOffset += GetMaxEpochs(commandParams);
}
else if (thisAction == "adapt")
@ -298,7 +299,8 @@ void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mp
}
}
fprintf(stderr, "\nAction \"%s\" complete.\n\n", thisAction.c_str());
fprintf(stderr, "\n");
LOGPRINTF(stderr, "Action \"%s\" complete.\n\n", thisAction.c_str());
NDLScript<ElemType> ndlScript;
ndlScript.ClearGlobal(); // clear global macros between commands
@ -321,51 +323,51 @@ std::string TimeDateStamp()
void PrintBuiltInfo()
{
fprintf(stderr, "-------------------------------------------------------------------\n");
fprintf(stderr, "Build info: \n\n");
fprintf(stderr, "\t\tBuilt time: %s %s\n", __DATE__, __TIME__);
fprintf(stderr, "\t\tLast modified date: %s\n", __TIMESTAMP__);
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
LOGPRINTF(stderr, "Build info: \n\n");
LOGPRINTF(stderr, "\t\tBuilt time: %s %s\n", __DATE__, __TIME__);
LOGPRINTF(stderr, "\t\tLast modified date: %s\n", __TIMESTAMP__);
#ifdef _BUILDTYPE_
fprintf(stderr, "\t\tBuild type: %s\n", _BUILDTYPE_);
LOGPRINTF(stderr, "\t\tBuild type: %s\n", _BUILDTYPE_);
#endif
#ifdef _BUILDTARGET_
fprintf(stderr, "\t\tBuild target: %s\n", _BUILDTARGET_);
LOGPRINTF(stderr, "\t\tBuild target: %s\n", _BUILDTARGET_);
#endif
#ifdef _WITH_1BITSGD_
fprintf(stderr, "\t\tWith 1bit-SGD: %s\n", _WITH_1BITSGD_);
LOGPRINTF(stderr, "\t\tWith 1bit-SGD: %s\n", _WITH_1BITSGD_);
#endif
#ifdef _MATHLIB_
fprintf(stderr, "\t\tMath lib: %s\n", _MATHLIB_);
LOGPRINTF(stderr, "\t\tMath lib: %s\n", _MATHLIB_);
#endif
#ifdef _CUDA_PATH_
fprintf(stderr, "\t\tCUDA_PATH: %s\n", _CUDA_PATH_);
LOGPRINTF(stderr, "\t\tCUDA_PATH: %s\n", _CUDA_PATH_);
#endif
#ifdef _CUB_PATH_
fprintf(stderr, "\t\tCUB_PATH: %s\n", _CUB_PATH_);
LOGPRINTF(stderr, "\t\tCUB_PATH: %s\n", _CUB_PATH_);
#endif
#ifdef _CUDNN_PATH_
fprintf(stderr, "\t\tCUDNN_PATH: %s\n", _CUDNN_PATH_);
LOGPRINTF(stderr, "\t\tCUDNN_PATH: %s\n", _CUDNN_PATH_);
#endif
#ifdef _GIT_EXIST
fprintf(stderr, "\t\tBuild Branch: %s\n", _BUILDBRANCH_);
fprintf(stderr, "\t\tBuild SHA1: %s\n", _BUILDSHA1_);
LOGPRINTF(stderr, "\t\tBuild Branch: %s\n", _BUILDBRANCH_);
LOGPRINTF(stderr, "\t\tBuild SHA1: %s\n", _BUILDSHA1_);
#endif
#ifdef _BUILDER_
fprintf(stderr, "\t\tBuilt by %s on %s\n", _BUILDER_, _BUILDMACHINE_);
LOGPRINTF(stderr, "\t\tBuilt by %s on %s\n", _BUILDER_, _BUILDMACHINE_);
#endif
#ifdef _BUILDPATH_
fprintf(stderr, "\t\tBuild Path: %s\n", _BUILDPATH_);
LOGPRINTF(stderr, "\t\tBuild Path: %s\n", _BUILDPATH_);
#endif
fprintf(stderr, "-------------------------------------------------------------------\n");
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
}
void PrintUsageInfo()
{
fprintf(stderr, "-------------------------------------------------------------------\n");
fprintf(stderr, "Usage: cntk configFile=yourConfigFile\n");
fprintf(stderr, "For detailed information please consult the CNTK book\n");
fprintf(stderr, "\"An Introduction to Computational Networks and the Computational Network Toolkit\"\n");
fprintf(stderr, "-------------------------------------------------------------------\n");
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
LOGPRINTF(stderr, "Usage: cntk configFile=yourConfigFile\n");
LOGPRINTF(stderr, "For detailed information please consult the CNTK book\n");
LOGPRINTF(stderr, "\"An Introduction to Computational Networks and the Computational Network Toolkit\"\n");
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
}
// ---------------------------------------------------------------------------
@ -414,7 +416,7 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
for (const auto& arg : args)
startupMessage += L" " + arg;
fprintf(stderr, "%ls\n", startupMessage.c_str());
LOGPRINTF(stderr, "%ls\n", startupMessage.c_str());
// parse command-line options
vector<wstring> sourceFiles;
@ -443,6 +445,7 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
// compile the BrainScript
wstring bs = L"[\n";
bs += L"include \'cntk.core.bs'"; // start with including the standard macros
// Note: Using lowercase ^^ here to match the Linux name of the CNTK exe.
//bs += standardFunctions + computationNodes + commonMacros + L"\n";
for (const auto& sourceFile : sourceFiles)
@ -451,7 +454,8 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
for (const auto& over : overrides)
bs += L"with [ " + over + L" ]\n";
fprintf(stderr, "\n\nBrainScript -->\n\n%ls\n\n", bs.c_str());
fprintf(stderr, "\n\n");
LOGPRINTF(stderr, "BrainScript -->\n\n%ls\n\n", bs.c_str());
let expr = BS::ParseConfigExpression(bs, move(includePaths)); // parse
let valp = BS::Evaluate(expr); // evaluate parse into a dictionary
@ -460,8 +464,10 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
// legacy parameters that have changed spelling
if (config.Find(L"DoneFile")) // variables follow camel case (start with lower-case letters)
InvalidArgument("Legacy spelling of 'DoneFile' no longer allowed. Use 'doneFile'.");
if (config.Find(L"command")) // spelling error, should be plural. Using 'actions' instead to match the data type.
InvalidArgument("Legacy spelling of 'command' no longer allowed. Use 'actions'.");
if (config.Find(L"type"))
InvalidArgument("Legacy name 'type' no longer allowed. Use 'precision'.");
@ -486,7 +492,7 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
logpath += msra::strfun::wstrprintf(L"rank%d", (int) mpi->CurrentNodeRank());
RedirectStdErr(logpath);
fprintf(stderr, "%ls\n", startupMessage.c_str());
LOGPRINTF(stderr, "%ls\n", startupMessage.c_str());
}
// echo config info to log
@ -497,16 +503,18 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
int numCPUThreads = config(L"numCPUThreads", 0);
numCPUThreads = CPUMatrix<float /*any will do*/>::SetNumThreads(numCPUThreads);
if (numCPUThreads > 0)
fprintf(stderr, "Using %d CPU threads.\n", numCPUThreads);
LOGPRINTF(stderr, "Using %d CPU threads.\n", numCPUThreads);
bool progressTracing = config(L"progressTracing", false);
size_t fullTotalMaxEpochs = 1; // BUGBUG: BS does not allow me to read out the max epochs parameters, as that would instantiate and thus execute the objects
// set up progress tracing for compute cluster management
if (progressTracing && ((mpi == nullptr) || mpi->IsMainNode()))
ProgressTracing::TraceTotalNumberOfSteps(fullTotalMaxEpochs); // enable tracing, using this as the total number of epochs
// MAIN LOOP that executes the actions
auto actionsVal = config[L"actions"];
// Note: weird behavior. If 'actions' is a scalar value (rather than an array) then it will have been resolved already after the above call. That means, it has already completed its action!
// Not pretty, but a direct consequence of the lazy evaluation. The only good solution would be to have a syntax for arrays including length 0 and 1.
// Since this in the end behaves indistinguishable from the array loop below, we will keep it for now.
@ -532,7 +540,9 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
fprintf(fp, "successfully finished at %s on %s\n", TimeDateStamp().c_str(), GetHostName().c_str());
fcloseOrDie(fp);
}
fprintf(stderr, "COMPLETED\n"), fflush(stderr);
// TODO: change this back to COMPLETED, double underscores don't look good in output
LOGPRINTF(stderr, "__COMPLETED__\n");
fflush(stderr);
MPIWrapper::DeleteInstance();
return EXIT_SUCCESS;
@ -541,11 +551,16 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
// ---------------------------------------------------------------------------
// main() for old CNTK config language
// ---------------------------------------------------------------------------
int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which is a wrapper that catches & reports Win32 exceptions
// called from wmain which is a wrapper that catches & reports Win32 exceptions
int wmainOldCNTKConfig(int argc, wchar_t* argv[])
{
ConfigParameters config;
std::string rawConfigString = ConfigParameters::ParseCommandLine(argc, argv, config);
std::string rawConfigString = ConfigParameters::ParseCommandLine(argc, argv, config); // get the command param set they want
bool timestamping = config(L"timestamping", false);
if (timestamping)
{
ProgressTracing::SetTimestampingFlag();
}
// get the command param set they want
wstring logpath = config(L"stderr", L"");
@ -586,8 +601,9 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which is
std::string timestamp = TimeDateStamp();
// dump config info
fprintf(stderr, "\nRunning on %s at %s\n", GetHostName().c_str(), timestamp.c_str());
fprintf(stderr, "Command line: \n");
fprintf(stderr, "\n");
LOGPRINTF(stderr, "Running on %s at %s\n", GetHostName().c_str(), timestamp.c_str());
LOGPRINTF(stderr, "Command line: \n");
for (int i = 0; i < argc; i++)
fprintf(stderr, "%*s%ls", i > 0 ? 2 : 0, "", argv[i]); // use 2 spaces for better visual separability
fprintf(stderr, "\n\n");
@ -595,24 +611,27 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which is
#if 1 //def _DEBUG
// This simply merges all the different config parameters specified (eg, via config files or via command line directly),
// and prints it.
fprintf(stderr, "\n\n>>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>\n");
fprintf(stderr, "%s\n", rawConfigString.c_str());
fprintf(stderr, "<<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<\n");
fprintf(stderr, "\n\n");
LOGPRINTF(stderr, ">>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>\n");
LOGPRINTF(stderr, "%s\n", rawConfigString.c_str());
LOGPRINTF(stderr, "<<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<\n");
// Same as above, but all variables are resolved. If a parameter is set multiple times (eg, set in config, overriden at command line),
// Same as above, but all variables are resolved. If a parameter is set multiple times (eg, set in config, overridden at command line),
// All of these assignments will appear, even though only the last assignment matters.
fprintf(stderr, "\n>>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n");
fprintf(stderr, "%s\n", config.ResolveVariables(rawConfigString).c_str());
fprintf(stderr, "<<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n");
fprintf(stderr, "\n");
LOGPRINTF(stderr, ">>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n");
LOGPRINTF(stderr, "%s\n", config.ResolveVariables(rawConfigString).c_str());
LOGPRINTF(stderr, "<<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n");
// This outputs the final value each variable/parameter is assigned to in config (so if a parameter is set multiple times, only the last
// value it is set to will appear).
fprintf(stderr, "\n>>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n");
fprintf(stderr, "\n");
LOGPRINTF(stderr, ">>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n");
config.dumpWithResolvedVariables();
fprintf(stderr, "<<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n");
LOGPRINTF(stderr, "<<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n");
#endif
fprintf(stderr, "Commands:");
LOGPRINTF(stderr, "Commands:");
for (int i = 0; i < command.size(); i++)
fprintf(stderr, " %s", command[i].c_str());
fprintf(stderr, "\n");
@ -623,7 +642,8 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which is
if (config.Exists("type"))
InvalidArgument("CNTK: Use of 'type' parameter is deprecated, it is called 'precision' now.");
fprintf(stderr, "Precision = \"%s\"\n", type.c_str());
LOGPRINTF(stderr, "Precision = \"%s\"\n", type.c_str());
if (type == "float")
DoCommands<float>(config, mpi);
else if (type == "double")
@ -638,7 +658,8 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[]) // called from wmain which is
fprintf(fp, "successfully finished at %s on %s\n", TimeDateStamp().c_str(), GetHostName().c_str());
fcloseOrDie(fp);
}
fprintf(stderr, "COMPLETED\n"), fflush(stderr);
// TODO: Change back to COMPLETED (no underscores)
LOGPRINTF(stderr, "__COMPLETED__\n"), fflush(stderr);
MPIWrapper::DeleteInstance();
return EXIT_SUCCESS;
@ -659,43 +680,52 @@ void AllocationFailureHandler()
int wmain1(int argc, wchar_t* argv[]) // called from wmain which is a wrapper that catches & reports Win32 exceptions
{
std::set_new_handler(AllocationFailureHandler);
try
{
{
PrintBuiltInfo(); // print build info directly in case that user provides zero argument (convenient for checking build type)
if (argc <= 1)
{
fprintf(stderr, "No command-line argument given.\n");
LOGPRINTF(stderr, "No command-line argument given.\n");
PrintUsageInfo();
return EXIT_FAILURE;
}
// detect legacy CNTK configuration
bool isOldCNTKConfig = false;
for (int i = 0; i < argc && !isOldCNTKConfig; i++)
isOldCNTKConfig |= !_wcsnicmp(L"configFile=", argv[i], 11);
if (isOldCNTKConfig)
return wmainOldCNTKConfig(argc, argv);
// run from BrainScript
return wmainWithBS(argc, argv);
}
catch (const ScriptableObjects::ScriptingException& err)
{
fprintf(stderr, "\nEXCEPTION occurred: %s\n", err.what());
fprintf(stderr, "\n");
LOGPRINTF(stderr, "EXCEPTION occurred: %s\n", err.what());
err.PrintError();
return EXIT_FAILURE;
}
catch (const IExceptionWithCallStackBase& err)
{
fprintf(stderr, "\nEXCEPTION occurred: %s\n%s", dynamic_cast<const std::exception&>(err).what(), err.CallStack());
fprintf(stderr, "\n");
LOGPRINTF(stderr, "EXCEPTION occurred: %s\n%s", dynamic_cast<const std::exception&>(err).what(), err.CallStack());
return EXIT_FAILURE;
}
catch (const std::exception& err)
{
fprintf(stderr, "\nEXCEPTION occurred: %s\n", err.what());
fprintf(stderr, "\n");
LOGPRINTF(stderr, "EXCEPTION occurred: %s\n", err.what());
return EXIT_FAILURE;
}
catch (...)
{
fprintf(stderr, "\nUnknown ERROR occurred\n");
fprintf(stderr, "\n");
LOGPRINTF(stderr, "Unknown ERROR occurred\n");
return EXIT_FAILURE;
}
}
@ -703,7 +733,8 @@ int wmain1(int argc, wchar_t* argv[]) // called from wmain which is a wrapper th
#ifdef __WINDOWS__
void TerminateThis()
{
fprintf(stderr, "terminate_this: aborting\n"), fflush(stderr);
LOGPRINTF(stderr, "terminate_this: aborting\n");
fflush(stderr);
exit(EXIT_FAILURE);
}
@ -714,7 +745,7 @@ static void LogDelayLoadError(PEXCEPTION_POINTERS pExcPointers)
if (pExcPointers->ExceptionRecord->ExceptionCode == EXCEPTION_DLL_NOT_FOUND)
{
const auto & pDelayLoadInfo = *PDelayLoadInfo(pExcPointers->ExceptionRecord->ExceptionInformation[0]);
fprintf(stderr, "CNTK: Failed to load DLL '%s'.\n", pDelayLoadInfo.szDll);
LOGPRINTF(stderr, "CNTK: Failed to load DLL '%s'.\n", pDelayLoadInfo.szDll);
}
}
@ -736,7 +767,7 @@ int wmain(int argc, wchar_t* argv[]) // wmain wrapper that reports Win32 excepti
else if (code == EXCEPTION_INT_DIVIDE_BY_ZERO) msg = ": Integer division by zero";
else if (code == EXCEPTION_STACK_OVERFLOW) msg = ": Stack overflow";
else if (code == EXCEPTION_DLL_NOT_FOUND) msg = ": Module not found";
fprintf(stderr, "CNTK: Caught Win32 exception 0x%08x%s.\n", (unsigned int)code, msg);
LOGPRINTF(stderr, "CNTK: Caught Win32 exception 0x%08x%s.\n", (unsigned int)code, msg);
fflush(stderr);
exit(EXIT_FAILURE);
}

View file

@ -9,6 +9,7 @@
#include "ModelEditLanguage.h"
#include "ConvolutionalNodes.h"
#include "InputAndParamNodes.h"
#include <map>
namespace Microsoft { namespace MSR { namespace CNTK {
@ -58,8 +59,7 @@ enum MELProperty
melPropFinalCriterion,
melPropEvaluation,
melPropOutput,
melPropRecurrent,
melPropBatchNormMode
melPropRecurrent
};
// SetGroupTag - Set the group tag on a node
@ -73,7 +73,7 @@ void MELScript<ElemType>::SetGroupTag(ComputationNodeBasePtr nodeProp, Computati
cn->AddToNodeGroup(groupTag, nodeProp);
else
cn->RemoveFromNodeGroup(groupTag, nodeProp);
}
}
// ProcessNDLScript - Process the NDL script
// netNdl - netNDL structure
@ -384,18 +384,7 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa
inputNodes[i - 1] = nodeFrom[0];
}
#if 1
nodeTo[0]->AttachInputs(inputNodes);
#else // TODO: delete this
if (inputNodes.size() == 1)
nodeTo[0]->AttachInputs(inputNodes[0]);
else if (inputNodes.size() == 2)
nodeTo[0]->AttachInputs(inputNodes[0], inputNodes[1]);
else if (inputNodes.size() == 3)
nodeTo[0]->AttachInputs(inputNodes[0], inputNodes[1], inputNodes[2]);
else
RuntimeError("SetNodeInputs(): You specified more than 3 input nodes.");
#endif
}
else if (EqualInsensitive(name, "SetProperty"))
{
@ -416,8 +405,7 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa
// map property name to property enum
// Please keep this table sorted.
if (EqualInsensitive(propName, "batchNormEvalMode")) prop = melPropBatchNormMode;
else if (EqualInsensitive(propName, "criterion")) prop = melPropFinalCriterion;
if (EqualInsensitive(propName, "criterion")) prop = melPropFinalCriterion;
else if (EqualInsensitive(propName, "evaluation")) prop = melPropEvaluation;
else if (EqualInsensitive(propName, "feature")) prop = melPropFeature;
else if (EqualInsensitive(propName, "label")) prop = melPropLabel;
@ -483,32 +471,6 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa
// what to do here?
break;
}
case melPropBatchNormMode:
{
if (node->OperationName() != OperationNameOf(BatchNormalizationNode))
{
RuntimeError("Invalid node type: node %ls (type:%ls) is not a %ls node; therefore cannot apply batchNormEvalMode on it.",
node->NodeName().c_str(),
node->OperationName().c_str(),
OperationNameOf(BatchNormalizationNode).c_str());
}
bool property = params[2];
auto pnode = dynamic_pointer_cast<BatchNormalizationNode<float>>(node);
if (pnode)
pnode->SetEvalMode(property);
else
{
auto pnode2 = dynamic_pointer_cast<BatchNormalizationNode<double>>(node);
if (pnode2)
pnode2->SetEvalMode(property);
else
{
RuntimeError("Invalid node type: node name=%ls. We assume either BatchNormalizationNode<float> or BatchNormalizationNode<double>\n",
node->NodeName().c_str());
}
}
break;
}
default:
{
RuntimeError("Invalid property, %s, is not supported", propName.c_str());
@ -534,10 +496,6 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa
{
prop = melPropLearningRateMultiplier;
}
else if (EqualInsensitive(propName, "batchNormEvalMode"))
{
prop = melPropBatchNormMode;
}
else
{
RuntimeError("Invalid property, %s, is not supported", propName.c_str());
@ -566,12 +524,6 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa
netNdl->cn->SetLearnableNodesBelowLearningRateMultiplier(learningRateMultiplier, node);
break;
}
case melPropBatchNormMode:
{
bool evalMode = params[2];
netNdl->cn->SetBatchNormalizationNodesBelowEvalMode(evalMode, node);
break;
}
default:
{
RuntimeError("Invalid property, %s, is not supported", propName.c_str());

View file

@ -4,10 +4,33 @@
//
#pragma once
#include <chrono>
#include "TimerUtility.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// TODO: make these proper C++ functions with variadic templates and names that reflect their difference from fprintf(stderr), which already implies printing to the log
// If the timestamping flag is set, print out a timestamp with no newline at the end
#define PREPENDTS(stream) \
do \
{ \
if (ProgressTracing::GetTimestampingFlag()) \
{ \
std::time_t tt = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); \
char mbstr[30]; \
if (std::strftime(mbstr, sizeof(mbstr), "%m/%d/%Y %H:%M:%S", std::localtime(&tt))) \
fprintf(stream, "%s: ", mbstr); \
} \
} while(0)
// Print out a log message. If the timestamping flag is set, prepend it with a timestamp
#define LOGPRINTF(stream, ...) \
do \
{ \
PREPENDTS(stream); \
fprintf(stream, __VA_ARGS__); \
} while(0)
// ---------------------------------------------------------------------------
// ProgressTracing -- static helper class for logging a progress indicator
//
@ -29,12 +52,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
bool m_enabled;
bool m_tracingFlag;
bool m_timestampFlag; // TODO: What does this do? TODO: camelCase
size_t m_totalNumberOfSteps; // total number of epochs in entire training run
size_t m_currentStepOffset; // current offset
Timer m_progressTracingTimer;
ProgressTracing()
: m_enabled(false), m_tracingFlag(false), m_totalNumberOfSteps(0), m_currentStepOffset(0)
: m_enabled(false), m_tracingFlag(false), m_timestampFlag(false), m_totalNumberOfSteps(0), m_currentStepOffset(0)
{
}
@ -50,12 +74,24 @@ public:
return GetStaticInstance().m_tracingFlag;
}
static bool GetTimestampingFlag()
{
return GetStaticInstance().m_timestampFlag;
// TODO: timestampFlag or timestampingFlag? (Or timeStampFlag?)
}
static void SetTracingFlag()
{
auto& us = GetStaticInstance();
us.m_tracingFlag = true;
}
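// Set when the top-level config option 'timestamping' is true (see wmainOldCNTKConfig); LOGPRINTF then prepends a timestamp to each log line.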
static void SetTimestampingFlag()
{
auto& us = GetStaticInstance();
us.m_timestampFlag = true;
}
// call TraceTotalNumberOfSteps() to set the total number of steps
// Calling this with totalNumberOfSteps>0 will enable progress tracing.
static void TraceTotalNumberOfSteps(size_t totalNumberOfSteps)

View file

@ -780,6 +780,11 @@ static inline ImageLayoutKind ImageLayoutKindFrom(const wstring& s)
struct ImageDimensions
{
size_t m_width, m_height, m_numChannels;
// convenience accessors. TODO: use only one name. Rename the members themselves?
size_t w() const { return m_width; }
size_t h() const { return m_height; }
size_t c() const { return m_numChannels; }
// interpret TensorShape as image
ImageDimensions(const TensorShape& shape, ImageLayoutKind imageLayoutKind)
{
@ -787,14 +792,14 @@ struct ImageDimensions
InvalidArgument("Convolution operation currently only supports 1D or 2D convolution on 3D tensors.");
if (imageLayoutKind == ImageLayoutKind::CHW)
{
m_width = shape[0];
m_height = shape[1];
m_width = shape[0];
m_height = shape[1];
m_numChannels = shape[2];
}
else if (imageLayoutKind == ImageLayoutKind::HWC)
{
m_width = shape[1];
m_height = shape[2];
m_width = shape[1];
m_height = shape[2];
m_numChannels = shape[0];
}
else

View file

@ -609,11 +609,6 @@ void renameOrDie(const std::string& from, const std::string& to)
// WORKAROUND: "rename" should do this but this is a workaround
// to the HDFS FUSE implementation's bug of failing to do so
// workaround for FUSE rename when running on Philly
if (ProgressTracing::GetTracingFlag())
{
fprintf(stderr, "rename %s to %s\n", from.c_str(), to.c_str());
}
unlinkOrDie(to);
if (rename(from.c_str(), to.c_str()) != 0)
{

View file

@ -514,25 +514,32 @@ template <class ElemType>
}
template <class ElemType>
/*static*/ void ComputationNetwork::SetBatchNormalizationTimeConstant(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double normalizationTimeConstant, double& prevNormalizationTimeConstant)
/*static*/ void ComputationNetwork::SetBatchNormalizationTimeConstants(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode,
double normalizationTimeConstant, double& prevNormalizationTimeConstant,
double blendTimeConstant, double& prevBlendTimeConstant)
{
if (normalizationTimeConstant != prevNormalizationTimeConstant && normalizationTimeConstant != numeric_limits<double>::infinity())
if (normalizationTimeConstant != prevNormalizationTimeConstant || blendTimeConstant != prevBlendTimeConstant)
{
fprintf(stderr, "Setting batch normalization time constant to %.8g.\n", normalizationTimeConstant);
if (normalizationTimeConstant != prevNormalizationTimeConstant)
fprintf(stderr, "Setting batch normalization time constant to %.8g.\n", normalizationTimeConstant);
if (blendTimeConstant != prevBlendTimeConstant)
fprintf(stderr, "Setting batch normalization blend time constant to %.8g.\n", blendTimeConstant);
// TODO: Change this to use an interface that is independent of <ElemType>.
list<ComputationNodeBasePtr> batchNormalizationNodes = net->GetNodesWithType(OperationNameOf(BatchNormalizationNode), criterionNode);
if (batchNormalizationNodes.size() == 0 && normalizationTimeConstant != numeric_limits<double>::infinity())
auto batchNormalizationNodes = net->GetNodesWithType(OperationNameOf(BatchNormalizationNode), criterionNode);
if (batchNormalizationNodes.size() == 0)
fprintf(stderr, "WARNING: there is no batch normalization node.\n");
else
{
for (auto& nodeIter : batchNormalizationNodes)
{
auto node = dynamic_pointer_cast<BatchNormalizationNode<ElemType>>(nodeIter);
node->SetNormalizationTimeConstant(normalizationTimeConstant);
node->SetNormalizationTimeConstants(normalizationTimeConstant, prevNormalizationTimeConstant,
blendTimeConstant, prevBlendTimeConstant);
}
}
prevNormalizationTimeConstant = normalizationTimeConstant;
prevBlendTimeConstant = blendTimeConstant;
}
}
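// Usage sketch (variable names assumed), e.g. called from SGD before starting an epoch:
//     ComputationNetwork::SetBatchNormalizationTimeConstants<ElemType>(net, criterionNode,
//         /*normalizationTimeConstant=*/ 4096, prevNormalizationTimeConstant,
//         /*blendTimeConstant=*/ 0, prevBlendTimeConstant);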
@ -1434,7 +1441,7 @@ template void ComputationNetwork::Read<float>(const wstring& fileName);
template void ComputationNetwork::ReadPersistableParameters<float>(File& fstream, bool create);
template void ComputationNetwork::PerformSVDecomposition<float>(const map<wstring, float>& SVDConfig, size_t alignedsize);
template /*static*/ void ComputationNetwork::SetDropoutRate<float>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double& prevDropoutRate, unsigned long& dropOutSeed);
template /*static*/ void ComputationNetwork::SetBatchNormalizationTimeConstant<float>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double normalizationTimeConstant, double& prevNormalizationTimeConstant);
template /*static*/ void ComputationNetwork::SetBatchNormalizationTimeConstants<float>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double normalizationTimeConstant, double& prevNormalizationTimeConstant, double blendTimeConstant, double& prevBlendTimeConstant);
template void ComputationNetwork::SetSeqParam<float>(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, const double& hsmoothingWeight, const double& frameDropThresh, const bool& doreferencealign,
const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR);
template void ComputationNetwork::SaveToDbnFile<float>(ComputationNetworkPtr net, const std::wstring& fileName) const;
@ -1444,7 +1451,7 @@ template void ComputationNetwork::Read<double>(const wstring& fileName);
template void ComputationNetwork::ReadPersistableParameters<double>(File& fstream, bool create);
template void ComputationNetwork::PerformSVDecomposition<double>(const map<wstring, float>& SVDConfig, size_t alignedsize);
template /*static*/ void ComputationNetwork::SetDropoutRate<double>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double& prevDropoutRate, unsigned long& dropOutSeed);
template /*static*/ void ComputationNetwork::SetBatchNormalizationTimeConstant<double>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double normalizationTimeConstant, double& prevNormalizationTimeConstant);
template /*static*/ void ComputationNetwork::SetBatchNormalizationTimeConstants<double>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double normalizationTimeConstant, double& prevNormalizationTimeConstant, double blendTimeConstant, double& prevBlendTimeConstant);
template void ComputationNetwork::SetSeqParam<double>(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, const double& hsmoothingWeight, const double& frameDropThresh, const bool& doreferencealign,
const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR);
template void ComputationNetwork::SaveToDbnFile<double>(ComputationNetworkPtr net, const std::wstring& fileName) const;

View file

@ -103,8 +103,6 @@ public:
Read<ElemType>(fileName);
// perform all further post-processing, caching, etc.
CompileNetwork();
// To ensure that all the BN nodes changed to eval mode unless it's in Training mode.
SetBatchNormalizationNodesBelowEvalMode(true);
}
// static helper to instantiate a network from a file
@ -363,7 +361,6 @@ public:
void AddFeatureNode(ComputationNodeBasePtr featureNode);
//ComputationNodeBasePtr RemoveFeatureNode(ComputationNodeBasePtr featureNode);
void SetLearnableNodesBelowLearningRateMultiplier(const float learningRateMultiplier, const ComputationNodeBasePtr& rootNode = nullptr);
void SetBatchNormalizationNodesBelowEvalMode(const bool evalMode, const ComputationNodeBasePtr& rootNode = nullptr);
// -----------------------------------------------------------------------
// node access
@ -429,7 +426,9 @@ public:
static void SetDropoutRate(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double& prevDropoutRate, unsigned long& dropOutSeed);
template <class ElemType>
static void SetBatchNormalizationTimeConstant(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double normalizationTimeConstant, double& prevNormalizationTimeConstant);
static void SetBatchNormalizationTimeConstants(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode,
double normalizationTimeConstant, double& prevNormalizationTimeConstant,
double blendTimeConstant, double& prevBlendTimeConstant);
template <class ElemType>
static void SetSeqParam(ComputationNetworkPtr net,

View file

@ -106,13 +106,13 @@ void ComputationNetwork::FormRecurrentLoops(const ComputationNodeBasePtr& rootNo
assert(node->m_numNonDelayedParentsInLoop == 0); // (in PurgeStateForFormingRecurrentLoops())
}
for (let& node : nestedNodes)
{
for (auto& input : node->GetInputs())
{
for (auto& input : node->GetInputs())
{
if (input->m_loopId == node->m_loopId && GetRecurrenceSteppingDirection(node) == 0/*not a Delay node*/)
input->m_numNonDelayedParentsInLoop++; // count #parents of 'input' that are not delay nodes
}
}
}
// re-traverse the graph for all nestedNodes, starting with the first
// Then update m_nestedNodes with the re-traversed order.
@ -301,19 +301,19 @@ void ComputationNetwork::DetermineSCCsR(ComputationNodeBasePtr cur,
for (let& iter : m_allSEQNodes)
{
for (let& iter2 : iter->m_nestedNodes)
{
{
if (iter2 == cur)
{
bFound = true;
{
bFound = true;
// validate that the loop is really the same, by a set comparison
unordered_set<ComputationNodeBasePtr> newLoop ( nestedNodes.begin(), nestedNodes.end());
unordered_set<ComputationNodeBasePtr> existingLoop(iter->m_nestedNodes.begin(), iter->m_nestedNodes.end());
if (newLoop != existingLoop)
LogicError("DetermineSCCsR: %ls %ls operation rediscovered in a loop, but that loop is not the same as last time.", cur->NodeName().c_str(), cur->OperationName().c_str());
break;
}
break;
}
}
}
if (bFound)
fprintf(stderr, "\nDetermineSCCsR: %ls %ls operation was discovered multiple times as as loop participant", cur->NodeName().c_str(), cur->OperationName().c_str());
// TODO: Once we forbid FormRecurrentLoops() from non-NULL, can we ever re-hit a loop here? If not, then turn bFound into a LogicError().

View file

@ -128,6 +128,7 @@ static shared_ptr<ComputationNode<ElemType>> CreateNode(const std::wstring& node
if (nodeType == OperationNameOf(AveragePoolingNode)) return New<AveragePoolingNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(BatchNormalizationNode)) return New<BatchNormalizationNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ConvolutionNode)) return New<ConvolutionNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(PoolingNode)) return New<PoolingNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(SparseInputValue)) return New<SparseInputValue<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(InputValue)) return New<InputValue<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(LearnableParameter)) return New<LearnableParameter<ElemType>>(forward<_Types>(_Args)...);
@ -229,6 +230,27 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Creat
maxTempMemSizeInSamples));
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateConvolutionNode(const std::wstring& nodeName, const TensorShape& kernelShape, const TensorShape& mapCount,
const TensorShape& strideShape, const std::vector<bool>& sharing,
const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples)
{
return net.AddNodeToNetWithElemType(New<ConvolutionNode<ElemType>>(net.GetDeviceId(), nodeName,
kernelShape, mapCount, strideShape,
sharing, autoPadding, lowerPad, upperPad,
imageLayout, maxTempMemSizeInSamples));
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreatePoolingNode(const std::wstring& nodeName, PoolKind poolKind, const TensorShape& kernelShape, const TensorShape& strideShape,
const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout)
{
return net.AddNodeToNetWithElemType(New<PoolingNode<ElemType>>(net.GetDeviceId(), nodeName,
poolKind, kernelShape, strideShape, autoPadding, lowerPad, upperPad, imageLayout));
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateMaxPoolingNode(const std::wstring& nodeName,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind)
@ -261,7 +283,9 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Creat
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Convolution(const ComputationNodePtr weight,
const ComputationNodePtr inputValues,
const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind, const bool zeroPadding, const size_t maxTempMemSizeInSamples,
const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels,
const size_t horizontalSubsample, const size_t verticalSubsample,
ImageLayoutKind imageLayoutKind, const bool zeroPadding, const size_t maxTempMemSizeInSamples,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ConvolutionNode<ElemType>>(net.GetDeviceId(), nodeName,
@ -269,6 +293,34 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Convo
maxTempMemSizeInSamples), { weight, inputValues });
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Convolution(const ComputationNodePtr weight,
const ComputationNodePtr inputValues,
const TensorShape& kernelShape, const TensorShape& mapCount,
const TensorShape& strideShape, const std::vector<bool>& sharing,
const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ConvolutionNode<ElemType>>(net.GetDeviceId(), nodeName,
kernelShape, mapCount, strideShape,
sharing, autoPadding, lowerPad, upperPad,
imageLayout, maxTempMemSizeInSamples),
weight, inputValues);
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Pooling(const ComputationNodePtr inputValues,
PoolKind poolKind, const TensorShape& kernelShape, const TensorShape& strideShape,
const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<PoolingNode<ElemType>>(net.GetDeviceId(), nodeName,
poolKind, kernelShape, strideShape, autoPadding, lowerPad, upperPad, imageLayout),
inputValues);
}
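// Usage sketch of the two tensor-shape overloads (the names 'builder', 'w', 'x' and the exact shape/sharing conventions are assumptions):
//     auto conv = builder.Convolution(w, x,
//         TensorShape(std::vector<size_t>{ 5, 5, 3 }),   // 5x5 kernel over 3 input channels
//         TensorShape(16),                               // 16 output feature maps
//         TensorShape(std::vector<size_t>{ 1, 1, 3 }),   // stride
//         std::vector<bool>{ true },                     // share the kernel across all dimensions
//         std::vector<bool>{ true, true, false },        // auto-pad width/height but not channels
//         TensorShape(0), TensorShape(0),                // no explicit lower/upper padding
//         ImageLayoutKind::CHW, /*maxTempMemSizeInSamples=*/ 0, L"conv1");
//     auto pool = builder.Pooling(conv, PoolKind::Max,
//         TensorShape(std::vector<size_t>{ 2, 2, 1 }),   // 2x2 pooling window
//         TensorShape(std::vector<size_t>{ 2, 2, 1 }),   // stride 2
//         std::vector<bool>{ false }, TensorShape(0), TensorShape(0),
//         ImageLayoutKind::CHW, L"pool1");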
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::MaxPooling(const ComputationNodePtr inputValues,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind,
@ -636,10 +688,11 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Looku
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::BatchNormalization(const ComputationNodePtr input,
const ComputationNodePtr scale, const ComputationNodePtr bias, const ComputationNodePtr runMean, const ComputationNodePtr runInvStdDev,
bool eval, bool spatial, double normalizationTimeConstant, double epsilon, bool useCntkEngine, ImageLayoutKind imageLayoutKind,
bool spatial, double normalizationTimeConstant, double blendTimeConstant, double epsilon, bool useCntkEngine,
ImageLayoutKind imageLayoutKind,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<BatchNormalizationNode<ElemType>>(net.GetDeviceId(), nodeName, eval, spatial, normalizationTimeConstant, epsilon, useCntkEngine, imageLayoutKind), { input, scale, bias, runMean, runInvStdDev });
return net.AddNodeToNetAndAttachInputs(New<BatchNormalizationNode<ElemType>>(net.GetDeviceId(), nodeName, spatial, normalizationTimeConstant, blendTimeConstant, epsilon, useCntkEngine, imageLayoutKind), { input, scale, bias, runMean, runInvStdDev });
}
template class ComputationNetworkBuilder<float>;

View file

@ -7,7 +7,8 @@
#include "Basics.h"
#include "ComputationNode.h"
#include "ComputationNetwork.h"
#include "TrainingNodes.h" // for NCEEvalMode
#include "TrainingNodes.h" // for NCEEvalMode
#include "ConvolutionalNodes.h" // for PoolKind
#include "ScriptableObjects.h"
#include "TensorShape.h"
#include <string>
@ -51,7 +52,15 @@ public:
ComputationNodePtr CreateSparseInputNode(const std::wstring& inputName, const size_t rows);
ComputationNodePtr CreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout);
ComputationNodePtr CreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout);
ComputationNodePtr CreateConvolutionNode(const std::wstring& nodeName, const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind, const bool zeroPadding = false, const size_t maxTempMemSizeInSamples = 0);
ComputationNodePtr CreateConvolutionNode(const std::wstring& nodeName, const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& strideShape,
const std::vector<bool>& sharing, const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples);
ComputationNodePtr CreateConvolutionNode(const std::wstring& nodeName, const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels,
const size_t horizontalSubsample, const size_t verticalSubsample,
ImageLayoutKind imageLayoutKind, const bool zeroPadding = false, const size_t maxTempMemSizeInSamples = 0);
ComputationNodePtr CreatePoolingNode(const std::wstring& nodeName, PoolKind poolKind, const TensorShape& kernelShape, const TensorShape& strideShape,
const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout);
ComputationNodePtr CreateMaxPoolingNode(const std::wstring& nodeName, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind);
ComputationNodePtr CreateAveragePoolingNode(const std::wstring& nodeName, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind);
// this is the catch-all for all cases not covered as special cases above
@ -60,7 +69,7 @@ public:
// The following functions create nodes and link them to the network and their inputs.
// TODO: Do we need both this set and the one above that does not add inputs? Can they share more code?
ComputationNodePtr BatchNormalization(const ComputationNodePtr input, const ComputationNodePtr scale, const ComputationNodePtr bias,
const ComputationNodePtr runMean, const ComputationNodePtr runInvStdDev, bool eval = false, bool spatial = false, double normalizationTimeConstant = 0, double epsilon = 1e-5, bool useCntkEngine = true,
const ComputationNodePtr runMean, const ComputationNodePtr runInvStdDev, bool spatial = false, double normalizationTimeConstant = 0, double blendTimeConstant = 0, double epsilon = 1e-5, bool useCntkEngine = true,
ImageLayoutKind imageLayoutKind = ImageLayoutKind::CHW, const std::wstring nodeName = L"");
ComputationNodePtr Convolution(const ComputationNodePtr weight,
const ComputationNodePtr inputValues,
@ -68,6 +77,17 @@ public:
const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind,
const bool zeroPadding = false, const size_t maxTempMemSizeInSamples = 0,
const std::wstring nodeName = L"");
ComputationNodePtr Convolution(const ComputationNodePtr weight,
const ComputationNodePtr inputValues,
const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& strideShape,
const std::vector<bool>& sharing, const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples,
const std::wstring nodeName = L"");
ComputationNodePtr Pooling(const ComputationNodePtr inputValues,
PoolKind poolKind, const TensorShape& kernelShape, const TensorShape& strideShape,
const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout,
const std::wstring nodeName = L"");
ComputationNodePtr MaxPooling(const ComputationNodePtr inputValues,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind,
const std::wstring nodeName = L"");

View file

@ -332,42 +332,4 @@ void ComputationNetwork::SetLearnableNodesBelowLearningRateMultiplier(const floa
}
}
void ComputationNetwork::SetBatchNormalizationNodesBelowEvalMode(const bool evalMode, const ComputationNodeBasePtr& rootNode /* = nullptr */)
{
vector<ComputationNodeBasePtr> nodes;
if (rootNode == nullptr)
{
for (auto pair : m_nameToNodeMap)
{
nodes.push_back(pair.second);
}
}
else
{
auto allnodes = rootNode->EnumerateNodes();
for (auto node : allnodes)
nodes.push_back(node);
}
for (auto& node : nodes)
{
if (node->OperationName() == OperationNameOf(BatchNormalizationNode))
{
auto pNode = dynamic_pointer_cast<BatchNormalizationNode<float>>(node);
if (!pNode)
{
auto pNode2 = dynamic_pointer_cast<BatchNormalizationNode<double>>(node);
if (!pNode2)
{
RuntimeError("Invalid node type: node name=%ls. We assume either BatchNormalizationNode<float> or BatchNormalizationNode<double>\n", node->NodeName().c_str());
}
}
else
{
pNode->SetEvalMode(evalMode);
}
}
}
}
}}}

View file

@ -114,9 +114,11 @@ ComputationNetwork::PARTraversalFlowControlNode::PARTraversalFlowControlNode(con
{
// instead of the node itself, include the sentinel SEQTraversalFlowControlNode in our list
m_nestedNodes.push_back(recInfo);
// and verify that we only encountered the loop once (all nodes should have been consecutive)
if (!loopsSeen.insert(recInfo).second)
LogicError("PARTraversalFlowControlNode: members of loop %ls are not consecutive in node list.", recInfo->NodeName().c_str());
// consume all nodes that are part of the same loop (they are all consecutive)
while (nodeIter != allNodes.end() && (*nodeIter)->IsPartOfLoop() && FindInRecurrentLoops(recurrentInfo, *nodeIter) == recInfo)
nodeIter++;
@ -303,8 +305,10 @@ ComputationNetwork::PARTraversalFlowControlNode::PARTraversalFlowControlNode(con
// look in all recurrent loops of the network
// TODO: Check for IsPartOfLoop(). Also why not store the loop id in the node for direct lookup?
for (auto& iter : recurrentInfo)
{
if (std::find(iter->m_nestedNodes.begin(), iter->m_nestedNodes.end(), node) != iter->m_nestedNodes.end()) // TODO: should this loop be a method of SEQTraversalFlowControlNode?
return iter;
}
return nullptr; // not part of a recurrent loop
}
@ -357,8 +361,10 @@ void ComputationNetwork::PrintComputationTree(const ComputationNodeBasePtr& root
if (nodes.size() == 0)
fprintf(stderr, "\n(empty)\n");
else
{
for (const auto& node : nodes)
node->PrintSelf(printMatrices);
}
}
// -----------------------------------------------------------------------
@ -399,7 +405,7 @@ void ComputationNetwork::CompileNetwork()
// all steps below have to be repeated for all root nodes (=nodes without parents and PreComputeNodes)
DetermineSetOfAllRoots();
fprintf(stderr, "\n%d roots:\n", (int) m_allRoots.size());
fprintf(stderr, "\n%d roots:\n", (int)m_allRoots.size());
for (const auto& root : m_allRoots)
fprintf(stderr, "\t%ls = %ls()\n", root->NodeName().c_str(), root->OperationName().c_str());
@ -469,7 +475,7 @@ void ComputationNetwork::DetermineSetOfAllRoots()
auto input = node->Input(i);
if (!input) // this may be the result of an incorrect MEL operation
{
InvalidArgument("DetermineSetOfAllRoots: Input %d of %ls %ls operation if not connected, network is malformed.",
InvalidArgument("DetermineSetOfAllRoots: Input %d of %ls %ls operation is not connected, network is malformed.",
(int) i, node->NodeName().c_str(), node->OperationName().c_str());
}
referencedNodes.insert(input);
@ -592,7 +598,7 @@ void ComputationNetwork::ValidateNetwork()
}
if (!nonDefaultNodes.empty())
{
fprintf(stderr, "%d out of %d nodes do not share the minibatch layout with the input data.\n", (int) nonDefaultNodes.size(), (int) nodes.size());
fprintf(stderr, "%d out of %d nodes do not share the minibatch layout with the input data.\n", (int)nonDefaultNodes.size(), (int)nodes.size());
// for (auto node : nonDefaultNodes)
// fprintf(stderr, " %ls\n", node->NodeName().c_str());
// fprintf(stderr, "\n\n");
@ -652,6 +658,7 @@ size_t ComputationNetwork::ValidateNodes(list<ComputationNodeBasePtr> nodes, boo
hasVisitedChild |= child->m_visited; // if not a single visited child then no point in validating
allChildrenVisited &= child->m_visited;
}
// if there is not at least one visited child
bool valid = false;
if (hasVisitedChild || isLeaf) // got at least one child: it makes sense to call Validate()
@ -850,7 +857,7 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
else
{
nodeIter->RequestMatricesBeforeForwardProp(m_matrixPool);
// we only release matrices for the children since the root node's informatioin will be used and should not be shared
// we only release matrices for the children since the root node's information will be used and should not be shared
// with others
ReleaseMatricesAfterEvalForChildren(nodeIter, parentCount);
}

View file

@ -13,7 +13,6 @@
#include "RecurrentNodes.h"
#include "NonlinearityNodes.h"
#include "LinearAlgebraNodes.h"
#include "ConvolutionalNodes.h"
#include "ReshapingNodes.h"
#include "ComputationNetwork.h"

View file

@ -402,6 +402,19 @@ void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, const Fram
}
let& sequences = pMBLayout->GetAllSequences();
let width = pMBLayout->GetNumTimeSteps();
TensorShape tensorShape = GetSampleLayout();
stringstream str;
let dims = tensorShape.GetDims();
for (auto dim : dims)
str << dim << ' ';
let shape = str.str(); // BUGBUG: change to string(tensorShape) to make sure we always use the same format
bool sequencePrologueHasShape = sequencePrologue.find("%x") != sequencePrologue.npos;
bool sampleSeparatorHasShape = sampleSeparator.find("%x") != sampleSeparator.npos;
bool sequencePrologueHasSeqId = sequencePrologue.find("%d") != sequencePrologue.npos;
bool sampleSeparatorHasSeqId = sampleSeparator.find("%d") != sampleSeparator.npos;
for (size_t s = 0; s < sequences.size(); s++)
{
const auto& seqInfo = sequences[s];
@ -429,9 +442,30 @@ void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, const Fram
let seqCols = t1 - t0;
let seqStride = pMBLayout->GetNumParallelSequences() * matStride;
auto seqProl = sequencePrologue;
auto sampleSep = sampleSeparator;
if (sequencePrologueHasShape || sampleSeparatorHasShape)
{
auto sh = msra::strfun::_strprintf<char>("%s%ld", shape.c_str(), (unsigned long long)seqInfo.GetNumTimeSteps());
if (sequencePrologueHasShape)
seqProl = msra::strfun::ReplaceAll<std::string>(seqProl, "%x", sh);
if (sampleSeparatorHasShape)
sampleSep = msra::strfun::ReplaceAll<std::string>(sampleSep, "%x", sh);
}
if (sequencePrologueHasSeqId || sampleSeparatorHasSeqId)
{
auto sh = msra::strfun::_strprintf<char>("%ld", (unsigned long long)seqInfo.seqId);
if (sequencePrologueHasSeqId)
seqProl = msra::strfun::ReplaceAll<std::string>(seqProl, "%d", sh);
if (sampleSeparatorHasSeqId)
sampleSep = msra::strfun::ReplaceAll<std::string>(sampleSep, "%d", sh);
}
if (s > 0)
fprintfOrDie(f, "%s", sequenceSeparator.c_str());
fprintfOrDie(f, "%s", sequencePrologue.c_str());
fprintfOrDie(f, "%s", seqProl.c_str());
// output it according to our format specification
auto formatChar = valueFormatString.back();
@ -530,14 +564,14 @@ void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, const Fram
else
{
for (size_t j = 0; j < jend; j++) // loop over output rows --BUGBUG: row index is 'i'!! Rename these!!
{
if (j > 0)
fprintfOrDie(f, "%s", sampleSeparator.c_str());
{
if (j > 0)
fprintfOrDie(f, "%s", sampleSep.c_str());
if (j == jstop && jstop < jend - 1) // if jstop == jend-1 we may as well just print the value instead of '...'
{
{
fprintfOrDie(f, "...+%d", (int)(jend - jstop)); // 'nuff said
break;
}
break;
}
// inject sample tensor index if we are printing row-wise and it's a tensor
if (!transpose && sampleLayout.size() > 1 && !isCategoryLabel) // each row is a different sample dimension
{
@ -547,15 +581,15 @@ void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, const Fram
}
// print a row of values
for (size_t i = 0; i < iend; i++) // loop over elements
{
if (i > 0)
fprintfOrDie(f, "%s", elementSeparator.c_str());
{
if (i > 0)
fprintfOrDie(f, "%s", elementSeparator.c_str());
if (i == istop && istop < iend - 1)
{
{
fprintfOrDie(f, "...+%d", (int)(iend - istop));
break;
}
double dval = seqData[i * istride + j * jstride];
break;
}
double dval = seqData[i * istride + j * jstride];
print(dval);
}
}
@ -566,7 +600,7 @@ void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, const Fram
}
/*static*/ string WriteFormattingOptions::Processed(const wstring& nodeName, string fragment, size_t minibatchId)
{
{
fragment = msra::strfun::ReplaceAll<string>(fragment, "\\n", "\n");
fragment = msra::strfun::ReplaceAll<string>(fragment, "\\r", "\r");
fragment = msra::strfun::ReplaceAll<string>(fragment, "\\t", "\t");
@ -577,7 +611,7 @@ void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, const Fram
fragment = msra::strfun::ReplaceAll<string>(fragment, "%n", msra::strfun::_strprintf<char>("%ld", minibatchId).c_str());
// %d: sequenceId
return fragment;
}
}
template <class ConfigRecordType>
WriteFormattingOptions::WriteFormattingOptions(const ConfigRecordType& config) :
@ -588,14 +622,14 @@ WriteFormattingOptions::WriteFormattingOptions(const ConfigRecordType& config) :
{
const ConfigRecordType& formatConfig(config(L"format", ConfigRecordType::Record()));
if (formatConfig.ExistsCurrent(L"type")) // do not inherit 'type' from outer block
{
{
wstring type = formatConfig(L"type");
if (type == L"real") ; // default
else if (type == L"category") isCategoryLabel = true;
else if (type == L"sparse") isSparse = true;
else InvalidArgument("write: type must be 'real', 'category', or 'sparse'");
labelMappingFile = (wstring)formatConfig(L"labelMappingFile", L"");
}
}
transpose = formatConfig(L"transpose", transpose);
prologue = formatConfig(L"prologue", prologue);
epilogue = formatConfig(L"epilogue", epilogue);
@ -606,8 +640,8 @@ WriteFormattingOptions::WriteFormattingOptions(const ConfigRecordType& config) :
sampleSeparator = msra::strfun::utf8(formatConfig(L"sampleSeparator", (wstring)msra::strfun::utf16(sampleSeparator)));
precisionFormat = msra::strfun::utf8(formatConfig(L"precisionFormat", (wstring)msra::strfun::utf16(precisionFormat)));
// TODO: change those strings into wstrings to avoid this conversion mess
}
}
}
}
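// A sketch only (the parameter names are taken from the config reads above; the surrounding
// block syntax is an assumption about the user's config, not a verified sample):
//     format = [ type = "category" ; transpose = false ]
// 'type' must be real, category, or sparse; values not given keep the defaults set by the caller.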
void WriteFormattingOptions::Save(File& fstream) const
{
@ -623,7 +657,7 @@ void WriteFormattingOptions::Save(File& fstream) const
fstream << elementSeparator;
fstream << sampleSeparator;
fstream << precisionFormat;
}
}
void WriteFormattingOptions::Load(File& fstream, size_t modelVersion)
{
@ -710,5 +744,6 @@ public:
ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<BoxedTensorShape> registerTensorShape(L"TensorShape");
ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<BoxedVector<int>> registerIntVector (L"IntVector");
ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<BoxedVector<size_t>> registerSizeVector (L"SizeVector");
ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<BoxedVector<bool>> registerBoolVector (L"BoolVector");
}}}

Просмотреть файл

@ -31,17 +31,15 @@
// version number to control how to read and write
#define CNTK_MODEL_VERSION_1 1
#define CNTK_MODEL_VERSION_2 2
#define CNTK_MODEL_VERSION_3 3 // (Row)Slice: axis; LearnableParameter: tensor shape; Times: outputRank; TransposeDimensions: axes
#define CNTK_MODEL_VERSION_4 4 // PastValue: tensor shape
#define CNTK_MODEL_VERSION_5 5 // ElemType tag in model file
#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_5
#define CNTK_MODEL_VERSION_3 3
#define CNTK_MODEL_VERSION_4 4 // PastValue
#define CNTK_MODEL_VERSION_5 5 // ND convolution and pooling
#define CNTK_MODEL_VERSION_6 6 // Batch norm blending
#define CNTK_MODEL_VERSION_7 7 // ElemType tag in model file
#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_7
extern bool g_shareNodeValueMatrices;
#ifndef UNREFERENCED_PARAMETER // TODO: unify with UNUSED()
#define UNREFERENCED_PARAMETER(P) (P)
#endif
// helper mode for debugging
// If TRACK_GAP_NANS is defined then initialize layout gaps to NaN and do NaN checks. Also do detailed logging of node computations.
// #define TRACK_GAP_NANS
@ -902,7 +900,7 @@ public:
if (m_value)
{
node->CreateValueMatrixIfNull();
node->m_value->SetValue(*m_value);
node->m_value->SetValue(*m_value);
}
else
node->m_value = nullptr;
@ -1112,6 +1110,9 @@ public:
const Matrix<ElemType>& Gradient() const { return *m_gradient; }
Matrix<ElemType>& Gradient() { return *m_gradient; }
MatrixBasePtr GradientPtr() const { return m_gradient; }
// TODO: This is only used for testing whether a gradient has been allocated. Maybe reduce to bool HasGradient()?
private:
template<class E>
@ -1268,8 +1269,8 @@ protected:
DetermineDataSize(rows, cols);
try
{
m.VerifySize(rows, cols);
}
m.VerifySize(rows, cols);
}
catch (const std::exception& e)
{
Rethrow(e);
@ -1499,8 +1500,8 @@ public:
"%13.10f"/*valueFormatString*/);
if (m_traceNodeValueSparse)
WriteMinibatchWithFormatting(stderr, FrameRange(), SIZE_MAX, SIZE_MAX, false/*transpose*/, /*isCategoryLabel=*/false, /*isSparse=*/true, std::vector<std::string>(),
""/*sequenceSeparator*/, " "/*sequencePrologue*/, "\n"/*sequenceEpilogue*/, " "/*elementSeparator*/, "\n "/*sampleSeparator*/,
"%13.10f"/*valueFormatString*/);
""/*sequenceSeparator*/, " "/*sequencePrologue*/, "\n"/*sequenceEpilogue*/, " "/*elementSeparator*/, "\n "/*sampleSeparator*/,
"%13.10f"/*valueFormatString*/);
}
}

Просмотреть файл

@ -7,31 +7,19 @@
#include "Basics.h"
#include "Matrix.h"
#include "ComputationNode.h"
#include "InputAndParamNodes.h"
#include "ConvolutionEngine.h"
#include <unordered_set>
#include <map>
#include <string>
#include <vector>
#include <stdexcept>
#include <list>
#include <memory>
#include <algorithm>
#include <assert.h>
#include <atomic>
#include <sstream>
#include <iostream>
namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
// ConvolutionNode (convolutionWeights, inputFeature)
// ConvolutionNodeBase
// -----------------------------------------------------------------------
// Convolutions (incl. pooling) support two different storage formats:
// ConvolutionNodeBase is a base class for ND-convolution (ConvolutionNode) and ND-pooling (PoolingNode).
//
// 2D convolutions (incl. pooling) support two different storage formats:
//
// * legacy ("HWC") mode (CPU and GPU without cudnn): Channels are tuples of scalars
// * legacy ("HWC") mode: Channels are tuples of scalars
//
// This follows "high performance convolutional neural networks for document processing" by Kumar Chellapilla, Sidde Puri, and Patrice Simard.
// Each sample is stored as a column-major matrix (height, width) of float[numChannels] (r00, g00, b00, r10, g10, b10, r01, g01, b01, r11, g11, b11).
@ -40,7 +28,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// - output : [C' x W' x H' x T] or ARRAY[1..T] OF ARRAY[1..H'] OF ARRAY[1..W'] OF ARRAY[1..C']
// - filter : [C' x W" x H" x C ] or ARRAY[1..C] OF ARRAY[1..H"] OF ARRAY[1..W"] OF ARRAY[1..C']
//
// * cudnn ("CHW") mode (GPU only): Channels are planes
// * cudnn ("CHW") mode (works on both GPU and CPU): Channels are planes
//
// - input : [W x H x C x T] or ARRAY[1..T] OF ARRAY[1..C] OF ARRAY[1..H] OF ARRAY[1..W]
// - output : [W' x H' x C' x T] or ARRAY[1..T] OF ARRAY[1..C'] OF ARRAY[1..H'] OF ARRAY[1..W']
@ -54,71 +42,269 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// - 3 for color images, 1 for B&W images
// - for hidden layer: dimension of activation vector for each pixel
// - C' = output channels = dimension of activation vector for each pixel (also called N by NVidia, inconsistently)
//
// For ND-convolution/pooling, only the second ('cudnn') format is supported.
//
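// For intuition only (w, h, c and the size names below are illustrative, not members of this class):
// the linear offset of element (w, h, c) within one sample differs between the two layouts roughly as
//     // HWC ("legacy"): channels vary fastest
//     offsetHWC = c + numChannels * (w + imgWidth * h);
//     // CHW ("cudnn"): width varies fastest, each channel is a contiguous plane
//     offsetCHW = w + imgWidth * (h + imgHeight * c);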
template <class ElemType>
class ConvolutionNode : public ComputationNode<ElemType>, public NumInputs<2>
class ConvolutionNodeBase : public ComputationNode<ElemType>
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName() { return L"Convolution"; }
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembers;
public:
ConvolutionNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name),
m_kernelWidth(SIZE_MAX),
m_kernelHeight(SIZE_MAX),
// initialize to dummy values so we catch missing initialization
m_horizontalSubsample(SIZE_MAX),
m_verticalSubsample(SIZE_MAX),
m_zeroPadding(false),
m_maxTempMemSizeInSamples(SIZE_MAX),
m_imageLayoutKind(ImageLayoutKind::HWC)
ConvolutionNodeBase(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name), m_poolKind(PoolKind::None), m_maxTempMemSizeInSamples(0)
{
SetDims(ImageDimensions::AsTensorShape(1, 1, 0, m_imageLayoutKind), 0);
}
ConvolutionNode(DEVICEID_TYPE deviceId, const wstring& name, const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind,
const bool zeroPadding = false, const size_t maxTempMemSizeInSamples = 0)
: Base(deviceId, name),
m_outputChannels(outputChannels),
m_kernelWidth(kernelWidth),
m_kernelHeight(kernelHeight),
m_horizontalSubsample(horizontalSubsample),
m_verticalSubsample(verticalSubsample),
m_zeroPadding(zeroPadding),
m_maxTempMemSizeInSamples(maxTempMemSizeInSamples),
m_imageLayoutKind(imageLayoutKind)
ConvolutionNodeBase(DEVICEID_TYPE deviceId, const wstring& name, const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& strideShape,
const std::vector<bool>& sharing, const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
PoolKind poolKind, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples)
: Base(deviceId, name), m_kernelShape(kernelShape), m_mapCount(mapCount), m_stride(strideShape), m_sharing(sharing),
m_autoPad(autoPadding), m_lowerPad(lowerPad), m_upperPad(upperPad), m_poolKind(poolKind),
m_imageLayout(imageLayout), m_maxTempMemSizeInSamples(maxTempMemSizeInSamples)
{
SetDims(ImageDimensions::AsTensorShape(1, 1, m_outputChannels, m_imageLayoutKind), 0); // TODO: necessary?
m_factory = ConvolutionEngineFactory<ElemType>::Create(deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
}
ConvolutionNode(const ScriptableObjects::IConfigRecordPtr configp)
: ConvolutionNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"kernelWidth"), configp->Get(L"kernelHeight"), configp->Get(L"outputChannels"),
configp->Get(L"horizontalSubsample"), configp->Get(L"verticalSubsample"), ImageLayoutKindFrom(configp->Get(L"imageLayout")),
configp->Get(L"zeroPadding"), configp->Get(L"maxTempMemSizeInSamples"))
{
// weightNodeName, inputValueNodeName, kernelWidth, kernelHeight, outputChannels, horizontalSubsample, verticalSubsample, zeroPadding = false, maxTempMemSizeInSamples = 0
AttachInputsFromConfig(configp, this->GetExpectedNumInputs());
}
public:
void Save(File& fstream) const override
{
Base::Save(fstream);
fstream << m_kernelWidth << m_kernelHeight << m_horizontalSubsample << m_verticalSubsample;
uint32_t imageLayoutKind = (uint32_t) m_imageLayoutKind;
uint32_t outputChannels = (uint32_t) m_outputChannels;
fstream << outputChannels << imageLayoutKind;
fstream << m_zeroPadding << m_maxTempMemSizeInSamples;
m_kernelShape.Save(fstream);
m_mapCount.Save(fstream);
m_stride.Save(fstream);
fstream << m_sharing;
fstream << m_autoPad;
m_lowerPad.Save(fstream);
m_upperPad.Save(fstream);
fstream << (int32_t)m_poolKind;
fstream << (int32_t)m_imageLayout;
fstream << m_maxTempMemSizeInSamples;
}
void Load(File& fstream, size_t modelVersion) override
{
Base::Load(fstream, modelVersion);
fstream >> m_kernelWidth >> m_kernelHeight >> m_horizontalSubsample >> m_verticalSubsample;
uint32_t imageLayoutKind, outputChannels;
fstream >> outputChannels >> imageLayoutKind;
m_imageLayoutKind = (ImageLayoutKind) imageLayoutKind;
m_outputChannels = outputChannels;
SetDims(ImageDimensions::AsTensorShape(1, 1, m_outputChannels, m_imageLayoutKind), HasMBLayout()); // TODO: needed?
fstream >> m_zeroPadding >> m_maxTempMemSizeInSamples;
m_factory = ConvolutionEngineFactory<ElemType>::Create(GetDeviceId(), ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
// Let ConvolutionNode handle older models.
if (modelVersion >= CNTK_MODEL_VERSION_5)
{
m_kernelShape.Load(fstream);
m_mapCount.Load(fstream);
m_stride.Load(fstream);
fstream >> m_sharing;
fstream >> m_autoPad;
m_lowerPad.Load(fstream);
m_upperPad.Load(fstream);
int32_t k;
fstream >> k;
m_poolKind = (PoolKind)k;
int32_t layout;
fstream >> layout;
m_imageLayout = (ImageLayoutKind)layout;
fstream >> m_maxTempMemSizeInSamples;
}
}
void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
{
Base::CopyTo(nodeP, newName, flags);
if (flags & CopyNodeFlags::copyNodeValue)
{
auto node = dynamic_pointer_cast<ConvolutionNodeBase<ElemType>>(nodeP);
node->m_kernelShape = m_kernelShape;
node->m_mapCount = m_mapCount;
node->m_stride = m_stride;
node->m_sharing = m_sharing;
node->m_autoPad = m_autoPad;
node->m_lowerPad = m_lowerPad;
node->m_upperPad = m_upperPad;
node->m_poolKind = m_poolKind;
node->m_imageLayout = m_imageLayout;
node->m_maxTempMemSizeInSamples = m_maxTempMemSizeInSamples;
}
}
void BackpropTo(const size_t inputIndex, const FrameRange& fr) override
{
auto sliceOutputGrad = GradientFor(fr);
if (m_poolKind == PoolKind::None)
{
if (inputIndex == 0) // derivative with respect to the weight matrix
{
auto& grad = Input(0)->GradientAsMatrix();
auto sliceInput1Value = Input(1)->ValueFor(fr);
m_convEng->BackwardKernel(sliceOutputGrad, sliceInput1Value, grad, fr.IsAllFrames(), *m_tempMatrix);
}
else if (inputIndex == 1) // derivative with respect to the input feature
{
auto& input0 = Input(0)->ValueAsMatrix();
auto sliceInput1Grad = Input(1)->GradientFor(fr);
m_convEng->BackwardData(sliceOutputGrad, input0, sliceInput1Grad, *m_tempMatrix);
}
}
else
{
Matrix<ElemType> sliceInput0Grad = Input(0)->GradientFor(fr);
Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
m_convEng->BackwardPooling(sliceOutputValue, sliceOutputGrad, sliceInput0Value, sliceInput0Grad);
}
}
bool OutputUsedInComputingInputNodesGradients() const override
{
// The ConvolutionNode requires output values only for max pooling.
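// (Max pooling is the only case here where the backward pass needs the forward output:
// BackwardPooling uses the stored output, i.e. the pooled maxima, to route the incoming
// gradient back to the winning input positions.)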
return m_poolKind == PoolKind::Max;
}
void ForwardProp(const FrameRange& fr) override
{
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
if (m_poolKind == PoolKind::None)
{
const Matrix<ElemType>& input0 = Input(0)->ValueAsMatrix();
Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
m_convEng->Forward(sliceInput1Value, input0, sliceOutputValue, *m_tempMatrix);
}
else
{
const Matrix<ElemType>& input0 = Input(0)->ValueFor(fr);
m_convEng->ForwardPooling(input0, sliceOutputValue);
}
}
void DumpNodeInfo(const bool printValues, const bool printMetadata, File& fstream) const override
{
Base::DumpNodeInfo(printValues, printMetadata, fstream);
if (m_convEng != nullptr)
fstream << "Geometry: " << string(*m_convEng->Geometry()) << "\n";
fstream << "PoolKind: " << (int)m_poolKind << "\n";
}
protected:
TensorShape m_kernelShape;
TensorShape m_mapCount;
TensorShape m_stride;
std::vector<bool> m_sharing;
std::vector<bool> m_autoPad;
TensorShape m_lowerPad;
TensorShape m_upperPad;
PoolKind m_poolKind;
ImageLayoutKind m_imageLayout;
size_t m_maxTempMemSizeInSamples;
shared_ptr<Matrix<ElemType>> m_tempMatrix;
std::unique_ptr<ConvolutionEngine<ElemType>> m_convEng;
};
#define UsingConvolutionNodeBaseMembers \
UsingComputationNodeMembersBoilerplate; \
protected: \
using Base::m_kernelShape; \
using Base::m_mapCount; \
using Base::m_stride; \
using Base::m_sharing; \
using Base::m_autoPad; \
using Base::m_lowerPad; \
using Base::m_upperPad; \
using Base::m_poolKind; \
using Base::m_imageLayout; \
using Base::m_maxTempMemSizeInSamples; \
using Base::m_tempMatrix; \
using Base::m_convEng; \
public:
// -----------------------------------------------------------------------
// ConvolutionNode (convolutionWeights, inputFeature)
// -----------------------------------------------------------------------
template <class ElemType>
class ConvolutionNode : public ConvolutionNodeBase<ElemType>, public NumInputs<2>
{
typedef ConvolutionNodeBase<ElemType> Base;
UsingConvolutionNodeBaseMembers;
static const std::wstring TypeName()
{
return L"Convolution";
}
public:
ConvolutionNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name)
{
}
ConvolutionNode(DEVICEID_TYPE deviceId, const wstring& name, const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& strideShape,
const std::vector<bool>& sharing, const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples)
: Base(deviceId, name, kernelShape, mapCount, strideShape, sharing, autoPadding, lowerPad, upperPad, PoolKind::None, imageLayout, maxTempMemSizeInSamples),
m_convolution2D(false)
{
}
ConvolutionNode(DEVICEID_TYPE deviceId, const wstring& name, const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels,
const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayout,
bool zeroPadding, size_t maxTempMemSizeInSamples)
: ConvolutionNode(deviceId, name, TensorShape(kernelWidth, kernelHeight, 1), TensorShape(1, 1, outputChannels),
TensorShape(horizontalSubsample, verticalSubsample, 1), vector<bool>{true},
vector<bool>{zeroPadding}, TensorShape(0), TensorShape(0),
imageLayout, maxTempMemSizeInSamples)
{
m_convolution2D = true;
}
ConvolutionNode(const ScriptableObjects::IConfigRecordPtr configp)
: ConvolutionNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"kernelShape"), configp->Get(L"mapCount"), configp->Get(L"strideShape"),
configp->Get(L"dimSharing"), configp->Get(L"dimPadding"), configp->Get(L"dimPadLower"), configp->Get(L"dimPadUpper"),
ImageLayoutKindFrom(configp->Get(L"imageLayout")), configp->Get(L"maxTempMemSizeInSamples"))
{
AttachInputs(configp, GetExpectedNumInputs());
}
public:
void Save(File& fstream) const override
{
Base::Save(fstream);
fstream << m_convolution2D;
}
void Load(File& fstream, size_t modelVersion) override
{
Base::Load(fstream, modelVersion);
// Back compat: load pre-ND convolution models.
if (modelVersion < CNTK_MODEL_VERSION_5)
{
size_t kW, kH, sW, sH;
fstream >> kW;
fstream >> kH;
fstream >> sW;
fstream >> sH;
uint32_t imageLayout, mapCount;
fstream >> mapCount;
fstream >> imageLayout;
m_imageLayout = (ImageLayoutKind)imageLayout;
bool pad;
fstream >> pad;
fstream >> m_maxTempMemSizeInSamples;
m_poolKind = PoolKind::None;
m_convolution2D = true;
m_kernelShape = TensorShape(kW, kH, 1);
m_mapCount = TensorShape(mapCount);
m_stride = TensorShape(sW, sH, 1);
m_sharing = vector<bool>{true};
m_autoPad = vector<bool>{pad};
m_lowerPad = TensorShape(0);
m_upperPad = TensorShape(0);
}
else
{
fstream >> m_convolution2D;
}
}
void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
@ -127,144 +313,92 @@ public:
if (flags & CopyNodeFlags::copyNodeValue)
{
auto node = dynamic_pointer_cast<ConvolutionNode<ElemType>>(nodeP);
node->m_kernelWidth = m_kernelWidth;
node->m_kernelHeight = m_kernelHeight;
node->m_horizontalSubsample = m_horizontalSubsample;
node->m_verticalSubsample = m_verticalSubsample;
node->m_zeroPadding = m_zeroPadding;
node->m_maxTempMemSizeInSamples = m_maxTempMemSizeInSamples;
node->m_imageLayoutKind = m_imageLayoutKind;
node->m_tempMatrix->SetValue(*m_tempMatrix);
node->m_convolution2D = m_convolution2D;
}
}
void BackpropTo(const size_t inputIndex, const FrameRange& fr) override
{
auto sliceOutputGrad = GradientFor(fr);
auto sliceInput1Value = Input(1)->ValueFor(fr);
size_t batchSize = sliceInput1Value.GetNumCols();
m_inT->setN(batchSize);
m_outT->setN(batchSize);
assert(m_convEng != nullptr);
if (inputIndex == 0) // derivative with respect to the weight matrix
{
auto& grad = Input(0)->GradientAsMatrix();
m_convEng->BackwardFilter(*m_outT, sliceOutputGrad, *m_inT, sliceInput1Value, *m_convDesc, *m_filterT, grad, fr.IsAllFrames(), *m_tempMatrix);
}
else if (inputIndex == 1) // derivative with respect to the input feature
{
auto& input0 = Input(0)->ValueAsMatrix();
auto sliceInput1Grad = Input(1)->GradientFor(fr);
m_convEng->BackwardData(*m_outT, sliceOutputGrad, *m_filterT, input0, *m_convDesc, *m_inT, sliceInput1Grad, *m_tempMatrix);
}
}
virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The ConvolutionNode does not require its output value for computing
// the gradients of its input nodes
return false;
}
void ForwardProp(const FrameRange& fr) override
{
const Matrix<ElemType>& input0 = Input(0)->ValueAsMatrix();
Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
// update the tensor dimension w.r.t. number of samples
size_t batchSize = sliceInput1Value.GetNumCols();
m_inT->setN(batchSize);
m_outT->setN(batchSize);
assert(m_convEng != nullptr);
#if NANCHECK
input0.HasNan("Convolution-input0");
sliceInput1Value.HasNan("Convolution-input1");
#endif
m_convEng->Forward(*m_inT, sliceInput1Value, *m_filterT, input0, *m_convDesc, *m_outT, sliceOutputValue, *m_tempMatrix);
#if NANCHECK
sliceOutputValue.HasNan("Convolution");
#endif
}
void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
void Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase(isFinalValidationPass);
// get input and output tensor shape and interpret as image dimensions
auto inDims = ImageDimensions(GetInputSampleLayout(1), m_imageLayoutKind);
size_t inputIdx = GetExpectedNumInputs() - 1;
TensorShape inputShape;
if (m_convolution2D)
{
// Need to update some tensors with correct input dims.
auto inDims = ImageDimensions(GetInputSampleLayout(inputIdx), m_imageLayout);
// inputShape is used in ConvolveGeometry which supports only CHW layout.
inputShape = inDims.AsTensorShape(ImageLayoutKind::CHW);
size_t kW = m_kernelShape[0];
size_t kH = m_kernelShape[1];
size_t sW = m_stride[0];
size_t sH = m_stride[1];
m_kernelShape = TensorShape(kW, kH, inDims.m_numChannels);
m_stride = TensorShape(sW, sH, inDims.m_numChannels);
if (isFinalValidationPass && (inDims.m_width < m_kernelWidth || inDims.m_height < m_kernelHeight))
InvalidArgument("%ls %ls operation requires that input width be >= kernelWidth and input height >= kernelHeight.", NodeName().c_str(), OperationName().c_str());
// determine output tensor shape
const int kernelWidthCenter = m_zeroPadding ? m_kernelWidth % 2 : m_kernelWidth;
const int kernelHeightCenter = m_zeroPadding ? m_kernelHeight % 2 : m_kernelHeight;
auto outDims = ImageDimensions(
(inDims.m_width - kernelWidthCenter) / m_horizontalSubsample + 1,
(inDims.m_height - kernelHeightCenter) / m_verticalSubsample + 1,
m_outputChannels);
size_t weightCols = m_kernelWidth * m_kernelHeight * inDims.m_numChannels;
size_t mapCount = m_mapCount.GetNumElements();
size_t weightCols = kW * kH * inDims.m_numChannels;
// check/infer input [0] (weights)
// BUGBUG: For now, we treat the weights as a 2D matrix. They should be a tensor proper.
Input(0)->ValidateInferInputDimsFrom(TensorShape(m_outputChannels, weightCols));
Input(0)->ValidateInferInputDimsFrom(TensorShape(mapCount, weightCols));
if (isFinalValidationPass && (Input(0)->GetAsMatrixNumCols() != weightCols || Input(0)->GetAsMatrixNumRows() != m_outputChannels))
LogicError("convolutionWeight matrix %ls should have dimension [%d, %d] which is [outputChannels, kernelWidth * kernelHeight * inputChannels]", Input(0)->NodeName().c_str(), (int) m_outputChannels, (int) weightCols);
if (isFinalValidationPass && (Input(0)->GetAsMatrixNumCols() != weightCols || Input(0)->GetAsMatrixNumRows() != mapCount))
{
LogicError("Convolution weight matrix %ls should have dimension [%d, %d] which is [outputChannels, kernelWidth * kernelHeight * inputChannels]",
Input(0)->NodeName().c_str(), (int)mapCount, (int)weightCols);
}
// that's our dimension
SetDims(outDims.AsTensorShape(m_imageLayoutKind), true);
auto outDims = ConvolveGeometry::ComputeOutputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
// ConvolveGeometry always uses CHW.
SetDims(ImageDimensions(outDims, ImageLayoutKind::CHW).AsTensorShape(m_imageLayout), HasMBLayout());
}
else
{
if (m_imageLayout != ImageLayoutKind::CHW)
{
InvalidArgument(
"%ls %ls supports only cuDNN (CHW) data layout. "
"Please specify imageLayout=\"cudnn\" in %ls node in your script "
"and make sure input data layout is CHW", NodeName().c_str(), OperationName().c_str(), NodeName().c_str());
}
inputShape = GetInputSampleLayout(inputIdx);
auto outDims = ConvolveGeometry::ComputeOutputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
SetDims(outDims, HasMBLayout());
}
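// For intuition only (numbers are illustrative): with a 28 x 28 x 1 input, a 5 x 5 x 1 kernel,
// stride 1 and no padding, ComputeOutputShape yields (28 - 5) / 1 + 1 = 24 along each spatial
// axis, i.e. a 24 x 24 x mapCount output.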
if (isFinalValidationPass)
{
// set up the various engines and descriptor objects
// REVIEW alexeyk: is there a better place to create engines?
assert(m_factory);
// if (m_factory == nullptr)
// m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
// TODO: This seems to expose too much internal knowlegde of the engine to the ConvolutionNode().
// Why not just pass everything to the engine creator, and get one object that holds everything.
if (m_convEng == nullptr)
m_convEng = m_factory->CreateConvEngine(m_deviceId, m_imageLayoutKind, m_maxTempMemSizeInSamples, BatchNormImpl::Cntk);
if (m_inT == nullptr)
m_inT = m_factory->CreateTensor(inDims.m_width, inDims.m_height, inDims.m_numChannels, 1);
if (m_filterT == nullptr)
m_filterT = m_factory->CreateFilter(m_kernelWidth, m_kernelHeight, inDims.m_numChannels, m_outputChannels);
if (m_outT == nullptr)
m_outT = m_factory->CreateTensor(outDims.m_width, outDims.m_height, outDims.m_numChannels, 1);
if (m_convDesc == nullptr)
m_convDesc = m_factory->CreateConvDescriptor(*m_inT, *m_filterT, m_horizontalSubsample, m_verticalSubsample, m_zeroPadding);
// REVIEW alexeyk: create per-channel bias (shared across all pixels). Consider adding other types of biases.
if (m_biasT == nullptr)
m_biasT = m_factory->CreateTensor(1, 1, outDims.m_numChannels, 1);
{
auto geometry = std::make_shared<ConvolveGeometry>(inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
m_convEng = ConvolutionEngine<ElemType>::Create(geometry, m_deviceId, m_imageLayout,
m_maxTempMemSizeInSamples, m_poolKind);
}
if (Input(0)->GetAsMatrixNumCols() != m_kernelShape.GetNumElements() ||
Input(0)->GetAsMatrixNumRows() != m_convEng->Geometry()->KernelCount())
{
LogicError("Convolution weight matrix %ls should have dimension [%d, %d] which is [kernelCount, kernelWidth * kernelHeight * inputChannels]",
Input(0)->NodeName().c_str(), (int)m_convEng->Geometry()->KernelCount(), (int)m_kernelShape.GetNumElements());
}
}
}
void DumpNodeInfo(const bool printValues, const bool printMetadata, File& fstream) const override
void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override
{
Base::DumpNodeInfo(printValues, printMetadata, fstream);
Base::RequestMatricesBeforeForwardProp(matrixPool);
RequestMatrixFromPool(m_tempMatrix, matrixPool);
}
auto inDims = ImageDimensions(GetInputSampleLayout(1), m_imageLayoutKind);
auto outDims = ImageDimensions(m_sampleLayout, m_imageLayoutKind);
char str[4096];
sprintf(str, "Input[Width:%lu, Height:%lu, Channels:%lu] \n", inDims.m_width, inDims.m_height, inDims.m_numChannels);
fstream << string(str);
sprintf(str, "Kernel[Width:%lu, Height:%lu] SubSample[Horizontal:%lu, Vertical:%lu]\n", m_kernelWidth, m_kernelHeight, m_horizontalSubsample, m_verticalSubsample);
fstream << string(str);
sprintf(str, "Output[Width:%lu, Height:%lu, Channels:%lu] \n", outDims.m_width, outDims.m_height, outDims.m_numChannels);
fstream << string(str);
sprintf(str, "zeroPadding=%ls maxTempMemSizeInSamples=%lu\n", m_zeroPadding ? L"true" : L"false", m_maxTempMemSizeInSamples);
fstream << string(str);
void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) override
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
ReleaseMatrixToPool(m_tempMatrix, matrixPool);
}
void SetmMaxTempMemSizeInSamples(const size_t maxTempMemSizeInSamples)
@ -272,47 +406,78 @@ public:
m_maxTempMemSizeInSamples = maxTempMemSizeInSamples;
}
// request matrices needed to do node function value evaluation
void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override
{
Base::RequestMatricesBeforeForwardProp(matrixPool);
RequestMatrixFromPool(m_tempMatrix, matrixPool);
}
// release gradient and temp matrices that no longer needed after all the children's gradients are computed.
void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) override
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
ReleaseMatrixToPool(m_tempMatrix, matrixPool);
}
private:
size_t m_outputChannels;
size_t m_kernelWidth, m_kernelHeight;
size_t m_horizontalSubsample, m_verticalSubsample;
bool m_zeroPadding;
bool m_1DConvolutionOnGPUSparse;
shared_ptr<Matrix<ElemType>> m_tempMatrix;
size_t m_maxTempMemSizeInSamples; // can change during runtime
ImageLayoutKind m_imageLayoutKind; // how to interpret the tensor (which dimensions are X/Y and C)
std::unique_ptr<ConvolutionEngineFactory<ElemType>> m_factory;
std::unique_ptr<ConvolutionEngine<ElemType>> m_convEng;
std::unique_ptr<ConvolutionTensor4D> m_inT;
std::unique_ptr<ConvolutionFilter> m_filterT;
std::unique_ptr<ConvolutionTensor4D> m_outT;
std::unique_ptr<ConvolutionDescriptor> m_convDesc;
std::unique_ptr<ConvolutionTensor4D> m_biasT;
protected:
bool m_convolution2D;
};
template class ConvolutionNode<float>;
template class ConvolutionNode<double>;
// -----------------------------------------------------------------------
// PoolingNode (inputFeature)
// -----------------------------------------------------------------------
template <class ElemType>
class PoolingNode : public ConvolutionNodeBase<ElemType>, public NumInputs<1>
{
typedef ConvolutionNodeBase<ElemType> Base;
UsingConvolutionNodeBaseMembers;
static const std::wstring TypeName()
{
return L"Pooling";
}
public:
PoolingNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name)
{
}
PoolingNode(DEVICEID_TYPE deviceId, const wstring& name, PoolKind pool, const TensorShape& kernelShape, const TensorShape& strideShape,
const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout)
: Base(deviceId, name, kernelShape, TensorShape(1), strideShape, vector<bool>{true}, autoPadding, lowerPad, upperPad, pool, imageLayout, 0)
{
}
PoolingNode(const ScriptableObjects::IConfigRecordPtr configp)
: PoolingNode(configp->Get(L"deviceId"), L"<placeholder>", PoolKindFrom(configp->Get(L"pool")), configp->Get(L"kernelShape"),
configp->Get(L"strideShape"),
configp->Get(L"dimPadding"), configp->Get(L"dimPadLower"), configp->Get(L"dimPadUpper"),
ImageLayoutKindFrom(configp->Get(L"imageLayout")))
{
AttachInputs(configp, GetExpectedNumInputs());
}
public:
void Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase();
if (m_imageLayout != ImageLayoutKind::CHW)
{
InvalidArgument(
"%ls %ls supports only cuDNN (CHW) data layout. "
"Please specify imageLayout=\"cudnn\" in %ls node in your script "
"and make sure input data layout is CHW", NodeName().c_str(), OperationName().c_str(), NodeName().c_str());
}
auto inputShape = GetInputSampleLayout(0);
auto outDims = ConvolveGeometry::ComputeOutputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
SetDims(outDims, HasMBLayout());
if (isFinalValidationPass)
{
if (m_convEng == nullptr)
{
auto geometry = std::make_shared<ConvolveGeometry>(inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
m_convEng = ConvolutionEngine<ElemType>::Create(geometry, m_deviceId, m_imageLayout,
m_maxTempMemSizeInSamples, m_poolKind);
}
}
}
};
// -----------------------------------------------------------------------
// PoolingNodeBase (input)
// Legacy PoolingNodeBase (input)
// -----------------------------------------------------------------------
template <class ElemType>
@ -339,7 +504,6 @@ public:
m_verticalSubsample(verticalSubsample),
m_imageLayoutKind(imageLayoutKind)
{
m_factory = ConvolutionEngineFactory<ElemType>::Create(deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
}
PoolingNodeBase(const ScriptableObjects::IConfigRecordPtr configp)
: PoolingNodeBase(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"windowWidth"), configp->Get(L"windowHeight"), configp->Get(L"horizontalSubsample"), configp->Get(L"verticalSubsample"), ImageLayoutKindFrom(configp->Get(L"imageLayout")))
@ -362,8 +526,7 @@ public:
uint32_t imageLayoutKind, windowWidth;
fstream >> windowWidth >> imageLayoutKind >> m_windowHeight >> m_horizontalSubsample >> m_verticalSubsample;
m_windowWidth = windowWidth;
m_imageLayoutKind = (ImageLayoutKind) imageLayoutKind;
m_factory = ConvolutionEngineFactory<ElemType>::Create(GetDeviceId(), ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
m_imageLayoutKind = (ImageLayoutKind)imageLayoutKind;
}
void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
@ -394,12 +557,7 @@ public:
Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
size_t batchSize = sliceInput0Value.GetNumCols();
m_inT->setN(batchSize);
m_outT->setN(batchSize);
assert(m_poolEng != nullptr);
assert(m_poolDesc != nullptr);
m_poolEng->Backward(*m_outT, sliceOutputValue, sliceOutputGrad, *m_poolDesc, *m_inT, sliceInput0Value, sliceInput0Grad);
m_convEng->BackwardPooling(sliceOutputValue, sliceOutputGrad, sliceInput0Value, sliceInput0Grad);
}
void ForwardProp(const FrameRange& fr) override
@ -407,12 +565,7 @@ public:
Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
size_t batchSize = sliceInput0Value.GetNumCols();
m_inT->setN(batchSize);
m_outT->setN(batchSize);
assert(m_poolEng != nullptr);
assert(m_poolDesc != nullptr);
m_poolEng->Forward(*m_inT, sliceInput0Value, *m_poolDesc, *m_outT, sliceOutputValue);
m_convEng->ForwardPooling(sliceInput0Value, sliceOutputValue);
}
void Validate(bool isFinalValidationPass) override
@ -439,16 +592,14 @@ public:
if (isFinalValidationPass)
{
// set up various engines and descriptor objects
// REVIEW alexeyk: is there a better place to create engines?
assert(m_factory);
// if (m_factory == nullptr)
// m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
if (m_poolEng == nullptr)
m_poolEng = m_factory->CreatePoolEngine(m_deviceId, m_imageLayoutKind);
if (m_inT == nullptr)
m_inT = m_factory->CreateTensor(inDims.m_width, inDims.m_height, inDims.m_numChannels, 1);
if (m_outT == nullptr)
m_outT = m_factory->CreateTensor(outDims.m_width, outDims.m_height, outDims.m_numChannels, 1);
m_geometry = std::make_shared<ConvolveGeometry>(inDims.AsTensorShape(m_imageLayoutKind),
ImageDimensions(m_windowWidth, m_windowHeight, 1).AsTensorShape(m_imageLayoutKind),
TensorShape(1),
ImageDimensions(m_horizontalSubsample, m_verticalSubsample, 1).AsTensorShape(m_imageLayoutKind),
ConvolveGeometry::BoolVec{true},
ConvolveGeometry::BoolVec{false},
TensorShape(0),
TensorShape(0));
}
}
@ -479,12 +630,8 @@ protected:
ImageLayoutKind m_imageLayoutKind; // how to interpret the tensor (which dimensions are X/Y and C)
std::unique_ptr<ConvolutionEngineFactory<ElemType>> m_factory;
std::unique_ptr<PoolingEngine<ElemType>> m_poolEng;
std::unique_ptr<ConvolutionTensor4D> m_inT;
std::unique_ptr<ConvolutionTensor4D> m_outT;
std::unique_ptr<PoolingDescriptor> m_poolDesc;
ConvolveGeometryPtr m_geometry;
std::unique_ptr<ConvolutionEngine<ElemType>> m_convEng;
};
// add this at the start of each derived class, to get access to the members of ComputationNode
@ -493,19 +640,20 @@ protected:
UsingComputationNodeMembersBoilerplate; \
\
protected: \
using Base::m_factory; \
using Base::m_poolDesc; \
using Base::m_geometry; \
using Base::m_convEng; \
using Base::m_windowWidth; \
using Base::m_windowHeight; \
using Base::m_horizontalSubsample; \
using Base::m_verticalSubsample; \
using Base::m_inputSizePerSample; \
using Base::m_outputSizePerSample; \
using Base::m_imageLayoutKind; \
\
public:
// -----------------------------------------------------------------------
// MaxPoolingNode
// Legacy MaxPoolingNode
// -----------------------------------------------------------------------
template <class ElemType>
@ -535,16 +683,13 @@ public:
void Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
if (isFinalValidationPass && m_poolDesc == nullptr)
m_poolDesc = m_factory->CreatePoolDescriptor(PoolingDescriptor::PoolKind::Max, m_windowWidth, m_windowHeight, m_horizontalSubsample, m_verticalSubsample, 0, 0);
if (isFinalValidationPass && m_convEng == nullptr)
m_convEng = ConvolutionEngine<ElemType>::Create(m_geometry, m_deviceId, m_imageLayoutKind, 0, PoolKind::Max);
}
};
template class MaxPoolingNode<float>;
template class MaxPoolingNode<double>;
// -----------------------------------------------------------------------
// AveragePoolingNode
// Legacy AveragePoolingNode
// -----------------------------------------------------------------------
template <class ElemType>
@ -574,12 +719,9 @@ public:
void Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
if (isFinalValidationPass && m_poolDesc == nullptr)
m_poolDesc = m_factory->CreatePoolDescriptor(PoolingDescriptor::PoolKind::Average, m_windowWidth, m_windowHeight, m_horizontalSubsample, m_verticalSubsample, 0, 0);
if (isFinalValidationPass && m_convEng == nullptr)
m_convEng = ConvolutionEngine<ElemType>::Create(m_geometry, m_deviceId, m_imageLayoutKind, 0, PoolKind::Average);
}
};
template class AveragePoolingNode<float>;
template class AveragePoolingNode<double>;
} } }

Просмотреть файл

@ -6,7 +6,6 @@
#include "Basics.h"
#include "ComputationNode.h"
#include "ConvolutionalNodes.h"
#include "Matrix.h"
#include "TensorView.h"

Просмотреть файл

@ -6,7 +6,7 @@
#include "Basics.h"
#include "ComputationNode.h"
#include "ConvolutionEngine.h"
#include "BatchNormalizationEngine.h"
#include <map>
#include <string>
@ -20,8 +20,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
// SquareErrorNode (left, right)
// = SumElements ((left - right) .* (left - right))
// Note: to save computation the gradient may be scaled by an constant.
// TODO: ^^ Dig out what that constant is and document it here. "may be scaled"??
// -----------------------------------------------------------------------
template <class ElemType>
@ -47,9 +45,9 @@ public:
FrameRange fr(Input(0)->GetMBLayout());
m_leftMinusRight->AssignDifferenceOf(Input(0)->ValueFor(fr), Input(1)->ValueFor(fr));
MaskMissingColumnsToZero(*m_leftMinusRight, Input(0)->GetMBLayout(), fr); // we are fine since it will only be called with full minibatch.
ElemType v = m_leftMinusRight->FrobeniusNorm();
ElemType v = m_leftMinusRight->FrobeniusNorm(); // v = sqrt( sum{ (I0[i] - I1[i])^2 } )
Value().VerifySize(1, 1);
Value().SetValue(v * v / 2);
Value().SetValue(v * v); // Value = sum{ (I0[i] - I1[i])^2 }
#if NANCHECK
Value().HasNan("SquareError");
#endif
@ -59,7 +57,7 @@ public:
{
FrameRange fr(Input(0)->GetMBLayout());
auto gradient = Input(inputIndex)->GradientFor(fr);
Matrix<ElemType>::Multiply1x1AndWeightedAdd(inputIndex == 0 ? 1.0f : -1.0f, Gradient() /*1x1*/, *m_leftMinusRight, 1.0f, gradient);
Matrix<ElemType>::Multiply1x1AndWeightedAdd(inputIndex == 0 ? 2.0f : -2.0f, Gradient() /*1x1*/, *m_leftMinusRight, 1.0f, gradient); // O = (I0-I1)^2; dO/dI0 = 2*(I0-I1); dO/dI1 = -2*(I0-I1)
}
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
@ -1522,12 +1520,43 @@ template class DropoutNode<float>;
template class DropoutNode<double>;
// -----------------------------------------------------------------------
// BatchNormalizationNode (...) --TODO: document inputs
// -----------------------------------------------------------------------
// BatchNormalizationNode (input, scale, bias, runMean, runInvStdDev, spatial,
// normalizationTimeConstant = 0, blendTimeConstant = 0,
// epsilon = 0.00001,
// useCntkEngine = true, imageLayout = 'cudnn')
//
// Implements batch normalization technique as described in:
// Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift [S. Ioffe, C. Szegedy]
// http://arxiv.org/abs/1502.03167
// In short, it normalizes layer outputs for every minibatch for each output (feature) independently and applies an affine transformation to preserve the representation of the layer.
// That is, for layer input:
//
// m = mean(input)
// var = variance(input)
// input_norm = (input - mean) / sqrt(var)
// output = gamma * input_norm + beta
//
// where gamma and beta are trainable parameters (represented as LearnableParameter).
//
// * input is the input of the batch normalization node
// * scale is a LearnableParameter that stores the scale vector (gamma term in the equation above).
// * bias is a LearnableParameter that stores the bias vector (beta term). scale and bias must have the same dimensions, which must be equal
// to the input dimensions in the case of spatial = false, or to the number of output convolution feature maps in the case of spatial = true.
// * runMean is the running mean, which is used during the evaluation phase and might be used during training as well.
// It is represented as a LearnableParameter with the same dimensions as scale and bias.
// * runInvStdDev is the running inverse square root of the variance (so InvStdDev = 1 / sqrt(var + epsilon)).
// It is represented as a LearnableParameter with the same dimensions as scale and bias.
// * spatial is a flag that specifies whether to compute mean / var for each feature in a minibatch independently or, in the case of convolutional layers, per feature map.
// * normalizationTimeConstant is the time constant which is used to compute running average of mean and variance.
// Value 0 (default) means there will be no exponential smoothing and running mean / variance will always have values computed for the last seen minibatch.
// Value 1#INF (infinity) means running values are "frozen" (i.e. will not be updated).
// * blendTimeConstant is the time constant which specifies how much of the running mean / var should be "blended" into the mean / var of the current minibatch.
// Value 0 (default) means no blending will happen and only the current minibatch statistics will be used.
// Value 1#INF (infinity) means only the running mean / var will be used (this is used, for example, in the evaluation phase).
// * epsilon is a conditioner constant used in computing InvStdDev
// * useCntkEngine is a boolean flag that specifies which batch normalization implementation to use: CNTK or cuDNN-based.
// * imageLayout is the image layout. Only cudnn is supported.
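//
// A rough per-feature sketch of the training-time computation described above (illustrative
// pseudo-code only; the real work is delegated to BatchNormEngine and uses the expAvgFactor /
// blendFactor values derived from the two time constants):
//
//     mbMean  = mean(input);  mbVar = variance(input)                  // current minibatch statistics
//     runMean = (1 - expAvgFactor) * runMean + expAvgFactor * mbMean   // exponential smoothing (same for runInvStdDev)
//     mean    = blendFactor * runMean + (1 - blendFactor) * mbMean     // blending; analogously for the variance
//     output  = scale * (input - mean) / sqrt(blendedVar + epsilon) + bias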
// -----------------------------------------------------------------------
template <class ElemType>
class BatchNormalizationNode : public ComputationNode<ElemType>, public NumInputs<5>
{
@ -1540,19 +1569,20 @@ class BatchNormalizationNode : public ComputationNode<ElemType>, public NumInput
public:
BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name), m_eval(false), m_spatial(false), m_normTimeConst(0), m_epsilon(0), m_useCntkEngine(true),
: Base(deviceId, name), m_spatial(false), m_normTimeConst(0), m_blendTimeConst(0), m_epsilon(0), m_useCntkEngine(true),
m_mbCount(0), m_imageLayoutKind(ImageLayoutKind::CHW)
{
}
BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name, bool eval, bool spatial, double normalizationTimeConstant, double epsilon,
bool useCntkEngine, ImageLayoutKind imageLayoutKind)
: Base(deviceId, name), m_eval(eval), m_spatial(spatial), m_normTimeConst(normalizationTimeConstant), m_epsilon(epsilon),
m_useCntkEngine(useCntkEngine), m_imageLayoutKind(imageLayoutKind), m_mbCount(0)
BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name, bool spatial, double normalizationTimeConstant, double blendTimeConstant,
double epsilon, bool useCntkEngine, ImageLayoutKind imageLayoutKind)
: Base(deviceId, name), m_spatial(spatial), m_normTimeConst(normalizationTimeConstant), m_blendTimeConst(blendTimeConstant),
m_epsilon(epsilon), m_useCntkEngine(useCntkEngine), m_imageLayoutKind(imageLayoutKind), m_mbCount(0)
{
}
BatchNormalizationNode(const ScriptableObjects::IConfigRecordPtr configp)
: BatchNormalizationNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"eval"), configp->Get(L"spatial"),
configp->Get(L"normalizationTimeConstant"), configp->Get(L"epsilon"), configp->Get(L"useCntkEngine"),
: BatchNormalizationNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"spatial"),
configp->Get(L"normalizationTimeConstant"), configp->Get(L"blendTimeConstant"),
configp->Get(L"epsilon"), configp->Get(L"useCntkEngine"),
ImageLayoutKindFrom(configp->Get(L"imageLayout")))
{
AttachInputsFromConfig(configp, this->GetExpectedNumInputs());
@ -1561,11 +1591,10 @@ public:
void Save(File& fstream) const override
{
Base::Save(fstream);
fstream << m_version.VerWrittenCur() << m_version.VerReadableCur();
fstream << m_eval;
fstream << m_spatial;
fstream << m_normTimeConst;
fstream << m_blendTimeConst;
fstream << (int32_t)m_imageLayoutKind;
fstream << m_mbCount;
fstream << m_epsilon;
@ -1576,40 +1605,56 @@ public:
{
Base::Load(fstream, modelVersion);
// Read and check version.
// REVIEW alexeyk: extract version checking so it can be re-used in other places.
// BUGBUG: We must serialize m_inputLayout.
int32_t verWritten;
int32_t verReadable;
fstream >> verWritten >> verReadable;
if (verReadable > verWritten)
RuntimeError("Corrupt model file.");
if (verWritten < m_version.VerWeCanReadBack())
RuntimeError("Model is too old.");
if (verReadable > m_version.VerWrittenCur())
RuntimeError("Model is too new.");
fstream >> m_eval;
fstream >> m_spatial;
if (verWritten >= 0x00010004)
if (modelVersion >= CNTK_MODEL_VERSION_6)
{
fstream >> m_spatial;
fstream >> m_normTimeConst;
else
{
double expAvgFactor;
fstream >> expAvgFactor;
UNUSED(expAvgFactor); // Used in previous versions, replaced by m_normTimeConst.
}
if (verWritten >= 0x00010002)
{
fstream >> m_blendTimeConst;
fstream >> m_imageLayoutKind;
fstream >> m_mbCount;
}
if (verWritten >= 0x00010003)
{
fstream >> m_epsilon;
fstream >> m_useCntkEngine;
}
else
{
// Use old versioning scheme for older models.
// Read and check version.
// REVIEW alexeyk: extract version checking so it can be re-used in other places.
int32_t verWritten;
int32_t verReadable;
fstream >> verWritten >> verReadable;
if (verReadable > verWritten)
RuntimeError("Corrupt model file.");
if (verWritten < m_version.VerWeCanReadBack())
RuntimeError("Model is too old.");
if (verReadable > m_version.VerWrittenCur())
RuntimeError("Model is too new.");
bool eval;
fstream >> eval;
UNUSED(eval);
fstream >> m_spatial;
if (verWritten >= 0x00010004)
fstream >> m_normTimeConst;
else
{
double expAvgFactor;
fstream >> expAvgFactor;
UNUSED(expAvgFactor); // Used in previous versions, replaced by m_normTimeConst.
}
if (verWritten >= 0x00010002)
{
fstream >> m_imageLayoutKind;
fstream >> m_mbCount;
}
if (verWritten >= 0x00010003)
{
fstream >> m_epsilon;
fstream >> m_useCntkEngine;
}
}
}
void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
@ -1620,9 +1665,9 @@ public:
auto node = dynamic_pointer_cast<BatchNormalizationNode<ElemType>>(nodeP);
assert(node != nullptr);
node->m_eval = m_eval;
node->m_spatial = m_spatial;
node->m_normTimeConst = m_normTimeConst;
node->m_blendTimeConst = m_blendTimeConst;
node->m_imageLayoutKind = m_imageLayoutKind;
node->m_mbCount = m_mbCount;
node->m_epsilon = m_epsilon;
@ -1630,20 +1675,8 @@ public:
}
}
void SetNormalizationTimeConstant(const double normalizationTimeConstant)
{
m_normTimeConst = normalizationTimeConstant;
}
void BackpropTo(const size_t inputIndex, const FrameRange& fr) override
{
static bool m_evalWarningIssued = false; //make sure we only print warning once
if (m_eval && !m_evalWarningIssued)
{
fprintf(stderr, "WARNING: You turned BatchNormalization to evaluation mode during training. Please make sure this is intended.\n");
m_evalWarningIssued = true;
}
if (inputIndex == 0) // derivative with respect to the input.
{
auto sliceOutputGrad = GradientFor(fr);
@ -1651,15 +1684,11 @@ public:
const Matrix<ElemType>& scale = Input(1)->Value();
const Matrix<ElemType>& bias = Input(2)->Value();
size_t batchSize = sliceInputValue.GetNumCols();
m_inT->setN(batchSize);
assert(m_convEng != nullptr);
auto sliceInputGrad = Input(0)->GradientFor(fr);
m_dScale->Resize(scale);
m_dBias->Resize(bias);
// Compute all derivatives in one step. Save derivatives with respect to scale and bias in temp matrices.
m_convEng->BackwardNormalizeBatch(*m_inT, sliceInputValue, sliceOutputGrad, sliceInputGrad, *m_scaleBiasT, scale, m_spatial,
m_bnEng->Backward(sliceInputValue, sliceOutputGrad, sliceInputGrad, scale,
*m_saveMean, *m_saveInvStdDev, *m_dScale, *m_dBias);
}
else if (inputIndex == 1) // derivative with respect to the scale
@ -1701,48 +1730,45 @@ public:
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
size_t batchSize = sliceInputValue.GetNumCols();
m_inT->setN(batchSize);
assert(m_convEng != nullptr);
#if NANCHECK
sliceInputValue.HasNan("BatchNormalization-input");
#endif
if (m_eval)
m_convEng->NormalizeBatchInference(*m_inT, sliceInputValue, *m_scaleBiasT, scale, bias, m_spatial, runMean, runInvStdDev, sliceOutputValue);
double expAvgFactor;
double blendFactor;
if (!Environment().IsTraining())
{
expAvgFactor = 0;
blendFactor = 1.0;
m_saveMean->Resize(0, 0);
m_saveInvStdDev->Resize(0, 0);
}
else
{
double expAvgFactor;
double numSamples = (double)GetMBLayout()->GetActualNumSamples();
if (m_normTimeConst > 0)
{
// Convert to per-minibatch factor.
expAvgFactor = 1.0 - exp(-(double)GetMBLayout()->GetActualNumSamples() / m_normTimeConst);
// Convert to per-minibatch factor. Treat positive infinity as if running mean/var parameters are "frozen"
// that is, do not require updates.
expAvgFactor = !isfinite(m_normTimeConst) ? 0 : (1.0 - exp(-numSamples / m_normTimeConst));
}
else
{
// REVIEW alexeyk: hack, m_normTimeConst < 0 is used to compute CMA.
expAvgFactor = (m_normTimeConst < 0) ? (1.0 / (1.0 + m_mbCount)) : 1;
expAvgFactor = (m_normTimeConst < 0) ? (1.0 / (1.0 + m_mbCount)) : 1.0;
}
if (m_saveMean == nullptr)
fprintf(stderr, "WARNING: m_saveMean is null\n");
if (m_saveInvStdDev == nullptr)
fprintf(stderr, "WARNING: m_saveInvStdDev is null\n");
if (!isfinite(m_blendTimeConst))
blendFactor = 1.0;
else
blendFactor = m_blendTimeConst > 0 ? (m_blendTimeConst / (m_blendTimeConst + numSamples)) : 0;
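// For example (illustrative numbers): with numSamples = 256 and m_normTimeConst = 5000,
// expAvgFactor = 1 - exp(-256/5000) ~= 0.05; with m_blendTimeConst = 0, blendFactor = 0 and
// only the current minibatch statistics are used.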
m_saveMean->Resize(runMean);
m_saveInvStdDev->Resize(runMean);
}
m_convEng->NormalizeBatch(*m_inT, sliceInputValue, *m_scaleBiasT, scale, bias, m_spatial, expAvgFactor, runMean, runInvStdDev,
m_bnEng->Forward(sliceInputValue, scale, bias, expAvgFactor, blendFactor, runMean, runInvStdDev,
sliceOutputValue, m_epsilon, *m_saveMean, *m_saveInvStdDev);
m_mbCount++;
}
#if NANCHECK
sliceOutputValue.HasNan("BatchNormalization-output");
runMean.HasNan("BatchNormalization-runMean");
runInvStdDev.HasNan("BatchNormalization-runInvStdDev");
m_saveMean->HasNan("BatchNormalization-saveMean");
m_saveInvStdDev->HasNan("BatchNormalization-saveInvStdDev");
#endif
}
void Validate(bool isFinalValidationPass) override
{
@ -1756,34 +1782,23 @@ public:
if (m_spatial && m_imageLayoutKind != CHW)
{
InvalidArgument(
"Batch normalization currently supports only cuDNN (CHW) data layout. "
"%ls %ls currently supports only cuDNN (CHW) data layout. "
"Please specify imageLayout=\"cudnn\" in BatchNormalization node in your NDL/BrainScript "
"and make sure your input data layout is CHW");
"and make sure your input data layout is CHW", NodeName().c_str(), OperationName().c_str());
}
double cudnnMinEps = 1e-5; // CUDNN_BN_MIN_EPSILON
if (!m_useCntkEngine && m_epsilon < cudnnMinEps)
fprintf(stderr, "\nWARNING: cuDNN batch normalization requires epsilon >= %e. Epsilon will be reset to that value.\n", cudnnMinEps);
if (m_blendTimeConst < 0)
InvalidArgument("%ls %ls requires blend time constant to be >= 0.", NodeName().c_str(), OperationName().c_str());
auto shape = GetSampleLayout();
if (m_factory == nullptr)
m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
if (m_convEng == nullptr)
m_convEng = m_factory->CreateConvEngine(m_deviceId, m_imageLayoutKind, 0, m_useCntkEngine ? BatchNormImpl::Cntk : BatchNormImpl::CuDnn);
if (m_spatial)
if (m_bnEng == nullptr)
{
auto dims = ImageDimensions(shape, m_imageLayoutKind);
if (m_inT == nullptr)
m_inT = m_factory->CreateTensor(dims.m_width, dims.m_height, dims.m_numChannels, 1);
if (m_scaleBiasT == nullptr)
m_scaleBiasT = m_factory->CreateTensor(1, 1, dims.m_numChannels, 1);
}
else
{
if (m_inT == nullptr)
m_inT = m_factory->CreateTensor(shape.GetNumElements(), 1, 1, 1);
if (m_scaleBiasT == nullptr)
m_scaleBiasT = m_factory->CreateTensor(shape.GetNumElements(), 1, 1, 1);
m_bnEng = BatchNormEngine<ElemType>::Create(m_deviceId, shape, m_spatial, m_imageLayoutKind,
m_useCntkEngine ? BatchNormEngineKind::Cntk : BatchNormEngineKind::CuDnn);
}
}
}
@ -1791,41 +1806,39 @@ public:
void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override
{
Base::RequestMatricesBeforeForwardProp(matrixPool);
//if (!m_eval)
{
RequestMatrixFromPool(m_saveMean, matrixPool);
RequestMatrixFromPool(m_saveInvStdDev, matrixPool);
}
}
void RequestMatricesBeforeBackprop(MatrixPool& matrixPool) override
{
Base::RequestMatricesBeforeBackprop(matrixPool);
//if (!m_eval)
{
RequestMatrixFromPool(m_dScale, matrixPool);
RequestMatrixFromPool(m_dBias, matrixPool);
}
}
void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) override
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
//if (!m_eval)
{
ReleaseMatrixToPool(m_saveMean, matrixPool);
ReleaseMatrixToPool(m_saveInvStdDev, matrixPool);
ReleaseMatrixToPool(m_dScale, matrixPool);
ReleaseMatrixToPool(m_dBias, matrixPool);
}
}
void SetEvalMode(bool bnEvalMode)
void SetNormalizationTimeConstants(double normalizationTimeConstant, double prevNormalizationTimeConstant,
double blendTimeConstant, double prevBlendTimeConstant)
{
m_eval = bnEvalMode;
// As this function is called from SGD solver (global), make sure we don't
// override settings set in NDL when it's not necessary.
if (normalizationTimeConstant != prevNormalizationTimeConstant)
m_normTimeConst = normalizationTimeConstant;
if (blendTimeConstant != prevBlendTimeConstant)
m_blendTimeConst = blendTimeConstant;
}
private:
// Old versioning - do not use. Do not remove until we're sure there are no old models around.
struct VersionInfo
{
//int32_t VerWrittenCur() const { return 0x00010001; } // Initial
@ -1838,13 +1851,20 @@ private:
VersionInfo m_version;
private:
// Determines whether to use training or inference(evaluation) mode.
bool m_eval;
// Determines whether to use per-activation (used after non-convolutional layers like fully connected)
// or spatial (used after convolutional layers).
bool m_spatial;
// Time constant for running mean and variance.
double m_normTimeConst;
// Time constant for blending the running mean/var with the current minibatch mean/var.
// The main idea is to treat the current minibatch statistics as a MAP estimate: a linear interpolation
// of the smoothed (running) and minibatch statistics.
// The idea is due to Frank Seide et al.
// It should also work well in the data-parallel scenario, unlike a plain vanilla BN implementation,
// which would require aggregating statistics from all nodes.
// REVIEW alexeyk: if this works, document it properly in Wiki.
double m_blendTimeConst;
// Epsilon used to compute inverse std deviation.
double m_epsilon;
// Whether to use CNTK or cuDNN BN implementation.
@ -1863,10 +1883,7 @@ private:
// Stores bias derivatives.
shared_ptr<Matrix<ElemType>> m_dBias;
std::unique_ptr<ConvolutionEngineFactory<ElemType>> m_factory;
std::unique_ptr<ConvolutionEngine<ElemType>> m_convEng;
std::unique_ptr<ConvolutionTensor4D> m_inT;
std::unique_ptr<ConvolutionTensor4D> m_scaleBiasT;
std::unique_ptr<BatchNormEngine<ElemType>> m_bnEng;
};
template class BatchNormalizationNode<float>;
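Aside (an illustrative sketch, not part of this commit): the blending described in the m_blendTimeConst comment boils down to a linear interpolation between running and minibatch statistics before normalization. A minimal sketch under that reading; the function name, and the choice to blend the inverse standard deviation directly rather than the variance, are assumptions not taken from this code:

// Illustrative only; blendFactor in [0, 1]: 0 = pure minibatch stats, 1 = pure running stats.
template <class ElemType>
void BlendStatistics(ElemType blendFactor,
                     const ElemType* runMean, const ElemType* runInvStdDev,
                     ElemType* batchMean, ElemType* batchInvStdDev, size_t n)
{
    for (size_t i = 0; i < n; i++)
    {
        batchMean[i]      = blendFactor * runMean[i]      + (1 - blendFactor) * batchMean[i];
        batchInvStdDev[i] = blendFactor * runInvStdDev[i] + (1 - blendFactor) * batchInvStdDev[i];
    }
}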

View file

@ -88,7 +88,7 @@
<ClCompile Include="EvalWrapper.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\..\Common\Include\Eval.h" />
<ClInclude Include="..\..\Common\Include\Eval.h" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">

View file

@ -13,7 +13,7 @@
</Filter>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\..\Common\Include\Eval.h">
<ClInclude Include="..\..\Common\Include\Eval.h">
<Filter>Common\Include</Filter>
</ClInclude>
</ItemGroup>

View file

@ -0,0 +1,131 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#include "BatchNormalizationEngine.h"
#include "CuDnnFactories.h"
namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
void BatchNormEngine<ElemType>::Forward(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runInvStdDev,
Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev)
{
assert(in.GetNumRows() == m_inOutT.GetNumElements());
assert(out.GetNumRows() == m_inOutT.GetNumElements());
assert(in.GetNumCols() == out.GetNumCols());
assert(std::isfinite(expAvgFactor) && (0 <= expAvgFactor && expAvgFactor <= 1));
assert(std::isfinite(blendFactor) && (0 <= blendFactor && blendFactor <= 1));
assert(std::isfinite(epsilon) && epsilon > 0);
if (!m_spatial)
{
assert(m_inOutT.GetNumElements() == scale.GetNumRows());
assert(m_inOutT.GetNumElements() == bias.GetNumRows());
assert(m_inOutT.GetNumElements() == runMean.GetNumRows());
assert(m_inOutT.GetNumElements() == runInvStdDev.GetNumRows());
assert(saveMean.GetNumElements() == 0 || m_inOutT.GetNumElements() == saveMean.GetNumRows());
assert(saveInvStdDev.GetNumElements() == 0 || m_inOutT.GetNumElements() == saveInvStdDev.GetNumRows());
}
else
{
assert((m_inOutT.GetNumElements() % scale.GetNumRows()) == 0);
assert((m_inOutT.GetNumElements() % bias.GetNumRows()) == 0);
assert((m_inOutT.GetNumElements() % runMean.GetNumRows()) == 0);
assert((m_inOutT.GetNumElements() % runInvStdDev.GetNumRows()) == 0);
assert(saveMean.GetNumElements() == 0 || (m_inOutT.GetNumElements() % saveMean.GetNumRows()) == 0);
assert(saveInvStdDev.GetNumElements() == 0 || (m_inOutT.GetNumElements() % saveInvStdDev.GetNumRows()) == 0);
}
assert(scale.GetNumCols() == 1);
assert(bias.GetNumCols() == 1);
assert(runMean.GetNumCols() == 1);
assert(runInvStdDev.GetNumCols() == 1);
assert(saveMean.GetNumElements() == 0 || saveMean.GetNumCols() == 1);
assert(saveInvStdDev.GetNumElements() == 0 || saveInvStdDev.GetNumCols() == 1);
EnsureCompatible();
ForwardCore(in, scale, bias, expAvgFactor, blendFactor, runMean, runInvStdDev, out, epsilon, saveMean, saveInvStdDev);
}
template <class ElemType>
void BatchNormEngine<ElemType>::Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale,
const Mat& saveMean, const Mat& saveInvStdDev, Mat& scaleGrad, Mat& biasGrad)
{
EnsureCompatible();
BackwardCore(in, srcGrad, grad, scale, saveMean, saveInvStdDev, scaleGrad, biasGrad);
}
template <class ElemType>
class CntkBatchNormEngine : public BatchNormEngine<ElemType>
{
public:
using Base = BatchNormEngine<ElemType>;
using typename Base::Mat;
public:
CntkBatchNormEngine(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
bool spatial, ImageLayoutKind imageLayout)
: Base(deviceId, inOutT, spatial, imageLayout)
{
}
protected:
using Base::m_deviceId;
using Base::m_imageLayout;
using Base::m_inOutT;
using Base::m_spatial;
void EnsureCompatible() override
{
if (m_spatial && m_imageLayout == ImageLayoutKind::HWC)
InvalidArgument("CNTK batch normalization supports only cudnn(CHW) layout.");
}
void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runInvStdDev,
Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) override
{
in.BatchNormalizationForward(scale, bias, expAvgFactor, blendFactor, runMean, runInvStdDev, out, epsilon, saveMean, saveInvStdDev);
}
void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, const Mat& saveMean, const Mat& saveInvStdDev,
Mat& scaleGrad, Mat& biasGrad) override
{
srcGrad.BatchNormalizationBackward(in, grad, scale, saveMean, saveInvStdDev, scaleGrad, biasGrad);
}
};
template class CntkBatchNormEngine<float>;
template class CntkBatchNormEngine<double>;
template <typename T>
bool HasFlag(T src, T testFlag)
{
return ((int)src & (int)testFlag) != 0;
}
template <class ElemType>
std::unique_ptr<BatchNormEngine<ElemType>> BatchNormEngine<ElemType>::Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
bool spatial, ImageLayoutKind imageLayout,
BatchNormEngineKind enabledEngines)
{
// Use CNTK as default batch norm engine.
if (HasFlag(enabledEngines, BatchNormEngineKind::Cntk))
{
fprintf(stderr, "\nUsing CNTK batch normalization engine.\n");
return std::make_unique<CntkBatchNormEngine<ElemType>>(deviceId, inOutT, spatial, imageLayout);
}
if (HasFlag(enabledEngines, BatchNormEngineKind::CuDnn))
{
fprintf(stderr, "\nUsing cuDNN batch normalization engine.\n");
return CuDnnBatchNormEngineFactory<ElemType>::Create(deviceId, inOutT, spatial, imageLayout);
}
RuntimeError("Could not find appropriate batch normalization engine.");
}
template class BatchNormEngine<float>;
template class BatchNormEngine<double>;
} } }
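For orientation (an illustrative sketch, not part of the diff): how the Create factory and the Forward call defined above fit together. The wrapper name and the chosen factor values are placeholders; matrices are assumed to be pre-allocated with matching shapes:

// Hypothetical caller of the batch-norm engine API shown above.
template <class ElemType>
void RunBatchNormForward(DEVICEID_TYPE deviceId, const TensorShape& shape, bool spatial, ImageLayoutKind layout,
                         const Matrix<ElemType>& in, const Matrix<ElemType>& scale, const Matrix<ElemType>& bias,
                         Matrix<ElemType>& runMean, Matrix<ElemType>& runInvStdDev, Matrix<ElemType>& out,
                         Matrix<ElemType>& saveMean, Matrix<ElemType>& saveInvStdDev)
{
    // Picks the CNTK engine by default, cuDNN otherwise (see Create above).
    auto engine = BatchNormEngine<ElemType>::Create(deviceId, shape, spatial, layout);
    double expAvgFactor = 1.0; // 1 => running stats are replaced by the current minibatch stats
    double blendFactor  = 0.0; // 0 => normalize with pure minibatch statistics
    double epsilon      = 1e-5;
    engine->Forward(in, scale, bias, expAvgFactor, blendFactor, runMean, runInvStdDev, out, epsilon, saveMean, saveInvStdDev);
}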

View file

@ -0,0 +1,73 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "Matrix.h"
#include "TensorShape.h" // for ImageLayoutKind
namespace Microsoft { namespace MSR { namespace CNTK {
//-------------------------------------------------------------
// Batch normalization engine interface.
//-------------------------------------------------------------
enum class BatchNormEngineKind
{
None = 0,
Cntk = 1,
CuDnn = 1 << 1,
All = Cntk | CuDnn
};
#pragma warning(push)
#pragma warning(disable : 4251)
template <class ElemType>
class MATH_API BatchNormEngine
{
public:
using Mat = Matrix<ElemType>;
public:
virtual ~BatchNormEngine() = default;
void Forward(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runInvStdDev,
Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev);
void Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, const Mat& saveMean, const Mat& saveInvStdDev,
Mat& scaleGrad, Mat& biasGrad);
static std::unique_ptr<BatchNormEngine<ElemType>> Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
bool spatial, ImageLayoutKind imageLayout,
BatchNormEngineKind enabledEngines = BatchNormEngineKind::All);
DISABLE_COPY_AND_MOVE(BatchNormEngine);
protected:
BatchNormEngine(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
bool spatial, ImageLayoutKind imageLayout)
: m_deviceId(deviceId), m_inOutT(inOutT), m_spatial(spatial), m_imageLayout(imageLayout)
{
}
virtual void EnsureCompatible() = 0;
virtual void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runInvStdDev,
Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) = 0;
virtual void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, const Mat& saveMean, const Mat& saveInvStdDev,
Mat& scaleGrad, Mat& biasGrad) = 0;
protected:
DEVICEID_TYPE m_deviceId;
TensorShape m_inOutT;
bool m_spatial;
ImageLayoutKind m_imageLayout;
};
#pragma warning(pop)
} } }

View file

@ -4085,6 +4085,257 @@ CPUMatrix<ElemType>& CPUMatrix<ElemType>::AddAveragePoolingGradient(const CPUMat
}
#pragma endregion Other Helper Functions
template <class ElemType>
void CPUMatrix<ElemType>::ConvolutionForward(const CPUMatrix<ElemType>& kernel, const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIwht,
const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& output) const
{
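// Geometry tables, as used by the loops below (and shared by the other convolution/pooling routines):
//   mpRowCol(row)  - base offset into the input vector for output element 'row';
//   mpRowIwht(row) - base index into the kernel weight buffer for that output element;
//   mpRowRun(row)  - start index into 'runs', which stores [skip, size, 'size' column deltas, 'size' mask flags].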
#pragma omp parallel for
for (int64_t sample = 0; sample < (int64_t)output.GetNumCols(); sample++)
{
for (size_t row = 0; row < output.GetNumRows(); row++)
{
int colBase = mpRowCol(row, 0);
int ivBase = mpRowIwht(row, 0);
assert(0 <= colBase && colBase < GetNumRows());
ElemType sum = 0;
int i0 = mpRowRun(row, 0);
int skip = runs(i0++, 0);
int size = runs(i0++, 0);
int imask = i0 + size;
for (int i = 0; i < size; i++)
{
if (runs(imask + i, 0) == 0)
continue;
int dcol = runs(i0 + i, 0);
assert(0 <= colBase + dcol && colBase + dcol < GetNumRows());
sum += kernel.BufferPointer()[ivBase + skip + i] * (*this)(colBase + dcol, sample);
}
output(row, sample) = sum;
}
}
}
template <class ElemType>
void CPUMatrix<ElemType>::ConvolutionBackwardData(const CPUMatrix<ElemType>& kernel, const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIwht,
const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& grad) const
{
#pragma omp parallel for
for (int64_t sample = 0; sample < (int64_t)GetNumCols(); sample++)
{
for (size_t row = 0; row < GetNumRows(); row++)
{
int colBase = mpRowCol(row, 0);
int ivBase = mpRowIwht(row, 0);
assert(0 <= colBase && colBase < grad.GetNumRows());
ElemType curGrad = (*this)(row, sample);
int i0 = mpRowRun(row, 0);
int skip = runs(i0++, 0);
int size = runs(i0++, 0);
int imask = i0 + size;
for (int i = 0; i < size; i++)
{
if (runs(imask + i, 0) == 0)
continue;
int dcol = runs(i0 + i, 0);
assert(0 <= colBase + dcol && colBase + dcol < grad.GetNumRows());
grad(colBase + dcol, sample) += curGrad * kernel.BufferPointer()[ivBase + skip + i];
}
}
}
}
template <class ElemType>
void CPUMatrix<ElemType>::ConvolutionBackwardKernel(const CPUMatrix<ElemType>& in, const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIwht,
const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& kernelGrad) const
{
// Do NOT parallelize these loops!
for (size_t sample = 0; sample < GetNumCols(); sample++)
{
for (size_t row = 0; row < GetNumRows(); row++)
{
int colBase = mpRowCol(row, 0);
int ivBase = mpRowIwht(row, 0);
assert(0 <= colBase && colBase < in.GetNumRows());
ElemType curGrad = (*this)(row, sample);
int i0 = mpRowRun(row, 0);
int skip = runs(i0++, 0);
int size = runs(i0++, 0);
int imask = i0 + size;
for (int i = 0; i < size; i++)
{
if (runs(imask + i, 0) == 0)
continue;
int dcol = runs(i0 + i, 0);
assert(0 <= colBase + dcol && colBase + dcol < in.GetNumRows());
kernelGrad.BufferPointer()[ivBase + skip + i] += curGrad * in(colBase + dcol, sample);
}
}
}
}
template <class ElemType>
void CPUMatrix<ElemType>::MaxPoolingForward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices, CPUMatrix<ElemType>& output) const
{
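// mpRowIndices(row) points into 'indices', which stores [size, 'size' input-offset deltas] for output element 'row'.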
#pragma omp parallel for
for (int64_t sample = 0; sample < (int64_t)output.GetNumCols(); sample++)
{
for (size_t row = 0; row < output.GetNumRows(); row++)
{
int colBase = mpRowCol(row, 0);
assert(0 <= colBase && colBase < GetNumRows());
assert(std::numeric_limits<ElemType>::has_infinity);
ElemType res = -std::numeric_limits<ElemType>::infinity();
int i0 = mpRowIndices(row, 0);
int size = indices(i0++, 0);
assert(size > 0);
for (int i = 0; i < size; i++)
{
int dcol = indices(i0 + i, 0);
assert(0 <= colBase + dcol && colBase + dcol < GetNumRows());
res = std::max(res, (*this)(colBase + dcol, sample));
}
output(row, sample) = res;
}
}
}
template <class ElemType>
void CPUMatrix<ElemType>::MaxPoolingBackward(const CPUMatrix<ElemType>& out, const CPUMatrix<ElemType>& in,
const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices,
CPUMatrix<ElemType>& grad) const
{
#pragma omp parallel for
for (int64_t sample = 0; sample < (int64_t)GetNumCols(); sample++)
{
for (size_t row = 0; row < GetNumRows(); row++)
{
int colBase = mpRowCol(row, 0);
assert(0 <= colBase && colBase < grad.GetNumRows());
int i0 = mpRowIndices(row, 0);
int size = indices(i0++, 0);
assert(size > 0);
ElemType g = (*this)(row, sample);
ElemType m = out(row, sample);
for (int i = 0; i < size; i++)
{
int dcol = indices(i0 + i, 0);
assert(0 <= colBase + dcol && colBase + dcol < grad.GetNumRows());
if (in(colBase + dcol, sample) >= m)
grad(colBase + dcol, sample) += g;
}
}
}
}
template <class ElemType>
void CPUMatrix<ElemType>::AveragePoolingForward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices, CPUMatrix<ElemType>& output) const
{
#pragma omp parallel for
for (int64_t sample = 0; sample < (int64_t)output.GetNumCols(); sample++)
{
for (size_t row = 0; row < output.GetNumRows(); row++)
{
int colBase = mpRowCol(row, 0);
assert(0 <= colBase && colBase < GetNumRows());
ElemType sum = 0;
int i0 = mpRowIndices(row, 0);
int size = indices(i0++, 0);
assert(size > 0);
for (int i = 0; i < size; i++)
{
int dcol = indices(i0 + i, 0);
assert(0 <= colBase + dcol && colBase + dcol < GetNumRows());
sum += (*this)(colBase + dcol, sample);
}
// Note that we divide by size which is the number of actual elements (does not include padding).
output(row, sample) = sum / size;
}
}
}
template <class ElemType>
void CPUMatrix<ElemType>::AveragePoolingBackward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices, CPUMatrix<ElemType>& grad) const
{
#pragma omp parallel for
for (int64_t sample = 0; sample < (int64_t)GetNumCols(); sample++)
{
for (size_t row = 0; row < GetNumRows(); row++)
{
int colBase = mpRowCol(row, 0);
assert(0 <= colBase && colBase < grad.GetNumRows());
int i0 = mpRowIndices(row, 0);
int size = indices(i0++, 0);
assert(size > 0);
ElemType g = (*this)(row, sample) / size;
for (int i = 0; i < size; i++)
{
int dcol = indices(i0 + i, 0);
assert(0 <= colBase + dcol && colBase + dcol < grad.GetNumRows());
grad(colBase + dcol, sample) += g;
}
}
}
}
template <class ElemType>
void CPUMatrix<ElemType>::BatchNormalizationForward(const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor,
CPUMatrix<ElemType>& runMean, CPUMatrix<ElemType>& runInvStdDev, CPUMatrix<ElemType>& out, double epsilon,
CPUMatrix<ElemType>& saveMean, CPUMatrix<ElemType>& saveInvStdDev) const
{
UNUSED(epsilon); UNUSED(saveMean); UNUSED(saveInvStdDev);
assert((GetNumRows() % scale.GetNumRows()) == 0);
if (expAvgFactor != 0 || blendFactor != 1)
RuntimeError("Batch normalization training on CPU is not yet implemented.");
bool spatial = GetNumRows() != scale.GetNumRows();
if (spatial)
{
size_t spatialSize = GetNumRows() / scale.GetNumRows();
#pragma omp parallel for
for (long icol = 0; icol < out.GetNumCols(); icol++)
{
for (long irow = 0; irow < out.GetNumRows(); irow++)
{
size_t imap = irow / spatialSize;
out(irow, icol) = scale(imap, 0) * ((*this)(irow, icol) - runMean(imap, 0)) * runInvStdDev(imap, 0) + bias(imap, 0);
}
}
}
else
{
#pragma omp parallel for
for (long icol = 0; icol < out.GetNumCols(); icol++)
{
for (long irow = 0; irow < out.GetNumRows(); irow++)
{
out(irow, icol) = scale(irow, 0) * ((*this)(irow, icol) - runMean(irow, 0)) * runInvStdDev(irow, 0) + bias(irow, 0);
}
}
}
}
template <class ElemType>
void CPUMatrix<ElemType>::BatchNormalizationBackward(const CPUMatrix<ElemType>& in, CPUMatrix<ElemType>& grad, const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& saveMean, const CPUMatrix<ElemType>& saveInvStdDev,
CPUMatrix<ElemType>& scaleGrad, CPUMatrix<ElemType>& biasGrad) const
{
UNUSED(in); UNUSED(grad); UNUSED(scale); UNUSED(saveMean); UNUSED(saveInvStdDev); UNUSED(scaleGrad); UNUSED(biasGrad);
RuntimeError("Batch normalization training on CPU is not yet implemented.");
}
#pragma region Static BLAS Functions
/// <summary>Matrix-matrix multiply with col-major matrices (a and b may be transposed): c = alpha * op(a) * op(b) + beta*c</summary>
@ -5943,4 +6194,8 @@ template void CPUMatrix<char>::SetValue(const char);
template void CPUMatrix<char>::SetValue(const size_t numRows, const size_t numCols, char* pArray, size_t matrixFlags);
template void CPUMatrix<char>::SetValue(CPUMatrix<char> const&);
template void CPUMatrix<char>::Resize(const size_t numRows, const size_t numCols, bool growOnly);
} } }
template CPUMatrix<int>::CPUMatrix(const size_t, const size_t, int*, const size_t);
template CPUMatrix<int>::~CPUMatrix();
}}}

View file

@ -317,6 +317,27 @@ public:
const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);
void ConvolutionForward(const CPUMatrix<ElemType>& kernel, const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIwht,
const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& output) const;
void ConvolutionBackwardData(const CPUMatrix<ElemType>& kernel, const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIwht,
const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& grad) const;
void ConvolutionBackwardKernel(const CPUMatrix<ElemType>& in, const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIwht,
const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& kernelGrad) const;
void MaxPoolingForward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices, CPUMatrix<ElemType>& output) const;
void MaxPoolingBackward(const CPUMatrix<ElemType>& out, const CPUMatrix<ElemType>& in,
const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices,
CPUMatrix<ElemType>& grad) const;
void AveragePoolingForward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices, CPUMatrix<ElemType>& output) const;
void AveragePoolingBackward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices,
CPUMatrix<ElemType>& grad) const;
void BatchNormalizationForward(const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor, CPUMatrix<ElemType>& runMean, CPUMatrix<ElemType>& runInvStdDev,
CPUMatrix<ElemType>& out, double epsilon, CPUMatrix<ElemType>& saveMean, CPUMatrix<ElemType>& saveInvStdDev) const;
void BatchNormalizationBackward(const CPUMatrix<ElemType>& in, CPUMatrix<ElemType>& grad, const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& saveMean, const CPUMatrix<ElemType>& saveInvStdDev,
CPUMatrix<ElemType>& scaleGrad, CPUMatrix<ElemType>& biasGrad) const;
public:
static int SetNumThreads(int numThreads); // note: this does not depend on <ElemType>, i.e. you can call it on any <ElemType>
@ -457,4 +478,5 @@ private:
typedef CPUMatrix<float> CPUSingleMatrix;
typedef CPUMatrix<double> CPUDoubleMatrix;
} } }
}}}

View file

@ -1335,4 +1335,7 @@ template CPUSparseMatrix<char> CPUSparseMatrix<char>::ColumnSlice(size_t startCo
template CPUMatrix<char> CPUSparseMatrix<char>::CopyColumnSliceToDense(size_t startColumn, size_t numCols) const;
template CPUSparseMatrix<char>& CPUSparseMatrix<char>::operator=(const CPUSparseMatrix<char>& deepCopyFrom);
template CPUSparseMatrix<int>::CPUSparseMatrix(const MatrixFormat, const size_t, const size_t, const size_t);
template CPUSparseMatrix<int>::~CPUSparseMatrix();
}}}

View file

@ -0,0 +1,963 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable : 4100)
#pragma warning(disable : 4127)
#pragma warning(disable : 4201)
#pragma warning(disable : 4515)
#endif
#include <cub/cub.cuh>
#ifdef _MSC_VER
#pragma warning(pop)
#endif
namespace Microsoft { namespace MSR { namespace CNTK {
size_t RoundUpToMultiple(size_t n, size_t blockSize)
{
return (n + blockSize - 1) / blockSize;
}
cudaError_t GetLastCudaError()
{
cudaError_t prelaunchErr = cudaGetLastError();
assert(cudaSuccess == prelaunchErr);
if (prelaunchErr != cudaSuccess)
return prelaunchErr;
#ifndef NO_SYNC
cudaError_t executionErr = cudaStreamSynchronize(GetStream());
assert(cudaSuccess == executionErr);
if (executionErr != cudaSuccess)
return executionErr;
#endif
return cudaSuccess;
}
template <int U, typename T>
__device__ __forceinline__ void LoadValues(const T* src, T dst[U])
{
#pragma unroll
for (int i = 0; i < U; i++)
dst[i] = src[i];
}
template <>
__device__ __forceinline__ void LoadValues<2, float>(const float* src, float dst[2])
{
// src must be aligned at an 8-byte boundary.
assert(reinterpret_cast<uintptr_t>(src) % sizeof(float2) == 0);
auto v = *(const float2*)src;
dst[0] = v.x;
dst[1] = v.y;
}
template <>
__device__ __forceinline__ void LoadValues<4, float>(const float* src, float dst[4])
{
// src must be aligned at a 16-byte boundary.
// Note: sizeof(dst) decays to the size of a pointer here, so spell out the intended 16-byte check.
assert(reinterpret_cast<uintptr_t>(src) % sizeof(float4) == 0);
// Can do the following instead (use ld.global.nc.* on CC 3.5+):
// asm volatile("ld.global.v4.f32 {%0, %1, %2, %3}, [%4];" : "=f"(v.x), "=f"(v.y), "=f"(v.z), "=f"(v.w) : "l"(src));
// Similar for shared memory (e.g. ld.shared.*)
auto v = *(const float4*)src;
dst[0] = v.x;
dst[1] = v.y;
dst[2] = v.z;
dst[3] = v.w;
}
template <int U, typename T>
__device__ __forceinline__ void StoreValues(const T src[U], T* dst)
{
#pragma unroll
for (int i = 0; i < U; i++)
dst[i] = src[i];
}
template <>
__device__ __forceinline__ void StoreValues<2, float>(const float src[2], float* dst)
{
// dst must be aligned at an 8-byte boundary.
assert(reinterpret_cast<uintptr_t>(dst) % sizeof(float2) == 0);
float2 v;
v.x = src[0];
v.y = src[1];
*(reinterpret_cast<float2*>(dst)) = v;
}
template <>
__device__ __forceinline__ void StoreValues<4, float>(const float src[4], float* dst)
{
// dst must be aligned at a 16-byte boundary.
// Note: sizeof(src) decays to the size of a pointer here, so spell out the intended 16-byte check.
assert(reinterpret_cast<uintptr_t>(dst) % sizeof(float4) == 0);
float4 v;
v.x = src[0];
v.y = src[1];
v.z = src[2];
v.w = src[3];
*(reinterpret_cast<float4*>(dst)) = v;
}
template <typename T>
__device__ __forceinline__ T Shuffle(T input, int srcLane)
{
// shfl is supported only on Kepler+. We really don't care about Fermi anymore but our build still has sm_20.
#if __CUDA_ARCH__ >= 300
return cub::ShuffleIndex(input, srcLane);
#else
// REVIEW alexeyk: make static_assert once we remove SM 2.0 support from our build.
assert(false);
return input;
#endif
}
namespace Operations
{
__device__ float RSqrt(float a)
{
// REVIEW alexeyk: rsqrtf is just one MUFU.RSQ instruction so it's faster than
// __frsqrt_rn intrinsic which performs round-to-nearest-even rounding which adds ~10 other instructions.
// __frsqrt_rn is unbiased rounding though, need to verify whether it is a better choice for BN implementation.
//return __frsqrt_rn(a);
return rsqrtf(a);
}
__device__ double RSqrt(double a)
{
return rsqrt(a);
}
}
// This function is used to select the correct unroll factor at runtime.
// REVIEW alexeyk: ask our C++ gurus (Marko/Amit) if there is a better way.
template <template <int> class Func, typename T, typename ...Targs>
void Call(size_t vectorSize, Targs... args)
{
if ((vectorSize % 4) == 0)
Func<4>::template Call<T>(args...);
else if ((vectorSize % 2) == 0)
Func<2>::template Call<T>(args...);
else
Func<1>::template Call<T>(args...);
}
//--------------------------------------------------------------------
// Mean and variance computation
//--------------------------------------------------------------------
// The kernel implements an online, parallel, and numerically stable algorithm
// for computing the batch mean and variance (here the inverse standard deviation) in one pass over the data.
// It uses the algorithms of Knuth/Welford and Chan et al (http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf).
// In short, the algorithm has 2 steps:
// 1. Each thread strides over the input and computes a running mean and
//    m2 value (used to compute the variance at the end) - Welford's algorithm.
// 2. A parallel reduction (Chan's algorithm) is performed by columns (note that the
//    thread block and grid X dimensions go along the vector, and the Y dimension - along the batch).
// As a result, each block has 2 * blockDim.x (mean and inverse stddev) values to write at the end.
//
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
__global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, const ElemType* x, double expAvgFactor, ElemType* runMean, ElemType* runInvStdDev,
double epsilon, ElemType* xMean, ElemType* xInvStdDev)
{
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
assert((vectorSize % U) == 0);
assert(blockDim.x == BlockDimX);
assert(blockDim.y == BlockDimY);
assert(blockDim.z == 1);
assert(gridDim.y == 1);
assert(gridDim.z == 1);
assert(::isfinite(epsilon) && epsilon > 0);
assert(::isfinite(expAvgFactor) && expAvgFactor > 0);
int irowSrcBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
if (irowSrcBase >= vectorSize)
return;
assert(irowSrcBase + U <= vectorSize);
int n = 0;
ElemType mean[U];
ElemType m2[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
mean[k] = 0;
m2[k] = 0;
}
int icolSrc = threadIdx.y;
const ElemType* psrc = x + static_cast<size_t>(icolSrc) * vectorSize + irowSrcBase;
// Stride over all vectors in the batch.
for (; icolSrc < batchSize; icolSrc += BlockDimY)
{
n++;
ElemType curVal[U];
LoadValues<U>(psrc, curVal);
// No need for separate unrolling, SASS looks good.
#pragma unroll
for (int k = 0; k < U; k++)
{
ElemType d = curVal[k] - mean[k];
// REVIEW alexeyk: we enabled fast CUDA math in CNTK so division below will be approximate, is this a problem?
// Using precise math slows down the code by about 40%.
mean[k] += d / n;
m2[k] += d * (curVal[k] - mean[k]);
}
psrc += vectorSize * BlockDimY;
}
const int tid = threadIdx.y * BlockDimX + threadIdx.x;
const int laneId = tid & 0x1f;
// First, reduce within warp using shuffle.
if (n > 0)
{
#pragma unroll
for (int i = 1; i < CUB_PTX_WARP_THREADS / BlockDimX; i *= 2)
{
int srcLane = laneId + BlockDimX * i;
int n2 = Shuffle(n, srcLane);
int nsum = n + n2;
ElemType d[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
d[k] = Shuffle(mean[k], srcLane) - mean[k];
ElemType dScaled = d[k] * n2 / nsum;
mean[k] += dScaled;
m2[k] += Shuffle(m2[k], srcLane) + d[k] * n * dScaled;
}
n = nsum;
}
}
// Storage for each warp in a thread block. First warp ("accumulator") holds
// final results so it does not need shared memory.
const int cwarp = BlockDimX * BlockDimY / CUB_PTX_WARP_THREADS;
__shared__ ElemType meanRes[BlockDimX * U][cwarp - 1];
__shared__ ElemType m2Res[BlockDimX * U][cwarp - 1];
__shared__ int nRes[cwarp - 1];
// Each warp (except warp0) will write accumulated results to shared memory.
const int iwarp = tid / CUB_PTX_WARP_THREADS;
if (iwarp > 0 && laneId < BlockDimX)
{
if (laneId == 0)
nRes[iwarp - 1] = n;
#pragma unroll
for (int k = 0; k < U; k++)
{
meanRes[laneId * U + k][iwarp - 1] = mean[k];
m2Res[laneId * U + k][iwarp - 1] = m2[k];
}
}
__syncthreads();
// Accumulate and write final results.
// REVIEW alexeyk: see if atomicAdd can be used instead, do perf comparison.
if (threadIdx.y == 0)
{
// Use simple loop as number of warps is small, 8 at max.
#pragma unroll
for (int i = 0; i < cwarp - 1; i++)
{
int n2 = nRes[i];
int nsum = n + n2;
ElemType d[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
d[k] = meanRes[threadIdx.x * U + k][i] - mean[k];
ElemType dScaled = d[k] * n2 / nsum;
mean[k] += dScaled;
m2[k] += m2Res[threadIdx.x * U + k][i] + d[k] * n * dScaled;
}
n = nsum;
}
size_t idxDstBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
// Store mean and running mean.
StoreValues<U>(mean, xMean + idxDstBase);
if (expAvgFactor == 1)
StoreValues<U>(mean, runMean + idxDstBase);
else
{
ElemType run[U];
LoadValues<U>(runMean + idxDstBase, run);
#pragma unroll
for (int k = 0; k < U; k++)
run[k] = expAvgFactor * mean[k] + (1.0 - expAvgFactor) * run[k];
StoreValues<U>(run, runMean + idxDstBase);
}
// Store inv std dev and its running version.
#pragma unroll
for (int k = 0; k < U; k++)
{
m2[k] = Operations::RSqrt(static_cast<ElemType>(m2[k] / batchSize + epsilon));
}
StoreValues<U>(m2, xInvStdDev + idxDstBase);
if (expAvgFactor == 1)
StoreValues<U>(m2, runInvStdDev + idxDstBase);
else
{
ElemType run[U];
LoadValues<U>(runInvStdDev + idxDstBase, run);
#pragma unroll
for (int k = 0; k < U; k++)
run[k] = expAvgFactor * m2[k] + (1.0 - expAvgFactor) * run[k];
StoreValues<U>(run, runInvStdDev + idxDstBase);
}
}
}
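For reference, an illustrative host-side restatement (not part of this file) of the two steps named in the comment above: Welford's online update per thread, and the Chan et al. merge performed by the warp/block reduction:

// Sketch only: sequential forms of the per-thread update and the pairwise merge.
struct Moments { double mean = 0, m2 = 0; long long n = 0; };

inline void WelfordUpdate(Moments& a, double x)       // accumulate one sample
{
    a.n++;
    double d = x - a.mean;
    a.mean += d / a.n;
    a.m2 += d * (x - a.mean);
}

inline Moments ChanMerge(Moments a, const Moments& b) // combine two partial results
{
    if (b.n == 0)
        return a;
    long long n = a.n + b.n;
    double d = b.mean - a.mean;
    a.mean += d * b.n / n;
    a.m2 += b.m2 + d * (d * a.n / n * b.n);           // == d^2 * nA * nB / (nA + nB), as in the kernel
    a.n = n;
    return a;
}
// Finally, variance = m2 / N and invStdDev = 1 / sqrt(variance + epsilon), as computed above.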
// This kernel is very similar to kComputeBatchMeanAndInvStdDev except it reduces not just over N (minibatch)
// but also W and H dimensions.
// REVIEW alexeyk: is it possible to combine this and previous kernel into a single kernel without hurting performance/readability much?
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
__global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatialSize, int batchSize, const ElemType* x,
double expAvgFactor, ElemType* runMean, ElemType* runInvStdDev,
double epsilon, ElemType* xMean, ElemType* xInvStdDev)
{
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
assert(blockDim.x == BlockDimX);
assert(blockDim.y == BlockDimY);
assert(blockDim.z == 1);
assert(gridDim.y == 1);
assert(gridDim.z == 1);
assert((spatialSize % U) == 0);
assert((vectorSize % spatialSize) == 0);
assert(::isfinite(expAvgFactor) && expAvgFactor > 0);
assert(::isfinite(epsilon) && epsilon > 0);
int irowSrcBase = blockIdx.x * spatialSize + threadIdx.x * U;
if (irowSrcBase >= vectorSize)
return;
assert(irowSrcBase + U <= vectorSize);
int irowSrcLim = (blockIdx.x + 1) * spatialSize;
int n = 0;
ElemType mean[U];
ElemType m2[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
mean[k] = 0;
m2[k] = 0;
}
int icolSrc = threadIdx.y;
const ElemType* psrcBase = x + static_cast<size_t>(icolSrc) * vectorSize + irowSrcBase;
// Stride over all vectors in the batch.
for (; icolSrc < batchSize; icolSrc += BlockDimY)
{
const ElemType* psrc = psrcBase;
// Stride over all values in feature map (W and H dimensions).
for (int irowSrc = irowSrcBase; irowSrc < irowSrcLim; irowSrc += BlockDimX * U, psrc += BlockDimX * U)
{
n++;
ElemType curVal[U];
LoadValues<U>(psrc, curVal);
// No need for separate unrolling, SASS looks good.
#pragma unroll
for (int k = 0; k < U; k++)
{
ElemType d = curVal[k] - mean[k];
// REVIEW alexeyk: we enabled fast CUDA math in CNTK so division below will be approximate, is this a problem?
// Using precise math slows down the code by about 40%.
mean[k] += d / n;
m2[k] += d * (curVal[k] - mean[k]);
}
}
psrcBase += vectorSize * BlockDimY;
}
const int tid = threadIdx.y * BlockDimX + threadIdx.x;
const int laneId = tid & 0x1f;
// First, reduce within warp using shuffle.
if (n > 0)
{
#pragma unroll
for (int i = 1; i < CUB_PTX_WARP_THREADS; i *= 2)
{
int srcLane = laneId + i;
int n2 = Shuffle(n, srcLane);
int nsum = n + n2;
ElemType d[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
d[k] = Shuffle(mean[k], srcLane) - mean[k];
ElemType dScaled = d[k] * n2 / nsum;
mean[k] += dScaled;
m2[k] += Shuffle(m2[k], srcLane) + d[k] * n * dScaled;
}
n = nsum;
}
}
// Storage for each warp in a thread block. First warp ("accumulator") holds
// final results so it does not need shared memory.
const int cwarp = BlockDimX * BlockDimY / CUB_PTX_WARP_THREADS;
__shared__ ElemType meanRes[U][cwarp - 1];
__shared__ ElemType m2Res[U][cwarp - 1];
__shared__ int nRes[cwarp - 1];
// Each warp (except warp0) will write accumulated results to shared memory.
const int iwarp = tid / CUB_PTX_WARP_THREADS;
if (iwarp > 0 && laneId == 0)
{
nRes[iwarp - 1] = n;
#pragma unroll
for (int k = 0; k < U; k++)
{
meanRes[k][iwarp - 1] = mean[k];
m2Res[k][iwarp - 1] = m2[k];
}
}
__syncthreads();
// One thread will accumulate and write final results.
if (tid == 0)
{
// Use simple loop as number of warps is small, 8 at max.
#pragma unroll
for (int i = 0; i < cwarp - 1; i++)
{
int n2 = nRes[i];
int nsum = n + n2;
ElemType d[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
d[k] = meanRes[k][i] - mean[k];
ElemType dScaled = d[k] * n2 / nsum;
mean[k] += dScaled;
m2[k] += m2Res[k][i] + d[k] * n * dScaled;
}
n = nsum;
}
// Final step - accumulate results in mean[0] and m2[0].
// REVIEW alexeyk: move outside of the loop, before storing values to smem.
#pragma unroll
for (int k = 1; k < U; k++)
{
ElemType d = mean[k] - mean[0];
ElemType dScaled = d * n / (n + k * n);
mean[0] += dScaled;
m2[0] += m2[k] + d * k * n * dScaled;
}
xMean[blockIdx.x] = mean[0];
runMean[blockIdx.x] = (expAvgFactor == 1) ? mean[0] : (expAvgFactor * mean[0] + (1.0 - expAvgFactor) * runMean[blockIdx.x]);
m2[0] = Operations::RSqrt(static_cast<ElemType>(m2[0] / (batchSize * spatialSize) + epsilon));
xInvStdDev[blockIdx.x] = m2[0];
runInvStdDev[blockIdx.x] = (expAvgFactor == 1) ? m2[0] : (expAvgFactor * m2[0] + (1.0 - expAvgFactor) * runInvStdDev[blockIdx.x]);
}
}
// The struct is used by the Call function to select the proper template at runtime based on the size of the vector.
// The same pattern is used in the other, similar structs below.
template <int U>
struct ComputeBatchMeanAndInvStdDev
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t batchSize, const ElemType* x, double expAvgFactor, ElemType* runMean, ElemType* runInvStdDev,
double epsilon, ElemType* xMean, ElemType* xInvStdDev, cudaStream_t stream)
{
assert((vectorSize % U) == 0);
const int BlockDimX = 32 / U;
const int BlockDimY = 4 * U;
auto bdim = dim3(BlockDimX, BlockDimY);
// Create grid with only one block in y(batch)-dimension as kernel uses striding.
auto gdim = dim3(static_cast<unsigned int>(RoundUpToMultiple(vectorSize, BlockDimX * U)));
kComputeBatchMeanAndInvStdDev<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(batchSize),
x, expAvgFactor, runMean, runInvStdDev, epsilon, xMean, xInvStdDev);
}
};
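// A hypothetical call site, to illustrate the Call dispatch helper defined earlier (the real call sites
// live in the CNTK batch-norm engine wrapper, which is not part of this excerpt):
//
//     // Dispatches to ComputeBatchMeanAndInvStdDev<4>, <2>, or <1> depending on the alignment of vectorSize.
//     // Note that vectorSize is passed twice: once for unroll selection, once as the kernel argument.
//     Call<ComputeBatchMeanAndInvStdDev, float>(vectorSize, vectorSize, batchSize, x, expAvgFactor,
//                                               runMean, runInvStdDev, epsilon, xMean, xInvStdDev, stream);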
template <int U>
struct ComputeSpatialBatchMeanAndInvStdDev
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, const ElemType* x,
double expAvgFactor, ElemType* runMean, ElemType* runInvStdDev,
double epsilon, ElemType* xMean, ElemType* xInvStdDev, cudaStream_t stream)
{
assert((vectorSize % spatialSize) == 0);
assert((spatialSize % U) == 0);
const int BlockDimX = 32 / U;
const int BlockDimY = 4 * U;
auto bdim = dim3(BlockDimX, BlockDimY);
// Create grid with only one block in y(batch)-dimension as kernel uses striding.
// Each thread block processes a single whole feature map independently (i.e. reduces over W, H and N dimensions).
auto gdim = dim3(static_cast<unsigned int>(vectorSize / spatialSize));
kComputeSpatialBatchMeanAndInvStdDev<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize),
x, expAvgFactor, runMean, runInvStdDev,epsilon, xMean, xInvStdDev);
}
};
//--------------------------------------------------------------------
// Forward propagation
// All functions accept input/output tensors in column-major format where each column is one vector of a minibatch.
// In the convolutional case (i.e. spatial=true), each vector is in CHW format where the W dimension has stride = 1.
// Tensors for biases and inverse stddevs have dimensions equal to the vector dimension in the non-convolutional case (i.e. spatial=false),
// or Cx1x1 in the convolutional case.
//--------------------------------------------------------------------
template <int BlockDimX, int BlockDimY, bool Spatial, int U, typename ElemType>
__global__ void kNormalizeBatchTraining(int vectorSize, int spatialSize, int batchSize, const ElemType* x, ElemType* y,
const ElemType* bnScale, const ElemType* bnBias, const ElemType* batchMean, const ElemType* batchInvStdDev)
{
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
assert(blockDim.x == BlockDimX);
assert(blockDim.y == BlockDimY);
assert(blockDim.z == 1);
assert(gridDim.y == 1);
assert(gridDim.z == 1);
assert((vectorSize % U) == 0);
assert(!Spatial || (spatialSize % U) == 0);
assert((vectorSize % spatialSize) == 0);
int irowBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
if (irowBase >= vectorSize)
return;
assert(irowBase + U <= vectorSize);
__shared__ ElemType meanS[BlockDimX * U];
__shared__ ElemType invStdDevS[BlockDimX * U];
__shared__ ElemType scaleS[BlockDimX * U];
__shared__ ElemType biasS[BlockDimX * U];
int offs = threadIdx.x * U;
// REVIEW alexeyk: optimize smem usage, reduce transaction count (is it worth it?).
if (threadIdx.y == 0)
{
if (Spatial)
{
#pragma unroll
for (int k = 0; k < U; k++)
{
int imap = (irowBase + k) / spatialSize;
meanS[offs + k] = batchMean[imap];
invStdDevS[offs + k] = batchInvStdDev[imap];
scaleS[offs + k] = bnScale[imap];
biasS[offs + k] = bnBias[imap];
}
}
else
{
LoadValues<U>(batchMean + irowBase, meanS + offs);
LoadValues<U>(batchInvStdDev + irowBase, invStdDevS + offs);
LoadValues<U>(bnScale + irowBase, scaleS + offs);
LoadValues<U>(bnBias + irowBase, biasS + offs);
}
}
__syncthreads();
ElemType mean[U];
ElemType invStdDev[U];
ElemType scale[U];
ElemType bias[U];
LoadValues<U>(meanS + offs, mean);
LoadValues<U>(invStdDevS + offs, invStdDev);
LoadValues<U>(scaleS + offs, scale);
LoadValues<U>(biasS + offs, bias);
int icol = blockIdx.y * BlockDimY + threadIdx.y;
size_t startOffs = static_cast<size_t>(icol) * vectorSize + irowBase;
const ElemType* psrc = x + startOffs;
ElemType* pdst = y + startOffs;
size_t stride = static_cast<size_t>(gridDim.y * BlockDimY) * vectorSize;
for (; icol < batchSize; icol += gridDim.y * BlockDimY, psrc += stride, pdst += stride)
{
ElemType val[U];
LoadValues<U>(psrc, val);
#pragma unroll
for (int k = 0; k < U; k++)
{
val[k] = scale[k] * (val[k] - mean[k]) * invStdDev[k] + bias[k];
}
StoreValues<U>(val, pdst);
}
}
template <int U>
struct NormalizeBatchTraining
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, bool spatial, const ElemType* x, ElemType* y,
const ElemType* bnScale, const ElemType* bnBias, const ElemType* batchMean, const ElemType* batchInvStdDev, cudaStream_t stream)
{
assert((vectorSize % U) == 0);
const int BlockDimX = 32 / U;
const int BlockDimY = 4 * U;
auto bdim = dim3(BlockDimX, BlockDimY);
// Create a grid that uses striding in the y-dimension to cover the whole minibatch.
auto gdim = dim3((unsigned int)RoundUpToMultiple(vectorSize, BlockDimX * U));
if (spatial)
{
kNormalizeBatchTraining<BlockDimX, BlockDimY, true, U><<<gdim, bdim, 0, stream>>>(
(int)vectorSize, (int)spatialSize, (int)batchSize, x, y, bnScale, bnBias,
batchMean, batchInvStdDev);
}
else
{
kNormalizeBatchTraining<BlockDimX, BlockDimY, false, U><<<gdim, bdim, 0, stream>>>(
(int)vectorSize, (int)spatialSize, (int)batchSize, x, y, bnScale, bnBias,
batchMean, batchInvStdDev);
}
}
};
//--------------------------------------------------------------------
// Backpropagation
// BatchNormalizationBackward back-propagates derivatives of batch normalization function
// with respect to the inputs and scale and bias parameters.
// All tensor dimensions and assumptions are the same as in case of forward propagation.
//--------------------------------------------------------------------
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
__global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, const ElemType* x, const ElemType* dy, ElemType* dScale, ElemType* dBias,
const ElemType* saveMean, const ElemType* saveInvStdDev)
{
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
static_assert(((BlockDimY - 1) & BlockDimY) == 0, "BlockDimY must be a power of 2.");
assert((vectorSize % U) == 0);
assert(blockDim.x == BlockDimX);
assert(blockDim.y == BlockDimY);
assert(blockDim.z == 1);
assert(gridDim.y == 1);
assert(gridDim.z == 1);
// REVIEW alexeyk: first part looks very similar to kComputeBatchMeanAndInvStdDev, any chance to refactor?
int irowSrcBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
if (irowSrcBase >= vectorSize)
return;
assert(irowSrcBase + U <= vectorSize);
ElemType mean[U];
ElemType invStdDev[U];
__shared__ ElemType meanS[BlockDimX * U];
__shared__ ElemType invStdDevS[BlockDimX * U];
// Read mean and inv std dev.
if (threadIdx.y == 0)
{
LoadValues<U>(saveMean + irowSrcBase, mean);
LoadValues<U>(saveInvStdDev + irowSrcBase, invStdDev);
StoreValues<U>(mean, &meanS[threadIdx.x * U]);
StoreValues<U>(invStdDev, &invStdDevS[threadIdx.x * U]);
}
__syncthreads();
if (threadIdx.y != 0)
{
LoadValues<U>(&meanS[threadIdx.x * U], mean);
LoadValues<U>(&invStdDevS[threadIdx.x * U], invStdDev);
}
ElemType ds[U];
ElemType db[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
ds[k] = 0;
db[k] = 0;
}
int icolSrc = threadIdx.y;
size_t startOffs = static_cast<size_t>(icolSrc) * vectorSize + irowSrcBase;
const ElemType* px = x + startOffs;
const ElemType* pdy = dy + startOffs;
size_t stride = static_cast<size_t>(vectorSize) * BlockDimY;
// Stride over all vectors in the batch.
for (; icolSrc < batchSize; icolSrc += BlockDimY, px += stride, pdy += stride)
{
ElemType curX[U];
ElemType curdY[U];
LoadValues<U>(px, curX);
LoadValues<U>(pdy, curdY);
#pragma unroll
for (int k = 0; k < U; k++)
{
ds[k] += curdY[k] * (curX[k] - mean[k]) * invStdDev[k];
db[k] += curdY[k];
}
}
// Final reduction.
__shared__ ElemType dsS[BlockDimY][BlockDimX * U];
__shared__ ElemType dbS[BlockDimY][BlockDimX * U];
StoreValues<U>(ds, &dsS[threadIdx.y][threadIdx.x * U]);
StoreValues<U>(db, &dbS[threadIdx.y][threadIdx.x * U]);
__syncthreads();
// Very simple block reduction. As the block y dim is small (e.g. 16), the loop
// executes only a few times (e.g. 4), so the performance is good.
// Could potentially be improved by using shuffle instructions (as in kComputeBatchMeanAndInvStdDev).
#pragma unroll
for (int y = BlockDimY / 2; y > 0; y /= 2)
{
if (threadIdx.y < y)
{
#pragma unroll
for (int k = 0; k < U; k++)
{
dsS[threadIdx.y][threadIdx.x * U + k] += dsS[threadIdx.y + y][threadIdx.x * U + k];
dbS[threadIdx.y][threadIdx.x * U + k] += dbS[threadIdx.y + y][threadIdx.x * U + k];
}
}
// Note: __syncthreads() must be reached by all threads in the block, so keep it outside the divergent branch.
__syncthreads();
}
// Write results.
if (threadIdx.y == 0)
{
#pragma unroll
for (int k = 0; k < U; k++)
{
dScale[irowSrcBase + k] = dsS[0][threadIdx.x * U + k];
dBias[irowSrcBase + k] = dbS[0][threadIdx.x * U + k];
}
}
}
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
__global__ void kComputeSpatialScaleAndBiasGradients(int vectorSize, int spatialSize, int batchSize, const ElemType* x, const ElemType* dy,
ElemType* dScale, ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev)
{
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
assert(blockDim.x == BlockDimX);
assert(blockDim.y == BlockDimY);
assert(blockDim.z == 1);
assert(gridDim.y == 1);
assert(gridDim.z == 1);
assert((spatialSize % U) == 0);
assert((vectorSize % spatialSize) == 0);
int irowBase = blockIdx.x * spatialSize + threadIdx.x * U;
if (irowBase >= vectorSize)
return;
assert(irowBase + U <= vectorSize);
int irowLim = (blockIdx.x + 1) * spatialSize;
ElemType mean;
ElemType invStdDev;
__shared__ ElemType meanS;
__shared__ ElemType invStdDevS;
const int tid = threadIdx.y * BlockDimX + threadIdx.x;
// Read mean and inv std dev.
if (tid == 0)
{
meanS = saveMean[blockIdx.x];
invStdDevS = saveInvStdDev[blockIdx.x];
}
__syncthreads();
// All threads (including tid 0, whose locals would otherwise stay uninitialized) read the shared copies.
mean = meanS;
invStdDev = invStdDevS;
ElemType ds[U];
ElemType db[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
ds[k] = 0;
db[k] = 0;
}
int icolSrc = threadIdx.y;
size_t startOffs = static_cast<size_t>(icolSrc) * vectorSize + irowBase;
const ElemType* pxBase = x + startOffs;
const ElemType* pdyBase = dy + startOffs;
size_t stride = static_cast<size_t>(vectorSize) * BlockDimY;
// Stride over all vectors in the batch.
for (; icolSrc < batchSize; icolSrc += BlockDimY, pxBase += stride, pdyBase += stride)
{
const ElemType* px = pxBase;
const ElemType* pdy = pdyBase;
// Stride over all values in feature map (W and H dimensions).
for (int irow = irowBase; irow < irowLim; irow += BlockDimX * U, px += BlockDimX * U, pdy += BlockDimX * U)
{
ElemType curX[U];
ElemType curdY[U];
LoadValues<U>(px, curX);
LoadValues<U>(pdy, curdY);
#pragma unroll
for (int k = 0; k < U; k++)
{
ds[k] += curdY[k] * (curX[k] - mean) * invStdDev;
db[k] += curdY[k];
}
}
}
__syncthreads();
using BlockReduce = cub::BlockReduce<ElemType, BlockDimX, cub::BLOCK_REDUCE_WARP_REDUCTIONS, BlockDimY>;
// Note: must use separate temp storages for each reduction.
__shared__ typename BlockReduce::TempStorage tmp1;
ElemType dsRes = BlockReduce(tmp1).Sum(ds);
__shared__ typename BlockReduce::TempStorage tmp2;
ElemType dbRes = BlockReduce(tmp2).Sum(db);
if (tid == 0)
{
dScale[blockIdx.x] = dsRes;
dBias[blockIdx.x] = dbRes;
}
}
template <int U>
struct ComputeScaleAndBiasGradients
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t batchSize, const ElemType* x, const ElemType* dy,
ElemType* dScale, ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream)
{
assert((vectorSize % U) == 0);
const int BlockDimX = 32 / U;
const int BlockDimY = 4 * U;
auto bdim = dim3(BlockDimX, BlockDimY);
// Create a grid that uses striding in the y-dimension to cover the whole minibatch.
auto gdim = dim3(static_cast<unsigned int>(RoundUpToMultiple(vectorSize, BlockDimX * U)));
kComputeScaleAndBiasGradients<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(batchSize), x, dy, dScale, dBias, saveMean, saveInvStdDev);
}
};
template <int U>
struct ComputeSpatialScaleAndBiasGradients
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, const ElemType* x, const ElemType* dy,
ElemType* dScale, ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream)
{
assert((spatialSize % U) == 0);
assert((vectorSize % spatialSize) == 0);
const int BlockDimX = 32 / U;
const int BlockDimY = 4 * U;
auto bdim = dim3(BlockDimX, BlockDimY);
// Create a grid that uses striding in the y-dimension to cover the whole minibatch.
auto gdim = dim3(static_cast<unsigned int>(vectorSize / spatialSize));
kComputeSpatialScaleAndBiasGradients<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dScale, dBias, saveMean, saveInvStdDev);
}
};
template <int BlockDimX, int BlockDimY, bool Spatial, int U, typename ElemType>
__global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize, int batchSize, const ElemType* x, const ElemType* dy, ElemType* dx,
const ElemType* bnScale, const ElemType* dScale, const ElemType* dBias,
const ElemType* saveMean, const ElemType* saveInvStdDev)
{
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
assert(blockDim.x == BlockDimX);
assert(blockDim.y == BlockDimY);
assert(blockDim.z == 1);
assert(gridDim.z == 1);
assert((vectorSize % U) == 0);
assert(Spatial || spatialSize == 1);
assert(!Spatial || (spatialSize % U) == 0);
assert((vectorSize % spatialSize) == 0);
int irowBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
if (irowBase >= vectorSize)
return;
assert(irowBase + U <= vectorSize);
ElemType scale[U];
ElemType ds[U];
ElemType db[U];
ElemType mean[U];
ElemType invStdDev[U];
// REVIEW alexeyk: here we're wasting some bandwidth but this might be ok as it's a one-timer.
if (Spatial)
{
#pragma unroll
for (int k = 0; k < U; k++)
{
int imap = (irowBase + k) / spatialSize;
scale[k] = bnScale[imap];
ds[k] = dScale[imap];
db[k] = dBias[imap];
mean[k] = saveMean[imap];
invStdDev[k] = saveInvStdDev[imap];
}
}
else
{
LoadValues<U>(bnScale + irowBase, scale);
LoadValues<U>(dScale + irowBase, ds);
LoadValues<U>(dBias + irowBase, db);
LoadValues<U>(saveMean + irowBase, mean);
LoadValues<U>(saveInvStdDev + irowBase, invStdDev);
}
int icol = blockIdx.y * BlockDimY + threadIdx.y;
size_t startOffs = static_cast<size_t>(icol) * vectorSize + irowBase;
const ElemType* px = x + startOffs;
const ElemType* pdy = dy + startOffs;
ElemType* pdx = dx + startOffs;
size_t stride = static_cast<size_t>(gridDim.y * BlockDimY) * vectorSize;
for (; icol < batchSize; icol += gridDim.y * BlockDimY, px += stride, pdy += stride, pdx += stride)
{
ElemType xCur[U];
ElemType dyCur[U];
ElemType dxCur[U];
LoadValues<U>(px, xCur);
LoadValues<U>(pdy, dyCur);
LoadValues<U>(pdx, dxCur);
// From the BN paper, dL/dxi is a sum of three terms: dL/dxi = t1 + t2 + t3
// After simplification, they become the following:
// 1. t1 = scale * dL/dyi * invStdDev
// 2. t2 = (-scale / m) * invStdDev * xHat * dL/dScale
// 3. t3 = (-scale / m) * invStdDev * dL/dBias (for this one note that Sum(xHat) == 0)
// Simplifying this a bit more, we get the formula below.
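// Explicitly, with xHat_i = (x_i - mean) * invStdDev, dL/dScale = Sum_i(dL/dy_i * xHat_i) and dL/dBias = Sum_i(dL/dy_i):
//   dL/dx_i = scale * invStdDev * (dL/dy_i - (xHat_i * dL/dScale + dL/dBias) / m)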
ElemType val[U];
int m = Spatial ? batchSize * spatialSize : batchSize;
#pragma unroll
for (int k = 0; k < U; k++)
{
ElemType xNorm = (xCur[k] - mean[k]) * invStdDev[k];
val[k] = dxCur[k] + (scale[k] * invStdDev[k]) * (dyCur[k] - (xNorm * ds[k] + db[k]) / m);
}
StoreValues<U>(val, pdx);
}
}
template <int U>
struct BackpropagateBatchNormGradients
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, bool spatial, const ElemType* x, const ElemType* dy, ElemType* dx,
const ElemType* bnScale, const ElemType* dScale, const ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream)
{
assert((vectorSize % U) == 0);
const int BlockDimX = 32 / U;
const int BlockDimY = 4 * U;
auto bdim = dim3(BlockDimX, BlockDimY);
auto gdim = dim3(static_cast<unsigned int>(RoundUpToMultiple(vectorSize, BlockDimX * U)),
static_cast<unsigned int>(RoundUpToMultiple(batchSize, BlockDimY)));
if (spatial)
{
kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, true, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, dScale, dBias, saveMean, saveInvStdDev);
}
else
{
kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, false, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, dScale, dBias, saveMean, saveInvStdDev);
}
}
};
} } }

Source/Math/Convolution.cuh (new file, 272 lines)
View file

@ -0,0 +1,272 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <math_constants.h>
namespace Microsoft { namespace MSR { namespace CNTK {
template <typename ElemType>
__global__ void kConvolutionForward(int batchSize, const ElemType* __restrict__ kernel,
const int* mpRowCol, const int* mpRowIwht,
const int* mpRowRun, const int* __restrict__ runs,
const ElemType* __restrict__ src, int srcVecSize,
ElemType* dst, int dstVecSize)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= dstVecSize)
return;
src += blockIdx.y * srcVecSize;
dst += blockIdx.y * dstVecSize;
for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
{
int colBase = mpRowCol[row];
int ivBase = mpRowIwht[row];
assert(0 <= colBase && colBase < srcVecSize);
ElemType sum = 0;
int i0 = mpRowRun[row];
int skip = runs[i0++];
int size = runs[i0++];
int imask = i0 + size;
for (int i = 0; i < size; i++)
{
if (runs[imask + i] == 0)
continue;
int dcol = runs[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < srcVecSize);
sum += kernel[ivBase + skip + i] * src[colBase + dcol];
}
dst[row] = sum;
src += blockDim.y * srcVecSize;
dst += blockDim.y * dstVecSize;
}
}
template <typename ElemType>
__global__ void kConvolutionBackwardData(int batchSize, const ElemType* __restrict__ kernel,
const int* mpRowCol, const int* mpRowIwht,
const int* mpRowRun, const int* __restrict__ runs,
const ElemType* __restrict__ srcGrad, int srcVecSize,
ElemType* grad, int dstVecSize)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= srcVecSize)
return;
srcGrad += blockIdx.y * srcVecSize;
grad += blockIdx.y * dstVecSize;
for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
{
int colBase = mpRowCol[row];
int ivBase = mpRowIwht[row];
assert(0 <= colBase && colBase < dstVecSize);
ElemType g = srcGrad[row];
int i0 = mpRowRun[row];
int skip = runs[i0++];
int size = runs[i0++];
int imask = i0 + size;
for (int i = 0; i < size; i++)
{
if (runs[imask + i] == 0)
continue;
int dcol = runs[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < dstVecSize);
atomicAdd(&grad[colBase + dcol], g * kernel[ivBase + skip + i]);
}
srcGrad += blockDim.y * srcVecSize;
grad += blockDim.y * dstVecSize;
}
}
template <typename ElemType>
__global__ void kConvolutionBackwardKernel(int batchSize, int inVecSize, int outVecSize,
const ElemType* __restrict__ in,
const int* mpRowCol, const int* mpRowIwht,
const int* mpRowRun, const int* __restrict__ runs,
const ElemType* __restrict__ srcGrad,
ElemType* kernelGrad)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= outVecSize)
return;
in += blockIdx.y * inVecSize;
srcGrad += blockIdx.y * outVecSize;
for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
{
int colBase = mpRowCol[row];
int ivBase = mpRowIwht[row];
assert(0 <= colBase && colBase < inVecSize);
ElemType g = srcGrad[row];
int i0 = mpRowRun[row];
int skip = runs[i0++];
int size = runs[i0++];
int imask = i0 + size;
for (int i = 0; i < size; i++)
{
if (runs[imask + i] == 0)
continue;
int dcol = runs[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < inVecSize);
atomicAdd(&kernelGrad[ivBase + skip + i], g * in[colBase + dcol]);
}
in += blockDim.y * inVecSize;
srcGrad += blockDim.y * outVecSize;
}
}
template <typename ElemType>
__global__ void kMaxPoolingForward(int batchSize, const int* mpRowCol, const int* mpRowIndices, const int* indices,
const ElemType* __restrict__ src, int srcVecSize,
ElemType* dst, int dstVecSize)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= dstVecSize)
return;
src += blockIdx.y * srcVecSize;
dst += blockIdx.y * dstVecSize;
for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
{
int colBase = mpRowCol[row];
assert(0 <= colBase && colBase < srcVecSize);
int i0 = mpRowIndices[row];
int size = indices[i0++];
ElemType res = src[colBase + indices[i0]];
for (int i = 1; i < size; i++)
{
int dcol = indices[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < srcVecSize);
res = max(res, src[colBase + dcol]);
}
dst[row] = res;
src += blockDim.y * srcVecSize;
dst += blockDim.y * dstVecSize;
}
}
template <typename ElemType>
__global__ void kMaxPoolingBackward(int batchSize, const ElemType* out, const ElemType* in,
const int* mpRowCol, const int* mpRowIndices, const int* indices,
const ElemType* __restrict__ srcGrad, int srcVecSize,
ElemType* grad, int dstVecSize)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= srcVecSize)
return;
in += blockIdx.y * dstVecSize;
out += blockIdx.y * srcVecSize;
srcGrad += blockIdx.y * srcVecSize;
grad += blockIdx.y * dstVecSize;
for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
{
int colBase = mpRowCol[row];
assert(0 <= colBase && colBase < dstVecSize);
int i0 = mpRowIndices[row];
int size = indices[i0++];
assert(size > 0);
ElemType g = srcGrad[row];
ElemType m = out[row];
for (int i = 0; i < size; i++)
{
int dcol = indices[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < dstVecSize);
if (in[colBase + dcol] >= m)
atomicAdd(&grad[colBase + dcol], g);
}
in += blockDim.y * dstVecSize;
out += blockDim.y * srcVecSize;
srcGrad += blockDim.y * srcVecSize;
grad += blockDim.y * dstVecSize;
}
}
template <typename ElemType>
__global__ void kAveragePoolingForward(int batchSize, const int* mpRowCol, const int* mpRowIndices, const int* indices,
const ElemType* __restrict__ src, int srcVecSize,
ElemType* dst, int dstVecSize)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= dstVecSize)
return;
src += blockIdx.y * srcVecSize;
dst += blockIdx.y * dstVecSize;
for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
{
int colBase = mpRowCol[row];
assert(0 <= colBase && colBase < srcVecSize);
int i0 = mpRowIndices[row];
int size = indices[i0++];
ElemType sum = 0;
for (int i = 0; i < size; i++)
{
int dcol = indices[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < srcVecSize);
sum += src[colBase + dcol];
}
dst[row] = sum / size;
src += blockDim.y * srcVecSize;
dst += blockDim.y * dstVecSize;
}
}
template <typename ElemType>
__global__ void kAveragePoolingBackward(int batchSize, const int* mpRowCol, const int* mpRowIndices, const int* indices,
const ElemType* __restrict__ srcGrad, int srcVecSize,
ElemType* grad, int dstVecSize)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= srcVecSize)
return;
srcGrad += blockIdx.y * srcVecSize;
grad += blockIdx.y * dstVecSize;
for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
{
int colBase = mpRowCol[row];
assert(0 <= colBase && colBase < dstVecSize);
int i0 = mpRowIndices[row];
int size = indices[i0++];
assert(size > 0);
ElemType g = srcGrad[row] / size;
for (int i = 0; i < size; i++)
{
int dcol = indices[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < dstVecSize);
atomicAdd(&grad[colBase + dcol], g);
}
srcGrad += blockDim.y * srcVecSize;
grad += blockDim.y * dstVecSize;
}
}
} } }

View file

@ -5,242 +5,295 @@
#include "stdafx.h"
#include "ConvolutionEngine.h"
#include "CuDnnConvolutionEngine.h"
#include "CuDnnFactories.h"
namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
void ConvolutionEngine<ElemType>::Forward(const Tensor4D& inT, const Mat& in, const Filter& filterT, const Mat& filter,
const ConvDesc& convDesc, const Tensor4D& outT, Mat& out, Mat& workspace)
void ConvolutionEngine<ElemType>::Forward(const Mat& in, const Mat& kernel, Mat& out, Mat& workspace)
{
assert(inT.w() * inT.h() * inT.c() == in.GetNumRows());
assert(inT.n() == in.GetNumCols());
assert(filterT.k() == filter.GetNumRows());
assert(filterT.w() * filterT.h() * filterT.c() == filter.GetNumCols());
assert(inT.c() == filterT.c());
assert(outT.c() == filterT.k());
assert(outT.w() * outT.h() * outT.c() == out.GetNumRows());
assert(outT.n() == out.GetNumCols());
EnsureCompatible();
ForwardCore(inT, in, filterT, filter, convDesc, outT, out, workspace);
}
template <class ElemType>
void ConvolutionEngine<ElemType>::BackwardData(const Tensor4D& srcGradT, const Mat& srcGrad, const Filter& filterT, const Mat& filter, const ConvDesc& convDesc,
const Tensor4D& gradT, Mat& grad, Mat& workspace)
{
assert(srcGradT.w() * srcGradT.h() * srcGradT.c() == srcGrad.GetNumRows());
assert(srcGradT.n() == srcGrad.GetNumCols());
assert(filterT.k() == filter.GetNumRows());
assert(filterT.w() * filterT.h() * filterT.c() == filter.GetNumCols());
assert(srcGradT.c() == filterT.k());
assert(gradT.c() == filterT.c());
assert(gradT.w() * gradT.h() * gradT.c() == grad.GetNumRows());
assert(gradT.n() == grad.GetNumCols());
EnsureCompatible();
BackwardDataCore(srcGradT, srcGrad, filterT, filter, convDesc, gradT, grad, workspace);
}
template <class ElemType>
void ConvolutionEngine<ElemType>::BackwardFilter(const Tensor4D& srcGradT, const Mat& srcGrad, const Tensor4D& inT, const Mat& in, const ConvDesc& convDesc,
const Filter& filterT, Mat& filter, bool allowReuse, Mat& workspace)
{
assert(srcGradT.w() * srcGradT.h() * srcGradT.c() == srcGrad.GetNumRows());
assert(srcGradT.n() == srcGrad.GetNumCols());
assert(inT.w() * inT.h() * inT.c() == in.GetNumRows());
assert(inT.n() == in.GetNumCols());
assert(srcGradT.c() == filterT.k());
assert(inT.c() == filterT.c());
assert(filterT.k() == filter.GetNumRows());
assert(filterT.w() * filterT.h() * filterT.c() == filter.GetNumCols());
EnsureCompatible();
BackwardFilterCore(srcGradT, srcGrad, inT, in, convDesc, filterT, filter, allowReuse, workspace);
}
template <class ElemType>
void ConvolutionEngine<ElemType>::NormalizeBatch(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias,
bool spatial, double expAvgFactor, Mat& runMean, Mat& runInvStdDev, Mat& out,
double epsilon, Mat& saveMean, Mat& saveInvStdDev)
{
const size_t crowIn = inT.w() * inT.h() * inT.c();
if (spatial)
{
assert(scaleBiasT.c() == inT.c());
assert(scaleBiasT.w() == 1);
assert(scaleBiasT.h() == 1);
assert(runMean.GetNumRows() == inT.c());
assert(runInvStdDev.GetNumRows() == inT.c());
}
else
{
assert(scaleBiasT.c() == inT.c());
assert(scaleBiasT.w() == inT.w());
assert(scaleBiasT.h() == inT.h());
assert(runMean.GetNumRows() == crowIn);
assert(runInvStdDev.GetNumRows() == crowIn);
}
assert(scaleBiasT.n() == 1);
assert(crowIn == in.GetNumRows());
assert(crowIn == out.GetNumRows());
assert(inT.n() == in.GetNumCols());
assert(inT.n() == out.GetNumCols());
assert(bias.GetNumCols() == 1);
assert(scale.GetNumCols() == 1);
assert(runMean.GetNumCols() == 1);
assert(runInvStdDev.GetNumCols() == 1);
assert(runMean.GetNumCols() == saveMean.GetNumCols());
assert(runMean.GetNumRows() == saveMean.GetNumRows());
assert(runInvStdDev.GetNumCols() == saveInvStdDev.GetNumCols());
assert(runInvStdDev.GetNumRows() == saveInvStdDev.GetNumRows());
#ifndef _DEBUG
UNUSED(crowIn); // crowIn used only in asserts.
const auto& g = *m_geometry;
assert(g.InputShape().GetNumElements() == in.GetNumRows());
assert(g.OutputShape().GetNumElements() == out.GetNumRows());
size_t batchSize = in.GetNumCols();
assert(batchSize == out.GetNumCols());
// REVIEW alexeyk: add shape-aware asserts?
assert(g.KernelShape().GetNumElements() * g.KernelCount() == kernel.GetNumElements());
#ifdef NDEBUG
UNUSED(g);
UNUSED(batchSize);
#endif
EnsureCompatibleBatchNorm(spatial);
NormalizeBatchCore(inT, in, scaleBiasT, scale, bias, spatial, expAvgFactor, runMean, runInvStdDev, out, epsilon, saveMean, saveInvStdDev);
EnsureCompatible();
EnsureConvolutionInitialized();
ForwardCore(in, kernel, out, workspace);
}
template <class ElemType>
void ConvolutionEngine<ElemType>::NormalizeBatchInference(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias,
bool spatial, const Mat& runMean, const Mat& runInvStdDev, Mat& out)
void ConvolutionEngine<ElemType>::BackwardData(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace)
{
const size_t crowIn = inT.w() * inT.h() * inT.c();
if (spatial)
{
assert(scaleBiasT.c() == inT.c());
assert(scaleBiasT.w() == 1);
assert(scaleBiasT.h() == 1);
assert(scaleBiasT.c() == runMean.GetNumRows());
assert(scaleBiasT.c() == runInvStdDev.GetNumRows());
}
else
{
assert(scaleBiasT.c() == inT.c());
assert(scaleBiasT.w() == inT.w());
assert(scaleBiasT.h() == inT.h());
assert(crowIn == runMean.GetNumRows());
assert(crowIn == runInvStdDev.GetNumRows());
}
assert(scaleBiasT.n() == 1);
assert(crowIn == in.GetNumRows());
assert(crowIn == out.GetNumRows());
assert(inT.n() == in.GetNumCols());
assert(inT.n() == out.GetNumCols());
assert(bias.GetNumCols() == 1);
assert(scale.GetNumCols() == 1);
assert(runMean.GetNumCols() == 1);
assert(runInvStdDev.GetNumCols() == 1);
#ifndef _DEBUG
// used only in asserts.
UNUSED(crowIn);
const auto& g = *m_geometry;
assert(g.InputShape().GetNumElements() == grad.GetNumRows());
assert(g.OutputShape().GetNumElements() == srcGrad.GetNumRows());
size_t batchSize = srcGrad.GetNumCols();
assert(batchSize == grad.GetNumCols());
assert(g.KernelShape().GetNumElements() * g.KernelCount() == kernel.GetNumElements());
#ifdef NDEBUG
UNUSED(g);
UNUSED(batchSize);
#endif
EnsureCompatibleBatchNorm(spatial);
NormalizeBatchInferenceCore(inT, in, scaleBiasT, scale, bias, spatial, runMean, runInvStdDev, out);
EnsureCompatible();
EnsureConvolutionInitialized();
BackwardDataCore(srcGrad, kernel, grad, workspace);
}
template <class ElemType>
void ConvolutionEngine<ElemType>::BackwardNormalizeBatch(const Tensor4D& inT, const Mat& in, const Mat& srcGrad, Mat& grad,
const Tensor4D& scaleBiasT, const Mat& scale, bool spatial, const Mat& saveMean, const Mat& saveInvStdDev,
Mat& scaleGrad, Mat& biasGrad)
void ConvolutionEngine<ElemType>::BackwardKernel(const Mat& srcGrad, const Mat& in, Mat& kernel, bool allowReuse, Mat& workspace)
{
const size_t crowIn = inT.w() * inT.h() * inT.c();
if (spatial)
{
assert(scaleBiasT.c() == inT.c());
assert(scaleBiasT.w() == 1);
assert(scaleBiasT.h() == 1);
}
else
{
assert(scaleBiasT.c() == inT.c());
assert(scaleBiasT.w() == inT.w());
assert(scaleBiasT.h() == inT.h());
}
assert(scaleBiasT.n() == 1);
assert(crowIn == in.GetNumRows());
assert(crowIn == srcGrad.GetNumRows());
assert(crowIn == grad.GetNumRows());
assert(inT.n() == in.GetNumCols());
assert(inT.n() == srcGrad.GetNumCols());
assert(inT.n() == grad.GetNumCols());
assert(scaleGrad.GetNumRows() == scale.GetNumRows());
assert(scaleGrad.GetNumCols() == scale.GetNumCols());
assert(biasGrad.GetNumRows() == scale.GetNumRows());
assert(biasGrad.GetNumCols() == scale.GetNumCols());
#ifndef _DEBUG
UNUSED(crowIn); // crowIn used only in asserts.
const auto& g = *m_geometry;
assert(g.InputShape().GetNumElements() == in.GetNumRows());
assert(g.OutputShape().GetNumElements() == srcGrad.GetNumRows());
size_t batchSize = in.GetNumCols();
assert(batchSize == srcGrad.GetNumCols());
assert(g.KernelShape().GetNumElements() * g.KernelCount() == kernel.GetNumElements());
#ifdef NDEBUG
UNUSED(g);
UNUSED(batchSize);
#endif
EnsureCompatibleBatchNorm(spatial);
BackwardNormalizeBatchCore(inT, in, srcGrad, grad, scaleBiasT, scale, spatial, saveMean, saveInvStdDev, scaleGrad, biasGrad);
EnsureCompatible();
EnsureConvolutionInitialized();
BackwardKernelCore(srcGrad, in, kernel, allowReuse, workspace);
}
template <class ElemType>
void ConvolutionEngine<ElemType>::ForwardPooling(const Mat& in, Mat& out)
{
const auto& g = *m_geometry;
assert(g.InputShape().GetNumElements() == in.GetNumRows());
assert(g.OutputShape().GetNumElements() == out.GetNumRows());
size_t batchSize = in.GetNumCols();
assert(batchSize == out.GetNumCols());
#ifdef NDEBUG
UNUSED(g);
UNUSED(batchSize);
#endif
EnsureCompatible();
EnsurePoolingInitialized();
ForwardPoolingCore(in, out);
}
template <class ElemType>
void ConvolutionEngine<ElemType>::BackwardPooling(const Mat& out, const Mat& srcGrad, const Mat& in, Mat& grad)
{
const auto& g = *m_geometry;
assert(g.InputShape().GetNumElements() == grad.GetNumRows());
assert(g.InputShape().GetNumElements() == in.GetNumRows());
assert(g.OutputShape().GetNumElements() == srcGrad.GetNumRows());
assert(g.OutputShape().GetNumElements() == out.GetNumRows());
size_t batchSize = out.GetNumCols();
assert(batchSize == srcGrad.GetNumCols());
assert(batchSize == in.GetNumCols());
assert(batchSize == grad.GetNumCols());
#ifdef NDEBUG
UNUSED(g);
UNUSED(batchSize);
#endif
EnsureCompatible();
EnsurePoolingInitialized();
BackwardPoolingCore(out, srcGrad, in, grad);
}
//------------------------------------------------------------------
// Default (legacy) convolution engine implementation.
// Reference convolution engine implementation.
// This engine supports arbitrary convolution geometry but does not provide efficient implementation.
// Its main purpose is to serve as a baseline for optimized engines (e.g. cuDNN) that
// usually implement only a subset of a general convolution geometry.
//------------------------------------------------------------------
template <class ElemType>
class DefaultConvolutionEngine : public ConvolutionEngine<ElemType>
class ReferenceConvolutionEngine : public ConvolutionEngine<ElemType>
{
public:
using Base = ConvolutionEngine<ElemType>;
using typename Base::Mat;
using typename Base::Tensor4D;
using typename Base::Filter;
using typename Base::ConvDesc;
public:
DefaultConvolutionEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, BatchNormImpl bnImpl)
: Base(deviceId, imageLayout), m_ones(deviceId), m_maxTempMemSizeInSamples(maxTempMemSizeInSamples), m_bnImpl(bnImpl)
ReferenceConvolutionEngine(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, PoolKind poolKind)
: Base(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind),
m_mpRowCol(geometry->MpRowCol().size(), 1, const_cast<int*>(geometry->MpRowCol().data()), deviceId, IsGpu(deviceId) ? matrixFlagNormal : matrixFlagDontOwnBuffer)
{
}
protected:
using Base::m_geometry;
using Base::m_deviceId;
using Base::m_imageLayout;
using Base::m_maxTempMemSizeInSamples;
using Base::m_poolKind;
void EnsureCompatible() override
{
if (m_imageLayout != ImageLayoutKind::CHW)
RuntimeError("Reference convolution engine supports only CHW/cudnn layout.");
}
void EnsureConvolutionInitialized() override
{
if (m_mpRowIwht == nullptr)
{
auto flags = IsGpu(m_deviceId) ? matrixFlagNormal : matrixFlagDontOwnBuffer;
m_mpRowIwht = std::make_unique<Matrix<int>>(m_geometry->MpRowIwht().size(), 1,
const_cast<int*>(m_geometry->MpRowIwht().data()), m_deviceId, flags);
m_mpRowRun = std::make_unique<Matrix<int>>(m_geometry->MpRowRun().size(), 1,
const_cast<int*>(m_geometry->MpRowRun().data()), m_deviceId, flags);
m_runs = std::make_unique<Matrix<int>>(m_geometry->Runs().size(), 1,
const_cast<int*>(m_geometry->Runs().data()), m_deviceId, flags);
}
}
void ForwardCore(const Mat& in, const Mat& kernel, Mat& out, Mat& /*workspace*/) override
{
in.ConvolutionForward(kernel, m_mpRowCol, *m_mpRowIwht, *m_mpRowRun, *m_runs, out);
}
void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& /*workspace*/) override
{
srcGrad.ConvolutionBackwardData(kernel, m_mpRowCol, *m_mpRowIwht, *m_mpRowRun, *m_runs, grad);
}
void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*allowReuse*/, Mat& /*workspace*/) override
{
srcGrad.ConvolutionBackwardKernel(in, m_mpRowCol, *m_mpRowIwht, *m_mpRowRun, *m_runs, kernelGrad);
}
void EnsurePoolingInitialized() override
{
if (m_indices == nullptr)
{
auto flags = IsGpu(m_deviceId) ? matrixFlagNormal : matrixFlagDontOwnBuffer;
m_mpRowIndices = std::make_unique<Matrix<int>>(m_geometry->MpRowIndices().size(), 1,
const_cast<int*>(m_geometry->MpRowIndices().data()), m_deviceId, flags);
m_indices = std::make_unique<Matrix<int>>(m_geometry->Indices().size(), 1,
const_cast<int*>(m_geometry->Indices().data()), m_deviceId, flags);
}
}
void ForwardPoolingCore(const Mat& in, Mat& out) override
{
if (m_poolKind == PoolKind::Max)
{
in.MaxPoolingForward(m_mpRowCol, *m_mpRowIndices, *m_indices, out);
}
else if (m_poolKind == PoolKind::Average)
{
in.AveragePoolingForward(m_mpRowCol, *m_mpRowIndices, *m_indices, out);
}
else
InvalidArgument("Pooling type %d is not supported.", (int)m_poolKind);
}
void BackwardPoolingCore(const Mat& out, const Mat& srcGrad, const Mat& in, Mat& grad) override
{
if (m_poolKind == PoolKind::Max)
{
srcGrad.MaxPoolingBackward(out, in, m_mpRowCol, *m_mpRowIndices, *m_indices, grad);
}
else if (m_poolKind == PoolKind::Average)
{
srcGrad.AveragePoolingBackward(m_mpRowCol, *m_mpRowIndices, *m_indices, grad);
}
else
InvalidArgument("Pooling type %d is not supported.", (int)m_poolKind);
}
private:
static bool IsGpu(DEVICEID_TYPE deviceId)
{
return deviceId >= 0;
}
private:
using IntMatPtr = std::unique_ptr<Matrix<int>>;
Matrix<int> m_mpRowCol;
// Convolution-specific maps.
IntMatPtr m_mpRowIwht;
IntMatPtr m_mpRowRun;
IntMatPtr m_runs;
// Pooling-specific maps.
IntMatPtr m_mpRowIndices;
IntMatPtr m_indices;
};
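// Note (added for exposition): ReferenceConvolutionEngine only wraps the ConvolveGeometry maps as Matrix<int>
// objects and forwards to Matrix<ElemType>::ConvolutionForward/BackwardData/BackwardKernel and the *Pooling*
// counterparts; on GPU these are presumably backed by the kConvolution*/kMaxPooling*/kAveragePooling* CUDA
// kernels shown earlier in this commit. On CPU the maps are wrapped with matrixFlagDontOwnBuffer, so the engine
// reuses ConvolveGeometry's vectors without copying; on GPU matrixFlagNormal is used, presumably so the map data
// ends up on the device.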
//------------------------------------------------------------------
// Legacy convolution engine implementation.
//------------------------------------------------------------------
template <class ElemType>
class LegacyConvolutionEngine : public ConvolutionEngine<ElemType>
{
public:
using Base = ConvolutionEngine<ElemType>;
using typename Base::Mat;
public:
LegacyConvolutionEngine(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, PoolKind poolKind)
: Base(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind),
m_inT(m_geometry->InputShape(), ImageLayoutKind::CHW), m_outT(m_geometry->OutputShape(), ImageLayoutKind::CHW),
m_kernelT(m_geometry->KernelShape(), ImageLayoutKind::CHW), m_strideT(m_geometry->Stride(), ImageLayoutKind::CHW)
{
m_padding = m_geometry->AutoPad()[0];
}
protected:
using Base::m_geometry;
using Base::m_deviceId;
using Base::m_imageLayout;
using Base::m_maxTempMemSizeInSamples;
using Base::m_poolKind;
void EnsureCompatible() override
{
if (m_imageLayout != ImageLayoutKind::HWC)
RuntimeError("Default convolution engine currently supports only HWC/legacy layout.");
RuntimeError("Legacy convolution engine supports only HWC/legacy layout.");
}
void ForwardCore(const Tensor4D& inT, const Mat& in, const Filter& filterT, const Mat& filter, const ConvDesc& convDesc,
const Tensor4D& outT, Mat& out, Mat& workspace) override
void EnsureConvolutionInitialized() override
{
size_t packedInputRows = filterT.w() * filterT.h() * filterT.c();
size_t packedInputColsPerSample = outT.w() * outT.h();
}
void ForwardCore(const Mat& in, const Mat& kernel, Mat& out, Mat& workspace) override
{
size_t batchSize = in.GetNumCols();
size_t packedInputRows = m_kernelT.w() * m_kernelT.h() * m_kernelT.c();
size_t packedInputColsPerSample = m_outT.w() * m_outT.h();
size_t outputSizePerChannel = packedInputColsPerSample;
// size_t packedInputDim = packedInputRows * packedInputColsPerSample; // size of each packed input sample
// size_t inputDim = inT.w() * inT.h() * inT.c(); // size of each input sample
size_t batchSize = inT.n();
size_t maxTempMemSizeInSamples = (m_maxTempMemSizeInSamples == 0 ? batchSize : m_maxTempMemSizeInSamples);
assert(filter.GetNumCols() == packedInputRows && filter.GetNumRows() == outT.c());
assert(kernel.GetNumCols() == packedInputRows && kernel.GetNumRows() == m_outT.c());
UNUSED(packedInputRows);
// GPU and 1-dimensional image
m_gpuSparseOpt = (filterT.h() == 1 &&
m_gpuSparseOpt = (m_kernelT.h() == 1 &&
in.GetCurrentMatrixLocation() == CurrentDataLocation::GPU &&
convDesc.wStride() == 1 &&
!convDesc.padding() &&
m_strideT.w() == 1 &&
!m_padding &&
in.GetMatrixType() == MatrixType::SPARSE);
m_gpuSparse1D = (m_gpuSparseOpt && inT.h() == 1);
m_gpuSparse1D = (m_gpuSparseOpt && m_inT.h() == 1);
out.SwitchToMatrixType(MatrixType::DENSE, MatrixFormat::matrixFormatDense, false);
// Reshaping is only necessary if we are going to use the unpacking trick
if (m_gpuSparseOpt)
out.Reshape(outT.c() * outT.w(), outT.h() * batchSize);
out.Reshape(m_outT.c() * m_outT.w(), m_outT.h() * batchSize);
else
out.Reshape(outT.c(), outputSizePerChannel * batchSize);
out.Reshape(m_outT.c(), outputSizePerChannel * batchSize);
size_t subBatchSize = min(batchSize, maxTempMemSizeInSamples);
size_t numSubBatches = (batchSize + subBatchSize - 1) / subBatchSize;
@ -263,53 +316,51 @@ protected:
if (m_gpuSparseOpt)
{
if (filterT.w() * inT.c() != filter.GetNumCols())
if (m_kernelT.w() * m_inT.c() != kernel.GetNumCols())
LogicError("Kernel width and weight matrix dimensions don't match.");
inputSubBatch.Reshape(inT.c() * inT.w(), inT.h() * smallBatchSize);
Mat outputSubBatch = out.ColumnSlice(startSampleId, outT.h() * smallBatchSize);
Mat::ConvolveAndWeightedAdd(1, filter, false, inputSubBatch, false, 0, outputSubBatch,
static_cast<int>(inT.c()), convDesc.wStride(), convDesc.padding(), true);
inputSubBatch.Reshape(m_inT.c() * m_inT.w(), m_inT.h() * smallBatchSize);
Mat outputSubBatch = out.ColumnSlice(startSampleId, m_outT.h() * smallBatchSize);
Mat::ConvolveAndWeightedAdd(1, kernel, false, inputSubBatch, false, 0, outputSubBatch,
static_cast<int>(m_inT.c()), m_strideT.w(), m_padding, true);
}
else
{
inputSubBatch.SwitchToMatrixType(MatrixType::DENSE, MatrixFormat::matrixFormatDense, true);
workspace.AssignPackedConvolutionInput(inputSubBatch,
inT.w(), inT.h(), inT.c(),
outT.w(), outT.h(), outT.c(),
filterT.w(), filterT.h(), convDesc.wStride(), convDesc.hStride(),
convDesc.padding());
m_inT.w(), m_inT.h(), m_inT.c(),
m_outT.w(), m_outT.h(), m_outT.c(),
m_kernelT.w(), m_kernelT.h(), m_strideT.w(), m_strideT.h(),
m_padding);
Mat outputSubBatch = out.ColumnSlice(outputSizePerChannel * startSampleId, outputSizePerChannel * smallBatchSize);
// workspace.Resize(packedInputRows, packedInputColsPerSample * smallBatchSize);
// BUGBUG: This ^^ destroys the content of the matrix. Also it seems not to change the size. Does it? Should this be a Reshape()?
Mat::Multiply(filter, false, workspace, false, outputSubBatch);
Mat::Multiply(kernel, false, workspace, false, outputSubBatch);
}
}
out.Reshape(outT.c() * outputSizePerChannel, batchSize); // each sample becomes a column
out.Reshape(m_outT.c() * outputSizePerChannel, batchSize); // each sample becomes a column
assert(outT.w() * outT.h() * outT.c() == out.GetNumRows());
assert(outT.n() == out.GetNumCols());
assert(m_outT.w() * m_outT.h() * m_outT.c() == out.GetNumRows());
assert(batchSize == out.GetNumCols());
}
void BackwardDataCore(const Tensor4D& srcGradT, const Mat& srcGrad, const Filter& filterT, const Mat& filter, const ConvDesc& convDesc,
const Tensor4D& gradT, Mat& grad, Mat& workspace) override
void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace) override
{
size_t packedInputRows = filterT.w() * filterT.h() * filterT.c();
size_t packedInputColsPerSample = srcGradT.w() * srcGradT.h();
size_t batchSize = srcGrad.GetNumCols();
size_t packedInputRows = m_kernelT.w() * m_kernelT.h() * m_kernelT.c();
size_t packedInputColsPerSample = m_outT.w() * m_outT.h();
size_t outputSizePerChannel = packedInputColsPerSample;
// size_t packedInputDim = packedInputRows * packedInputColsPerSample; // size of each packed input sample
// size_t inputDim = gradT.w() * gradT.h() * gradT.c(); // size of each input sample
size_t batchSize = srcGradT.n();
// size_t inputDim = m_inT.w() * m_inT.h() * m_inT.c(); // size of each input sample
size_t maxTempMemSizeInSamples = (m_maxTempMemSizeInSamples == 0 ? batchSize : m_maxTempMemSizeInSamples);
// Create slice which is the same as full matrix so we can reshape it.
Matrix<ElemType> srcGradTmp = srcGrad.ColumnSlice(0, srcGrad.GetNumCols());
srcGradTmp.Reshape(srcGradT.c(), outputSizePerChannel * batchSize); // reshape to match the internal operation
srcGradTmp.Reshape(m_outT.c(), outputSizePerChannel * batchSize); // reshape to match the internal operation
size_t subBatchSize = min(batchSize, maxTempMemSizeInSamples);
size_t numSubBatches = (batchSize + subBatchSize - 1) / subBatchSize;
@ -322,31 +373,29 @@ protected:
workspace.Resize(packedInputRows, packedInputColsPerSample * smallBatchSize);
Matrix<ElemType> outputGradientSubBatch = srcGradTmp.ColumnSlice(startSampleId * outputSizePerChannel, smallBatchSize * outputSizePerChannel);
Matrix<ElemType>::Multiply(filter, true, outputGradientSubBatch, false, workspace);
Matrix<ElemType>::Multiply(kernel, true, outputGradientSubBatch, false, workspace);
Matrix<ElemType> inputGradientSubBatch = grad.ColumnSlice(startSampleId, smallBatchSize);
workspace.UnpackConvolutionInput(inputGradientSubBatch,
gradT.w(), gradT.h(), gradT.c(),
srcGradT.w(), srcGradT.h(), srcGradT.c(),
filterT.w(), filterT.h(), convDesc.wStride(), convDesc.hStride(),
convDesc.padding());
m_inT.w(), m_inT.h(), m_inT.c(),
m_outT.w(), m_outT.h(), m_outT.c(),
m_kernelT.w(), m_kernelT.h(), m_strideT.w(), m_strideT.h(),
m_padding);
}
assert(srcGradT.w() * srcGradT.h() * srcGradT.c() == srcGrad.GetNumRows());
assert(srcGradT.n() == srcGrad.GetNumCols());
assert(m_outT.w() * m_outT.h() * m_outT.c() == srcGrad.GetNumRows());
assert(batchSize == srcGrad.GetNumCols());
}
void BackwardFilterCore(const Tensor4D& srcGradT, const Mat& srcGrad, const Tensor4D& inT, const Mat& in, const ConvDesc& convDesc,
const Filter& filterT, Mat& filter, bool allowReuse, Mat& workspace) override
void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool allowReuse, Mat& workspace) override
{
size_t packedInputRows = filterT.w() * filterT.h() * filterT.c();
size_t packedInputColsPerSample = srcGradT.w() * srcGradT.h();
size_t batchSize = in.GetNumCols();
size_t packedInputRows = m_kernelT.w() * m_kernelT.h() * m_kernelT.c();
size_t packedInputColsPerSample = m_outT.w() * m_outT.h();
size_t outputSizePerChannel = packedInputColsPerSample;
// size_t packedInputDim = packedInputRows * packedInputColsPerSample; // size of each packed input sample
// size_t inputDim = m_inputImageLayout.width * m_inputImageLayout.height * m_inputImageLayout.channels; // size of each input sample
size_t batchSize = inT.n();
size_t maxTempMemSizeInSamples = (m_maxTempMemSizeInSamples == 0 ? batchSize : m_maxTempMemSizeInSamples);
// const Matrix<ElemType> & weightMatrix = input0;
@ -354,14 +403,14 @@ protected:
// Create slice which is the same as full matrix so we can reshape it.
Matrix<ElemType> srcGradTmp = srcGrad.ColumnSlice(0, srcGrad.GetNumCols());
srcGradTmp.Reshape(srcGradT.c(), outputSizePerChannel * batchSize); // reshape to match the internal operation
srcGradTmp.Reshape(m_outT.c(), outputSizePerChannel * batchSize); // reshape to match the internal operation
size_t subBatchSize = min(batchSize, maxTempMemSizeInSamples);
size_t numSubBatches = (batchSize + subBatchSize - 1) / subBatchSize;
if (numSubBatches == 1 && allowReuse && !m_gpuSparseOpt) // reuse packed input from evaluation step if it's not changed by either subbatch or recurrent steps.
// REVIEW alexeyk: the following makes an assumption that data in workspace was filled by Forward call and remained unchanged. Find way to enforce/verify that.
Matrix<ElemType>::MultiplyAndAdd(srcGradTmp, false, workspace, true, filter);
Matrix<ElemType>::MultiplyAndAdd(srcGradTmp, false, workspace, true, kernelGrad);
else
{
for (size_t i = 0; i < numSubBatches; i++)
@ -379,16 +428,16 @@ protected:
{
Matrix<ElemType> inputSubBatch(in.GetDeviceId());
inputSubBatch.SetValue(in.ColumnSlice(startSampleID, smallBatchSize));
inputSubBatch.Reshape(inT.c(), smallBatchSize * inT.w() * inT.h());
inputSubBatch.Reshape(m_inT.c(), smallBatchSize * m_inT.w() * m_inT.h());
Matrix<ElemType> inputSubBatchSparseReordered(inputSubBatch.GetNumCols(), inputSubBatch.GetNumRows(), inputSubBatch.GetDeviceId(), MatrixType::SPARSE, MatrixFormat::matrixFormatSparseCSC);
Matrix<ElemType>::TensorShuffleScaleAndAdd(0.0f, inputSubBatch.Transpose(), 1, inT.w(), 1, smallBatchSize * inT.h(), inT.c(), 1.0f, inputSubBatchSparseReordered, inputSubBatchSparseReordered);
Matrix<ElemType>::TensorShuffleScaleAndAdd(0.0f, inputSubBatch.Transpose(), 1, m_inT.w(), 1, smallBatchSize * m_inT.h(), m_inT.c(), 1.0f, inputSubBatchSparseReordered, inputSubBatchSparseReordered);
Matrix<ElemType> outputGradientSubBatchReordered = Matrix<ElemType>::Zeros(smallBatchSize * srcGradT.h() * srcGradT.w(), srcGradT.c(), outputGradientSubBatch.GetDeviceId());
Matrix<ElemType>::TensorShuffleScaleAndAdd(0.0f, outputGradientSubBatch.Transpose(), 1, srcGradT.w(), 1, smallBatchSize * srcGradT.h(), srcGradT.c(), 1.0f, outputGradientSubBatchReordered, outputGradientSubBatchReordered);
Matrix<ElemType> outputGradientSubBatchReordered = Matrix<ElemType>::Zeros(smallBatchSize * m_outT.h() * m_outT.w(), m_outT.c(), outputGradientSubBatch.GetDeviceId());
Matrix<ElemType>::TensorShuffleScaleAndAdd(0.0f, outputGradientSubBatch.Transpose(), 1, m_outT.w(), 1, smallBatchSize * m_outT.h(), m_outT.c(), 1.0f, outputGradientSubBatchReordered, outputGradientSubBatchReordered);
filter.Reshape(srcGradT.c() * filterT.w(), inT.c());
Matrix<ElemType>::ConvolveAndWeightedAdd(1, outputGradientSubBatchReordered, true, inputSubBatchSparseReordered, false, 1, filter, smallBatchSize * inT.h(), convDesc.wStride(), convDesc.padding(), false);
filter.Reshape(srcGradT.c(), inT.c() * filterT.w());
kernelGrad.Reshape(m_outT.c() * m_kernelT.w(), m_inT.c());
Matrix<ElemType>::ConvolveAndWeightedAdd(1, outputGradientSubBatchReordered, true, inputSubBatchSparseReordered, false, 1, kernelGrad, smallBatchSize * m_inT.h(), m_strideT.w(), m_padding, false);
kernelGrad.Reshape(m_outT.c(), m_inT.c() * m_kernelT.w());
}
else
{
@ -396,288 +445,107 @@ protected:
Matrix<ElemType> inputSubBatch = in.ColumnSlice(startSampleID, smallBatchSize);
inputSubBatch.SwitchToMatrixType(MatrixType::DENSE, inputSubBatch.GetFormat(), true);
workspace.AssignPackedConvolutionInput(inputSubBatch,
inT.w(), inT.h(), inT.c(),
srcGradT.w(), srcGradT.h(), srcGradT.c(),
filterT.w(), filterT.h(), convDesc.wStride(), convDesc.hStride(),
convDesc.padding());
m_inT.w(), m_inT.h(), m_inT.c(),
m_outT.w(), m_outT.h(), m_outT.c(),
m_kernelT.w(), m_kernelT.h(), m_strideT.w(), m_strideT.h(),
m_padding);
Matrix<ElemType>::MultiplyAndAdd(outputGradientSubBatch, false, workspace, true, filter);
Matrix<ElemType>::MultiplyAndAdd(outputGradientSubBatch, false, workspace, true, kernelGrad);
}
}
}
assert(srcGradT.w() * srcGradT.h() * srcGradT.c() == srcGrad.GetNumRows());
assert(srcGradT.n() == srcGrad.GetNumCols());
assert(m_outT.w() * m_outT.h() * m_outT.c() == srcGrad.GetNumRows());
assert(batchSize == srcGrad.GetNumCols());
}
void EnsureCompatibleBatchNorm(bool spatial) override
void EnsurePoolingInitialized() override
{
if (m_deviceId >= 0)
InvalidArgument("This engine does not support batch normalization on GPUs.");
if (m_bnImpl != BatchNormImpl::Cntk)
InvalidArgument("Only CNTK batch normalization implementation is supported by this engine.");
if (spatial && m_imageLayout != ImageLayoutKind::CHW)
InvalidArgument("This engine batch normalization currently supports only CHW data layout for convolutional nodes.");
}
void NormalizeBatchCore(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias,
bool spatial, double expAvgFactor, Mat& runMean, Mat& runInvStdDev, Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) override
void ForwardPoolingCore(const Mat& in, Mat& out) override
{
UNUSED(inT);
UNUSED(in);
UNUSED(scaleBiasT);
UNUSED(scale);
UNUSED(bias);
UNUSED(out);
UNUSED(spatial);
UNUSED(expAvgFactor);
UNUSED(runMean);
UNUSED(runInvStdDev);
UNUSED(epsilon);
UNUSED(saveMean);
UNUSED(saveInvStdDev);
RuntimeError("Not yet implemented.");
}
void NormalizeBatchInferenceCore(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias,
bool spatial, const Mat& runMean, const Mat& runInvStdDev, Mat& out) override
{
UNUSED(scaleBiasT);
if (spatial)
if (m_poolKind == PoolKind::Max)
{
size_t spatialSize = inT.w() * inT.h();
#pragma omp parallel for
for (long icol = 0; icol < out.GetNumCols(); icol++)
{
for (long irow = 0; irow < out.GetNumRows(); irow++)
{
size_t imap = irow / spatialSize;
out(irow, icol) = scale(imap, 0) * (in(irow, icol) - runMean(imap, 0)) * runInvStdDev(imap, 0) + bias(imap, 0);
}
}
out.AssignMaxPoolingResult(in, m_inT.c(), m_inT.w(), m_inT.h(), m_inT.w() * m_inT.h() * m_inT.c(),
m_outT.w(), m_outT.h(), m_outT.w() * m_outT.h() * m_outT.c(),
m_kernelT.w(), m_kernelT.h(), m_strideT.w(), m_strideT.h());
}
else if (m_poolKind == PoolKind::Average)
{
out.AssignAveragePoolingResult(in, m_inT.c(), m_inT.w(), m_inT.h(), m_inT.w() * m_inT.h() * m_inT.c(),
m_outT.w(), m_outT.h(), m_outT.w() * m_outT.h() * m_outT.c(),
m_kernelT.w(), m_kernelT.h(), m_strideT.w(), m_strideT.h());
}
else
{
#pragma omp parallel for
for (long icol = 0; icol < out.GetNumCols(); icol++)
{
for (long irow = 0; irow < out.GetNumRows(); irow++)
{
out(irow, icol) = scale(irow, 0) * (in(irow, icol) - runMean(irow, 0)) * runInvStdDev(irow, 0) + bias(irow, 0);
}
}
}
InvalidArgument("Pooling type %d is not supported.", (int)m_poolKind);
}
void BackwardNormalizeBatchCore(const Tensor4D& inT, const Mat& in, const Mat& srcGrad, Mat& grad,
const Tensor4D& scaleBiasT, const Mat& scale, bool spatial, const Mat& saveMean, const Mat& saveInvStdDev,
Mat& scaleGrad, Mat& biasGrad) override
void BackwardPoolingCore(const Mat& out, const Mat& srcGrad, const Mat& in, Mat& grad) override
{
UNUSED(inT);
UNUSED(in);
UNUSED(srcGrad);
UNUSED(grad);
UNUSED(scaleBiasT);
UNUSED(scale);
UNUSED(scaleGrad);
UNUSED(biasGrad);
UNUSED(spatial);
UNUSED(saveMean);
UNUSED(saveInvStdDev);
RuntimeError("Not yet implemented.");
if (m_poolKind == PoolKind::Max)
{
grad.AddMaxPoolingGradient(srcGrad, in, out,
m_inT.c(), m_inT.w(), m_inT.h(), m_inT.w() * m_inT.h() * m_inT.c(),
m_outT.w(), m_outT.h(), m_outT.w() * m_outT.h() * m_outT.c(),
m_kernelT.w(), m_kernelT.h(), m_strideT.w(), m_strideT.h());
}
else if (m_poolKind == PoolKind::Average)
{
grad.AddAveragePoolingGradient(srcGrad, m_inT.c(), m_inT.w(), m_inT.h(), m_inT.w() * m_inT.h() * m_inT.c(),
m_outT.w(), m_outT.h(), m_outT.w() * m_outT.h() * m_outT.c(),
m_kernelT.w(), m_kernelT.h(), m_strideT.w(), m_strideT.h());
}
else
InvalidArgument("Pooling type %d is not supported.", (int)m_poolKind);
}
private:
size_t m_maxTempMemSizeInSamples;
BatchNormImpl m_bnImpl;
Mat m_ones;
ImageDimensions m_inT;
ImageDimensions m_outT;
ImageDimensions m_kernelT;
ImageDimensions m_strideT;
bool m_padding;
bool m_gpuSparseOpt;
bool m_gpuSparse1D;
};
template <class ElemType>
std::unique_ptr<ConvolutionEngine<ElemType>> ConvolutionEngine<ElemType>::Create(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, PoolKind poolKind,
ConvolutionEngineKind enabledEngines)
{
auto isEnabled = [=](ConvolutionEngineKind eng) { return ((int)enabledEngines & (int)eng) != 0; };
// Note: in some cases we do not throw an exception even if the parameters do not match, as Create
// can be called from places like MEL with default parameters and never be used.
// The check is done later in the engine's EnsureCompatible call if the engine is actually used.
auto engStr = (std::string)(*geometry);
// Only legacy engine supports HWC layout.
if (imageLayout == ImageLayoutKind::HWC)
{
if (!isEnabled(ConvolutionEngineKind::Legacy))
RuntimeError("Trying to use Legacy convolution engine when it's disabled.");
// REVIEW alexeyk: should honor m_traceLevel here.
fprintf(stderr, "\nUsing legacy convolution engine for geometry: %s.\n", engStr.c_str());
return std::make_unique<LegacyConvolutionEngine<ElemType>>(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind);
}
// Check if we can use the cuDNN engine. No need to validate tensors as ConvolveGeometry has already done that.
if (isEnabled(ConvolutionEngineKind::CuDnn) &&
CuDnnConvolutionEngineFactory<ElemType>::IsSupported(deviceId, geometry, poolKind))
{
fprintf(stderr, "\nUsing cuDNN convolution engine for geometry: %s.\n", engStr.c_str());
return CuDnnConvolutionEngineFactory<ElemType>::Create(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind);
}
if (!isEnabled(ConvolutionEngineKind::Reference))
RuntimeError("Reference convolution is disabled and no other engine supports such configuratin (or disabled).");
fprintf(stderr, "\nUsing reference convolution engine for geometry: %s.\n", engStr.c_str());
return std::make_unique<ReferenceConvolutionEngine<ElemType>>(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind);
}
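// Minimal usage sketch (illustrative only, not part of this file; variable names are hypothetical):
//
//   auto geometry = std::make_shared<ConvolveGeometry>(inputShape, kernelShape, mapCount, stride,
//                                                      sharing, autoPad, lowerPad, upperPad);
//   auto engine = ConvolutionEngine<float>::Create(geometry, deviceId, ImageLayoutKind::CHW,
//                                                  /*maxTempMemSizeInSamples=*/0, PoolKind::None);
//   engine->Forward(in, kernel, out, workspace);             // in/kernel/out/workspace are Matrix<float>
//   engine->BackwardData(srcGrad, kernel, grad, workspace);
//   engine->BackwardKernel(srcGrad, in, kernelGrad, /*allowReuse=*/false, workspace);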
template class ConvolutionEngine<float>;
template class ConvolutionEngine<double>;
//------------------------------------------------------------------
// Pooling engine.
//------------------------------------------------------------------
template <class ElemType>
void PoolingEngine<ElemType>::Forward(const Tensor4D& inT, const Mat& in, const PoolDesc& poolDesc, const Tensor4D& outT, Mat& out)
{
assert(inT.w() * inT.h() * inT.c() == in.GetNumRows());
assert(inT.n() == in.GetNumCols());
assert(outT.w() * outT.h() * outT.c() == out.GetNumRows());
assert(outT.n() == out.GetNumCols());
EnsureCompatible();
ForwardCore(inT, in, poolDesc, outT, out);
}
template <class ElemType>
void PoolingEngine<ElemType>::Backward(const Tensor4D& outT, const Mat& out, const Mat& srcGrad, const PoolDesc& poolDesc, const Tensor4D& inT, const Mat& in, Mat& grad)
{
assert(outT.w() * outT.h() * outT.c() == out.GetNumRows());
assert(outT.n() == out.GetNumCols());
assert(out.GetNumRows() == srcGrad.GetNumRows());
assert(out.GetNumCols() == srcGrad.GetNumCols());
assert(inT.w() * inT.h() * inT.c() == in.GetNumRows());
assert(inT.n() == in.GetNumCols());
assert(in.GetNumRows() == grad.GetNumRows());
assert(in.GetNumCols() == grad.GetNumCols());
EnsureCompatible();
BackwardCore(outT, out, srcGrad, poolDesc, inT, in, grad);
}
//------------------------------------------------------------------
// Default (legacy) pooling engine implementation.
//------------------------------------------------------------------
template <class ElemType>
class DefaultPoolingEngine : public PoolingEngine<ElemType>
{
public:
using Base = PoolingEngine<ElemType>;
using typename Base::Tensor4D;
using typename Base::PoolDesc;
using typename Base::Mat;
public:
DefaultPoolingEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout)
: Base(deviceId, imageLayout)
{
}
protected:
using Base::m_deviceId;
using Base::m_imageLayout;
void EnsureCompatible() override
{
if (m_imageLayout != ImageLayoutKind::HWC)
RuntimeError("Default pooling engine currently supports only HWC/legacy layout.");
}
void ForwardCore(const Tensor4D& inT, const Mat& in, const PoolDesc& poolDesc, const Tensor4D& outT, Mat& out) override
{
if (poolDesc.kind() == PoolDesc::PoolKind::Max)
{
out.AssignMaxPoolingResult(in, inT.c(), inT.w(), inT.h(), inT.w() * inT.h() * inT.c(),
outT.w(), outT.h(), outT.w() * outT.h() * outT.c(),
poolDesc.w(), poolDesc.h(), poolDesc.wStride(), poolDesc.hStride());
}
else if (poolDesc.kind() == PoolDesc::PoolKind::Average)
{
out.AssignAveragePoolingResult(in, inT.c(), inT.w(), inT.h(), inT.w() * inT.h() * inT.c(),
outT.w(), outT.h(), outT.w() * outT.h() * outT.c(),
poolDesc.w(), poolDesc.h(), poolDesc.wStride(), poolDesc.hStride());
}
else
InvalidArgument("Pooling type %d is not supported.", (int)poolDesc.kind());
}
void BackwardCore(const Tensor4D& outT, const Mat& out, const Mat& srcGrad, const PoolDesc& poolDesc, const Tensor4D& inT, const Mat& in, Mat& grad) override
{
if (poolDesc.kind() == PoolDesc::PoolKind::Max)
{
grad.AddMaxPoolingGradient(srcGrad, in, out,
inT.c(), inT.w(), inT.h(), inT.w() * inT.h() * inT.c(),
outT.w(), outT.h(), outT.w() * outT.h() * outT.c(),
poolDesc.w(), poolDesc.h(), poolDesc.wStride(), poolDesc.hStride());
}
else if (poolDesc.kind() == PoolDesc::PoolKind::Average)
{
grad.AddAveragePoolingGradient(srcGrad, inT.c(), inT.w(), inT.h(), inT.w() * inT.h() * inT.c(),
outT.w(), outT.h(), outT.w() * outT.h() * outT.c(),
poolDesc.w(), poolDesc.h(), poolDesc.wStride(), poolDesc.hStride());
}
else
InvalidArgument("Pooling type %d is not supported.", (int)poolDesc.kind());
}
};
template class PoolingEngine<float>;
template class PoolingEngine<double>;
template <class ElemType>
class DefaultConvolutionEngineFactory : public ConvolutionEngineFactory<ElemType>
{
public:
using Base = ConvolutionEngineFactory<ElemType>;
using typename Base::Tensor4D;
using typename Base::Tensor4DPtr;
using typename Base::Filter;
using typename Base::FilterPtr;
using typename Base::ConvDesc;
using typename Base::ConvDescPtr;
using typename Base::PoolDesc;
using typename Base::PoolDescPtr;
using typename Base::ConvEnginePtr;
using typename Base::PoolEnginePtr;
public:
Tensor4DPtr CreateTensor(size_t w, size_t h, size_t c, size_t n) override
{
return std::make_unique<ConvolutionTensor4D>(w, h, c, n);
}
FilterPtr CreateFilter(size_t w, size_t h, size_t c, size_t k) override
{
return std::make_unique<Filter>(w, h, c, k);
}
ConvDescPtr CreateConvDescriptor(const Tensor4D& /*inT*/, const Filter& /*filterT*/,
size_t wStride, size_t hStride, bool padding) override
{
return std::make_unique<ConvDesc>(wStride, hStride, padding);
}
PoolDescPtr CreatePoolDescriptor(typename PoolDesc::PoolKind kind, size_t w, size_t h, size_t wStride, size_t hStride, size_t wPad, size_t hPad) override
{
return std::make_unique<PoolDesc>(kind, w, h, wStride, hStride, wPad, hPad);
}
ConvEnginePtr CreateConvEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, BatchNormImpl bnImpl) override
{
return std::make_unique<DefaultConvolutionEngine<ElemType>>(deviceId, imageLayout, maxTempMemSizeInSamples, bnImpl);
}
PoolEnginePtr CreatePoolEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout) override
{
return std::make_unique<DefaultPoolingEngine<ElemType>>(deviceId, imageLayout);
}
};
template <class ElemType>
std::unique_ptr<ConvolutionEngineFactory<ElemType>> ConvolutionEngineFactory<ElemType>::Create(DEVICEID_TYPE deviceId, EngineType engType, ImageLayoutKind imageLayoutKind)
{
if (engType == EngineType::Auto)
{
// REVIEW alexeyk: make cuDNN default when running on GPU and compiled with cuDNN, add config parameter to enable runtime switch between implementations.
if (deviceId >= 0 && CuDnnConvolutionEngineFactory<ElemType>::IsSupported(deviceId) && imageLayoutKind == ImageLayoutKind::CHW)
return Create(deviceId, EngineType::CuDnn, imageLayoutKind);
else
return Create(deviceId, EngineType::Legacy, imageLayoutKind);
}
else if (engType == EngineType::CuDnn)
{
if (imageLayoutKind != ImageLayoutKind::CHW)
InvalidArgument("ConvolutionEngineFactory: ImageLayout '%s' is not compatible with the cuDNN engine.", ToString(imageLayoutKind).c_str());
if (deviceId >= 0 && CuDnnConvolutionEngineFactory<ElemType>::IsSupported(deviceId))
return std::make_unique<CuDnnConvolutionEngineFactory<ElemType>>();
RuntimeError("cuDNN convolution engine is not supported, check the device id and whether the code was compiled with cuDNN.");
}
else if (engType == EngineType::Legacy)
{
return std::make_unique<DefaultConvolutionEngineFactory<ElemType>>();
}
RuntimeError("Not supported convolution engine type: %d.", (int)engType);
}
template class ConvolutionEngineFactory<float>;
template class ConvolutionEngineFactory<double>;
}}}

View file

@ -5,370 +5,104 @@
#pragma once
// REVIEW alexeyk: this seems to be repeated all over the CNTKMathDll.
#ifdef _WIN32
#ifdef MATH_EXPORTS
#define MATH_API __declspec(dllexport)
#else
#define MATH_API __declspec(dllimport)
#endif
#else // no DLLs on Linux
#define MATH_API
#endif
#include "Matrix.h"
#include "TensorShape.h" // for ImageLayoutKind
#include "ConvolveGeometry.h"
#include "StringUtil.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// REVIEW alexeyk: this is a temp class until we have generic tensor support in CNTK.
class ConvolutionTensor4D
//-------------------------------------------------------------
// Convolution and pooling engine interface.
//-------------------------------------------------------------
enum class ConvolutionEngineKind
{
public:
size_t w() const
{
return m_w;
}
size_t h() const
{
return m_h;
}
size_t c() const
{
return m_c;
}
size_t n() const
{
return m_n;
}
virtual void setN(size_t n)
{
m_n = n;
}
None = 0,
Reference = 1,
CuDnn = 1 << 1,
Legacy = 1 << 2,
public:
ConvolutionTensor4D(size_t w = 1, size_t h = 1, size_t c = 1, size_t n = 1)
{
m_w = w;
m_h = h;
m_c = c;
m_n = n;
}
public:
virtual ~ConvolutionTensor4D() = default;
// Deleting copy ctor/assignment as derived objects may contain non-copyable state.
ConvolutionTensor4D(const ConvolutionTensor4D&) = delete;
ConvolutionTensor4D& operator=(const ConvolutionTensor4D&) = delete;
// REVIEW alexeyk: Have to implement move ctor explicitly as VS2013 does not support default move ctors.
// ConvolutionTensor4D(ConvolutionTensor4D&&);
// ConvolutionTensor4D& operator=(ConvolutionTensor4D&&);
private:
size_t m_w;
size_t m_h;
size_t m_c;
size_t m_n;
All = Reference | CuDnn | Legacy
};
class ConvolutionFilter
enum class PoolKind
{
public:
size_t w() const
{
return m_w;
}
size_t h() const
{
return m_h;
}
size_t c() const
{
return m_c;
}
size_t k() const
{
return m_k;
}
public:
ConvolutionFilter(size_t w = 1, size_t h = 1, size_t c = 1, size_t k = 1)
{
m_w = w;
m_h = h;
m_c = c;
m_k = k;
}
public:
virtual ~ConvolutionFilter() = default;
// Deleting copy ctor/assignment as derived objects may contain non-copyable state.
ConvolutionFilter(const ConvolutionFilter&) = delete;
ConvolutionFilter& operator=(const ConvolutionFilter&) = delete;
private:
size_t m_w;
size_t m_h;
size_t m_c;
size_t m_k;
None,
Max,
Average
};
// ConvolutionDescriptor describes properties specific to convolution application.
class ConvolutionDescriptor
{
public:
// Horizontal stride (in w-dimension).
size_t wStride() const
{
return m_wStride;
}
// Vertical stride (in h-dimension).
size_t hStride() const
{
return m_hStride;
}
bool padding() const
{
return m_padding;
}
public:
ConvolutionDescriptor(size_t wStride = 1, size_t hStride = 1, bool padding = false)
{
m_wStride = wStride;
m_hStride = hStride;
m_padding = padding;
}
public:
virtual ~ConvolutionDescriptor() = default;
// Deleting copy ctor/assignment as derived objects may contain non-copyable state.
ConvolutionDescriptor(const ConvolutionDescriptor&) = delete;
ConvolutionDescriptor& operator=(const ConvolutionDescriptor&) = delete;
private:
size_t m_wStride;
size_t m_hStride;
bool m_padding;
};
// PoolingDescriptor describes properties specific to pooling application.
class PoolingDescriptor
{
public:
enum class PoolKind
{
Max,
Average
};
PoolKind kind() const
{
return m_kind;
}
// Pooling window size.
size_t w() const
{
return m_w;
}
size_t h() const
{
return m_h;
}
// Horizontal stride (in w-dimension).
size_t wStride() const
{
return m_wStride;
}
// Vertical stride (in h-dimension).
size_t hStride() const
{
return m_hStride;
}
// Horizontal pad (in w-dimension).
size_t wPad() const
{
return m_wPad;
}
// Vertical pad (in h-dimension).
size_t hPad() const
{
return m_hPad;
}
public:
PoolingDescriptor(PoolKind kind, size_t w, size_t h, size_t wStride, size_t hStride, size_t wPad, size_t hPad)
{
m_kind = kind;
m_w = w;
m_h = h;
m_wStride = wStride;
m_hStride = hStride;
m_wPad = wPad;
m_hPad = hPad;
}
public:
virtual ~PoolingDescriptor() = default;
// Deleting copy ctor/assignment as derived objects may contain non-copyable state.
PoolingDescriptor(const PoolingDescriptor&) = delete;
PoolingDescriptor& operator=(const PoolingDescriptor&) = delete;
private:
PoolKind m_kind;
size_t m_w;
size_t m_h;
size_t m_wStride;
size_t m_hStride;
size_t m_wPad;
size_t m_hPad;
};
#pragma warning(push)
#pragma warning(disable : 4251)
template <class ElemType>
class MATH_API ConvolutionEngine
{
public:
using Tensor4D = ConvolutionTensor4D;
using Filter = ConvolutionFilter;
using ConvDesc = ConvolutionDescriptor;
using Mat = Matrix<ElemType>;
public:
ConvolutionEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout)
: m_deviceId(deviceId), m_imageLayout(imageLayout)
{
}
virtual ~ConvolutionEngine() = default;
void Forward(const Tensor4D& inT, const Mat& in, const Filter& filterT, const Mat& filter, const ConvDesc& convDesc,
const Tensor4D& outT, Mat& out, Mat& workspace);
void Forward(const Mat& in, const Mat& kernel, Mat& out, Mat& workspace);
void BackwardData(const Tensor4D& srcGradT, const Mat& srcGrad, const Filter& filterT, const Mat& filter, const ConvDesc& convDesc,
const Tensor4D& gradT, Mat& grad, Mat& workspace);
void BackwardData(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace);
void BackwardFilter(const Tensor4D& srcGradT, const Mat& srcGrad, const Tensor4D& inT, const Mat& in, const ConvDesc& convDesc,
const Filter& filterT, Mat& filter, bool allowReuse, Mat& workspace);
void BackwardKernel(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool allowReuse, Mat& workspace);
void NormalizeBatch(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias,
bool spatial, double expAvgFactor, Mat& runMean, Mat& runInvStdDev, Mat& out,
double epsilon, Mat& saveMean, Mat& saveInvStdDev);
void ForwardPooling(const Mat& in, Mat& out);
void NormalizeBatchInference(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias,
bool spatial, const Mat& runMean, const Mat& runInvStdDev, Mat& out);
void BackwardPooling(const Mat& out, const Mat& srcGrad, const Mat& in, Mat& grad);
void BackwardNormalizeBatch(const Tensor4D& inT, const Mat& in, const Mat& srcGrad, Mat& grad,
const Tensor4D& scaleBiasT, const Mat& scale, bool spatial, const Mat& saveMean, const Mat& saveInvStdDev,
Mat& scaleGrad, Mat& biasGrad);
std::shared_ptr<const ConvolveGeometry> Geometry() const { return m_geometry; }
static std::unique_ptr<ConvolutionEngine<ElemType>> Create(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout,
size_t maxTempMemSizeInSamples, PoolKind poolKind = PoolKind::None, ConvolutionEngineKind enabledEngines = ConvolutionEngineKind::All);
DISABLE_COPY_AND_MOVE(ConvolutionEngine);
protected:
virtual void EnsureCompatible() = 0;
virtual void ForwardCore(const Tensor4D& inT, const Mat& in, const Filter& filterT, const Mat& filter, const ConvDesc& convDesc,
const Tensor4D& outT, Mat& out, Mat& workspace) = 0;
virtual void BackwardDataCore(const Tensor4D& srcGradT, const Mat& srcGrad, const Filter& filterT, const Mat& filter, const ConvDesc& convDesc,
const Tensor4D& gradT, Mat& grad, Mat& workspace) = 0;
virtual void BackwardFilterCore(const Tensor4D& srcGradT, const Mat& srcGrad, const Tensor4D& inT, const Mat& in, const ConvDesc& convDesc,
const Filter& filterT, Mat& filter, bool allowReuse, Mat& workspace) = 0;
virtual void EnsureCompatibleBatchNorm(bool spatial) = 0;
virtual void NormalizeBatchCore(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias,
bool spatial, double expAvgFactor, Mat& runMean, Mat& runInvStdDev, Mat& out,
double epsilon, Mat& saveMean, Mat& saveInvStdDev) = 0;
// REVIEW alexeyk: roll into NormalizeBatchCore.
virtual void NormalizeBatchInferenceCore(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias,
bool spatial, const Mat& runMean, const Mat& runInvStdDev, Mat& out) = 0;
virtual void BackwardNormalizeBatchCore(const Tensor4D& inT, const Mat& in, const Mat& srcGrad, Mat& grad,
const Tensor4D& scaleBiasT, const Mat& scale, bool spatial, const Mat& saveMean, const Mat& saveInvStdDev,
Mat& scaleGrad, Mat& biasGrad) = 0;
protected:
DEVICEID_TYPE m_deviceId;
ImageLayoutKind m_imageLayout;
};
template <class ElemType>
class MATH_API PoolingEngine
{
public:
using Tensor4D = ConvolutionTensor4D;
using PoolDesc = PoolingDescriptor;
using Mat = Matrix<ElemType>;
public:
PoolingEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout)
: m_deviceId(deviceId), m_imageLayout(imageLayout)
ConvolutionEngine(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, PoolKind poolKind)
: m_geometry(geometry), m_deviceId(deviceId), m_imageLayout(imageLayout), m_maxTempMemSizeInSamples(maxTempMemSizeInSamples), m_poolKind(poolKind)
{
assert(m_geometry != nullptr);
}
virtual ~PoolingEngine() = default;
void Forward(const Tensor4D& inT, const Mat& in, const PoolDesc& poolDesc, const Tensor4D& outT, Mat& out);
void Backward(const Tensor4D& outT, const Mat& out, const Mat& srcGrad, const PoolDesc& poolDesc, const Tensor4D& inT, const Mat& in, Mat& grad);
DISABLE_COPY_AND_MOVE(PoolingEngine);
protected:
virtual void EnsureCompatible() = 0;
virtual void ForwardCore(const Tensor4D& inT, const Mat& in, const PoolDesc& poolDesc, const Tensor4D& outT, Mat& out) = 0;
virtual void BackwardCore(const Tensor4D& outT, const Mat& out, const Mat& srcGrad, const PoolDesc& poolDesc, const Tensor4D& inT, const Mat& in, Mat& grad) = 0;
virtual void EnsureConvolutionInitialized() = 0;
virtual void ForwardCore(const Mat& in, const Mat& kernel, Mat& out, Mat& workspace) = 0;
virtual void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace) = 0;
virtual void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool allowReuse, Mat& workspace) = 0;
virtual void EnsurePoolingInitialized() = 0;
virtual void ForwardPoolingCore(const Mat& in, Mat& out) = 0;
virtual void BackwardPoolingCore(const Mat& out, const Mat& srcGrad, const Mat& in, Mat& grad) = 0;
protected:
ConvolveGeometryPtr m_geometry;
DEVICEID_TYPE m_deviceId;
ImageLayoutKind m_imageLayout;
size_t m_maxTempMemSizeInSamples;
PoolKind m_poolKind;
};
// REVIEW alexeyk: this is a temporary hack until we find a better place for the BatchNorm engine(s).
enum class BatchNormImpl
#pragma warning(pop)
static inline PoolKind PoolKindFrom(const wstring& s)
{
CuDnn,
Cntk
};
if (s.empty() || AreEqualIgnoreCase(s, L"none"))
return PoolKind::None;
if (AreEqualIgnoreCase(s, L"max"))
return PoolKind::Max;
if (AreEqualIgnoreCase(s, L"average"))
return PoolKind::Average;
InvalidArgument("Unknown pooling kind: '%ls'. Supported values: 'none', 'max', 'average'.", s.c_str());
}
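// For example, PoolKindFrom(L"max") yields PoolKind::Max, while an empty string or L"none" yields PoolKind::None.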
template <class ElemType>
class MATH_API ConvolutionEngineFactory
{
public:
using Tensor4D = ConvolutionTensor4D;
using Tensor4DPtr = std::unique_ptr<Tensor4D>;
using Filter = ConvolutionFilter;
using FilterPtr = std::unique_ptr<ConvolutionFilter>;
using ConvDesc = ConvolutionDescriptor;
using ConvDescPtr = std::unique_ptr<ConvolutionDescriptor>;
using PoolDesc = PoolingDescriptor;
using PoolDescPtr = std::unique_ptr<PoolingDescriptor>;
using ConvEnginePtr = std::unique_ptr<ConvolutionEngine<ElemType>>;
using PoolEnginePtr = std::unique_ptr<PoolingEngine<ElemType>>;
public:
ConvolutionEngineFactory() = default;
virtual ~ConvolutionEngineFactory() = default;
virtual Tensor4DPtr CreateTensor(size_t w, size_t h, size_t c, size_t n) = 0;
virtual FilterPtr CreateFilter(size_t w, size_t h, size_t c, size_t k) = 0;
virtual ConvDescPtr CreateConvDescriptor(const Tensor4D& inT, const Filter& filterT,
size_t wStride, size_t hStride, bool padding) = 0;
virtual PoolDescPtr CreatePoolDescriptor(PoolDesc::PoolKind kind, size_t w, size_t h, size_t wStride, size_t hStride, size_t wPad, size_t hPad) = 0;
// virtual Tensor4DPtr CreateLrnDescriptor() = 0;
virtual ConvEnginePtr CreateConvEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, BatchNormImpl bnImpl) = 0;
virtual PoolEnginePtr CreatePoolEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout) = 0;
enum class EngineType
{
Auto,
CuDnn,
Legacy
};
static std::unique_ptr<ConvolutionEngineFactory<ElemType>> Create(DEVICEID_TYPE deviceId, EngineType engType, ImageLayoutKind imageLayoutKind);
DISABLE_COPY_AND_MOVE(ConvolutionEngineFactory);
};
} } }

View file

@ -0,0 +1,552 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "Basics.h"
#include "TensorShape.h"
#include <iterator>
namespace Microsoft { namespace MSR { namespace CNTK {
// Notes:
// * ConvolveGeometry represents the application of one or more rectangular "kernels" (all of the same size)
// to a rectangular input to produce a rectangular output.
// * A "cell" in the rectangular input is identified by a single coordinate called a "col" (for column).
// * A "cell" in the rectangular output is identified by a single coordinate called a "row".
// * The kernels may involve weights, in which case MpRowIwht indicates the starting index of the weights
// used for a given output cell.
// The overall idea of ConvolveGeometry is to precompute maps that can be used to apply convolutions of
// arbitrary configurations and dimensions. In that case the generic implementation becomes very simple and invariant
// with respect to convolution configuration and dimensionality. For specific cases like 2D/3D convolutions and full sharing,
// highly optimized implementations (e.g. cuDNN) are used.
class ConvolveGeometry final
{
public:
using IntVec = std::vector<int>;
using BoolVec = std::vector<bool>;
const TensorShape& InputShape() const { return m_inputShape; }
const TensorShape& OutputShape() const { return m_outputShape; }
const TensorShape& KernelShape() const { return m_kernelShape; }
const TensorShape& MapCount() const { return m_mapCount; }
const TensorShape& Stride() const { return m_stride; }
const BoolVec& Sharing() const { return m_sharing; }
const BoolVec& AutoPad() const { return m_autoPad; }
const TensorShape& LowerPad() const { return m_lowerPad; }
const TensorShape& UpperPad() const { return m_upperPad; }
// Maps from a "row" (index of output cell) to its base "col" (index of input cell). For a given row,
// the cols that contribute to it are { MpRowCol[row] + Indices[i0 + 1 + i] | 0 <= i < Indices[i0] },
// where i0 = MpRowIndices[row].
const IntVec& MpRowCol() const { return m_mpRowCol; }
// Maps from a "row" (index of output cell) to where to start in the weights array. Each run of weights
// consists of KernelSize weights.
const IntVec& MpRowIwht() const { return m_mpRowIwht; }
// Maps from a "row" (index of output cell) to its starting index in Runs. A run consists of:
// * skip count (to skip that many weights)
// * item count
// * relative indices into source (item count of these)
// * masks (all 1's or all 0's) (item count of these)
// For items that are masked out (0 mask), the index stored is the next valid index.
// This ensures that accessing the corresponding neuron value doesn't fault and that
// backprop operations write the correct value last (any previous writes won't change
// the value).
// NOTE: The first (zeroth) run is always the "full" kernel run. Also, MpRowRun can be empty,
// indicating that all values are zero (all outputs use the "full" kernel run).
const IntVec& MpRowRun() const { return m_mpRowRun; }
const IntVec& Runs() const { return m_runs; }
// Maps from a "row" (index of output cell) to its starting index in Indices. Note that "Runs" is intended
// for kernels that have weights, while "Indices" is intended for kernels that don't need to access weights.
// As a result, the encoding in Indices is simpler and more direct.
// A run in Indices consists of:
// * item count
// * relative indices into source (item count of these)
// NOTE: The first run of indices is always the "full" kernel run. Also, MpRowIndices can be empty,
// indicating that all values are zero (all outputs use the "full" kernel run).
// In addition, all items in Indices are valid source indices so no masking is required in subsequent computation.
const IntVec& MpRowIndices() const { return m_mpRowIndices; }
const IntVec& Indices() const { return m_indices; }
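// A minimal sketch of how a kernel-agnostic engine could consume MpRowCol/MpRowIndices/Indices for one
// sample, following the formula { MpRowCol[row] + Indices[i0 + 1 + i] | 0 <= i < Indices[i0] } above.
// MaxPoolSample and its flat "input"/"output" arrays are hypothetical; they only assume the layouts of
// InputShape()/OutputShape().
//
//   void MaxPoolSample(const ConvolveGeometry& g, const float* input, float* output)
//   {
//       const auto& mpRowCol = g.MpRowCol();
//       const auto& mpRowIndices = g.MpRowIndices();
//       const auto& indices = g.Indices();
//       for (size_t row = 0; row < g.OutputShape().GetNumElements(); row++)
//       {
//           int colBase = mpRowCol[row];
//           // An empty MpRowIndices means every output uses the "full" kernel run starting at 0.
//           int i0 = mpRowIndices.empty() ? 0 : mpRowIndices[row];
//           int count = indices[i0];
//           float res = input[colBase + indices[i0 + 1]];
//           for (int i = 1; i < count; i++)
//               res = std::max(res, input[colBase + indices[i0 + 1 + i]]);
//           output[row] = res;
//       }
//   }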
// Number of kernels (equal to MapCount if sharing is all true values).
size_t KernelCount() const { return m_kernelCount; }
ConvolveGeometry(const TensorShape& inputShape, const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& stride,
const BoolVec& sharing, const BoolVec& autoPad, const TensorShape& lowerPad, const TensorShape& upperPad)
: m_inputShape(inputShape), m_kernelShape(kernelShape), m_mapCount(mapCount), m_stride(stride), m_sharing(sharing),
m_autoPad(autoPad), m_lowerPad(lowerPad), m_upperPad(upperPad)
{
// Note: this ctor is a bit long so sit back and relax.
assert(m_inputShape.GetRank() == m_kernelShape.GetRank());
assert(m_mapCount.GetRank() == 1 || m_mapCount.GetRank() == m_inputShape.GetRank());
assert(m_stride.GetRank() == 1 || m_stride.GetRank() == m_inputShape.GetRank());
assert(m_sharing.size() == 1 || m_sharing.size() == m_inputShape.GetRank());
assert(m_autoPad.size() == 1 || m_autoPad.size() == m_inputShape.GetRank());
assert(m_lowerPad.GetRank() == 1 || m_lowerPad.GetRank() == m_inputShape.GetRank());
assert(m_upperPad.GetRank() == 1 || m_upperPad.GetRank() == m_inputShape.GetRank());
m_outputShape = ComputeOutputShape(m_inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
assert(m_inputShape.GetRank() == m_outputShape.GetRank());
size_t dimCount = inputShape.GetRank();
size_t kernelSize = kernelShape.GetNumElements();
// Compute the total number of kernels.
m_kernelCount = 1;
for (size_t i = 0; i < dimCount; i++)
m_kernelCount *= !GetSharing(i) ? m_outputShape[i] : GetMapCount(i);
// Compute the "Start" indices.
m_start.resize(dimCount);
m_startIndex = 0;
m_originIndex = 0;
for (int i = (int)dimCount - 1; i >= 0; i--)
{
assert((m_outputShape[i] % GetMapCount(i)) == 0);
int outPerMap = (int)(m_outputShape[i] / GetMapCount(i));
// Number of cells between first and last "centers", inclusive.
int cells = (int)((outPerMap - 1) * GetStride(i) + 1);
assert(m_inputShape[i] >= cells);
// Extra cells, to the left and right of "cells".
int extra = (int)m_inputShape[i] - cells;
assert(extra >= 0);
// When LowerPad and/or UpperPad are specified, the Start[i] value is determined by those values.
int lo = GetAutoPad(i) ? 0 : (int)m_lowerPad[m_lowerPad.size() == 1 ? 0 : i];
int hi = GetAutoPad(i) ? 0 : (int)m_upperPad[m_upperPad.size() == 1 ? 0 : i];
if (lo != 0 || hi != 0)
{
assert(extra + lo + hi + 1 == m_kernelShape[i]);
// Compute the number of cells on the left and right parts of the kernel,
// not counting the "kernel-center" cell. If m_kernelShape[i] is even, the extra cell is
// placed on the right (the center is shifted to the left).
int right = (int)m_kernelShape[i] - 1;
int left = right / 2;
right -= left;
assert(left <= right);
assert(right <= left + 1);
assert(lo <= left);
assert(hi <= right);
m_start[i] = left - lo;
assert(m_start[i] + cells + right == m_inputShape[i] + hi);
}
else
{
m_start[i] = extra / 2;
#ifdef _DEBUG
// If we're padding then extra should be covered.
bool padded = GetAutoPad(i);
assert(!padded || extra + 1 <= m_kernelShape[i]);
// If we're not padding, then we should stay within the input dimension.
assert(padded || extra + 1 >= m_kernelShape[i]);
// Compute the number of cells on the left and right parts of the kernel,
// not counting the "kernel-center" cell. If m_kernelShape[i] is even, the extra cell is
// placed on the right (the center is shifted to the left).
int right = (int)m_kernelShape[i] - 1;
int left = right / 2;
right -= left;
assert(0 <= left);
assert(left <= right);
assert(right <= left + 1);
int min = m_start[i] - left;
int max = m_start[i] + (int)cells + right;
assert(!padded || min <= 0 && max >= m_inputShape[i]);
assert(padded || min >= 0 && max <= m_inputShape[i]);
int diff = min - ((int)m_inputShape[i] - max);
assert(std::abs(diff) <= 1);
UNUSED(padded);
UNUSED(diff);
#endif
}
m_startIndex = m_startIndex * (int)m_inputShape[i] + m_start[i];
m_originIndex = m_originIndex * (int)m_inputShape[i] + ((int)m_kernelShape[i] - 1) / 2;
}
// Compute support, mapping from the index into the kernel to offset into source.
// Support consists of the column deltas of the kernels, as offsets from MpRowCol[row].
IntVec support(kernelSize);
std::vector<IntVec> kernelCoords(kernelSize);
for (int idx = 0; idx < kernelSize; idx++)
{
kernelCoords[idx].resize(dimCount);
int ivSrc = 0;
int factor = 1;
int cur = idx;
for (size_t i = 0; i < dimCount; i++)
{
assert(cur >= 0);
int d = (int)m_kernelShape[i];
assert(d > 0);
int coord = cur % d;
cur /= d;
kernelCoords[idx][i] = coord;
ivSrc += factor * coord;
factor *= (int)m_inputShape[i];
}
assert(cur == 0);
assert(ivSrc < m_inputShape.GetNumElements());
support[idx] = ivSrc - m_originIndex;
}
size_t outputSize = m_outputShape.GetNumElements();
// Compute the mappings (where row = output node index, col = source node index):
// * from row to the index of the first weight to use for that row.
// * from row to the first input col. The rest are col + _support[i].
m_mpRowIwht.resize(outputSize);
m_mpRowCol.resize(outputSize);
m_mpRowRun.resize(outputSize);
m_mpRowIndices.resize(outputSize);
// A "key" is an equivalence class of run/masks.
// Calculate the key for an interior cell (for using all of support - when all masks are 1's).
int keyInterior = 0;
for (size_t i = 0; i < dimCount; i++)
{
int width = (int)m_kernelShape[i];
keyInterior = keyInterior * width + (width - 1) / 2;
}
m_runs.resize(2 * kernelSize + 2, -1);
m_indices.resize(kernelSize + 1);
m_runs[0] = 0; // Skip count
m_runs[1] = (int)kernelSize; // Count of entries
m_indices[0] = (int)kernelSize;
for (size_t i = 0; i < kernelSize; i++)
{
m_runs[2 + i] = support[i];
m_indices[1 + i] = support[i];
}
// Working buffer for masks.
IntVec masks(kernelSize);
// Map from key to pair of starting locations in Runs and Indices.
std::map<int, std::pair<int, int>> mpkeystarts;
mpkeystarts[keyInterior] = std::make_pair(0, 0);
IntVec dkey(dimCount);
for (size_t row = 0; row < outputSize; row++)
{
// Compute the kernel number, column, and key.
// REVIEW alexeyk: Seems like there should be a simpler and faster way, without starting
// from scratch for each output (row)....
int kern = 0;
int col = 0;
int factorKern = 1;
int factorCol = 1;
int key = 0;
int cur = (int)row;
for (size_t i = 0; i < dimCount; i++)
{
int dim = (int)(m_outputShape[i] / GetMapCount(i));
int coord = cur % dim;
cur /= dim;
// Kernel
if (!GetSharing(i))
{
kern += factorKern * coord;
factorKern *= dim;
}
int maps = (int)GetMapCount(i);
if (maps > 1)
{
kern += factorKern * (cur % maps);
cur /= maps;
factorKern *= maps;
}
// Transform coord to input index space.
coord *= (int)GetStride(i);
coord += m_start[i];
col += factorCol * coord;
factorCol *= (int)m_inputShape[i];
int width = (int)m_kernelShape[i];
int half = (width - 1) / 2;
int min = coord - half;
int lim = min + width;
if (min < 0)
dkey[i] = min;
else if (lim > m_inputShape[i])
dkey[i] = lim - (int)m_inputShape[i];
else
dkey[i] = 0;
int dk = dkey[i] + half;
assert(0 <= dk);
assert(dk < width);
key = key * width + dk;
}
assert(cur == 0);
assert(0 <= kern);
assert(kern < m_kernelCount);
assert(0 <= col);
assert(col < m_inputShape.GetNumElements());
auto startsIter = mpkeystarts.find(key);
if (startsIter == mpkeystarts.end())
{
auto starts = std::make_pair((int)m_runs.size(), (int)m_indices.size());
mpkeystarts[key] = starts;
int indexCount = 0;
for (int idx = 0; idx < kernelSize; idx++)
{
const auto& coords = kernelCoords[idx];
int mask = 0;
for (int i = (int)dimCount; ; )
{
if (--i < 0)
{
// All OK.
mask = -1;
break;
}
int k = dkey[i] + coords[i];
if (k < 0)
break;
if (k >= m_kernelShape[i])
break;
}
assert(mask == 0 || mask == -1);
indexCount -= mask;
masks[idx] = mask;
}
int skip = 0;
while (masks[skip] == 0)
skip++;
int count = (int)kernelSize;
while (masks[count - 1] == 0)
count--;
count -= skip;
m_runs.push_back(skip); // Skip count
m_runs.push_back(count); // Count of entries
m_indices.push_back(indexCount);
for (int i = 0, iMin = 0; i < count; i++)
{
int index = support[skip + i];
int mask = masks[skip + i];
if (mask != 0)
{
// Add "index" to runs for this slot and any immediately preceeding
// slots that have mask == 0.
assert(iMin <= i);
assert(m_runs.size() == starts.first + 2 + iMin);
for (; iMin <= i; iMin++)
m_runs.push_back(index);
assert(iMin == i + 1);
assert(m_runs.size() == starts.first + 2 + iMin);
m_indices.push_back(index);
}
}
for (int i = 0; i < count; i++)
m_runs.push_back(masks[skip + i]);
assert(m_runs.size() == std::get<0>(starts) + 2 + 2 * count);
assert(m_indices.size() == std::get<1>(starts) + 1 + indexCount);
m_mpRowRun[row] = starts.first;
m_mpRowIndices[row] = starts.second;
}
else
{
m_mpRowRun[row] = (*startsIter).second.first;
m_mpRowIndices[row] = (*startsIter).second.second;
}
assert(0 <= kern);
assert(kern < m_kernelCount);
m_mpRowCol[row] = col;
m_mpRowIwht[row] = kern * (int)kernelSize;
}
}
size_t GetStride(size_t dim) const
{
assert(m_stride.size() == 1 || dim < m_stride.size());
return m_stride[m_stride.size() == 1 ? 0 : dim];
}
size_t GetMapCount(size_t dim) const
{
assert(m_mapCount.size() == 1 || dim < m_mapCount.size());
// If the whole map count tensor was specified explicitly, return the requested component.
if (m_mapCount.size() > 1)
return m_mapCount[dim];
// If the map count tensor rank == 1, assume it represents the number of feature maps for the rightmost dimension.
if (dim == m_inputShape.size() - 1)
return m_mapCount[0];
return 1;
}
bool GetSharing(size_t dim) const
{
assert(m_sharing.size() == 1 || dim < m_sharing.size());
return m_sharing[m_sharing.size() == 1 ? 0 : dim];
}
bool GetAutoPad(size_t dim) const
{
assert(m_autoPad.size() == 1 || dim < m_autoPad.size());
return m_autoPad[m_autoPad.size() == 1 ? 0 : dim];
}
int GetLowerPad(size_t dim) const
{
if (!GetAutoPad(dim))
return (int)m_lowerPad[m_lowerPad.size() == 1 ? 0 : dim];
int kernSize = (int)m_kernelShape[dim];
int inpSize = (int)m_inputShape[dim];
int outSize = (int)m_outputShape[dim];
int stride = (int)GetStride(dim);
// Taken from computation in ConvolveGeometry ctor.
// Number of cells between first and last "centers", inclusive.
int cells = (outSize - 1) * stride + 1;
// Extra cells, to the left and right of "cells".
int extra = inpSize - cells;
int center = extra / 2;
return -(center - (kernSize - 1) / 2);
}
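// Worked example of the auto-pad computation above, assuming a typical "same" configuration:
// kernSize = 5, inpSize = outSize = 28, stride = 1 gives cells = (28 - 1) * 1 + 1 = 28, extra = 0,
// center = 0, so the returned lower pad is -(0 - (5 - 1) / 2) = 2.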
static TensorShape ComputeOutputShape(const TensorShape& inputShape, const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& stride,
const BoolVec& sharing, const BoolVec& autoPad, const TensorShape& lowerPad, const TensorShape& upperPad)
{
if (inputShape.GetRank() != kernelShape.GetRank())
InvalidArgument("Convolution input and kernel tensors must have the same rank.");
if (mapCount.GetRank() != 1 && inputShape.GetRank() != mapCount.GetRank())
InvalidArgument("Convolution map tensor must have rank 1 or the same as the input tensor.");
if (stride.GetRank() != 1 && inputShape.GetRank() != stride.GetRank())
InvalidArgument("Convolution stride tensor must have rank 1 or the same as the input tensor.");
if (sharing.size() != 1 && inputShape.GetRank() != sharing.size())
InvalidArgument("Convolution sharing tensor must have rank 1 or the same as the input tensor.");
if (autoPad.size() != 1 && inputShape.GetRank() != autoPad.size())
InvalidArgument("Convolution padding tensor must have rank 1 or the same as the input tensor.");
if (lowerPad.GetRank() != 1 && inputShape.GetRank() != lowerPad.GetRank())
InvalidArgument("Convolution lower pad tensor must have rank 1 or the same as the input tensor.");
if (upperPad.GetRank() != 1 && inputShape.GetRank() != upperPad.GetRank())
InvalidArgument("Convolution upper pad tensor must have rank 1 or the same as the input tensor.");
SmallVector<size_t> dimsOutput(inputShape.GetRank());
for (size_t i = 0; i < inputShape.GetRank(); i++)
{
assert(inputShape[i] >= 1);
if (kernelShape[i] > inputShape[i])
InvalidArgument("Convolution operation requires that kernel dim %d <= input dim %d.", (int)kernelShape[i], (int)inputShape[i]);
size_t delta = stride[stride.GetRank() == 1 ? 0 : i];
size_t dim = inputShape[i];
bool autoPadCur = autoPad[autoPad.size() == 1 ? 0 : i];
size_t lo = lowerPad[lowerPad.size() == 1 ? 0 : i];
size_t hi = upperPad[upperPad.size() == 1 ? 0 : i];
if (autoPadCur)
{
dim += kernelShape[i] - 1;
}
else
{
dim += lo + hi;
}
size_t dimOut = (dim - kernelShape[i]) / delta + 1;
// When LowerPad and/or UpperPad are specified (i.e. > 0), we insist that the kernel applications
// fill the entire space.
if (!autoPadCur && (lo > 0 || hi > 0))
{
size_t size = (dimOut - 1) * delta + kernelShape[i];
if (size != dim)
InvalidArgument("Convolution requires that kernel fills the entire space if auto-padding is disabled.");
}
if (mapCount.size() > 1)
dimOut *= mapCount[i];
else if (i == inputShape.GetRank() - 1)
dimOut *= mapCount[0];
dimsOutput[i] = dimOut;
}
auto dimsOut = TensorShape(dimsOutput);
// Check the output dimensions.
size_t mapCountTotal = mapCount.GetNumElements();
size_t sizeOut = dimsOut.GetNumElements();
assert((sizeOut % mapCountTotal) == 0);
UNUSED(mapCountTotal);
UNUSED(sizeOut);
return dimsOut;
}
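// Worked example of the formula above, assuming a 28 x 28 x 1 input, a 5 x 5 x 1 kernel, stride 1,
// auto-padding in every dimension and a map count of 16 on the last dimension:
//   dims 0, 1: dim = 28 + (5 - 1) = 32, dimOut = (32 - 5) / 1 + 1 = 28
//   dim 2:     dim = 1 + (1 - 1) = 1,  dimOut = (1 - 1) / 1 + 1 = 1, then dimOut *= mapCount[0] = 16
// giving an output shape of 28 x 28 x 16.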
// Used in unit tests and during debugging.
operator std::string() const
{
std::ostringstream res;
res << "Input: " << (string)InputShape();
res << ", Output: " << (string)OutputShape();
res << ", Kernel: " << (string)KernelShape();
res << ", Map: " << (string)MapCount();
res << ", Stride: " << (string)Stride();
res << ", Sharing: (";
std::copy(begin(Sharing()), end(Sharing()) - 1, std::ostream_iterator<bool>(res, ", "));
res << Sharing().back() << ")";
res << ", AutoPad: (";
std::copy(begin(AutoPad()), end(AutoPad()) - 1, std::ostream_iterator<bool>(res, ", "));
res << AutoPad().back() << ")";
res << ", LowerPad: " << (string)LowerPad();
res << ", UpperPad: " << (string)UpperPad();
return res.str();
}
DISABLE_COPY_AND_MOVE(ConvolveGeometry);
private:
TensorShape m_inputShape;
TensorShape m_outputShape;
TensorShape m_kernelShape;
TensorShape m_mapCount;
TensorShape m_stride;
BoolVec m_sharing;
BoolVec m_autoPad;
TensorShape m_lowerPad;
TensorShape m_upperPad;
// There are several reasons why int type is used here rather than size_t:
// 1. Many of these vectors contain offsets which can be negative.
// 2. Most of these vectors will be copied into device memory (GPU) so the smaller the size - the better.
// Also, 64-bit operations are slower on GPU.
// 3. If you are still not convinced, we don't expect convolutions to be more than 2B in size anyway.
// See description to corresponding getter functions to understand what these are.
IntVec m_mpRowCol;
IntVec m_mpRowIwht;
IntVec m_mpRowRun;
IntVec m_runs;
IntVec m_mpRowIndices;
IntVec m_indices;
// The indices of the first ("top-left-most") "kernel-center" cell in the source.
IntVec m_start;
int m_startIndex;
// When the first kernel cell is aligned with the first source cell, this is the index of the input cell that
// is aligned with the "kernel-center" cell. Indices in "Runs" and "Indices" are relative to OriginIndex.
int m_originIndex;
size_t m_kernelCount;
};
using ConvolveGeometryPtr = std::shared_ptr<ConvolveGeometry>;
} } }

View file

@ -0,0 +1,173 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#include "CuDnnFactories.h"
#include "BatchNormalizationEngine.h"
#include "CuDnnCommon.h"
#include "GPUMatrix.h"
namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
class CuDnnBatchNormEngine : public BatchNormEngine<ElemType>
{
public:
using Base = BatchNormEngine<ElemType>;
using typename Base::Mat;
public:
CuDnnBatchNormEngine(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
bool spatial, ImageLayoutKind imageLayout)
: Base(deviceId, inOutT, spatial, imageLayout),
m_cudnn(CuDnn::Instance()),
m_inOutCuDnnT(GetInOutTensor(inOutT), CuDnnTensor::GetDataType<ElemType>()),
m_scaleBiasCuDnnT(GetScaleBiasTensor(inOutT, spatial), CuDnnTensor::GetDataType<ElemType>())
{
}
protected:
using Base::m_deviceId;
using Base::m_imageLayout;
using Base::m_inOutT;
using Base::m_spatial;
void EnsureCompatible() override
{
if (m_spatial && m_imageLayout == ImageLayoutKind::HWC)
InvalidArgument("cuDNN batch normalization supports only cudnn(CHW) layout.");
if (m_inOutT.GetRank() > 4)
InvalidArgument("cuDNN batch normalization supports tensors of max 4 dimensions.");
}
void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runInvStdDev,
Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) override
{
// REVIEW alexeyk: there might be a way to do this in cuDNN.
if (blendFactor != 0 && (blendFactor != 1 || expAvgFactor > 0))
InvalidArgument("cuDNN batch normalization engine currently supports blendTimeConstant of 0 or 1 only.");
m_inOutCuDnnT.UpdateBatchSize(in.GetNumCols());
cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
// cuDNN will fail with BAD_PARAM if epsilon < CUDNN_BN_MIN_EPSILON.
epsilon = max(epsilon, CUDNN_BN_MIN_EPSILON);
// expAvgFactor == 0 && blendFactor == 1 means we are in eval mode.
if (expAvgFactor == 0 && blendFactor == 1)
{
CUDNN_CALL(cudnnBatchNormalizationForwardInference(*m_cudnn, mode, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(out),
m_scaleBiasCuDnnT, ptr(scale), ptr(bias), ptr(runMean), ptr(runInvStdDev), epsilon));
}
else
{
CUDNN_CALL(cudnnBatchNormalizationForwardTraining(*m_cudnn, mode, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in),
m_inOutCuDnnT, ptr(out), m_scaleBiasCuDnnT, ptr(scale), ptr(bias), expAvgFactor, ptr(runMean), ptr(runInvStdDev),
epsilon, ptr(saveMean), ptr(saveInvStdDev)));
}
}
void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, const Mat& saveMean, const Mat& saveInvStdDev,
Mat& scaleGrad, Mat& biasGrad) override
{
m_inOutCuDnnT.UpdateBatchSize(srcGrad.GetNumCols());
cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
// REVIEW alexeyk: remove once Philly is upgraded to prod version. Also change betaParamDiff to 1 and update CNTK BN engine.
#if CUDNN_PATCHLEVEL >= 7
CUDNN_CALL(cudnnBatchNormalizationBackward(*m_cudnn, mode, &C::One, &C::One, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(srcGrad), m_inOutCuDnnT, ptr(grad),
m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev)));
#else
CUDNN_CALL(cudnnBatchNormalizationBackward(*m_cudnn, mode, &C::One, &C::One, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(srcGrad), m_inOutCuDnnT, ptr(grad),
m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev)));
#endif
}
private:
static ElemType* ptr(Mat& src)
{
return src.BufferPointer();
}
static const ElemType* ptr(const Mat& src)
{
return src.BufferPointer();
}
static TensorShape GetInOutTensor(const TensorShape& inOutT)
{
// cuDNN supports only 3D and 4D tensors (in cuDNN docs it's 4D and 5D due to the N dimension)
// even for non-spatial inputs, so expand the tensor if needed.
if (inOutT.GetRank() > 2)
return inOutT;
SmallVector<size_t> v(std::max(inOutT.GetRank(), (size_t)3), 1);
for (size_t i = 0; i < inOutT.GetRank(); i++)
v[i] = inOutT[i];
return TensorShape(v);
}
static TensorShape GetScaleBiasTensor(const TensorShape& inOutT, bool spatial)
{
if (!spatial)
return GetInOutTensor(inOutT);
const auto& t = GetInOutTensor(inOutT);
SmallVector<size_t> v(t.GetRank(), 1);
v[v.size() - 1] = t[t.GetRank() - 1];
return TensorShape(v);
}
private:
using C = Consts<ElemType>;
CuDnn::ptr_t m_cudnn;
CuDnnTensor m_inOutCuDnnT;
CuDnnTensor m_scaleBiasCuDnnT;
};
template class CuDnnBatchNormEngine<float>;
template class CuDnnBatchNormEngine<double>;
template <typename ElemType>
std::unique_ptr<BatchNormEngine<ElemType>> CuDnnBatchNormEngineFactory<ElemType>::Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
bool spatial, ImageLayoutKind imageLayout)
{
return std::make_unique<CuDnnBatchNormEngine<ElemType>>(deviceId, inOutT, spatial, imageLayout);
}
template class CuDnnBatchNormEngineFactory<float>;
template class CuDnnBatchNormEngineFactory<double>;
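// Minimal usage sketch, assuming spatial (per-channel) batch normalization over a 28 x 28 x 32 CHW
// tensor; the TensorShape initializer below is hypothetical shorthand for that shape:
//   auto bnEngine = CuDnnBatchNormEngineFactory<float>::Create(/*deviceId=*/0, TensorShape({28, 28, 32}),
//                                                              /*spatial=*/true, ImageLayoutKind::CHW);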
CudaTimer::~CudaTimer()
{
// TODO: Should not throw if std::uncaught_exception()
if (m_start != nullptr)
CUDA_CALL(cudaEventDestroy(reinterpret_cast<cudaEvent_t>(m_start)));
if (m_stop != nullptr)
CUDA_CALL(cudaEventDestroy(reinterpret_cast<cudaEvent_t>(m_stop)));
}
void CudaTimer::Start()
{
cudaEvent_t start;
cudaEvent_t stop;
if (m_start != nullptr)
CUDA_CALL(cudaEventDestroy(reinterpret_cast<cudaEvent_t>(m_start)));
if (m_stop != nullptr)
CUDA_CALL(cudaEventDestroy(reinterpret_cast<cudaEvent_t>(m_stop)));
CUDA_CALL(cudaEventCreate(&start));
CUDA_CALL(cudaEventCreate(&stop));
m_start = start;
m_stop = stop;
CUDA_CALL(cudaEventRecord(start, GetStream()));
}
void CudaTimer::Stop()
{
CUDA_CALL(cudaEventRecord(reinterpret_cast<cudaEvent_t>(m_stop), GetStream()));
CUDA_CALL(cudaEventSynchronize(reinterpret_cast<cudaEvent_t>(m_stop)));
}
float CudaTimer::Elapsed()
{
float ms;
CUDA_CALL(cudaEventElapsedTime(&ms, reinterpret_cast<cudaEvent_t>(m_start), reinterpret_cast<cudaEvent_t>(m_stop)));
return ms;
}
} } }

108
Source/Math/CuDnnCommon.cpp Normal file
View file

@ -0,0 +1,108 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#include "GPUMatrix.h"
#include "CuDnnCommon.h"
namespace Microsoft { namespace MSR { namespace CNTK {
template <>
const float Consts<float>::One = 1;
template <>
const double Consts<double>::One = 1;
template <>
const float Consts<float>::Zero = 0;
template <>
const double Consts<double>::Zero = 0;
CuDnnTensor::CuDnnTensor(const TensorShape& src, cudnnDataType_t dataType)
: m_tensor(nullptr)
{
CUDNN_CALL(cudnnCreateTensorDescriptor(&m_tensor));
// Set cuDNN tensor dimensions. cuDNN uses row-major format while TensorShape is column-major,
// so conversion is required. The N dimension will be set to 1.
const auto& stridesSrc = src.GetStrides();
SmallVector<int> dims(src.GetRank() + 1);
SmallVector<int> strides(stridesSrc.size() + 1);
assert(dims.size() == strides.size());
for (int i = 0; i < src.GetRank(); i++)
{
dims[dims.size() - 1 - i] = (int)src[i];
strides[dims.size() - 1 - i] = (int)stridesSrc[i];
}
// Set "minibatch"(aka N) dimension.
dims[0] = 1;
strides[0] = strides[1] * dims[1];
CUDNN_CALL(cudnnSetTensorNdDescriptor(m_tensor, dataType, (int)dims.size(), dims.data(), strides.data()));
}
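// Worked example of the conversion above, assuming a column-major TensorShape of [W=7, H=5, C=3]
// with strides {1, 7, 35}: the resulting cuDNN descriptor gets dims {N=1, C=3, H=5, W=7} and
// strides {105, 35, 7, 1}, where 105 = strides[1] * dims[1] = 35 * 3.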
CuDnnTensor::~CuDnnTensor()
{
if (m_tensor != nullptr)
{
cudnnDestroyTensorDescriptor(m_tensor);
m_tensor = nullptr;
}
}
void CuDnnTensor::UpdateBatchSize(size_t batchSize)
{
// Currently cuDNN supports only 2D and 3D convolutions anyway (so max 5D tensors).
const int MaxDims = 5;
int dims[MaxDims];
int strides[MaxDims];
int nbDims = 0;
cudnnDataType_t dataType;
// According to NVIDIA, Get/Set functions are very fast so it's safe to call them in a loop.
CUDNN_CALL(cudnnGetTensorNdDescriptor(m_tensor, MaxDims, &dataType, &nbDims, dims, strides));
assert(nbDims <= MaxDims);
dims[0] = (int)batchSize;
CUDNN_CALL(cudnnSetTensorNdDescriptor(m_tensor, dataType, nbDims, dims, strides));
}
template <typename ElemType>
cudnnDataType_t CuDnnTensor::GetDataType()
{
if (typeid(ElemType) == typeid(float))
return CUDNN_DATA_FLOAT;
else if (typeid(ElemType) == typeid(double))
return CUDNN_DATA_DOUBLE;
else
InvalidArgument("cuDNN engine currently supports only single and double precision data types.");
}
template cudnnDataType_t CuDnnTensor::GetDataType<float>();
template cudnnDataType_t CuDnnTensor::GetDataType<double>();
CuDnn::ptr_t CuDnn::Instance()
{
auto createNew = []()
{
int deviceId;
CUDA_CALL(cudaGetDevice(&deviceId));
cudaDeviceProp props = {0};
if (cudaGetDeviceProperties(&props, deviceId) != cudaSuccess || props.major < 3)
RuntimeError("cuDNN requires device with compute capability 3.0 or higher.");
cudnnHandle_t* cudnn = new cudnnHandle_t;
CUDNN_CALL(cudnnCreate(cudnn));
CUDNN_CALL(cudnnSetStream(*cudnn, GetStream()));
return cudnn;
};
static std::shared_ptr<cudnnHandle_t> m_instance = std::shared_ptr<cudnnHandle_t>(createNew(), [](cudnnHandle_t* src)
{
assert(*src != nullptr);
auto err = cudnnDestroy(*src);
assert(err == CUDNN_STATUS_SUCCESS);
#ifdef NDEBUG
UNUSED(err);
#endif
delete src;
});
return m_instance;
}
} } }

49
Source/Math/CuDnnCommon.h Normal file
View file

@ -0,0 +1,49 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "Basics.h"
#include "TensorShape.h"
#include <cudnn.h>
#include <memory>
namespace Microsoft { namespace MSR { namespace CNTK {
class CuDnnTensor final
{
public:
CuDnnTensor(const TensorShape& src, cudnnDataType_t dataType);
~CuDnnTensor();
void UpdateBatchSize(size_t batchSize);
operator cudnnTensorDescriptor_t() const { return m_tensor; }
template <typename ElemType>
static cudnnDataType_t GetDataType();
DISABLE_COPY_AND_MOVE(CuDnnTensor);
private:
cudnnTensorDescriptor_t m_tensor;
};
struct CuDnn final
{
using ptr_t = std::shared_ptr<cudnnHandle_t>;
static ptr_t Instance();
DISABLE_COPY_AND_MOVE(CuDnn);
};
template <typename ElemType>
struct Consts
{
static const ElemType Zero;
static const ElemType One;
};
} } }
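// Minimal usage sketch, assuming the shape below is written as a hypothetical initializer list;
// CuDnn::Instance() returns a shared_ptr to the cudnnHandle_t and CuDnnTensor converts implicitly
// to cudnnTensorDescriptor_t:
//   auto cudnn = CuDnn::Instance();
//   CuDnnTensor inT(TensorShape({28, 28, 1}), CuDnnTensor::GetDataType<float>());
//   inT.UpdateBatchSize(64); // set the N dimension for the current minibatch
//   // *cudnn and inT can now be passed directly to cuDNN calls.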

View file

@ -4,11 +4,11 @@
//
#include "stdafx.h"
#include "CuDnnConvolutionEngine.h"
#include "CuDnnFactories.h"
#include "GPUMatrix.h"
#ifdef USE_CUDNN
#include <cudnn.h>
#include "CuDnnConvolutionEngine.cuh"
#include <typeinfo>
#include <typeindex>
#include "CuDnnCommon.h"
template <>
const char* CudaErrString<cudnnStatus_t>(cudnnStatus_t x)
@ -16,287 +16,177 @@ const char* CudaErrString<cudnnStatus_t>(cudnnStatus_t x)
return cudnnGetErrorString(x);
}
// A note on the formats: CNTK originally used NHWC for input/output tensors and CHWN for filters.
// A note on the formats: CNTK originally used NHWC for input/output tensors and CHWN for kernels.
// Such formats have very limited support in cuDNN and not used in other frameworks.
// CNTK with cuDNN by default uses NCHW formats for both inputs/outputs and filters.
// CNTK with cuDNN by default uses NCHW formats for both inputs/outputs and kernels.
#define TENSOR_FORMAT CUDNN_TENSOR_NCHW
#define FILTER_FORMAT CUDNN_TENSOR_NCHW
#endif
namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
bool CuDnnConvolutionEngineFactory<ElemType>::IsSupported(DEVICEID_TYPE deviceId)
{
// REVIEW alexeyk: compile-time for now, make runtime, config-driven.
#ifdef USE_CUDNN
cudaDeviceProp props = {0};
return cudaGetDeviceProperties(&props, deviceId) == cudaSuccess && props.major >= 3;
#else
UNUSED(deviceId);
return false;
#endif
}
CudaTimer::~CudaTimer()
{
// TODO: Should not throw if std::uncaught_exception()
if (m_start != nullptr)
CUDA_CALL(cudaEventDestroy(reinterpret_cast<cudaEvent_t>(m_start)));
if (m_stop != nullptr)
CUDA_CALL(cudaEventDestroy(reinterpret_cast<cudaEvent_t>(m_stop)));
}
void CudaTimer::Start()
{
cudaEvent_t start;
cudaEvent_t stop;
if (m_start != nullptr)
CUDA_CALL(cudaEventDestroy(reinterpret_cast<cudaEvent_t>(m_start)));
if (m_stop != nullptr)
CUDA_CALL(cudaEventDestroy(reinterpret_cast<cudaEvent_t>(m_stop)));
CUDA_CALL(cudaEventCreate(&start));
CUDA_CALL(cudaEventCreate(&stop));
m_start = start;
m_stop = stop;
CUDA_CALL(cudaEventRecord(start, GetStream()));
}
void CudaTimer::Stop()
{
CUDA_CALL(cudaEventRecord(reinterpret_cast<cudaEvent_t>(m_stop), GetStream()));
CUDA_CALL(cudaEventSynchronize(reinterpret_cast<cudaEvent_t>(m_stop)));
}
float CudaTimer::Elapsed()
{
float ms;
CUDA_CALL(cudaEventElapsedTime(&ms, reinterpret_cast<cudaEvent_t>(m_start), reinterpret_cast<cudaEvent_t>(m_stop)));
return ms;
}
#ifdef USE_CUDNN
static bool IsGpu(DEVICEID_TYPE deviceId)
{
return deviceId >= 0;
}
class CuDnnTensor4D : public ConvolutionTensor4D
class CuDnnKernel
{
public:
CuDnnTensor4D(size_t w, size_t h, size_t c, size_t n, cudnnDataType_t dataType)
: ConvolutionTensor4D(w, h, c, n), m_dataType(dataType), m_tensor(nullptr)
CuDnnKernel(const ConvolveGeometry& geometry, cudnnDataType_t dataType)
: m_kernel(nullptr)
{
CUDNN_CALL(cudnnCreateTensorDescriptor(&m_tensor));
CUDNN_CALL(cudnnSetTensor4dDescriptor(m_tensor, TENSOR_FORMAT, dataType,
static_cast<int>(n), static_cast<int>(c), static_cast<int>(h), static_cast<int>(w)));
CUDNN_CALL(cudnnCreateFilterDescriptor(&m_kernel));
// Set cuDNN kernel dimensions. cuDNN uses row-major format while TensorShape is column-major,
// so conversion is required.
const auto& filt = geometry.KernelShape();
size_t mapCount = geometry.GetMapCount(geometry.InputShape().GetRank() - 1);
if (mapCount != geometry.MapCount().GetNumElements())
InvalidArgument("cuDNN does not support map tensor of this configuration.");
SmallVector<int> dims(filt.GetRank() + 1);
for (int i = 0; i < filt.GetRank(); i++)
dims[dims.size() - 1 - i] = (int)filt[i];
// Set map count(aka K) dimension.
dims[0] = (int)mapCount;
CUDNN_CALL(cudnnSetFilterNdDescriptor_v4(m_kernel, dataType, FILTER_FORMAT, (int)dims.size(), dims.data()));
}
public:
operator cudnnTensorDescriptor_t() const
~CuDnnKernel()
{
return m_tensor;
}
~CuDnnTensor4D() noexcept
{
if (m_tensor != nullptr)
if (m_kernel != nullptr)
{
// TODO: Check for error code and throw if !std::uncaught_exception()
cudnnDestroyTensorDescriptor(m_tensor);
m_tensor = nullptr;
cudnnDestroyFilterDescriptor(m_kernel);
m_kernel = nullptr;
}
}
void setN(size_t newN) override
{
ConvolutionTensor4D::setN(newN);
CUDNN_CALL(cudnnSetTensor4dDescriptor(m_tensor, TENSOR_FORMAT, m_dataType,
static_cast<int>(n()), static_cast<int>(c()), static_cast<int>(h()), static_cast<int>(w())));
}
private:
cudnnDataType_t m_dataType;
cudnnTensorDescriptor_t m_tensor;
};
class CuDnnFilter : public ConvolutionFilter
{
public:
CuDnnFilter(size_t w, size_t h, size_t c, size_t k, cudnnDataType_t dataType)
: ConvolutionFilter(w, h, c, k), m_filter(nullptr)
{
CUDNN_CALL(cudnnCreateFilterDescriptor(&m_filter));
CUDNN_CALL(cudnnSetFilter4dDescriptor_v4(m_filter, dataType, FILTER_FORMAT,
static_cast<int>(k), static_cast<int>(c), static_cast<int>(h), static_cast<int>(w)));
}
public:
operator cudnnFilterDescriptor_t() const
{
return m_filter;
return m_kernel;
}
~CuDnnFilter() noexcept
{
if (m_filter != nullptr)
{
// TODO: Check for error code and throw if !std::uncaught_exception()
cudnnDestroyFilterDescriptor(m_filter);
m_filter = nullptr;
}
}
DISABLE_COPY_AND_MOVE(CuDnnKernel);
private:
cudnnFilterDescriptor_t m_filter;
cudnnFilterDescriptor_t m_kernel;
};
class CuDnnConvolutionDescriptor : public ConvolutionDescriptor
class CuDnnConv
{
public:
CuDnnConvolutionDescriptor(size_t wStride, size_t hStride, size_t wPad, size_t hPad)
: ConvolutionDescriptor(wStride, hStride, wPad > 0 || hPad > 0), m_conv(nullptr)
CuDnnConv(const ConvolveGeometry& geometry, cudnnDataType_t dataType)
: m_conv(nullptr)
{
CUDNN_CALL(cudnnCreateConvolutionDescriptor(&m_conv));
CUDNN_CALL(cudnnSetConvolution2dDescriptor(m_conv,
static_cast<int>(hPad), static_cast<int>(wPad),
static_cast<int>(hStride), static_cast<int>(wStride),
1, 1, CUDNN_CROSS_CORRELATION));
// Set cuDNN convolution parameters. cuDNN uses row-major format while TensorShape is column-major,
// so conversion is required. Also, for 2D convolutions (which have 3D tensor shapes)
// cuDNN uses 2D descriptors and for 3D convolutions 3D descriptors, so we need to ignore the
// rightmost dimension in ConvolveGeometry tensors.
SmallVector<int> stride(geometry.InputShape().GetRank() - 1);
SmallVector<int> pad(stride.size());
for (int i = 0; i < stride.size(); i++)
{
stride[stride.size() - 1 - i] = (int)geometry.GetStride(i);
pad[stride.size() - 1 - i] = geometry.GetLowerPad(i);
}
SmallVector<int> upscale(stride.size(), 1);
CUDNN_CALL(cudnnSetConvolutionNdDescriptor(m_conv, (int)stride.size(), pad.data(),
stride.data(), upscale.data(),
CUDNN_CROSS_CORRELATION, dataType));
}
public:
operator cudnnConvolutionDescriptor_t() const
{
return m_conv;
}
~CuDnnConvolutionDescriptor() noexcept
~CuDnnConv()
{
if (m_conv != nullptr)
{
// TODO: Check for error code and throw if !std::uncaught_exception()
cudnnDestroyConvolutionDescriptor(m_conv);
m_conv = nullptr;
}
}
operator cudnnConvolutionDescriptor_t() const
{
return m_conv;
}
DISABLE_COPY_AND_MOVE(CuDnnConv);
private:
cudnnConvolutionDescriptor_t m_conv;
};
class CuDnnPoolingDescriptor : public PoolingDescriptor
class CuDnnPool
{
public:
CuDnnPoolingDescriptor(PoolKind kind, size_t w, size_t h, size_t wStride, size_t hStride, size_t wPad, size_t hPad)
: PoolingDescriptor(kind, w, h, wStride, hStride, wPad, hPad), m_pool(nullptr)
CuDnnPool(const ConvolveGeometry& geometry, PoolKind kind)
: m_pool(nullptr)
{
assert(kind == PoolKind::Max || kind == PoolKind::Average);
CUDNN_CALL(cudnnCreatePoolingDescriptor(&m_pool));
CUDNN_CALL(cudnnSetPooling2dDescriptor(m_pool,
// Set cuDNN pooling parameters. cuDNN uses row-major format while TensorShape is column-major,
// so conversion is required. As in the convolution descriptor, cuDNN uses 2D descriptors
// for 3D inputs.
SmallVector<int> dims(geometry.InputShape().GetRank() - 1);
SmallVector<int> stride(dims.size());
SmallVector<int> pad(stride.size());
int j = (int)dims.size() - 1;
for (int i = 0; i < stride.size(); i++, j--)
{
dims[j] = (int)geometry.KernelShape()[i];
stride[j] = (int)geometry.GetStride(i);
pad[j] = geometry.GetLowerPad(i);
}
// Must use CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING to get the same results as in reference engine.
CUDNN_CALL(cudnnSetPoolingNdDescriptor(m_pool,
kind == PoolKind::Max ? CUDNN_POOLING_MAX : CUDNN_POOLING_AVERAGE_COUNT_EXCLUDE_PADDING,
static_cast<int>(h), static_cast<int>(w),
static_cast<int>(hPad), static_cast<int>(wPad),
static_cast<int>(hStride), static_cast<int>(wStride)));
(int)dims.size(), dims.data(), pad.data(), stride.data()));
}
public:
operator cudnnPoolingDescriptor_t() const
{
return m_pool;
}
~CuDnnPoolingDescriptor() noexcept
~CuDnnPool()
{
if (m_pool != nullptr)
{
// TODO: Check for error code and throw if !std::uncaught_exception()
cudnnDestroyPoolingDescriptor(m_pool);
m_pool = nullptr;
}
}
operator cudnnPoolingDescriptor_t() const
{
return m_pool;
}
DISABLE_COPY_AND_MOVE(CuDnnPool);
private:
cudnnPoolingDescriptor_t m_pool;
};
template <typename CuDnnT, typename In>
static CuDnnT& As(In& src)
{
// Do dynamic_cast only in debug builds and static_cast in release builds.
assert(dynamic_cast<CuDnnT*>(&src) != nullptr);
return static_cast<CuDnnT&>(src);
}
static const CuDnnTensor4D& t(const ConvolutionTensor4D& src)
{
return As<const CuDnnTensor4D>(src);
}
static const CuDnnFilter& f(const ConvolutionFilter& src)
{
return As<const CuDnnFilter>(src);
}
static const CuDnnConvolutionDescriptor& cd(const ConvolutionDescriptor& src)
{
return As<const CuDnnConvolutionDescriptor>(src);
}
static const CuDnnPoolingDescriptor& p(const PoolingDescriptor& src)
{
return As<const CuDnnPoolingDescriptor>(src);
}
template <typename ElemType>
static ElemType* ptr(Matrix<ElemType>& src)
{
return src.BufferPointer();
}
template <typename ElemType>
static const ElemType* ptr(const Matrix<ElemType>& src)
{
return src.BufferPointer();
}
template <typename ElemType>
struct Consts
{
static const ElemType Zero;
static const ElemType One;
};
template <>
const float Consts<float>::One = 1;
template <>
const double Consts<double>::One = 1;
template <>
const float Consts<float>::Zero = 0;
template <>
const double Consts<double>::Zero = 0;
template <typename ElemType>
template <class ElemType>
class CuDnnConvolutionEngine : public ConvolutionEngine<ElemType>
{
public:
using Base = ConvolutionEngine<ElemType>;
using typename Base::Mat;
using typename Base::Tensor4D;
using typename Base::Filter;
using typename Base::ConvDesc;
CuDnnConvolutionEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, BatchNormImpl bnImpl)
: Base(deviceId, imageLayout), m_maxTempMemSizeInSamples(maxTempMemSizeInSamples), m_bnImpl(bnImpl), m_stream(GetStream()), m_cudnn(nullptr)
public:
CuDnnConvolutionEngine(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout,
size_t maxTempMemSizeInSamples, PoolKind poolKind)
: Base(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind),
m_cudnn(CuDnn::Instance()),
m_dataType(CuDnnTensor::GetDataType<ElemType>()),
m_inT(geometry->InputShape(), m_dataType),
m_outT(geometry->OutputShape(), m_dataType)
{
CUDNN_CALL(cudnnCreate(&m_cudnn));
CUDNN_CALL(cudnnSetStream(m_cudnn, m_stream));
}
~CuDnnConvolutionEngine()
{
if (m_cudnn != nullptr)
{
// TODO: Check for error code and throw if !std::uncaught_exception()
cudnnDestroy(m_cudnn);
m_cudnn = nullptr;
}
}
protected:
using Base::m_geometry;
using Base::m_deviceId;
using Base::m_imageLayout;
using Base::m_maxTempMemSizeInSamples;
using Base::m_poolKind;
void EnsureCompatible() override
{
@ -306,26 +196,39 @@ protected:
RuntimeError("cuDNN convolution engine supports GPU devices only.");
}
void ForwardCore(const Tensor4D& inT, const Mat& in, const Filter& filterT, const Mat& filter, const ConvDesc& convDesc,
const Tensor4D& outT, Mat& out, Mat& workspace) override
void EnsureConvolutionInitialized() override
{
// Find best algo and allocate temp buffer, if needed.
auto finder = [&](int& calgo, cudnnConvolutionFwdAlgoPerf_t algoPerf[MaxAlgoCount]) -> cudnnStatus_t
if (m_kernelT == nullptr)
{
return cudnnFindConvolutionForwardAlgorithm(m_cudnn, t(inT), f(filterT), cd(convDesc), t(outT), MaxAlgoCount, &calgo, algoPerf);
m_kernelT = std::make_unique<CuDnnKernel>(*m_geometry, m_dataType),
m_conv = std::make_unique<CuDnnConv>(*m_geometry, m_dataType);
}
}
void ForwardCore(const Mat& in, const Mat& kernel, Mat& out, Mat& workspace) override
{
size_t batchSize = in.GetNumCols();
// Find best algo and allocate temp buffer, if needed.
auto finder = [this](int& calgo, cudnnConvolutionFwdAlgoPerf_t algoPerf[MaxAlgoCount]) -> cudnnStatus_t
{
return cudnnFindConvolutionForwardAlgorithm(*m_cudnn, m_inT, *m_kernelT, *m_conv, m_outT, MaxAlgoCount, &calgo, algoPerf);
};
FindBestAlgo(t(inT), m_fwdAlgo, finder);
auto staticFinder = [this](cudnnConvolutionFwdAlgo_t& algo) -> cudnnStatus_t
{
return cudnnGetConvolutionForwardAlgorithm(*m_cudnn, m_inT, *m_kernelT, *m_conv, m_outT, CUDNN_CONVOLUTION_FWD_NO_WORKSPACE, 0, &algo);
};
FindBestAlgo(batchSize, m_fwdAlgo, finder, staticFinder);
if (m_fwdAlgo.Algo.memory > 0)
workspace.Resize((m_fwdAlgo.Algo.memory + sizeof(ElemType) - 1) / sizeof(ElemType), 1);
// Perform forward convolution operation.
auto err = cudnnConvolutionForward(m_cudnn, &C::One, t(inT), ptr(in), f(filterT), ptr(filter), cd(convDesc),
m_fwdAlgo.Algo.algo, ptr(workspace), m_fwdAlgo.Algo.memory, &C::Zero, t(outT), ptr(out));
auto err = cudnnConvolutionForward(*m_cudnn, &C::One, m_inT, ptr(in), *m_kernelT, ptr(kernel), *m_conv,
m_fwdAlgo.Algo.algo, ptr(workspace), m_fwdAlgo.Algo.memory, &C::Zero, m_outT, ptr(out));
// There might be a case where cuDNN fails due to the workspace being too small; in that case, try using the no-workspace algo instead.
// REVIEW alexeyk: NVIDIA is currently reviewing this issue.
if (CUDNN_STATUS_INVALID_VALUE == err && m_fwdAlgo.Algo.memory > 0)
{
auto err2 = cudnnConvolutionForward(m_cudnn, &C::One, t(inT), ptr(in), f(filterT), ptr(filter), cd(convDesc),
m_fwdAlgo.NoWorkspaceAlgo, nullptr, 0, &C::Zero, t(outT), ptr(out));
auto err2 = cudnnConvolutionForward(*m_cudnn, &C::One, m_inT, ptr(in), *m_kernelT, ptr(kernel), *m_conv,
m_fwdAlgo.NoWorkspaceAlgo, nullptr, 0, &C::Zero, m_outT, ptr(out));
// Update original error in case of success.
if (CUDNN_STATUS_SUCCESS == err2)
err = CUDNN_STATUS_SUCCESS;
@ -333,128 +236,104 @@ protected:
CUDNN_CALL(err);
}
void BackwardDataCore(const Tensor4D& srcGradT, const Mat& srcGrad, const Filter& filterT, const Mat& filter, const ConvDesc& convDesc,
const Tensor4D& gradT, Mat& grad, Mat& workspace) override
void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace) override
{
size_t batchSize = srcGrad.GetNumCols();
// Find best algo and allocate temp buffer, if needed.
auto finder = [&](int& calgo, cudnnConvolutionBwdDataAlgoPerf_t algoPerf[MaxAlgoCount]) -> cudnnStatus_t
auto finder = [this](int& calgo, cudnnConvolutionBwdDataAlgoPerf_t algoPerf[MaxAlgoCount]) -> cudnnStatus_t
{
return cudnnFindConvolutionBackwardDataAlgorithm(m_cudnn, f(filterT), t(srcGradT), cd(convDesc), t(gradT), MaxAlgoCount, &calgo, algoPerf);
return cudnnFindConvolutionBackwardDataAlgorithm(*m_cudnn, *m_kernelT, m_outT, *m_conv, m_inT, MaxAlgoCount, &calgo, algoPerf);
};
FindBestAlgo(t(srcGradT), m_backDataAlgo, finder);
auto staticFinder = [this](cudnnConvolutionBwdDataAlgo_t& algo) -> cudnnStatus_t
{
return cudnnGetConvolutionBackwardDataAlgorithm(*m_cudnn, *m_kernelT, m_outT, *m_conv, m_inT, CUDNN_CONVOLUTION_BWD_DATA_NO_WORKSPACE, 0, &algo);
};
FindBestAlgo(batchSize, m_backDataAlgo, finder, staticFinder);
if (m_backDataAlgo.Algo.memory > 0)
workspace.Resize((m_backDataAlgo.Algo.memory + sizeof(ElemType) - 1) / sizeof(ElemType), 1);
// Compute gradients with respect to the output tensor (data).
CUDNN_CALL(cudnnConvolutionBackwardData(m_cudnn, &C::One, f(filterT), ptr(filter), t(srcGradT), ptr(srcGrad), cd(convDesc), m_backDataAlgo.Algo.algo,
ptr(workspace), m_backDataAlgo.Algo.memory, &C::One, t(gradT), ptr(grad)));
CUDNN_CALL(cudnnConvolutionBackwardData(*m_cudnn, &C::One, *m_kernelT, ptr(kernel), m_outT, ptr(srcGrad), *m_conv, m_backDataAlgo.Algo.algo,
ptr(workspace), m_backDataAlgo.Algo.memory, &C::One, m_inT, ptr(grad)));
}
void BackwardFilterCore(const Tensor4D& srcGradT, const Mat& srcGrad, const Tensor4D& inT, const Mat& in, const ConvDesc& convDesc,
const Filter& filterT, Mat& filter, bool /*allowReuse*/, Mat& workspace) override
void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*allowReuse*/, Mat& workspace) override
{
size_t batchSize = in.GetNumCols();
// Find best algo and allocate temp buffer, if needed.
auto finder = [&](int& calgo, cudnnConvolutionBwdFilterAlgoPerf_t algoPerf[MaxAlgoCount]) -> cudnnStatus_t
auto finder = [this](int& calgo, cudnnConvolutionBwdFilterAlgoPerf_t algoPerf[MaxAlgoCount]) -> cudnnStatus_t
{
return cudnnFindConvolutionBackwardFilterAlgorithm(m_cudnn, t(inT), t(srcGradT), cd(convDesc), f(filterT), MaxAlgoCount, &calgo, algoPerf);
return cudnnFindConvolutionBackwardFilterAlgorithm(*m_cudnn, m_inT, m_outT, *m_conv, *m_kernelT, MaxAlgoCount, &calgo, algoPerf);
};
FindBestAlgo(t(inT), m_backFiltAlgo, finder);
auto staticFinder = [this](cudnnConvolutionBwdFilterAlgo_t& algo) -> cudnnStatus_t
{
return cudnnGetConvolutionBackwardFilterAlgorithm(*m_cudnn, m_inT, m_outT, *m_conv, *m_kernelT, CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE, 0, &algo);
};
FindBestAlgo(batchSize, m_backFiltAlgo, finder, staticFinder);
if (m_backFiltAlgo.Algo.memory > 0)
workspace.Resize((m_backFiltAlgo.Algo.memory + sizeof(ElemType) - 1) / sizeof(ElemType), 1);
// Compute gradients with respect to the output tensor (data).
CUDNN_CALL(cudnnConvolutionBackwardFilter(m_cudnn, &C::One, t(inT), ptr(in), t(srcGradT), ptr(srcGrad), cd(convDesc), m_backFiltAlgo.Algo.algo,
ptr(workspace), m_backFiltAlgo.Algo.memory, &C::One, f(filterT), ptr(filter)));
CUDNN_CALL(cudnnConvolutionBackwardFilter(*m_cudnn, &C::One, m_inT, ptr(in), m_outT, ptr(srcGrad), *m_conv, m_backFiltAlgo.Algo.algo,
ptr(workspace), m_backFiltAlgo.Algo.memory, &C::One, *m_kernelT, ptr(kernelGrad)));
}
void EnsureCompatibleBatchNorm(bool spatial) override
void EnsurePoolingInitialized() override
{
if (!IsGpu(m_deviceId))
InvalidArgument("cuDNN engine does not support batch normalization on CPUs.");
if (spatial && m_imageLayout != ImageLayoutKind::CHW)
InvalidArgument("cuDNN engine batch normalization currently supports only CHW data layout for convolutional nodes.");
if (m_pool == nullptr)
m_pool = std::make_unique<CuDnnPool>(*m_geometry, m_poolKind);
}
void NormalizeBatchCore(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias,
bool spatial, double expAvgFactor, Mat& runMean, Mat& runInvStdDev, Mat& out,
double epsilon, Mat& saveMean, Mat& saveInvStdDev) override
void ForwardPoolingCore(const Mat& in, Mat& out) override
{
if (m_bnImpl == BatchNormImpl::CuDnn)
{
cudnnBatchNormMode_t mode = spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
// cuDNN will fail with BAD_PARAM if epsilon < CUDNN_BN_MIN_EPSILON.
epsilon = std::max(epsilon, CUDNN_BN_MIN_EPSILON);
CUDNN_CALL(cudnnBatchNormalizationForwardTraining(m_cudnn, mode, &C::One, &C::Zero, t(inT), ptr(in), t(inT), ptr(out),
t(scaleBiasT), ptr(scale), ptr(bias), expAvgFactor, ptr(runMean), ptr(runInvStdDev),
epsilon, ptr(saveMean), ptr(saveInvStdDev)));
}
else if (m_bnImpl == BatchNormImpl::Cntk)
{
epsilon = std::max(epsilon, 1e-9);
CUDA_CALL(BatchNormalizationForwardTraining(inT, spatial, ptr(in), ptr(out), ptr(scale), ptr(bias),
expAvgFactor, ptr(runMean), ptr(runInvStdDev),
epsilon, ptr(saveMean), ptr(saveInvStdDev), m_stream));
}
else
RuntimeError("Provided batch norm implementation (%d) is not supported.", m_bnImpl);
size_t batchSize = in.GetNumCols();
m_inT.UpdateBatchSize(batchSize);
m_outT.UpdateBatchSize(batchSize);
CUDNN_CALL(cudnnPoolingForward(*m_cudnn, *(m_pool), &C::One, m_inT, ptr(in), &C::Zero, m_outT, ptr(out)));
}
void NormalizeBatchInferenceCore(const Tensor4D& inT, const Mat& in, const Tensor4D& scaleBiasT, const Mat& scale, const Mat& bias,
bool spatial, const Mat& runMean, const Mat& runInvStdDev, Mat& out) override
void BackwardPoolingCore(const Mat& out, const Mat& srcGrad, const Mat& in, Mat& grad) override
{
if (m_bnImpl == BatchNormImpl::CuDnn)
{
cudnnBatchNormMode_t mode = spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
CUDNN_CALL(cudnnBatchNormalizationForwardInference(m_cudnn, mode, &C::One, &C::Zero, t(inT), ptr(in), t(inT), ptr(out),
t(scaleBiasT), ptr(scale), ptr(bias), ptr(runMean), ptr(runInvStdDev), CUDNN_BN_MIN_EPSILON));
}
else if (m_bnImpl == BatchNormImpl::Cntk)
{
CUDA_CALL(BatchNormalizationForwardInference(inT, spatial, ptr(in), ptr(out), ptr(scale), ptr(bias),
ptr(runMean), ptr(runInvStdDev), m_stream));
}
else
RuntimeError("Provided batch norm implementation (%d) is not supported.", m_bnImpl);
}
void BackwardNormalizeBatchCore(const Tensor4D& inT, const Mat& in, const Mat& srcGrad, Mat& grad,
const Tensor4D& scaleBiasT, const Mat& scale, bool spatial, const Mat& saveMean, const Mat& saveInvStdDev,
Mat& scaleGrad, Mat& biasGrad) override
{
if (m_bnImpl == BatchNormImpl::CuDnn)
{
cudnnBatchNormMode_t mode = spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
// REVIEW alexeyk: remove once Philly is upgraded to prod version.
#if CUDNN_PATCHLEVEL >= 7
CUDNN_CALL(cudnnBatchNormalizationBackward(m_cudnn, mode, &C::One, &C::One, &C::One, &C::One, t(inT), ptr(in), t(inT), ptr(srcGrad), t(inT), ptr(grad),
t(scaleBiasT), ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev)));
#else
CUDNN_CALL(cudnnBatchNormalizationBackward(m_cudnn, mode, &C::One, &C::One, t(inT), ptr(in), t(inT), ptr(srcGrad), t(inT), ptr(grad),
t(scaleBiasT), ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev)));
#endif
}
else if (m_bnImpl == BatchNormImpl::Cntk)
{
CUDA_CALL(BatchNormalizationBackward(inT, spatial, ptr(in), ptr(srcGrad), ptr(grad), ptr(scale), ptr(scaleGrad), ptr(biasGrad),
ptr(saveMean), ptr(saveInvStdDev), m_stream));
}
else
RuntimeError("Provided batch norm implementation (%d) is not supported.", m_bnImpl);
size_t batchSize = in.GetNumCols();
m_inT.UpdateBatchSize(batchSize);
m_outT.UpdateBatchSize(batchSize);
CUDNN_CALL(cudnnPoolingBackward(*m_cudnn, *(m_pool), &C::One, m_outT, ptr(out), m_outT, ptr(srcGrad),
m_inT, ptr(in), &C::One, m_inT, ptr(grad)));
}
private:
using C = Consts<ElemType>;
static const int MaxAlgoCount = 10;
template <typename TAlgo, typename TFinder>
void FindBestAlgo(const CuDnnTensor4D& t, TAlgo& algo, TFinder finder)
template <typename TAlgo, typename TFinder, typename TStaticFinder>
void FindBestAlgo(size_t batchSize, TAlgo& algo, TFinder finder, TStaticFinder staticFinder)
{
if (!algo.NeedAutotuning(t))
if (!algo.NeedAutotuning(batchSize))
return;
m_inT.UpdateBatchSize(batchSize);
m_outT.UpdateBatchSize(batchSize);
using CuDnnAlgoT = decltype(TAlgo::Algo);
CuDnnAlgoT algoPerf[MaxAlgoCount];
int calgo = 0;
CUDNN_CALL(finder(calgo, algoPerf));
cudnnStatus_t err = finder(calgo, algoPerf);
// Alloc failed - usually means the cuDNN runtime auto-tuner could not allocate a workspace.
// In such a case, use the static auto-tuner with no workspace.
if (err == CUDNN_STATUS_ALLOC_FAILED)
{
decltype(CuDnnAlgoT::algo) noMemAlgo;
CUDNN_CALL(staticFinder(noMemAlgo));
algo.CurMBSize = batchSize;
algo.Algo = algoPerf[0];
algo.Algo.algo = noMemAlgo;
algo.Algo.memory = 0;
algo.Algo.status = CUDNN_STATUS_SUCCESS;
algo.NoWorkspaceAlgo = noMemAlgo;
return;
}
CUDNN_CALL(err);
assert(calgo > 0);
size_t maxMem = m_maxTempMemSizeInSamples == 0 ? (std::numeric_limits<size_t>::max)() : t.w() * t.h() * t.c() * m_maxTempMemSizeInSamples * sizeof(ElemType);
size_t inputSampleSize = m_geometry->InputShape().GetNumElements();
size_t maxMem = m_maxTempMemSizeInSamples == 0 ? (std::numeric_limits<size_t>::max)() : inputSampleSize * m_maxTempMemSizeInSamples * sizeof(ElemType);
// Find best (fastest) algorithm which satisfies workspace requirements.
auto res = std::find_if(algoPerf, algoPerf + calgo,
[=](const CuDnnAlgoT& cur)
{
@ -462,8 +341,9 @@ private:
});
if (res == algoPerf + calgo)
RuntimeError("cuDNN could not find suitable algorithm for the current convolution configuration.");
algo.CurMBSize = t.n();
algo.CurMBSize = batchSize;
algo.Algo = *res;
// Find fastest algorithm that does NOT require workspace. It is used as a fallback algo in Forward function.
res = std::find_if(algoPerf, algoPerf + calgo,
[](const CuDnnAlgoT& cur)
{
@ -478,6 +358,15 @@ private:
algo.NoWorkspaceAlgo = (*res).algo;
}
static ElemType* ptr(Mat& src)
{
return src.BufferPointer();
}
static const ElemType* ptr(const Mat& src)
{
return src.BufferPointer();
}
private:
template <typename T>
struct ConvAlgoInfo
@ -495,7 +384,7 @@ private:
T Algo;
CuDnnAlgoT NoWorkspaceAlgo;
bool NeedAutotuning(const CuDnnTensor4D& t)
bool NeedAutotuning(size_t batchSize)
{
// Need to re-run auto-tuner in case minibatch size is increased.
// If minibatch size is decreased we assume that previously selected algorithm requires less or the same amount of workspace.
@ -504,186 +393,57 @@ private:
// We also need to reset auto-tuning status at the beginning of each epoch but ComputationNode currently does not provide such notification.
// We assume no other dimensions of tensors can change so we don't check it.
// REVIEW alexeyk: review once we get response from NVIDIA.
return (Algo.status != CUDNN_STATUS_SUCCESS || t.n() > CurMBSize);
return (Algo.status != CUDNN_STATUS_SUCCESS || batchSize > CurMBSize);
}
};
using C = Consts<ElemType>;
CuDnn::ptr_t m_cudnn;
cudnnDataType_t m_dataType;
CuDnnTensor m_inT;
CuDnnTensor m_outT;
// Convolution specific.
std::unique_ptr<CuDnnKernel> m_kernelT;
std::unique_ptr<CuDnnConv> m_conv;
// Pooling specific.
std::unique_ptr<CuDnnPool> m_pool;
// REVIEW alexeyk: currently limit is set once in ctor though in CNTK it can be, theoretically, changed in runtime.
size_t m_maxTempMemSizeInSamples;
BatchNormImpl m_bnImpl;
cudnnHandle_t m_cudnn;
cudaStream_t m_stream;
ConvAlgoInfo<cudnnConvolutionFwdAlgoPerf_t> m_fwdAlgo;
ConvAlgoInfo<cudnnConvolutionBwdDataAlgoPerf_t> m_backDataAlgo;
ConvAlgoInfo<cudnnConvolutionBwdFilterAlgoPerf_t> m_backFiltAlgo;
};
template <class ElemType>
class CuDnnPoolingEngine : public PoolingEngine<ElemType>
std::unique_ptr<ConvolutionEngine<ElemType>> CuDnnConvolutionEngineFactory<ElemType>::Create(ConvolveGeometryPtr geometry,
DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout,
size_t maxTempMemSizeInSamples, PoolKind poolKind)
{
public:
using Base = PoolingEngine<ElemType>;
using typename Base::Tensor4D;
using typename Base::PoolDesc;
using typename Base::Mat;
public:
CuDnnPoolingEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout)
: Base(deviceId, imageLayout), m_cudnn(nullptr)
{
CUDNN_CALL(cudnnCreate(&m_cudnn));
CUDNN_CALL(cudnnSetStream(m_cudnn, GetStream()));
}
~CuDnnPoolingEngine()
{
if (m_cudnn != nullptr)
{
// TODO: Check for error code and throw if !std::uncaught_exception()
cudnnDestroy(m_cudnn);
m_cudnn = nullptr;
}
}
protected:
using Base::m_deviceId;
using Base::m_imageLayout;
void EnsureCompatible() override
{
if (m_imageLayout != ImageLayoutKind::CHW)
RuntimeError("cuDNN pooling engine supports only CHW/cudnn layout.");
if (!IsGpu(m_deviceId))
RuntimeError("cuDNN pooling engine supports GPU devices only.");
}
void ForwardCore(const Tensor4D& inT, const Mat& in, const PoolDesc& poolDesc, const Tensor4D& outT, Mat& out) override
{
CUDNN_CALL(cudnnPoolingForward(m_cudnn, p(poolDesc), &C::One, t(inT), ptr(in), &C::Zero, t(outT), ptr(out)));
}
void BackwardCore(const Tensor4D& outT, const Mat& out, const Mat& srcGrad, const PoolDesc& poolDesc, const Tensor4D& inT, const Mat& in, Mat& grad) override
{
CUDNN_CALL(cudnnPoolingBackward(m_cudnn, p(poolDesc), &C::One, t(outT), ptr(out), t(outT), ptr(srcGrad),
t(inT), ptr(in), &C::One, t(inT), ptr(grad)));
}
private:
using C = Consts<ElemType>;
cudnnHandle_t m_cudnn;
};
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::Tensor4DPtr CuDnnConvolutionEngineFactory<ElemType>::CreateTensor(size_t w, size_t h, size_t c, size_t n)
{
// REVIEW alexeyk: assert fires in GCC but not in VC++.
// static_assert(false, "cuDNN engine currently supports only single and double precision tensors.");
RuntimeError("Not implemented.");
}
template <>
typename CuDnnConvolutionEngineFactory<float>::Tensor4DPtr CuDnnConvolutionEngineFactory<float>::CreateTensor(size_t w, size_t h, size_t c, size_t n)
{
return std::make_unique<CuDnnTensor4D>(w, h, c, n, CUDNN_DATA_FLOAT);
}
template <>
typename CuDnnConvolutionEngineFactory<double>::Tensor4DPtr CuDnnConvolutionEngineFactory<double>::CreateTensor(size_t w, size_t h, size_t c, size_t n)
{
return std::make_unique<CuDnnTensor4D>(w, h, c, n, CUDNN_DATA_DOUBLE);
return std::make_unique<CuDnnConvolutionEngine<ElemType>>(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind);
}
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::FilterPtr CuDnnConvolutionEngineFactory<ElemType>::CreateFilter(size_t w, size_t h, size_t c, size_t k)
bool CuDnnConvolutionEngineFactory<ElemType>::IsSupported(DEVICEID_TYPE deviceId, ConvolveGeometryPtr geometry, PoolKind poolKind)
{
// REVIEW alexeyk: assert fires in GCC but not in VC++.
// static_assert(false, "cuDNN engine currently supports only single and double precision filters.");
RuntimeError("Not implemented.");
}
template <>
typename CuDnnConvolutionEngineFactory<float>::FilterPtr CuDnnConvolutionEngineFactory<float>::CreateFilter(size_t w, size_t h, size_t c, size_t k)
{
return std::make_unique<CuDnnFilter>(w, h, c, k, CUDNN_DATA_FLOAT);
}
template <>
typename CuDnnConvolutionEngineFactory<double>::FilterPtr CuDnnConvolutionEngineFactory<double>::CreateFilter(size_t w, size_t h, size_t c, size_t k)
{
return std::make_unique<CuDnnFilter>(w, h, c, k, CUDNN_DATA_DOUBLE);
}
// REVIEW alexeyk: IsSupported check should be performed by cuDNN itself. Is there a good way to do that?
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::ConvDescPtr CuDnnConvolutionEngineFactory<ElemType>::CreateConvDescriptor(
const Tensor4D& /*inT*/, const Filter& filterT, size_t wStride, size_t hStride, bool padding)
{
size_t wPad = padding ? filterT.w() / 2 : 0;
size_t hPad = padding ? filterT.h() / 2 : 0;
return std::make_unique<CuDnnConvolutionDescriptor>(wStride, hStride, wPad, hPad);
cudaDeviceProp props = {0};
if (cudaGetDeviceProperties(&props, deviceId) != cudaSuccess || props.major < 3)
return false;
const auto& input = geometry->InputShape();
const auto& kernel = geometry->KernelShape();
const auto& sharing = geometry->Sharing();
const auto& mapCount = geometry->MapCount();
// cuDNN supports 2D and 3D convolutions at the moment with full sharing.
// In case map count size > 1, then it should have all ones except last dimension.
// If pooling is requested, then cuDNN supports only 2D/3D inputs and 2D pooling kernels.
return (input.GetRank() <= 4 &&
std::find(begin(sharing), end(sharing), false) == sharing.end() &&
mapCount.GetNumElements() == mapCount[mapCount.GetRank() - 1] &&
(poolKind == PoolKind::None ||
input.GetRank() <= 3 && (kernel.GetRank() < 3 || kernel[2] == 1)));
}
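In words: cuDNN is only chosen when the geometry is something it can express, that is an input tensor of rank at most 4, kernel sharing in every dimension, a map count that is 1 everywhere except possibly the last dimension, and, when pooling is requested, an input of rank at most 3 with an effectively 2D pooling kernel; otherwise the caller presumably falls back to CNTK's own engine. A rough restatement of the same check on plain shape vectors (hypothetical helper, not part of the CNTK API):
#include <algorithm>
#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>
bool LooksCuDnnSupported(const std::vector<std::size_t>& inputShape,  // e.g. {W, H, C}
                         const std::vector<std::size_t>& kernelShape,
                         const std::vector<bool>& sharing,
                         const std::vector<std::size_t>& mapCount,    // assumed non-empty
                         bool pooling)
{
    bool fullSharing = std::find(sharing.begin(), sharing.end(), false) == sharing.end();
    std::size_t mapElems = std::accumulate(mapCount.begin(), mapCount.end(),
                                           std::size_t(1), std::multiplies<std::size_t>());
    bool flatMapCount = mapElems == mapCount.back();  // all ones except the last dimension
    bool poolingOk = !pooling ||
                     (inputShape.size() <= 3 && (kernelShape.size() < 3 || kernelShape[2] == 1));
    return inputShape.size() <= 4 && fullSharing && flatMapCount && poolingOk;
}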
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::PoolDescPtr CuDnnConvolutionEngineFactory<ElemType>::CreatePoolDescriptor(
typename PoolDesc::PoolKind kind, size_t w, size_t h, size_t wStride, size_t hStride, size_t wPad, size_t hPad)
{
return std::make_unique<CuDnnPoolingDescriptor>(kind, w, h, wStride, hStride, wPad, hPad);
}
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::ConvEnginePtr CuDnnConvolutionEngineFactory<ElemType>::CreateConvEngine(
DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, BatchNormImpl bnImpl)
{
return std::make_unique<CuDnnConvolutionEngine<ElemType>>(deviceId, imageLayout, maxTempMemSizeInSamples, bnImpl);
}
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::PoolEnginePtr CuDnnConvolutionEngineFactory<ElemType>::CreatePoolEngine(
DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout)
{
return std::make_unique<CuDnnPoolingEngine<ElemType>>(deviceId, imageLayout);
}
#else
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::Tensor4DPtr CuDnnConvolutionEngineFactory<ElemType>::CreateTensor(size_t, size_t, size_t, size_t)
{
RuntimeError("The code is compiled without USE_CUDNN macro.");
}
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::FilterPtr CuDnnConvolutionEngineFactory<ElemType>::CreateFilter(size_t, size_t, size_t, size_t)
{
RuntimeError("The code is compiled without USE_CUDNN macro.");
}
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::ConvDescPtr CuDnnConvolutionEngineFactory<ElemType>::CreateConvDescriptor(
const Tensor4D&, const Filter&, size_t, size_t, bool)
{
RuntimeError("The code is compiled without USE_CUDNN macro.");
}
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::PoolDescPtr CuDnnConvolutionEngineFactory<ElemType>::CreatePoolDescriptor(
typename PoolDesc::PoolKind, size_t, size_t, size_t, size_t, size_t, size_t)
{
RuntimeError("The code is compiled without USE_CUDNN macro.");
}
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::ConvEnginePtr CuDnnConvolutionEngineFactory<ElemType>::CreateConvEngine(DEVICEID_TYPE, ImageLayoutKind, size_t, BatchNormImpl)
{
RuntimeError("The code is compiled without USE_CUDNN macro.");
}
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::PoolEnginePtr CuDnnConvolutionEngineFactory<ElemType>::CreatePoolEngine(DEVICEID_TYPE, ImageLayoutKind)
{
RuntimeError("The code is compiled without USE_CUDNN macro.");
}
#endif
template class CuDnnConvolutionEngineFactory<float>;
template class CuDnnConvolutionEngineFactory<double>;
} } }

Diff not shown because of its large size. Load diff

View file

@ -1,61 +0,0 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "ConvolutionEngine.h"
namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
class CuDnnConvolutionEngineFactory : public ConvolutionEngineFactory<ElemType>
{
public:
using Base = ConvolutionEngineFactory<ElemType>;
using typename Base::Tensor4D;
using typename Base::Tensor4DPtr;
using typename Base::Filter;
using typename Base::FilterPtr;
using typename Base::ConvDesc;
using typename Base::ConvDescPtr;
using typename Base::PoolDesc;
using typename Base::PoolDescPtr;
using typename Base::ConvEnginePtr;
using typename Base::PoolEnginePtr;
public:
Tensor4DPtr CreateTensor(size_t w, size_t h, size_t c, size_t n) override;
FilterPtr CreateFilter(size_t w, size_t h, size_t c, size_t k) override;
ConvDescPtr CreateConvDescriptor(const Tensor4D& inT, const Filter& filterT,
size_t wStride, size_t hStride, bool padding) override;
PoolDescPtr CreatePoolDescriptor(typename PoolDesc::PoolKind kind, size_t w, size_t h, size_t wStride, size_t hStride, size_t wPad, size_t hPad) override;
ConvEnginePtr CreateConvEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, BatchNormImpl bnImpl) override;
PoolEnginePtr CreatePoolEngine(DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout) override;
static bool IsSupported(DEVICEID_TYPE deviceId);
};
// REVIEW alexeyk: wrong place. It is currently used only in unit tests but I can't add it there because of the build issues.
// Timer that can be used to measure CUDA calls.
// Uses CUDA event and will synchronize(!) the stream when Stop is called.
class MATH_API CudaTimer
{
public:
CudaTimer(): m_start(nullptr), m_stop(nullptr)
{
}
~CudaTimer();
void Start();
void Stop();
float Elapsed();
DISABLE_COPY_AND_MOVE(CudaTimer);
private:
void* m_start;
void* m_stop;
};
} } }

View file

@ -0,0 +1,51 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "ConvolutionEngine.h"
#include "BatchNormalizationEngine.h"
namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
class CuDnnConvolutionEngineFactory
{
public:
static std::unique_ptr<ConvolutionEngine<ElemType>> Create(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples,
PoolKind poolKind);
static bool IsSupported(DEVICEID_TYPE deviceId, ConvolveGeometryPtr geometry, PoolKind poolKind);
};
template <class ElemType>
class CuDnnBatchNormEngineFactory
{
public:
static std::unique_ptr<BatchNormEngine<ElemType>> Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
bool spatial, ImageLayoutKind imageLayout);
};
// REVIEW alexeyk: wrong place? It is currently used only in unit tests but I can't add it there because of the build issues.
// Timer that can be used to measure CUDA calls.
// Uses CUDA event and will synchronize(!) the stream when Stop is called.
class MATH_API CudaTimer
{
public:
CudaTimer(): m_start(nullptr), m_stop(nullptr)
{
}
~CudaTimer();
void Start();
void Stop();
float Elapsed();
DISABLE_COPY_AND_MOVE(CudaTimer);
private:
void* m_start;
void* m_stop;
};
} } }

View file

@ -24,6 +24,8 @@
#include "cublas_v2.h"
#include <assert.h>
#include <memory>
#include "CntkBatchNormalization.cuh"
#include "Convolution.cuh"
#pragma comment(lib, "cudart.lib") // instruct linker to reference these libs
#pragma comment(lib, "cublas.lib")
@ -145,7 +147,7 @@ AllocatedElemType* TracingGPUMemoryAllocator::Allocate(int deviceId, size_t numE
}
AllocatedElemType* deviceBufferPtr = AllocateNoTrace<AllocatedElemType>(deviceId, numElements);
if (IsTraceEnabled())
{
fprintf(stderr, "Allocated DeviceBufferPointer = %p\n", (void*)deviceBufferPtr);
@ -3001,6 +3003,178 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AddAveragePoolingGradient(const GPUMat
#pragma endregion Other helper functions
template <class ElemType>
void GPUMatrix<ElemType>::ConvolutionForward(const GPUMatrix<ElemType>& kernel, const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIwht,
const GPUMatrix<int>& mpRowRun, const GPUMatrix<int>& runs, GPUMatrix<ElemType>& output) const
{
const int BlockSize = 128;
auto gdim = dim3((output.GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535));
PrepareDevice();
SyncGuard syncGuard;
kConvolutionForward<<<gdim, BlockSize, 0, t_stream>>>((int)GetNumCols(), kernel.m_pArray, mpRowCol.m_pArray, mpRowIwht.m_pArray, mpRowRun.m_pArray,
runs.m_pArray, m_pArray, (int)GetNumRows(), output.m_pArray, (int)output.GetNumRows());
}
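The launch configuration is the same in all of these kernels: grid.x covers the output rows with 128-thread blocks via ceiling division, and grid.y indexes minibatch columns, capped at 65535 because that is the CUDA limit on the y dimension of a grid (for larger minibatches the kernel presumably strides over the remaining columns). A small sketch of the arithmetic:
#include <algorithm>
#include <cstddef>
// Ceiling-division grid sizing used by the launches above.
inline unsigned GridDimX(std::size_t rows, unsigned blockSize = 128)
{
    return static_cast<unsigned>((rows + blockSize - 1) / blockSize);
}
inline unsigned GridDimY(std::size_t cols)
{
    return static_cast<unsigned>(std::min<std::size_t>(cols, 65535)); // CUDA grid.y limit
}
// Example: 4096 output rows and a minibatch of 256 columns -> a (32, 256) grid of 128-thread blocks.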
template <class ElemType>
void GPUMatrix<ElemType>::ConvolutionBackwardData(const GPUMatrix<ElemType>& kernel, const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIwht,
const GPUMatrix<int>& mpRowRun, const GPUMatrix<int>& runs, GPUMatrix<ElemType>& grad) const
{
const int BlockSize = 128;
auto gdim = dim3((GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535));
PrepareDevice();
SyncGuard syncGuard;
kConvolutionBackwardData<<<gdim, BlockSize, 0, t_stream>>>((int)GetNumCols(), kernel.m_pArray, mpRowCol.m_pArray, mpRowIwht.m_pArray, mpRowRun.m_pArray,
runs.m_pArray, m_pArray, (int)GetNumRows(), grad.m_pArray, (int)grad.GetNumRows());
}
template <class ElemType>
void GPUMatrix<ElemType>::ConvolutionBackwardKernel(const GPUMatrix<ElemType>& in, const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIwht,
const GPUMatrix<int>& mpRowRun, const GPUMatrix<int>& runs, GPUMatrix<ElemType>& kernelGrad) const
{
const int BlockSize = 128;
auto gdim = dim3((GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535));
PrepareDevice();
SyncGuard syncGuard;
kConvolutionBackwardKernel<<<gdim, BlockSize, 0, t_stream>>>((int)GetNumCols(), (int)in.GetNumRows(), (int)GetNumRows(),
in.m_pArray, mpRowCol.m_pArray, mpRowIwht.m_pArray, mpRowRun.m_pArray,
runs.m_pArray, m_pArray, kernelGrad.m_pArray);
}
template <class ElemType>
void GPUMatrix<ElemType>::MaxPoolingForward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& output) const
{
const int BlockSize = 128;
auto gdim = dim3((output.GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535));
PrepareDevice();
SyncGuard syncGuard;
kMaxPoolingForward<<<gdim, BlockSize, 0, t_stream>>>((int)GetNumCols(), mpRowCol.m_pArray, mpRowIndices.m_pArray, indices.m_pArray,
m_pArray, (int)GetNumRows(), output.m_pArray, (int)output.GetNumRows());
}
template <class ElemType>
void GPUMatrix<ElemType>::MaxPoolingBackward(const GPUMatrix<ElemType>& out, const GPUMatrix<ElemType>& in,
const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices,
GPUMatrix<ElemType>& grad) const
{
const int BlockSize = 128;
auto gdim = dim3((GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535));
PrepareDevice();
SyncGuard syncGuard;
kMaxPoolingBackward<<<gdim, BlockSize, 0, t_stream>>>((int)GetNumCols(), out.m_pArray, in.m_pArray,
mpRowCol.m_pArray, mpRowIndices.m_pArray, indices.m_pArray,
m_pArray, (int)GetNumRows(), grad.m_pArray, (int)grad.GetNumRows());
}
template <class ElemType>
void GPUMatrix<ElemType>::AveragePoolingForward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& output) const
{
const int BlockSize = 128;
auto gdim = dim3((output.GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535));
PrepareDevice();
SyncGuard syncGuard;
kAveragePoolingForward<<<gdim, BlockSize, 0, t_stream>>>((int)GetNumCols(), mpRowCol.m_pArray, mpRowIndices.m_pArray, indices.m_pArray,
m_pArray, (int)GetNumRows(), output.m_pArray, (int)output.GetNumRows());
}
template <class ElemType>
void GPUMatrix<ElemType>::AveragePoolingBackward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& grad) const
{
const int BlockSize = 128;
auto gdim = dim3((GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535));
PrepareDevice();
SyncGuard syncGuard;
kAveragePoolingBackward<<<gdim, BlockSize, 0, t_stream>>>((int)GetNumCols(), mpRowCol.m_pArray, mpRowIndices.m_pArray, indices.m_pArray,
m_pArray, (int)GetNumRows(), grad.m_pArray, (int)grad.GetNumRows());
}
template <class ElemType>
void GPUMatrix<ElemType>::BatchNormalizationForward(const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor,
GPUMatrix<ElemType>& runMean, GPUMatrix<ElemType>& runInvStdDev, GPUMatrix<ElemType>& out, double epsilon,
GPUMatrix<ElemType>& saveMean, GPUMatrix<ElemType>& saveInvStdDev) const
{
assert((GetNumRows() % scale.GetNumRows()) == 0);
bool spatial = GetNumRows() != scale.GetNumRows();
size_t vectorSize = GetNumRows();
size_t spatialSize = spatial ? (GetNumRows() / scale.GetNumRows()) : 1;
size_t batchSize = GetNumCols();
assert(0 < vectorSize && vectorSize <= std::numeric_limits<int>::max());
assert(0 < batchSize && batchSize <= std::numeric_limits<int>::max());
SyncGuard syncGuard;
// If expAvgFactor == 0 && blendFactor == 1 then we don't need to compute current minibatch statistics.
if (expAvgFactor > 0 || blendFactor < 1)
{
if (spatial)
{
Call<ComputeSpatialBatchMeanAndInvStdDev, ElemType>(spatialSize, vectorSize, spatialSize, batchSize, m_pArray,
expAvgFactor, runMean.m_pArray, runInvStdDev.m_pArray, epsilon,
saveMean.m_pArray, saveInvStdDev.m_pArray, GetStream());
}
else
{
Call<ComputeBatchMeanAndInvStdDev, ElemType>(vectorSize, vectorSize, batchSize, m_pArray,
expAvgFactor, runMean.m_pArray, runInvStdDev.m_pArray, epsilon,
saveMean.m_pArray, saveInvStdDev.m_pArray, GetStream());
}
}
// When:
// blendFactor == 1 - use running mean/var instead of the current minibatch mean/var.
// 0 < blendFactor < 1 - blend running mean/var with mean/var of the current minibatch: saveMean = (1 - blendFactor) * saveMean + blendFactor * runMean
// blendFactor == 0 - use mean/var of the current minibatch.
if (blendFactor < 1)
{
if (blendFactor > 0)
{
// REVIEW alexeyk: can be rolled into NormalizeBatchTraining to save bandwidth.
Scale((ElemType)(1 - blendFactor), saveMean);
ScaleAndAdd((ElemType)blendFactor, runMean, saveMean);
Scale((ElemType)(1 - blendFactor), saveInvStdDev);
ScaleAndAdd((ElemType)blendFactor, runInvStdDev, saveInvStdDev);
}
Call<NormalizeBatchTraining, ElemType>(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize,
spatial, m_pArray, out.m_pArray, scale.m_pArray, bias.m_pArray,
saveMean.m_pArray, saveInvStdDev.m_pArray, GetStream());
}
else
{
Call<NormalizeBatchTraining, ElemType>(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize,
spatial, m_pArray, out.m_pArray, scale.m_pArray, bias.m_pArray,
runMean.m_pArray, runInvStdDev.m_pArray, GetStream());
}
}
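The three blendFactor regimes spelled out in the comment boil down to a per-element convex combination of minibatch and running statistics. A scalar sketch of what the Scale/ScaleAndAdd pair computes:
// used = (1 - blendFactor) * minibatchStat + blendFactor * runningStat
//   blendFactor == 0: pure minibatch statistics (plain training)
//   blendFactor == 1: running statistics are used directly (the else branch below),
//                     and minibatch statistics are not computed at all (the check further above)
inline double BlendStatistic(double minibatchStat, double runningStat, double blendFactor)
{
    return (1.0 - blendFactor) * minibatchStat + blendFactor * runningStat;
}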
template <class ElemType>
void GPUMatrix<ElemType>::BatchNormalizationBackward(const GPUMatrix<ElemType>& in, GPUMatrix<ElemType>& grad, const GPUMatrix<ElemType>& scale,
const GPUMatrix<ElemType>& saveMean, const GPUMatrix<ElemType>& saveInvStdDev,
GPUMatrix<ElemType>& scaleGrad, GPUMatrix<ElemType>& biasGrad) const
{
assert((GetNumRows() % scale.GetNumRows()) == 0);
bool spatial = GetNumRows() != scale.GetNumRows();
size_t vectorSize = GetNumRows();
size_t spatialSize = spatial ? (GetNumRows() / scale.GetNumRows()) : 1;
size_t batchSize = GetNumCols();
assert(0 < vectorSize && vectorSize <= std::numeric_limits<int>::max());
assert(0 < batchSize && batchSize <= std::numeric_limits<int>::max());
SyncGuard syncGuard;
if (spatial)
{
Call<ComputeSpatialScaleAndBiasGradients, ElemType>(spatialSize, vectorSize, spatialSize, batchSize, in.m_pArray, m_pArray, scaleGrad.m_pArray, biasGrad.m_pArray,
saveMean.m_pArray, saveInvStdDev.m_pArray, GetStream());
}
else
{
Call<ComputeScaleAndBiasGradients, ElemType>(vectorSize, vectorSize, batchSize, in.m_pArray, m_pArray, scaleGrad.m_pArray, biasGrad.m_pArray,
saveMean.m_pArray, saveInvStdDev.m_pArray, GetStream());
}
Call<BackpropagateBatchNormGradients, ElemType>(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize, spatial,
in.m_pArray, m_pArray, grad.m_pArray, scale.m_pArray, scaleGrad.m_pArray, biasGrad.m_pArray, saveMean.m_pArray, saveInvStdDev.m_pArray, GetStream());
}
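The spatial and non-spatial paths differ only in which elements share one set of statistics; per statistic, the gradients follow the standard batch-normalization backward formulas, assuming saveInvStdDev holds 1/sigma from the forward pass. A plain single-row reference of that math (a sketch only; the kernels above fuse and parallelize these reductions and may accumulate rather than overwrite the gradients):
#include <cstddef>
// xhat_j = (x_j - mean) * invStd
// dScale = sum_j dy_j * xhat_j,   dBias = sum_j dy_j
// dx_j   = scale * invStd * (dy_j - dBias / N - xhat_j * dScale / N)
void BatchNormBackwardRow(const float* x, const float* dy, std::size_t N,
                          float mean, float invStd, float scale,
                          float* dx, float* dScale, float* dBias)
{
    float sumDy = 0, sumDyXhat = 0;
    for (std::size_t j = 0; j < N; ++j)
    {
        float xhat = (x[j] - mean) * invStd;
        sumDy += dy[j];
        sumDyXhat += dy[j] * xhat;
    }
    *dScale = sumDyXhat;
    *dBias = sumDy;
    for (std::size_t j = 0; j < N; ++j)
    {
        float xhat = (x[j] - mean) * invStd;
        dx[j] = scale * invStd * (dy[j] - sumDy / N - xhat * sumDyXhat / N);
    }
}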
#pragma region Static BLAS Functions
// float/double overloads of cublasSgemm()/cublasDgemm()
static cublasStatus_t cublas_gemm(cublasHandle_t handle, cublasOperation_t transa, cublasOperation_t transb, int m, int n, int k, const float* alpha, const float* A, int lda, const float* B, int ldb, const float* beta, float* C, int ldc)
@ -4216,6 +4390,9 @@ template void GPUMatrix<char>::SetValue(const char);
template void GPUMatrix<char>::SetValue(const size_t numRows, const size_t numCols, int deviceId, char* pArray, size_t matrixFlags);
template void GPUMatrix<char>::SetValue(GPUMatrix<char> const&);
template GPUMatrix<int>::GPUMatrix(const size_t, const size_t, int, int*, const size_t);
template GPUMatrix<int>::~GPUMatrix();
template int* TracingGPUMemoryAllocator::Allocate<int>(int, size_t);
template size_t* TracingGPUMemoryAllocator::Allocate<size_t>(int, size_t);
template long* TracingGPUMemoryAllocator::Allocate<long>(int, size_t);

View file

@ -45,6 +45,11 @@ typedef struct CUstream_st* cudaStream_t;
#define USE_TIME_BASED_SEED ULONG_MAX
#endif
// Max number of GPUs on a _single_ node.
#ifndef MAX_GPUS
#define MAX_GPUS 16
#endif
// Stream management functions
void MATH_API SetStream(cudaStream_t stream);
cudaStream_t MATH_API GetStream();
@ -100,7 +105,7 @@ class MATH_API GPUMatrix : public BaseMatrix<ElemType>
friend class GPUMatrix;
public:
static const int MaxGpus = 8; // support up to 8 GPUs
static const int MaxGpus = MAX_GPUS;
using BaseMatrix<ElemType>::m_computeDevice;
using BaseMatrix<ElemType>::m_elemSizeAllocated;
using BaseMatrix<ElemType>::m_format;
@ -402,6 +407,27 @@ public:
const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);
void ConvolutionForward(const GPUMatrix<ElemType>& kernel, const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIwht,
const GPUMatrix<int>& mpRowRun, const GPUMatrix<int>& runs, GPUMatrix<ElemType>& output) const;
void ConvolutionBackwardData(const GPUMatrix<ElemType>& kernel, const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIwht,
const GPUMatrix<int>& mpRowRun, const GPUMatrix<int>& runs, GPUMatrix<ElemType>& grad) const;
void ConvolutionBackwardKernel(const GPUMatrix<ElemType>& in, const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIwht,
const GPUMatrix<int>& mpRowRun, const GPUMatrix<int>& runs, GPUMatrix<ElemType>& kernelGrad) const;
void MaxPoolingForward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& output) const;
void MaxPoolingBackward(const GPUMatrix<ElemType>& out, const GPUMatrix<ElemType>& in,
const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices,
GPUMatrix<ElemType>& grad) const;
void AveragePoolingForward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& output) const;
void AveragePoolingBackward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& grad) const;
void BatchNormalizationForward(const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor,
GPUMatrix<ElemType>& runMean, GPUMatrix<ElemType>& runInvStdDev, GPUMatrix<ElemType>& out, double epsilon,
GPUMatrix<ElemType>& saveMean, GPUMatrix<ElemType>& saveInvStdDev) const;
void BatchNormalizationBackward(const GPUMatrix<ElemType>& in, GPUMatrix<ElemType>& grad, const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& saveMean, const GPUMatrix<ElemType>& saveInvStdDev,
GPUMatrix<ElemType>& scaleGrad, GPUMatrix<ElemType>& biasGrad) const;
public:
// static BLAS functions
static void MultiplyAndWeightedAdd(ElemType alpha, const GPUMatrix<ElemType>& a, const bool transposeA, const GPUMatrix<ElemType>& b, const bool transposeB, ElemType beta, GPUMatrix<ElemType>& c);

View file

@ -2644,6 +2644,10 @@ template GPUSparseMatrix<char> GPUSparseMatrix<char>::ColumnSlice(size_t startCo
template GPUMatrix<char> GPUSparseMatrix<char>::CopyColumnSliceToDense(size_t startColumn, size_t numCols) const;
template GPUSparseMatrix<char>& GPUSparseMatrix<char>::operator=(GPUSparseMatrix<char>&& deepCopy);
template GPUSparseMatrix<int>::GPUSparseMatrix(DEVICEID_TYPE, const MatrixFormat);
template GPUSparseMatrix<int>::~GPUSparseMatrix();
template void GPUSparseMatrix<int>::Resize(const size_t, const size_t, const size_t, const bool, bool);
template <class ElemType>
MATH_API File& operator>>(File& stream, GPUSparseMatrix<ElemType>& us)
{

View file

@ -156,8 +156,10 @@
<ClInclude Include="..\Common\Include\TensorShape.h" />
<ClInclude Include="..\Common\Include\File.h" />
<ClInclude Include="..\Common\Include\fileutil.h" />
<ClInclude Include="BatchNormalizationEngine.h" />
<ClInclude Include="CommonMatrix.h" />
<ClInclude Include="ConvolutionEngine.h" />
<ClInclude Include="ConvolveGeometry.h" />
<ClInclude Include="CPUMatrix.h" />
<ClInclude Include="MatrixQuantizerImpl.h" />
<ClInclude Include="TensorOps.h" />
@ -188,6 +190,7 @@
<ClCompile Include="..\Common\fileutil.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="BatchNormalizationEngine.cpp" />
<ClCompile Include="ConvolutionEngine.cpp" />
<ClCompile Include="CPUSparseMatrix.cpp" />
<ClCompile Include="CUDAPageLockedMemAllocator.cpp" />
@ -212,4 +215,4 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" />
</Project>
</Project>

View file

@ -44,6 +44,9 @@
<ClCompile Include="..\Common\ExceptionWithCallStack.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="BatchNormalizationEngine.cpp">
<Filter>BatchNormalization</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="CommonMatrix.h" />
@ -97,6 +100,12 @@
<ClInclude Include="MatrixQuantizerImpl.h">
<Filter>1bitSGD</Filter>
</ClInclude>
<ClInclude Include="ConvolveGeometry.h">
<Filter>Convolution</Filter>
</ClInclude>
<ClInclude Include="BatchNormalizationEngine.h">
<Filter>BatchNormalization</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<None Include="GPUMatrix.h">
@ -143,5 +152,8 @@
<Filter Include="1bitSGD">
<UniqueIdentifier>{546cacbd-253e-485b-8c8c-8b9ee0e2f631}</UniqueIdentifier>
</Filter>
<Filter Include="BatchNormalization">
<UniqueIdentifier>{8f982dac-298d-4e48-b060-8e6cba5ff554}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>

View file

@ -143,6 +143,7 @@ if exist "$(CuDnnDll)" (xcopy /Y "$(CuDnnDll)" $(OutputPath))
<ItemGroup>
<ClInclude Include="..\Common\Include\File.h" />
<ClInclude Include="..\Common\Include\fileutil.h" />
<ClInclude Include="CntkBatchNormalization.cuh" />
<ClInclude Include="ColumnQuantizer.h" />
<ClInclude Include="CommonMatrix.h" />
<ClInclude Include="cudabasetypes.h" />
@ -151,11 +152,12 @@ if exist "$(CuDnnDll)" (xcopy /Y "$(CuDnnDll)" $(OutputPath))
<ClInclude Include="cudalatticeops.cu.h" />
<ClInclude Include="cudalatticeops.h" />
<ClInclude Include="cudalib.h" />
<ClInclude Include="CuDnnConvolutionEngine.cuh" />
<ClInclude Include="CuDnnConvolutionEngine.h" />
<ClInclude Include="CuDnnCommon.h" />
<ClInclude Include="CuDnnFactories.h" />
<ClInclude Include="GPUDataTransferer.h" />
<ClInclude Include="GPUTensor.h" />
<ClInclude Include="latticefunctionskernels.h" />
<ClInclude Include="Convolution.cuh" />
<ClInclude Include="TensorOps.h" />
<ClInclude Include="ValueQuantizer.h" />
<None Include="GPUWatcher.h">
@ -170,6 +172,9 @@ if exist "$(CuDnnDll)" (xcopy /Y "$(CuDnnDll)" $(OutputPath))
<ClInclude Include="targetver.h" />
</ItemGroup>
<ItemGroup>
<CudaCompile Include="CuDnnBatchNormalization.cu">
<FileType>CppCode</FileType>
</CudaCompile>
<CudaCompile Include="GPUTensor.cu">
<InterleaveSourceInPTX Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</InterleaveSourceInPTX>
<Keep Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</Keep>
@ -190,6 +195,7 @@ if exist "$(CuDnnDll)" (xcopy /Y "$(CuDnnDll)" $(OutputPath))
<CudaCompile Include="CuDnnConvolutionEngine.cu">
<FileType>CppCode</FileType>
</CudaCompile>
<ClCompile Include="CuDnnCommon.cpp" />
<ClCompile Include="GPUDataTransferer.cpp" />
<ClCompile Include="stdafx.cpp">
<PrecompiledHeader>Create</PrecompiledHeader>

View file

@ -28,6 +28,9 @@
<CudaCompile Include="CuDnnConvolutionEngine.cu">
<Filter>GPU\Convolution</Filter>
</CudaCompile>
<CudaCompile Include="CuDnnBatchNormalization.cu">
<Filter>GPU\BatchNormalization</Filter>
</CudaCompile>
</ItemGroup>
<ItemGroup>
<ClCompile Include="cudalattice.cpp">
@ -45,6 +48,9 @@
<ClCompile Include="..\Common\ExceptionWithCallStack.cpp">
<Filter>Misc</Filter>
</ClCompile>
<ClCompile Include="CuDnnCommon.cpp">
<Filter>GPU\CuDnn</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\Common\Include\File.h">
@ -98,8 +104,8 @@
<ClInclude Include="CommonMatrix.h">
<Filter>from Math</Filter>
</ClInclude>
<ClInclude Include="CuDnnConvolutionEngine.h">
<Filter>GPU\Convolution</Filter>
<ClInclude Include="CuDnnFactories.h">
<Filter>GPU\CuDnn</Filter>
</ClInclude>
<ClInclude Include="TensorOps.h">
<Filter>from Math</Filter>
@ -107,7 +113,13 @@
<ClInclude Include="GPUDataTransferer.h">
<Filter>GPU</Filter>
</ClInclude>
<ClInclude Include="CuDnnConvolutionEngine.cuh">
<ClInclude Include="CntkBatchNormalization.cuh">
<Filter>GPU\BatchNormalization</Filter>
</ClInclude>
<ClInclude Include="CuDnnCommon.h">
<Filter>GPU\CuDnn</Filter>
</ClInclude>
<ClInclude Include="Convolution.cuh">
<Filter>GPU\Convolution</Filter>
</ClInclude>
</ItemGroup>
@ -150,5 +162,11 @@
<Filter Include="GPU\Convolution">
<UniqueIdentifier>{3155488f-128f-494e-858d-459b4cc9fab7}</UniqueIdentifier>
</Filter>
<Filter Include="GPU\BatchNormalization">
<UniqueIdentifier>{639ff4b6-39b5-4a5b-8856-ee918eeea91e}</UniqueIdentifier>
</Filter>
<Filter Include="GPU\CuDnn">
<UniqueIdentifier>{05351afa-de95-40c8-830a-d70eede55dc0}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>

View file

@ -3987,6 +3987,189 @@ Matrix<ElemType>& Matrix<ElemType>::AddAveragePoolingGradient(const Matrix<ElemT
#pragma endregion Other Helper Functions
template <class ElemType>
void Matrix<ElemType>::ConvolutionForward(const Matrix<ElemType>& kernel, const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIwht,
const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& output) const
{
assert(mpRowCol.GetNumCols() == 1);
assert(mpRowIwht.GetNumCols() == 1);
assert(mpRowRun.GetNumCols() == 1);
assert(runs.GetNumCols() == 1);
DecideAndMoveToRightDevice(*this, output);
// REVIEW alexeyk: add sparse version.
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->ConvolutionForward(*(kernel.m_CPUMatrix), *(mpRowCol.m_CPUMatrix), *(mpRowIwht.m_CPUMatrix),
*(mpRowRun.m_CPUMatrix), *(runs.m_CPUMatrix), *(output.m_CPUMatrix)),
m_GPUMatrix->ConvolutionForward(*(kernel.m_GPUMatrix), *(mpRowCol.m_GPUMatrix), *(mpRowIwht.m_GPUMatrix),
*(mpRowRun.m_GPUMatrix), *(runs.m_GPUMatrix), *(output.m_GPUMatrix)),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
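DISPATCH_MATRIX_ON_FLAG picks the implementation matching where the matrix currently lives: the first pair is the dense CPU and dense GPU call, and the two NOT_IMPLEMENTED slots are the sparse CPU and sparse GPU cases, which these new convolution, pooling and batch-norm entry points do not support yet. A simplified picture of that dispatch (hypothetical helper, not the actual macro):
#include <stdexcept>
enum class MatrixLocation { DenseCPU, DenseGPU, SparseCPU, SparseGPU };
template <class CpuFn, class GpuFn>
void DispatchDenseOnly(MatrixLocation loc, CpuFn denseCpu, GpuFn denseGpu)
{
    switch (loc)
    {
    case MatrixLocation::DenseCPU: denseCpu(); break;   // dense CPU path
    case MatrixLocation::DenseGPU: denseGpu(); break;   // dense GPU path
    default: throw std::logic_error("not implemented for sparse matrices");
    }
}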
template <class ElemType>
void Matrix<ElemType>::ConvolutionBackwardData(const Matrix<ElemType>& kernel, const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIwht,
const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& grad) const
{
assert(mpRowCol.GetNumCols() == 1);
assert(mpRowIwht.GetNumCols() == 1);
assert(mpRowRun.GetNumCols() == 1);
assert(runs.GetNumCols() == 1);
DecideAndMoveToRightDevice(*this, grad);
// REVIEW alexeyk: add sparse version.
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->ConvolutionBackwardData(*(kernel.m_CPUMatrix), *(mpRowCol.m_CPUMatrix), *(mpRowIwht.m_CPUMatrix),
*(mpRowRun.m_CPUMatrix), *(runs.m_CPUMatrix), *(grad.m_CPUMatrix)),
m_GPUMatrix->ConvolutionBackwardData(*(kernel.m_GPUMatrix), *(mpRowCol.m_GPUMatrix), *(mpRowIwht.m_GPUMatrix),
*(mpRowRun.m_GPUMatrix), *(runs.m_GPUMatrix), *(grad.m_GPUMatrix)),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
template <class ElemType>
void Matrix<ElemType>::ConvolutionBackwardKernel(const Matrix<ElemType>& in, const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIwht,
const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& kernelGrad) const
{
assert(mpRowCol.GetNumCols() == 1);
assert(mpRowIwht.GetNumCols() == 1);
assert(mpRowRun.GetNumCols() == 1);
assert(runs.GetNumCols() == 1);
DecideAndMoveToRightDevice(*this, kernelGrad);
// REVIEW alexeyk: add sparse version.
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->ConvolutionBackwardKernel(*(in.m_CPUMatrix), *(mpRowCol.m_CPUMatrix), *(mpRowIwht.m_CPUMatrix),
*(mpRowRun.m_CPUMatrix), *(runs.m_CPUMatrix), *(kernelGrad.m_CPUMatrix)),
m_GPUMatrix->ConvolutionBackwardKernel(*(in.m_GPUMatrix), *(mpRowCol.m_GPUMatrix), *(mpRowIwht.m_GPUMatrix),
*(mpRowRun.m_GPUMatrix), *(runs.m_GPUMatrix), *(kernelGrad.m_GPUMatrix)),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
template <class ElemType>
void Matrix<ElemType>::MaxPoolingForward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& output) const
{
assert(mpRowCol.GetNumCols() == 1);
assert(mpRowIndices.GetNumCols() == 1);
assert(indices.GetNumCols() == 1);
DecideAndMoveToRightDevice(*this, output);
// REVIEW alexeyk: add sparse version.
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->MaxPoolingForward(*(mpRowCol.m_CPUMatrix), *(mpRowIndices.m_CPUMatrix), *(indices.m_CPUMatrix), *(output.m_CPUMatrix)),
m_GPUMatrix->MaxPoolingForward(*(mpRowCol.m_GPUMatrix), *(mpRowIndices.m_GPUMatrix), *(indices.m_GPUMatrix), *(output.m_GPUMatrix)),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
template <class ElemType>
void Matrix<ElemType>::MaxPoolingBackward(const Matrix<ElemType>& out, const Matrix<ElemType>& in,
const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices,
Matrix<ElemType>& grad) const
{
assert(mpRowCol.GetNumCols() == 1);
assert(mpRowIndices.GetNumCols() == 1);
assert(indices.GetNumCols() == 1);
DecideAndMoveToRightDevice(*this, grad);
// REVIEW alexeyk: add sparse version.
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->MaxPoolingBackward(*(out.m_CPUMatrix), *(in.m_CPUMatrix),
*(mpRowCol.m_CPUMatrix), *(mpRowIndices.m_CPUMatrix), *(indices.m_CPUMatrix),
*(grad.m_CPUMatrix)),
m_GPUMatrix->MaxPoolingBackward(*(out.m_GPUMatrix), *(in.m_GPUMatrix),
*(mpRowCol.m_GPUMatrix), *(mpRowIndices.m_GPUMatrix), *(indices.m_GPUMatrix),
*(grad.m_GPUMatrix)),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
template <class ElemType>
void Matrix<ElemType>::AveragePoolingForward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& output) const
{
assert(mpRowCol.GetNumCols() == 1);
assert(mpRowIndices.GetNumCols() == 1);
assert(indices.GetNumCols() == 1);
DecideAndMoveToRightDevice(*this, output);
// REVIEW alexeyk: add sparse version.
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->AveragePoolingForward(*(mpRowCol.m_CPUMatrix), *(mpRowIndices.m_CPUMatrix), *(indices.m_CPUMatrix), *(output.m_CPUMatrix)),
m_GPUMatrix->AveragePoolingForward(*(mpRowCol.m_GPUMatrix), *(mpRowIndices.m_GPUMatrix), *(indices.m_GPUMatrix), *(output.m_GPUMatrix)),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
template <class ElemType>
void Matrix<ElemType>::AveragePoolingBackward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& grad) const
{
assert(mpRowCol.GetNumCols() == 1);
assert(mpRowIndices.GetNumCols() == 1);
assert(indices.GetNumCols() == 1);
DecideAndMoveToRightDevice(*this, grad);
// REVIEW alexeyk: add sparse version.
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->AveragePoolingBackward(*(mpRowCol.m_CPUMatrix), *(mpRowIndices.m_CPUMatrix), *(indices.m_CPUMatrix), *(grad.m_CPUMatrix)),
m_GPUMatrix->AveragePoolingBackward(*(mpRowCol.m_GPUMatrix), *(mpRowIndices.m_GPUMatrix), *(indices.m_GPUMatrix), *(grad.m_GPUMatrix)),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
template <class ElemType>
void Matrix<ElemType>::BatchNormalizationForward(const Matrix<ElemType>& scale, const Matrix<ElemType>& bias, double expAvgFactor, double blendFactor,
Matrix<ElemType>& runMean, Matrix<ElemType>& runInvStdDev, Matrix<ElemType>& out, double epsilon,
Matrix<ElemType>& saveMean, Matrix<ElemType>& saveInvStdDev) const
{
DecideAndMoveToRightDevice(*this, out);
// REVIEW alexeyk: add sparse version.
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->BatchNormalizationForward(*(scale.m_CPUMatrix), *(bias.m_CPUMatrix), expAvgFactor, blendFactor,
*(runMean.m_CPUMatrix), *(runInvStdDev.m_CPUMatrix),
*(out.m_CPUMatrix), epsilon, *(saveMean.m_CPUMatrix), *(saveInvStdDev.m_CPUMatrix)),
m_GPUMatrix->BatchNormalizationForward(*(scale.m_GPUMatrix), *(bias.m_GPUMatrix), expAvgFactor, blendFactor,
*(runMean.m_GPUMatrix), *(runInvStdDev.m_GPUMatrix),
*(out.m_GPUMatrix), epsilon, *(saveMean.m_GPUMatrix), *(saveInvStdDev.m_GPUMatrix)),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
template <class ElemType>
void Matrix<ElemType>::BatchNormalizationBackward(const Matrix<ElemType>& in, Matrix<ElemType>& grad, const Matrix<ElemType>& scale, const Matrix<ElemType>& saveMean, const Matrix<ElemType>& saveInvStdDev,
Matrix<ElemType>& scaleGrad, Matrix<ElemType>& biasGrad) const
{
DecideAndMoveToRightDevice(*this, grad);
// REVIEW alexeyk: add sparse version.
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->BatchNormalizationBackward(*(in.m_CPUMatrix), *(grad.m_CPUMatrix), *(scale.m_CPUMatrix),
*(saveMean.m_CPUMatrix), *(saveInvStdDev.m_CPUMatrix),
*(scaleGrad.m_CPUMatrix), *(biasGrad.m_CPUMatrix)),
m_GPUMatrix->BatchNormalizationBackward(*(in.m_GPUMatrix), *(grad.m_GPUMatrix), *(scale.m_GPUMatrix),
*(saveMean.m_GPUMatrix), *(saveInvStdDev.m_GPUMatrix),
*(scaleGrad.m_GPUMatrix), *(biasGrad.m_GPUMatrix)),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
#pragma region Static BLAS Functions
template <class ElemType>
@ -5108,4 +5291,6 @@ template void Matrix<char>::SetValue(const Matrix<char>&, MatrixFormat);
template bool Matrix<char>::IsEmpty() const;
template void Matrix<char>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, bool growOnly);
template Matrix<int>::Matrix(const size_t, const size_t, int*, DEVICEID_TYPE, const size_t, const size_t);
}}}

View file

@ -453,6 +453,27 @@ public:
const size_t outputWidth, const size_t outputHeight, const size_t outputSizePerSample,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample);
void ConvolutionForward(const Matrix<ElemType>& kernel, const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIwht,
const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& output) const;
void ConvolutionBackwardData(const Matrix<ElemType>& kernel, const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIwht,
const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& grad) const;
void ConvolutionBackwardKernel(const Matrix<ElemType>& in, const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIwht,
const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& kernelGrad) const;
void MaxPoolingForward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& output) const;
void MaxPoolingBackward(const Matrix<ElemType>& out, const Matrix<ElemType>& in,
const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices,
Matrix<ElemType>& grad) const;
void AveragePoolingForward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& output) const;
void AveragePoolingBackward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& grad) const;
void BatchNormalizationForward(const Matrix<ElemType>& scale, const Matrix<ElemType>& bias, double expAvgFactor, double blendFactor,
Matrix<ElemType>& runMean, Matrix<ElemType>& runInvStdDev, Matrix<ElemType>& out, double epsilon,
Matrix<ElemType>& saveMean, Matrix<ElemType>& saveInvStdDev) const;
void BatchNormalizationBackward(const Matrix<ElemType>& in, Matrix<ElemType>& grad, const Matrix<ElemType>& scale, const Matrix<ElemType>& saveMean, const Matrix<ElemType>& saveInvStdDev,
Matrix<ElemType>& scaleGrad, Matrix<ElemType>& biasGrad) const;
public:
// TODO: why are these not static? And why are they here?
ElemType Exp10(ElemType num);

View file

@ -12,7 +12,7 @@
#include "GPUMatrix.h"
#include "GPUSparseMatrix.h"
#include "MatrixQuantizerGPU.h"
#include "CuDnnConvolutionEngine.h"
#include "CuDnnFactories.h"
#include "TensorShape.h"
#include "GPUDataTransferer.h"
@ -676,6 +676,7 @@ void GPUSparseMatrix<ElemType>::CopyBuffer(OutType* outBuffer, const InType* inB
template class MATH_API GPUSparseMatrix<char>;
template class MATH_API GPUSparseMatrix<float>;
template class MATH_API GPUSparseMatrix<double>;
template class MATH_API GPUSparseMatrix<int>;
template <typename ElemType>
MATH_API File& operator>>(File& stream, GPUSparseMatrix<ElemType>& us)
@ -1728,6 +1729,60 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AddAveragePoolingGradient(const GPUMat
return *this;
}
template <class ElemType>
void GPUMatrix<ElemType>::ConvolutionForward(const GPUMatrix<ElemType>& kernel, const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIwht,
const GPUMatrix<int>& mpRowRun, const GPUMatrix<int>& runs, GPUMatrix<ElemType>& output) const
{
}
template <class ElemType>
void GPUMatrix<ElemType>::ConvolutionBackwardData(const GPUMatrix<ElemType>& kernel, const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIwht,
const GPUMatrix<int>& mpRowRun, const GPUMatrix<int>& runs, GPUMatrix<ElemType>& grad) const
{
}
template <class ElemType>
void GPUMatrix<ElemType>::ConvolutionBackwardKernel(const GPUMatrix<ElemType>& in, const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIwht,
const GPUMatrix<int>& mpRowRun, const GPUMatrix<int>& runs, GPUMatrix<ElemType>& kernelGrad) const
{
}
template <class ElemType>
void GPUMatrix<ElemType>::MaxPoolingForward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& output) const
{
}
template <class ElemType>
void GPUMatrix<ElemType>::MaxPoolingBackward(const GPUMatrix<ElemType>& out, const GPUMatrix<ElemType>& in,
const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices,
GPUMatrix<ElemType>& grad) const
{
}
template <class ElemType>
void GPUMatrix<ElemType>::AveragePoolingForward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& output) const
{
}
template <class ElemType>
void GPUMatrix<ElemType>::AveragePoolingBackward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& grad) const
{
}
template <class ElemType>
void GPUMatrix<ElemType>::BatchNormalizationForward(const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor,
GPUMatrix<ElemType>& runMean, GPUMatrix<ElemType>& runInvStdDev, GPUMatrix<ElemType>& out, double epsilon,
GPUMatrix<ElemType>& saveMean, GPUMatrix<ElemType>& saveInvStdDev) const
{
}
template <class ElemType>
void GPUMatrix<ElemType>::BatchNormalizationBackward(const GPUMatrix<ElemType>& in, GPUMatrix<ElemType>& grad, const GPUMatrix<ElemType>& scale,
const GPUMatrix<ElemType>& saveMean, const GPUMatrix<ElemType>& saveInvStdDev,
GPUMatrix<ElemType>& scaleGrad, GPUMatrix<ElemType>& biasGrad) const
{
}
#pragma endregion Other helper functions
#pragma region Static BLAS Functions
@ -2096,6 +2151,7 @@ void GPUDataTransferer<ElemType>::WaitForCopyCPUToGPUAsync()
template class GPUMatrix<char>;
template class GPUMatrix<float>;
template class GPUMatrix<double>;
template class GPUMatrix<int>;
template class DeviceBoundNumber<float>;
template class DeviceBoundNumber<double>;
template MatrixQuantizerGPU<float>::~MatrixQuantizerGPU();
@ -2113,45 +2169,14 @@ template <class ElemType>
void* GPUMatrix<ElemType>::s_curandGenerator = NULL;
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::Tensor4DPtr CuDnnConvolutionEngineFactory<ElemType>::CreateTensor(size_t, size_t, size_t, size_t)
std::unique_ptr<ConvolutionEngine<ElemType>> CuDnnConvolutionEngineFactory<ElemType>::Create(ConvolveGeometryPtr, DEVICEID_TYPE,
ImageLayoutKind, size_t, PoolKind)
{
RuntimeError("The code is compiled with CPUONLY macro.");
}
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::FilterPtr CuDnnConvolutionEngineFactory<ElemType>::CreateFilter(size_t, size_t, size_t, size_t)
{
RuntimeError("The code is compiled with CPUONLY macro.");
}
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::ConvDescPtr CuDnnConvolutionEngineFactory<ElemType>::CreateConvDescriptor(
const Tensor4D&, const Filter&, size_t, size_t, bool)
{
RuntimeError("The code is compiled with CPUONLY macro.");
}
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::PoolDescPtr CuDnnConvolutionEngineFactory<ElemType>::CreatePoolDescriptor(
typename PoolDesc::PoolKind, size_t, size_t, size_t, size_t, size_t, size_t)
{
RuntimeError("The code is compiled with CPUONLY macro.");
}
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::ConvEnginePtr CuDnnConvolutionEngineFactory<ElemType>::CreateConvEngine(DEVICEID_TYPE, ImageLayoutKind, size_t, BatchNormImpl)
{
RuntimeError("The code is compiled with CPUONLY macro.");
}
template <class ElemType>
typename CuDnnConvolutionEngineFactory<ElemType>::PoolEnginePtr CuDnnConvolutionEngineFactory<ElemType>::CreatePoolEngine(DEVICEID_TYPE, ImageLayoutKind)
{
RuntimeError("The code is compiled with CPUONLY macro.");
}
template <class ElemType>
bool CuDnnConvolutionEngineFactory<ElemType>::IsSupported(DEVICEID_TYPE)
bool CuDnnConvolutionEngineFactory<ElemType>::IsSupported(DEVICEID_TYPE, ConvolveGeometryPtr, PoolKind)
{
return false;
}
@ -2159,6 +2184,16 @@ bool CuDnnConvolutionEngineFactory<ElemType>::IsSupported(DEVICEID_TYPE)
template class CuDnnConvolutionEngineFactory<float>;
template class CuDnnConvolutionEngineFactory<double>;
template <class ElemType>
std::unique_ptr<BatchNormEngine<ElemType>> CuDnnBatchNormEngineFactory<ElemType>::Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
bool spatial, ImageLayoutKind imageLayout)
{
RuntimeError("The code is compiled with CPUONLY macro.");
}
template class CuDnnBatchNormEngineFactory<float>;
template class CuDnnBatchNormEngineFactory<double>;
CudaTimer::~CudaTimer()
{
}

View file

@ -18,29 +18,39 @@ CNTKTextFormatReader::CNTKTextFormatReader(MemoryProviderPtr provider,
m_provider(provider)
{
TextConfigHelper configHelper(config);
if (configHelper.GetElementType() == ElementType::tfloat)
{
m_deserializer = shared_ptr<IDataDeserializer>(new TextParser<float>(configHelper));
}
else
{
m_deserializer = shared_ptr<IDataDeserializer>(new TextParser<double>(configHelper));
}
TransformerPtr randomizer;
if (configHelper.ShouldRandomize())
try
{
randomizer = make_shared<BlockRandomizer>(0, SIZE_MAX, m_deserializer);
if (configHelper.GetElementType() == ElementType::tfloat)
{
m_deserializer = shared_ptr<IDataDeserializer>(new TextParser<float>(configHelper));
}
else
{
m_deserializer = shared_ptr<IDataDeserializer>(new TextParser<double>(configHelper));
}
size_t window = configHelper.GetRandomizationWindow();
TransformerPtr randomizer;
if (window > 0)
{
// Verbosity is a general config parameter, not specific to the text format reader.
int verbosity = config(L"verbosity", 2);
randomizer = make_shared<BlockRandomizer>(verbosity, window, m_deserializer);
}
else
{
randomizer = std::make_shared<NoRandomizer>(m_deserializer);
}
randomizer->Initialize(nullptr, config);
m_transformer = randomizer;
}
else
catch (const std::runtime_error& e)
{
randomizer = std::make_shared<NoRandomizer>(m_deserializer);
RuntimeError("CNTKTextFormatReader: While reading '%ls': %s", configHelper.GetFilePath().c_str(), e.what());
}
randomizer->Initialize(nullptr, config);
m_transformer = randomizer;
}
std::vector<StreamDescriptionPtr> CNTKTextFormatReader::GetStreamDescriptions()

View file

@ -90,7 +90,6 @@
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\Include\basetypes.h" />
<ClInclude Include="..\..\Common\Include\DataReader.h" />
<ClInclude Include="..\..\Common\Include\File.h" />
<ClInclude Include="..\..\Common\Include\fileutil.h" />

View file

@ -27,9 +27,6 @@
<ItemGroup>
<ClInclude Include="stdafx.h" />
<ClInclude Include="targetver.h" />
<ClInclude Include="..\..\Common\Include\basetypes.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\..\Common\Include\DataReader.h">
<Filter>Common\Include</Filter>
</ClInclude>

View file

@ -5,6 +5,7 @@
#include "stdafx.h"
#include "TextConfigHelper.h"
#include "DataReader.h"
#include "StringUtil.h"
using std::string;
@ -105,19 +106,25 @@ TextConfigHelper::TextConfigHelper(const ConfigParameters& config)
m_filepath = msra::strfun::utf16(config(L"file"));
string rand = config(L"randomize", "auto");
if (AreEqualIgnoreCase(rand, "auto"))
if (config.Exists(L"randomize"))
{
m_randomize = true;
}
else if (AreEqualIgnoreCase(rand, "none"))
{
m_randomize = false;
}
wstring randomizeString = config.CanBeString(L"randomize") ? config(L"randomize") : wstring();
if (!_wcsicmp(randomizeString.c_str(), L"none"))
{
m_randomizationWindow = randomizeNone;
}
else if (!_wcsicmp(randomizeString.c_str(), L"auto"))
{
m_randomizationWindow = randomizeAuto;
}
else
{
m_randomizationWindow = config(L"randomize");
}
}
else
{
RuntimeError("'randomize' parameter must be set to 'auto' or 'none'");
m_randomizationWindow = randomizeAuto;
}
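After this change, randomize accepts three spellings: none (no randomization), auto (default window), or a number taken as the randomization window; omitting the key behaves like auto. A sketch of the resulting mapping (randomizeNone and randomizeAuto are assumed to be sentinel constants such as 0 and SIZE_MAX; the real constants live in the reader headers, and the real comparison is case-insensitive):
#include <cstddef>
#include <cstdint>
#include <string>
std::size_t ParseRandomizationWindow(const std::wstring& value) // empty == key not given
{
    const std::size_t randomizeNone = 0;
    const std::size_t randomizeAuto = SIZE_MAX;
    if (value.empty() || value == L"auto")
        return randomizeAuto;
    if (value == L"none")
        return randomizeNone;
    return static_cast<std::size_t>(std::stoull(value)); // explicit randomization window
}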
m_skipSequenceIds = config(L"skipSequenceIds", false);

View file

@ -25,7 +25,7 @@ public:
// Get full path to the input file.
const wstring& GetFilePath() const { return m_filepath; }
bool ShouldRandomize() const { return m_randomize; }
size_t GetRandomizationWindow() const { return m_randomizationWindow; }
bool ShouldSkipSequenceIds() const { return m_skipSequenceIds; }
@ -44,7 +44,7 @@ public:
private:
std::wstring m_filepath;
std::vector<StreamDescriptor> m_streams;
bool m_randomize;
size_t m_randomizationWindow;
ElementType m_elementType;
bool m_skipSequenceIds;
unsigned int m_maxErrors;

View file

@ -32,13 +32,10 @@ HTKDataDeserializer::HTKDataDeserializer(
m_corpus(corpus),
m_totalNumberOfFrames(0)
{
// Currently we only support frame mode.
// TODO: Support of full sequences.
bool frameMode = feature.Find("frameMode", "true");
if (!frameMode)
{
LogicError("Currently only reader only supports frame mode. Please check your configuration.");
}
// The frame mode is currently specified once per configuration,
// not in the configuration of a particular deserializer, but on a higher level in the configuration.
// Because of that we are using find method below.
m_frameMode = feature.Find("frameMode", "true");
ConfigHelper config(feature);
config.CheckFeatureType();
@ -49,11 +46,18 @@ HTKDataDeserializer::HTKDataDeserializer(
m_dimension = config.GetFeatureDimension();
m_dimension = m_dimension * (1 + context.first + context.second);
m_augmentationWindow = config.GetContextWindow();
InitializeChunkDescriptions(config);
InitializeStreams(featureName);
InitializeFeatureInformation();
m_augmentationWindow = config.GetContextWindow();
// If not given explicitly, we need to identify the required augmentation range from the expected dimension
// and the number of dimensions in the file.
if (m_augmentationWindow.first == 0 && m_augmentationWindow.second == 0)
{
m_augmentationWindow.first = m_augmentationWindow.second = msra::dbn::augmentationextent(m_ioFeatureDimension, m_dimension);
}
}
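When no context window is configured, the extent is recovered from the ratio between the dimension the network expects and the per-frame dimension stored in the feature archive. Assuming msra::dbn::augmentationextent computes (modelDim / frameDim - 1) / 2, a 39-dimensional archive feeding a network that expects 429-dimensional inputs yields an 11-frame stack, i.e. 5 frames of context on each side:
#include <cstddef>
#include <stdexcept>
// Assumed formula (the real helper is msra::dbn::augmentationextent).
std::size_t AugmentationExtent(std::size_t frameDim, std::size_t modelDim)
{
    std::size_t frames = modelDim / frameDim; // size of the stacked window, e.g. 429 / 39 == 11
    if (frames * frameDim != modelDim || frames % 2 == 0)
        throw std::runtime_error("model dimension must be an odd multiple of the frame dimension");
    return (frames - 1) / 2; // frames of context on each side, e.g. 5
}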
// Initializes chunks based on the configuration and utterance descriptions.
@ -170,7 +174,9 @@ ChunkDescriptions HTKDataDeserializer::GetChunkDescriptions()
auto cd = make_shared<ChunkDescription>();
cd->m_id = i;
cd->m_numberOfSamples = m_chunks[i].GetTotalFrames();
cd->m_numberOfSequences = m_chunks[i].GetTotalFrames();
// In frame mode, each frame is represented as a sequence.
// The augmentation is still done only over frames of the same utterance; see the GetSequenceById method.
cd->m_numberOfSequences = m_frameMode ? m_chunks[i].GetTotalFrames() : m_chunks[i].GetNumberOfUtterances();
chunks.push_back(cd);
}
return chunks;
@ -187,16 +193,32 @@ void HTKDataDeserializer::GetSequencesForChunk(size_t chunkId, vector<SequenceDe
{
auto utterance = chunk.GetUtterance(i);
size_t major = utterance->GetId();
// Because it is a frame mode, creating sequences for each frame.
for (size_t k = 0; k < utterance->GetNumberOfFrames(); ++k)
if (m_frameMode)
{
// Because it is a frame mode, creating a sequence for each frame.
for (size_t k = 0; k < utterance->GetNumberOfFrames(); ++k)
{
SequenceDescription f;
f.m_chunkId = chunkId;
f.m_key.m_major = major;
f.m_key.m_minor = k;
f.m_id = offsetInChunk++;
f.m_isValid = true;
f.m_numberOfSamples = 1;
result.push_back(f);
}
}
else
{
// Creating sequence description per utterance.
SequenceDescription f;
f.m_chunkId = chunkId;
f.m_key.m_major = major;
f.m_key.m_minor = k;
f.m_key.m_minor = 0;
f.m_id = offsetInChunk++;
f.m_isValid = true;
f.m_numberOfSamples = 1;
f.m_numberOfSamples = utterance->GetNumberOfFrames();
result.push_back(f);
}
}
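The effect on the timeline is simple to state: in frame mode an utterance of N frames contributes N single-sample sequences whose minor key is the frame index, while in sequence mode it contributes exactly one N-sample sequence with minor key 0. A counting sketch:
#include <cstddef>
#include <utility>
// Returns {number of sequence descriptions, samples per sequence} for one utterance.
std::pair<std::size_t, std::size_t> SequencesPerUtterance(std::size_t numFrames, bool frameMode)
{
    return frameMode ? std::make_pair(numFrames, std::size_t(1))
                     : std::make_pair(std::size_t(1), numFrames);
}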
@ -204,7 +226,7 @@ void HTKDataDeserializer::GetSequencesForChunk(size_t chunkId, vector<SequenceDe
// A wrapper around a matrix that views it as a vector of column vectors.
// Does not have any memory associated.
class MatrixAsVectorOfVectors
class MatrixAsVectorOfVectors
{
public:
MatrixAsVectorOfVectors(msra::dbn::matrixbase& m)
@ -245,7 +267,7 @@ public:
});
}
// Gets data for the sequnce.
// Gets data for the sequence.
virtual void GetSequence(size_t sequenceId, vector<SequenceDataPtr>& result) override
{
m_parent->GetSequenceById(m_chunkId, sequenceId, result);
@ -277,73 +299,117 @@ ChunkPtr HTKDataDeserializer::GetChunk(size_t chunkId)
return chunk;
};
// This class stores sequence data for HTK,
// - for floats: a simple pointer to the chunk data
// - for doubles: allocated array of doubles which is freed when the sequence is no longer used.
struct HTKSequenceData : DenseSequenceData
// A matrix that stores all samples of a sequence without padding (differently from ssematrix).
// The number of columns equals the number of samples in the sequence.
// The number of rows equals the size of the feature vector of a sample (= dimensions).
class FeatureMatrix
{
msra::dbn::matrix m_buffer;
~HTKSequenceData()
public:
FeatureMatrix(size_t numRows, size_t numColumns) : m_numRows(numRows), m_numColumns(numColumns)
{
msra::dbn::matrixstripe frame(m_buffer, 0, m_buffer.cols());
// Checking if m_data just a pointer in to the
if (m_data != &frame(0, 0))
{
delete[] reinterpret_cast<double*>(m_data);
m_data = nullptr;
}
m_data.resize(m_numRows * m_numColumns);
}
// Returns a reference to the column.
inline array_ref<float> col(size_t column)
{
return array_ref<float>(m_data.data() + m_numRows * column, m_numRows);
}
// Gets pointer to the data.
inline float* GetData()
{
return m_data.data();
}
// Gets the number of columns. It equals the number of samples in the sequence/utterance.
inline size_t GetNumberOfColumns() const
{
return m_numColumns;
}
// Gets total size in elements of stored features.
inline size_t GetTotalSize() const
{
return m_data.size();
}
private:
// Features
std::vector<float> m_data;
// Number of rows = dimension of the feature
size_t m_numRows;
// Number of columns = number of samples in utterance.
size_t m_numColumns;
};
typedef shared_ptr<HTKSequenceData> HTKSequenceDataPtr;
// This class stores sequence data for HTK for floats.
struct HTKFloatSequenceData : DenseSequenceData
{
HTKFloatSequenceData(FeatureMatrix&& data) : m_buffer(data)
{
m_numberOfSamples = data.GetNumberOfColumns();
m_data = m_buffer.GetData();
}
// Get a sequence by its chunk id and id.
private:
FeatureMatrix m_buffer;
};
// This class stores sequence data for HTK for doubles.
struct HTKDoubleSequenceData : DenseSequenceData
{
HTKDoubleSequenceData(FeatureMatrix& data) : m_buffer(data.GetData(), data.GetData() + data.GetTotalSize())
{
m_numberOfSamples = data.GetNumberOfColumns();
m_data = m_buffer.data();
}
private:
std::vector<double> m_buffer;
};
// Get a sequence by its chunk id and sequence id.
// Sequence ids are guaranteed to be unique inside a chunk.
void HTKDataDeserializer::GetSequenceById(size_t chunkId, size_t id, vector<SequenceDataPtr>& r)
{
const auto& chunkDescription = m_chunks[chunkId];
    size_t utteranceIndex = m_frameMode ? chunkDescription.GetUtteranceForChunkFrameIndex(id) : id;
    const UtteranceDescription* utterance = chunkDescription.GetUtterance(utteranceIndex);
    auto utteranceFrames = chunkDescription.GetUtteranceFrames(utteranceIndex);

    // wrapper that allows m[j].size() and m[j][i] as required by augmentneighbors()
    MatrixAsVectorOfVectors utteranceFramesWrapper(utteranceFrames);
    FeatureMatrix features(m_dimension, m_frameMode ? 1 : utterance->GetNumberOfFrames());

    if (m_frameMode)
    {
        // For frame mode augment a single frame.
        size_t frameIndex = id - utterance->GetStartFrameIndexInsideChunk();
        msra::dbn::augmentneighbors(utteranceFramesWrapper, vector<char>(), frameIndex, m_augmentationWindow.first, m_augmentationWindow.second, features, 0);
    }
    else
    {
        // Augment complete utterance.
        for (size_t frameIndex = 0; frameIndex < utterance->GetNumberOfFrames(); ++frameIndex)
        {
            msra::dbn::augmentneighbors(utteranceFramesWrapper, vector<char>(), frameIndex, m_augmentationWindow.first, m_augmentationWindow.second, features, frameIndex);
        }
    }

    // Copy features to the sequence depending on the type.
    DenseSequenceDataPtr result;
    if (m_elementType == ElementType::tdouble)
    {
        result = make_shared<HTKDoubleSequenceData>(features);
    }
    else if (m_elementType == ElementType::tfloat)
    {
        result = make_shared<HTKFloatSequenceData>(std::move(features));
    }
    else
    {
        LogicError("Currently, HTK Deserializer supports only double and float types.");
    }

    r.push_back(result);
}
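The context expansion itself is delegated to the legacy msra::dbn::augmentneighbors helper, whose exact signature is not reproduced here. As a hedged illustration of what such context-window augmentation does, the sketch below stacks each frame with its left/right neighbors and clamps indices at the utterance boundaries; AugmentFrame and its parameters are invented for this example and are not the helper's real interface.

#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

// Illustrative context-window augmentation: for each output frame, stack
// 'left' previous frames, the frame itself, and 'right' following frames,
// clamping indices at the utterance boundaries (edge frames are repeated).
static std::vector<float> AugmentFrame(const std::vector<std::vector<float>>& frames,
                                       size_t center, size_t left, size_t right)
{
    const size_t dim = frames[0].size();
    std::vector<float> augmented;
    augmented.reserve((left + 1 + right) * dim);
    for (ptrdiff_t offset = -(ptrdiff_t)left; offset <= (ptrdiff_t)right; ++offset)
    {
        ptrdiff_t index = (ptrdiff_t)center + offset;
        index = std::max<ptrdiff_t>(0, std::min<ptrdiff_t>(index, (ptrdiff_t)frames.size() - 1));
        const auto& f = frames[(size_t)index];
        augmented.insert(augmented.end(), f.begin(), f.end());
    }
    return augmented;
}

int main()
{
    // A tiny "utterance" of 4 frames with 2-dimensional features.
    std::vector<std::vector<float>> utterance = {{0, 0}, {1, 1}, {2, 2}, {3, 3}};
    // Context window of 1 frame on each side: output dimension becomes 3 * 2 = 6.
    auto frame0 = AugmentFrame(utterance, 0, 1, 1); // at the boundary, frame 0 is repeated on the left
    printf("augmented dim = %zu, first value = %.0f\n", frame0.size(), frame0[0]);
    return 0;
}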

Просмотреть файл

@ -66,6 +66,9 @@ private:
// Total number of frames.
size_t m_totalNumberOfFrames;
    // Flag that indicates whether single speech frames should be exposed as individual sequences.
bool m_frameMode;
// Auxiliary data for checking against the data in the feature file.
unsigned int m_samplePeriod;
size_t m_ioFeatureDimension;

Просмотреть файл

@ -11,6 +11,11 @@
#include "ConfigHelper.h"
#include "Bundler.h"
#include "StringUtil.h"
#include "SequencePacker.h"
#include "SampleModePacker.h"
#include "BpttPacker.h"
#include "BlockRandomizer.h"
#include "NoRandomizer.h"
namespace Microsoft { namespace MSR { namespace CNTK {
@ -61,23 +66,58 @@ HTKMLFReader::HTKMLFReader(MemoryProviderPtr provider,
// TODO: deserializers and transformers will be dynamically loaded
// from external libraries based on the configuration/brain script.
    bool frameMode = readerConfig(L"frameMode", true);
bool truncated = readerConfig(L"truncated", false);
if (frameMode && truncated)
{
LogicError("frameMode and truncated BPTT are mutually exclusive.");
}
if (frameMode)
{
m_packingMode = PackingMode::sample;
}
else if (truncated)
{
m_packingMode = PackingMode::truncated;
}
else
{
m_packingMode = PackingMode::sequence;
}
    // nbruttsineachrecurrentiter is the old reader configuration; truncationLength is the new one.
    // If truncationLength is specified, we estimate the number of parallel sequences
    // to pack as max(1, mbSize / truncationLength).
    // If nbruttsineachrecurrentiter is specified, we assume that the truncation length is mbSize
    // and that the real minibatch size is mbSize * nbruttsineachrecurrentiter[epochIndex].
m_truncationLength = readerConfig(L"truncationLength", 0);
m_numParallelSequencesForAllEpochs =
readerConfig(L"nbruttsineachrecurrentiter", ConfigParameters::Array(intargvector(vector<int> { 1 })));
ConfigHelper config(readerConfig);
size_t window = config.GetRandomizationWindow();
auto deserializers = CreateDeserializers(readerConfig);
assert(deserializers.size() == 2);
auto bundler = std::make_shared<Bundler>(readerConfig, deserializers[0], deserializers, false);
int verbosity = readerConfig(L"verbosity", 2);
std::wstring readMethod = config.GetRandomizer();
    // TODO: this should be bool. Change when config per deserializer is allowed.
    if (AreEqualIgnoreCase(readMethod, std::wstring(L"blockRandomize")))
    {
        m_randomizer = std::make_shared<BlockRandomizer>(verbosity, window, bundler, BlockRandomizer::DecimationMode::chunk, true /* useLegacyRandomization */);
}
else if (AreEqualIgnoreCase(readMethod, std::wstring(L"none")))
{
m_randomizer = std::make_shared<NoRandomizer>(bundler);
}
else
{
RuntimeError("readMethod must be 'blockRandomize' or 'none'.");
}
    m_randomizer->Initialize(nullptr, readerConfig);
// Create output stream descriptions (all dense)
@ -107,11 +147,57 @@ void HTKMLFReader::StartEpoch(const EpochConfiguration& config)
}
m_randomizer->StartEpoch(config);
    // TODO: should we unify sample and sequence mode packers into a single one?
    // TODO: Functionally they are the same; the only difference is how we handle
    // TODO: the MBLayout and what the perf hit is for iterating/copying sequences.
    // TODO: Should do more perf tests before unifying these two.
    // TODO: As the next step the packers will be moved out of the readers into the
    // TODO: core CNTK. They are format agnostic and can be used with any type of
    // TODO: deserializer.
switch (m_packingMode)
{
case PackingMode::sample:
m_packer = std::make_shared<SampleModePacker>(
m_provider,
m_randomizer,
config.m_minibatchSizeInSamples,
m_streams);
break;
case PackingMode::sequence:
m_packer = std::make_shared<SequencePacker>(
m_provider,
m_randomizer,
config.m_minibatchSizeInSamples,
m_streams);
break;
case PackingMode::truncated:
{
size_t minibatchSize = config.m_minibatchSizeInSamples;
size_t truncationLength = m_truncationLength;
if (truncationLength == 0)
{
// Old config, the truncation length is specified as the minibatch size.
// In this case the truncation size is mbSize
// and the real minibatch size is truncation size * nbruttsineachrecurrentiter
            fprintf(stderr, "Legacy configuration is used for truncated BPTT mode, please adapt the config to explicitly specify truncationLength.\n");
truncationLength = minibatchSize;
size_t numParallelSequences = m_numParallelSequencesForAllEpochs[config.m_epochIndex];
minibatchSize = numParallelSequences * truncationLength;
}
m_packer = std::make_shared<BpttPacker>(
m_provider,
m_randomizer,
minibatchSize,
truncationLength,
m_streams);
break;
}
default:
LogicError("Unsupported type of packer '%d'.", (int)m_packingMode);
}
}
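The truncated-BPTT arithmetic handled above (explicit truncationLength versus the legacy nbruttsineachrecurrentiter path) can be illustrated with a short, self-contained sketch. BpttGeometry and ComputeGeometry are hypothetical names introduced only for this example; they merely reproduce the two configuration paths described in the constructor comments, not the reader's actual classes or config parsing.

#include <algorithm>
#include <cstddef>
#include <cstdio>

// Minibatch geometry for truncated BPTT under the two configuration styles.
struct BpttGeometry
{
    size_t truncationLength;       // number of time steps per truncated segment
    size_t minibatchSizeInSamples; // total samples packed per minibatch
    size_t parallelSequences;      // how many sequences are laid out side by side
};

static BpttGeometry ComputeGeometry(size_t minibatchSize, size_t truncationLength,
                                    size_t nbruttsineachrecurrentiter)
{
    BpttGeometry g;
    if (truncationLength != 0)
    {
        // New-style config: truncationLength given explicitly; the number of
        // parallel sequences follows as max(1, minibatchSize / truncationLength).
        g.truncationLength = truncationLength;
        g.minibatchSizeInSamples = minibatchSize;
        g.parallelSequences = std::max<size_t>(1, minibatchSize / truncationLength);
    }
    else
    {
        // Legacy config: the minibatch size acts as the truncation length and the
        // real minibatch size is truncationLength * nbruttsineachrecurrentiter.
        g.truncationLength = minibatchSize;
        g.parallelSequences = nbruttsineachrecurrentiter;
        g.minibatchSizeInSamples = g.truncationLength * g.parallelSequences;
    }
    return g;
}

int main()
{
    // New config: minibatchSize=256, truncationLength=20 -> 12 parallel sequences.
    BpttGeometry a = ComputeGeometry(256, 20, 1);
    // Legacy config: minibatchSize=20, nbruttsineachrecurrentiter=32 -> 640 samples per minibatch.
    BpttGeometry b = ComputeGeometry(20, 0, 32);
    printf("new: %zu x %zu = %zu samples\n", a.parallelSequences, a.truncationLength, a.minibatchSizeInSamples);
    printf("legacy: %zu x %zu = %zu samples\n", b.parallelSequences, b.truncationLength, b.minibatchSizeInSamples);
    return 0;
}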
Minibatch HTKMLFReader::ReadMinibatch()
