Updated ResNet samples.

2016-01-21 15:19:48 -08:00 · 2016-01-21 15:19:48 -08:00 · 08e4b993e0
--- a/Examples/Image/Miscellaneous/ImageNet/ResNet/Macros.ndl
+++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/Macros.ndl
@ -1,146 +1,95 @@
-ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue)
+ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, expAvg)
 {
-    W = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
    b = Parameter(outMap, 1, init = fixedValue, value = bValue)
    sc = Parameter(outMap, 1, init = fixedValue, value = scValue)
    m = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
    isd = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
    
    c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
-    bn = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true, expAvgFactor = 1.0, imageLayout = "cudnn")
-    y = RectifiedLinear(bn);
+    y = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true, expAvgFactor = expAvg, imageLayout = "cudnn")
 }

-# Standard building block for ResNet.
-ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue)
+ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, expAvg)
 {
-    W1 = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
-    b1 = Parameter(outMap, 1, init = fixedValue, value = bValue)
-    sc1 = Parameter(outMap, 1, init = fixedValue, value = scValue)
-    m1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
-    isd1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
-    
-    c1 = Convolution(W1, inp, kW, kH, outMap, 1, 1, zeroPadding = true, imageLayout = "cudnn")
-    bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true, expAvgFactor = 1.0, imageLayout = "cudnn")
-    y1 = RectifiedLinear(bn1);
-    
-    W2 = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
-    b2 = Parameter(outMap, 1, init = fixedValue, value = bValue)
-    sc2 = Parameter(outMap, 1, init = fixedValue, value = scValue)
-    m2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
-    isd2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
-    
-    c2 = Convolution(W2, y1, kW, kH, outMap, 1, 1, zeroPadding = true, imageLayout = "cudnn")
-    bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true, expAvgFactor = 1.0, imageLayout = "cudnn")
-    p = Plus(bn2, inp)
-    y2 = RectifiedLinear(p);
+    W = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
+    c = ConvBNLayerW(W, inp, outMap, kW, kH, hStride, vStride, bValue, scValue, expAvg)
 }

-# Standard building block for ResNet with padding.
-ResNetNode2Conv(inp, outMap, inWCount, wCount, kW, kH, wScale, bValue, scValue, Wproj)
+ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, expAvg)
 {
-    W1 = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
-    b1 = Parameter(outMap, 1, init = fixedValue, value = bValue)
-    sc1 = Parameter(outMap, 1, init = fixedValue, value = scValue)
-    m1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
-    isd1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
+    c = ConvBNLayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue, expAvg)
+    y = RectifiedLinear(c)
+}
+
+# Standard ResNet building block with identity shortcut (option A); expAvg is forwarded to each layer's BatchNormalization expAvgFactor.
+ResNetNode2A(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue, expAvg)
+{
+    # First convolution layer (convolution + batch normalization + ReLU), stride 1.
+    c1 = ConvBNReLULayer(inp, outMap, inWCount, kW, kH, 1, 1, wScale, bValue, scValue, expAvg)
+    # Second convolution layer, stride 1, no ReLU (ReLU is applied after the shortcut sum).
+    c2 = ConvBNLayer(c1, outMap, inWCount, kW, kH, 1, 1, wScale, bValue, scValue, expAvg)
+    # Identity shortcut: add the block input to the second layer's output.
+    p = Plus(c2, inp)
+    y = RectifiedLinear(p)
+}
+
+# Standard building block for ResNet with padding (option A).
+ResNetNode2AInc(inp, outMap, inWCount, wCount, kW, kH, wScale, bValue, scValue, expAvg, Wproj)
+{
+    # First convolution layer.
+    c1 = ConvBNReLULayer(inp, outMap, inWCount, kW, kH, 2, 2, wScale, bValue, scValue, expAvg)
+    # Second convolution layer, no ReLU.
+    c2 = ConvBNLayer(c1, outMap, wCount, kW, kH, 1, 1, wScale, bValue, scValue, expAvg)
    
-    c1 = Convolution(W1, inp, kW, kH, outMap, 2, 2, zeroPadding = true, imageLayout = "cudnn")
-    bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true, expAvgFactor = 1.0, imageLayout = "cudnn")
-    y1 = RectifiedLinear(bn1);
+    # Projection convolution layer.
+    c_proj = ConvBNLayerW(Wproj, inp, outMap, 1, 1, 2, 2, bValue, scValue, expAvg)
    
-    W2 = Parameter(outMap, wCount, init = Gaussian, initValueScale = wScale)
-    b2 = Parameter(outMap, 1, init = fixedValue, value = bValue)
-    sc2 = Parameter(outMap, 1, init = fixedValue, value = scValue)
-    m2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
-    isd2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
-    
-    c2 = Convolution(W2, y1, kW, kH, outMap, 1, 1, zeroPadding = true, imageLayout = "cudnn")
-    bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true, expAvgFactor = 1.0, imageLayout = "cudnn")
-    
-    cproj = Convolution(Wproj, inp, 1, 1, outMap, 2, 2, zeroPadding = false, imageLayout = "cudnn")
-    p = Plus(bn2, cproj)
-    y2 = RectifiedLinear(p);
+    p = Plus(c2, c_proj)
+    y2 = RectifiedLinear(p)
 }

 # Bottleneck building block for ResNet.
-ResNetNode3(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue)
+ResNetNode3A(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue, expAvg)
 {
    # 1x1 reducing convolution.
-    W1 = Parameter(convMap, inMap, init = Gaussian, initValueScale = wScale)
-    b1 = Parameter(convMap, 1, init = fixedValue, value = bValue)
-    sc1 = Parameter(convMap, 1, init = fixedValue, value = scValue)
-    m1 = Parameter(convMap, 1, init = fixedValue, value = 0, needGradient = false)
-    isd1 = Parameter(convMap, 1, init = fixedValue, value = 0, needGradient = false)
-    
-    c1 = Convolution(W1, inp, 1, 1, convMap, 1, 1, zeroPadding = false, imageLayout = "cudnn")
-    bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true, imageLayout = "cudnn")
-    y1 = RectifiedLinear(bn1);
-
+    c1 = ConvBNReLULayer(inp, convMap, inMap, 1, 1, 1, 1, wScale, bValue, scValue, expAvg)
    # 3x3 convolution.
-    W2 = Parameter(convMap, convWCount, init = Gaussian, initValueScale = wScale)
-    b2 = Parameter(convMap, 1, init = fixedValue, value = bValue)
-    sc2 = Parameter(convMap, 1, init = fixedValue, value = scValue)
-    m2 = Parameter(convMap, 1, init = fixedValue, value = 0, needGradient = false)
-    isd2 = Parameter(convMap, 1, init = fixedValue, value = 0, needGradient = false)
+    c2 = ConvBNReLULayer(c1, convMap, convWCount, 3, 3, 1, 1, wScale, bValue, scValue, expAvg)
+    # 1x1 expanding convolution, no ReLU.
+    c3 = ConvBNLayer(c2, outMap, convMap, 1, 1, 1, 1, wScale, bValue, scValue, expAvg)
    
-    c2 = Convolution(W2, y1, 3, 3, convMap, 1, 1, zeroPadding = true, imageLayout = "cudnn")
-    bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true, expAvgFactor = 1.0, imageLayout = "cudnn")
-    y2 = RectifiedLinear(bn2);
-    
-    # 1x1 expanding convolution.
-    W3 = Parameter(outMap, convMap, init = Gaussian, initValueScale = wScale)
-    b3 = Parameter(outMap, 1, init = fixedValue, value = bValue)
-    sc3 = Parameter(outMap, 1, init = fixedValue, value = scValue)
-    m3 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
-    isd3 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
-    
-    c3 = Convolution(W3, y2, 1, 1, outMap, 1, 1, zeroPadding = false, imageLayout = "cudnn")
-    bn3 = BatchNormalization(c3, sc3, b3, m3, isd3, eval = false, spatial = true, imageLayout = "cudnn")
-    
-    p = Plus(bn3, inp)
-    y3 = RectifiedLinear(p);
+    p = Plus(c3, inp)
+    y = RectifiedLinear(p)
 }

-ResNetNode3Inc(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue, wProj, projStride)
+ResNetNode3AInc(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue, expAvg, wProj, projStride)
 {
    # 1x1 reducing convolution.
-    W1 = Parameter(convMap, inMap, init = Gaussian, initValueScale = wScale)
-    b1 = Parameter(convMap, 1, init = fixedValue, value = bValue)
-    sc1 = Parameter(convMap, 1, init = fixedValue, value = scValue)
-    m1 = Parameter(convMap, 1, init = fixedValue, value = 0, needGradient = false)
-    isd1 = Parameter(convMap, 1, init = fixedValue, value = 0, needGradient = false)
-    
-    c1 = Convolution(W1, inp, 1, 1, convMap, 1, 1, zeroPadding = false, imageLayout = "cudnn")
-    bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true, imageLayout = "cudnn")
-    y1 = RectifiedLinear(bn1);
-
+    c1 = ConvBNReLULayer(inp, convMap, inMap, 1, 1, projStride, projStride, wScale, bValue, scValue, expAvg)
    # 3x3 convolution.
-    W2 = Parameter(convMap, convWCount, init = Gaussian, initValueScale = wScale)
-    b2 = Parameter(convMap, 1, init = fixedValue, value = bValue)
-    sc2 = Parameter(convMap, 1, init = fixedValue, value = scValue)
-    m2 = Parameter(convMap, 1, init = fixedValue, value = 0, needGradient = false)
-    isd2 = Parameter(convMap, 1, init = fixedValue, value = 0, needGradient = false)
+    c2 = ConvBNReLULayer(c1, convMap, convWCount, 3, 3, 1, 1, wScale, bValue, scValue, expAvg)
+    # 1x1 expanding convolution, no ReLU.
+    c3 = ConvBNLayer(c2, outMap, convMap, 1, 1, 1, 1, wScale, bValue, scValue, expAvg)
+    # Input-to-output mapping convolution using the caller-supplied weights wProj (ConvBNLayerW takes no wScale).
+    c_proj = ConvBNLayerW(wProj, inp, outMap, 1, 1, projStride, projStride, bValue, scValue, expAvg)
    
-    c2 = Convolution(W2, y1, 3, 3, convMap, projStride, projStride, zeroPadding = true, imageLayout = "cudnn")
-    bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true, expAvgFactor = 1.0, imageLayout = "cudnn")
-    y2 = RectifiedLinear(bn2);
+    p = Plus(c3, c_proj)
+    y = RectifiedLinear(p)
+}
+
+ResNetNode3BInc(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue, expAvg, projStride)
+{
+    # 1x1 reducing convolution.
+    c1 = ConvBNReLULayer(inp, convMap, inMap, 1, 1, projStride, projStride, wScale, bValue, scValue, expAvg)
+    # 3x3 convolution.
+    c2 = ConvBNReLULayer(c1, convMap, convWCount, 3, 3, 1, 1, wScale, bValue, scValue, expAvg)
+    # 1x1 expanding convolution, no ReLU.
+    c3 = ConvBNLayer(c2, outMap, convMap, 1, 1, 1, 1, wScale, bValue, scValue, expAvg)
+    # Input-to-output mapping convolution.
+    c_proj = ConvBNLayer(inp, outMap, inMap, 1, 1, projStride, projStride, wScale, bValue, scValue, expAvg)
    
-    # 1x1 expanding convolution.
-    W3 = Parameter(outMap, convMap, init = Gaussian, initValueScale = wScale)
-    b3 = Parameter(outMap, 1, init = fixedValue, value = bValue)
-    sc3 = Parameter(outMap, 1, init = fixedValue, value = scValue)
-    m3 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
-    isd3 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
-    
-    c3 = Convolution(W3, y2, 1, 1, outMap, 1, 1, zeroPadding = false, imageLayout = "cudnn")
-    bn3 = BatchNormalization(c3, sc3, b3, m3, isd3, eval = false, spatial = true, imageLayout = "cudnn")
-    
-    # Increasing input dimension convolution
-    cProj = Convolution(wProj, inp, 1, 1, outMap, projStride, projStride, zeroPadding = false, imageLayout = "cudnn")
-    
-    p = Plus(bn3, cProj)
-    y3 = RectifiedLinear(p);
+    p = Plus(c3, c_proj)
+    y = RectifiedLinear(p)
 }

 DnnLayer(hiddenDim, labelDim, x, wScale, bValue)
--- a/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_152.config
+++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_152.config
@ -10,12 +10,13 @@ ndlMacros=$ConfigDir$/Macros.ndl
 precision=float
 deviceId=Auto

-command=Train:AddTop5Eval:Test
+command=Train:CreateEval:Test

 parallelTrain=false

 stderr=$OutputDir$/ResNet_152
 traceLevel=1
+numMBsToShowResult=500

 Proj64to256Filename = $ConfigDir$/64to256.txt
 Proj256to512Filename = $ConfigDir$/256to512.txt
@ -32,10 +33,12 @@ Train=[
    
    SGD=[
        epochSize=0
-        minibatchSize=32
-        learningRatesPerMB=0.1*30:0.03*25:0.01*25:0.003*25:0.001
+        minibatchSize=256
+        # Note that learning rates are 10x higher than in the paper because CNTK's momentum update rule scales
+        # the gradient by (1 - momentum), i.e. 0.1 here: v{t + 1} = lr*(1 - momentum)*g{t + 1} + momentum*v{t}
+        learningRatesPerMB=1.0*35:0.1*35:0.01
        momentumPerMB=0.9
-        maxEpochs=120
+        maxEpochs=125
        gradUpdateType=None
        L2RegWeight=0.0001
        dropoutRate=0
@ -45,11 +48,9 @@ Train=[
            distributedMBReading=true
            parallelizationStartEpoch=1
            DataParallelSGD=[
-                gradientBits=1
+                gradientBits=32
            ]
        ]
-        
-        numMBsToShowResult=100
    ]
    
    reader=[
@ -88,16 +89,16 @@ Train=[
    ]    
 ]

-AddTop5Eval=[    
+CreateEval=[    
    action=edit
    CurModel=$ModelDir$/ResNet_152
-    NewModel=$ModelDir$/ResNet_152.Top5
-    editPath=$ConfigDir$/add_top5_layer.mel
+    NewModel=$ModelDir$/ResNet_152.Eval
+    editPath=$ConfigDir$/create_eval_model.mel
 ]

 Test=[
    action=test
-    modelPath=$ModelDir$/ResNet_152.Top5
+    modelPath=$ModelDir$/ResNet_152.Eval
    # Set minibatch size for testing.
    minibatchSize=32

--- a/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_152.ndl
+++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_152.ndl
@ -17,18 +17,16 @@ ndlMacros = [
    hs = 1
    vs = 1
    
-    # Pooling settings.
-    poolW = 2
-    poolH = 2
-    poolhs = 2
-    poolvs = 2
-    
    # Initial parameter values.
    convWScale = 7.07
    convBValue = 0
+    
+    fcWScale = 2.26
+    fcBValue = 0
+    
    scValue = 1
-    fcWScale = 3.0
-    fcBValue = 1
+    
+    expAvg = 1
 ]

 DNN=[
@ -39,7 +37,8 @@ DNN=[
    cMap5 = 1024
    cMap6 = 2048
    
-    conv1 = ConvBNReLULayer(features, cMap1, 147, 7, 7, 2, 2, convWScale, convBValue, scValue)
+    conv1WScale = 0.6
+    conv1 = ConvBNReLULayer(features, cMap1, 147, 7, 7, 2, 2, conv1WScale, convBValue, scValue, expAvg)
    # Max pooling
    pool1W = 2
    pool1H = 2
@ -47,63 +46,59 @@ DNN=[
    pool1vs = 2
    pool1 = MaxPooling(conv1, pool1W, pool1H, pool1hs, pool1vs, imageLayout = "cudnn")
    
-    rn1_1_Wproj = Parameter(cMap3, cMap1, init = fromFile, initFromFilePath = "$Proj64to256Filename$", needGradient = false)
-    rn1_1 = ResNetNode3Inc(pool1, cMap1, cMap1, cMap3, 576, convWScale, convBValue, scValue, rn1_1_Wproj, 1)
-    rn1_2 = ResNetNode3(rn1_1, cMap3, cMap1, cMap3, 576, convWScale, convBValue, scValue)
-    rn1_3 = ResNetNode3(rn1_2, cMap3, cMap1, cMap3, 576, convWScale, convBValue, scValue)
+    rn1_1 = ResNetNode3BInc(pool1, cMap1, cMap1, cMap3, 576, convWScale, convBValue, scValue, expAvg, 1)
+    rn1_2 = ResNetNode3A(rn1_1, cMap3, cMap1, cMap3, 576, convWScale, convBValue, scValue, expAvg)
+    rn1_3 = ResNetNode3A(rn1_2, cMap3, cMap1, cMap3, 576, convWScale, convBValue, scValue, expAvg)

-    rn2_1_Wproj = Parameter(cMap4, cMap3, init = fromFile, initFromFilePath = "$Proj256to512Filename$", needGradient = false)
-    rn2_1 = ResNetNode3Inc(rn1_3, cMap3, cMap2, cMap4, 1152, convWScale, convBValue, scValue, rn2_1_Wproj, 2)
-    rn2_2 = ResNetNode3(rn2_1, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue)
-    rn2_3 = ResNetNode3(rn2_2, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue)
-    rn2_4 = ResNetNode3(rn2_3, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue)
-    rn2_5 = ResNetNode3(rn2_4, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue)
-    rn2_6 = ResNetNode3(rn2_5, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue)
-    rn2_7 = ResNetNode3(rn2_6, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue)
-    rn2_8 = ResNetNode3(rn2_7, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue)
+    rn2_1 = ResNetNode3BInc(rn1_3, cMap3, cMap2, cMap4, 1152, convWScale, convBValue, scValue, expAvg, 2)
+    rn2_2 = ResNetNode3A(rn2_1, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue, expAvg)
+    rn2_3 = ResNetNode3A(rn2_2, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue, expAvg)
+    rn2_4 = ResNetNode3A(rn2_3, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue, expAvg)
+    rn2_5 = ResNetNode3A(rn2_4, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue, expAvg)
+    rn2_6 = ResNetNode3A(rn2_5, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue, expAvg)
+    rn2_7 = ResNetNode3A(rn2_6, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue, expAvg)
+    rn2_8 = ResNetNode3A(rn2_7, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue, expAvg)
    
-    rn3_1_Wproj = Parameter(cMap5, cMap4, init = fromFile, initFromFilePath = "$Proj512to1024Filename$", needGradient = false)
-    rn3_1 = ResNetNode3Inc(rn2_8,  cMap4, cMap3, cMap5, 2304, convWScale, convBValue, scValue, rn3_1_Wproj, 2)
-    rn3_2 = ResNetNode3(rn3_1,  cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_3 = ResNetNode3(rn3_2,  cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_4 = ResNetNode3(rn3_3,  cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_5 = ResNetNode3(rn3_4,  cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_6 = ResNetNode3(rn3_5,  cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_7 = ResNetNode3(rn3_6,  cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_8 = ResNetNode3(rn3_7,  cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_9 = ResNetNode3(rn3_8,  cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_10= ResNetNode3(rn3_9,  cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_11= ResNetNode3(rn3_10, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_12= ResNetNode3(rn3_11, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_13= ResNetNode3(rn3_12, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_14= ResNetNode3(rn3_13, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_15= ResNetNode3(rn3_14, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_16= ResNetNode3(rn3_15, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_17= ResNetNode3(rn3_16, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_18= ResNetNode3(rn3_17, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_19= ResNetNode3(rn3_18, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_20= ResNetNode3(rn3_19, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_21= ResNetNode3(rn3_20, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_22= ResNetNode3(rn3_21, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_23= ResNetNode3(rn3_22, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_24= ResNetNode3(rn3_23, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_25= ResNetNode3(rn3_24, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_26= ResNetNode3(rn3_25, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_27= ResNetNode3(rn3_26, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_28= ResNetNode3(rn3_27, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_29= ResNetNode3(rn3_28, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_30= ResNetNode3(rn3_29, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_31= ResNetNode3(rn3_30, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_32= ResNetNode3(rn3_31, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_33= ResNetNode3(rn3_32, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_34= ResNetNode3(rn3_33, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_35= ResNetNode3(rn3_34, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_36= ResNetNode3(rn3_35, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
+    rn3_1 = ResNetNode3BInc(rn2_8,  cMap4, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg, 2)
+    rn3_2 = ResNetNode3A(rn3_1,  cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_3 = ResNetNode3A(rn3_2,  cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_4 = ResNetNode3A(rn3_3,  cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_5 = ResNetNode3A(rn3_4,  cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_6 = ResNetNode3A(rn3_5,  cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_7 = ResNetNode3A(rn3_6,  cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_8 = ResNetNode3A(rn3_7,  cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_9 = ResNetNode3A(rn3_8,  cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_10= ResNetNode3A(rn3_9,  cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_11= ResNetNode3A(rn3_10, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_12= ResNetNode3A(rn3_11, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_13= ResNetNode3A(rn3_12, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_14= ResNetNode3A(rn3_13, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_15= ResNetNode3A(rn3_14, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_16= ResNetNode3A(rn3_15, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_17= ResNetNode3A(rn3_16, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_18= ResNetNode3A(rn3_17, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_19= ResNetNode3A(rn3_18, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_20= ResNetNode3A(rn3_19, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_21= ResNetNode3A(rn3_20, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_22= ResNetNode3A(rn3_21, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_23= ResNetNode3A(rn3_22, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_24= ResNetNode3A(rn3_23, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_25= ResNetNode3A(rn3_24, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_26= ResNetNode3A(rn3_25, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_27= ResNetNode3A(rn3_26, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_28= ResNetNode3A(rn3_27, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_29= ResNetNode3A(rn3_28, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_30= ResNetNode3A(rn3_29, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_31= ResNetNode3A(rn3_30, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_32= ResNetNode3A(rn3_31, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_33= ResNetNode3A(rn3_32, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_34= ResNetNode3A(rn3_33, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_35= ResNetNode3A(rn3_34, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_36= ResNetNode3A(rn3_35, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)

-    rn4_1_Wproj = Parameter(cMap6, cMap5, init = fromFile, initFromFilePath = "$Proj1024to2048Filename$", needGradient = false)
-    rn4_1 = ResNetNode3Inc(rn3_36, cMap5, cMap4, cMap6, 4608, convWScale, convBValue, scValue, rn4_1_Wproj, 2)
-    rn4_2 = ResNetNode3(rn4_1, cMap6, cMap4, cMap6, 4608, convWScale, convBValue, scValue)
-    rn4_3 = ResNetNode3(rn4_2, cMap6, cMap4, cMap6, 4608, convWScale, convBValue, scValue)
+    rn4_1 = ResNetNode3BInc(rn3_36, cMap5, cMap4, cMap6, 4608, convWScale, convBValue, scValue, expAvg, 2)
+    rn4_2 = ResNetNode3A(rn4_1, cMap6, cMap4, cMap6, 4608, convWScale, convBValue, scValue, expAvg)
+    rn4_3 = ResNetNode3A(rn4_2, cMap6, cMap4, cMap6, 4608, convWScale, convBValue, scValue, expAvg)

    # Global average pooling
    pool2W = 7
--- a/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_34.config
+++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_34.config
@ -10,12 +10,13 @@ ndlMacros=$ConfigDir$/Macros.ndl
 precision=float
 deviceId=Auto

-command=Train:AddTop5Eval:Test
+command=Train:CreateEval:Test

 parallelTrain=false

 stderr=$OutputDir$/ResNet_34
 traceLevel=1
+numMBsToShowResult=500

 Proj64to128Filename = $ConfigDir$/64to128.txt
 Proj128to256Filename = $ConfigDir$/128to256.txt
@ -31,10 +32,12 @@ Train=[
    
    SGD=[
        epochSize=0
-        minibatchSize=64
-        learningRatesPerMB=0.1*30:0.03*25:0.01*25:0.003*25:0.001
+        minibatchSize=256
+        # Note that learning rates are 10x higher than in the paper because CNTK's momentum update rule scales
+        # the gradient by (1 - momentum), i.e. 0.1 here: v{t + 1} = lr*(1 - momentum)*g{t + 1} + momentum*v{t}
+        learningRatesPerMB=1.0*35:0.1*35:0.01
        momentumPerMB=0.9
-        maxEpochs=120
+        maxEpochs=125
        gradUpdateType=None
        L2RegWeight=0.0001
        dropoutRate=0
@ -44,11 +47,9 @@ Train=[
            distributedMBReading=true
            parallelizationStartEpoch=1
            DataParallelSGD=[
-                gradientBits=1
+                gradientBits=32
            ]
        ]
-        
-        numMBsToShowResult=100
    ]
    
    reader=[
@ -87,16 +88,16 @@ Train=[
    ]    
 ]

-AddTop5Eval=[    
+CreateEval=[    
    action=edit
    CurModel=$ModelDir$/ResNet_34
-    NewModel=$ModelDir$/ResNet_34.Top5
-    editPath=$ConfigDir$/add_top5_layer.mel
+    NewModel=$ModelDir$/ResNet_34.Eval
+    editPath=$ConfigDir$/create_eval_model.mel
 ]

 Test=[
    action=test
-    modelPath=$ModelDir$/ResNet_34.Top5
+    modelPath=$ModelDir$/ResNet_34.Eval
    # Set minibatch size for testing.
    minibatchSize=64

--- a/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_34.ndl
+++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_34.ndl
@ -20,14 +20,19 @@ ndlMacros = [
    # Initial parameter values.
    convWScale = 7.07
    convBValue = 0
+
+    fcWScale = 1.13
+    fcBValue = 0
+
    scValue = 1
-    fcWScale = 3.0
-    fcBValue = 1
+    
+    expAvg = 1
 ]

 DNN=[
+    conv1WScale = 0.6
    cMap1 = 64
-    conv1 = ConvBNReLULayer(features, cMap1, 147, 7, 7, 2, 2, convWScale, convBValue, scValue)
+    conv1 = ConvBNReLULayer(features, cMap1, 147, 7, 7, 2, 2, conv1WScale, convBValue, scValue, expAvg)
    # Max pooling
    pool1W = 2
    pool1H = 2
@ -35,31 +40,31 @@ DNN=[
    pool1vs = 2
    pool1 = MaxPooling(conv1, pool1W, pool1H, pool1hs, pool1vs, imageLayout = "cudnn")
    
-    rn1_1 = ResNetNode2(pool1, cMap1, 576, kW, kH, convWScale, convBValue, scValue)
-    rn1_2 = ResNetNode2(rn1_1, cMap1, 576, kW, kH, convWScale, convBValue, scValue)
-    rn1_3 = ResNetNode2(rn1_2, cMap1, 576, kW, kH, convWScale, convBValue, scValue)
+    rn1_1 = ResNetNode2A(pool1, cMap1, 576, kW, kH, convWScale, convBValue, scValue, expAvg)
+    rn1_2 = ResNetNode2A(rn1_1, cMap1, 576, kW, kH, convWScale, convBValue, scValue, expAvg)
+    rn1_3 = ResNetNode2A(rn1_2, cMap1, 576, kW, kH, convWScale, convBValue, scValue, expAvg)

    cMap2 = 128
    rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj64to128Filename$", needGradient = false)
-    rn2_1 = ResNetNode2Conv(rn1_3, cMap2, 576, 1152, kW, kH, convWScale, convBValue, scValue, rn2_1_Wproj)
-    rn2_2 = ResNetNode2(rn2_1, cMap2, 1152, kW, kH, convWScale, convBValue, scValue)
-    rn2_3 = ResNetNode2(rn2_2, cMap2, 1152, kW, kH, convWScale, convBValue, scValue)
-    rn2_4 = ResNetNode2(rn2_3, cMap2, 1152, kW, kH, convWScale, convBValue, scValue)
+    rn2_1 = ResNetNode2AInc(rn1_3, cMap2, 576, 1152, kW, kH, convWScale, convBValue, scValue, expAvg, rn2_1_Wproj)
+    rn2_2 = ResNetNode2A(rn2_1, cMap2, 1152, kW, kH, convWScale, convBValue, scValue, expAvg)
+    rn2_3 = ResNetNode2A(rn2_2, cMap2, 1152, kW, kH, convWScale, convBValue, scValue, expAvg)
+    rn2_4 = ResNetNode2A(rn2_3, cMap2, 1152, kW, kH, convWScale, convBValue, scValue, expAvg)
    
    cMap3 = 256
    rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj128to256Filename$", needGradient = false)
-    rn3_1 = ResNetNode2Conv(rn2_4, cMap3, 1152, 2304, kW, kH, convWScale, convBValue, scValue, rn3_1_Wproj)
-    rn3_2 = ResNetNode2(rn3_1, cMap3, 2304, kW, kH, convWScale, convBValue, scValue)
-    rn3_3 = ResNetNode2(rn3_2, cMap3, 2304, kW, kH, convWScale, convBValue, scValue)
-    rn3_4 = ResNetNode2(rn3_3, cMap3, 2304, kW, kH, convWScale, convBValue, scValue)
-    rn3_5 = ResNetNode2(rn3_4, cMap3, 2304, kW, kH, convWScale, convBValue, scValue)
-    rn3_6 = ResNetNode2(rn3_5, cMap3, 2304, kW, kH, convWScale, convBValue, scValue)
+    rn3_1 = ResNetNode2AInc(rn2_4, cMap3, 1152, 2304, kW, kH, convWScale, convBValue, scValue, expAvg, rn3_1_Wproj)
+    rn3_2 = ResNetNode2A(rn3_1, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, expAvg)
+    rn3_3 = ResNetNode2A(rn3_2, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, expAvg)
+    rn3_4 = ResNetNode2A(rn3_3, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, expAvg)
+    rn3_5 = ResNetNode2A(rn3_4, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, expAvg)
+    rn3_6 = ResNetNode2A(rn3_5, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, expAvg)

    cMap4 = 512
    rn4_1_Wproj = Parameter(cMap4, cMap3, init = fromFile, initFromFilePath = "$Proj256to512Filename$", needGradient = false)
-    rn4_1 = ResNetNode2Conv(rn3_6, cMap4, 2304, 4608, kW, kH, convWScale, convBValue, scValue, rn4_1_Wproj)
-    rn4_2 = ResNetNode2(rn4_1, cMap4, 4608, kW, kH, convWScale, convBValue, scValue)
-    rn4_3 = ResNetNode2(rn4_2, cMap4, 4608, kW, kH, convWScale, convBValue, scValue)
+    rn4_1 = ResNetNode2AInc(rn3_6, cMap4, 2304, 4608, kW, kH, convWScale, convBValue, scValue, expAvg, rn4_1_Wproj)
+    rn4_2 = ResNetNode2A(rn4_1, cMap4, 4608, kW, kH, convWScale, convBValue, scValue, expAvg)
+    rn4_3 = ResNetNode2A(rn4_2, cMap4, 4608, kW, kH, convWScale, convBValue, scValue, expAvg)

    # Global average pooling
    pool2W = 7
--- a/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_50.config
+++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_50.config
@ -10,12 +10,13 @@ ndlMacros=$ConfigDir$/Macros.ndl
 precision=float
 deviceId=Auto

-command=Train:AddTop5Eval:Test
+command=Train:CreateEval:Test

 parallelTrain=false

 stderr=$OutputDir$/ResNet_50
 traceLevel=1
+numMBsToShowResult=500

 Proj64to256Filename = $ConfigDir$/64to256.txt
 Proj256to512Filename = $ConfigDir$/256to512.txt
@ -32,10 +33,12 @@ Train=[
    
    SGD=[
        epochSize=0
-        minibatchSize=32
-        learningRatesPerMB=0.1*30:0.03*30:0.01*25:0.003*25:0.001
+        minibatchSize=256
+        # Note that learning rates are 10x higher than in the paper because CNTK's momentum update rule scales the
+        # gradient by (1 - momentum), i.e. by 0.1 for momentum = 0.9: v{t + 1} = lr*(1 - momentum)*g{t + 1} + momentum*v{t}
+        learningRatesPerMB=1.0*35:0.1*35:0.01
        momentumPerMB=0.9
-        maxEpochs=120
+        maxEpochs=125
        gradUpdateType=None
        L2RegWeight=0.0001
        dropoutRate=0
@ -45,11 +48,9 @@ Train=[
            distributedMBReading=true
            parallelizationStartEpoch=1
            DataParallelSGD=[
-                gradientBits=1
+                gradientBits=32
            ]
        ]
-        
-        numMBsToShowResult=100
    ]
    
    reader=[
@ -88,16 +89,16 @@ Train=[
    ]    
 ]

-AddTop5Eval=[    
+CreateEval=[    
    action=edit
    CurModel=$ModelDir$/ResNet_50
-    NewModel=$ModelDir$/ResNet_50.Top5
-    editPath=$ConfigDir$/add_top5_layer.mel
+    NewModel=$ModelDir$/ResNet_50.Eval
+    editPath=$ConfigDir$/create_eval_model.mel
 ]

 Test=[
    action=test
-    modelPath=$ModelDir$/ResNet_50.Top5
+    modelPath=$ModelDir$/ResNet_50.Eval
    # Set minibatch size for testing.
    minibatchSize=32

--- a/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_50.ndl
+++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_50.ndl
@ -20,9 +20,13 @@ ndlMacros = [
    # Initial parameter values.
    convWScale = 7.07
    convBValue = 0
+
+    fcWScale = 2.26
+    fcBValue = 0
+
    scValue = 1
-    fcWScale = 3.0
-    fcBValue = 1
+    
+    expAvg = 1
 ]

 DNN=[
@ -33,7 +37,8 @@ DNN=[
    cMap5 = 1024
    cMap6 = 2048
    
-    conv1 = ConvBNReLULayer(features, cMap1, 147, 7, 7, 2, 2, convWScale, convBValue, scValue)
+    conv1WScale = 0.6
+    conv1 = ConvBNReLULayer(features, cMap1, 147, 7, 7, 2, 2, conv1WScale, convBValue, scValue, expAvg)
    # Max pooling
    pool1W = 2
    pool1H = 2
@ -41,29 +46,25 @@ DNN=[
    pool1vs = 2
    pool1 = MaxPooling(conv1, pool1W, pool1H, pool1hs, pool1vs, imageLayout = "cudnn")
    
-    rn1_1_Wproj = Parameter(cMap3, cMap1, init = fromFile, initFromFilePath = "$Proj64to256Filename$", needGradient = false)
-    rn1_1 = ResNetNode3Inc(pool1, cMap1, cMap1, cMap3, 576, convWScale, convBValue, scValue, rn1_1_Wproj, 1)
-    rn1_2 = ResNetNode3(rn1_1, cMap3, cMap1, cMap3, 576, convWScale, convBValue, scValue)
-    rn1_3 = ResNetNode3(rn1_2, cMap3, cMap1, cMap3, 576, convWScale, convBValue, scValue)
+    rn1_1 = ResNetNode3BInc(pool1, cMap1, cMap1, cMap3, 576, convWScale, convBValue, scValue, expAvg, 1)
+    rn1_2 = ResNetNode3A(rn1_1, cMap3, cMap1, cMap3, 576, convWScale, convBValue, scValue, expAvg)
+    rn1_3 = ResNetNode3A(rn1_2, cMap3, cMap1, cMap3, 576, convWScale, convBValue, scValue, expAvg)

-    rn2_1_Wproj = Parameter(cMap4, cMap3, init = fromFile, initFromFilePath = "$Proj256to512Filename$", needGradient = false)
-    rn2_1 = ResNetNode3Inc(rn1_3, cMap3, cMap2, cMap4, 1152, convWScale, convBValue, scValue, rn2_1_Wproj, 2)
-    rn2_2 = ResNetNode3(rn2_1, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue)
-    rn2_3 = ResNetNode3(rn2_2, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue)
-    rn2_4 = ResNetNode3(rn2_3, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue)
+    rn2_1 = ResNetNode3BInc(rn1_3, cMap3, cMap2, cMap4, 1152, convWScale, convBValue, scValue, expAvg, 2)
+    rn2_2 = ResNetNode3A(rn2_1, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue, expAvg)
+    rn2_3 = ResNetNode3A(rn2_2, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue, expAvg)
+    rn2_4 = ResNetNode3A(rn2_3, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue, expAvg)
    
-    rn3_1_Wproj = Parameter(cMap5, cMap4, init = fromFile, initFromFilePath = "$Proj512to1024Filename$", needGradient = false)
-    rn3_1 = ResNetNode3Inc(rn2_4,  cMap4, cMap3, cMap5, 2304, convWScale, convBValue, scValue, rn3_1_Wproj, 2)
-    rn3_2 = ResNetNode3(rn3_1, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_3 = ResNetNode3(rn3_2, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_4 = ResNetNode3(rn3_3, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_5 = ResNetNode3(rn3_4, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
-    rn3_6 = ResNetNode3(rn3_5, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue)
+    rn3_1 = ResNetNode3BInc(rn2_4,  cMap4, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg, 2)
+    rn3_2 = ResNetNode3A(rn3_1, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_3 = ResNetNode3A(rn3_2, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_4 = ResNetNode3A(rn3_3, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_5 = ResNetNode3A(rn3_4, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)
+    rn3_6 = ResNetNode3A(rn3_5, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, expAvg)

-    rn4_1_Wproj = Parameter(cMap6, cMap5, init = fromFile, initFromFilePath = "$Proj1024to2048Filename$", needGradient = false)
-    rn4_1 = ResNetNode3Inc(rn3_6, cMap5, cMap4, cMap6, 4608, convWScale, convBValue, scValue, rn4_1_Wproj, 2)
-    rn4_2 = ResNetNode3(rn4_1, cMap6, cMap4, cMap6, 4608, convWScale, convBValue, scValue)
-    rn4_3 = ResNetNode3(rn4_2, cMap6, cMap4, cMap6, 4608, convWScale, convBValue, scValue)
+    rn4_1 = ResNetNode3BInc(rn3_6, cMap5, cMap4, cMap6, 4608, convWScale, convBValue, scValue, expAvg, 2)
+    rn4_2 = ResNetNode3A(rn4_1, cMap6, cMap4, cMap6, 4608, convWScale, convBValue, scValue, expAvg)
+    rn4_3 = ResNetNode3A(rn4_2, cMap6, cMap4, cMap6, 4608, convWScale, convBValue, scValue, expAvg)

    # Global average pooling
    pool2W = 7
--- a/Examples/Image/Miscellaneous/ImageNet/ResNet/add_top5_layer.mel
+++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/add_top5_layer.mel
@ -1,6 +0,0 @@
-m1=LoadModel($CurModel$, format=cntk)
-SetDefaultModel(m1)
-
-ErrTop5 = ErrorPrediction(labels, OutputNodes.z, Const(5), tag = Eval)
-
-SaveModel(m1, $NewModel$, format=cntk)
--- a/Examples/Image/Miscellaneous/ImageNet/ResNet/create_eval_model.mel
+++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/create_eval_model.mel
@ -0,0 +1,10 @@
+m1=LoadModel($CurModel$, format=cntk)
+SetDefaultModel(m1)
+
+# Switch batch normalization to eval mode for the subtree rooted at the CE node.
+SetPropertyForSubTree(CE, batchNormEvalMode, true)
+
+# Add top-5 error prediction node.
+ErrTop5 = ErrorPrediction(labels, OutputNodes.z, Const(5), tag = Eval)
+
+SaveModel(m1, $NewModel$, format=cntk)