diff --git a/Examples/Image/Miscellaneous/CIFAR-10/03_ProjWeightsGen.py b/Examples/Image/Miscellaneous/CIFAR-10/03_ProjWeightsGen.py
new file mode 100644
index 000000000..a208adcbc
--- /dev/null
+++ b/Examples/Image/Miscellaneous/CIFAR-10/03_ProjWeightsGen.py
@@ -0,0 +1,12 @@
+import sys
+import numpy as np
+
+def writeConvWeights(fname, cmapIn):
+    cmapOut = 2 * cmapIn
+    w = np.eye(cmapOut, cmapIn)
+    np.savetxt(fname, w, fmt = '%d', delimiter = ' ')
+
+if __name__ == "__main__":
+    cmapIn = int(sys.argv[1])
+    fname = sys.argv[2]
+    writeConvWeights(fname, cmapIn)
\ No newline at end of file
diff --git a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config
index 3465ce2db..7b8d259d4 100644
--- a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config
+++ b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.config
@@ -17,6 +17,9 @@ command=Train:AddBNEval:Test
 stderr=$OutputDir$/03_ResNet
 traceLevel=1
 
+Proj16to32Filename = $ConfigDir$/16to32.txt
+Proj32to64Filename = $ConfigDir$/32to64.txt
+
 Train=[
     action=train
     modelPath=$ModelDir$/03_ResNet
@@ -39,7 +42,7 @@ Train=[
         distributedMBReading=true
         parallelizationStartEpoch=1
         DataParallelSGD=[
-            gradientBits=32
+            gradientBits=1
         ]
     ]
diff --git a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl
index 8ff6af0ca..013b0cfb8 100644
--- a/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl
+++ b/Examples/Image/Miscellaneous/CIFAR-10/03_ResNet.ndl
@@ -32,20 +32,22 @@ DNN=[
     cMap1 = 16
     conv1 = ConvBNReLULayer2(featScaled, cMap1, 27, kW, kH, hStride1, vStride1, convWScale, convBValue, scValue)
 
-    rn1_1 = ResNetNode2(conv1, cMap1, 144, kW, kH, convWScale, convBValue, scValue)
-    rn1_2 = ResNetNode2(rn1_1, cMap1, 144, kW, kH, convWScale, convBValue, scValue)
-    rn1_3 = ResNetNode2(rn1_2, cMap1, 144, kW, kH, convWScale, convBValue, scValue)
+    rn1_1 = ResNetNode2(conv1, cMap1, 144, kW, kH, convWScale, convBValue, scValue)
+    rn1_2 = ResNetNode2(rn1_1, cMap1, 144, kW, kH, convWScale, convBValue, scValue)
+    rn1_3 = ResNetNode2(rn1_2, cMap1, 144, kW, kH, convWScale, convBValue, scValue)
 
     cMap2 = 32
-    rn2_1 = ResNetNode2Reduce(rn1_3, cMap2, 144, 288, 16384, 8192, kW, kH, convWScale, convBValue, scValue)
-    rn2_2 = ResNetNode2(rn2_1, cMap2, 288, kW, kH, convWScale, convBValue, scValue)
-    rn2_3 = ResNetNode2(rn2_2, cMap2, 288, kW, kH, convWScale, convBValue, scValue)
+    rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", needGradient = false)
+    rn2_1 = ResNetNode2Conv(rn1_3, cMap2, 144, 288, kW, kH, convWScale, convBValue, scValue, rn2_1_Wproj)
+    rn2_2 = ResNetNode2(rn2_1, cMap2, 288, kW, kH, convWScale, convBValue, scValue)
+    rn2_3 = ResNetNode2(rn2_2, cMap2, 288, kW, kH, convWScale, convBValue, scValue)
 
     cMap3 = 64
-    rn3_1 = ResNetNode2Reduce(rn2_3, cMap3, 288, 576, 8192, 4096, kW, kH, convWScale, convBValue, scValue)
-    rn3_2 = ResNetNode2(rn3_1, cMap3, 576, kW, kH, convWScale, convBValue, scValue)
-    rn3_3 = ResNetNode2(rn3_2, cMap3, 576, kW, kH, convWScale, convBValue, scValue)
-
+    rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", needGradient = false)
+    rn3_1 = ResNetNode2Conv(rn2_3, cMap3, 288, 576, kW, kH, convWScale, convBValue, scValue, rn3_1_Wproj)
+    rn3_2 = ResNetNode2(rn3_1, cMap3, 576, kW, kH, convWScale, convBValue, scValue)
+    rn3_3 = ResNetNode2(rn3_2, cMap3, 576, kW, kH, convWScale, convBValue, scValue)
+
     # pool
     poolW = 3
     poolH = 3
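Note: the `rn2_1_Wproj`/`rn3_1_Wproj` parameters above are read with `init = fromFile` from the files that `03_ProjWeightsGen.py` writes. A minimal sketch of how those files are produced and what they contain (function name here is illustrative; the checked-in script does the same thing):

```python
import numpy as np

# Mirrors 03_ProjWeightsGen.py: np.eye(2n, n) stacks an n x n identity on top
# of an n x n zero block, so the first n output maps copy the input maps and
# the extra n maps start at zero -- exactly the 16to32.txt / 32to64.txt files
# checked in below.
def write_conv_weights(fname, cmap_in):
    np.savetxt(fname, np.eye(2 * cmap_in, cmap_in), fmt='%d', delimiter=' ')

write_conv_weights('16to32.txt', 16)   # same as: python 03_ProjWeightsGen.py 16 16to32.txt
w = np.loadtxt('16to32.txt')
assert w.shape == (32, 16)
assert (w[:16] == np.eye(16)).all() and (w[16:] == 0).all()
```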
diff --git a/Examples/Image/Miscellaneous/CIFAR-10/16to32.txt b/Examples/Image/Miscellaneous/CIFAR-10/16to32.txt
new file mode 100644
index 000000000..7742979aa
--- /dev/null
+++ b/Examples/Image/Miscellaneous/CIFAR-10/16to32.txt
@@ -0,0 +1,32 @@
+1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
diff --git a/Examples/Image/Miscellaneous/CIFAR-10/32to64.txt b/Examples/Image/Miscellaneous/CIFAR-10/32to64.txt
new file mode 100644
index 000000000..0c78cca62
--- /dev/null
+++ b/Examples/Image/Miscellaneous/CIFAR-10/32to64.txt
@@ -0,0 +1,64 @@
+1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
diff --git a/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl b/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl
index c2760cb35..b1a5442eb 100644
--- a/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl
+++ b/Examples/Image/Miscellaneous/CIFAR-10/Macros.ndl
@@ -41,7 +41,7 @@ ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue)
     isd1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
 
     c1 = Convolution(W1, inp, kW, kH, outMap, 1, 1, zeroPadding = true)
-    bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true)
+    bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true, expAvgFactor = 0.9)
     y1 = RectifiedLinear(bn1);
 
     W2 = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
@@ -51,12 +51,12 @@ ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue)
     isd2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
 
     c2 = Convolution(W2, y1, kW, kH, outMap, 1, 1, zeroPadding = true)
-    bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true)
+    bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true, expAvgFactor = 0.9)
     p = Plus(bn2, inp)
     y2 = RectifiedLinear(p);
 }
 
-ResNetNode2Reduce(inp, outMap, inWCount, wCount, inDim, outDim, kW, kH, wScale, bValue, scValue)
+ResNetNode2Conv(inp, outMap, inWCount, wCount, kW, kH, wScale, bValue, scValue, Wproj)
 {
     W1 = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
     b1 = Parameter(outMap, 1, init = fixedValue, value = bValue)
@@ -65,7 +65,7 @@ ResNetNode2Reduce(inp, outMap, inWCount, wCount, inDim, outDim, kW, kH, wScale,
     isd1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
 
     c1 = Convolution(W1, inp, kW, kH, outMap, 2, 2, zeroPadding = true)
-    bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true)
+    bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true, expAvgFactor = 0.9)
     y1 = RectifiedLinear(bn1);
 
     W2 = Parameter(outMap, wCount, init = Gaussian, initValueScale = wScale)
@@ -75,10 +75,10 @@ ResNetNode2Reduce(inp, outMap, inWCount, wCount, inDim, outDim, kW, kH, wScale,
     isd2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
 
     c2 = Convolution(W2, y1, kW, kH, outMap, 1, 1, zeroPadding = true)
-    bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true)
-    WP = Parameter(outDim, inDim)
-    t = Times(WP, inp, init = Gaussian, initValueScale = wScale)
-    p = Plus(bn2, t)
+    bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true, expAvgFactor = 0.9)
+
+    cproj = Convolution(Wproj, inp, 1, 1, outMap, 2, 2, zeroPadding = false)
+    p = Plus(bn2, cproj)
     y2 = RectifiedLinear(p);
 }
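For reference, the effect of the new shortcut branch in `ResNetNode2Conv` — a 1x1 convolution with stride 2 using the fixed, non-trainable `Wproj` loaded above — can be sketched in NumPy (illustrative only; `shortcut_projection` is not part of the toolkit):

```python
import numpy as np

# A 1x1 convolution with stride 2 reduces to a per-pixel matrix multiply on
# the subsampled grid; with the zero-padded identity Wproj, the first 16 of
# the 32 output maps pass the input through and the rest are zero.
def shortcut_projection(x, w_proj):
    # x: (cin, H, W); w_proj: (cout, cin), e.g. the contents of 16to32.txt
    x_sub = x[:, ::2, ::2]                        # stride 2 in both directions
    return np.einsum('oi,ihw->ohw', w_proj, x_sub)

x = np.random.randn(16, 32, 32).astype(np.float32)
y = shortcut_projection(x, np.eye(32, 16, dtype=np.float32))
assert y.shape == (32, 16, 16)
assert np.allclose(y[:16], x[:, ::2, ::2]) and np.allclose(y[16:], 0)
```

This matches the bookkeeping of `Plus(bn2, cproj)`: `cproj` has the same halved resolution and doubled map count as `bn2`, so the two branches add elementwise.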
diff --git a/Examples/Image/Miscellaneous/ImageNet/VGG/Macros.ndl b/Examples/Image/Miscellaneous/ImageNet/VGG/Macros.ndl
index 85cee3bbf..287289b1c 100644
--- a/Examples/Image/Miscellaneous/ImageNet/VGG/Macros.ndl
+++ b/Examples/Image/Miscellaneous/ImageNet/VGG/Macros.ndl
@@ -8,6 +8,20 @@ DnnReLULayer(inDim, outDim, x, wScale, bValue)
     y = RectifiedLinear(z)
 }
 
+# Fully-connected layer with batch normalization and ReLU activation.
+DnnBNReLULayer(inDim, outDim, x, wScale, bValue)
+{
+    W = Parameter(outDim, inDim, init = Gaussian, initValueScale = wScale)
+    b = Parameter(inDim, 1, init = fixedValue, value = bValue)
+    sc = Parameter(inDim, 1, init = Gaussian, initValueScale = 0.01)
+    m = Parameter(inDim, 1, init = fixedValue, value = 0, needGradient = false)
+    isd = Parameter(inDim, 1, init = fixedValue, value = 0, needGradient = false)
+
+    bn = BatchNormalization(x, sc, b, m, isd, eval = false, spatial = false)
+    t = Times(W, bn)
+    y = RectifiedLinear(t)
+}
+
 # Fully-connected layer.
 DnnLayer(inDim, outDim, x, wScale, bValue)
 {
@@ -27,3 +41,16 @@ ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue)
     y = RectifiedLinear(z);
 }
 
+# Convolutional layer with batch normalization and ReLU activation.
+ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue)
+{
+    W = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
+    b = Parameter(outMap, 1, init = fixedValue, value = bValue)
+    sc = Parameter(outMap, 1, init = Gaussian, initValueScale = scValue)
+    m = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
+    isd = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
+
+    c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true)
+    bn = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true)
+    y = RectifiedLinear(bn);
+}
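A rough NumPy reading of the new `DnnBNReLULayer` macro, as a sketch only: it assumes `BatchNormalization` with `spatial = false` normalizes each of the `inDim` elements across the minibatch, and the `eps` value is an assumption, not taken from the NDL above. Note the ordering — BN is applied to the layer *input*, then the affine map, then ReLU:

```python
import numpy as np

# sc and b play the roles of the NDL scale/bias Parameters (both inDim x 1).
def dnn_bn_relu_layer(x, W, sc, b, eps=1e-5):   # eps is an assumption
    # x: (inDim, batch); W: (outDim, inDim)
    bn = sc * (x - x.mean(1, keepdims=True)) / np.sqrt(x.var(1, keepdims=True) + eps) + b
    return np.maximum(W @ bn, 0)

x = np.random.randn(512, 8)
W = np.random.randn(256, 512) * 0.01
y = dnn_bn_relu_layer(x, W, np.ones((512, 1)), np.zeros((512, 1)))
assert y.shape == (256, 8)
```

`ConvBNReLULayer` instead normalizes the convolution *output* with `spatial = true`, i.e. one mean/variance per output feature map rather than per element.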
diff --git a/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E.config b/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E.config
index c2fe8edf9..d2132eb70 100644
--- a/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E.config
+++ b/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E.config
@@ -67,7 +67,7 @@ Train=[
         # Horizontal random flip, will be enabled by default if cropType=Random
         #hflip=0
         # Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
-        cropRatio=0.9
+        cropRatio=0.875
         # Crop scale ratio jitter type.
         # Possible values: None, UniRatio, UniLength, UniArea. Default: UniRatio
         jitterType=UniRatio
diff --git a/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E.ndl b/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E.ndl
index 6ed547881..eb7fe867c 100644
--- a/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E.ndl
+++ b/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E.ndl
@@ -1,7 +1,7 @@
-load=ndlMnistMacros
+load=ndlMacros
 run=DNN
 
-ndlMnistMacros = [
+ndlMacros = [
     ImageW = 224
     ImageH = 224
     ImageC = 3
diff --git a/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E_BN.config b/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E_BN.config
new file mode 100644
index 000000000..b4d0fc3ec
--- /dev/null
+++ b/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E_BN.config
@@ -0,0 +1,118 @@
+RootDir = "."
+
+ConfigDir = "$RootDir$"
+DataDir = "$RootDir$"
+OutputDir = "$RootDir$/Output"
+ModelDir = "$OutputDir$/Models"
+
+ndlMacros=$ConfigDir$/Macros.ndl
+
+precision=float
+deviceId=Auto
+
+command=Train:AddTop5Eval:Test
+
+parallelTrain=false
+
+stderr=$OutputDir$/VGG_E_BN
+traceLevel=1
+
+Train=[
+    action=train
+    modelPath=$ModelDir$/VGG_E_BN
+
+    NDLNetworkBuilder=[
+        networkDescription=$ConfigDir$/VGG_E_BN.ndl
+    ]
+
+    SGD=[
+        epochSize=0
+        minibatchSize=16
+        learningRatesPerMB=0.01*20:0.003*12:0.001*28:0.0003
+        momentumPerMB=0.9
+        maxEpochs=70
+        gradUpdateType=None
+        L2RegWeight=0.0005
+        dropoutRate=0*5:0.5
+
+        ParallelTrain=[
+            parallelizationMethod=DataParallelSGD
+            distributedMBReading=true
+            parallelizationStartEpoch=1
+            DataParallelSGD=[
+                gradientBits=1
+            ]
+        ]
+
+        numMBsToShowResult=10
+    ]
+
+    reader=[
+        readerType=ImageReader
+        # Map file which maps images to labels using the following format:
+        # <full path to image><tab><numerical label (0-based class id)>
+        # Example:
+        # C:\Data\ImageNet\2012\train\n01440764\n01440764_10026.JPEG<tab>0
+        file=$DataDir$/train_map.txt
+        # Randomize images before every epoch. Possible values: None, Auto. Default: Auto.
+        randomize=Auto
+        features=[
+            # Below are the required parameters.
+            width=224
+            height=224
+            channels=3
+            # Below are the optional parameters.
+            # Possible values: Center, Random. Default: Center
+            cropType=Random
+            # Horizontal random flip, will be enabled by default if cropType=Random
+            #hflip=0
+            # Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
+            cropRatio=0.875
+            # Crop scale ratio jitter type.
+            # Possible values: None, UniRatio, UniLength, UniArea. Default: UniRatio
+            jitterType=UniRatio
+            # Interpolation to use when scaling image to width x height size.
+            # Possible values: nearest, linear, cubic, lanczos. Default: linear.
+            interpolations=Linear
+            # Stores mean values for each pixel in OpenCV matrix XML format.
+            meanFile=$ConfigDir$/ImageNet1K_mean.xml
+        ]
+        labels=[
+            labelDim=1000
+        ]
+    ]
+]
+
+AddTop5Eval=[
+    action=edit
+    CurModel=$ModelDir$/VGG_E_BN
+    NewModel=$ModelDir$/VGG_E_BN.Top5
+    editPath=$ConfigDir$/add_top5_layer.mel
+]
+
+Test=[
+    action=test
+    modelPath=$ModelDir$/VGG_E_BN.Top5
+    # Set minibatch size for testing.
+    minibatchSize=128
+
+    NDLNetworkBuilder=[
+        networkDescription=$ConfigDir$/VGG_E_BN.ndl
+    ]
+
+    reader=[
+        readerType=ImageReader
+        file=$DataDir$/val_map.txt
+        randomize=None
+        features=[
+            width=224
+            height=224
+            channels=3
+            cropType=Center
+            meanFile=$ConfigDir$/ImageNet1K_mean.xml
+        ]
+        labels=[
+            labelDim=1000
+        ]
+    ]
+]
diff --git a/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E_BN.ndl b/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E_BN.ndl
new file mode 100644
index 000000000..d07b2e407
--- /dev/null
+++ b/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E_BN.ndl
@@ -0,0 +1,87 @@
+load=ndlMacros
+run=DNN
+
+ndlMacros = [
+    ImageW = 224
+    ImageH = 224
+    ImageC = 3
+    LabelDim = 1000
+
+    features = ImageInput(ImageW, ImageH, ImageC, tag = feature)
+    featOffs = Const(0, rows = 150528)
+    featScaled = Plus(features, featOffs)
+    labels = Input(LabelDim, tag = label)
+
+    # Kernels width and height.
+    kW = 3
+    kH = 3
+    # Kernel stride.
+    hs = 1
+    vs = 1
+
+    # Pooling settings.
+    poolW = 2
+    poolH = 2
+    poolhs = 2
+    poolvs = 2
+
+    # Initial parameter values.
+    convWScale = 7.07
+    convBValue = 0
+    scValue = 0.03
+    fc1WScale = 3.0
+    fc1BValue = 1
+    fc2WScale = 3.0
+    fc2BValue = 1
+    fc3WScale = 1.0
+    fc3BValue = 1
+]
+
+DNN=[
+    cMap1 = 64
+    conv1 = ConvBNReLULayer(featScaled, cMap1, 27, kW, kH, hs, vs, convWScale, convBValue, scValue)
+    conv2 = ConvBNReLULayer(conv1, cMap1, 576, kW, kH, hs, vs, convWScale, convBValue, scValue)
+
+    pool1 = MaxPooling(conv2, poolW, poolH, poolhs, poolvs)
+
+    cMap3 = 128
+    conv3 = ConvBNReLULayer(pool1, cMap3, 576, kW, kH, hs, vs, convWScale, convBValue, scValue)
+    conv4 = ConvBNReLULayer(conv3, cMap3, 1152, kW, kH, hs, vs, convWScale, convBValue, scValue)
+
+    pool2 = MaxPooling(conv4, poolW, poolH, poolhs, poolvs)
+
+    cMap5 = 256
+    conv5 = ConvBNReLULayer(pool2, cMap5, 1152, kW, kH, hs, vs, convWScale, convBValue, scValue)
+    conv6 = ConvBNReLULayer(conv5, cMap5, 2304, kW, kH, hs, vs, convWScale, convBValue, scValue)
+    conv7 = ConvBNReLULayer(conv6, cMap5, 2304, kW, kH, hs, vs, convWScale, convBValue, scValue)
+    conv8 = ConvBNReLULayer(conv7, cMap5, 2304, kW, kH, hs, vs, convWScale, convBValue, scValue)
+
+    pool3 = MaxPooling(conv8, poolW, poolH, poolhs, poolvs)
+
+    cMap9 = 512
+    conv9 = ConvBNReLULayer(pool3, cMap9, 2304, kW, kH, hs, vs, convWScale, convBValue, scValue)
+    conv10 = ConvBNReLULayer(conv9, cMap9, 4608, kW, kH, hs, vs, convWScale, convBValue, scValue)
+    conv11 = ConvBNReLULayer(conv10, cMap9, 4608, kW, kH, hs, vs, convWScale, convBValue, scValue)
+    conv12 = ConvBNReLULayer(conv11, cMap9, 4608, kW, kH, hs, vs, convWScale, convBValue, scValue)
+
+    pool4 = MaxPooling(conv12, poolW, poolH, poolhs, poolvs)
+
+    cMap13 = 512
+    conv13 = ConvBNReLULayer(pool4, cMap13, 4608, kW, kH, hs, vs, convWScale, convBValue, scValue)
+    conv14 = ConvBNReLULayer(conv13, cMap13, 4608, kW, kH, hs, vs, convWScale, convBValue, scValue)
+    conv15 = ConvBNReLULayer(conv14, cMap13, 4608, kW, kH, hs, vs, convWScale, convBValue, scValue)
+    conv16 = ConvBNReLULayer(conv15, cMap13, 4608, kW, kH, hs, vs, convWScale, convBValue, scValue)
+
+    pool5 = MaxPooling(conv16, poolW, poolH, poolhs, poolvs)
+
+    hiddenDim = 4096
+    h1 = DnnBNReLULayer(25088, hiddenDim, pool5, fc1WScale, fc1BValue)
+    h1_d = Dropout(h1)
+    h2 = DnnBNReLULayer(hiddenDim, hiddenDim, h1_d, fc2WScale, fc2BValue)
+    h2_d = Dropout(h2)
+    ol = DnnLayer(hiddenDim, labelDim, h2_d, fc3WScale, fc3BValue)
+
+    CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
+    Err = ErrorPrediction(labels, ol, tag = Eval)
+    OutputNodes = ol
+]
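One detail worth spelling out in VGG_E_BN.ndl: each `inWCount` argument is `kW * kH * (input feature maps)`, and `25088` is `pool5` flattened. A quick arithmetic check (plain Python, for illustration):

```python
# inWCount = kW * kH * inputMaps for each ConvBNReLULayer:
kW = kH = 3
print(kW * kH * 3)     # 27    -> conv1 (RGB input)
print(kW * kH * 64)    # 576   -> conv2, conv3
print(kW * kH * 128)   # 1152  -> conv4, conv5
print(kW * kH * 256)   # 2304  -> conv6..conv9
print(kW * kH * 512)   # 4608  -> conv10..conv16
# Five 2x2 stride-2 poolings shrink 224 -> 7, so pool5 flattens to:
print(512 * 7 * 7)     # 25088 -> input of the first DnnBNReLULayer
```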
diff --git a/Source/ComputationNetworkLib/ConvolutionalNodes.h b/Source/ComputationNetworkLib/ConvolutionalNodes.h
index 1b6a3a820..4db7a71a8 100644
--- a/Source/ComputationNetworkLib/ConvolutionalNodes.h
+++ b/Source/ComputationNetworkLib/ConvolutionalNodes.h
@@ -171,9 +171,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         {
             Base::Validate(isFinalValidationPass);
 
-            if (m_horizontalSubsample > m_kernelWidth || m_verticalSubsample > m_kernelHeight)
-                InvalidArgument("In ConvolutionNode horizontalSubsample must <= kernelWidth and verticalSubsample must <= kernelHeight.");
-
             InferMBLayoutFromInputsForStandardCase();
             InferImageDimsFromInputs();
 
@@ -387,9 +384,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         {
             Base::Validate(isFinalValidationPass);
 
-            if (m_horizontalSubsample > m_windowWidth || m_verticalSubsample > m_windowHeight)
-                InvalidArgument("PoolingNodeBase: horizontalSubsample must <= windowWidth and verticalSubsample must <= windowHeight.");
-
             InferMBLayoutFromInputsForStandardCase();
             InferImageDimsFromInputs();
 
diff --git a/Source/Math/GPUMatrix.cu b/Source/Math/GPUMatrix.cu
index f035bd2d5..2023bd5f7 100644
--- a/Source/Math/GPUMatrix.cu
+++ b/Source/Math/GPUMatrix.cu
@@ -1960,7 +1960,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     PrepareDevice();
     cudaEvent_t done = nullptr;
     if (do_sync) CUDA_CALL(cudaEventCreate(&done));
+    // _elementWiseSigmoidOnCuda has an implementation that avoids possible overflow errors, but is slightly slower and may have an accuracy regression.
+    // We have a new implementation that is non-branching (yay!) that Frank will check in.
+#if 0
+    _elementWiseSigmoidOnCuda<<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(a.m_pArray, m_pArray, N);
+#else
    _assignSigmoidOf<<<blocksPerGrid,GridDim::maxThreadsPerBlock>>>(a.m_pArray,m_pArray,N);
+#endif
     if (do_sync) CUDA_CALL(cudaEventRecord(done));
     if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
     if (do_sync) CUDA_CALL(cudaEventDestroy(done));
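The overflow issue mentioned in the comment above is easiest to see outside CUDA. A NumPy sketch of the branching scheme `_elementWiseSigmoidOnCuda` uses (illustrative; the kernel computes the same thing one element per thread):

```python
import numpy as np

# exp() is only ever applied to a non-positive argument, so it cannot
# overflow; the naive form 1 / (1 + exp(-x)) overflows exp() in float32
# once -x exceeds roughly 88.
def stable_sigmoid(x):
    out = np.empty_like(x)
    pos = x >= 0
    out[pos] = 1.0 / (1.0 + np.exp(-x[pos]))
    e = np.exp(x[~pos])
    out[~pos] = e / (1.0 + e)
    return out

x = np.array([-1000.0, -1.0, 0.0, 1.0, 1000.0], dtype=np.float32)
print(stable_sigmoid(x))  # [0. 0.2689 0.5 0.7311 1.], no overflow warning
```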
@@ -2213,19 +2219,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     template<class ElemType>
     GPUMatrix<ElemType>& GPUMatrix<ElemType>::InplaceTruncateBottom (const ElemType threshold)
     {
-        if (IsEmpty())
-            LogicError("InplaceTruncateBottom: Matrix is empty.");
-
-        CUDA_LONG N=(CUDA_LONG)GetNumElements();
-        int blocksPerGrid =(int)ceil(N*1.0/GridDim::maxThreadsPerBlock);
-        PrepareDevice();
-        cudaEvent_t done = nullptr;
-        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _inplaceTruncateBottom<<<blocksPerGrid,GridDim::maxThreadsPerBlock>>>(m_pArray,threshold,N);
-        if (do_sync) CUDA_CALL(cudaEventRecord(done));
-        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
-        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
-        return *this;
+        return AssignTruncateBottomOf(*this, threshold);
     }
 
     template<class ElemType>
@@ -2255,18 +2249,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
     template<class ElemType>
     GPUMatrix<ElemType>& GPUMatrix<ElemType>::InplaceTruncateTop (const ElemType threshold)
     {
-        if (IsEmpty())
-            LogicError("InplaceTruncateTop: Matrix is empty.");
-        CUDA_LONG N=(CUDA_LONG)GetNumElements();
-        int blocksPerGrid =(int)ceil(N*1.0/GridDim::maxThreadsPerBlock);
-        PrepareDevice();
-        cudaEvent_t done = nullptr;
-        if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _inplaceTruncateTop<<<blocksPerGrid,GridDim::maxThreadsPerBlock>>>(m_pArray,threshold,N);
-        if (do_sync) CUDA_CALL(cudaEventRecord(done));
-        if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
-        if (do_sync) CUDA_CALL(cudaEventDestroy(done));
-        return *this;
+        return AssignTruncateTopOf(*this, threshold);
     }
 
     template<class ElemType>
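The two methods above become one-liners that reuse the element-wise assign kernels with source and destination aliased, which is safe because each thread reads and writes only its own element. Semantically both are clamps; in NumPy terms, for reference:

```python
import numpy as np

a = np.array([-2.0, -0.5, 0.0, 0.5, 2.0])
print(np.maximum(a, 0.5))  # InplaceTruncateBottom(0.5): raise values below 0.5
print(np.minimum(a, 0.5))  # InplaceTruncateTop(0.5):    lower values above 0.5
```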
diff --git a/Source/Math/GPUMatrixCUDAKernels.cuh b/Source/Math/GPUMatrixCUDAKernels.cuh
index fd01a972f..dc358a93d 100644
--- a/Source/Math/GPUMatrixCUDAKernels.cuh
+++ b/Source/Math/GPUMatrixCUDAKernels.cuh
@@ -90,6 +90,25 @@ static __inline__ __device__ double atomicAdd(double* address, double val) UNUSE
 // CUDA kernels follow, lots of them
 // ===========================================================================
 
+// _elementWise*() kernels
+//
+// Designed to operate on contiguous blocks of memory, where the output is a simple function of the inputs.
+// The first parameters of every function are inputs, and the last two arguments to each function are always
+// (ElemType *res, CUDA_LONG N), a pointer and length of the output block. Each thread computes a function
+// of the inputs for one value in the output.
+
+// This macro overloads _x() with float and double arguments, and inlines the correct library function. This simplifies templated kernel code.
+// TODO: merge with similar definition in TensorOps.h
+#define DEF_ELEMENT_PRIMITIVE(x) __device__ __forceinline__ float _##x(float f) { return x##f(f); } __device__ __forceinline__ double _##x(double f) { return x(f); }
+
+DEF_ELEMENT_PRIMITIVE(exp)
+DEF_ELEMENT_PRIMITIVE(log)
+DEF_ELEMENT_PRIMITIVE(tanh)
+DEF_ELEMENT_PRIMITIVE(sqrt)
+DEF_ELEMENT_PRIMITIVE(fabs)
+DEF_ELEMENT_PRIMITIVE(cos)
+DEF_ELEMENT_PRIMITIVE(sin)
+
 template<class ElemType>
 __global__ void _elementWisePowerOnCuda(
     const ElemType alpha,
@@ -134,48 +153,19 @@ __global__ void _elementWiseSigmoidOnCuda(
     ElemType *res,
     const CUDA_LONG N)
 {
-    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
-    if (sizeof(ElemType)==sizeof(double))
+    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT;
+    if (a[id] >= 0)
     {
-        if (a[id]>=0)
-        {
-            double e = exp(-1*a[id]);
-            res[id]=1/(1+e);
+        double e = _exp(-a[id]);
+        res[id] = 1 / (1 + e);
     }
     else
     {
-        double e = exp(a[id]);
-        res[id]=e/(1+e);
-    }
-    }
-    else
-    {
-        if (res[id]>=0)
-        {
-            float e = expf(-1*a[id]);
-            res[id]=1/(1+e);
-        }
-        else
-        {
-            float e = exp(a[id]); // BUGBUG: Looks like this should be expf().
-            res[id]=e/(1+e);
-        }
-    }
+        double e = _exp(a[id]);
+        res[id] = e / (1 + e);
+    }
 };
 
-__device__ __forceinline__ float _exp(float f)
-{
-    return expf(f);
-}
-
-__device__ __forceinline__ double _exp(double f)
-{
-    return exp(f);
-}
-
-//#define TENSOR_OPS_DECL __device__ __host__
-//#include "TensorOps.h"
-
 template<class ElemType>
 __global__ void _assignSigmoidOf(
     const ElemType* a,
@@ -224,16 +214,8 @@ __global__ void _elementWiseTanhOnCuda(
     ElemType *res,
     const CUDA_LONG N)
 {
-    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
-    if (sizeof(ElemType)==sizeof(double))
-    {
-        res[id]=tanh(a[id]);
-    }
-    else
-    {
-        res[id]=tanhf(a[id]);
-    }
-
+    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT;
+    res[id] = _tanh(a[id]);
 };
 
 //to prevent negative values caused by floating operations, we force inputs to be >=0
@@ -244,15 +226,8 @@ __global__ void _elementWiseSqrtOnCuda(
     ElemType *res,
     const CUDA_LONG N)
 {
-    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
-    if (sizeof(ElemType)==sizeof(double))
-    {
-        res[id]=sqrt(max((ElemType)0, a[id]));
-    }
-    else
-    {
-        res[id]=sqrtf(max(ElemType(0), a[id]));
-    }
+    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT;
+    res[id] = _sqrt(max((ElemType)0, a[id]));
 };
 
 template<class ElemType>
@@ -261,15 +236,8 @@ __global__ void _elementWiseExpOnCuda(
     ElemType *res,
     const CUDA_LONG N)
 {
-    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
-    if (sizeof(ElemType)==sizeof(double))
-    {
-        res[id]=exp(a[id]);
-    }
-    else
-    {
-        res[id]=expf(a[id]);
-    }
+    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT;
+    res[id] = _exp(a[id]);
 };
 
 template<class ElemType>
@@ -278,22 +246,8 @@ __global__ void _elementWiseLogOnCuda(
     ElemType *res,
     const CUDA_LONG N)
 {
-    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
-    if (a[id]<EPS_IN_LOG)
-    {
-        res[id]=LOG_OF_EPS_IN_LOG;
-    }
-    else
-    {
-        if (sizeof(ElemType)==sizeof(double))
-        {
-            res[id]=log(a[id]);
-        }
-        else
-        {
-            res[id]=logf(a[id]);
-        }
-    }
+    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT;
+    res[id] = a[id] < EPS_IN_LOG ? LOG_OF_EPS_IN_LOG : _log(a[id]);
 };
 
 template<class ElemType>
@@ -302,15 +256,8 @@ __global__ void _elementWiseAbsOnCuda(
     ElemType *res,
     const CUDA_LONG N)
 {
-    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
-    if (sizeof(ElemType)==sizeof(double))
-    {
-        res[id]=fabs(a[id]);
-    }
-    else
-    {
-        res[id]=fabsf(a[id]);
-    }
+    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT;
+    res[id] = _fabs(a[id]);
 };
 
 template<class ElemType>
@@ -319,15 +266,8 @@ __global__ void _elementWiseCosineOnCuda(
     ElemType *res,
     const CUDA_LONG N)
 {
-    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
-    if (sizeof(ElemType)==sizeof(double))
-    {
-        res[id]=cos(a[id]);
-    }
-    else
-    {
-        res[id]=cosf(a[id]);
-    }
+    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT;
+    res[id] = _cos(a[id]);
 };
 
 template<class ElemType>
@@ -336,18 +276,10 @@ __global__ void _elementWiseNegativeSineOnCuda(
     ElemType *res,
     const CUDA_LONG N)
 {
-    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT(id,N);
-    if (sizeof(ElemType)==sizeof(double))
-    {
-        res[id]=-sin(a[id]);
-    }
-    else
-    {
-        res[id]=-sinf(a[id]);
-    }
+    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT;
+    res[id] = -_sin(a[id]);
 };
 
-
 template<class ElemType>
 __global__ void _setValue(
     ElemType* a,
@@ -1141,6 +1073,7 @@ __global__ void _assignColumnwiseHardmaxOf(
     }
 }
 
+#if 0
 template<class ElemType>
 __global__ void _inplaceTruncateBottom(
     ElemType* a,
@@ -1153,6 +1086,7 @@ __global__ void _inplaceTruncateBottom(
     if (a[id]<threshold)
         a[id]=threshold;
 }
+#endif
 
 template<class ElemType>
 __global__ void _assignTruncateBottom(
@@ -1161,15 +1095,11 @@ __global__ void _assignTruncateBottom(
     const ElemType threshold,
     const CUDA_LONG N)
 {
-    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    if (a[id]<threshold)
-        us[id]=threshold;
-    else
-        us[id]=a[id];
+    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT;
+    us[id] = a[id] < threshold ? threshold : a[id];
 }
 
+#if 0
 template<class ElemType>
 __global__ void _inplaceTruncateTop(
     ElemType* a,
@@ -1182,6 +1112,7 @@ __global__ void _inplaceTruncateTop(
     if (a[id]>threshold)
         a[id]=threshold;
 }
+#endif
 
 template<class ElemType>
 __global__ void _assignTruncateTop(
@@ -1190,13 +1121,8 @@ __global__ void _assignTruncateTop(
     const ElemType threshold,
     const CUDA_LONG N)
 {
-    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
-    if (a[id]>threshold)
-        us[id]=threshold;
-    else
-        us[id]=a[id];
+    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT;
+    us[id] = a[id] > threshold ? threshold : a[id];
 }
 
 template<class ElemType>
@@ -3716,9 +3642,7 @@ __global__ void _inplaceTruncate(
     const ElemType threshold,
     const CUDA_LONG N)
 {
-    CUDA_LONG id = blockDim.x * blockIdx.x + threadIdx.x;
-    if (id>=N)
-        return;
+    CALCULATE_ELEMENTWISE_INDEX_OR_EXIT;
     ElemType locThresholdPos = abs(threshold);
     ElemType locTHresholdNeg = -locThresholdPos;
     if (a[id] > locThresholdPos)
diff --git a/Source/Math/GPUSparseMatrix.cu b/Source/Math/GPUSparseMatrix.cu
index d77f3f8fc..3831475c1 100644
--- a/Source/Math/GPUSparseMatrix.cu
+++ b/Source/Math/GPUSparseMatrix.cu
@@ -2526,7 +2526,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         int blocksPerGrid =(int)ceil(N*1.0/GridDim::maxThreadsPerBlock);
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _inplaceTruncateBottom<<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(NzValues(), threshold, N);
+        _assignTruncateBottom<<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(NzValues(), NzValues(), threshold, N);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));
@@ -2570,7 +2570,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
         int blocksPerGrid =(int)ceil(N*1.0/GridDim::maxThreadsPerBlock);
         cudaEvent_t done = nullptr;
         if (do_sync) CUDA_CALL(cudaEventCreate(&done));
-        _inplaceTruncateTop<<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(NzValues(), threshold, N);
+        _assignTruncateTop<<<blocksPerGrid, GridDim::maxThreadsPerBlock>>>(NzValues(), NzValues(), threshold, N);
         if (do_sync) CUDA_CALL(cudaEventRecord(done));
         if (do_sync) CUDA_CALL(cudaEventSynchronize(done));
         if (do_sync) CUDA_CALL(cudaEventDestroy(done));