Merge branch 'master' into qiwye/multiverso

Qiwei Ye 2016-01-08 21:03:14 +08:00
Parent 6c2ee1aa51 c1c818c85b
Commit 7628026b05
42 changed files with 708 additions and 474 deletions

View file

@ -19,6 +19,8 @@ ndlMacros = "$ConfigDir$/Macros.ndl"
# comment the following line to write logs to the console
stderr = "$OutputDir$/01_OneHidden_out"
traceLevel=1
numMBsToShowResult=500
#######################################
# TRAINING CONFIG #
@ -63,6 +65,7 @@ train = [
test = [
action = "test"
minibatchSize = 16
NDLNetworkBuilder=[
networkDescription = "$ConfigDir$/01_OneHidden.ndl"

View file

@ -19,6 +19,10 @@ ndlMacros = "$ConfigDir$/Macros.ndl"
# comment the following line to write logs to the console
stderr = "$OutputDir$/02_Convolution_out"
traceLevel=1
numMBsToShowResult=500
prefetch=true
#######################################
# TRAINING CONFIG #
@ -63,6 +67,7 @@ train = [
test = [
action = test
minibatchSize = 16
NDLNetworkBuilder = [
networkDescription = "$ConfigDir$/02_Convolution.ndl"

View file

@ -1,20 +1,28 @@
WorkDir=.
ModelDir=$WorkDir$/_out/$ConfigName$
stderr=$WorkDir$/_out/$ConfigName$
RootDir = "."
ndlMacros=$WorkDir$/Macros.ndl
ConfigDir = "$RootDir$"
DataDir = "$RootDir$"
OutputDir = "$RootDir$/Output"
ModelDir = "$OutputDir$/Models"
ndlMacros=$ConfigDir$/Macros.ndl
precision=float
deviceId=Auto
prefetch=true
command=Train:Test
stderr=$OutputDir$/01_Conv
traceLevel=1
numMBsToShowResult=500
Train=[
action=train
modelPath=$ModelDir$/01_Convolution
NDLNetworkBuilder=[
networkDescription=$WorkDir$/01_Convolution.ndl
networkDescription=$ConfigDir$/01_Convolution.ndl
]
SGD=[
@ -29,7 +37,7 @@ Train=[
reader=[
readerType=UCIFastReader
file=$WorkDir$/Train.txt
file=$DataDir$/Train.txt
randomize=None
features=[
dim=3072
@ -39,7 +47,7 @@ Train=[
dim=1
start=0
labelDim=10
labelMappingFile=$WorkDir$/labelsmap.txt
labelMappingFile=$DataDir$/labelsmap.txt
]
]
]
@ -48,15 +56,15 @@ Test=[
action=test
modelPath=$ModelDir$/01_Convolution
# Set minibatch size for testing.
minibatchSize=128
minibatchSize=16
NDLNetworkBuilder=[
networkDescription=$WorkDir$/01_Convolution.ndl
networkDescription=$ConfigDir$/01_Convolution.ndl
]
reader=[
readerType=UCIFastReader
file=$WorkDir$/Test.txt
file=$DataDir$/Test.txt
randomize=None
features=[
dim=3072
@ -66,7 +74,7 @@ Test=[
dim=1
start=0
labelDim=10
labelMappingFile=$WorkDir$/labelsmap.txt
labelMappingFile=$DataDir$/labelsmap.txt
]
]
]

View file

@ -7,8 +7,8 @@ ndlMnistMacros = [
ImageC = 3
LabelDim = 10
features = ImageInput(ImageW, ImageH, ImageC, tag = feature)
featOffs = Const(128, rows = 3072)
features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = "cudnn")
featOffs = Const(128)
featScaled = Minus(features, featOffs)
labels = Input(LabelDim, tag = label)
@ -39,7 +39,7 @@ DNN=[
pool1H = 3
pool1hStride = 2
pool1vStride = 2
pool1 = MaxPooling(conv1_act, pool1W, pool1H, pool1hStride, pool1vStride)
pool1 = MaxPooling(conv1_act, pool1W, pool1H, pool1hStride, pool1vStride, imageLayout = "cudnn")
# conv2
kW2 = 5
@ -55,7 +55,7 @@ DNN=[
pool2H = 3
pool2hStride = 2
pool2vStride = 2
pool2 = MaxPooling(conv2_act, pool2W, pool2H, pool2hStride, pool2vStride)
pool2 = MaxPooling(conv2_act, pool2W, pool2H, pool2hStride, pool2vStride, imageLayout = "cudnn")
# conv3
kW3 = 5
@ -71,7 +71,7 @@ DNN=[
pool3H = 3
pool3hStride = 2
pool3vStride = 2
pool3 = MaxPooling(conv3_act, pool3W, pool3H, pool3hStride, pool3vStride)
pool3 = MaxPooling(conv3_act, pool3W, pool3H, pool3hStride, pool3vStride, imageLayout = "cudnn")
hiddenDim = 64
h1 = DNNReLULayer(576, hiddenDim, pool3, fc1WScale, fc1BValue)

View file

@ -1,37 +1,43 @@
WorkDir=.
ModelDir=$WorkDir$/_out/$ConfigName$
stderr=$WorkDir$/_out/$ConfigName$
RootDir = "."
ndlMacros=$WorkDir$/Macros.ndl
ConfigDir = "$RootDir$"
DataDir = "$RootDir$"
OutputDir = "$RootDir$/Output"
ModelDir = "$OutputDir$/Models"
ndlMacros=$ConfigDir$/Macros.ndl
precision=float
deviceId=Auto
prefetch=true
parallelTrain=false
command=Train:AddBNEval:Test
stderr=$OutputDir$/02_BatchNormConv
traceLevel=1
numMBsToShowResult=500
Train=[
action=train
modelPath=$ModelDir$/02_BatchNormConv
NDLNetworkBuilder=[
networkDescription=$WorkDir$/02_BatchNormConv.ndl
networkDescription=$ConfigDir$/02_BatchNormConv.ndl
]
SGD=[
epochSize=49984
minibatchSize=64
learningRatesPerMB=0.03*7:0.01*8:0.003
#momentumPerMB=0.9*10:0.99
learningRatesPerMB=0.03*7:0.01
momentumPerMB=0
maxEpochs=10
#L2RegWeight=0.03
dropoutRate=0*1:0.5
L2RegWeight=0
dropoutRate=0
]
reader=[
readerType=UCIFastReader
file=$WorkDir$/Train.txt
file=$DataDir$/Train.txt
randomize=None
features=[
dim=3072
@ -41,7 +47,7 @@ Train=[
dim=1
start=0
labelDim=10
labelMappingFile=$WorkDir$/labelsmap.txt
labelMappingFile=$DataDir$/labelsmap.txt
]
]
]
@ -50,22 +56,22 @@ AddBNEval=[
action=edit
CurModel=$ModelDir$/02_BatchNormConv
NewModel=$ModelDir$/02_BatchNormConv.Eval
editPath=$WorkDir$/02_BatchNormConv.mel
editPath=$ConfigDir$/02_BatchNormConv.mel
]
Test=[
action=test
modelPath=$ModelDir$/02_BatchNormConv.Eval
# Set minibatch size for testing.
minibatchSize=128
minibatchSize=16
NDLNetworkBuilder=[
networkDescription=$WorkDir$/02_BatchNormConv.ndl
networkDescription=$ConfigDir$/02_BatchNormConv.ndl
]
reader=[
readerType=UCIFastReader
file=$WorkDir$/Test.txt
file=$DataDir$/Test.txt
randomize=None
features=[
dim=3072
@ -75,7 +81,7 @@ Test=[
dim=1
start=0
labelDim=10
labelMappingFile=$WorkDir$/labelsmap.txt
labelMappingFile=$DataDir$/labelsmap.txt
]
]
]

View file

@ -1,16 +1,16 @@
m=LoadModel($CurModel$, format=cntk)
SetDefaultModel(m)
ibn_e = BatchNormalization(featScaled, isc, ib, im, iisd, eval = true, spatial = true)
SetNodeInput(conv1.c, 1, ibn_e)
conv1.bn_e = BatchNormalization(conv1.c, conv1.sc, conv1.b, conv1.m, conv1.isd, eval = true, spatial = true, imageLayout = "cudnn")
SetNodeInput(conv1.y, 0, conv1.bn_e)
conv2.bn_e = BatchNormalization(pool1, conv2.sc, conv2.b, conv2.m, conv2.isd, eval = true, spatial = true)
SetNodeInput(conv2.c, 1, conv2.bn_e)
conv2.bn_e = BatchNormalization(conv2.c, conv2.sc, conv2.b, conv2.m, conv2.isd, eval = true, spatial = true, imageLayout = "cudnn")
SetNodeInput(conv2.y, 0, conv2.bn_e)
conv3.bn_e = BatchNormalization(pool2, conv3.sc, conv3.b, conv3.m, conv3.isd, eval = true, spatial = true)
SetNodeInput(conv3.c, 1, conv3.bn_e)
conv3.bn_e = BatchNormalization(conv3.c, conv3.sc, conv3.b, conv3.m, conv3.isd, eval = true, spatial = true, imageLayout = "cudnn")
SetNodeInput(conv3.y, 0, conv3.bn_e)
h1.bn_e = BatchNormalization(pool3, h1.sc, h1.b, h1.m, h1.isd, eval = true, spatial = false)
SetNodeInput(h1.t, 1, h1.bn_e)
h1.bn_e = BatchNormalization(h1.t, h1.sc, h1.b, h1.m, h1.isd, eval = true, spatial = false)
SetNodeInput(h1.y, 0, h1.bn_e)
SaveModel(m, $NewModel$, format=cntk)

View file

@ -7,8 +7,8 @@ ndlMnistMacros = [
ImageC = 3
LabelDim = 10
features = ImageInput(ImageW, ImageH, ImageC, tag = feature)
featOffs = Const(128, rows = 3072)
features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = "cudnn")
featOffs = Const(128)
featScaled = Minus(features, featOffs)
labels = Input(LabelDim, tag = label)
@ -18,6 +18,9 @@ ndlMnistMacros = [
conv2BValue = 0
conv3WScale = 1.414
conv3BValue = 0
scScale = 0.03
fc1WScale = 12
fc1BValue = 0
fc2WScale = 1.5
@ -25,12 +28,6 @@ ndlMnistMacros = [
]
DNN=[
ib = Parameter(ImageC, 1, init = Uniform, initValueScale = 100)
isc = Parameter(ImageC, 1, init = Uniform, initValueScale = 100)
im = Parameter(ImageC, 1, init = fixedValue, value = 0, needGradient = false)
iisd = Parameter(ImageC, 1, init = fixedValue, value = 0, needGradient = false)
ibn = BatchNormalization(featScaled, isc, ib, im, iisd, eval = false, spatial = true)
# conv1
kW1 = 5
kH1 = 5
@ -38,14 +35,14 @@ DNN=[
hStride1 = 1
vStride1 = 1
# weight[cMap1, kW1 * kH1 * ImageC]
conv1 = ConvReLULayer(ibn, cMap1, 75, kW1, kH1, hStride1, vStride1, conv1WScale, conv1BValue)
conv1 = ConvBNReLULayer(featScaled, cMap1, 75, kW1, kH1, hStride1, vStride1, conv1WScale, conv1BValue, scScale)
# pool1
pool1W = 3
pool1H = 3
pool1hStride = 2
pool1vStride = 2
pool1 = MaxPooling(conv1, pool1W, pool1H, pool1hStride, pool1vStride)
pool1 = MaxPooling(conv1, pool1W, pool1H, pool1hStride, pool1vStride, imageLayout = "cudnn")
# conv2
kW2 = 5
@ -54,14 +51,14 @@ DNN=[
hStride2 = 1
vStride2 = 1
# weight[cMap2, kW2 * kH2 * cMap1]
conv2 = ConvBNReLULayer(pool1, cMap1, cMap2, 800, kW2, kH2, hStride2, vStride2, conv2WScale, conv2BValue)
conv2 = ConvBNReLULayer(pool1, cMap2, 800, kW2, kH2, hStride2, vStride2, conv2WScale, conv2BValue, scScale)
# pool2
pool2W = 3
pool2H = 3
pool2hStride = 2
pool2vStride = 2
pool2 = MaxPooling(conv2, pool2W, pool2H, pool2hStride, pool2vStride)
pool2 = MaxPooling(conv2, pool2W, pool2H, pool2hStride, pool2vStride, imageLayout = "cudnn")
# conv3
kW3 = 5
@ -70,19 +67,18 @@ DNN=[
hStride3 = 1
vStride3 = 1
# weight[cMap3, kW3 * kH3 * cMap2]
conv3 = ConvBNReLULayer(pool2, cMap2, cMap3, 800, kW3, kH3, hStride3, vStride3, conv3WScale, conv3BValue)
conv3 = ConvBNReLULayer(pool2, cMap3, 800, kW3, kH3, hStride3, vStride3, conv3WScale, conv3BValue, scScale)
# pool3
pool3W = 3
pool3H = 3
pool3hStride = 2
pool3vStride = 2
pool3 = MaxPooling(conv3, pool3W, pool3H, pool3hStride, pool3vStride)
pool3 = MaxPooling(conv3, pool3W, pool3H, pool3hStride, pool3vStride, imageLayout = "cudnn")
hiddenDim = 64
h1 = DnnBNReLULayer(576, hiddenDim, pool3, fc1WScale, fc1BValue)
h1_d = Dropout(h1)
ol = DNNLastLayer(hiddenDim, labelDim, h1_d, fc2WScale, fc2BValue)
ol = DNNLastLayer(hiddenDim, labelDim, h1, fc2WScale, fc2BValue)
CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
Err = ErrorPrediction(labels, ol, tag = Eval)

View file

@ -16,6 +16,7 @@ command=Train:AddBNEval:Test
stderr=$OutputDir$/03_ResNet
traceLevel=1
numMBsToShowResult=200
Proj16to32Filename = $ConfigDir$/16to32.txt
Proj32to64Filename = $ConfigDir$/32to64.txt
@ -45,8 +46,6 @@ Train=[
gradientBits=1
]
]
numMBsToShowResult=10
]
reader=[

View file

@ -1,52 +1,52 @@
m=LoadModel($CurModel$, format=cntk)
SetDefaultModel(m)
conv1.bn_e = BatchNormalization(conv1.c, conv1.sc, conv1.b, conv1.m, conv1.isd, eval = true, spatial = true)
conv1.bn_e = BatchNormalization(conv1.c, conv1.sc, conv1.b, conv1.m, conv1.isd, eval = true, spatial = true, imageLayout = "cudnn")
SetNodeInput(conv1.y, 0, conv1.bn_e)
rn1_1.bn1_e = BatchNormalization(rn1_1.c1, rn1_1.sc1, rn1_1.b1, rn1_1.m1, rn1_1.isd1, eval = true, spatial = true)
rn1_1.bn1_e = BatchNormalization(rn1_1.c1, rn1_1.sc1, rn1_1.b1, rn1_1.m1, rn1_1.isd1, eval = true, spatial = true, imageLayout = "cudnn")
SetNodeInput(rn1_1.y1, 0, rn1_1.bn1_e)
rn1_1.bn2_e = BatchNormalization(rn1_1.c2, rn1_1.sc2, rn1_1.b2, rn1_1.m2, rn1_1.isd2, eval = true, spatial = true)
rn1_1.bn2_e = BatchNormalization(rn1_1.c2, rn1_1.sc2, rn1_1.b2, rn1_1.m2, rn1_1.isd2, eval = true, spatial = true, imageLayout = "cudnn")
SetNodeInput(rn1_1.p, 0, rn1_1.bn2_e)
rn1_2.bn1_e = BatchNormalization(rn1_2.c1, rn1_2.sc1, rn1_2.b1, rn1_2.m1, rn1_2.isd1, eval = true, spatial = true)
rn1_2.bn1_e = BatchNormalization(rn1_2.c1, rn1_2.sc1, rn1_2.b1, rn1_2.m1, rn1_2.isd1, eval = true, spatial = true, imageLayout = "cudnn")
SetNodeInput(rn1_2.y1, 0, rn1_2.bn1_e)
rn1_2.bn2_e = BatchNormalization(rn1_2.c2, rn1_2.sc2, rn1_2.b2, rn1_2.m2, rn1_2.isd2, eval = true, spatial = true)
rn1_2.bn2_e = BatchNormalization(rn1_2.c2, rn1_2.sc2, rn1_2.b2, rn1_2.m2, rn1_2.isd2, eval = true, spatial = true, imageLayout = "cudnn")
SetNodeInput(rn1_2.p, 0, rn1_2.bn2_e)
rn1_3.bn1_e = BatchNormalization(rn1_3.c1, rn1_3.sc1, rn1_3.b1, rn1_3.m1, rn1_3.isd1, eval = true, spatial = true)
rn1_3.bn1_e = BatchNormalization(rn1_3.c1, rn1_3.sc1, rn1_3.b1, rn1_3.m1, rn1_3.isd1, eval = true, spatial = true, imageLayout = "cudnn")
SetNodeInput(rn1_3.y1, 0, rn1_3.bn1_e)
rn1_3.bn2_e = BatchNormalization(rn1_3.c2, rn1_3.sc2, rn1_3.b2, rn1_3.m2, rn1_3.isd2, eval = true, spatial = true)
rn1_3.bn2_e = BatchNormalization(rn1_3.c2, rn1_3.sc2, rn1_3.b2, rn1_3.m2, rn1_3.isd2, eval = true, spatial = true, imageLayout = "cudnn")
SetNodeInput(rn1_3.p, 0, rn1_3.bn2_e)
rn2_1.bn1_e = BatchNormalization(rn2_1.c1, rn2_1.sc1, rn2_1.b1, rn2_1.m1, rn2_1.isd1, eval = true, spatial = true)
rn2_1.bn1_e = BatchNormalization(rn2_1.c1, rn2_1.sc1, rn2_1.b1, rn2_1.m1, rn2_1.isd1, eval = true, spatial = true, imageLayout = "cudnn")
SetNodeInput(rn2_1.y1, 0, rn2_1.bn1_e)
rn2_1.bn2_e = BatchNormalization(rn2_1.c2, rn2_1.sc2, rn2_1.b2, rn2_1.m2, rn2_1.isd2, eval = true, spatial = true)
rn2_1.bn2_e = BatchNormalization(rn2_1.c2, rn2_1.sc2, rn2_1.b2, rn2_1.m2, rn2_1.isd2, eval = true, spatial = true, imageLayout = "cudnn")
SetNodeInput(rn2_1.p, 0, rn2_1.bn2_e)
rn2_2.bn1_e = BatchNormalization(rn2_2.c1, rn2_2.sc1, rn2_2.b1, rn2_2.m1, rn2_2.isd1, eval = true, spatial = true)
rn2_2.bn1_e = BatchNormalization(rn2_2.c1, rn2_2.sc1, rn2_2.b1, rn2_2.m1, rn2_2.isd1, eval = true, spatial = true, imageLayout = "cudnn")
SetNodeInput(rn2_2.y1, 0, rn2_2.bn1_e)
rn2_2.bn2_e = BatchNormalization(rn2_2.c2, rn2_2.sc2, rn2_2.b2, rn2_2.m2, rn2_2.isd2, eval = true, spatial = true)
rn2_2.bn2_e = BatchNormalization(rn2_2.c2, rn2_2.sc2, rn2_2.b2, rn2_2.m2, rn2_2.isd2, eval = true, spatial = true, imageLayout = "cudnn")
SetNodeInput(rn2_2.p, 0, rn2_2.bn2_e)
rn2_3.bn1_e = BatchNormalization(rn2_3.c1, rn2_3.sc1, rn2_3.b1, rn2_3.m1, rn2_3.isd1, eval = true, spatial = true)
rn2_3.bn1_e = BatchNormalization(rn2_3.c1, rn2_3.sc1, rn2_3.b1, rn2_3.m1, rn2_3.isd1, eval = true, spatial = true, imageLayout = "cudnn")
SetNodeInput(rn2_3.y1, 0, rn2_3.bn1_e)
rn2_3.bn2_e = BatchNormalization(rn2_3.c2, rn2_3.sc2, rn2_3.b2, rn2_3.m2, rn2_3.isd2, eval = true, spatial = true)
rn2_3.bn2_e = BatchNormalization(rn2_3.c2, rn2_3.sc2, rn2_3.b2, rn2_3.m2, rn2_3.isd2, eval = true, spatial = true, imageLayout = "cudnn")
SetNodeInput(rn2_3.p, 0, rn2_3.bn2_e)
rn3_1.bn1_e = BatchNormalization(rn3_1.c1, rn3_1.sc1, rn3_1.b1, rn3_1.m1, rn3_1.isd1, eval = true, spatial = true)
rn3_1.bn1_e = BatchNormalization(rn3_1.c1, rn3_1.sc1, rn3_1.b1, rn3_1.m1, rn3_1.isd1, eval = true, spatial = true, imageLayout = "cudnn")
SetNodeInput(rn3_1.y1, 0, rn3_1.bn1_e)
rn3_1.bn2_e = BatchNormalization(rn3_1.c2, rn3_1.sc2, rn3_1.b2, rn3_1.m2, rn3_1.isd2, eval = true, spatial = true)
rn3_1.bn2_e = BatchNormalization(rn3_1.c2, rn3_1.sc2, rn3_1.b2, rn3_1.m2, rn3_1.isd2, eval = true, spatial = true, imageLayout = "cudnn")
SetNodeInput(rn3_1.p, 0, rn3_1.bn2_e)
rn3_2.bn1_e = BatchNormalization(rn3_2.c1, rn3_2.sc1, rn3_2.b1, rn3_2.m1, rn3_2.isd1, eval = true, spatial = true)
rn3_2.bn1_e = BatchNormalization(rn3_2.c1, rn3_2.sc1, rn3_2.b1, rn3_2.m1, rn3_2.isd1, eval = true, spatial = true, imageLayout = "cudnn")
SetNodeInput(rn3_2.y1, 0, rn3_2.bn1_e)
rn3_2.bn2_e = BatchNormalization(rn3_2.c2, rn3_2.sc2, rn3_2.b2, rn3_2.m2, rn3_2.isd2, eval = true, spatial = true)
rn3_2.bn2_e = BatchNormalization(rn3_2.c2, rn3_2.sc2, rn3_2.b2, rn3_2.m2, rn3_2.isd2, eval = true, spatial = true, imageLayout = "cudnn")
SetNodeInput(rn3_2.p, 0, rn3_2.bn2_e)
rn3_3.bn1_e = BatchNormalization(rn3_3.c1, rn3_3.sc1, rn3_3.b1, rn3_3.m1, rn3_3.isd1, eval = true, spatial = true)
rn3_3.bn1_e = BatchNormalization(rn3_3.c1, rn3_3.sc1, rn3_3.b1, rn3_3.m1, rn3_3.isd1, eval = true, spatial = true, imageLayout = "cudnn")
SetNodeInput(rn3_3.y1, 0, rn3_3.bn1_e)
rn3_3.bn2_e = BatchNormalization(rn3_3.c2, rn3_3.sc2, rn3_3.b2, rn3_3.m2, rn3_3.isd2, eval = true, spatial = true)
rn3_3.bn2_e = BatchNormalization(rn3_3.c2, rn3_3.sc2, rn3_3.b2, rn3_3.m2, rn3_3.isd2, eval = true, spatial = true, imageLayout = "cudnn")
SetNodeInput(rn3_3.p, 0, rn3_3.bn2_e)
SaveModel(m, $NewModel$, format=cntk)

View file

@ -7,8 +7,8 @@ LocalMacros = [
ImageC = 3
LabelDim = 10
features = ImageInput(ImageW, ImageH, ImageC, tag = feature)
featOffs = Const(128, rows = 3072)
features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = "cudnn")
featOffs = Const(128)
featScaled = Minus(features, featOffs)
labels = Input(LabelDim, tag = label)
@ -30,7 +30,7 @@ LocalMacros = [
DNN=[
cMap1 = 16
conv1 = ConvBNReLULayer2(featScaled, cMap1, 27, kW, kH, hStride1, vStride1, convWScale, convBValue, scValue)
conv1 = ConvBNReLULayer(featScaled, cMap1, 27, kW, kH, hStride1, vStride1, convWScale, convBValue, scValue)
rn1_1 = ResNetNode2(conv1, cMap1, 144, kW, kH, convWScale, convBValue, scValue)
rn1_2 = ResNetNode2(rn1_1, cMap1, 144, kW, kH, convWScale, convBValue, scValue)
@ -38,13 +38,13 @@ DNN=[
cMap2 = 32
rn2_1_Wproj = Parameter(cMap2, cMap1, init = fromFile, initFromFilePath = "$Proj16to32Filename$", needGradient = false)
rn2_1 = ResNetNode2Conv(rn1_3, cMap2, 144, 288, kW, kH, convWScale, convBValue, scValue, rn2_1_Wproj)
rn2_1 = ResNetNode2Inc(rn1_3, cMap2, 144, 288, kW, kH, convWScale, convBValue, scValue, rn2_1_Wproj)
rn2_2 = ResNetNode2(rn2_1, cMap2, 288, kW, kH, convWScale, convBValue, scValue)
rn2_3 = ResNetNode2(rn2_2, cMap2, 288, kW, kH, convWScale, convBValue, scValue)
cMap3 = 64
rn3_1_Wproj = Parameter(cMap3, cMap2, init = fromFile, initFromFilePath = "$Proj32to64Filename$", needGradient = false)
rn3_1 = ResNetNode2Conv(rn2_3, cMap3, 288, 576, kW, kH, convWScale, convBValue, scValue, rn3_1_Wproj)
rn3_1 = ResNetNode2Inc(rn2_3, cMap3, 288, 576, kW, kH, convWScale, convBValue, scValue, rn3_1_Wproj)
rn3_2 = ResNetNode2(rn3_1, cMap3, 576, kW, kH, convWScale, convBValue, scValue)
rn3_3 = ResNetNode2(rn3_2, cMap3, 576, kW, kH, convWScale, convBValue, scValue)
@ -53,7 +53,7 @@ DNN=[
poolH = 3
poolhStride = 2
poolvStride = 2
pool = AveragePooling(rn3_3, poolW, poolH, poolhStride, poolvStride)
pool = AveragePooling(rn3_3, poolW, poolH, poolhStride, poolvStride, imageLayout = "cudnn")
ol = DnnLastLayer(576, labelDim, pool, fc1WScale, fc1BValue)

View file

@ -1,83 +1,71 @@
ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue)
{
W = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
b = Parameter(outMap, 1, init = fixedValue, value = bValue)
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true)
b = ImageParameter(1, 1, outMap, init = fixedValue, value = bValue, imageLayout = "cudnn")
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
p = Plus(c, b);
y = RectifiedLinear(p);
}
ConvBNReLULayer(inp, inMap, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue)
{
W = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
b = Parameter(inMap, 1, init = Gaussian, initValueScale = 0.03)
sc = Parameter(inMap, 1, init = Gaussian, initValueScale = 0.03)
m = Parameter(inMap, 1, init = fixedValue, value = 0, needGradient = false)
isd = Parameter(inMap, 1, init = fixedValue, value = 0, needGradient = false)
bn = BatchNormalization(inp, sc, b, m, isd, eval = false, spatial = true)
c = Convolution(W, bn, kW, kH, outMap, hStride, vStride, zeroPadding = true)
y = RectifiedLinear(c);
}
ConvBNReLULayer2(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scValue)
ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue, scScale)
{
W = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
b = Parameter(outMap, 1, init = fixedValue, value = bValue)
sc = Parameter(outMap, 1, init = Gaussian, initValueScale = scValue)
sc = Parameter(outMap, 1, init = Gaussian, initValueScale = scScale)
m = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
isd = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true)
bn = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true, expAvgFactor = 1.0)
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
bn = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true, expAvgFactor = 1.0, imageLayout = "cudnn")
y = RectifiedLinear(bn);
}
ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue)
ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scScale)
{
W1 = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
b1 = Parameter(outMap, 1, init = fixedValue, value = bValue)
sc1 = Parameter(outMap, 1, init = Gaussian, initValueScale = scValue)
sc1 = Parameter(outMap, 1, init = Gaussian, initValueScale = scScale)
m1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
isd1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
c1 = Convolution(W1, inp, kW, kH, outMap, 1, 1, zeroPadding = true)
bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true, expAvgFactor = 1.0)
c1 = Convolution(W1, inp, kW, kH, outMap, 1, 1, zeroPadding = true, imageLayout = "cudnn")
bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true, expAvgFactor = 1.0, imageLayout = "cudnn")
y1 = RectifiedLinear(bn1);
W2 = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
b2 = Parameter(outMap, 1, init = fixedValue, value = bValue)
sc2 = Parameter(outMap, 1, init = Gaussian, initValueScale = scValue)
sc2 = Parameter(outMap, 1, init = Gaussian, initValueScale = scScale)
m2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
isd2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
c2 = Convolution(W2, y1, kW, kH, outMap, 1, 1, zeroPadding = true)
bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true, expAvgFactor = 1.0)
c2 = Convolution(W2, y1, kW, kH, outMap, 1, 1, zeroPadding = true, imageLayout = "cudnn")
bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true, expAvgFactor = 1.0, imageLayout = "cudnn")
p = Plus(bn2, inp)
y2 = RectifiedLinear(p);
}
ResNetNode2Conv(inp, outMap, inWCount, wCount, kW, kH, wScale, bValue, scValue, Wproj)
ResNetNode2Inc(inp, outMap, inWCount, wCount, kW, kH, wScale, bValue, scScale, Wproj)
{
W1 = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
b1 = Parameter(outMap, 1, init = fixedValue, value = bValue)
sc1 = Parameter(outMap, 1, init = Gaussian, initValueScale = scValue)
sc1 = Parameter(outMap, 1, init = Gaussian, initValueScale = scScale)
m1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
isd1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
c1 = Convolution(W1, inp, kW, kH, outMap, 2, 2, zeroPadding = true)
bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true, expAvgFactor = 1.0)
c1 = Convolution(W1, inp, kW, kH, outMap, 2, 2, zeroPadding = true, imageLayout = "cudnn")
bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true, expAvgFactor = 1.0, imageLayout = "cudnn")
y1 = RectifiedLinear(bn1);
W2 = Parameter(outMap, wCount, init = Gaussian, initValueScale = wScale)
b2 = Parameter(outMap, 1, init = fixedValue, value = bValue)
sc2 = Parameter(outMap, 1, init = Gaussian, initValueScale = scValue)
sc2 = Parameter(outMap, 1, init = Gaussian, initValueScale = scScale)
m2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
isd2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
c2 = Convolution(W2, y1, kW, kH, outMap, 1, 1, zeroPadding = true)
bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true, expAvgFactor = 1.0)
c2 = Convolution(W2, y1, kW, kH, outMap, 1, 1, zeroPadding = true, imageLayout = "cudnn")
bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true, expAvgFactor = 1.0, imageLayout = "cudnn")
cproj = Convolution(Wproj, inp, 1, 1, outMap, 2, 2, zeroPadding = false)
cproj = Convolution(Wproj, inp, 1, 1, outMap, 2, 2, zeroPadding = false, imageLayout = "cudnn")
p = Plus(bn2, cproj)
y2 = RectifiedLinear(p);
}
@ -94,13 +82,13 @@ DnnReLULayer(inDim, outDim, x, wScale, bValue)
DnnBNReLULayer(inDim, outDim, x, wScale, bValue)
{
W = Parameter(outDim, inDim, init = Gaussian, initValueScale = wScale)
b = Parameter(inDim, 1, init = fixedValue, value = bValue)
sc = Parameter(inDim, 1, init = Gaussian, initValueScale = 0.01)
m = Parameter(inDim, 1, init = fixedValue, value = 0, needGradient = false)
isd = Parameter(inDim, 1, init = fixedValue, value = 0, needGradient = false)
bn = BatchNormalization(x, sc, b, m, isd, eval = false, spatial = false)
t = Times(W, bn)
y = RectifiedLinear(t)
b = Parameter(outDim, 1, init = fixedValue, value = bValue)
sc = Parameter(outDim, 1, init = Gaussian, initValueScale = 0.01)
m = Parameter(outDim, 1, init = fixedValue, value = 0, needGradient = false)
isd = Parameter(outDim, 1, init = fixedValue, value = 0, needGradient = false)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, isd, eval = false, spatial = false, imageLayout = "cudnn")
y = RectifiedLinear(bn)
}
DnnLastLayer(hiddenDim, labelDim, x, wScale, bValue)

View file

@ -15,7 +15,7 @@ Short description of the network:
01_Convolution.ndl is a convolutional network which has 3 convolutional and 3 max pooling layers and resembles the network described here:
https://code.google.com/p/cuda-convnet/source/browse/trunk/example-layers/layers-80sec.cfg
(main differences are usage of max pooling layers everywhere rather than mix of max and average pooling, as well as dropout in fully-connected layer).
The network produces 22% of error after training for about 4 minutes on GPU.
The network produces 21% of error after training for about 3 minutes on GPU.
To run the sample, navigate to this folder and run the following command:
<path to CNTK executable> configFile=01_Conv.config configName=01_Conv
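
As a hedged illustration (not one of this commit's files): with the RootDir / ConfigDir / DataDir / OutputDir variables introduced here, the data and output locations can presumably be redirected by overriding those variables on the CNTK command line, along these lines:

<path to CNTK executable> configFile=01_Conv.config configName=01_Conv DataDir=<path to CIFAR-10 data> OutputDir=<path for models and logs>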

View file

@ -1,3 +1,10 @@
RootDir = "."
ConfigDir = "$RootDir$"
DataDir = "$RootDir$"
OutputDir = "$RootDir$/Output"
ModelDir = "$OutputDir$/Models"
ndlMacros=$ConfigDir$/Macros.ndl
precision=float
@ -7,10 +14,13 @@ command=Train:AddTop5Eval:Test
parallelTrain=false
stderr=$OutputDir$/AlexNet
traceLevel=1
numMBsToShowResult=500
Train=[
action=train
modelPath=$ModelDir$/AlexNet
traceLevel=1
NDLNetworkBuilder=[
networkDescription=$ConfigDir$/AlexNet.ndl
@ -35,7 +45,7 @@ Train=[
]
]
numMBsToShowResult=10
numMBsToShowResult=100
]
reader=[
@ -44,7 +54,7 @@ Train=[
# <full path to image><tab><numerical label (0-based class id)>
# Example:
# C:\Data\ImageNet\2012\train\n01440764\n01440764_10026.JPEG<tab>0
file=$ConfigDir$/train_map_nfs.txt
file=$ConfigDir$/train_map.txt
# Randomize images before every epoch. Possible values: None, Auto. Default: Auto.
randomize=Auto
features=[
@ -93,7 +103,7 @@ Test=[
reader=[
readerType=ImageReader
file=$ConfigDir$/val_map_nfs.txt
file=$ConfigDir$/val_map.txt
randomize=None
features=[
width=224

View file

@ -7,7 +7,7 @@ ndlMacros = [
ImageC = 3
LabelDim = 1000
features = ImageInput(ImageW, ImageH, ImageC, tag = feature)
features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = "cudnn")
labels = Input(LabelDim, tag = label)
conv1WScale = 0.95
@ -36,14 +36,14 @@ DNN=[
hStride1 = 4
vStride1 = 4
# weight[cMap1, kW1 * kH1 * ImageC]
conv1_act = ConvReLULayer(features, cMap1, 363, kW1, kH1, hStride1, vStride1, conv1WScale, conv1BValue)
conv1 = ConvReLULayer(features, cMap1, 363, kW1, kH1, hStride1, vStride1, conv1WScale, conv1BValue)
# pool1
pool1W = 3
pool1H = 3
pool1hStride = 2
pool1vStride = 2
pool1 = MaxPooling(conv1_act, pool1W, pool1H, pool1hStride, pool1vStride)
pool1 = MaxPooling(conv1, pool1W, pool1H, pool1hStride, pool1vStride, imageLayout = "cudnn")
# conv2
kW2 = 5
@ -52,14 +52,14 @@ DNN=[
hStride2 = 1
vStride2 = 1
# weight[cMap2, kW2 * kH2 * cMap1]
conv2_act = ConvReLULayer(pool1, cMap2, 1600, kW2, kH2, hStride2, vStride2, conv2WScale, conv2BValue)
conv2 = ConvReLULayer(pool1, cMap2, 1600, kW2, kH2, hStride2, vStride2, conv2WScale, conv2BValue)
# pool2
pool2W = 3
pool2H = 3
pool2hStride = 2
pool2vStride = 2
pool2 = MaxPooling(conv2_act, pool2W, pool2H, pool2hStride, pool2vStride)
pool2 = MaxPooling(conv2, pool2W, pool2H, pool2hStride, pool2vStride, imageLayout = "cudnn")
# conv3
kW3 = 3
@ -68,7 +68,7 @@ DNN=[
hStride3 = 1
vStride3 = 1
# weight[cMap3, kW3 * kH3 * cMap2]
conv3_act = ConvReLULayer(pool2, cMap3, 1728, kW3, kH3, hStride3, vStride3, conv3WScale, conv3BValue)
conv3 = ConvReLULayer(pool2, cMap3, 1728, kW3, kH3, hStride3, vStride3, conv3WScale, conv3BValue)
# conv4
kW4 = 3
@ -77,7 +77,7 @@ DNN=[
hStride4 = 1
vStride4 = 1
# weight[cMap4, kW4 * kH4 * cMap3]
conv4_act = ConvReLULayer(conv3_act, cMap4, 3456, kW4, kH4, hStride4, vStride4, conv4WScale, conv4BValue)
conv4 = ConvReLULayer(conv3, cMap4, 3456, kW4, kH4, hStride4, vStride4, conv4WScale, conv4BValue)
# conv5
kW5 = 3
@ -86,14 +86,14 @@ DNN=[
hStride5 = 1
vStride5 = 1
# weight[cMap5, kW5 * kH5 * cMap4]
conv5_act = ConvReLULayer(conv4_act, cMap5, 2304, kW5, kH5, hStride5, vStride5, conv5WScale, conv5BValue)
conv5 = ConvReLULayer(conv4, cMap5, 2304, kW5, kH5, hStride5, vStride5, conv5WScale, conv5BValue)
# pool3
pool3W = 3
pool3H = 3
pool3hStride = 2
pool3vStride = 2
pool3 = MaxPooling(conv5_act, pool3W, pool3H, pool3hStride, pool3vStride)
pool3 = MaxPooling(conv5, pool3W, pool3H, pool3hStride, pool3vStride, imageLayout = "cudnn")
hiddenDim = 4096
h1 = DNNReLULayer(9216, hiddenDim, pool3, fc1WScale, fc1BValue)

View file

@ -1,10 +1,10 @@
ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue)
{
convW = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
conv = Convolution(convW, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true)
convB = Parameter(outMap, 1, init = fixedValue, value = bValue)
convPlusB = Plus(conv, convB);
act = RectifiedLinear(convPlusB);
W = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
b = ImageParameter(1, 1, outMap, init = fixedValue, value = bValue, imageLayout = "cudnn")
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
z = Plus(c, b);
y = RectifiedLinear(z);
}
DNNReLULayer(inDim, outDim, x, wScale, bValue)

View file

@ -6,8 +6,8 @@ ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue,
m = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
isd = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true)
bn = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true, expAvgFactor = 1.0)
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
bn = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true, expAvgFactor = 1.0, imageLayout = "cudnn")
y = RectifiedLinear(bn);
}
@ -20,8 +20,8 @@ ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue)
m1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
isd1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
c1 = Convolution(W1, inp, kW, kH, outMap, 1, 1, zeroPadding = true)
bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true, expAvgFactor = 1.0)
c1 = Convolution(W1, inp, kW, kH, outMap, 1, 1, zeroPadding = true, imageLayout = "cudnn")
bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true, expAvgFactor = 1.0, imageLayout = "cudnn")
y1 = RectifiedLinear(bn1);
W2 = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
@ -30,8 +30,8 @@ ResNetNode2(inp, outMap, inWCount, kW, kH, wScale, bValue, scValue)
m2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
isd2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
c2 = Convolution(W2, y1, kW, kH, outMap, 1, 1, zeroPadding = true)
bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true, expAvgFactor = 1.0)
c2 = Convolution(W2, y1, kW, kH, outMap, 1, 1, zeroPadding = true, imageLayout = "cudnn")
bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true, expAvgFactor = 1.0, imageLayout = "cudnn")
p = Plus(bn2, inp)
y2 = RectifiedLinear(p);
}
@ -45,8 +45,8 @@ ResNetNode2Conv(inp, outMap, inWCount, wCount, kW, kH, wScale, bValue, scValue,
m1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
isd1 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
c1 = Convolution(W1, inp, kW, kH, outMap, 2, 2, zeroPadding = true)
bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true, expAvgFactor = 1.0)
c1 = Convolution(W1, inp, kW, kH, outMap, 2, 2, zeroPadding = true, imageLayout = "cudnn")
bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true, expAvgFactor = 1.0, imageLayout = "cudnn")
y1 = RectifiedLinear(bn1);
W2 = Parameter(outMap, wCount, init = Gaussian, initValueScale = wScale)
@ -55,10 +55,10 @@ ResNetNode2Conv(inp, outMap, inWCount, wCount, kW, kH, wScale, bValue, scValue,
m2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
isd2 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
c2 = Convolution(W2, y1, kW, kH, outMap, 1, 1, zeroPadding = true)
bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true, expAvgFactor = 1.0)
c2 = Convolution(W2, y1, kW, kH, outMap, 1, 1, zeroPadding = true, imageLayout = "cudnn")
bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true, expAvgFactor = 1.0, imageLayout = "cudnn")
cproj = Convolution(Wproj, inp, 1, 1, outMap, 2, 2, zeroPadding = false)
cproj = Convolution(Wproj, inp, 1, 1, outMap, 2, 2, zeroPadding = false, imageLayout = "cudnn")
p = Plus(bn2, cproj)
y2 = RectifiedLinear(p);
}
@ -73,8 +73,8 @@ ResNetNode3(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue)
m1 = Parameter(convMap, 1, init = fixedValue, value = 0, needGradient = false)
isd1 = Parameter(convMap, 1, init = fixedValue, value = 0, needGradient = false)
c1 = Convolution(W1, inp, 1, 1, convMap, 1, 1, zeroPadding = false)
bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true)
c1 = Convolution(W1, inp, 1, 1, convMap, 1, 1, zeroPadding = false, imageLayout = "cudnn")
bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true, imageLayout = "cudnn")
y1 = RectifiedLinear(bn1);
# 3x3 convolution.
@ -84,8 +84,8 @@ ResNetNode3(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue)
m2 = Parameter(convMap, 1, init = fixedValue, value = 0, needGradient = false)
isd2 = Parameter(convMap, 1, init = fixedValue, value = 0, needGradient = false)
c2 = Convolution(W2, y1, 3, 3, convMap, 1, 1, zeroPadding = true)
bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true, expAvgFactor = 1.0)
c2 = Convolution(W2, y1, 3, 3, convMap, 1, 1, zeroPadding = true, imageLayout = "cudnn")
bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true, expAvgFactor = 1.0, imageLayout = "cudnn")
y2 = RectifiedLinear(bn2);
# 1x1 expanding convolution.
@ -95,8 +95,8 @@ ResNetNode3(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue)
m3 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
isd3 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
c3 = Convolution(W3, y2, 1, 1, outMap, 1, 1, zeroPadding = false)
bn3 = BatchNormalization(c3, sc3, b3, m3, isd3, eval = false, spatial = true)
c3 = Convolution(W3, y2, 1, 1, outMap, 1, 1, zeroPadding = false, imageLayout = "cudnn")
bn3 = BatchNormalization(c3, sc3, b3, m3, isd3, eval = false, spatial = true, imageLayout = "cudnn")
p = Plus(bn3, inp)
y3 = RectifiedLinear(p);
@ -111,8 +111,8 @@ ResNetNode3Inc(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue,
m1 = Parameter(convMap, 1, init = fixedValue, value = 0, needGradient = false)
isd1 = Parameter(convMap, 1, init = fixedValue, value = 0, needGradient = false)
c1 = Convolution(W1, inp, 1, 1, convMap, 1, 1, zeroPadding = false)
bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true)
c1 = Convolution(W1, inp, 1, 1, convMap, 1, 1, zeroPadding = false, imageLayout = "cudnn")
bn1 = BatchNormalization(c1, sc1, b1, m1, isd1, eval = false, spatial = true, imageLayout = "cudnn")
y1 = RectifiedLinear(bn1);
# 3x3 convolution.
@ -122,8 +122,8 @@ ResNetNode3Inc(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue,
m2 = Parameter(convMap, 1, init = fixedValue, value = 0, needGradient = false)
isd2 = Parameter(convMap, 1, init = fixedValue, value = 0, needGradient = false)
c2 = Convolution(W2, y1, 3, 3, convMap, 2, 2, zeroPadding = true)
bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true, expAvgFactor = 1.0)
c2 = Convolution(W2, y1, 3, 3, convMap, 2, 2, zeroPadding = true, imageLayout = "cudnn")
bn2 = BatchNormalization(c2, sc2, b2, m2, isd2, eval = false, spatial = true, expAvgFactor = 1.0, imageLayout = "cudnn")
y2 = RectifiedLinear(bn2);
# 1x1 expanding convolution.
@ -133,11 +133,11 @@ ResNetNode3Inc(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue,
m3 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
isd3 = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
c3 = Convolution(W3, y2, 1, 1, outMap, 1, 1, zeroPadding = false)
bn3 = BatchNormalization(c3, sc3, b3, m3, isd3, eval = false, spatial = true)
c3 = Convolution(W3, y2, 1, 1, outMap, 1, 1, zeroPadding = false, imageLayout = "cudnn")
bn3 = BatchNormalization(c3, sc3, b3, m3, isd3, eval = false, spatial = true, imageLayout = "cudnn")
# Increasing input dimension convolution
cProj = Convolution(wProj, inp, 1, 1, outMap, 2, 2, zeroPadding = false)
cProj = Convolution(wProj, inp, 1, 1, outMap, 2, 2, zeroPadding = false, imageLayout = "cudnn")
p = Plus(bn3, cProj)
y3 = RectifiedLinear(p);

View file

@ -32,10 +32,10 @@ Train=[
SGD=[
epochSize=0
minibatchSize=2
learningRatesPerMB=0.1*20:0.03*10:0.01*30:0.003
minibatchSize=32
learningRatesPerMB=0.1*30:0.03*25:0.01*25:0.003*25:0.001
momentumPerMB=0.9
maxEpochs=100
maxEpochs=120
gradUpdateType=None
L2RegWeight=0.0001
dropoutRate=0
@ -72,7 +72,7 @@ Train=[
# Horizontal random flip, will be enabled by default if cropType=Random
#hflip=0
# Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
cropRatio=0.875
cropRatio=0.46666:0.875
# Crop scale ratio jitter type.
# Possible values: None, UniRatio, UniLength, UniArea. Default: UniRatio
jitterType=UniRatio
@ -99,7 +99,7 @@ Test=[
action=test
modelPath=$ModelDir$/ResNet_152.Top5
# Set minibatch size for testing.
minibatchSize=128
minibatchSize=32
NDLNetworkBuilder=[
networkDescription=$ConfigDir$/ResNet_152.ndl

View file

@ -7,7 +7,7 @@ ndlMacros = [
ImageC = 3
LabelDim = 1000
features = ImageInput(ImageW, ImageH, ImageC, tag = feature)
features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = "cudnn")
featOffs = Const(0, rows = 150528)
featScaled = Plus(features, featOffs)
labels = Input(LabelDim, tag = label)
@ -42,7 +42,7 @@ DNN=[
cMap6 = 2048
conv1 = ConvBNReLULayer(featScaled, cMap1, 147, 7, 7, 2, 2, convWScale, convBValue, scValue)
pool1 = MaxPooling(conv1, poolW, poolH, poolhs, poolvs)
pool1 = MaxPooling(conv1, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")
rn1_1_Wproj = Parameter(cMap3, cMap1, init = fromFile, initFromFilePath = "$Proj64to256Filename$", needGradient = false)
rn1_1 = ResNetNode3Inc(pool1, cMap1, cMap1, cMap3, 576, convWScale, convBValue, scValue, rn1_1_Wproj)
@ -102,7 +102,7 @@ DNN=[
rn4_2 = ResNetNode3(rn4_1, cMap6, cMap4, cMap6, 4608, convWScale, convBValue, scValue)
rn4_3 = ResNetNode3(rn4_2, cMap6, cMap4, cMap6, 4608, convWScale, convBValue, scValue)
pool5 = AveragePooling(rn4_3, poolW, poolH, poolhs, poolvs)
pool5 = AveragePooling(rn4_3, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")
ol = DnnLayer(8192, labelDim, pool5, fcWScale, fcBValue)

View file

@ -32,9 +32,9 @@ Train=[
SGD=[
epochSize=0
minibatchSize=64
learningRatesPerMB=0.1*20:0.03*10:0.01*30:0.003
learningRatesPerMB=0.1*30:0.03*25:0.01*25:0.003*25:0.001
momentumPerMB=0.9
maxEpochs=100
maxEpochs=120
gradUpdateType=None
L2RegWeight=0.0001
dropoutRate=0
@ -71,7 +71,7 @@ Train=[
# Horizontal random flip, will be enabled by default if cropType=Random
#hflip=0
# Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
cropRatio=0.875
cropRatio=0.46666:0.875
# Crop scale ratio jitter type.
# Possible values: None, UniRatio, UniLength, UniArea. Default: UniRatio
jitterType=UniRatio
@ -98,7 +98,7 @@ Test=[
action=test
modelPath=$ModelDir$/ResNet_34.Top5
# Set minibatch size for testing.
minibatchSize=128
minibatchSize=64
NDLNetworkBuilder=[
networkDescription=$ConfigDir$/ResNet_34.ndl

View file

@ -7,9 +7,7 @@ ndlMacros = [
ImageC = 3
LabelDim = 1000
features = ImageInput(ImageW, ImageH, ImageC, tag = feature)
featOffs = Const(0, rows = 150528)
featScaled = Plus(features, featOffs)
features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = "cudnn")
labels = Input(LabelDim, tag = label)
# Kernels width and height.
@ -35,8 +33,8 @@ ndlMacros = [
DNN=[
cMap1 = 64
conv1 = ConvBNReLULayer(featScaled, cMap1, 147, 7, 7, 2, 2, convWScale, convBValue, scValue)
pool1 = MaxPooling(conv1, poolW, poolH, poolhs, poolvs)
conv1 = ConvBNReLULayer(features, cMap1, 147, 7, 7, 2, 2, convWScale, convBValue, scValue)
pool1 = MaxPooling(conv1, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")
rn1_1 = ResNetNode2(pool1, cMap1, 576, kW, kH, convWScale, convBValue, scValue)
rn1_2 = ResNetNode2(rn1_1, cMap1, 576, kW, kH, convWScale, convBValue, scValue)
@ -64,7 +62,7 @@ DNN=[
rn4_2 = ResNetNode2(rn4_1, cMap4, 4608, kW, kH, convWScale, convBValue, scValue)
rn4_3 = ResNetNode2(rn4_2, cMap4, 4608, kW, kH, convWScale, convBValue, scValue)
pool5 = AveragePooling(rn4_3, poolW, poolH, poolhs, poolvs)
pool5 = AveragePooling(rn4_3, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")
ol = DnnLayer(4608, labelDim, pool5, fcWScale, fcBValue)

View file

@ -12,14 +12,13 @@ DnnReLULayer(inDim, outDim, x, wScale, bValue)
DnnBNReLULayer(inDim, outDim, x, wScale, bValue)
{
W = Parameter(outDim, inDim, init = Gaussian, initValueScale = wScale)
b = Parameter(inDim, 1, init = fixedValue, value = bValue)
sc = Parameter(inDim, 1, init = Gaussian, initValueScale = 0.01)
m = Parameter(inDim, 1, init = fixedValue, value = 0, needGradient = false)
isd = Parameter(inDim, 1, init = fixedValue, value = 0, needGradient = false)
bn = BatchNormalization(x, sc, b, m, isd, eval = false, spatial = false)
t = Times(W, bn)
y = RectifiedLinear(t)
b = Parameter(outDim, 1, init = fixedValue, value = bValue)
sc = Parameter(outDim, 1, init = Gaussian, initValueScale = 0.01)
m = Parameter(outDim, 1, init = fixedValue, value = 0, needGradient = false)
isd = Parameter(outDim, 1, init = fixedValue, value = 0, needGradient = false)
t = Times(W, x)
bn = BatchNormalization(t, sc, b, m, isd, eval = false, spatial = false)
y = RectifiedLinear(bn)
}
# Fully-connected layer.
@ -35,8 +34,8 @@ DnnLayer(inDim, outDim, x, wScale, bValue)
ConvReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue)
{
W = Parameter(outMap, inWCount, init = Gaussian, initValueScale = wScale)
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true)
b = Parameter(outMap, 1, init = fixedValue, value = bValue)
b = ImageParameter(1, 1, outMap, init = fixedValue, value = bValue, imageLayout = "cudnn")
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
z = Plus(c, b);
y = RectifiedLinear(z);
}
@ -50,7 +49,7 @@ ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue,
m = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
isd = Parameter(outMap, 1, init = fixedValue, value = 0, needGradient = false)
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true)
bn = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true)
c = Convolution(W, inp, kW, kH, outMap, hStride, vStride, zeroPadding = true, imageLayout = "cudnn")
bn = BatchNormalization(c, sc, b, m, isd, eval = false, spatial = true, imageLayout = "cudnn")
y = RectifiedLinear(bn);
}

View file

@ -1,21 +1,28 @@
WorkDir=.
ModelDir=$WorkDir$/_out/$ConfigName$
stderr=$WorkDir$/_out/$ConfigName$
RootDir = "."
ndlMacros=$WorkDir$/Macros.ndl
ConfigDir = "$RootDir$"
DataDir = "$RootDir$"
OutputDir = "$RootDir$/Output"
ModelDir = "$OutputDir$/Models"
ndlMacros=$ConfigDir$/Macros.ndl
precision=float
deviceId=Auto
command=Train:AddTop5Eval:Test
stderr=$OutputDir$/VGG_A
traceLevel=1
numMBsToShowResult=500
Train=[
action=train
modelPath=$ModelDir$/VGG_A
traceLevel=1
NDLNetworkBuilder=[
networkDescription=$WorkDir$/VGG_A.ndl
networkDescription=$ConfigDir$/VGG_A.ndl
]
SGD=[
@ -37,7 +44,7 @@ Train=[
# <full path to image><tab><numerical label (0-based class id)>
# Example:
# C:\Data\ImageNet\2012\train\n01440764\n01440764_10026.JPEG<tab>0
file=$WorkDir$/train_map.txt
file=$ConfigDir$/train_map.txt
# Randomize images before every epoch. Possible values: None, Auto. Default: Auto.
randomize=Auto
features=[
@ -59,7 +66,7 @@ Train=[
# Possible values: nearest, linear, cubic, lanczos. Default: linear.
interpolations=Linear
# Stores mean values for each pixel in OpenCV matrix XML format.
meanFile=$WorkDir$/ImageNet1K_mean.xml
meanFile=$ConfigDir$/ImageNet1K_mean.xml
]
labels=[
labelDim=1000
@ -71,29 +78,29 @@ AddTop5Eval=[
action=edit
CurModel=$ModelDir$/VGG_A
NewModel=$ModelDir$/VGG_A.Top5
editPath=$WorkDir$/add_top5_layer.mel
editPath=$ConfigDir$/add_top5_layer.mel
]
Test=[
action=test
modelPath=$ModelDir$/VGG_A.Top5
# Set minibatch size for testing.
minibatchSize=128
minibatchSize=32
NDLNetworkBuilder=[
networkDescription=$WorkDir$/VGG_A.ndl
networkDescription=$ConfigDir$/VGG_A.ndl
]
reader=[
readerType=ImageReader
file=$WorkDir$/val_map.txt
file=$ConfigDir$/val_map.txt
randomize=None
features=[
width=224
height=224
channels=3
cropType=Center
meanFile=$WorkDir$/ImageNet1K_mean.xml
meanFile=$ConfigDir$/ImageNet1K_mean.xml
]
labels=[
labelDim=1000

View file

@ -1,13 +1,13 @@
load=ndlMnistMacros
load=ndlMacros
run=DNN
ndlMnistMacros = [
ndlMacros = [
ImageW = 224
ImageH = 224
ImageC = 3
LabelDim = 1000
features = ImageInput(ImageW, ImageH, ImageC, tag = feature)
features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = "cudnn")
labels = Input(LabelDim, tag = label)
# Kernels width and height.
@ -38,30 +38,30 @@ DNN=[
cMap1 = 64
conv1 = ConvReLULayer(features, cMap1, 27, kW, kH, hs, vs, convWScale, convBValue)
pool1 = MaxPooling(conv1, poolW, poolH, poolhs, poolvs)
pool1 = MaxPooling(conv1, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")
cMap2 = 128
conv2 = ConvReLULayer(pool1, cMap2, 576, kW, kH, hs, vs, convWScale, convBValue)
pool2 = MaxPooling(conv2, poolW, poolH, poolhs, poolvs)
pool2 = MaxPooling(conv2, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")
cMap3 = 256
conv3 = ConvReLULayer(pool2, cMap3, 1152, kW, kH, hs, vs, convWScale, convBValue)
conv4 = ConvReLULayer(conv3, cMap3, 2304, kW, kH, hs, vs, convWScale, convBValue)
pool3 = MaxPooling(conv4, poolW, poolH, poolhs, poolvs)
pool3 = MaxPooling(conv4, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")
cMap5 = 512
conv5 = ConvReLULayer(pool3, cMap5, 2304, kW, kH, hs, vs, convWScale, convBValue)
conv6 = ConvReLULayer(conv5, cMap5, 4608, kW, kH, hs, vs, convWScale, convBValue)
pool4 = MaxPooling(conv6, poolW, poolH, poolhs, poolvs)
pool4 = MaxPooling(conv6, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")
cMap6 = 512
conv7 = ConvReLULayer(pool4, cMap6, 4608, kW, kH, hs, vs, convWScale, convBValue)
conv8 = ConvReLULayer(conv7, cMap6, 4608, kW, kH, hs, vs, convWScale, convBValue)
pool5 = MaxPooling(conv8, poolW, poolH, poolhs, poolvs)
pool5 = MaxPooling(conv8, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")
hiddenDim = 4096
h1 = DnnReLULayer(25088, hiddenDim, pool5, fc1WScale, fc1BValue)

View file

@ -94,7 +94,7 @@ Test=[
action=test
modelPath=$ModelDir$/VGG_E.Top5
# Set minibatch size for testing.
minibatchSize=128
minibatchSize=16
NDLNetworkBuilder=[
networkDescription=$ConfigDir$/VGG_E.ndl

View file

@ -7,7 +7,7 @@ ndlMacros = [
ImageC = 3
LabelDim = 1000
features = ImageInput(ImageW, ImageH, ImageC, tag = feature)
features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = "cudnn")
labels = Input(LabelDim, tag = label)
# Kernels width and height.
@ -39,13 +39,13 @@ DNN=[
conv1 = ConvReLULayer(features, cMap1, 27, kW, kH, hs, vs, convWScale, convBValue)
conv2 = ConvReLULayer(conv1, cMap1, 576, kW, kH, hs, vs, convWScale, convBValue)
pool1 = MaxPooling(conv2, poolW, poolH, poolhs, poolvs)
pool1 = MaxPooling(conv2, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")
cMap3 = 128
conv3 = ConvReLULayer(pool1, cMap3, 576, kW, kH, hs, vs, convWScale, convBValue)
conv4 = ConvReLULayer(conv3, cMap3, 1152, kW, kH, hs, vs, convWScale, convBValue)
pool2 = MaxPooling(conv4, poolW, poolH, poolhs, poolvs)
pool2 = MaxPooling(conv4, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")
cMap5 = 256
conv5 = ConvReLULayer(pool2, cMap5, 1152, kW, kH, hs, vs, convWScale, convBValue)
@ -53,7 +53,7 @@ DNN=[
conv7 = ConvReLULayer(conv6, cMap5, 2304, kW, kH, hs, vs, convWScale, convBValue)
conv8 = ConvReLULayer(conv7, cMap5, 2304, kW, kH, hs, vs, convWScale, convBValue)
pool3 = MaxPooling(conv8, poolW, poolH, poolhs, poolvs)
pool3 = MaxPooling(conv8, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")
cMap9 = 512
conv9 = ConvReLULayer(pool3, cMap9, 2304, kW, kH, hs, vs, convWScale, convBValue)
@ -61,7 +61,7 @@ DNN=[
conv11 = ConvReLULayer(conv10, cMap9, 4608, kW, kH, hs, vs, convWScale, convBValue)
conv12 = ConvReLULayer(conv11, cMap9, 4608, kW, kH, hs, vs, convWScale, convBValue)
pool4 = MaxPooling(conv12, poolW, poolH, poolhs, poolvs)
pool4 = MaxPooling(conv12, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")
cMap13 = 512
conv13 = ConvReLULayer(pool4, cMap13, 4608, kW, kH, hs, vs, convWScale, convBValue)
@ -69,7 +69,7 @@ DNN=[
conv15 = ConvReLULayer(conv14, cMap13, 4608, kW, kH, hs, vs, convWScale, convBValue)
conv16 = ConvReLULayer(conv15, cMap13, 4608, kW, kH, hs, vs, convWScale, convBValue)
pool5 = MaxPooling(conv16, poolW, poolH, poolhs, poolvs)
pool5 = MaxPooling(conv16, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")
hiddenDim = 4096
h1 = DnnReLULayer(25088, hiddenDim, pool5, fc1WScale, fc1BValue)

View file

@ -94,7 +94,7 @@ Test=[
action=test
modelPath=$ModelDir$/VGG_E_BN.Top5
# Set minibatch size for testing.
minibatchSize=128
minibatchSize=16
NDLNetworkBuilder=[
networkDescription=$ConfigDir$/VGG_E_BN.ndl

View file

@ -7,9 +7,7 @@ ndlMacros = [
ImageC = 3
LabelDim = 1000
features = ImageInput(ImageW, ImageH, ImageC, tag = feature)
featOffs = Const(0, rows = 150528)
featScaled = Plus(features, featOffs)
features = ImageInput(ImageW, ImageH, ImageC, tag = feature, imageLayout = "cudnn")
labels = Input(LabelDim, tag = label)
# Kernels width and height.
@ -39,16 +37,16 @@ ndlMacros = [
DNN=[
cMap1 = 64
conv1 = ConvBNReLULayer(featScaled, cMap1, 27, kW, kH, hs, vs, convWScale, convBValue, scValue)
conv1 = ConvBNReLULayer(features, cMap1, 27, kW, kH, hs, vs, convWScale, convBValue, scValue)
conv2 = ConvBNReLULayer(conv1, cMap1, 576, kW, kH, hs, vs, convWScale, convBValue, scValue)
pool1 = MaxPooling(conv2, poolW, poolH, poolhs, poolvs)
pool1 = MaxPooling(conv2, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")
cMap3 = 128
conv3 = ConvBNReLULayer(pool1, cMap3, 576, kW, kH, hs, vs, convWScale, convBValue, scValue)
conv4 = ConvBNReLULayer(conv3, cMap3, 1152, kW, kH, hs, vs, convWScale, convBValue, scValue)
pool2 = MaxPooling(conv4, poolW, poolH, poolhs, poolvs)
pool2 = MaxPooling(conv4, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")
cMap5 = 256
conv5 = ConvBNReLULayer(pool2, cMap5, 1152, kW, kH, hs, vs, convWScale, convBValue, scValue)
@ -56,7 +54,7 @@ DNN=[
conv7 = ConvBNReLULayer(conv6, cMap5, 2304, kW, kH, hs, vs, convWScale, convBValue, scValue)
conv8 = ConvBNReLULayer(conv7, cMap5, 2304, kW, kH, hs, vs, convWScale, convBValue, scValue)
pool3 = MaxPooling(conv8, poolW, poolH, poolhs, poolvs)
pool3 = MaxPooling(conv8, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")
cMap9 = 512
conv9 = ConvBNReLULayer(pool3, cMap9, 2304, kW, kH, hs, vs, convWScale, convBValue, scValue)
@ -64,7 +62,7 @@ DNN=[
conv11 = ConvBNReLULayer(conv10, cMap9, 4608, kW, kH, hs, vs, convWScale, convBValue, scValue)
conv12 = ConvBNReLULayer(conv11, cMap9, 4608, kW, kH, hs, vs, convWScale, convBValue, scValue)
pool4 = MaxPooling(conv12, poolW, poolH, poolhs, poolvs)
pool4 = MaxPooling(conv12, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")
cMap13 = 512
conv13 = ConvBNReLULayer(pool4, cMap13, 4608, kW, kH, hs, vs, convWScale, convBValue, scValue)
@ -72,7 +70,7 @@ DNN=[
conv15 = ConvBNReLULayer(conv14, cMap13, 4608, kW, kH, hs, vs, convWScale, convBValue, scValue)
conv16 = ConvBNReLULayer(conv15, cMap13, 4608, kW, kH, hs, vs, convWScale, convBValue, scValue)
pool5 = MaxPooling(conv16, poolW, poolH, poolhs, poolvs)
pool5 = MaxPooling(conv16, poolW, poolH, poolhs, poolvs, imageLayout = "cudnn")
hiddenDim = 4096
h1 = DnnBNReLULayer(25088, hiddenDim, pool5, fc1WScale, fc1BValue)

View file

@ -47,6 +47,7 @@ using namespace std;
L"PastValue(dims, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'PastValue' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]\n"
L"FutureValue(dims, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'FutureValue' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]\n"
// TODO: ^^ DelayedValues no longer need to know their dimension. That is inferred in Validation.
L"Shift(input, fromOffsets, boundaryValue, dim=-1, offsetRanges=1, multiOffsetDim=0, tag='') = new ComputationNode [ operation = 'Shift' ; inputs = (input : boundaryValue) ; fromOffset = new IntVector [ items = fromOffsets ] ; offsetRange = new SizeVector [items= new SizeVector [ items = offsetRanges ] ]/*plus the function args*/ ]\n"
L"RowSlice(startIndex, numRows, input, needGradient = false, tag='') = new ComputationNode [ operation = 'RowSlice' ; inputs = input /*plus the function args*/ ]\n"
L"RowRepeat(input, numRepeats, needGradient = false, tag='') = new ComputationNode [ operation = 'RowRepeat' ; inputs = input /*plus the function args*/ ]\n"
L"RowStack(inputs, tag='') = new ComputationNode [ operation = 'RowStack' /*plus the function args*/ ]\n"

View file

@ -699,5 +699,4 @@ namespace Microsoft { namespace MSR { namespace ScriptableObjects {
}
template<class V> /*static*/ const std::vector<typename V::value_type> & IConfigRecord::Array(const V & vec) { return static_cast<const std::vector<typename V::value_type> &>(vec); } // use this specifically for XXXargvector
}}} // end namespaces

View file

@ -208,7 +208,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}
void Load(File& fstream, bool acceptLegacyFormat = false)
const TensorShape & Load(File& fstream, bool acceptLegacyFormat = false)
{
// format: uint32_t n, dim[0], dim[1], ..., dim[n-1]
// We are also able to read (but not write) an older format, which stores 3-dimensional tensors as size_t W, H, C
@ -232,6 +232,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
fstream >> m_dims[2] >> m_dims[0]; // stored in order C, W, H
}
InitAsNoSlice();
return *this;
}
// accessors
@ -404,7 +405,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_strides.resize(m_dims.size());
for (size_t k = 0; k < m_dims.size(); k++)
m_strides[k] = k > 0 ? m_strides[k - 1] * (ptrdiff_t)m_dims[k - 1] : 1;
m_allocation = m_dims.empty() ? 0 : m_dims.back() * (size_t)m_strides.back();
m_allocation = m_dims.empty() ? 0 : m_dims.back() * (size_t)m_strides.back(); // TODO: Or should an empty shape mean it's a scalar?
}
private:
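
A minimal sketch of what returning *this from Load enables (illustrative only; the caller and member name below are hypothetical, not from this commit): a shape can now be constructed, loaded, and assigned in one expression.

// hypothetical caller; assumes TensorShape is default-constructible and copy-assignable, as in the surrounding code
TensorShape m_shape;
void LoadShape(File& fstream)
{
    m_shape = TensorShape().Load(fstream, /*acceptLegacyFormat=*/true);
}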

View file

@ -24,6 +24,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// The methods below determine evaluation order, which is tricky in presence of recurrent loops.
// TODO: Can this be moved to a separate class?
static const vector<int> & GetRecurrenceDirections(const ComputationNodeBasePtr &);
// FormRecurrentLoops() -- MAIN ENTRY POINT for network recurrent-loop analysis. All other functions in this CPP are called only from this one.
// This function analysis the networks for recurrent loops present in the computation of 'rootNode.'
// This sets/updates:
@ -83,16 +85,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
unordered_set<ComputationNodeBasePtr> visited;
unordered_set<ComputationNodeBasePtr> recStack;
// set m_indexInLoop for all nodes except Past/FutureValueNodes in all loops
// set m_indexInLoop for all nodes except recurrent nodes in all loops
// This value is only used in the block right after this.
for (size_t j = 0; j < iter->m_nestedNodes.size(); j++)
{
ComputationNodeBasePtr node = iter->m_nestedNodes[j];
const auto & node = iter->m_nestedNodes[j];
for (size_t i = 0; i < node->GetNumInputs(); i++)
{
if (node->Input(i)->m_loopId == node->m_loopId &&
node->OperationName() != OperationNameOf(PastValueNode) &&
node->OperationName() != OperationNameOf(FutureValueNode)) // TODO: test for type RecurrentNode instead?
if (node->Input(i)->m_loopId == node->m_loopId && GetRecurrenceDirections(node).empty())
{
//assert(node->Input(i)->m_indexInLoop == 0); // No. It seems this variable really counts the number of parents.
node->Input(i)->m_indexInLoop++; // BUGBUG: this is bumping up the m_indexInLoop, but I don't think it is initialized anywhere other than PurgeStateForFormingRecurrentLoops(). i-1?
@ -146,7 +146,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#endif
}
// log the loops
for (auto & iter : m_allSEQNodes)
{
@ -168,6 +167,16 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#endif
}
// checks whether a node is recurrent, and in which direction
static vector<int> emptyVector;
static const vector<int> & GetRecurrenceDirections(const ComputationNodeBasePtr & node)
{
if (node->Is<IRecurrentNode>())
return node->As<IRecurrentNode>()->GetRecurrenceDirections();
else
return emptyVector;
}
static int DetermineLoopDirection(const std::vector<ComputationNodeBasePtr> & nestedNodes);
// get the strongly connected components from the graph
// This sets index, lowLink, m_visited, and m_inStack.
@ -299,8 +308,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
visited.insert(cur);
recStack.insert(cur);
if (cur->OperationName() != OperationNameOf(PastValueNode) && // recurrence stops at delays
cur->OperationName() != OperationNameOf(FutureValueNode))
if (GetRecurrenceDirections(cur).empty()) // recurrence stops at delays
{
for (size_t i = 0; i < cur->GetNumInputs(); i++)
if (cur->Input(i)->m_loopId == cur->m_loopId)
@ -384,28 +392,28 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// set m_steppingDirection for all loops
// TODO: Move this up to where it is used (in a separate commit since git cannot track moving and changing at the same time).
// BUGBUG: Need to extend to multi-dimensional loop directions. Use a vector<int>.
static int DetermineLoopDirection(const std::vector<ComputationNodeBasePtr> & nestedNodes)
{
bool hasPastValueNode = false;
bool hasFutureValueNode = false;
vector<int> recurrenceDirections;
for (auto & node : nestedNodes)
{
if (node->OperationName() == OperationNameOf(PastValueNode))
hasPastValueNode = true;
else if (node->OperationName() == OperationNameOf(FutureValueNode))
hasFutureValueNode = true;
const auto & dirs = GetRecurrenceDirections(node);
if (dirs.empty()) // not a recurrent node
continue;
if (recurrenceDirections.empty())
recurrenceDirections = dirs;
else if (recurrenceDirections != dirs)
InvalidArgument("It is not allowed to have multiple different recurrence directions in the same loop (loop connected to %ls %ls operation).",
nestedNodes.front()->NodeName().c_str(), nestedNodes.front()->OperationName().c_str());
}
if (hasPastValueNode && !hasFutureValueNode)
return +1;
else if (hasFutureValueNode && !hasPastValueNode)
return -1;
else if (hasPastValueNode && hasFutureValueNode)
InvalidArgument("It is not allowed to have both PastValue and FutureValue nodes in the same loop. How do you think that should work??");
else
LogicError("There is neither PastValue nor FutureValue nodes in the loop.");
if (recurrenceDirections.empty())
LogicError("There is no recurrent node in the loop connected to %ls %ls operation.",
nestedNodes.front()->NodeName().c_str(), nestedNodes.front()->OperationName().c_str());
// BUGBUG: Multiple recurrence dimensions not yet supported beyond this point.
return -recurrenceDirections[0];
}
}}}
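A quick illustration of the sign convention used above: a node whose recurrence direction is -1 (a PastValue-style node, reading from earlier frames) forces the loop to run left to right, i.e. DetermineLoopDirection returns +1, and vice versa. The following minimal, self-contained C++ sketch (hypothetical helper name, not part of the CNTK sources) just restates that last line:

#include <cassert>
#include <vector>

// Mirrors the final statement of DetermineLoopDirection above: the loop direction is
// the negative of the (single, shared) recurrence direction of the nodes in the loop.
static int DetermineLoopDirectionSketch(const std::vector<int>& recurrenceDirections)
{
    return -recurrenceDirections[0];
}

int main()
{
    assert(DetermineLoopDirectionSketch({ -1 }) == +1); // PastValue-style node  -> forward (left-to-right) loop
    assert(DetermineLoopDirectionSketch({ +1 }) == -1); // FutureValue-style node -> backward (right-to-left) loop
    return 0;
}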

View file

@ -46,6 +46,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
else if (nodeType == OperationNameOf(CrossEntropyNode)) return New<CrossEntropyNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(CrossEntropyWithSoftmaxNode)) return New<CrossEntropyWithSoftmaxNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(SequenceWithSoftmaxNode)) return New<SequenceWithSoftmaxNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(DiagonalNode)) return New<DiagonalNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(DiagTimesNode)) return New<DiagTimesNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(DropoutNode)) return New<DropoutNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(DummyCriterionNode)) return New<DummyCriterionNode<ElemType>>(forward<_Types>(_Args)...);
@ -82,7 +83,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
else if (nodeType == OperationNameOf(RowElementTimesNode)) return New<RowElementTimesNode<ElemType>>(forward<_Types>(_Args)...);
#endif
else if (nodeType == OperationNameOf(RowRepeatNode)) return New<RowRepeatNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(DiagonalNode)) return New<DiagonalNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(RowSliceNode)) return New<RowSliceNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(RowStackNode)) return New<RowStackNode<ElemType>>(forward<_Types>(_Args)...);
#ifdef ENABLE_BROADCASTING_ELEMENTTIMES
@ -91,6 +91,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
else if (nodeType == OperationNameOf(ScaleNode)) return New<ScaleNode<ElemType>>(forward<_Types>(_Args)...);
#endif
else if (nodeType == OperationNameOf(SequenceDecoderNode)) return New<SequenceDecoderNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ShiftNode)) return New<ShiftNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(SigmoidNode)) return New<SigmoidNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(SoftmaxNode)) return New<SoftmaxNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(SquareErrorNode)) return New<SquareErrorNode<ElemType>>(forward<_Types>(_Args)...);

View file

@ -305,8 +305,9 @@ namespace Microsoft { namespace MSR { namespace ScriptableObjects {
static TensorShape TensorShapeFromConfig(const IConfigRecord & config)
{
const auto & valp = config[L"dims"];
// TODO: Add code that if input is already a tensor shape it is also OK.
if (valp.Is<ConfigArray>())
if (valp.Is<TensorShape>())
return valp.AsRef<TensorShape>(); // UNTESTED
else if (valp.Is<ConfigArray>())
return TensorShape(valp.AsRef<ConfigArray>().AsVector<size_t>([&](const wstring & msg){ valp.Fail(msg); }));
else
return TensorShape(std::vector<size_t>(1, (size_t)valp)); // single element
@ -315,6 +316,26 @@ namespace Microsoft { namespace MSR { namespace ScriptableObjects {
BoxedTensorShape(const IConfigRecordPtr configp) : BoxOf<TensorShape>(TensorShapeFromConfig(*configp)) { }
};
ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<BoxedTensorShape> registerTensoShape(L"TensorShape");
template<typename E>
class BoxedVector : public BoxOf<vector<E>>
{
// create a vector from config
static vector<E> VectorFromConfig(const IConfigRecord & config)
{
const auto & valp = config[L"items"];
if (valp.Is<vector<E>>())
return valp.AsRef<vector<E>>(); // UNTESTED
else if (valp.Is<ConfigArray>())
return valp.AsRef<ConfigArray>().AsVector<E>([&](const wstring & msg){ valp.Fail(msg); });
else
return std::vector<E>(1, (E)valp); // single element
}
public:
BoxedVector(const IConfigRecordPtr configp) : BoxOf<vector<E>>(VectorFromConfig(*configp)) { }
};
ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<BoxedTensorShape> registerTensorShape(L"TensorShape");
ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<BoxedVector<int>> registerIntVector(L"IntVector");
ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<BoxedVector<size_t>> registerSizeVector(L"SizeVector");
}}}
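For context, the IntVector and SizeVector runtime types registered here are what the BrainScript expressions new IntVector [ items = fromOffsets ] and new SizeVector [ items = offsetRanges ] in the Shift definition earlier resolve to. The coercion rule of VectorFromConfig (pass an existing vector or array through, wrap a single scalar) can be sketched in plain C++ as follows (illustrative only; std::variant stands in for the ScriptableObjects value type):

#include <variant>
#include <vector>

// Coerce a config value that is either a single scalar or an array into a vector,
// following the same rule as VectorFromConfig above.
template <typename E>
std::vector<E> CoerceToVector(const std::variant<E, std::vector<E>>& value)
{
    if (auto pVec = std::get_if<std::vector<E>>(&value))
        return *pVec;                             // already an array: pass it through
    return std::vector<E>(1, std::get<E>(value)); // single element: wrap into a 1-element vector
}

int main()
{
    auto a = CoerceToVector<int>(std::vector<int>{ -1, -1 }); // array input  -> size 2
    auto b = CoerceToVector<int>(-1);                         // scalar input -> size 1
    return (a.size() == 2 && b.size() == 1) ? 0 : 1;
}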

View file

@ -132,12 +132,13 @@ namespace Microsoft { namespace MSR { namespace CNTK {
virtual ~INodeState() {}
};
struct /*interface*/ IStateFulNode
struct /*interface*/ IStatefulNode
{
typedef std::shared_ptr<INodeState> NodeStatePtr;
virtual NodeStatePtr ExportState() = 0;
virtual void ImportState(const NodeStatePtr& pImportedState) = 0;
virtual void ImportState(NodeStatePtr && state) = 0;
};
typedef IStatefulNode::NodeStatePtr NodeStatePtr;
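The move-only ImportState signature is worth a quick illustration: ExportState() hands the state out of a node, and the caller moves it back in (possibly into another node, or at a later sub-minibatch), so exactly one owner holds it at any time. A minimal, self-contained sketch with toy stand-in types (not the real CNTK node classes):

#include <memory>
#include <utility>

struct INodeState { virtual ~INodeState() {} };

struct IStatefulNode
{
    typedef std::shared_ptr<INodeState> NodeStatePtr;
    virtual NodeStatePtr ExportState() = 0;
    virtual void ImportState(NodeStatePtr&& state) = 0;
};

// Toy node: exporting moves the state out (the node keeps nothing), importing moves it back in.
struct ToyStatefulNode : public IStatefulNode
{
    NodeStatePtr m_state = std::make_shared<INodeState>();
    NodeStatePtr ExportState() override { return std::move(m_state); }
    void ImportState(NodeStatePtr&& state) override { m_state = std::move(state); }
};

int main()
{
    ToyStatefulNode a, b;
    b.ImportState(a.ExportState()); // ownership of a's state moves into b; a.m_state is now empty
    return 0;
}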
// =======================================================================
// ComputationNetworkOwnedNodeState -- class to collect ComputationNode members that are really owned by ComputationNetwork
@ -444,7 +445,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
void LinkToMBLayout(MBLayoutPtr pMBLayout) { m_pMBLayout = pMBLayout; }
//MBLayoutPtr GetMBLayout() { return m_pMBLayout; }
const MBLayoutPtr & GetMBLayout() const { return m_pMBLayout; }
bool HasMBLayout() const { return !!m_pMBLayout; }
@ -1505,6 +1505,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
};
// =======================================================================
// IRecurrentNode -- helper wrapper class for ComputationNodes that can be recurrent
// =======================================================================
struct IRecurrentNode { virtual const std::vector<int> & GetRecurrenceDirections() const = 0; };
// =======================================================================
// helper macro to ease access to base members in presence of C++ two-phase name lookup

View file

@ -734,20 +734,27 @@ namespace Microsoft { namespace MSR { namespace CNTK {
if (isFinalValidationPass)
{
const auto m_imageLayoutKind = ImageLayoutKind::CHW; // BUGBUG: Finish this. Must be serialized.
auto dims = ImageDimensions(GetSampleLayout(), m_imageLayoutKind);
auto shape = GetSampleLayout();
if (m_factory == nullptr)
m_factory = ConvolutionEngineFactory<ElemType>::Create(m_deviceId, ConvolutionEngineFactory<ElemType>::EngineType::Auto, m_imageLayoutKind);
if (m_convEng == nullptr)
m_convEng = m_factory->CreateConvEngine(m_deviceId, 0);
if (m_spatial)
{
auto dims = ImageDimensions(shape, m_imageLayoutKind);
if (m_inT == nullptr)
m_inT = m_factory->CreateTensor(dims.m_width, dims.m_height, dims.m_numChannels, 1);
if (m_scaleBiasT == nullptr)
{
if (m_spatial)
m_scaleBiasT = m_factory->CreateTensor(1, 1, dims.m_numChannels, 1);
}
else
m_scaleBiasT = m_factory->CreateTensor(dims.m_width, dims.m_height, dims.m_numChannels, 1);
{
if (m_inT == nullptr)
m_inT = m_factory->CreateTensor(shape.GetNumElements(), 1, 1, 1);
if (m_scaleBiasT == nullptr)
m_scaleBiasT = m_factory->CreateTensor(shape.GetNumElements(), 1, 1, 1);
}
}
}

View file

@ -72,8 +72,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ValidateUnaryMap(isFinalValidationPass);
}
// We don't need our output values in backprop.
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
virtual bool OutputUsedInComputingInputNodesGradients() const override { return gradientFromOutput; }
virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return !gradientFromOutput; }
};
#define UnaryElementWiseWithOpCodeNodeBaseMembers UsingComputationNodeMembersBoilerplate;

View file

@ -25,6 +25,256 @@
namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
// ShiftNode (input, fromOffset, boundaryValue, dim=-1, offsetRange=1, multiOffsetDim=0) -- delay and rolling window
//
// This shifts the input by (-fromOffset) steps. In other words, output(t) will be input(t+fromOffset).
// E.g. for fromOffset=-1, this gives the past value.
// This node has quite some options that make it powerful for many use cases.
//
// This node can be used in a recurrent loop. This requires special handling by the ComputationNetwork,
// for both execution (sequential execution) and creation (avoiding circular references).
// TODO: When outside a recurrent loop and used with frame randomization, this will communicate to the reader
// that additional frames are needed, which will then return a frame range. TODO: This will not match
// the labels, which are still 1 frame. Think through which dimension this should go in.
//
// Values shifted in from beyond sequence boundaries will be copied from boundaryValue.
// Normally, this is a scalar Constant(). However, it can be any node, which will be indexed from the end
// (e.g. for fromOffset=-1, the last frame of boundaryValue will be used). This can implement
// sequence-to-sequence models. Broadcasting is supported, so it can be e.g. a single output-dimension vector
// applied to all sequences.
//
// To delay (past value), use negative fromOffset. To access future value, use positive fromOffset.
//
// To pull in multiple offsets, use offsetRange>1. This will pull in offsetRange consecutive offsets starting
// with fromOffset. This implements a rolling window. A new dimension will be inserted at multiOffsetDim
// (default 0 means after the last sample dimension). Special considerations:
// - If the boundaryValue is not wide enough, the sequence will be dropped (e.g. if you pull in 5 history frames,
// but the sequence in boundaryValue only has 4 samples).
// - If you feed back such an expanded output into this node in a loop, you get an inconsistency
// and will eventually fail. You must pull the dimensions apart.
// - If the current time step (offset 0) is included in the range (e.g. fromOffset=-1, offsetRange=3) then
// this node cannot participate in a recurrence.
//
// By default, this shifts over the time dimension, but you can choose to shift over any
// sample tensor dimension instead using 'dim' (-1 stands for time). This will only work, however,
// when all involved nodes are implemented using the tensor library. Nodes implemented using
// Matrix slices can only support iterating over time.
//
// The fromOffset can also be a tensor, e.g. (1,1). In that case, iteration will be over multiple
// consecutive dimensions. offsetRange must have the same number of dimensions.
//
// If the boundaryValue has 0 elements, the sequence will be trimmed (frames reaching beyond the boundary
// are dropped). This will initially not be implemented for the time dimension (as it would require
// change of MBLayout).
// -----------------------------------------------------------------------
template<class ElemType>
class ShiftNode : public ComputationNode<ElemType>, public IRecurrentNode, public ILateAttachingNode, public IStatefulNode, public NumInputs<2>
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName() { return L"Shift"; }
public:
ShiftNode(DEVICEID_TYPE deviceId, const wstring & name, const std::vector<int> & fromOffset, int shiftDimension, const std::vector<size_t> & offsetRange, int expandDimension) :
Base(deviceId, name), m_fromOffsetBegin(fromOffset),
m_shiftDimension(shiftDimension), m_expandDimension(expandDimension),
m_insertExpandShapeAt(SIZE_MAX/*uninitialized at this point*/)
{
// determine m_fromOffsetEnd from fromOffset/offsetRange
bool anyNonRecurrent = false;
for (size_t k = 0; k < m_fromOffsetBegin.size(); k++)
{
m_fromOffsetEnd.push_back(m_fromOffsetBegin[k] + (k < offsetRange.size() ? (int)offsetRange[k] : 1));
if (m_fromOffsetEnd[k] <= 0)
m_recurrenceDirections.push_back(-1);
else if (m_fromOffsetBegin[k] > 0)
m_recurrenceDirections.push_back(+1);
else
m_recurrenceDirections.push_back(0);
anyNonRecurrent |= m_recurrenceDirections[k] == 0;
}
if (anyNonRecurrent)
m_recurrenceDirections.clear();
CreateMatrixIfNull(m_value);
SetDims(TensorShape(), 0); // empty for now
}
ShiftNode(DEVICEID_TYPE deviceId, const wstring & name) :
ShiftNode(deviceId, name, std::vector<int> { 1 }, -1, std::vector<size_t> { 1 }, 0)
{ }
ShiftNode(const ScriptableObjects::IConfigRecordPtr configp) :
ShiftNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"fromOffset"), configp->Get(L"dim"), configp->Get(L"offsetRange"), configp->Get(L"multiOffsetDim"))
{
// We do NOT attach the inputs, as we cannot resolve the main input without causing a circular reference.
// Instead, we capture them in a lambda, which will be called by ComputationNetwork during the build process through LateAttachInputs() below.
// This is a contract between ComputationNetwork and this specific node type.
// (TODO: We could force-evaluate the boundary input here.)
m_attachInputsFn = [this, configp]() // This is the lambda to complete the process. Note that configp is captured as a shared_ptr.
{
AttachInputs(GetInputsFromConfig(configp)); // this is executed by network builder while iterating the nodes
};
}
virtual void /*ILateAttachingNode::*/LateAttachInputs() override final
{
m_attachInputsFn();
m_attachInputsFn = [](){ LogicError("LateAttachingNode::AttachInputs: must only be called once"); };
}
public:
void Save(File& fstream) const
{
Base::Save(fstream);
fstream << m_fromOffsetBegin;
fstream << m_fromOffsetEnd;
fstream << m_shiftDimension;
fstream << m_expandDimension;
fstream << m_recurrenceDirections;
}
virtual void Load(File& fstream, size_t modelVersion) override
{
Base::Load(fstream, modelVersion);
fstream >> m_fromOffsetBegin;
fstream >> m_fromOffsetEnd;
fstream >> m_shiftDimension;
fstream >> m_expandDimension;
fstream >> m_recurrenceDirections;
}
virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
{
assert(inputIndex == 0); inputIndex;
fr;
}
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override {return false; }
virtual void EndForwardProp() override // called after last iteration step of ForwardProp()
{
Base::EndForwardProp();
// In BPTT, we carry over left-to-right state across minibatches.
// TODO: package up the state using ExportState(). Then in BeginForwardProp() bring it back. In-between, the packages can be moved around.
}
// This function assumes BeginForwardProp/EndForwardProp() to be called before/after the iteration loop.
// TODO: In the future, there may be value for one more way of handling the boundary condition: Fill as 'NoInput'. Then we can use this to implement rolling windows (albeit inefficiently). Would require to unshare the layout.
virtual void ForwardProp(const FrameRange & fr) override
{
fr;
}
virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
{
assert(m_inputs.size() == 2);
ComputationNodeBase::Validate(isFinalValidationPass);
if (isFinalValidationPass)
sin(1.0f);
// MBLayout is just inherited
m_pMBLayout = Input(0)->GetMBLayout();
if (isFinalValidationPass && !m_pMBLayout)
InvalidArgument("%ls %ls operation must operate on data (must have an MB Layout).", NodeName().c_str(), OperationName().c_str());
// determine expandShape--empty if no multiple offsets; otherwise the 1 or more dimensions that need to be added at m_expandDimension
m_expandShape.clear();
for (size_t k = 0; k < m_fromOffsetBegin.size(); k++)
{
size_t dim = m_fromOffsetEnd[k] - m_fromOffsetBegin[k];
if (dim > 1)
{
m_expandShape.resize(k, 1);
m_expandShape.push_back(dim);
}
}
if (!m_expandShape.empty())
m_expandShape.resize(m_fromOffsetBegin.size(), 1); // pad ones to end
// now it either matches the dimensions to insert, or is empty if none to append
// determine final sample layout
auto inputSampleLayout = Input(0)->GetSampleLayout();
auto inputDims = inputSampleLayout.GetDims();
if (m_expandDimension < 0)
InvalidArgument("%ls %ls operation: Specified insertion location %d refers to a time dimension, but this is not allowed.",
NodeName().c_str(), OperationName().c_str(), m_expandDimension);
m_insertExpandShapeAt = m_expandShape.empty() ? 0 : (m_expandDimension > 0 ? m_expandDimension - 1 : inputDims.size());
if (m_insertExpandShapeAt > inputDims.size())
if (isFinalValidationPass)
InvalidArgument("%ls %ls operation: Specified insertion location %d beyond end of input sample layout [%s].",
NodeName().c_str(), OperationName().c_str(), m_expandDimension, string(inputSampleLayout).c_str());
else
m_insertExpandShapeAt = inputDims.size(); // this may be an error, but we want to catch that only in the final pass
SmallVector<size_t> dims;
if (!m_expandShape.empty() && inputDims.size() + m_expandShape.size() > dims.capacity())
InvalidArgument("%ls %ls operation: Too many dimensions. Did you feed back output of this node without stripping the extra dimensions?",
NodeName().c_str(), OperationName().c_str());
dims.append(inputDims.begin(), inputDims.begin() + m_insertExpandShapeAt);
dims.append(m_expandShape.begin(), m_expandShape.end());
dims.append(inputDims.begin() + m_insertExpandShapeAt, inputDims.end());
auto sampleLayout = TensorShape(dims);
SetDims(sampleLayout, 0);
}
// special interface for use by loop detection
virtual const std::vector<int> & /*IRecurrentNode::*/GetRecurrenceDirections() const override
{
return m_recurrenceDirections;
}
virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
{
Base::CopyTo(nodeP, newName, flags);
if (flags & CopyNodeFlags::copyNodeValue)
{
auto node = dynamic_pointer_cast<ShiftNode<ElemType>>(nodeP);
node->m_fromOffsetBegin = m_fromOffsetBegin;
node->m_fromOffsetEnd = m_fromOffsetEnd;
node->m_recurrenceDirections = m_recurrenceDirections;
node->m_shiftDimension = m_shiftDimension;
node->m_expandDimension = m_expandDimension;
node->m_expandShape = m_expandShape;
node->m_insertExpandShapeAt = m_insertExpandShapeAt;
node->m_state = m_state;
}
}
class ShiftNodeState : public INodeState
{
Matrix<ElemType> m_delayedActivation; // saves the activation of the previous step that this node points to
};
typedef std::shared_ptr<ShiftNodeState> ShiftNodeStatePtr;
// state export/import
// This is done with a shared_ptr. The moment state is exported, the internal state is cleared; ownership is transferred to the exporting entity.
// This way, the next invocation does not overwrite the exported state, but is required to create a new one if needed.
// On the other hand, once imported, the state object is owned by the node and will be overwritten with the next state.
virtual NodeStatePtr ExportState() { return std::move(m_state); }
virtual void ImportState(NodeStatePtr && state) override
{
m_state = dynamic_pointer_cast<ShiftNodeState>(state);
if (state && !m_state)
LogicError("ImportState: Wrong state object passed (wrong type).");
}
protected:
// parameters remembered from construction
std::vector<int> m_fromOffsetBegin; // offset to pull from; first offset in case of offset range
std::vector<int> m_fromOffsetEnd; // end of offset range
int m_shiftDimension; // dimension to shift (default: time)
int m_expandDimension; // in case of offset range, this is where a new dimension will be inserted
// derived params set up in Validate()
SmallVector<size_t> m_expandShape; // offsetEnd-offsetBegin if >1 offset in any dimension; empty otherwise
size_t m_insertExpandShapeAt; // at which dimension to insert (internal 0-based index)
std::vector<int> m_recurrenceDirections; // for GetRecurrenceDirections()
ShiftNodeStatePtr m_state; // saves the activation of the previous step that this node points to
function<void()> m_attachInputsFn; // for late expansion of inputs (scripting)
};
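The constructor logic above, which derives m_fromOffsetEnd and m_recurrenceDirections from fromOffset and offsetRange, implements the rule stated in the comment block: a range lying entirely in the past gives direction -1, a range entirely in the future gives +1, and any range touching offset 0 makes the node non-recurrent. A self-contained sketch of just that derivation (illustrative names, not part of the sources):

#include <cstdio>
#include <vector>

// Per-dimension recurrence directions as derived in the ShiftNode constructor above;
// offsetRange defaults to 1 for dimensions it does not cover.
std::vector<int> RecurrenceDirectionsSketch(const std::vector<int>& fromOffset,
                                            const std::vector<size_t>& offsetRange)
{
    std::vector<int> dirs;
    bool anyNonRecurrent = false;
    for (size_t k = 0; k < fromOffset.size(); k++)
    {
        int end = fromOffset[k] + (k < offsetRange.size() ? (int)offsetRange[k] : 1);
        if (end <= 0)               dirs.push_back(-1); // range lies entirely in the past
        else if (fromOffset[k] > 0) dirs.push_back(+1); // range lies entirely in the future
        else                        dirs.push_back(0);  // range includes the current frame
        anyNonRecurrent |= dirs[k] == 0;
    }
    if (anyNonRecurrent)
        dirs.clear(); // such a node cannot participate in a recurrent loop
    return dirs;
}

int main()
{
    printf("%zu\n", RecurrenceDirectionsSketch({ -1 }, { 1 }).size()); // 1: plain past value, recurrent
    printf("%zu\n", RecurrenceDirectionsSketch({ -1 }, { 3 }).size()); // 0: window spans offset 0, non-recurrent
    return 0;
}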
// -----------------------------------------------------------------------
// DelayedValueNodeState -- helper class for exporting/importing state from/to DelayedValueNodes.
// This is used for sub-minibatching in case of truncated BPTT.
@ -76,12 +326,21 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
// DelayedValueNodeBase (input) -- abstract base class for PastValueNode and FutureValueNode to hold all shared code
// The two differ in the step direction, some loop directions, and sequence-boundary flags.
// This is an old node which will be replaced by ShiftNode (with Past/FutureValueNode being emulated).
//
// This is planned:
// - carrying over state at sentence boundaries from other nodes (for s2s)
// - ranges of neighbor frames as a secondary tensor dimension (i.e. can be used to implement a rolling window)
// - full support/efficiency of non-recurrent use (in which case the range can be from negative to positive, e.g. a symmetric rolling window)
// - denoting which tensor dimension to loop over (this may not be completed, but I will plant a seed)
// - support for Yongqiang's sub-minibatching with BPTT (export/import state)
// - more efficient storage of carried-over state (only store the needed frames, not a full copy of the previous MB as currently; which will on the other hand also allow windows that reach back beyond a minibatch)
// -----------------------------------------------------------------------
// TODO: 'direction' is really too general. signOfTimeOffset?
template<class ElemType, int direction/*-1 for Past/left-to-right or +1 for Future/right-to-left*/ /*, MinibatchPackingFlags SequenceStart_or_End/*-Start or -End*/>
class DelayedValueNodeBase : public ComputationNode<ElemType>, public
ILateAttachingNode, public IStateFulNode, public NumInputs<1>
class DelayedValueNodeBase : public ComputationNode<ElemType>, public IRecurrentNode,
public ILateAttachingNode, public IStatefulNode, public NumInputs<1>
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
typedef std::shared_ptr<DelayedValueNodeState<ElemType>> DelayedNodeStatePtr;
@ -91,9 +350,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
m_initialActivationValue = initialActivationValue;
m_timeStep = 1;
m_recurrenceDirections.push_back(direction);
CreateMatrixIfNull(m_value);
SetDims(sampleLayout, 0); // TODO: needed? Can we not infer it? How about setting a sample layout?
m_isHistoryCarryOverManagedExternally = false; // used for PairNetworkNode/PastValueNode combination, which is deprecated
SetDims(sampleLayout, 0);
m_value->SetValue(m_initialActivationValue); // is this needed?
}
protected:
@ -139,7 +398,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
virtual void Load(File& fstream, size_t modelVersion) override
{
// the node has already been initialized e.g. w.r.t. direction and sequence flags
// the node has already been initialized e.g. w.r.t. direction
Base::Load(fstream, modelVersion);
fstream >> m_timeStep;
@ -155,63 +414,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
fstream >> m_initialActivationValue;
}
#if 0
private:
// cache a post-processed version of m_pMBLayout (depends on the actual minibatch)
// This post-processed layout has its bits spread out over m_timeStep, to help detect if we'd hop across a boundary.
void CacheMBLayout()
{
if (m_timeStep <= 0)
LogicError("timeStep should be 1 or larger");
m_pShiftedMBLayout->CopyFrom(m_pMBLayout); // it gets modified below
if (m_timeStep == 1)
return;
#if 1
LogicError("CacheMBLayout: m_timeStep > 1 temporarily disabled until MBLayout update completed.");
#else
// modify m_pShiftedMBLayout
// If two utterances are packed together (S: start, E: end, N: no input) and we need to get values 2 steps in the past
// S X X X E S X X X X E N N
// then this becomes
// S S X X E S S X X X E N N
size_t numSeq = GetNumParallelSequences();
// each row has a number to indicate how many values should be reset for that utterance
// TODO: This algorithm is not obvious and should be explained. E.g. how come it is direction independent?
vector<int> numResetLeft(numSeq, 0);
for (size_t i = 0; i < GetNumTimeSteps(); i++) // i = frame index (time)
{
if (m_pMBLayout->Is(i, SequenceStart_or_End | MinibatchPackingFlags::NoFeature))
{
// we set timeStep-1 elements following it to be SequenceStart until met NoInput
for (size_t j = 0; j < numSeq; j++) // j = stream
{
// we use & since ((int) MinibatchPackingFlags::SequenceStart) may come with NoLabel
if (m_pMBLayout->Is(j, i, SequenceStart_or_End))
numResetLeft[j] = m_timeStep;
else if (m_pMBLayout->Is(j, i, MinibatchPackingFlags::NoFeature))
numResetLeft[j] = 0;
}
}
// now set the sequence-boundary flag
for (size_t j = 0; j < numSeq; j++)
{
if (numResetLeft[j]-- > 0)
{
m_pShiftedMBLayout->Mask(j, i, MinibatchPackingFlags::NoLabel); // keep only this flag
m_pShiftedMBLayout->Set(j, i, SequenceStart_or_End); // now implant the boundary flag
}
}
}
#endif
}
public:
#endif
virtual void /*ComputationNode::*/BackpropTo(const size_t inputIndex, const FrameRange & fr) override
{
assert(inputIndex == 0); inputIndex;
@ -283,12 +485,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
return false;
}
//virtual void BeginForwardProp() override // called before first iteration step of ForwardProp()
//{
// Base::BeginForwardProp();
// CacheMBLayout();
//}
virtual void EndForwardProp() override // called after last iteration step of ForwardProp()
{
// In BPTT, we carry over left-to-right state across minibatches.
@ -299,12 +495,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// - we don't need to keep anything if all sequences are closed (sentence end)
// This condition includes full-sequence mode.
// TODO: Can we optimize this and only copy if there is a sequence spanning across the end of the MB? And add a check to BeginForwardProp() to make sure we got one if there is a boundary at the start?
if (!m_isHistoryCarryOverManagedExternally) // means it's externally managed (for PairNetworkNode)
{
m_delayedActivation = Input(0)->Value();
if (!m_delayedActivationMBLayout) m_delayedActivationMBLayout = make_shared<MBLayout>();
m_delayedActivationMBLayout->CopyFrom(m_pMBLayout);
}
Base::EndForwardProp();
}
@ -350,6 +543,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
for (size_t id = 0; id < GetNumParallelSequences(); id++)
{
if (m_pMBLayout->IsGap(fr.Sequence(id))) // if output is in a gap then don't bother filling it
continue;
Matrix<ElemType> out = ValueFor(fr.Sequence(id));
//assert(m_pShiftedMBLayout->Is(id, t, SequenceStart_or_End) == m_pMBLayout->IsBeyondStartOrEnd(frDelayed.Sequence(id)));
@ -391,34 +587,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
ValidateUnaryMap(isFinalValidationPass);
}
// this function is only used for PairNetworkNode (on PastValueNode)
// BUGBUG: Need to transfer the layout as well. PairNetworkNode will go away.
bool GetHistory(Matrix<ElemType>& hist, bool)
// special interface for use by loop detection
virtual const std::vector<int> & /*IRecurrentNode::*/GetRecurrenceDirections() const override
{
DEVICEID_TYPE device = hist.GetDeviceId();
hist.TransferFromDeviceToDevice(device, m_deviceId, true);
hist.SetValue(Input(0)->Value());
hist.TransferFromDeviceToDevice(m_deviceId, device, true);
return true;
}
// this function is only used for PairNetworkNode (on PastValueNode)
void SetHistory(const Matrix<ElemType>& hist)
{
DEVICEID_TYPE device = hist.GetDeviceId();
hist.TransferFromDeviceToDevice(device, m_deviceId, true);
m_delayedActivation.SetValue(hist);
m_isHistoryCarryOverManagedExternally = true;
hist.TransferFromDeviceToDevice(m_deviceId, device, true);
// need a layout as well
// ForwardProp() expects it to have the same number of parallel sequences.
if (!m_delayedActivationMBLayout) m_delayedActivationMBLayout = make_shared<MBLayout>();
m_delayedActivationMBLayout->Init(GetNumParallelSequences(), hist.GetNumCols() / GetNumParallelSequences());
return m_recurrenceDirections;
}
virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
@ -434,15 +606,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
(node->m_delayedActivationMBLayout = make_shared<MBLayout>())->CopyFrom(m_delayedActivationMBLayout);
else
node->m_delayedActivationMBLayout = nullptr;
node->m_isHistoryCarryOverManagedExternally = false;
}
}
//========================================
// implement the IStateFulNode interface
//========================================
virtual NodeStatePtr ExportState()
virtual NodeStatePtr /*IStatefulNode::*/ExportState() override
{
NodeStatePtr pExportedState;
size_t nT = m_pMBLayout->GetNumTimeSteps();
@ -530,7 +697,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
return pExportedState;
}
virtual void ImportState(const NodeStatePtr& pImportedState) override
virtual void /*IStatefulNode::*/ImportState(NodeStatePtr && pImportedState) override
{
DelayedNodeStatePtr pState = dynamic_pointer_cast<DelayedValueNodeState<ElemType>> (pImportedState);
@ -561,7 +729,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{// it is really a compile error ?
RuntimeError("Unrecognized direction in DelayedValueNodeBase");
}
}
protected:
@ -569,14 +736,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Matrix<ElemType> m_delayedActivation; // saves the activation of the previous step that this node points to
MBLayoutPtr m_delayedActivationMBLayout; // layout for m_delayedActivation
int m_timeStep; // delay in frames (typ. 1)
//MBLayoutPtr m_pShiftedMBLayout; // individual sentence boundary information --TODO: do we actually need this separate variable?
bool m_isHistoryCarryOverManagedExternally; // for PastValueNode only
function<void()> m_attachInputsFn; // for late expansion of inputs (scripting)
std::vector<int> m_recurrenceDirections; // for GetRecurrenceDirections()
};
#define UsingDelayedValueNodeMembers UsingComputationNodeMembersBoilerplate; \
using Base::m_initialActivationValue; using Base::m_delayedActivation; using Base::m_timeStep; \
/*using Base::m_pShiftedMBLayout;*/ using Base::m_isHistoryCarryOverManagedExternally;
using Base::m_initialActivationValue; using Base::m_delayedActivation; using Base::m_timeStep;
// -----------------------------------------------------------------------
// PastValueNode (input) -- delay node
@ -606,7 +771,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template class PastValueNode<float>;
template class PastValueNode<double>;
// -----------------------------------------------------------------------
// FutureValueNode (input) -- delay node in future direction
// -----------------------------------------------------------------------

View file

@ -5621,7 +5621,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#pragma omp parallel for
for (int k = 0; k < (int)K; k++)
TensorOpIteration<ElemType, OPFN, 3, true/*vectorizable*/, -1/*no reduction*/, -1/*scalar*/>::Loop(0, array<ElemType*, 3> { pa + k, pb + k, pc + k }, 1, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
// TODO: somehow this does not use 4-way parallelism with SSE (VS 2013), and the signedness of k (required for omp) causes an extra sign-extend
// TODO: According to Amit, the VS compiler is not able to vectorize into lambdas. Solution: change the lambda to take an N, or to implement the loop inside (with 1 element by default).
// TODO: The signedness of k (required for omp) causes an extra sign-extend.
// TODO: OMP adds LOTS of overhead. Do we need a guard, a min size when to use it?
}
};
@ -5737,6 +5738,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
const SmallVector<size_t> & regularOpDims, const array<SmallVector<ptrdiff_t>, 2> & regularStrides,
const SmallVector<size_t> & reducingOpDims, const array<SmallVector<ptrdiff_t>, 2> & reducingStrides)
{
// TODO: Change the lambda to take a pointer and a number of elements, so that we can pass it 1 or 4 elements, in order for it to SSE-vectorize.
#define CaseUnaryTensorOp(oper) \
case ElementWiseOperator::op ## oper: \
return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 2> & pp) { return Op ## oper((*(pp[0]))); }, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)

View file

@ -46,7 +46,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
size_t batchSize = inT.n();
size_t maxTempMemSizeInSamples = (m_maxTempMemSizeInSamples == 0 ? batchSize : m_maxTempMemSizeInSamples);
assert(filter.GetNumCols() == packedInputRows && filter.GetNumRows() == outT.c());
assert(filter.GetNumCols() == packedInputRows && filter.GetNumRows() == outT.c()); UNUSED(packedInputRows);
// GPU and 1-dimensional image
bool gpuSparse1D = (inT.h() == 1 &&
@ -100,7 +100,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
Mat outputSubBatch = out.ColumnSlice(outputSizePerChannel * startSampleId, outputSizePerChannel * smallBatchSize);
workspace.Resize(packedInputRows, packedInputColsPerSample * smallBatchSize);
//workspace.Resize(packedInputRows, packedInputColsPerSample * smallBatchSize);
// BUGBUG: This ^^ destroys the content of the matrix. Also it seems not to change the size. Does it? Should this be a Reshape()?
Mat::Multiply(filter, false, workspace, false, outputSubBatch);
}
}
@ -454,8 +455,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
else if (engType == EngineType::Legacy)
{
// REVIEW alexeyk: temp hack to allow this to work in MEL scenarios. InvalidArgument should be used instead.
if (imageLayoutKind != ImageLayoutKind::HWC)
InvalidArgument("ConvolutionEngineFactory: ImageLayout '%s' is not compatible with the legacy convolution engine.", ToString(imageLayoutKind).c_str());
fprintf(stderr, "WARNING: trying to use cuDNN on an unsupported platform. It is safe to ignore this warning if it is produced during a model editing command.\n");
//InvalidArgument("ConvolutionEngineFactory: ImageLayout '%s' is not compatible with the legacy convolution engine.", ToString(imageLayoutKind).c_str());
return std::make_unique<DefaultConvolutionEngineFactory<ElemType>>();
}

View file

@ -378,7 +378,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
{
if (tid < i && tid + i < tids) accumulators[tid] += accumulators[tid + i];
if (0 + i < tids) __syncthreads(); // sync if condition true for at least one thread
// TODO: use volatile* and then we can skip the __syncthreads() for the last 32 values
// TODO: use volatile* and then we can skip the __syncthreads() for the last 32 values. See Amit's allreduce() function implementation in MatrixQuantizer_kernel.cu.
}
// now set final value to output coordinate

View file

@ -230,12 +230,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
std::map<wstring, vector<shared_ptr<INodeState>>> m_NetStates; // m_NetStatefulNodes[node][i] caches the state of i-th subminibatch of node
bool m_hasLattices;
Matrices m_CachedGraident;
Matrices m_cachedGradient;
// we also need to remember where to put into the net
MBLayoutPtr m_NetMBLayoutPtr;
std::map<wstring, shared_ptr<ComputationNode<ElemType>>> m_LearnableNodePtr;
// followings are lattice-related
Matrices m_NetInputMatrixPtr;
Matrices m_NetInputMatrixPtr; // TODO: camelCase for all m_Net...
LatticePtr m_NetLatticePtr;
UidPtr m_NetUidPtr;
ExtrauttMapPtr m_NetExtrauttMapPtr;
@ -248,18 +248,18 @@ namespace Microsoft { namespace MSR { namespace CNTK {
std::vector<shared_ptr<ComputationNode<ElemType>>> m_NetCriterionNodes;
std::vector<shared_ptr<ComputationNode<ElemType>>> m_NetEvaluationNodes;
std::map<wstring, shared_ptr<IStateFulNode>> m_NetStatefulNodes; // we need to Export/Import states of stateful nodes when we switch subminibatches
std::map<wstring, shared_ptr<IStatefulNode>> m_NetStatefulNodes; // we need to Export/Import states of stateful nodes when we switch subminibatches
private:
void EnumerateStatefulNodeWithRoot(ComputationNetwork& net, ComputationNodeBasePtr root, std::map<wstring, shared_ptr<IStateFulNode>>& statefulnode)
void EnumerateStatefulNodeWithRoot(ComputationNetwork& net, ComputationNodeBasePtr root, std::map<wstring, shared_ptr<IStatefulNode>>& statefulnode)
{
const std::list<ComputationNodeBasePtr> evalorder = net.GetEvalOrder(root);
for (auto& x : evalorder)
{
wstring name = x->GetName();
if (statefulnode.find(name) != statefulnode.end()) continue; // already in the list
shared_ptr<IStateFulNode> pNode = dynamic_pointer_cast<IStateFulNode>(x);
shared_ptr<IStatefulNode> pNode = dynamic_pointer_cast<IStatefulNode>(x);
if (pNode)
{
statefulnode[name] = pNode;
@ -267,20 +267,20 @@ namespace Microsoft { namespace MSR { namespace CNTK {
}
}
std::map<wstring, shared_ptr<IStateFulNode>> EnumerateStatefulNode(ComputationNetwork& net,
std::map<wstring, shared_ptr<IStatefulNode>> EnumerateStatefulNode(ComputationNetwork& net,
const std::vector<ComputationNodeBasePtr>& criterionNode,
const std::vector<ComputationNodeBasePtr>& evaluationNode)
{
std::map<wstring, shared_ptr<IStateFulNode>> statefulnodes;
std::map<wstring, shared_ptr<IStatefulNode>> statefulNodes;
for (auto& root : criterionNode)
{
EnumerateStatefulNodeWithRoot(net, root, statefulnodes);
EnumerateStatefulNodeWithRoot(net, root, statefulNodes);
}
for (auto& root : evaluationNode)
{
EnumerateStatefulNodeWithRoot(net, root, statefulnodes);
EnumerateStatefulNodeWithRoot(net, root, statefulNodes);
}
return statefulnodes;
return statefulNodes;
}
public:
@ -353,7 +353,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
delete x.second;
}
for (auto x : m_CachedGraident)
for (auto x : m_cachedGradient)
{
delete x.second;
}
@ -418,11 +418,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
auto funvalue = pLearnableNode->Value(); // gradient may not be allocated when this function is first called
size_t nrow = funvalue.GetNumRows();
size_t ncol = funvalue.GetNumCols();
if (m_CachedGraident.find(nodeName) == m_CachedGraident.end())
if (m_cachedGradient.find(nodeName) == m_cachedGradient.end())
{
// not allocated yet
m_CachedGraident[nodeName] = new Matrix<ElemType>(nrow, ncol, funvalue.GetDeviceId());
m_CachedGraident[nodeName]->SetValue((ElemType)0);
m_cachedGradient[nodeName] = new Matrix<ElemType>(nrow, ncol, funvalue.GetDeviceId());
m_cachedGradient[nodeName]->SetValue((ElemType)0);
}
}
}
@ -511,9 +511,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
for (auto& x : m_NetStatefulNodes)
{
wstring name = x.first;
shared_ptr<IStateFulNode> pNode = x.second;
shared_ptr<IStatefulNode> pNode = x.second;
if (m_NetStates[name][iSubminibatch])
pNode->ImportState(m_NetStates[name][iSubminibatch]);
pNode->ImportState(std::move(m_NetStates[name][iSubminibatch]));
}
}
@ -521,7 +521,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void DoneWithCurrentSubMinibatch(size_t iSubminibatch)
{
// accumulate gradient here
for (auto x : m_CachedGraident)
for (auto x : m_cachedGradient)
{
wstring nodename = x.first;
if (m_LearnableNodePtr.find(nodename) == m_LearnableNodePtr.end())
@ -529,7 +529,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
RuntimeError("ERROR: in DoneWithCurrentSubMinibatch: node %ls not found in LearnableNode", nodename.c_str());
}
shared_ptr<ComputationNode<ElemType>> pNode = m_LearnableNodePtr[nodename];
m_CachedGraident[nodename]->operator+=(pNode->Gradient());
m_cachedGradient[nodename]->operator+=(pNode->Gradient());
pNode->Gradient().SetValue((ElemType)0);
}
// accumulate criterion value
@ -554,7 +554,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
void DoneWithCurrentMinibatch()
{
for (auto& x : m_CachedGraident)
for (auto& x : m_cachedGradient)
{
wstring name = x.first;
Matrix<ElemType>* accumulategrad = x.second;
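The accumulation scheme used here spans the two calls: DoneWithCurrentSubMinibatch adds each learnable node's gradient into m_cachedGradient and zeroes the node's own gradient, and DoneWithCurrentMinibatch then works with the accumulated total (the hunk is cut off above), so the full-minibatch gradient is the sum over sub-minibatches. A toy, stand-alone sketch of that flow (plain vectors standing in for Matrix<ElemType>):

#include <vector>

int main()
{
    std::vector<float> nodeGradient(4, 0.0f);   // stands in for pNode->Gradient()
    std::vector<float> cachedGradient(4, 0.0f); // stands in for m_cachedGradient[nodeName]

    for (int subMb = 0; subMb < 3; subMb++)     // three sub-minibatches of one minibatch
    {
        for (auto& g : nodeGradient)
            g = 1.0f;                           // pretend backprop produced some gradient
        for (size_t i = 0; i < nodeGradient.size(); i++)
        {
            cachedGradient[i] += nodeGradient[i]; // DoneWithCurrentSubMinibatch: accumulate...
            nodeGradient[i] = 0.0f;               // ...and reset the per-sub-minibatch gradient
        }
    }
    // cachedGradient now holds the sum over all sub-minibatches (3.0 in every slot here),
    // which is what DoneWithCurrentMinibatch hands back to the learnable node.
    return 0;
}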

View file

@ -66,8 +66,12 @@ speechTrain = [
C(c) = DiagTimes(WeightParam(cellDim, 1), Stabilize(c)) // cell-to-hidden
// LSTM cell
dh = PastValue(outputDim, output); // hidden state(t-1)
dc = PastValue(cellDim, ct); // cell(t-1)
# TODO: This is temporary test code for the new ShiftNode (until we switch PastValue() itself over)
PastValueShift(dimDummy, input) = Shift(input, /*fromOffsets=*/-1, /*boundaryValue=*/Constant(0.1), dim=-1, offsetRanges=1, multiOffsetDim=2)
PastValue1 = PastValue
#PastValue1 = PastValueShift
dh = PastValue1(outputDim, output); // hidden state(t-1)
dc = PastValue1(cellDim, ct); // cell(t-1)
// note: the W(inputx) here are all different, they all come with their own set of weights; same for H(dh), C(dc), and B()
it = Sigmoid(W(inputx) + B() + H(dh) + C(dc)) // input gate(t)