Merge remote-tracking branch 'origin/master' into mahilleb/CuDnn5Test
Conflicts:
	Source/ComputationNetworkLib/ComputationNode.h
	Source/ComputationNetworkLib/TrainingNodes.h
	Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv/baseline.linux.txt
	Tests/EndToEndTests/Examples/Image/Miscellaneous/CIFAR-10/02_BatchNormConv/baseline.windows.txt
	Tests/UnitTests/MathTests/ConvolutionEngineTests.cpp
Commit 0285fa9a13
@@ -31,6 +31,9 @@
<HasOpenCv>false</HasOpenCv>
<HasOpenCv Condition="Exists('$(OPENCV_PATH)') Or Exists('$(OPENCV_PATH_V31)')">true</HasOpenCv>

<HasBoost>false</HasBoost>
<HasBoost Condition="Exists('$(BOOST_INCLUDE_PATH)') And Exists('$(BOOST_LIB_PATH)')">true</HasBoost>

<UseZip>false</UseZip>
<UseZip Condition="Exists('$(ZLIB_PATH)')">true</UseZip>
CNTK.sln | 76
|
@ -1156,6 +1156,72 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "BrainScriptTests", "Tests\U
|
|||
{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
|
||||
EndProjectSection
|
||||
EndProject
|
||||
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Tutorials", "Tutorials", "{8BE0642A-A3AA-4A64-95D0-C78FB285B2A4}"
|
||||
EndProject
|
||||
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "ImageHandsOn", "ImageHandsOn", "{2230BF3D-4317-4A3F-A743-DDD6160503F8}"
|
||||
ProjectSection(SolutionItems) = preProject
|
||||
Tutorials\ImageHandsOn\ImageHandsOn.cntk = Tutorials\ImageHandsOn\ImageHandsOn.cntk
|
||||
EndProjectSection
|
||||
EndProject
|
||||
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "SLUHandsOn", "SLUHandsOn", "{CC143D08-567D-4DAC-9E14-264749C19039}"
|
||||
ProjectSection(SolutionItems) = preProject
|
||||
Tutorials\SLUHandsOn\SLUHandsOn.cntk = Tutorials\SLUHandsOn\SLUHandsOn.cntk
|
||||
EndProjectSection
|
||||
EndProject
|
||||
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Pretrained Models", "Pretrained Models", "{0ED2EE97-0A26-4865-871F-11033867BA34}"
|
||||
ProjectSection(SolutionItems) = preProject
|
||||
Tutorials\ImageHandsOn\cifar10.pretrained.cmf = Tutorials\ImageHandsOn\cifar10.pretrained.cmf
|
||||
Tutorials\ImageHandsOn\cifar10.ResNet.cmf = Tutorials\ImageHandsOn\cifar10.ResNet.cmf
|
||||
EndProjectSection
|
||||
EndProject
|
||||
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solutions", "Solutions", "{A2A4893C-0D5B-42E2-BFAD-C123AE7FDAFD}"
|
||||
ProjectSection(SolutionItems) = preProject
|
||||
Tutorials\ImageHandsOn\ImageHandsOn_Solution1.cntk = Tutorials\ImageHandsOn\ImageHandsOn_Solution1.cntk
|
||||
Tutorials\ImageHandsOn\ImageHandsOn_Solution2.cntk = Tutorials\ImageHandsOn\ImageHandsOn_Solution2.cntk
|
||||
Tutorials\ImageHandsOn\ImageHandsOn_Solution3.cntk = Tutorials\ImageHandsOn\ImageHandsOn_Solution3.cntk
|
||||
Tutorials\ImageHandsOn\ImageHandsOn_Solution4.cntk = Tutorials\ImageHandsOn\ImageHandsOn_Solution4.cntk
|
||||
Tutorials\ImageHandsOn\ImageHandsOn_Solution5.cntk = Tutorials\ImageHandsOn\ImageHandsOn_Solution5.cntk
|
||||
Tutorials\ImageHandsOn\ImageHandsOn_Task4_Start.cntk = Tutorials\ImageHandsOn\ImageHandsOn_Task4_Start.cntk
|
||||
Tutorials\ImageHandsOn\ImageHandsOn_Task6.cntk = Tutorials\ImageHandsOn\ImageHandsOn_Task6.cntk
|
||||
EndProjectSection
|
||||
EndProject
|
||||
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Data", "Data", "{8CFBD0DB-5F16-48E6-984C-4401317FA10E}"
|
||||
ProjectSection(SolutionItems) = preProject
|
||||
Tutorials\SLUHandsOn\atis.test.ctf = Tutorials\SLUHandsOn\atis.test.ctf
|
||||
Tutorials\SLUHandsOn\atis.train.ctf = Tutorials\SLUHandsOn\atis.train.ctf
|
||||
EndProjectSection
|
||||
EndProject
|
||||
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solutions", "Solutions", "{BD7FF8C0-EC3A-49CD-9D81-4A8A29B8AD8E}"
|
||||
ProjectSection(SolutionItems) = preProject
|
||||
Tutorials\SLUHandsOn\SLUHandsOn_Solution1.cntk = Tutorials\SLUHandsOn\SLUHandsOn_Solution1.cntk
|
||||
Tutorials\SLUHandsOn\SLUHandsOn_Solution2.cntk = Tutorials\SLUHandsOn\SLUHandsOn_Solution2.cntk
|
||||
Tutorials\SLUHandsOn\SLUHandsOn_Solution3.cntk = Tutorials\SLUHandsOn\SLUHandsOn_Solution3.cntk
|
||||
Tutorials\SLUHandsOn\SLUHandsOn_Solution4.cntk = Tutorials\SLUHandsOn\SLUHandsOn_Solution4.cntk
|
||||
EndProjectSection
|
||||
EndProject
|
||||
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Pretrained Models", "Pretrained Models", "{4727594B-A052-4834-B0E8-57DBB9ADEF13}"
|
||||
ProjectSection(SolutionItems) = preProject
|
||||
Tutorials\SLUHandsOn\slu.forward.backward.cmf = Tutorials\SLUHandsOn\slu.forward.backward.cmf
|
||||
Tutorials\SLUHandsOn\slu.forward.cmf = Tutorials\SLUHandsOn\slu.forward.cmf
|
||||
Tutorials\SLUHandsOn\slu.forward.lookahead.cmf = Tutorials\SLUHandsOn\slu.forward.lookahead.cmf
|
||||
Tutorials\SLUHandsOn\slu.forward.nobn.cmf = Tutorials\SLUHandsOn\slu.forward.nobn.cmf
|
||||
EndProjectSection
|
||||
EndProject
|
||||
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Data", "Data", "{4A59163B-1EDE-4439-9E7D-40A30B82A3A0}"
|
||||
ProjectSection(SolutionItems) = preProject
|
||||
Tutorials\ImageHandsOn\CifarConverter.py = Tutorials\ImageHandsOn\CifarConverter.py
|
||||
EndProjectSection
|
||||
EndProject
|
||||
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "TIMIT", "TIMIT", "{B586AA4C-0BB9-4629-9EDA-25FF2618AC9F}"
|
||||
EndProject
|
||||
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "TrainSimpleNetwork", "TrainSimpleNetwork", "{C2102C39-BF5F-4B12-9C41-849D1ED35EE8}"
|
||||
ProjectSection(SolutionItems) = preProject
|
||||
Tests\EndToEndTests\Examples\Speech\TIMIT\TrainSimpleNetwork\baseline.linux.txt = Tests\EndToEndTests\Examples\Speech\TIMIT\TrainSimpleNetwork\baseline.linux.txt
|
||||
Tests\EndToEndTests\Examples\Speech\TIMIT\TrainSimpleNetwork\baseline.windows.txt = Tests\EndToEndTests\Examples\Speech\TIMIT\TrainSimpleNetwork\baseline.windows.txt
|
||||
Tests\EndToEndTests\Examples\Speech\TIMIT\TrainSimpleNetwork\run-test = Tests\EndToEndTests\Examples\Speech\TIMIT\TrainSimpleNetwork\run-test
|
||||
Tests\EndToEndTests\Examples\Speech\TIMIT\TrainSimpleNetwork\testcases.yml = Tests\EndToEndTests\Examples\Speech\TIMIT\TrainSimpleNetwork\testcases.yml
|
||||
EndProjectSection
|
||||
EndProject
|
||||
Global
|
||||
GlobalSection(SolutionConfigurationPlatforms) = preSolution
|
||||
Debug_CpuOnly|x64 = Debug_CpuOnly|x64
|
||||
|
@ -1601,5 +1667,15 @@ Global
|
|||
{1C6E6C53-1AA7-4B69-913E-B97BB5A872CF} = {3385EBEA-5F97-4B2B-9F30-0E6D7F91B9CA}
|
||||
{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E} = {3385EBEA-5F97-4B2B-9F30-0E6D7F91B9CA}
|
||||
{9F999212-AFC5-4EAC-AA78-F7247D46C456} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE}
|
||||
{2230BF3D-4317-4A3F-A743-DDD6160503F8} = {8BE0642A-A3AA-4A64-95D0-C78FB285B2A4}
|
||||
{CC143D08-567D-4DAC-9E14-264749C19039} = {8BE0642A-A3AA-4A64-95D0-C78FB285B2A4}
|
||||
{0ED2EE97-0A26-4865-871F-11033867BA34} = {2230BF3D-4317-4A3F-A743-DDD6160503F8}
|
||||
{A2A4893C-0D5B-42E2-BFAD-C123AE7FDAFD} = {2230BF3D-4317-4A3F-A743-DDD6160503F8}
|
||||
{8CFBD0DB-5F16-48E6-984C-4401317FA10E} = {CC143D08-567D-4DAC-9E14-264749C19039}
|
||||
{BD7FF8C0-EC3A-49CD-9D81-4A8A29B8AD8E} = {CC143D08-567D-4DAC-9E14-264749C19039}
|
||||
{4727594B-A052-4834-B0E8-57DBB9ADEF13} = {CC143D08-567D-4DAC-9E14-264749C19039}
|
||||
{4A59163B-1EDE-4439-9E7D-40A30B82A3A0} = {2230BF3D-4317-4A3F-A743-DDD6160503F8}
|
||||
{B586AA4C-0BB9-4629-9EDA-25FF2618AC9F} = {FB7AF7B9-6BEA-459F-94D9-94D53916D2B6}
|
||||
{C2102C39-BF5F-4B12-9C41-849D1ED35EE8} = {B586AA4C-0BB9-4629-9EDA-25FF2618AC9F}
|
||||
EndGlobalSection
|
||||
EndGlobal
|
||||
|
|
|
@ -259,7 +259,7 @@ CE=CrossEntropyWithSoftmax(labels, Plus2)
|
|||
|
||||
\begin_layout Plain Layout
|
||||
|
||||
ErrPredict=ErrorPrediction(labels, Plus2)
|
||||
ErrPredict=ClassificationError(labels, Plus2)
|
||||
\end_layout
|
||||
|
||||
\begin_layout Plain Layout
|
||||
|
@ -616,7 +616,7 @@ CE=CrossEntropyWithSoftmax(labels, Plus2)
|
|||
|
||||
\begin_layout Plain Layout
|
||||
|
||||
ErrPredict=ErrorPrediction(labels, Plus2)
|
||||
ErrPredict=ClassificationError(labels, Plus2)
|
||||
\end_layout
|
||||
|
||||
\end_inset
|
||||
|
@ -633,19 +633,19 @@ CrossEntropyWithSoftmax
|
|||
|
||||
\end_inset
|
||||
|
||||
() to compute the training criterion and the operator ErrorPrediction
|
||||
() to compute the training criterion and the operator ClassificationError
|
||||
\begin_inset Index idx
|
||||
status open
|
||||
|
||||
\begin_layout Plain Layout
|
||||
ErrorPrediction
|
||||
ClassificationError
|
||||
\end_layout
|
||||
|
||||
\end_inset
|
||||
|
||||
() to compute the testing criterion.
|
||||
These operators are internally represented as computation nodes CrossEntropyWit
|
||||
hSoftmaxNode and ErrorPredictionNode with names CE and ErrPredict, respectively.
|
||||
hSoftmaxNode and ClassificationErrorNode with names CE and ErrPredict, respectively.
|
||||
\end_layout
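(For reference, a minimal NDL fragment using the renamed operator; it simply restates the CE/ErrPredict example shown earlier in this document, with labels and Plus2 as defined there:
CE=CrossEntropyWithSoftmax(labels, Plus2)
ErrPredict=ClassificationError(labels, Plus2)
The computation nodes behind these two lines are the CrossEntropyWithSoftmaxNode and ClassificationErrorNode mentioned above.)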
|
||||
|
||||
\begin_layout Subsubsection
|
||||
|
@ -740,7 +740,7 @@ status open
|
|||
|
||||
\begin_layout Plain Layout
|
||||
|
||||
ErrPredict=ErrorPrediction(labels, Plus2) # classification error
|
||||
ErrPredict=ClassificationError(labels, Plus2) # classification error
|
||||
\end_layout
|
||||
|
||||
\begin_layout Plain Layout
|
||||
|
@ -1025,7 +1025,7 @@ reference "sub:NDL-Basic-Concepts"
|
|||
but is much simpler and easier to understand because of the use of macros.
|
||||
One new feature shown in this network definition is the access to macro-region
|
||||
variables.
|
||||
ErrorPrediction() needs to access an intermediate result from SMBFF before
|
||||
ClassificationError() needs to access an intermediate result from SMBFF before
|
||||
the CrossEntropyWithSoftmax() is applied.
|
||||
Although the needed variable is local to the macro, it can be accessed
|
||||
via the
|
||||
|
@ -1107,7 +1107,7 @@ CE = SMBFF(L1, LDim, HDim, labels)
|
|||
|
||||
\begin_layout Plain Layout
|
||||
|
||||
Err=ErrorPrediction(labels, CE.F)
|
||||
Err=ClassificationError(labels, CE.F)
|
||||
\end_layout
|
||||
|
||||
\end_inset
|
||||
|
@ -1280,7 +1280,7 @@ CE = SMBFF(L3, LDim, HDim, labels, tag="criterion")
|
|||
|
||||
\begin_layout Plain Layout
|
||||
|
||||
Err=ErrorPrediction(labels, CE.F, tag="evaluation")
|
||||
Err=ClassificationError(labels, CE.F, tag="evaluation")
|
||||
\end_layout
|
||||
|
||||
\end_inset
|
||||
|
@ -2900,12 +2900,12 @@ classProbBeforeSoftmax - applying softmax on this matrix will result in
|
|||
\end_layout
|
||||
|
||||
\begin_layout Subsubsection
|
||||
ErrorPrediction
|
||||
ClassificationError
|
||||
\begin_inset Index idx
|
||||
status open
|
||||
|
||||
\begin_layout Plain Layout
|
||||
ErrorPrediction
|
||||
ClassificationError
|
||||
\end_layout
|
||||
|
||||
\end_inset
|
||||
|
@ -2941,7 +2941,7 @@ status open
|
|||
|
||||
\begin_layout Plain Layout
|
||||
|
||||
ErrorPrediction(labels, m)
|
||||
ClassificationError(labels, m)
|
||||
\end_layout
|
||||
|
||||
\begin_layout Plain Layout
|
||||
|
@ -4059,7 +4059,7 @@ CE = SMBFF(L3, LDim, HDim, labels, tag="criterion")
|
|||
|
||||
\begin_layout Plain Layout
|
||||
|
||||
Err=ErrorPrediction(labels, CE.F, tag="evaluation")
|
||||
Err=ClassificationError(labels, CE.F, tag="evaluation")
|
||||
\end_layout
|
||||
|
||||
\begin_layout Plain Layout
|
||||
|
|
|
@ -290,7 +290,7 @@ cntkSpeech.dnn"
|
|||
|
||||
\begin_layout Plain Layout
|
||||
|
||||
evalCriterion="ErrorPrediction"
|
||||
evalCriterion="ClassificationError"
|
||||
\end_layout
|
||||
|
||||
\begin_layout Plain Layout
|
||||
|
@ -1610,7 +1610,7 @@ CE1=CrossEntropyWithSoftmax(labels,BFF1.FF.P,tag="evaluation")
|
|||
|
||||
\begin_layout Plain Layout
|
||||
|
||||
FER1 = ErrorPrediction(labels,BFF1.FF.P,tag="evaluation")
|
||||
FER1 = ClassificationError(labels,BFF1.FF.P,tag="evaluation")
|
||||
\end_layout
|
||||
|
||||
\begin_layout Plain Layout
|
||||
|
@ -1634,7 +1634,7 @@ CE2=CrossEntropyWithSoftmax(regions,BFF2.FF.P,tag="evaluation")
|
|||
|
||||
\begin_layout Plain Layout
|
||||
|
||||
FER2 = ErrorPrediction(regions,BFF2.FF.P,tag="evaluation")
|
||||
FER2 = ClassificationError(regions,BFF2.FF.P,tag="evaluation")
|
||||
\end_layout
|
||||
|
||||
\begin_layout Plain Layout
|
||||
|
|
|
@ -514,7 +514,7 @@ Simple_Demo=[
|
|||
|
||||
\begin_layout Plain Layout
|
||||
|
||||
evalCriterion="ErrorPrediction"
|
||||
evalCriterion="ClassificationError"
|
||||
\end_layout
|
||||
|
||||
\begin_layout Plain Layout
|
||||
|
|
|
@ -52,12 +52,12 @@ train = [
|
|||
z = DNNLayer (hiddenDim, labelDim, h1, 1)
|
||||
|
||||
ce = CrossEntropyWithSoftmax (labels, z)
|
||||
errs = ErrorPrediction (labels, z)
|
||||
errs = ClassificationError (labels, z)
|
||||
|
||||
# set top5Errs as an evaluation node to compute the top-5 error rate
|
||||
# This is not marked tag="evaluation" since expensive during training.
|
||||
# We explicitly select it as an output node in the "test" command.
|
||||
top5Errs = ErrorPrediction (labels, z, topN=5)
|
||||
top5Errs = ClassificationError (labels, z, topN=5)
|
||||
|
||||
# declare special nodes
|
||||
featureNodes = (features)
|
||||
|
|
|
@ -22,8 +22,8 @@ DNN = [
|
|||
ol = DNNLayer(hiddenDim, labelDim, h1, 1)
|
||||
|
||||
ce = CrossEntropyWithSoftmax(labels, ol)
|
||||
errs = ErrorPrediction(labels, ol)
|
||||
top5Errs = ErrorPrediction(labels, ol, Const(5), tag="eval") # only used in testing
|
||||
errs = ClassificationError(labels, ol)
|
||||
top5Errs = ClassificationError(labels, ol, Const(5), tag="eval") # only used in testing
|
||||
|
||||
# Special Nodes
|
||||
FeatureNodes = (features)
|
||||
|
|
|
@ -58,7 +58,7 @@ DNN=[
|
|||
ol = DNNLayer(h1Dim, labelDim, h1, 1)
|
||||
|
||||
ce = CrossEntropyWithSoftmax(labels, ol)
|
||||
errs = ErrorPrediction(labels, ol)
|
||||
errs = ClassificationError(labels, ol)
|
||||
|
||||
# Special Nodes
|
||||
FeatureNodes = (features)
|
||||
|
|
|
@ -64,7 +64,7 @@ DNN = [
|
|||
ol = DNNLayer(h1Dim, labelDim, h1, 1)
|
||||
|
||||
ce = CrossEntropyWithSoftmax(labels, ol)
|
||||
errs = ErrorPrediction(labels, ol)
|
||||
errs = ClassificationError(labels, ol)
|
||||
|
||||
# Special Nodes
|
||||
FeatureNodes = (features)
|
||||
|
|
|
@ -48,7 +48,7 @@ DNN=[
|
|||
deconv1 = DeconvReLULayer(unpool1, kW1, kH1, imageC, 25, cMap1, hStride1, vStride1, lpad1, upad1, wScale1, bValue1)
|
||||
|
||||
mse = SquareError(featScaled, deconv1)
|
||||
#err = ErrorPrediction(labels, ol)
|
||||
#err = ClassificationError(labels, ol)
|
||||
|
||||
# Special Nodes
|
||||
FeatureNodes = (features)
|
||||
|
|
|
@ -79,7 +79,7 @@ DNN=[
|
|||
ol = DNNLastLayer(hiddenDim, labelDim, h1_d, fc2WScale, fc2BValue)
|
||||
|
||||
CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
|
||||
Err = ErrorPrediction(labels, ol, tag = Eval)
|
||||
Err = ClassificationError(labels, ol, tag = Eval)
|
||||
OutputNodes = ol
|
||||
]
|
||||
|
||||
|
|
|
@ -84,7 +84,7 @@ DNN=[
|
|||
ol = DNNLastLayer(hiddenDim, labelDim, h1, fc2WScale, fc2BValue)
|
||||
|
||||
CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
|
||||
Err = ErrorPrediction(labels, ol, tag = Eval)
|
||||
Err = ClassificationError(labels, ol, tag = Eval)
|
||||
OutputNodes = ol
|
||||
]
|
||||
|
||||
|
|
|
@ -61,7 +61,7 @@ DNN=[
|
|||
ol = DnnLastLayer(cMap3, labelDim, pool, fc1WScale, fc1BValue)
|
||||
|
||||
CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
|
||||
Err = ErrorPrediction(labels, ol, tag = Eval)
|
||||
Err = ClassificationError(labels, ol, tag = Eval)
|
||||
OutputNodes = ol
|
||||
]
|
||||
|
||||
|
|
|
@ -106,7 +106,7 @@ DNN=[
|
|||
ol = DnnLastLayer(cMap3, labelDim, pool, fc1WScale, fc1BValue)
|
||||
|
||||
CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
|
||||
Err = ErrorPrediction(labels, ol, tag = Eval)
|
||||
Err = ClassificationError(labels, ol, tag = Eval)
|
||||
OutputNodes = ol
|
||||
]
|
||||
|
||||
|
|
|
@ -78,7 +78,7 @@ DNN=[
|
|||
ol = DnnImageLastLayer(7, 7, cMap4, labelDim, conv4, fc1WScale, fc1BValue)
|
||||
|
||||
CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
|
||||
Err = ErrorPrediction(labels, ol, tag = Eval)
|
||||
Err = ClassificationError(labels, ol, tag = Eval)
|
||||
OutputNodes = ol
|
||||
]
|
||||
|
||||
|
|
|
@ -0,0 +1,105 @@
|
|||
import sys
|
||||
import urllib.request as ul
|
||||
import pickle as cp
|
||||
import tarfile
|
||||
import shutil
|
||||
import os
|
||||
import struct
|
||||
import numpy as np
|
||||
import getopt
|
||||
|
||||
ImgSize = 32
|
||||
NumFeat = ImgSize * ImgSize * 3
|
||||
|
||||
def readBatch(src, outFmt):
|
||||
with open(src, 'rb') as f:
|
||||
d = cp.load(f, encoding="latin1")
|
||||
# Note: most of the frameworks use spatial-major (aka NCHW) input format:
|
||||
# R0..RN,G0..GN,B0..BN
|
||||
# There are 2 possible options in CNTK:
|
||||
# 1. If CNTK is built with cuDNN then 'cudnn' (i.e. NCHW format) should be used.
|
||||
# 2. Otherwise, legacy CNTK 'NHWC' format should be used. As CIFAR-10 dataset comes in
|
||||
# NCHW format, it has to be converted to CNTK legacy format first.
|
||||
data = d['data']
|
||||
if outFmt == 'cudnn':
|
||||
feat = data
|
||||
elif outFmt == 'legacy':
|
||||
r = data[:, : ImgSize * ImgSize]
|
||||
g = data[:, ImgSize * ImgSize : 2 * ImgSize * ImgSize]
|
||||
b = data[:, 2 * ImgSize * ImgSize : 3 * ImgSize * ImgSize]
|
||||
feat = np.empty_like(data)
|
||||
feat[:, ::3] = r
|
||||
feat[:, 1::3] = g
|
||||
feat[:, 2::3] = b
|
||||
else:
|
||||
print ('Format not supported: ' + outFmt)
|
||||
usage()
|
||||
sys.exit(1)
|
||||
res = np.hstack((feat, np.reshape(d['labels'], (len(d['labels']), 1))))
|
||||
return res.astype(np.int)
|
||||
|
||||
def loadData(src, outFmt):
|
||||
print ('Downloading ' + src)
|
||||
fname, h = ul.urlretrieve(src, './delete.me')
|
||||
print ('Done.')
|
||||
try:
|
||||
print ('Extracting files...')
|
||||
with tarfile.open(fname) as tar:
|
||||
tar.extractall()
|
||||
print ('Done.')
|
||||
print ('Preparing train set...')
|
||||
trn = np.empty((0, NumFeat + 1), dtype=np.int)
|
||||
for i in range(5):
|
||||
batchName = './cifar-10-batches-py/data_batch_{0}'.format(i + 1)
|
||||
trn = np.vstack((trn, readBatch(batchName, outFmt)))
|
||||
print ('Done.')
|
||||
print ('Preparing test set...')
|
||||
tst = readBatch('./cifar-10-batches-py/test_batch', outFmt)
|
||||
print ('Done.')
|
||||
finally:
|
||||
os.remove(fname)
|
||||
return (trn, tst)
|
||||
|
||||
def usage():
|
||||
print ('Usage: CIFAR_convert.py [-f <format>] \n where format can be either cudnn or legacy. Default is cudnn.')
|
||||
|
||||
def parseCmdOpt(argv):
|
||||
if len(argv) == 0:
|
||||
print ("Using cudnn output format.")
|
||||
return "cudnn"
|
||||
try:
|
||||
opts, args = getopt.getopt(argv, 'hf:', ['help', 'outFormat='])
|
||||
except getopt.GetoptError:
|
||||
usage()
|
||||
sys.exit(1)
|
||||
for opt, arg in opts:
|
||||
if opt in ('-h', '--help'):
|
||||
usage()
|
||||
sys.exit()
|
||||
elif opt in ('-f', '--outFormat'):
|
||||
fmt = arg
|
||||
if fmt != 'cudnn' and fmt != 'legacy':
|
||||
print ('Invalid output format option.')
|
||||
usage()
|
||||
sys.exit(1)
|
||||
return fmt
|
||||
|
||||
def savetxt(filename, ndarray):
|
||||
with open(filename, 'w') as f:
|
||||
labels = list(map(' '.join, np.eye(10, dtype=np.uint).astype(str)))
|
||||
for row in ndarray:
|
||||
row_str = row.astype(str)
|
||||
label_str = labels[row[-1]]
|
||||
feature_str = ' '.join(row_str[:-1])
|
||||
f.write('|labels {} |features {}\n'.format(label_str, feature_str))
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
fmt = parseCmdOpt(sys.argv[1:])
|
||||
trn, tst = loadData('http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz', fmt)
|
||||
print ('Writing train text file...')
|
||||
savetxt(r'./Train_cntk_text.txt', trn)
|
||||
print ('Done.')
|
||||
print ('Writing test text file...')
|
||||
savetxt(r'./Test_cntk_text.txt', tst)
|
||||
print ('Done.')
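# Usage sketch for the script added above (file name taken from its own usage() text; it needs
# Python 3, since it relies on urllib.request and pickle's encoding argument). Run it from the
# directory where the output files should land; it writes Train_cntk_text.txt and
# Test_cntk_text.txt there. Pass "-f legacy" only for CNTK builds without cuDNN:
#     python CIFAR_convert.py -f cudnn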
|
|
@ -0,0 +1,73 @@
|
|||
import os
|
||||
import sys
|
||||
import struct
|
||||
import pickle as cp
|
||||
from PIL import Image
|
||||
import numpy as np
|
||||
import xml.etree.cElementTree as et
|
||||
import xml.dom.minidom
|
||||
|
||||
imgSize = 32
|
||||
|
||||
def saveImage(fname, data, label, mapFile, pad, **key_parms):
|
||||
# data in CIFAR-10 dataset is in CHW format.
|
||||
pixData = data.reshape((3, imgSize, imgSize))
|
||||
if ('mean' in key_parms):
|
||||
key_parms['mean'] += pixData
|
||||
|
||||
if pad > 0:
|
||||
pixData = np.pad(pixData, ((0, 0), (pad, pad), (pad, pad)), mode='constant', constant_values=128) # can also use mode='edge'
|
||||
|
||||
img = Image.new('RGB', (imgSize + 2 * pad, imgSize + 2 * pad))
|
||||
pixels = img.load()
|
||||
for x in range(img.size[0]):
|
||||
for y in range(img.size[1]):
|
||||
pixels[x, y] = (pixData[0][y][x], pixData[1][y][x], pixData[2][y][x])
|
||||
img.save(fname)
|
||||
mapFile.write("%s\t%d\n" % (fname, label))
|
||||
|
||||
def saveMean(fname, data):
|
||||
root = et.Element('opencv_storage')
|
||||
et.SubElement(root, 'Channel').text = '3'
|
||||
et.SubElement(root, 'Row').text = str(imgSize)
|
||||
et.SubElement(root, 'Col').text = str(imgSize)
|
||||
meanImg = et.SubElement(root, 'MeanImg', type_id='opencv-matrix')
|
||||
et.SubElement(meanImg, 'rows').text = '1'
|
||||
et.SubElement(meanImg, 'cols').text = str(imgSize * imgSize * 3)
|
||||
et.SubElement(meanImg, 'dt').text = 'f'
|
||||
et.SubElement(meanImg, 'data').text = ' '.join(['%e' % n for n in np.reshape(data, (imgSize * imgSize * 3))])
|
||||
|
||||
tree = et.ElementTree(root)
|
||||
tree.write(fname)
|
||||
x = xml.dom.minidom.parse(fname)
|
||||
with open(fname, 'w') as f:
|
||||
f.write(x.toprettyxml(indent = ' '))
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) != 2:
|
||||
print ("Usage: CifarConverter.py <path to CIFAR-10 dataset directory>\nCIFAR-10 dataset (Python version) can be downloaded from http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz")
|
||||
sys.exit(1)
|
||||
rootDir = sys.argv[1]
|
||||
trainDir = os.path.join(rootDir, os.path.join('data', 'train'))
|
||||
if not os.path.exists(trainDir):
|
||||
os.makedirs(trainDir)
|
||||
testDir = os.path.join(rootDir, os.path.join('data', 'test'))
|
||||
if not os.path.exists(testDir):
|
||||
os.makedirs(testDir)
|
||||
data = {}
|
||||
dataMean = np.zeros((3, imgSize, imgSize)) # mean is in CHW format.
|
||||
with open(os.path.join(rootDir, 'train_map.txt'), 'w') as mapFile:
|
||||
for ifile in range(1, 6):
|
||||
with open(os.path.join(rootDir, 'data_batch_' + str(ifile)), 'rb') as f:
|
||||
data = cp.load(f, encoding='latin1')
|
||||
for i in range(10000):
|
||||
fname = os.path.join(trainDir, ('%05d.png' % (i + (ifile - 1) * 10000)))
|
||||
saveImage(fname, data['data'][i, :], data['labels'][i], mapFile, 4, mean=dataMean)
|
||||
dataMean = dataMean / (50 * 1000)
|
||||
saveMean(os.path.join(rootDir, 'CIFAR-10_mean.xml'), dataMean)
|
||||
with open(os.path.join(rootDir, 'test_map.txt'), 'w') as mapFile:
|
||||
with open(os.path.join(rootDir, 'test_batch'), 'rb') as f:
|
||||
data = cp.load(f, encoding='latin1')
|
||||
for i in range(10000):
|
||||
fname = os.path.join(testDir, ('%05d.png' % i))
|
||||
saveImage(fname, data['data'][i, :], data['labels'][i], mapFile, 0)
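# Usage sketch for this converter (name taken from its usage message; numpy and PIL/Pillow must
# be installed, and the extracted python-version CIFAR-10 batches must already be in the given
# directory):
#     python CifarConverter.py <path to CIFAR-10 dataset directory>
# As the code above shows, it writes PNGs under <path>/data/train and <path>/data/test plus
# train_map.txt, test_map.txt, and CIFAR-10_mean.xml.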
|
|
@ -41,8 +41,8 @@ TrainConvNet = [
|
|||
|
||||
# connect to system
|
||||
ce = CrossEntropyWithSoftmax (labels, z)
|
||||
errs = ErrorPrediction (labels, z)
|
||||
top5Errs = ErrorPrediction (labels, z, topN=5) # only used in Eval action
|
||||
errs = ClassificationError (labels, z)
|
||||
top5Errs = ClassificationError (labels, z, topN=5) # only used in Eval action
|
||||
|
||||
featureNodes = (features)
|
||||
labelNodes = (labels)
|
||||
|
@ -109,8 +109,8 @@ TrainConvNetWithBN = [
|
|||
|
||||
# connect to system
|
||||
ce = CrossEntropyWithSoftmax (labels, z)
|
||||
errs = ErrorPrediction (labels, z)
|
||||
top5Errs = ErrorPrediction (labels, z, topN=5)
|
||||
errs = ClassificationError (labels, z)
|
||||
top5Errs = ClassificationError (labels, z, topN=5)
|
||||
|
||||
featureNodes = (features)
|
||||
labelNodes = (labels)
|
||||
|
|
|
@ -1,229 +0,0 @@
|
|||
# Simple CIFAR-10 convnet, without and with BatchNormalization.
|
||||
|
||||
command = TrainConvNet:Eval
|
||||
#command = TrainConvNetWithBN:Eval
|
||||
|
||||
makeMode = false ; traceLevel = 0 ; deviceId = "auto"
|
||||
|
||||
RootDir = "." ; DataDir = "$RootDir$" ; ModelDir = "$RootDir$/Output/Models"
|
||||
|
||||
modelPath = "$ModelDir$/cifar10.cmf"
|
||||
|
||||
# Training without BN
|
||||
TrainConvNet = {
|
||||
action = "train"
|
||||
|
||||
BrainScriptNetworkBuilder = {
|
||||
imageShape = 32:32:3
|
||||
labelDim = 10
|
||||
|
||||
# basic model
|
||||
model_basic (features) =
|
||||
{
|
||||
featNorm = features - Constant (128)
|
||||
l1 = ConvolutionalLayer {32, (5:5), pad = true, activation = ReLU,
|
||||
init = "gaussian", initValueScale = 0.0043} (featNorm)
|
||||
p1 = MaxPoolingLayer {(3:3), stride = (2:2)} (l1)
|
||||
l2 = ConvolutionalLayer {32, (5:5), pad = true, activation = ReLU,
|
||||
init = "gaussian", initValueScale = 1.414} (p1)
|
||||
p2 = MaxPoolingLayer {(3:3), stride = (2:2)} (l2)
|
||||
l3 = ConvolutionalLayer {64, (5:5), pad = true, activation = ReLU,
|
||||
init = "gaussian", initValueScale = 1.414} (p2)
|
||||
p3 = MaxPoolingLayer {(3:3), stride = (2:2)} (l3)
|
||||
d1 = DenseLayer {64, activation = ReLU, init = "gaussian", initValueScale = 12} (p3)
|
||||
z = LinearLayer {10, init = "gaussian", initValueScale = 1.5} (d1)
|
||||
}.z
|
||||
|
||||
# with self-defined layer
|
||||
MyConvReLUPoolLayer {dim, initValueScale} =
|
||||
{
|
||||
C = ConvolutionalLayer {dim, (5:5), pad = true, activation = ReLU, init = "gaussian", initValueScale = initValueScale}
|
||||
P = MaxPoolingLayer {(3:3), stride = (2:2)}
|
||||
apply (x) = P(C(x))
|
||||
}.apply
|
||||
model_layers (features) =
|
||||
{
|
||||
featNorm = features - Constant (128)
|
||||
h1 = MyConvReLUPoolLayer {32, 0.0043} (featNorm)
|
||||
h2 = MyConvReLUPoolLayer {32, 1.414} (h1)
|
||||
h3 = MyConvReLUPoolLayer {64, 1.414} (h2)
|
||||
d1 = DenseLayer {64, activation = ReLU, init = "gaussian", initValueScale = 12} (h3)
|
||||
z = LinearLayer {labelDim, init = "gaussian", initValueScale = 1.5} (d1)
|
||||
}.z
|
||||
|
||||
# model-composition style
|
||||
# ...TODO: test this again; last run was a little worse
|
||||
Subtract128 (x) = x - Constant (128)
|
||||
model_compositionStyle = Sequential (
|
||||
Subtract128 :
|
||||
MyConvReLUPoolLayer {32, 0.0043} :
|
||||
MyConvReLUPoolLayer {32, 1.414} :
|
||||
MyConvReLUPoolLayer {64, 1.414} :
|
||||
DenseLayer {64, activation = ReLU, init = "gaussian", initValueScale = 12} :
|
||||
LinearLayer {labelDim, init = "gaussian", initValueScale = 1.5}
|
||||
)
|
||||
#model_compositionStyle =
|
||||
# Subtract128 >>
|
||||
# LayerStack {3, i => MyConvReLUPoolLayer {dims[i], initValueScales[i]} } >>
|
||||
# MyConvReLUPoolLayer {32, 0.0043} >>
|
||||
# MyConvReLUPoolLayer {32, 1.414} >>
|
||||
# MyConvReLUPoolLayer {64, 1.414} >>
|
||||
# DenseLayer {64, activation = ReLU, init = "gaussian", initValueScale = 12} >>
|
||||
# LinearLayer {labelDim, init = "gaussian", initValueScale = 1.5}
|
||||
|
||||
// --- with BatchNorm
|
||||
MyConvBNReLUPoolLayer {dim, initValueScale} =
|
||||
{
|
||||
C = ConvolutionalLayer {dim, (5:5), pad = true, bias = false, init = "gaussian", initValueScale = initValueScale}
|
||||
B = BatchNormalizationLayer {spatialRank = 2, normalizationTimeConstant = 4096}
|
||||
P = MaxPoolingLayer {(3:3), stride = (2:2)}
|
||||
apply (x) = P(ReLU(B(C(x))))
|
||||
}.apply
|
||||
MyDenseBNReLULayer {dim, initValueScale} =
|
||||
{
|
||||
D = DenseLayer {dim, bias = false, init = "gaussian", initValueScale = initValueScale}
|
||||
B = BatchNormalizationLayer {normalizationTimeConstant = 4096}
|
||||
apply (x) = ReLU(B(D(x)))
|
||||
}.apply
|
||||
model_withBatchNorm (features) =
|
||||
{
|
||||
featNorm = features - Constant (128)
|
||||
h1 = MyConvBNReLUPoolLayer {32, 0.0043} (featNorm)
|
||||
h2 = MyConvBNReLUPoolLayer {32, 1.414} (h1)
|
||||
h3 = MyConvBNReLUPoolLayer {64, 1.414} (h2)
|
||||
d1 = MyDenseBNReLULayer {64, 12} (h3)
|
||||
z = LinearLayer {labelDim, init = "gaussian", initValueScale = 1.5} (d1)
|
||||
}.z
|
||||
|
||||
// --- ResNet
|
||||
MyConvBNLayer {dim, initValueScale, stride} =
|
||||
{
|
||||
# note: (3:3), while the macro above is (5:5)
|
||||
C = ConvolutionalLayer {dim, (3:3), pad = true, stride = (stride:stride), bias = false, init = "gaussian", initValueScale = initValueScale}
|
||||
B = BatchNormalizationLayer {spatialRank = 2, normalizationTimeConstant = 4096}
|
||||
apply (x) = B(C(x))
|
||||
}.apply
|
||||
ResNetNode {dim, initValueScale} =
|
||||
{
|
||||
C1 = MyConvBNLayer {dim, initValueScale, 1} # first convolution layer
|
||||
C2 = MyConvBNLayer {dim, initValueScale, 1} # second convolution layer
|
||||
#B = BatchNormalizationLayer {spatialRank = 2, normalizationTimeConstant = 4096}
|
||||
# ^^ Note: Adding an extra BN to 'x' trains slightly better.
|
||||
apply (x) = ReLU (x + C2(ReLU(C1(x)))) # ReLU between C1 and C2 and after summation
|
||||
}.apply
|
||||
ResNetIncNode {dim, initValueScale} =
|
||||
{
|
||||
# first branch. This doubles the #channels but halves the image size
|
||||
C1 = MyConvBNLayer {dim, initValueScale, 2} # first convolution layer, stride = 2
|
||||
C2 = MyConvBNLayer {dim, initValueScale, 1} # second convolution layer
|
||||
|
||||
# second branch:
|
||||
# sub-sample spatially by a factor of 2
|
||||
DownSamplingLayer {stride} = MaxPoolingLayer {(1:1), stride = stride}
|
||||
# append dim/2 zero output channels
|
||||
pad = ConstantTensor (0, (1:1:dim/2)) # the 1s will broadcast to image size
|
||||
P(x) = Splice ((DownSamplingLayer {(2:2)} (x) : pad), axis = 3)
|
||||
B = BatchNormalizationLayer {spatialRank = 2, normalizationTimeConstant = 4096}
|
||||
|
||||
# layer sums both branches and rectifies the result
|
||||
apply (x) = ReLU (B(P(x)) + C2(ReLU(C1(x)))) # ReLU between C1 and C2 and after summation
|
||||
}.apply
|
||||
model_resNet (features) =
|
||||
{
|
||||
conv1 = MyConvBNLayer {16, 0.26, 1} (features)
|
||||
rl1 = ReLU (conv1)
|
||||
rn1 = LayerStack {3, _ => ResNetNode {16, 7.07}} (rl1)
|
||||
|
||||
rn2_1 = ResNetIncNode {32, 7.07} (rn1)
|
||||
rn2 = LayerStack {2, _ => ResNetNode {32, 7.07}} (rn2_1)
|
||||
|
||||
rn3_1 = ResNetIncNode {64, 7.07} (rn2)
|
||||
rn3 = LayerStack {2, _ => ResNetNode {64, 7.07}} (rn3_1)
|
||||
|
||||
pool = AveragePoolingLayer {(8:8)} (rn3)
|
||||
|
||||
z = LinearLayer {labelDim, init = "gaussian", initValueScale = 0.4} (pool)
|
||||
}.z
|
||||
|
||||
# inputs
|
||||
features = Input {imageShape}
|
||||
labels = Input {labelDim}
|
||||
|
||||
# apply model to features
|
||||
z = model_basic (features)
|
||||
|
||||
# connect to system
|
||||
ce = CrossEntropyWithSoftmax (labels, z)
|
||||
errs = ErrorPrediction (labels, z)
|
||||
top5Errs = ErrorPrediction (labels, z, topN=5) # only used in Eval action
|
||||
|
||||
featureNodes = (features)
|
||||
labelNodes = (labels)
|
||||
criterionNodes = (ce)
|
||||
evaluationNodes = (errs) # top5Errs only used in Eval
|
||||
outputNodes = (z)
|
||||
}
|
||||
|
||||
SGD = {
|
||||
epochSize = 50000 # 49984 --TODO: why 16 less?
|
||||
|
||||
# without BatchNormalization:
|
||||
maxEpochs = 30 ; minibatchSize = 64
|
||||
learningRatesPerSample = 0.00015625*10:0.000046875*10:0.000015625
|
||||
momentumAsTimeConstant = 600*20:6400
|
||||
L2RegWeight = 0.03
|
||||
|
||||
# with BatchNormalization:
|
||||
#maxEpochs = 30 ; minibatchSize = 64
|
||||
#learningRatesPerSample = 0.00046875*7:0.00015625
|
||||
#momentumAsTimeConstant = 0
|
||||
#L2RegWeight = 0
|
||||
|
||||
# ResNet
|
||||
#maxEpochs = 160 ; minibatchSize = 128
|
||||
#learningRatesPerSample = 0.0078125*80:0.00078125*40:0.000078125
|
||||
#momentumAsTimeConstant = 1200
|
||||
#L2RegWeight = 0.0001
|
||||
|
||||
firstMBsToShowResult = 10 ; numMBsToShowResult = 500
|
||||
}
|
||||
|
||||
reader = {
|
||||
verbosity = 0
|
||||
randomize = true
|
||||
deserializers = ({
|
||||
type = "ImageDeserializer" ; module = "ImageReader"
|
||||
file = "$DataDir$/cifar-10-batches-py/train_map.txt"
|
||||
input = {
|
||||
features = { transforms = (
|
||||
{ type = "Crop" ; cropType = "random" ; cropRatio = 0.8 ; jitterType = "uniRatio" } :
|
||||
{ type = "Scale" ; width = 32 ; height = 32 ; channels = 3 ; interpolations = "linear" } :
|
||||
{ type = "Transpose" }
|
||||
)}
|
||||
labels = { labelDim = 10 }
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
# Eval action
|
||||
Eval = {
|
||||
action = "eval"
|
||||
minibatchSize = 16
|
||||
evalNodeNames = errs:top5Errs # also test top-5 error rate
|
||||
reader = {
|
||||
verbosity = 0
|
||||
randomize = true
|
||||
deserializers = ({
|
||||
type = "ImageDeserializer" ; module = "ImageReader"
|
||||
file = "$DataDir$/cifar-10-batches-py/test_map.txt"
|
||||
input = {
|
||||
features = { transforms = (
|
||||
{ type = "Scale" ; width = 32 ; height = 32 ; channels = 3 ; interpolations = "linear" } :
|
||||
{ type = "Transpose" }
|
||||
)}
|
||||
labels = { labelDim = 10 }
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
|
@ -1,6 +1,6 @@
|
|||
m1 = LoadModel("$curModel$", format="cntk")
|
||||
SetDefaultModel(m1)
|
||||
|
||||
errTop5 = ErrorPrediction(labels, outputNodes.z, Const(5), tag="eval")
|
||||
errTop5 = ClassificationError(labels, outputNodes.z, Const(5), tag="eval")
|
||||
|
||||
SaveModel(m1, "$newModel$", format="cntk")
|
||||
|
|
|
@ -103,6 +103,6 @@ DNN=[
|
|||
ol = DNNLastLayer(hiddenDim, labelDim, h2_d, fc3WScale, fc3BValue)
|
||||
|
||||
CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
|
||||
Err = ErrorPrediction(labels, ol, tag = Eval)
|
||||
Err = ClassificationError(labels, ol, tag = Eval)
|
||||
OutputNodes = ol
|
||||
]
|
||||
|
|
|
@ -2,6 +2,6 @@ m1=LoadModel($CurModel$, format=cntk)
|
|||
SetDefaultModel(m1)
|
||||
|
||||
# Add top-5 error prediction node.
|
||||
ErrTop5 = ErrorPrediction(labels, OutputNodes.z, Const(5), tag = Eval)
|
||||
ErrTop5 = ClassificationError(labels, OutputNodes.z, Const(5), tag = Eval)
|
||||
|
||||
SaveModel(m1, $NewModel$, format=cntk)
|
|
@ -111,6 +111,6 @@ DNN=[
|
|||
ol = DnnLayer(cMap6, labelDim, pool2, fcWScale, fcBValue)
|
||||
|
||||
CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
|
||||
Err = ErrorPrediction(labels, ol, tag = Eval)
|
||||
Err = ClassificationError(labels, ol, tag = Eval)
|
||||
OutputNodes = ol
|
||||
]
|
||||
|
|
|
@ -67,6 +67,6 @@ DNN=[
|
|||
ol = DnnLayer(cMap4, labelDim, pool5, fcWScale, fcBValue)
|
||||
|
||||
CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
|
||||
Err = ErrorPrediction(labels, ol, tag = Eval)
|
||||
Err = ClassificationError(labels, ol, tag = Eval)
|
||||
OutputNodes = ol
|
||||
]
|
||||
|
|
|
@ -74,6 +74,6 @@ DNN=[
|
|||
ol = DnnLayer(cMap4, labelDim, pool5, fcWScale, fcBValue)
|
||||
|
||||
CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
|
||||
Err = ErrorPrediction(labels, ol, tag = Eval)
|
||||
Err = ClassificationError(labels, ol, tag = Eval)
|
||||
OutputNodes = ol
|
||||
]
|
||||
|
|
|
@ -77,6 +77,6 @@ DNN=[
|
|||
ol = DnnLayer(cMap6, labelDim, pool2, fcWScale, fcBValue)
|
||||
|
||||
CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
|
||||
Err = ErrorPrediction(labels, ol, tag = Eval)
|
||||
Err = ClassificationError(labels, ol, tag = Eval)
|
||||
OutputNodes = ol
|
||||
]
|
||||
|
|
|
@ -2,6 +2,6 @@ m1=LoadModel($CurModel$, format=cntk)
|
|||
SetDefaultModel(m1)
|
||||
|
||||
# Add top-5 error prediction node.
|
||||
ErrTop5 = ErrorPrediction(labels, OutputNodes.z, Const(5), tag = "eval")
|
||||
ErrTop5 = ClassificationError(labels, OutputNodes.z, Const(5), tag = "eval")
|
||||
|
||||
SaveModel(m1, $NewModel$, format=cntk)
|
|
@ -71,6 +71,6 @@ DNN=[
|
|||
ol = DnnLayer(hiddenDim, labelDim, h2_d, fc3WScale, fc3BValue)
|
||||
|
||||
CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
|
||||
Err = ErrorPrediction(labels, ol, tag = Eval)
|
||||
Err = ClassificationError(labels, ol, tag = Eval)
|
||||
OutputNodes = ol
|
||||
]
|
||||
|
|
|
@ -79,6 +79,6 @@ DNN=[
|
|||
ol = DnnLayer(hiddenDim, labelDim, h2_d, fc3WScale, fc3BValue)
|
||||
|
||||
CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
|
||||
Err = ErrorPrediction(labels, ol, tag = Eval)
|
||||
Err = ClassificationError(labels, ol, tag = Eval)
|
||||
OutputNodes = ol
|
||||
]
|
||||
|
|
|
@ -80,6 +80,6 @@ DNN=[
|
|||
ol = DnnLayer(hiddenDim, labelDim, h2_d, fc3WScale, fc3BValue)
|
||||
|
||||
CE = CrossEntropyWithSoftmax(labels, ol, tag = Criteria)
|
||||
Err = ErrorPrediction(labels, ol, tag = Eval)
|
||||
Err = ClassificationError(labels, ol, tag = Eval)
|
||||
OutputNodes = ol
|
||||
]
|
||||
|
|
|
@ -29,7 +29,7 @@ ndlTestCosDist=[
|
|||
CD = CosDistance(L4, labels);
|
||||
CDAll=SumElements(CD)
|
||||
NCD=Negate(CDALL, tag="criterion")
|
||||
Err=ErrorPrediction(labels, L4, tag="evaluation")
|
||||
Err=ClassificationError(labels, L4, tag="evaluation")
|
||||
|
||||
# rootNodes defined here
|
||||
OutputNodes=(L4)
|
||||
|
@ -129,7 +129,7 @@ ndlFull=[
|
|||
#SM=Softmax(Plus2)
|
||||
#CE=CrossEntropy(labels, SM)
|
||||
CE=CrossEntropyWithSoftmax(labels, Plus2)
|
||||
ErrPredict=ErrorPrediction(labels, Plus2)
|
||||
ErrPredict=ClassificationError(labels, Plus2)
|
||||
FeatureNodes=(features)
|
||||
LabelNodes=(labels)
|
||||
CriterionNodes=(CE)
|
||||
|
@ -233,7 +233,7 @@ ndlMacroUse2=[
|
|||
L2 = RBFF(L1, HDim, HDim)
|
||||
L3 = RBFF(L2, HDim, HDim)
|
||||
CE = SMBFF(L3, LDim, HDim, labels, tag="criterion")
|
||||
Err=ErrorPrediction(labels, CE.BFF, tag="evaluation")
|
||||
Err=ClassificationError(labels, CE.BFF, tag="evaluation")
|
||||
|
||||
# rootNodes defined here
|
||||
OutputNodes=(CE.BFF)
|
||||
|
@ -290,7 +290,7 @@ ndlMacroUseCNNSubSample2ZeroPadding=[
|
|||
HDim=128
|
||||
L1 = SBFF(mp, HDim, mpoutputSizePerSample)
|
||||
CE = SMBFF(L1, LDim, HDim, labels, tag="criterion")
|
||||
Err=ErrorPrediction(labels, CE.BFF, tag="evaluation")
|
||||
Err=ClassificationError(labels, CE.BFF, tag="evaluation")
|
||||
|
||||
# rootNodes defined here
|
||||
OutputNodes=(CE.BFF)
|
||||
|
@ -349,7 +349,7 @@ ndlMacroUseCNNSubSample2=[
|
|||
HDim=128
|
||||
L1 = SBFF(mp, HDim, mpoutputSizePerSample)
|
||||
CE = SMBFF(L1, LDim, HDim, labels, tag="criterion")
|
||||
Err=ErrorPrediction(labels, CE.BFF, tag="evaluation")
|
||||
Err=ClassificationError(labels, CE.BFF, tag="evaluation")
|
||||
|
||||
# rootNodes defined here
|
||||
OutputNodes=(CE.BFF)
|
||||
|
@ -399,7 +399,7 @@ ndlMacroUseCNN=[
|
|||
HDim=128
|
||||
L1 = SBFF(mp, HDim, 0)
|
||||
CE = SMBFF(L1, LDim, HDim, labels, tag="criterion")
|
||||
Err=ErrorPrediction(labels, CE.BFF, tag="evaluation")
|
||||
Err=ClassificationError(labels, CE.BFF, tag="evaluation")
|
||||
|
||||
# rootNodes defined here
|
||||
OutputNodes=(CE.BFF)
|
||||
|
@ -430,7 +430,7 @@ ndlMacroUseNoBase=[
|
|||
L2 = RFFD(L1, HDim, HDim)
|
||||
L3 = RFFD(L2, HDim, HDim)
|
||||
CE = SMFF(L3, LDim, SDim, labels, tag="criterion")
|
||||
Err=ErrorPrediction(labels, CE.BFF, tag="evaluation")
|
||||
Err=ClassificationError(labels, CE.BFF, tag="evaluation")
|
||||
# rootNodes defined here
|
||||
OutputNodes=(CE.BFF)
|
||||
]
|
||||
|
@ -463,7 +463,7 @@ ndlMacroUseMask=[
|
|||
L4=ElementTimes(L3, ML2)
|
||||
|
||||
CE = SMBFF(L4, LDim, HDim, labels, tag="criterion")
|
||||
Err=ErrorPrediction(labels, CE.BFF, tag="evaluation")
|
||||
Err=ClassificationError(labels, CE.BFF, tag="evaluation")
|
||||
|
||||
# output nodes
|
||||
Prior=Mean(labels)
|
||||
|
|
|
@ -39,7 +39,7 @@ Multigpu_Demo_Train=[
|
|||
# 2 input, 2 50-element hidden, 2 output
|
||||
layerSizes = 2:50*2:2
|
||||
trainingCriterion = "CrossEntropyWithSoftmax"
|
||||
evalCriterion = "ErrorPrediction"
|
||||
evalCriterion = "ClassificationError"
|
||||
layerTypes = "Sigmoid"
|
||||
initValueScale = 1.0
|
||||
applyMeanVarNorm = true
|
||||
|
|
|
@ -32,7 +32,7 @@ Simple_Demo_Train = [
|
|||
# 2 input, 2 50-element hidden, 2 output
|
||||
layerSizes = 2:50*2:2
|
||||
trainingCriterion = "CrossEntropyWithSoftmax"
|
||||
evalCriterion = "ErrorPrediction"
|
||||
evalCriterion = "ClassificationError"
|
||||
layerTypes = "Sigmoid"
|
||||
initValueScale = 1.0
|
||||
applyMeanVarNorm = true
|
||||
|
@ -137,6 +137,6 @@ Simple_Demo_Output=[
|
|||
# grep labels SimpleOutput.labels | awk '{print $1}' > L
|
||||
# diff L P | grep "<" | wc -l
|
||||
# wc -l P
|
||||
# The ratio of the two numbers gives the same error rate as ErrorPrediction/Sample in the log.
|
||||
# The ratio of the two numbers gives the same error rate as ClassificationError/Sample in the log.
|
||||
]
|
||||
]
|
||||
|
|
|
@ -29,7 +29,7 @@ speechTrain = [
|
|||
SimpleNetworkBuilder = [
|
||||
layerSizes = 363:512:512:132
|
||||
trainingCriterion = "CrossEntropyWithSoftmax"
|
||||
evalCriterion = "ErrorPrediction"
|
||||
evalCriterion = "ClassificationError"
|
||||
layerTypes = "Sigmoid"
|
||||
applyMeanVarNorm = true
|
||||
needPrior = true
|
||||
|
|
|
@ -199,7 +199,7 @@ ndlCreateNetwork_LSTMP_c1024_p256_x3 = [
|
|||
LSTMoutputW = Plus(Times(W, LSTMoutput3), b);
|
||||
|
||||
ce = CrossEntropyWithSoftmax(labels, LSTMoutputW);
|
||||
err = ErrorPrediction(labels, LSTMoutputW);
|
||||
err = ClassificationError(labels, LSTMoutputW);
|
||||
|
||||
logPrior = LogPrior(labels)
|
||||
scaledLogLikelihood = Minus(LSTMoutputW, logPrior)
|
||||
|
|
|
@ -172,7 +172,7 @@ ndlCreateNetwork=[
|
|||
criterion = Plus(Scale(cr2,criterion2), Scale(cr1,criterion1), tag=Criteria)
|
||||
|
||||
#CE = SMBFF(Dout,labelDim,hiddenDim,labels,tag=Criteria)
|
||||
Err = ErrorPrediction(labels,DNN_A_CE_BFF,tag=Eval)
|
||||
Err = ClassificationError(labels,DNN_A_CE_BFF,tag=Eval)
|
||||
|
||||
logPrior = LogPrior(labels)
|
||||
|
||||
|
|
|
@ -33,7 +33,7 @@ ndlCreateNetwork=[
|
|||
L2 = SBFF(L1,hiddenDim,hiddenDim)
|
||||
L3 = SBFF(L2,hiddenDim,hiddenDim)
|
||||
CE = SMBFF(L3,labelDim,hiddenDim,labels,tag=Criteria)
|
||||
Err = ErrorPrediction(labels,CE.BFF.FF.P,tag=Eval)
|
||||
Err = ClassificationError(labels,CE.BFF.FF.P,tag=Eval)
|
||||
|
||||
# define output (scaled loglikelihood)
|
||||
logPrior = LogPrior(labels)
|
||||
|
|
|
@ -122,7 +122,7 @@ ndlCreateNetwork=[
|
|||
L5 = SBFF(L4,hiddenDim,hiddenDim)
|
||||
L6 = SBFF(L5,hiddenDim,hiddenDim)
|
||||
CE = SMBFF(L6,labelDim,hiddenDim,labels,tag=Criteria)
|
||||
Err = ErrorPrediction(labels,CE.BFF.FF.P,tag=Eval)
|
||||
Err = ClassificationError(labels,CE.BFF.FF.P,tag=Eval)
|
||||
|
||||
# define output (scaled loglikelihood)
|
||||
logPrior = LogPrior(labels)
|
||||
|
|
|
@ -128,7 +128,7 @@ ndlCreateNetwork=[
|
|||
# same name as the corresponding node in the non-sequence training model.
|
||||
CE.BFF = BFF(L6, labelDim, hiddenDim)
|
||||
Cr = DummyCriterion(objectives, derivatives, CE.BFF.FF.P, tag=Criteria)
|
||||
Err = ErrorPrediction(labels, CE.BFF.FF.P, tag=Eval)
|
||||
Err = ClassificationError(labels, CE.BFF.FF.P, tag=Eval)
|
||||
|
||||
# define output (scaled loglikelihood)
|
||||
logPrior = LogPrior(labels)
|
||||
|
|
|
@ -106,7 +106,7 @@ ndlCreateNetwork=[
|
|||
LSTMoutputW1 = Times(W1, LSTMoutput3)
|
||||
|
||||
cr = CrossEntropyWithSoftmax(labels, LSTMoutputW1,tag=Criteria)
|
||||
Err = ErrorPrediction(labels,LSTMoutputW1,tag=Eval)
|
||||
Err = ClassificationError(labels,LSTMoutputW1,tag=Eval)
|
||||
|
||||
logPrior = LogPrior(labels)
|
||||
ScaledLogLikelihood=Minus(LSTMoutputW1,logPrior,tag=Output)
|
||||
|
|
|
@ -142,7 +142,7 @@ ndlCreateNetwork_LSTMP_c1024_p256_x3=[
|
|||
#LSTMoutputW = Plus(Times(W, LSTMoutput3), b);
|
||||
|
||||
cr = CrossEntropyWithSoftmax(labels, LSTMoutputW,tag=Criteria);
|
||||
Err = ErrorPrediction(labels,LSTMoutputW,tag=Eval);
|
||||
Err = ClassificationError(labels,LSTMoutputW,tag=Eval);
|
||||
|
||||
logPrior = LogPrior(labels)
|
||||
ScaledLogLikelihood=Minus(LSTMoutputW,logPrior,tag=Output)
|
||||
|
|
|
@ -184,7 +184,7 @@ ndlCreateNetwork_LSTMP_c1024_p256_x3=[
|
|||
|
||||
|
||||
cr = CrossEntropyWithSoftmax(labels, LSTMoutputW,tag=Criteria);
|
||||
Err = ErrorPrediction(labels,LSTMoutputW,tag=Eval);
|
||||
Err = ClassificationError(labels,LSTMoutputW,tag=Eval);
|
||||
|
||||
logPrior = LogPrior(labels)
|
||||
ScaledLogLikelihood=Minus(LSTMoutputW,logPrior,tag=Output)
|
||||
|
|
|
@ -182,7 +182,7 @@ ndlCreateNetwork_LSTMP_c1024_p256_x3=[
|
|||
|
||||
|
||||
cr = CrossEntropyWithSoftmax(labels, LSTMoutputW,tag=Criteria);
|
||||
Err = ErrorPrediction(labels,LSTMoutputW,tag=Eval);
|
||||
Err = ClassificationError(labels,LSTMoutputW,tag=Eval);
|
||||
|
||||
logPrior = LogPrior(labels)
|
||||
ScaledLogLikelihood=Minus(LSTMoutputW,logPrior,tag=Output)
|
||||
|
|
|
@ -111,7 +111,7 @@ ndlCreateNetwork_LSTMP_c1024_p256_x3=[
|
|||
LSTMoutputW = Plus(Times(W, LSTMoutput3), b);
|
||||
|
||||
cr = CrossEntropyWithSoftmax(labels, LSTMoutputW,tag=Criteria);
|
||||
Err = ErrorPrediction(labels,LSTMoutputW,tag=Eval);
|
||||
Err = ClassificationError(labels,LSTMoutputW,tag=Eval);
|
||||
|
||||
logPrior = LogPrior(labels)
|
||||
ScaledLogLikelihood=Minus(LSTMoutputW,logPrior,tag=Output)
|
||||
|
|
|
@ -112,7 +112,7 @@ ndlCreateNetwork_LSTMP_c1024_p256_x3=[
|
|||
|
||||
|
||||
cr = CrossEntropyWithSoftmax(labels, LSTMoutputW,tag=Criteria);
|
||||
Err = ErrorPrediction(labels,LSTMoutputW,tag=Eval);
|
||||
Err = ClassificationError(labels,LSTMoutputW,tag=Eval);
|
||||
|
||||
logPrior = LogPrior(labels)
|
||||
ScaledLogLikelihood=Minus(LSTMoutputW,logPrior,tag=Output)
|
||||
|
|
|
@ -187,7 +187,7 @@ ndlCreateNetwork_LSTMP_c1024_p256_x3=[
|
|||
|
||||
|
||||
cr = CrossEntropyWithSoftmax(labels, LSTMoutputW,tag=Criteria);
|
||||
Err = ErrorPrediction(labels,LSTMoutputW,tag=Eval);
|
||||
Err = ClassificationError(labels,LSTMoutputW,tag=Eval);
|
||||
|
||||
logPrior = LogPrior(labels)
|
||||
ScaledLogLikelihood=Minus(LSTMoutputW,logPrior,tag=Output)
|
||||
|
|
|
@ -116,7 +116,7 @@ ndlCreateNetwork_LSTMP_c1024_p256_x3=[
|
|||
LSTMoutputW = Plus(Times(W, LSTMoutput8), b);
|
||||
|
||||
cr = CrossEntropyWithSoftmax(labels, LSTMoutputW,tag=Criteria);
|
||||
Err = ErrorPrediction(labels,LSTMoutputW,tag=Eval);
|
||||
Err = ClassificationError(labels,LSTMoutputW,tag=Eval);
|
||||
|
||||
logPrior = LogPrior(labels)
|
||||
ScaledLogLikelihood=Minus(LSTMoutputW,logPrior,tag=Output)
|
||||
|
|
|
@ -27,7 +27,7 @@ TimitTrainSimple=[
|
|||
SimpleNetworkBuilder=[
|
||||
layerSizes=792:512*3:183
|
||||
trainingCriterion=CrossEntropyWithSoftmax
|
||||
evalCriterion=ErrorPrediction
|
||||
evalCriterion=ClassificationError
|
||||
layerTypes=Sigmoid
|
||||
initValueScale=1.0
|
||||
applyMeanVarNorm=true
|
||||
|
|
|
@ -163,7 +163,7 @@ ndlCreateNetwork=[
|
|||
criterion2 = CrossEntropyWithSoftmax(statelabels, DNN_B_CE_BFF)
|
||||
criterion = Plus(Scale(cr2,criterion2), Scale(cr1,criterion1), tag="criterion")
|
||||
|
||||
Err = ErrorPrediction(labels,DNN_A_CE_BFF,tag="evaluation")
|
||||
Err = ClassificationError(labels,DNN_A_CE_BFF,tag="evaluation")
|
||||
|
||||
logPrior = LogPrior(labels)
|
||||
|
||||
|
|
|
@ -18,7 +18,7 @@ TIMIT_TrainAdaptLR=[
|
|||
SimpleNetworkBuilder=[
|
||||
layerSizes=792:512*3:183
|
||||
trainingCriterion=CrossEntropyWithSoftmax
|
||||
evalCriterion=ErrorPrediction
|
||||
evalCriterion=ClassificationError
|
||||
layerTypes=Sigmoid
|
||||
initValueScale=1.0
|
||||
applyMeanVarNorm=true
|
||||
|
|
|
@ -24,7 +24,7 @@ TIMIT_TrainSimple=[
|
|||
SimpleNetworkBuilder=[
|
||||
layerSizes=792:512*3:183
|
||||
trainingCriterion=CrossEntropyWithSoftmax
|
||||
evalCriterion=ErrorPrediction
|
||||
evalCriterion=ClassificationError
|
||||
layerTypes=Sigmoid
|
||||
initValueScale=1.0
|
||||
applyMeanVarNorm=true
|
||||
|
|
|
@ -33,7 +33,7 @@ ndlCreateNetwork=[
|
|||
L2 = SBFF(L1,hiddenDim,hiddenDim)
|
||||
L3 = SBFF(L2,hiddenDim,hiddenDim)
|
||||
CE = SMBFF(L3,labelDim,hiddenDim,myLabels,tag="criterion")
|
||||
Err = ErrorPrediction(myLabels,CE.BFF.FF.P,tag="evaluation")
|
||||
Err = ClassificationError(myLabels,CE.BFF.FF.P,tag="evaluation")
|
||||
|
||||
# define output (scaled loglikelihood)
|
||||
logPrior = LogPrior(myLabels)
|
||||
|
|
|
@ -31,7 +31,7 @@ ndlCreateNetwork=[
|
|||
featNorm = MeanVarNorm(features)
|
||||
L1 = SBFF(featNorm,hiddenDim,featDim)
|
||||
CE = SMBFF(L1,labelDim,hiddenDim,labels,tag="criterion")
|
||||
Err = ErrorPrediction(labels,CE.BFF.FF.P,tag="evaluation")
|
||||
Err = ClassificationError(labels,CE.BFF.FF.P,tag="evaluation")
|
||||
|
||||
# define output (scaled loglikelihood)
|
||||
logPrior = LogPrior(labels)
|
||||
|
|
|
@ -102,7 +102,7 @@ ndlCreateNetwork=[
|
|||
LSTMoutputW1 = Times(W1, LSTMoutput)
|
||||
|
||||
cr = CrossEntropyWithSoftmax(labels, LSTMoutputW1,tag="criterion")
|
||||
Err = ErrorPrediction(labels,LSTMoutputW1,tag="evaluation")
|
||||
Err = ClassificationError(labels,LSTMoutputW1,tag="evaluation")
|
||||
|
||||
logPrior = LogPrior(labels)
|
||||
ScaledLogLikelihood=Minus(LSTMoutputW1,logPrior,tag="output")
|
||||
|
|
|
@ -51,7 +51,7 @@ L1 = SBFF2(featInput1, HiddenDim, FeatDim1, featInput2, FeatDim2)
|
|||
L2 = SBFF(L1, HiddenDim, HiddenDim)
|
||||
L3 = SBFF(L2, HiddenDim, HiddenDim)
|
||||
CE = SMBFF(L3, LabelDim1, HiddenDim, labels,tag="criterion") # do I need a tag?
|
||||
FER = ErrorPrediction(labels,CE.BFF.FF.P,tag="evaluation")
|
||||
FER = ClassificationError(labels,CE.BFF.FF.P,tag="evaluation")
|
||||
|
||||
# outputNodes
|
||||
Prior=Mean(labels)
|
||||
|
|
|
@ -41,12 +41,12 @@ L3 = SBFF(L2, HiddenDim, HiddenDim2)
|
|||
# objective function 1
|
||||
BFF1=BFF(L3,LabelDim1,HiddenDim)
|
||||
CE1=CrossEntropyWithSoftmax(labels,BFF1.FF.P,tag="evaluation")
|
||||
FER1 = ErrorPrediction(labels,BFF1.FF.P,tag="evaluation")
|
||||
FER1 = ClassificationError(labels,BFF1.FF.P,tag="evaluation")
|
||||
|
||||
# objective function 2
|
||||
BFF2=BFF(L3,LabelDim2,HiddenDim)
|
||||
CE2=CrossEntropyWithSoftmax(regions,BFF2.FF.P,tag="evaluation")
|
||||
FER2 = ErrorPrediction(regions,BFF2.FF.P,tag="evaluation")
|
||||
FER2 = ClassificationError(regions,BFF2.FF.P,tag="evaluation")
|
||||
|
||||
# weighted final objective function
|
||||
Alpha1=0.8
|
||||
|
|
|
@ -60,7 +60,7 @@ Train = [
|
|||
outputs = W * LSTMoutput + b
|
||||
|
||||
cr = CrossEntropyWithSoftmax(labels, outputs)
|
||||
errs = ErrorPrediction(labels, outputs)
|
||||
errs = ClassificationError(labels, outputs)
|
||||
|
||||
criterionNodes = (cr)
|
||||
evaluationNodes = (errs)
|
||||
|
@ -191,7 +191,7 @@ Test = [
|
|||
labels = Input($labelCount$, tag = "label")
|
||||
modelAsTrained = BS.Network.Load ("$modelPath$")
|
||||
final = Hardmax(modelAsTrained.outputs)
|
||||
errorRate = ErrorPrediction(labels, final, tag='evaluation')
|
||||
errorRate = ClassificationError(labels, final, tag='evaluation')
|
||||
]
|
||||
|
||||
evalNodeNames = errorRate
|
||||
|
|
|
@ -1,169 +0,0 @@
|
|||
# The configuration file to build language understanding model with ATIS corpus.
|
||||
# An LSTM model is built to tag each word in sentences with its semantic label.
|
||||
|
||||
makeMode = false ; traceLevel = 1 ; deviceId = -1
|
||||
|
||||
WorkDir = Work
|
||||
DataDir = Data
|
||||
|
||||
modelPath = $WorkDir$/slot.model
|
||||
parallelTrain = true
|
||||
|
||||
#stderr = $WorkDir$/log
|
||||
|
||||
command = TrainATIS:RunATIS:EvalATIS
|
||||
|
||||
vocabSize = 943 # number of words
|
||||
numLabels = 129 # number of slot labels
|
||||
numIntents = 26 # number of intent labels
|
||||
|
||||
# The command to train the LSTM model
|
||||
TrainATIS = [
|
||||
action = "train"
|
||||
BrainScriptNetworkBuilder = [
|
||||
inputDim = $vocabSize$
|
||||
labelDim = $numLabels$
|
||||
embDim = 150
|
||||
hiddenDim = 300
|
||||
#hiddenDim = 150
|
||||
|
||||
model = Sequential (
|
||||
Parallel ((DelayLayer{T=1} : Identity : DelayLayer{T=-1}), Splice) : # 3-word window
|
||||
EmbeddingLayer {embDim} : # embedding
|
||||
RecurrentLSTMLayer {hiddenDim} : # LSTM
|
||||
#Parallel ((RecurrentLSTMLayer {hiddenDim} : RecurrentLSTMLayer {hiddenDim, goBackwards=true}), Splice) : # bidirectional LSTM
|
||||
#Parallel ((RecurrentLSTMLayer {hiddenDim} : RecurrentLSTMLayer {hiddenDim, goBackwards=true}), Splice) : # bidirectional LSTM
|
||||
DenseLayer {labelDim, initValueScale=7} # output layer
|
||||
)
|
||||
|
||||
# features
|
||||
query = Input {inputDim}
|
||||
slotLabels = Input {labelDim}
|
||||
|
||||
# model application
|
||||
z = model (query)
|
||||
|
||||
# loss and metric
|
||||
ce = CrossEntropyWithSoftmax (slotLabels, z)
|
||||
errs = ErrorPrediction (slotLabels, z)
|
||||
|
||||
featureNodes = (query)
|
||||
labelNodes = (slotLabels)
|
||||
criterionNodes = (ce)
|
||||
evaluationNodes = (errs)
|
||||
outputNodes = (z)
|
||||
]
|
||||
# rename this to BrainScriptNetworkBuilder to switch to intent-classification task
|
||||
Intent_BrainScriptNetworkBuilder = [
|
||||
inputDim = $vocabSize$
|
||||
labelDim = $numIntents$
|
||||
embDim = 150
|
||||
#hiddenDim = 300
|
||||
hiddenDim = 150
|
||||
|
||||
model = Sequential (
|
||||
Parallel ((DelayLayer{T=1} : Identity : DelayLayer{T=-1}), Splice) : # 3-word window
|
||||
EmbeddingLayer {embDim} : # embedding
|
||||
RecurrentLSTMLayer {hiddenDim} : BS.Sequences.Last : # LSTM state, final state
|
||||
#Parallel ((Sequential (RecurrentLSTMLayer {hiddenDim} : BS.Sequences.Last):
|
||||
Sequential (RecurrentLSTMLayer {hiddenDim, goBackwards=true} : BS.Sequences.First)), Splice) : # bidirectional LSTM
|
||||
DenseLayer {labelDim, initValueScale=7} # output layer
|
||||
)
|
||||
|
||||
# features
|
||||
t = DynamicAxis{}
|
||||
query = Input {inputDim, dynamicAxis=t}
|
||||
intentLabels = Input {labelDim}
|
||||
|
||||
# model application
|
||||
z = model (query)
|
||||
|
||||
# loss and metric
|
||||
ce = CrossEntropyWithSoftmax (intentLabels, z)
|
||||
errs = ErrorPrediction (intentLabels, z)
|
||||
|
||||
featureNodes = (query)
|
||||
labelNodes = (intentLabels)
|
||||
criterionNodes = (ce)
|
||||
evaluationNodes = (errs)
|
||||
outputNodes = (z)
|
||||
]
|
||||
|
||||
SGD = [
|
||||
maxEpochs = 20 ; epochSize = 36000
|
||||
|
||||
minibatchSize = 70
|
||||
|
||||
learningRatesPerSample = 0.01*2:0.005*12:0.001
|
||||
|
||||
gradUpdateType = "FSAdaGrad"
|
||||
|
||||
gradientClippingWithTruncation = true ; clippingThresholdPerSample = 15.0
|
||||
|
||||
# number of minibatches to report progress
|
||||
firstMBsToShowResult = 10 ; numMBsToShowResult = 100
|
||||
|
||||
parallelTrain = [
|
||||
parallelizationMethod = "DataParallelSGD"
|
||||
parallelizationStartEpoch = 2
|
||||
distributedMBReading = true
|
||||
dataParallelSGD = [
|
||||
gradientBits = 1
|
||||
]
|
||||
]
|
||||
]
|
||||
|
||||
reader = [
|
||||
readerType = "CNTKTextFormatReader"
|
||||
file = "$DataDir$/atis.train.ctf"
|
||||
randomize = true
|
||||
input = [
|
||||
query = [ alias = "S0" ; dim = $vocabSize$ ; format = "sparse" ]
|
||||
intentLabels = [ alias = "S1" ; dim = $numIntents$ ; format = "sparse" ]
|
||||
slotLabels = [ alias = "S2" ; dim = $numLabels$ ; format = "sparse" ]
|
||||
]
|
||||
]
|
||||
]
|
||||
|
||||
# Run the model to predict slot labels
|
||||
RunATIS = [
|
||||
action = "write"
|
||||
BrainScriptNetworkBuilder = [
|
||||
modelAsTrained = BS.Network.Load ("$modelPath$")
|
||||
final = Hardmax (modelAsTrained.z) # make a decision
|
||||
#labels = Pass (modelAsTrained.slotLabels)
|
||||
# enable this for intent classification:
|
||||
labels = Pass (modelAsTrained.intentLabels)
|
||||
t = DynamicAxis()
|
||||
]
|
||||
|
||||
outputPath = $WorkDir$/model.writeaction
|
||||
outputNodeNames = intentLabels:slotLabels:final
|
||||
|
||||
reader = [
|
||||
readerType = "CNTKTextFormatReader"
|
||||
file = "$DataDir$/atis.test.ctf"
|
||||
randomize = false
|
||||
input = [
|
||||
query = [ alias = "S0" ; dim = $vocabSize$ ; format = "sparse" ]
|
||||
intentLabels = [ alias = "S1" ; dim = $numIntents$ ; format = "sparse" ]
|
||||
slotLabels = [ alias = "S2" ; dim = $numLabels$ ; format = "sparse" ]
|
||||
]
|
||||
]
|
||||
]
|
||||
|
||||
# Evaluate the model's slot-tagging accuracy (as an error count)
|
||||
EvalATIS = [
|
||||
action = "eval"
|
||||
modelPath = $modelPath$ # from outside
|
||||
reader = [
|
||||
readerType = "CNTKTextFormatReader"
|
||||
file = "$DataDir$/atis.test.ctf"
|
||||
randomize = false
|
||||
input = [
|
||||
query = [ alias = "S0" ; dim = $vocabSize$ ; format = "sparse" ]
|
||||
intentLabels = [ alias = "S1" ; dim = $numIntents$ ; format = "sparse" ]
|
||||
slotLabels = [ alias = "S2" ; dim = $numLabels$ ; format = "sparse" ]
|
||||
]
|
||||
]
|
||||
]
|
|
@ -34,7 +34,7 @@ Train = [
|
|||
z = w * features + b
|
||||
|
||||
ce = CrossEntropyWithSoftmax (labels, z)
|
||||
errs = ErrorPrediction (labels, z)
|
||||
errs = ClassificationError (labels, z)
|
||||
|
||||
# root nodes
|
||||
featureNodes = (features)
|
||||
|
|
Makefile | 5
|
@ -713,6 +713,9 @@ endif
|
|||
########################################
|
||||
|
||||
ifdef OPENCV_PATH
|
||||
ifdef BOOST_PATH
|
||||
|
||||
INCLUDEPATH += $(BOOST_PATH)/include
|
||||
|
||||
IMAGE_READER_LIBS += -lopencv_core -lopencv_imgproc -lopencv_imgcodecs
|
||||
|
||||
|
@ -743,6 +746,7 @@ $(IMAGEREADER): $(IMAGEREADER_OBJ) | $(CNTKMATH_LIB)
|
|||
@echo $(SEPARATOR)
|
||||
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ -l$(CNTKMATH) $(IMAGE_READER_LIBS)
|
||||
endif
|
||||
endif
|
||||
|
||||
########################################
|
||||
# 1bit SGD setup
|
||||
|
@ -841,7 +845,6 @@ UNITTEST_READER_SRC = \
|
|||
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/HTKLMFReaderTests.cpp \
|
||||
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/ImageReaderTests.cpp \
|
||||
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/ReaderLibTests.cpp \
|
||||
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/UCIFastReaderTests.cpp \
|
||||
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/stdafx.cpp \
|
||||
$(SOURCEDIR)/Readers/CNTKTextFormatReader/Indexer.cpp \
|
||||
$(SOURCEDIR)/Readers/CNTKTextFormatReader/TextParser.cpp \
|
||||
|
|
|
@ -39,6 +39,8 @@ void DoTrain(const ConfigRecordType& config);
|
|||
template <typename ElemType>
|
||||
void DoAdapt(const ConfigParameters& config);
|
||||
template <typename ElemType>
|
||||
void DoDumpNodes(const ConfigParameters& config);
|
||||
template <typename ElemType>
|
||||
void DoEdit(const ConfigParameters& config);
|
||||
|
||||
// evaluation (EvalActions.cpp)
|
||||
|
|
|
@ -154,6 +154,13 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
|
|||
m_net->InitLearnableParameters(nodePtr, L"uniform", initValueScale, forcedRandomSeed < 0 ? randomSeed++ : (unsigned long)forcedRandomSeed, initOnCPUOnly);
|
||||
else if (EqualCI(initString, L"gaussian"))
|
||||
m_net->InitLearnableParameters(nodePtr, L"gaussian", initValueScale, forcedRandomSeed < 0 ? randomSeed++ : (unsigned long)forcedRandomSeed, initOnCPUOnly);
|
||||
else if (EqualCI(initString, L"bilinear"))
|
||||
{
|
||||
const size_t kernelWidth = node->GetOptionalParameter("kernelWidth", "0");
|
||||
const size_t kernelHeight = node->GetOptionalParameter("kernelHeight", "0");
|
||||
assert(kernelWidth > 0 && kernelHeight > 0);
|
||||
m_net->InitLearnableParametersWithBilinearFill<ElemType>(nodePtr, kernelWidth, kernelHeight);
|
||||
}
|
||||
else if (EqualCI(initString, L"fromFile"))
|
||||
{
|
||||
std::string initFromFilePath = node->GetOptionalParameter("initFromFilePath", "");
|
||||
|
|
|
@ -158,6 +158,7 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable)
|
|||
else if (EqualInsensitive(nodeType, OperationNameOf(CRFNode), L"CRF")) ret = true;
|
||||
#endif
|
||||
else if (EqualInsensitive(nodeType, OperationNameOf(ClassBasedCrossEntropyWithSoftmaxNode), L"CBCEWithSM")) ret = true;
|
||||
else if (EqualInsensitive(nodeType, OperationNameOf(ClassificationErrorNode), L"ErrorPrediction")) ret = true;
|
||||
else if (EqualInsensitive(nodeType, OperationNameOf(EqualNode))) ret = true;
|
||||
else if (EqualInsensitive(nodeType, OperationNameOf(GreaterEqualNode))) ret = true;
|
||||
else if (EqualInsensitive(nodeType, OperationNameOf(GreaterNode))) ret = true;
|
||||
|
@ -177,7 +178,6 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable)
|
|||
else if (EqualInsensitive(nodeType, OperationNameOf(DropoutNode))) ret = true;
|
||||
else if (EqualInsensitive(nodeType, OperationNameOf(DummyCriterionNode), L"DummyCriterion")) ret = true;
|
||||
else if (EqualInsensitive(nodeType, OperationNameOf(ElementTimesNode))) ret = true;
|
||||
else if (EqualInsensitive(nodeType, OperationNameOf(ErrorPredictionNode), L"ClassificationError")) ret = true;
|
||||
else if (EqualInsensitive(nodeType, OperationNameOf(ExpNode))) ret = true;
|
||||
else if (EqualInsensitive(nodeType, OperationNameOf(FloorNode))) ret = true;
|
||||
else if (EqualInsensitive(nodeType, OperationNameOf(FutureValueNode))) ret = true;
|
||||
|
|
|
@ -1751,10 +1751,10 @@ shared_ptr<ComputationNode<ElemType>> SimpleNetworkBuilder<ElemType>::AddTrainAn
|
|||
// output = builder.SquareError(label, tinput, (evalNodeName == L"")?L"EvalSquareError":evalNodeName);
|
||||
output = builder.Logistic(label, tinput, (evalNodeName == L"") ? L"Logistic" : evalNodeName);
|
||||
break;
|
||||
case EvalCriterion::ErrorPrediction:
|
||||
case EvalCriterion::ClassificationError:
|
||||
if (matrix != nullptr && tinput == input)
|
||||
tinput = builder.Times(matrix, input);
|
||||
output = builder.ErrorPrediction(label, tinput, (evalNodeName == L"") ? L"EvalErrorPrediction" : evalNodeName);
|
||||
output = builder.ClassificationError(label, tinput, (evalNodeName == L"") ? L"EvalClassificationError" : evalNodeName);
|
||||
break;
|
||||
#ifdef COMING_SOON
|
||||
case EvalCriterion::CRF:
|
||||
|
@ -1785,23 +1785,26 @@ template class SimpleNetworkBuilder<double>;
|
|||
TrainingCriterion ParseTrainingCriterionString(wstring s)
|
||||
{
|
||||
if (EqualCI(s, L"crossEntropyWithSoftmax")) return TrainingCriterion::CrossEntropyWithSoftmax;
|
||||
else if (EqualCI(s, L"sequenceWithSoftmax")) return TrainingCriterion::SequenceWithSoftmax;
|
||||
else if (EqualCI(s, L"squareError")) return TrainingCriterion::SquareError;
|
||||
else if (EqualCI(s, L"logistic")) return TrainingCriterion::Logistic;
|
||||
else if (EqualCI(s, L"noiseContrastiveEstimation")) return TrainingCriterion::NCECrossEntropyWithSoftmax;
|
||||
// legacy/deprecated
|
||||
else if (EqualCI(s, L"classCrossEntropyWithSoftmax")) return TrainingCriterion::ClassCrossEntropyWithSoftmax;
|
||||
else if (EqualCI(s, L"sequenceWithSoftmax")) return TrainingCriterion::SequenceWithSoftmax;
|
||||
else LogicError("trainingCriterion: Invalid trainingCriterion value. Valid values are (crossEntropyWithSoftmax | squareError | logistic | classCrossEntropyWithSoftmax| sequenceWithSoftmax)");
|
||||
}
|
||||
|
||||
EvalCriterion ParseEvalCriterionString(wstring s)
|
||||
{
|
||||
if (EqualCI(s, L"errorPrediction")) return EvalCriterion::ErrorPrediction;
|
||||
if (EqualCI(s, L"classificationError")) return EvalCriterion::ClassificationError;
|
||||
else if (EqualCI(s, L"crossEntropyWithSoftmax")) return EvalCriterion::CrossEntropyWithSoftmax;
|
||||
else if (EqualCI(s, L"sequenceWithSoftmax")) return EvalCriterion::SequenceWithSoftmax;
|
||||
else if (EqualCI(s, L"classCrossEntropyWithSoftmax")) return EvalCriterion::ClassCrossEntropyWithSoftmax;
|
||||
else if (EqualCI(s, L"logistic")) return EvalCriterion::Logistic;
|
||||
else if (EqualCI(s, L"noiseContrastiveEstimation")) return EvalCriterion::NCECrossEntropyWithSoftmax;
|
||||
else if (EqualCI(s, L"squareError")) return EvalCriterion::SquareError;
|
||||
// legacy/deprecated
|
||||
else if (EqualCI(s, L"classCrossEntropyWithSoftmax")) return EvalCriterion::ClassCrossEntropyWithSoftmax;
|
||||
else if (EqualCI(s, L"sequenceWithSoftmax")) return EvalCriterion::SequenceWithSoftmax;
|
||||
else if (EqualCI(s, L"errorPrediction")) return EvalCriterion::ClassificationError;
|
||||
else LogicError("evalCriterion: Invalid trainingCriterion value. Valid values are (errorPrediction | crossEntropyWithSoftmax | squareError | logistic | sequenceWithSoftmax)");
|
||||
}
|
||||
|
||||
|
|
|
@ -66,7 +66,7 @@ enum class EvalCriterion : int
|
|||
CrossEntropy,
|
||||
SquareError,
|
||||
Logistic,
|
||||
ErrorPrediction,
|
||||
ClassificationError,
|
||||
ClassCrossEntropyWithSoftmax,
|
||||
NCECrossEntropyWithSoftmax,
|
||||
CRF,
|
||||
|
|
|
@ -191,6 +191,30 @@ void DoAdapt(const ConfigParameters& config)
|
|||
template void DoAdapt<float>(const ConfigParameters& config);
|
||||
template void DoAdapt<double>(const ConfigParameters& config);
|
||||
|
||||
// ===========================================================================
|
||||
// DoDumpNodes() - implements CNTK "dumpNode" command
|
||||
// ===========================================================================
|
||||
|
||||
template <typename ElemType>
|
||||
void DoDumpNodes(const ConfigParameters& config)
|
||||
{
|
||||
wstring modelPath = config(L"modelPath");
|
||||
wstring nodeName = config(L"nodeName", L"__AllNodes__");
|
||||
wstring nodeNameRegexStr = config(L"nodeNameRegex", L"");
|
||||
wstring defOutFilePath = modelPath + L"." + nodeName + L".txt";
|
||||
wstring outputFile = config(L"outputFile", defOutFilePath);
|
||||
bool printValues = config(L"printValues", true);
|
||||
bool printMetadata = config(L"printMetadata", true);
|
||||
if (!printValues && !printMetadata)
|
||||
InvalidArgument("printValues and printMetadata: Since both are set to false, there will be nothing to dump");
|
||||
|
||||
ComputationNetworkPtr net = ComputationNetwork::CreateFromFile<ElemType>(CPUDEVICE, modelPath);
|
||||
net->DumpNodeInfoToFile(nodeName, printValues, printMetadata, outputFile, nodeNameRegexStr);
|
||||
}
|
||||
|
||||
template void DoDumpNodes<float>(const ConfigParameters& config);
|
||||
template void DoDumpNodes<double>(const ConfigParameters& config);
|
||||
|
||||
// ===========================================================================
|
||||
// DoEdit() - implements CNTK "edit" command
|
||||
// ===========================================================================
|
||||
|
|
|
@ -277,7 +277,7 @@ static ConfigValuePtr NodeOp(const ExpressionPtr &e, ConfigValuePtr leftVal, Con
|
|||
if (rightVal.Is<Double>()) // ComputeNode * scalar
|
||||
swap(leftVal, rightVal); // -> scalar * ComputeNode
|
||||
if (leftVal.Is<Double>())
|
||||
operationName = L"Scale"; // scalar * ComputeNode
|
||||
operationName = L"ElementTimes"; // scalar * ComputeNode
|
||||
else
|
||||
operationName = L"Times"; // ComputeNode * ComputeNode (matrix produt)
|
||||
}
|
||||
|
@ -305,6 +305,8 @@ static ConfigValuePtr NodeOp(const ExpressionPtr &e, ConfigValuePtr leftVal, Con
|
|||
config->Add(L"operation", MakeFailFn(e->location), ConfigValuePtr(make_shared<String>(operationName), MakeFailFn(e->location), exprPath));
|
||||
let leftFailFn = leftVal.GetFailFn(); // report any error for this Constant object as belonging to the scalar factor's expression
|
||||
vector<ConfigValuePtr> inputs;
|
||||
#if 0 // BUGBUG: rows,cols is no longer right, we need a TensorShape here
|
||||
// TODO: Solve this by directly constructing Constant() off a 'double' input in the ComputationNode constructor.
|
||||
if (operationName == L"Scale")
|
||||
{
|
||||
// if we scale, the first operand is a Double, and we must convert that into a 1x1 Constant
|
||||
|
@ -314,7 +316,7 @@ static ConfigValuePtr NodeOp(const ExpressionPtr &e, ConfigValuePtr leftVal, Con
|
|||
let one = MakePrimitiveConfigValuePtr(1.0, leftFailFn, exprPath);
|
||||
constantConfig->Add(L"rows", leftFailFn, one);
|
||||
constantConfig->Add(L"cols", leftFailFn, one);
|
||||
//constantConfig->Add(L"shape", leftFailFn, one); // BUGBUG: rows,cols is no longer right, we need a TensorShape here
|
||||
//constantConfig->Add(L"shape", leftFailFn, one);
|
||||
constantConfig->Add(L"value", leftFailFn, leftVal);
|
||||
constantConfig->Add(L"learningRateMultiplier", leftFailFn, MakePrimitiveConfigValuePtr(0.0f, leftFailFn, exprPath));
|
||||
let value = ConfigValuePtr(rtInfo->construct(constantConfig), leftFailFn, exprPath);
|
||||
|
@ -323,6 +325,7 @@ static ConfigValuePtr NodeOp(const ExpressionPtr &e, ConfigValuePtr leftVal, Con
|
|||
valueWithName->SetName(value.GetExpressionName());
|
||||
leftVal = value; // and that's our actual left value
|
||||
}
|
||||
#endif
|
||||
inputs.push_back(leftVal);
|
||||
if (operationName != L"Negate") // Negate only has one input (rightVal is a nullptr)
|
||||
inputs.push_back(rightVal);
|
||||
|
@ -332,6 +335,8 @@ static ConfigValuePtr NodeOp(const ExpressionPtr &e, ConfigValuePtr leftVal, Con
|
|||
{
|
||||
let one = MakePrimitiveConfigValuePtr(1.0, leftFailFn, exprPath);
|
||||
config->Add(L"outputRank", leftFailFn, one);
|
||||
let minusOne = MakePrimitiveConfigValuePtr(-1.0, leftFailFn, exprPath);
|
||||
config->Add(L"inferInputRankToMap", leftFailFn, minusOne);
|
||||
}
|
||||
// instantiate the ComputationNode
|
||||
let value = ConfigValuePtr(rtInfo->construct(config), MakeFailFn(e->location), exprPath);
|
||||
|
|
|
@ -30,27 +30,38 @@
|
|||
|
||||
# LinearLayer -- create a fully-connected linear projection layer
|
||||
# Note: outDim may describe a tensor as well.
|
||||
LinearLayer {outDim, bias = true, init='uniform', initValueScale=1} =
|
||||
LinearLayer {outDim, bias = true, init='heNormal', initValueScale=1, inputRank=None, mapRank=None} =
|
||||
{
|
||||
W = ParameterTensor {_ConcatArrays (outDim, 0), init=init, initValueScale=initValueScale}
|
||||
# inputRank given: number of zeroes to add to W (mapRank must not be given)
|
||||
# mapRank given: expand W to leave exactly mapRank axes (inputRank must not be given)
|
||||
# none given: expand W to all (same as mapRank=0)
|
||||
inputShape =
|
||||
if BS.Constants.IsNone (inputRank) then Inferred # not given: one Inferred, which will get expanded
|
||||
else if !BS.Constants.IsNone (mapRank) then Fail ("'inputRank' and 'mapRank' cannot be specified at the same time.")
|
||||
else Repeat (inputRank, Inferred)
|
||||
W = ParameterTensor {_ConcatArrays (outDim, inputShape), init=init, initValueScale=initValueScale}
|
||||
b = ParameterTensor {outDim, initValue=0}
|
||||
outRank = Length (_AsArray (outDim)) # support outputs with tensor layouts
|
||||
outputRank = Length (_AsArray (outDim)) # support outputs with tensor layouts
|
||||
inferInputRankToMap =
|
||||
if !BS.Constants.IsNone (inputRank) then -1 # means not specified
|
||||
else if BS.Constants.IsNone (mapRank) then 0 # default to 'use all input dims'
|
||||
else mapRank
|
||||
apply (x) =
|
||||
if bias
|
||||
then Times (W, x, outputRank = outRank) + b
|
||||
else Times (W, x, outputRank = outRank)
|
||||
then Times (W, x, outputRank=outputRank, inferInputRankToMap=inferInputRankToMap) + b
|
||||
else Times (W, x, outputRank=outputRank, inferInputRankToMap=inferInputRankToMap)
|
||||
}.apply
|
||||
|
||||
# DenseLayer -- create a fully-connected layer with optional non-linearity
|
||||
DenseLayer{outDim, bias = true, activation=(x=>x), init='uniform', initValueScale=1} = Sequential ( LinearLayer{outDim, bias = bias, init = init, initValueScale = initValueScale} : activation )
|
||||
DenseLayer{outDim, bias = true, activation=(x=>x), init='heNormal', initValueScale=1, inputRank=None, mapRank=None} = Sequential ( LinearLayer{outDim, bias=bias, init=init, initValueScale=initValueScale, inputRank=inputRank, mapRank=mapRank} : activation )
|
||||
|
||||
# EmbeddingLayer -- create a linear embedding layer
|
||||
EmbeddingLayer {outDim, # dimension of embedding
|
||||
embeddingPath = '', transpose = false} = # load a fixed embedding from a path instead
|
||||
{
|
||||
shape = if transpose then (0 : outDim) else (outDim : 0)
|
||||
shape = if transpose then (Inferred : outDim) else (outDim : Inferred)
|
||||
E = if embeddingPath == ''
|
||||
then ParameterTensor {shape, init='uniform'} # learnable
|
||||
then ParameterTensor {shape, init='heNormal'} # learnable
|
||||
else ParameterTensor {shape, initFromFilePath = embeddingPath, learningRateMultiplier = 0} # fixed from file
|
||||
TimesOp = if transpose then TransposeTimes else Times
|
||||
apply (x) = TimesOp (E, x) # x is expected to be sparse one-hot
|
||||
|
@ -65,7 +76,7 @@ ConvolutionalLayer {numOutputChannels, # e.g. (1) or BS.Constants.None
|
|||
filterShape, # e.g. (3:3)
|
||||
bias = true,
|
||||
activation = (x=>x),
|
||||
init = "uniform",
|
||||
init = "heNormal",
|
||||
initValueScale = 1, # TODO: rename to initScale
|
||||
#reductionRank = 1, # TODO: support this
|
||||
stride = 1, pad = false,
|
||||
|
@ -77,9 +88,9 @@ ConvolutionalLayer {numOutputChannels, # e.g. (1) or BS.Constants.None
|
|||
outputChannelsShape = _AsArray (numOutputChannels)
|
||||
outputRank = Length (outputChannelsShape)
|
||||
filterRank = Length (filterShape)
|
||||
kernelShape = _ConcatArrays (filterShape, Repeat (reductionRank, 0)) # kernel := filter plus reductionDims
|
||||
#W = ParameterTensor{_ConcatArrays ( kernelShape, outputChannelsShape), init = init, initValueScale = initValueScale} # [ W x H x C x K ]
|
||||
W = ParameterTensor{(outputChannelsShape:0), init = init, initValueScale = initValueScale} # old-style for backwards-compatible random initialization
|
||||
kernelShape = _ConcatArrays (filterShape, Repeat (reductionRank, Inferred)) # kernel := filter plus reductionDims
|
||||
#W = ParameterTensor{_ConcatArrays ( kernelShape, outputChannelsShape), init = init, initValueScale = initValueScale, initOutputRank = -1} # [ W x H x C x K ]
|
||||
W = ParameterTensor{(outputChannelsShape : Inferred), init = init, initValueScale = initValueScale} # old-style for backwards-compatible random initialization
|
||||
b = ParameterTensor(_ConcatArrays (Repeat (Length (filterShape), 1), outputChannelsShape), initValue = 0) # [ 1 x 1 x K ]
|
||||
sharing = true # TODO: support this
|
||||
transpose = false # TODO: support this
|
||||
|
@ -106,10 +117,11 @@ AveragePoolingLayer {filterShape, stride = 1, pad = false, lowerPad = 0, upperPa
|
|||
RecurrentLSTMLayer {outputDim,
|
||||
cellShape = BS.Constants.None, # if set then use a projection
|
||||
goBackwards = false,
|
||||
init='heNormal', initValueScale=1,
|
||||
enableSelfStabilization = false} =
|
||||
{
|
||||
previousHook = if goBackwards then BS.RNNs.NextHC else BS.RNNs.PreviousHC
|
||||
lstm = BS.RNNs.LSTMBlock {outputDim, cellShape = cellShape, enableSelfStabilization = enableSelfStabilization}
|
||||
lstm = BS.RNNs.LSTMBlock {outputDim, cellShape = cellShape, enableSelfStabilization = enableSelfStabilization, init=init, initValueScale=initValueScale}
|
||||
apply (x) = {
|
||||
prevState = previousHook (lstmState) # recurrent memory. E.g. Previous or Next, with or without initial state, beam reordering etc.
|
||||
|
||||
|
@ -138,7 +150,7 @@ DelayLayer {T=1, defaultHiddenActivation=0} =
|
|||
# BatchNormalizationLayer -- create a batch-normalization layer
|
||||
BatchNormalizationLayer {spatialRank = 0, # reduce over these dims. E.g. 2 to reduce over (w,h) in a [W x H x C]-shaped input
|
||||
initialScale = 1,
|
||||
normalizationTimeConstant = 0, blendTimeConstant = 0,
|
||||
normalizationTimeConstant = 0, blendTimeConstant = 0, # TODO: normTimeConst should be INF, not 0
|
||||
epsilon = 0.00001, useCntkEngine = true} =
|
||||
{
|
||||
#normShape = _ConcatArrays (Repeat (spatialRank, 1), 0) # spatial dims get a dimension of 1 (broadcasting, while all others are inferred from input)
|
||||
|
@ -151,18 +163,16 @@ BatchNormalizationLayer {spatialRank = 0, # reduce over these dims. E.g. 2 to r
|
|||
}.apply
|
||||
|
||||
# LayerNormalizationLayer -- create a layer-normalization layer
|
||||
LayerNormalizationLayer {dim = BS.Constants.None, initScale = 1, initBias = 0} = if BS.Constants.IsNone (dim) then Fail ("LayerNormalizationLayer: 'dim' parameter is currently required.") else
|
||||
LayerNormalizationLayer {initScale = 1, initBias = 0} =
|
||||
{
|
||||
gain = ParameterTensor{(1), initValue = initScale}
|
||||
bias = ParameterTensor{(1), initValue = initBias}
|
||||
|
||||
apply (x) = {
|
||||
div = Constant (1.0 / dim)
|
||||
|
||||
# normalize w.r.t. actual sample statistics
|
||||
mean = div .* ReduceSum (x)
|
||||
mean = ReduceMean (x)
|
||||
x0 = x - mean;
|
||||
std = Sqrt (div .* ReduceSum (x0 .* x0))
|
||||
std = Sqrt (ReduceMean (x0 .* x0))
|
||||
xHat = ElementDivide (x0, std)
|
||||
|
||||
# denormalize with learned parameters
|
||||
|
@ -171,16 +181,22 @@ LayerNormalizationLayer {dim = BS.Constants.None, initScale = 1, initBias = 0} =
|
|||
}.apply
|
||||
|
||||
# StabilizerLayer -- create a scalar stabilizer [J. Droppo, 2014 -- TODO: get the reference]
|
||||
StabilizerLayer {} =
|
||||
StabilizerLayer{} =
|
||||
{
|
||||
# BUGBUG: Calling f(x) twice will create a second set of parameters. Needs to refactor Stabilize() for this.
|
||||
apply (x) = Stabilize (x)
|
||||
apply (x) = BS.Parameters.Stabilize (x)
|
||||
}.apply
|
||||
|
||||
# FeatureMVNLayer -- create a corpus-level feature-normalization layer
|
||||
# This can only be applied to features. Statistics are not shared across invocations,
|
||||
# which is semantically OK because the values are the same. However, it is not efficient.
|
||||
FeatureMVNLayer {} = MeanVarNorm
|
||||
FeatureMVNLayer{} = MeanVarNorm
|
||||
|
||||
# LogPriorLayer -- create a corpus-level label-prior layer
|
||||
# This can only be applied to labels. Statistics are not shared across invocations,
|
||||
# which is semantically OK because the values are the same. However, it is not efficient.
|
||||
# TODO: document on Wiki
|
||||
LogPriorLayer{} = LogPrior
|
||||
|
||||
# Layers that exist in other tools that we will not have:
|
||||
# FlattenLayer{}: Not needed since DenseLayer() can handle tensors just fine.
|
||||
|
@ -188,6 +204,10 @@ FeatureMVNLayer {} = MeanVarNorm
|
|||
|
||||
Identity(x) = x # sometimes helpful
|
||||
|
||||
None = BS.Constants.None # for use with some optional parameters; test with IsNone()
|
||||
|
||||
Inferred = 0 # denotes a dimension that is to be inferred
|
||||
|
||||
##############################################################################
|
||||
# Composing layers or models into more more complex models
|
||||
##############################################################################
|
||||
|
@ -234,7 +254,7 @@ CrossEntropyWithSoftmax = CNTK2.CrossEntropyWithSoftmax
|
|||
Dropout = CNTK2.Dropout
|
||||
ElementTimes = CNTK2.ElementTimes
|
||||
ElementDivide = CNTK2.ElementDivide
|
||||
ErrorPrediction = CNTK2.ErrorPrediction
|
||||
ClassificationError = CNTK2.ClassificationError
|
||||
Exp = CNTK2.Exp
|
||||
Floor = CNTK2.Floor
|
||||
Log = CNTK2.Log
|
||||
|
@ -245,6 +265,7 @@ RectifiedLinear = CNTK2.ReLU # deprecated
|
|||
ReLU = CNTK2.ReLU
|
||||
ReduceSum = CNTK2.ReduceSum
|
||||
ReduceLogSum = CNTK2.ReduceLogSum
|
||||
ReduceMean = CNTK2.ReduceMean
|
||||
ReduceMin = CNTK2.ReduceMin
|
||||
ReduceMax = CNTK2.ReduceMax
|
||||
|
||||
|
@ -283,7 +304,7 @@ CNTK2 = [
|
|||
// TODO: The API for Parameter is different in current 2.0 design, getting a constant as input for the initial values.
|
||||
// This needs to be fixed to follow the way the Constant() is exposed in Python
|
||||
// Making this an internal node with "_" until we agree on the final interface:
|
||||
_Parameter(shape, value = 0, initValue = '', learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*shape */ ] /*plus the function args*/ ]
|
||||
_Parameter(shape, value = 0, initValue = '', learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, initOutputRank = 1, initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*shape */ ] /*plus the function args*/ ]
|
||||
|
||||
// 3. Shape operations
|
||||
// Changes: NewReshape -> Reshape, input -> _, dims -> shape
|
||||
|
@ -316,13 +337,14 @@ CNTK2 = [
|
|||
|
||||
// 4. Tensor operations
|
||||
// Changes: Matrix -> Tensor. A -> x, B -> y. Data must come on y ("default parameter") hence not using _
|
||||
Times(x, y, outputRank=1, tag='') = new ComputationNode [ operation = 'Times' ; inputs = ( x : y ) /*plus the function args*/ ]
|
||||
Times(x, y, outputRank=1, inferInputRankToMap=-1, tag='') = new ComputationNode [ operation = 'Times' ; inputs = ( x : y ) /*plus the function args*/ ]
|
||||
|
||||
// 5. Elementwise operations.
|
||||
// Changes: "Matrix" -> "Tensor"; left input -> _; Clip: move input to front. ElementDivide/Times: anotherTensor -> y
|
||||
Abs(_, tag='') = new ComputationNode [ operation = 'Abs' ; inputs = _ /*plus the function args*/ ]
|
||||
Ceil(_, tag='') = Negate(Floor(Negate(_)), tag=tag)
|
||||
Clip(_, minValue, maxValue, tag='') = new ComputationNode [ operation = 'Clip' ; inputs = (minValue : maxValue : _) /* plus the function args*/ ]
|
||||
# TODO: Make ElementDivide a proper operation
|
||||
ElementDivide(_, y, tag='') = ElementTimes(_, Reciprocal(y), tag=tag)
|
||||
ElementTimes(_, y, tag='') = new ComputationNode [ operation = 'ElementTimes' ; inputs = (_ : y) /*plus the function args*/ ]
|
||||
Exp(_, tag='') = new ComputationNode [ operation = 'Exp' ; inputs = _ /*plus the function args*/ ]
|
||||
|
@ -336,12 +358,12 @@ CNTK2 = [
|
|||
Tanh(_, tag='') = new ComputationNode [ operation = 'Tanh' ; inputs = _ /*plus the function args*/ ]
|
||||
|
||||
// 6. Reductions
|
||||
ReduceSum (_, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; reductionOp = "Sum" /*plus the function args*/ ]
|
||||
ReduceLogSum(_, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; reductionOp = "LogSum" /*plus the function args*/ ]
|
||||
ReduceMin (_, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; reductionOp = "Min" /*plus the function args*/ ]
|
||||
ReduceMax (_, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; reductionOp = "Max" /*plus the function args*/ ]
|
||||
#ReduceMean (_, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; reductionOp = "Mean" /*plus the function args*/ ]
|
||||
|
||||
ReduceSum (_, axis=None, tag='') = { axis1 = if BS.Constants.IsNone (axis) then 0 else axis ; r = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; axis = axis1 ; reductionOp = "Sum" /*plus the function args*/ ]}.r
|
||||
ReduceLogSum(_, axis=None, tag='') = { axis1 = if BS.Constants.IsNone (axis) then 0 else axis ; r = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; axis = axis1 ; reductionOp = "LogSum" /*plus the function args*/ ]}.r
|
||||
ReduceMean (_, axis=None, tag='') = { axis1 = if BS.Constants.IsNone (axis) then 0 else axis ; r = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; axis = axis1 ; reductionOp = "Mean" /*plus the function args*/ ]}.r
|
||||
ReduceMin (_, axis=None, tag='') = { axis1 = if BS.Constants.IsNone (axis) then 0 else axis ; r = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; axis = axis1 ; reductionOp = "Min" /*plus the function args*/ ]}.r
|
||||
ReduceMax (_, axis=None, tag='') = { axis1 = if BS.Constants.IsNone (axis) then 0 else axis ; r = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; axis = axis1 ; reductionOp = "Max" /*plus the function args*/ ]}.r
|
||||
|
||||
// 7. Control flow (if, composite etc.)
|
||||
// None so far
|
||||
|
||||
|
@ -365,11 +387,12 @@ CNTK2 = [
|
|||
// No changes here - we said the default input would be the label sequence here, against which the
|
||||
// empirical sequence is compared to. Keeping this for now.
|
||||
CrossEntropyWithSoftmax(_, outProbVectorSequence, tag='') = new ComputationNode [ operation = 'CrossEntropyWithSoftmax' ; inputs = (_ : outProbVectorSequence) /*plus the function args*/ ]
|
||||
ErrorPrediction(_, outVectorSequence, topN=1, tag='') = new ComputationNode [ operation = 'ErrorPrediction' ; inputs = if topN == 1 then (_ : outVectorSequence) else (_ : outVectorSequence : Constant (topN)) /*plus the function args*/ ]
|
||||
ClassificationError(_, outVectorSequence, topN=1, tag='') = new ComputationNode [ operation = 'ClassificationError' ; inputs = if topN == 1 then (_ : outVectorSequence) else (_ : outVectorSequence : Constant (topN)) /*plus the function args*/ ]
|
||||
ErrorPrediction = ClassificationError # legacy
|
||||
# TODO: replace with this (need to deal with topN thing):
|
||||
# (_new will be removed once the change is made)
|
||||
CrossEntropyWithSoftmax_new (L, z, tag='') = Minus (ReduceLogSum (z), TransposeTimes (L, z), tag=tag)
|
||||
ErrorPrediction_new (L, z, tag='') = Minus (BS.Constants.One, TransposeTimes (L, Hardmax (z)), tag=tag)
|
||||
ClassificationError_new (L, z, tag='') = Minus (BS.Constants.One, TransposeTimes (L, Hardmax (z)), tag=tag)
|
||||
|
||||
// 12. Comparison nodes
|
||||
Less(_, y, tag='') = new ComputationNode [ operation = 'Less' ; inputs = (_ : y) /*plus the function args*/ ]
|
||||
|
@ -393,12 +416,12 @@ CNTK2 = [
|
|||
# - initFromLiteral="..." (deprecated) --> parse a string literal (obsolete with value=array form)
|
||||
# - init="fixedValue", value from 'value'
|
||||
# Warning: Current config will behave unexpected if user mistypes 'initValue' as 'value' (which will be ignored, defaulting to "uniform" init)
|
||||
Parameter {outputDim, inputDim, learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0/*deprecated*/, initValue = '', initFromFilePath = '', initFromLiteral = ''/*deprecated*/, initOnCPUOnly=true, randomSeed=-1, tag=''} = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ dims = (outputDim : inputDim) ] /*plus the function args*/ ]
|
||||
Parameter {outputDim, inputDim, learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0/*deprecated*/, initValue = '', initFromFilePath = '', initFromLiteral = ''/*deprecated*/, initOnCPUOnly=true, randomSeed=-1, tag=''} = new ComputationNode [ operation = 'LearnableParameter' ; initOutputRank = 1 ; shape = new TensorShape [ dims = (outputDim : inputDim) ] /*plus the function args*/ ]
|
||||
|
||||
LearnableParameter = Parameter // deprecated
|
||||
|
||||
# TODO: make Parameter take tensor dims?
|
||||
ParameterTensor {dims, learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0, initValue = '', initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag=''} = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]
|
||||
ParameterTensor {dims, learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0, initValue = '', initOutputRank = 1, initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag=''} = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]
|
||||
ConstantFromString(literal, tag='') = ParameterTensor((0)/*dim, will be inferred*/, initFromLiteral = literal, learningRateMultiplier = 0.0)
|
||||
# TODO: Deprecate ConstantFromString() in favor of Constant(array expression)
|
||||
DynamicAxis(tag='') = new ComputationNode [ operation = 'DynamicAxis' ; /*plus the function args*/ ]
|
||||
|
@ -452,7 +475,7 @@ MaxUnpooling(unpoolInput, poolInput, kernelDims, stride=1, autoPadding = true, l
|
|||
MaxPooling(input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'MaxPooling' ; inputs = input /*plus the function args*/ ]
|
||||
AveragePooling(input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'AveragePooling' ; inputs = input /*plus the function args*/ ]
|
||||
ColumnwiseCrossProduct = KhatriRaoProduct // deprecated
|
||||
ClassificationError = ErrorPrediction
|
||||
ErrorPrediction = ClassificationError # legacy name
|
||||
Delay = PastValue
|
||||
|
||||
BatchNormalization(input, scale, bias, runMean, runVariance, spatial, normalizationTimeConstant = 0, blendTimeConstant = 0, epsilon = 0.00001, useCntkEngine = true, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'BatchNormalization' ; inputs = (input : scale : bias : runMean : runVariance) /*plus the function args*/ ]
|
||||
|
@ -551,16 +574,22 @@ IntDiv(x, y) = new NumericFunction [ what = 'IntDiv' ; args = (x:y) ]
|
|||
# macros from NDL book
|
||||
##############################################################################
|
||||
|
||||
# deprecated--use LinearLayer{} and DenseLayer{} instead
|
||||
BFF(in, rows, cols) = [ B = Parameter(rows, 1, initValue = 0) ; W = Parameter(rows, cols) ; z = W*in+B ]
|
||||
SBFF(in, rows, cols) = [ Eh = Sigmoid(BFF(in, rows, cols).z) ]
|
||||
|
||||
# deprecated--use FeatureMVNLayer{} instead
|
||||
MeanVarNorm(feat) = PerDimMeanVarNormalization(feat, Mean(feat), InvStdDev(feat))
|
||||
|
||||
# deprecated--use LogPriorLayer{} instead
|
||||
LogPrior(labels) = Log(Mean(labels))
|
||||
|
||||
# specify one of these two for initialization:
|
||||
# - init = "uniform"|"gaussian"
|
||||
# - embeddingFile = PATHNAME
|
||||
# deprecated--use EmbeddingLayer{} instead
|
||||
Embedding (embeddingDim, input, inputDim=input.dim, initFrom=''/*|fromFile|gaussian|uniform*/, embeddingPath = '', sparseInput = false, learningRateWeight = 0.0) = [
|
||||
embedding = Transpose (LearnableParameter (inputDim, embeddingDim, learningRateMultiplier = learningRateWeight, init = initFrom, initFromFilePath = embeddingPath))
|
||||
embedding = Transpose (Parameter (inputDim, embeddingDim, learningRateMultiplier = learningRateWeight, init = initFrom, initFromFilePath = embeddingPath))
|
||||
lookup = if sparseInput then embedding * input
|
||||
else GatherPacked (input, embedding)
|
||||
].lookup
|
||||
|
@ -802,7 +831,8 @@ Loop = [
|
|||
|
||||
Parameters =
|
||||
[
|
||||
WeightParam (outputDim, inputDim) = Parameter (outputDim, inputDim, init='uniform', initValueScale=1, initOnCPUOnly=true, randomSeed=1)
|
||||
# TODO: These all have randomSeed set to 1!
|
||||
WeightParam (outputDim, inputDim) = ParameterTensor ((outputDim : inputDim), init='uniform', initValueScale=1, initOnCPUOnly=true, randomSeed=1)
|
||||
DiagWeightParam (outputDim) = ParameterTensor ((outputDim), init='uniform', initValueScale=1, initOnCPUOnly=true, randomSeed=1) # meant to be applied elementwise
|
||||
BiasParam (dim) = ParameterTensor ((dim), initValue=0.0)
|
||||
ScalarParam() = BiasParam (1)
|
||||
|
@ -845,23 +875,25 @@ RNNs =
|
|||
# This is the stateless version that takes the previous state as an input.
|
||||
# It returns a dictionary with three members: h and c, and dim=h.dim for convenience. prevState must have h and c.
|
||||
# This function also takes an optional auxiliary input, e.g. for suporting attention models.
|
||||
LSTMBlock (outputDim, cellShape=Constants.None, enableSelfStabilization=false) =
|
||||
[
|
||||
LSTMBlock (outputDim, cellShape=None, init='heNormal', initValueScale=1, enableSelfStabilization=false) =
|
||||
{
|
||||
cellDim = if Constants.IsNone (cellShape) then outputDim else cellShape
|
||||
// parameter macros
|
||||
# note: each invocation comes with its own set of weights
|
||||
B{} = Parameters.BiasParam {cellDim}
|
||||
W{} = Parameters.WeightParam {cellDim, 0} // input
|
||||
A{} = Parameters.WeightParam {cellDim, 0} // aux input
|
||||
H{} = Parameters.WeightParam {cellDim, outputDim} // hidden-to-hidden
|
||||
C{} = Parameters.DiagWeightParam {cellDim} // cell-to-hiddden {note: applied elementwise}
|
||||
Wmr = Parameters.WeightParam {outputDim, cellDim};
|
||||
|
||||
# parameter helpers
|
||||
# note: invoked multiple times, each invocation comes with its own set of weights
|
||||
B{} = ParameterTensor {(cellDim), initValue=0} # a bias
|
||||
W{} = ParameterTensor {(cellDim : Inferred), init=init, initValueScale=initValueScale} # a input
|
||||
A{} = ParameterTensor {(cellDim : Inferred), init=init, initValueScale=initValueScale} # an aux input
|
||||
H{} = ParameterTensor {(cellDim : outputDim), init=init, initValueScale=initValueScale} # a hidden-to-hidden
|
||||
C{} = ParameterTensor {(cellDim), init=init, initValueScale=initValueScale} # a cell-to-hiddden {note: applied elementwise}
|
||||
|
||||
Wmr = ParameterTensor {(outputDim : cellDim), init=init, initValueScale=initValueScale}; # final projection
|
||||
|
||||
S(x) = Parameters.Stabilize (x, enabled=enableSelfStabilization)
|
||||
# BUGBUG: S() must not be a macro either, but also an object instance
|
||||
|
||||
apply (x, prevState, aux=Constants.None) = [
|
||||
_ = [ // encapsulate the inner workings
|
||||
apply (x, prevState, aux=None) = {
|
||||
_ = { // encapsulate the inner workings
|
||||
|
||||
dh = prevState.h // previous values
|
||||
dc = prevState.c
|
||||
|
@ -872,20 +904,20 @@ RNNs =
|
|||
|
||||
# projected contribution from input(s) and bias
|
||||
pin() = if Constants.IsNone (aux)
|
||||
then B() + W() * x
|
||||
else B() + W() * x + A() * aux
|
||||
then B{} + W{} * x
|
||||
else B{} + W{} * x + A{} * aux
|
||||
|
||||
it = Sigmoid (pin() + H() * dhs + C() .* dcs) // input gate(t)
|
||||
bit = it .* Tanh (pin() + H() * dhs) // applied to tanh of input network
|
||||
it = Sigmoid (pin() + H{} * dhs + C{} .* dcs) // input gate(t)
|
||||
bit = it .* Tanh (pin() + H{} * dhs) // applied to tanh of input network
|
||||
|
||||
ft = Sigmoid (pin() + H() * dhs + C() .* dcs) // forget-me-not gate(t)
|
||||
ft = Sigmoid (pin() + H{} * dhs + C{} .* dcs) // forget-me-not gate(t)
|
||||
bft = ft .* dc // applied to cell(t-1)
|
||||
|
||||
ct = bft + bit // c(t) is sum of both
|
||||
|
||||
ot = Sigmoid (pin() + H() * dhs + C() .* S(ct)) // output gate(t)
|
||||
ot = Sigmoid (pin() + H{} * dhs + C{} .* S(ct)) // output gate(t)
|
||||
ht = ot .* Tanh (ct) // applied to tanh(cell(t))
|
||||
]
|
||||
}
|
||||
|
||||
# our return values
|
||||
c = _.ct // cell value
|
||||
|
@ -893,8 +925,8 @@ RNNs =
|
|||
then Wmr * S(_.ht) // project
|
||||
else _.ht // no projection
|
||||
dim = outputDim
|
||||
] // end of apply(x)
|
||||
].apply
|
||||
} // end of apply (x, prevState)
|
||||
}.apply
|
||||
|
||||
# LSTMP -- LSTM function with projection and self-stabilization
|
||||
# Projection is enabled by passing different values for outputDim and cellDim.
|
||||
|
|
|
@ -216,13 +216,13 @@ TIMIT_TrainSimple = new TrainAction [ // new: added TrainAction; t
|
|||
needPrior = true
|
||||
// the following two belong into SGD, so they were removed here
|
||||
//trainingCriterion = CrossEntropyWithSoftmax
|
||||
//evalCriterion = ErrorPrediction
|
||||
//evalCriterion = ClassificationError
|
||||
// new: connect to input stream from source; and expose the output layer
|
||||
input = source.features.data // these are also ComputeNodeRefs, exposed by the source
|
||||
output = ComputeNodeRef [ dim = source.labels.dim ] // SimpleNetworkBuilder will put top layer affine transform output (input to softmax) here
|
||||
// criteria are configurable here; these are ComputeNodes created here
|
||||
trainingCriterion = CrossEntropyWithSoftmax (source.labels.data, output)
|
||||
evalCriterion = ErrorPrediction (source.labels.data, output)
|
||||
evalCriterion = ClassificationError (source.labels.data, output)
|
||||
// new: (and half-baked) define Input nodes
|
||||
myFeatures=Input(featDim) // reader stream will reference this
|
||||
myLabels=Input(labelDim)
|
||||
|
@ -245,7 +245,7 @@ TIMIT_TrainSimple = new TrainAction [ // new: added TrainAction; t
|
|||
//L2 = SBFF(L1,hiddenDim,hiddenDim)
|
||||
//L3 = SBFF(L2,hiddenDim,hiddenDim)
|
||||
//CE = SMBFF(L3,labelDim,hiddenDim,myLabels,tag=Criteria)
|
||||
//Err = ErrorPrediction(myLabels,CE.BFF.FF.P,tag=Eval)
|
||||
//Err = ClassificationError(myLabels,CE.BFF.FF.P,tag=Eval)
|
||||
//logPrior = LogPrior(myLabels)
|
||||
//ScaledLogLikelihood=Minus(CE.BFF.FF.P,logPrior,tag=Output)
|
||||
|
||||
|
@ -279,7 +279,7 @@ TIMIT_TrainSimple = new TrainAction [ // new: added TrainAction; t
|
|||
|
||||
// define criterion nodes
|
||||
CE = CrossEntropyWithSoftmax(myLabels, outZ)
|
||||
Err = ErrorPrediction(myLabels, outZ)
|
||||
Err = ClassificationError(myLabels, outZ)
|
||||
|
||||
// define output node for decoding
|
||||
logPrior = LogPrior(myLabels)
|
||||
|
@ -392,7 +392,7 @@ network = new NDL [
|
|||
|
||||
// define criterion nodes
|
||||
CE = CrossEntropyWithSoftmax(myLabels, outZ)
|
||||
Err = ErrorPrediction(myLabels, outZ)
|
||||
Err = ClassificationError(myLabels, outZ)
|
||||
|
||||
// define output node for decoding
|
||||
logPrior = LogPrior(myLabels)
|
||||
|
|
|
@ -93,26 +93,6 @@ std::string WCharToString(const wchar_t* wst)
|
|||
return s;
|
||||
}
|
||||
|
||||
// TODO: This is an action, it should be moved into ActionsLib.
|
||||
template <typename ElemType>
|
||||
void DumpNodeInfo(const ConfigParameters& config)
|
||||
{
|
||||
wstring modelPath = config(L"modelPath");
|
||||
wstring nodeName = config(L"nodeName", L"__AllNodes__");
|
||||
wstring nodeNameRegexStr = config(L"nodeNameRegex", L"");
|
||||
wstring defOutFilePath = modelPath + L"." + nodeName + L".txt";
|
||||
wstring outputFile = config(L"outputFile", defOutFilePath);
|
||||
bool printValues = config(L"printValues", true);
|
||||
bool printMetadata = config(L"printMetadata", true);
|
||||
if (!printValues && !printMetadata)
|
||||
{
|
||||
InvalidArgument("printValues and printMetadata: Since both are set to false, there will be nothing to dump");
|
||||
}
|
||||
|
||||
ComputationNetworkPtr net = ComputationNetwork::CreateFromFile<ElemType>(CPUDEVICE, modelPath);
|
||||
net->DumpNodeInfoToFile(nodeName, printValues, printMetadata, outputFile, nodeNameRegexStr);
|
||||
}
|
||||
|
||||
size_t GetMaxEpochs(const ConfigParameters& configParams)
|
||||
{
|
||||
ConfigParameters configSGD(configParams("SGD"));
|
||||
|
@ -286,9 +266,9 @@ void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mp
|
|||
{
|
||||
TestCn<ElemType>(config); // for "devtest" action pass the root config instead
|
||||
}
|
||||
else if (thisAction == "dumpNode" /*deprecated:*/|| thisAction == "dumpnode")
|
||||
else if (thisAction == "dumpNodes" /*deprecated:*/ || thisAction == "dumpNode" || thisAction == "dumpnode")
|
||||
{
|
||||
DumpNodeInfo<ElemType>(commandParams);
|
||||
DoDumpNodes<ElemType>(commandParams);
|
||||
}
|
||||
else if (thisAction == "convertdbn")
|
||||
{
|
||||
|
@ -682,28 +662,22 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])
|
|||
fprintf(stderr, "%*s%ls", i > 0 ? 2 : 0, "", argv[i]); // use 2 spaces for better visual separability
|
||||
fprintf(stderr, "\n\n");
|
||||
|
||||
#if 1 //def _DEBUG
|
||||
#ifdef _DEBUG
|
||||
// This simply merges all the different config parameters specified (eg, via config files or via command line directly),
|
||||
// and prints it.
|
||||
fprintf(stderr, "\n\n");
|
||||
LOGPRINTF(stderr, ">>>>>>>>>>>>>>>>>>>> RAW CONFIG (VARIABLES NOT RESOLVED) >>>>>>>>>>>>>>>>>>>>\n");
|
||||
fprintf(stderr, "\nConfiguration, Raw:\n\n");
|
||||
LOGPRINTF(stderr, "%s\n", rawConfigString.c_str());
|
||||
LOGPRINTF(stderr, "<<<<<<<<<<<<<<<<<<<< RAW CONFIG (VARIABLES NOT RESOLVED) <<<<<<<<<<<<<<<<<<<<\n");
|
||||
|
||||
// Same as above, but all variables are resolved. If a parameter is set multiple times (eg, set in config, overridden at command line),
|
||||
// All of these assignments will appear, even though only the last assignment matters.
|
||||
fprintf(stderr, "\n");
|
||||
LOGPRINTF(stderr, ">>>>>>>>>>>>>>>>>>>> RAW CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n");
|
||||
fprintf(stderr, "\nConfiguration After Variable Resolution:\n\n");
|
||||
LOGPRINTF(stderr, "%s\n", config.ResolveVariables(rawConfigString).c_str());
|
||||
LOGPRINTF(stderr, "<<<<<<<<<<<<<<<<<<<< RAW CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n");
|
||||
|
||||
#endif
|
||||
// This outputs the final value each variable/parameter is assigned to in config (so if a parameter is set multiple times, only the last
|
||||
// value it is set to will appear).
|
||||
fprintf(stderr, "\n");
|
||||
LOGPRINTF(stderr, ">>>>>>>>>>>>>>>>>>>> PROCESSED CONFIG WITH ALL VARIABLES RESOLVED >>>>>>>>>>>>>>>>>>>>\n");
|
||||
fprintf(stderr, "\nConfiguration After Processing and Variable Resolution:\n\n");
|
||||
config.dumpWithResolvedVariables();
|
||||
LOGPRINTF(stderr, "<<<<<<<<<<<<<<<<<<<< PROCESSED CONFIG WITH ALL VARIABLES RESOLVED <<<<<<<<<<<<<<<<<<<<\n");
|
||||
#endif
|
||||
|
||||
LOGPRINTF(stderr, "Commands:");
|
||||
for (int i = 0; i < command.size(); i++)
|
||||
|
|
|
@ -23,7 +23,7 @@ m1=[
|
|||
L2 = RBFF(L1, HDim, HDim)
|
||||
L3 = RBFF(L2, HDim, HDim)
|
||||
CE = SMBFF(L3, LDim, HDim, labels, tag="criterion")
|
||||
Err=ErrorPrediction(labels, CE.BFF.FF.P, tag="evaluation")
|
||||
Err=ClassificationError(labels, CE.BFF.FF.P, tag="evaluation")
|
||||
|
||||
# rootNodes defined here
|
||||
OutputNodes=(CE.BFF.FF.P)
|
||||
|
|
|
@ -218,7 +218,7 @@ namespace CNTK
|
|||
std::swap(inputVars[0], inputVars[1]);
|
||||
opType = PrimitiveOpType::CrossEntropyWithSoftmax;
|
||||
}
|
||||
else if (node->OperationName() == OperationNameOf(ErrorPredictionNode))
|
||||
else if (node->OperationName() == OperationNameOf(ClassificationErrorNode))
|
||||
{
|
||||
std::swap(inputVars[0], inputVars[1]);
|
||||
opType = PrimitiveOpType::ClassificationError;
|
||||
|
|
|
@ -355,7 +355,7 @@ namespace CNTK
|
|||
computationNodePtr = builder.CrossEntropyWithSoftmax(input1Node, input0Node, function->Name());
|
||||
break;
|
||||
case PrimitiveOpType::ClassificationError:
|
||||
computationNodePtr = builder.ErrorPrediction(input1Node, input0Node, function->Name());
|
||||
computationNodePtr = builder.ClassificationError(input1Node, input0Node, function->Name());
|
||||
break;
|
||||
case PrimitiveOpType::PastValue:
|
||||
case PrimitiveOpType::FutureValue:
|
||||
|
|
|
@ -426,6 +426,13 @@ void ComputationNetwork::RandomInitLearnableParameters(const ComputationNodeBase
|
|||
InitLearnableParameters(node, uniformInit ? L"uniform" : L"gaussian", initValueScale, randomSeed, initOnCPUOnly);
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
void ComputationNetwork::InitLearnableParametersWithBilinearFill(const ComputationNodeBasePtr& node, size_t kernelWidth, size_t kernelHeight)
|
||||
{
|
||||
auto learnableParameterNode = dynamic_pointer_cast<LearnableParameter<ElemType>>(node);
|
||||
learnableParameterNode->InitBilinear(kernelWidth, kernelHeight);
|
||||
}
|
||||
|
||||
bool ComputationNetwork::IsTypicalCriterionNode(ComputationNodeBasePtr nodePtr)
|
||||
{
|
||||
// TODO: just use return!
|
||||
|
@ -435,7 +442,7 @@ bool ComputationNetwork::IsTypicalCriterionNode(ComputationNodeBasePtr nodePtr)
|
|||
nodePtr->OperationName() == OperationNameOf(SequenceWithSoftmaxNode) ||
|
||||
nodePtr->OperationName() == OperationNameOf(CrossEntropyNode) ||
|
||||
nodePtr->OperationName() == OperationNameOf(ClassBasedCrossEntropyWithSoftmaxNode) ||
|
||||
nodePtr->OperationName() == OperationNameOf(ErrorPredictionNode) ||
|
||||
nodePtr->OperationName() == OperationNameOf(ClassificationErrorNode) ||
|
||||
#ifdef COMING_SOON
|
||||
nodePtr->OperationName() == OperationNameOf(CRFNode) ||
|
||||
#endif
|
||||
|
@ -1228,7 +1235,7 @@ void ComputationNetwork::SaveToDbnFile(ComputationNetworkPtr net, const std::wst
|
|||
};
|
||||
|
||||
// Get output node
|
||||
std::list<ComputationNodeBasePtr> outputNodes = net->GetNodesWithType(OperationNameOf(ErrorPredictionNode));
|
||||
std::list<ComputationNodeBasePtr> outputNodes = net->GetNodesWithType(OperationNameOf(ClassificationErrorNode));
|
||||
ComputationNodeBasePtr outputNode = GetFirstNodeWithDifferentType(outputNodes.front()->GetInputs(), OperationNameOf(InputValue));
|
||||
|
||||
if (outputNode == nullptr)
|
||||
|
@ -1478,6 +1485,7 @@ void ComputationNetwork::SaveToDbnFile(ComputationNetworkPtr net, const std::wst
|
|||
PutTag("EDBN");
|
||||
}
|
||||
|
||||
template void ComputationNetwork::InitLearnableParametersWithBilinearFill<float>(const ComputationNodeBasePtr& node, size_t kernelWidth, size_t kernelHeight);
|
||||
template void ComputationNetwork::Read<float>(const wstring& fileName);
|
||||
template void ComputationNetwork::ReadPersistableParameters<float>(File& fstream, bool create);
|
||||
template void ComputationNetwork::PerformSVDecomposition<float>(const map<wstring, float>& SVDConfig, size_t alignedsize);
|
||||
|
@ -1487,6 +1495,7 @@ template void ComputationNetwork::SetSeqParam<float>(ComputationNetworkPtr net,
|
|||
const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR);
|
||||
template void ComputationNetwork::SaveToDbnFile<float>(ComputationNetworkPtr net, const std::wstring& fileName) const;
|
||||
|
||||
template void ComputationNetwork::InitLearnableParametersWithBilinearFill<double>(const ComputationNodeBasePtr& node, size_t kernelWidth, size_t kernelHeight);
|
||||
template void ComputationNetwork::Read<double>(const wstring& fileName);
|
||||
template void ComputationNetwork::ReadPersistableParameters<double>(File& fstream, bool create);
|
||||
template void ComputationNetwork::PerformSVDecomposition<double>(const map<wstring, float>& SVDConfig, size_t alignedsize);
|
||||
|
|
|
@ -349,6 +349,9 @@ public:
|
|||
// Legacy version that is for random only.
|
||||
void RandomInitLearnableParameters(const ComputationNodeBasePtr& node, const bool uniformInit, const unsigned long randomSeed, const double initValueScale, bool initOnCPUOnly = false) const;
|
||||
|
||||
template <class ElemType>
|
||||
void InitLearnableParametersWithBilinearFill(const ComputationNodeBasePtr& node, size_t kernelWidth, size_t kernelHeight);
|
||||
|
||||
template <typename N>
|
||||
static shared_ptr<N> AsNodePtr(const ComputationNodeBasePtr& inode)
|
||||
{
|
||||
|
|
|
@ -40,13 +40,8 @@ static shared_ptr<ComputationNode<ElemType>> CreateStandardNode(const std::wstri
|
|||
#endif
|
||||
if (nodeType == OperationNameOf(AbsNode)) return New<AbsNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(ClassBasedCrossEntropyWithSoftmaxNode))return New<ClassBasedCrossEntropyWithSoftmaxNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(ClassificationErrorNode)) return New<ClassificationErrorNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(ClipNode)) return New<ClipNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(EqualNode)) return New<EqualNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(GreaterEqualNode)) return New<GreaterEqualNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(GreaterNode)) return New<GreaterNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(LessEqualNode)) return New<LessEqualNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(LessNode)) return New<LessNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(NotEqualNode)) return New<NotEqualNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(CosDistanceNode)) return New<CosDistanceNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(CosDistanceWithNegativeSamplesNode)) return New<CosDistanceWithNegativeSamplesNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(CosineNode)) return New<CosineNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
|
@ -59,7 +54,7 @@ static shared_ptr<ComputationNode<ElemType>> CreateStandardNode(const std::wstri
|
|||
else if (nodeType == OperationNameOf(DynamicAxisNode)) return New<DynamicAxisNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(ElementTimesNode)) return New<ElementTimesNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(EnvironmentInputNode)) return New<EnvironmentInputNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(ErrorPredictionNode)) return New<ErrorPredictionNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(EqualNode)) return New<EqualNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(ExpNode)) return New<ExpNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(FloorNode)) return New<FloorNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(FutureValueNode)) return New<FutureValueNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
|
@ -67,10 +62,14 @@ static shared_ptr<ComputationNode<ElemType>> CreateStandardNode(const std::wstri
|
|||
#ifdef COMING_SOON
|
||||
else if (nodeType == OperationNameOf(GMMLogLikelihoodNode)) return New<GMMLogLikelihoodNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
#endif
|
||||
else if (nodeType == OperationNameOf(GreaterEqualNode)) return New<GreaterEqualNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(GreaterNode)) return New<GreaterNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(HardmaxNode)) return New<HardmaxNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(IfNode)) return New<IfNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(InvStdDevNode)) return New<InvStdDevNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(KhatriRaoProductNode)) return New<KhatriRaoProductNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(LessEqualNode)) return New<LessEqualNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(LessNode)) return New<LessNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(LogNode)) return New<LogNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(LogPlusNode)) return New<LogPlusNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(LogSoftmaxNode)) return New<LogSoftmaxNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
|
@ -80,6 +79,7 @@ static shared_ptr<ComputationNode<ElemType>> CreateStandardNode(const std::wstri
|
|||
else if (nodeType == OperationNameOf(MeanNode)) return New<MeanNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(MinusNode)) return New<MinusNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(NegateNode)) return New<NegateNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(NotEqualNode)) return New<NotEqualNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(NoiseContrastiveEstimationNode)) return New<NoiseContrastiveEstimationNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(PackedIndexNode)) return New<PackedIndexNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == OperationNameOf(PastValueNode)) return New<PastValueNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
|
@ -119,6 +119,7 @@ static shared_ptr<ComputationNode<ElemType>> CreateStandardNode(const std::wstri
|
|||
else if (nodeType == OperationNameOf(WhereNode)) return New<WhereNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
// legacy names we also support for back compat of model-files
|
||||
else if (nodeType == L"ColumnElementTimes") return New<ElementTimesNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == L"ErrorPrediction") return New<ClassificationErrorNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
else if (nodeType == L"Delay") return New<PastValueNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
// TODO: DiagTimes is also an alias of ElementTimes; current separate implementation is unnecessary.
|
||||
else if (nodeType == L"PerDimMeanVarNormalizationNode") return New<PerDimMeanVarNormalizationNode<ElemType>>(forward<_Types>(_Args)...);
|
||||
|
@ -368,9 +369,9 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Avera
|
|||
}
|
||||
|
||||
template <class ElemType>
|
||||
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::ErrorPrediction(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
|
||||
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::ClassificationError(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
|
||||
{
|
||||
return net.AddNodeToNetAndAttachInputs(New<ErrorPredictionNode<ElemType>>(net.GetDeviceId(), nodeName), { a, b });
|
||||
return net.AddNodeToNetAndAttachInputs(New<ClassificationErrorNode<ElemType>>(net.GetDeviceId(), nodeName), { a, b });
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
|
|
|
@@ -122,7 +122,7 @@ public:
ComputationNodePtr DummyCriterion(const ComputationNodePtr objectives, const ComputationNodePtr derivatives, const ComputationNodePtr prediction, const std::wstring nodeName = L"");
ComputationNodePtr ElementTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr DynamicAxis(const ComputationNodePtr a, const std::wstring& nodeName = L"");
ComputationNodePtr ErrorPrediction(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr ClassificationError(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr Exp(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr Floor(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr FutureValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, size_t timeStep, const std::wstring nodeName = L"");
@@ -447,7 +447,7 @@ ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<ComputationNetworkWithEd
// refWeight = 0.9
// kldLabels = labels * (1-refWeight) + Softmax (zRef) * refWeight # interpolate with ref output
// ce = CrossEntropyWithSoftmax (z, kldLabels)
// errs = ErrorPrediction (z, labels)
// errs = ClassificationError (z, labels)
// criterionNodes = (ce)
// evaluationNodes = (errs)
// ===================================================================
@@ -32,16 +32,17 @@
#define CNTK_MODEL_VERSION_1 1
#define CNTK_MODEL_VERSION_2 2
#define CNTK_MODEL_VERSION_3 3
#define CNTK_MODEL_VERSION_4 4 // PastValue
#define CNTK_MODEL_VERSION_5 5 // ND convolution and pooling
#define CNTK_MODEL_VERSION_6 6 // Batch norm blending
#define CNTK_MODEL_VERSION_7 7 // ElemType tag in model file
#define CNTK_MODEL_VERSION_8 8 // DynamicAxis for inputs
#define CNTK_MODEL_VERSION_9 9 // Transpose flag in ConvolutionNode to support deconvolution.
#define CNTK_MODEL_VERSION_10 10 // Learning rate multiplier for input nodes.
#define CNTK_MODEL_VERSION_11 11 // Dynamic axis name for where nodes.
#define CNTK_MODEL_VERSION_12 12 // Batch norm: switch running inverse std deviation -> variance, MB count -> samplesSeen; CuDNN v5
#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_12
#define CNTK_MODEL_VERSION_4 4 // PastValue
#define CNTK_MODEL_VERSION_5 5 // ND convolution and pooling
#define CNTK_MODEL_VERSION_6 6 // batch-norm blending
#define CNTK_MODEL_VERSION_7 7 // ElemType tag in model file
#define CNTK_MODEL_VERSION_8 8 // DynamicAxis for inputs
#define CNTK_MODEL_VERSION_9 9 // transpose flag in ConvolutionNode to support deconvolution
#define CNTK_MODEL_VERSION_10 10 // learning-rate multiplier for input nodes
#define CNTK_MODEL_VERSION_11 11 // dynamic axis name for where nodes
#define CNTK_MODEL_VERSION_12 12 // Times() m_inputRank to support parameter-rank inference
#define CNTK_MODEL_VERSION_13 13 // Batch norm: switch running inverse std deviation -> variance, MB count -> samplesSeen; CuDNN v5
#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_13

extern bool g_shareNodeValueMatrices;
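As a side illustration of the version gate introduced above: fields are read conditionally on the model version that introduced them, so files written before the bump still load. A minimal, self-contained sketch of that pattern, with a made-up FakeModelStream standing in for CNTK's file I/O and using the minibatch-size-16 approximation mentioned later in this commit:

// Hypothetical, self-contained sketch of version-gated loading; not CNTK's actual File API.
#include <cstdio>
#include <cstdint>

#define SKETCH_MODEL_VERSION_12 12
#define SKETCH_MODEL_VERSION_13 13

struct FakeModelStream { uint64_t nextValue; uint64_t Read() { return nextValue; } };

// Version 13 stores samplesSeen directly; older files store a minibatch count,
// which is converted to an approximate sample count (assumed ~16 samples per minibatch).
uint64_t LoadSamplesSeen(FakeModelStream& f, int modelVersion)
{
    if (modelVersion >= SKETCH_MODEL_VERSION_13)
        return f.Read();        // stored as samples seen
    else
        return f.Read() * 16;   // legacy: minibatch count, approximate conversion
}

int main()
{
    FakeModelStream oldFile{100}; // an old file that stored a minibatch count of 100
    std::printf("approx. samples seen: %llu\n",
                (unsigned long long)LoadSamplesSeen(oldFile, SKETCH_MODEL_VERSION_12));
}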
@ -166,4 +166,128 @@ public:
|
|||
template class PerDimMeanVarNormalizationNode<float>;
|
||||
template class PerDimMeanVarNormalizationNode<double>;
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// DiagTimesNode (vector representing the diagonal of a square matrix, data)
|
||||
// Deprecated because can be implemented with ElementTimes.
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
template <class ElemType>
|
||||
class DiagTimesNode : public ComputationNode<ElemType>, public NumInputs<2>
|
||||
{
|
||||
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
|
||||
static const std::wstring TypeName() { return L"DiagTimes"; }
|
||||
|
||||
public:
|
||||
DeclareConstructorFromConfigWithNumInputs(DiagTimesNode);
|
||||
DiagTimesNode(DEVICEID_TYPE deviceId, const wstring& name)
|
||||
: Base(deviceId, name)
|
||||
{
|
||||
}
|
||||
|
||||
virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override
|
||||
{
|
||||
if (inputIndex == 0) // left derivative
|
||||
{
|
||||
Matrix<ElemType> sliceOutputGrad = MaskedGradientFor(fr); // use Masked- version since this is reducing over frames
|
||||
Matrix<ElemType> sliceInput1Value = Input(1)->MaskedValueFor(fr);
|
||||
m_innerproduct->AssignInnerProductOf(sliceOutputGrad, sliceInput1Value, false);
|
||||
Input(0)->GradientAsMatrix() += *m_innerproduct;
|
||||
}
|
||||
else // right derivative
|
||||
{
|
||||
Matrix<ElemType> sliceOutputGrad = GradientFor(fr);
|
||||
Matrix<ElemType> sliceInput1Grad = Input(1)->GradientFor(fr);
|
||||
m_rightGradient->SetValue(sliceOutputGrad);
|
||||
m_rightGradient->ColumnElementMultiplyWith(Input(0)->ValueAsMatrix());
|
||||
sliceInput1Grad += *m_rightGradient;
|
||||
}
|
||||
}
|
||||
|
||||
virtual bool OutputUsedInComputingInputNodesGradients() const override
|
||||
{
|
||||
// The DiagTimesNode does not require its output value for computing
|
||||
// the gradients of its input nodes
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
|
||||
{
|
||||
Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
|
||||
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
|
||||
|
||||
sliceOutputValue.AssignValuesOf(sliceInput1Value);
|
||||
sliceOutputValue.ColumnElementMultiplyWith(Input(0)->ValueAsMatrix());
|
||||
}
|
||||
|
||||
virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
|
||||
{
|
||||
Base::Validate(isFinalValidationPass);
|
||||
InferMBLayoutFromInputsForStandardCase(isFinalValidationPass);
|
||||
|
||||
size_t rows0 = Input(0)->GetAsMatrixNumRows();
|
||||
size_t rows1 = Input(1)->HasMBLayout() ? Input(1)->GetSampleMatrixNumRows() : Input(1)->GetAsMatrixNumRows();
|
||||
|
||||
// if dimension not specified we assume two operands' dimensions should match
|
||||
Input(0)->ValidateInferInputDimsFrom(TensorShape(rows1));
|
||||
|
||||
if (Input(1)->HasMBLayout())
|
||||
{
|
||||
// infer rows1 as rows0
|
||||
Input(1)->ValidateInferInputDimsFrom(TensorShape(rows0));
|
||||
SetDims(TensorShape(rows0), true);
|
||||
}
|
||||
else // multiplying two straight matrices
|
||||
{
|
||||
size_t cols1 = Input(1)->GetAsMatrixNumCols();
|
||||
// infer rows1 as rows0
|
||||
Input(1)->ValidateInferInputDimsFrom(TensorShape(rows0, cols1));
|
||||
SetDims(TensorShape(rows0, cols1), false);
|
||||
}
|
||||
|
||||
// update after inference
|
||||
rows0 = Input(0)->GetAsMatrixNumRows();
|
||||
rows1 = Input(1)->HasMBLayout() ? Input(1)->GetSampleMatrixNumRows() : Input(1)->GetAsMatrixNumRows();
|
||||
if (isFinalValidationPass && rows0 != rows1)
|
||||
InvalidArgument("The inner matrix dimension in the %ls %ls operation does not match (%d vs. %d).", NodeName().c_str(), OperationName().c_str(), (int) rows1, (int) rows0);
|
||||
size_t cols0 = Input(0)->GetAsMatrixNumCols();
|
||||
if (isFinalValidationPass && cols0 != 1)
|
||||
InvalidArgument("The first matrix should be a column vector representing the diagonal of a square matrix in the DiagTimes operation.");
|
||||
|
||||
SetDims(Input(1));
|
||||
}
|
||||
|
||||
virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
|
||||
{
|
||||
Base::CopyTo(nodeP, newName, flags);
|
||||
if (flags & CopyNodeFlags::copyNodeValue)
|
||||
{
|
||||
auto node = dynamic_pointer_cast<DiagTimesNode<ElemType>>(nodeP);
|
||||
node->m_innerproduct->SetValue(*m_innerproduct);
|
||||
node->m_rightGradient->SetValue(*m_rightGradient);
|
||||
}
|
||||
}
|
||||
// request matrices that are needed for gradient computation
|
||||
virtual void RequestMatricesBeforeBackprop(MatrixPool& matrixPool)
|
||||
{
|
||||
Base::RequestMatricesBeforeBackprop(matrixPool);
|
||||
RequestMatrixFromPool(m_innerproduct, matrixPool);
|
||||
RequestMatrixFromPool(m_rightGradient, matrixPool);
|
||||
}
|
||||
|
||||
// release gradient and temp matrices that no longer needed after all the children's gradients are computed.
|
||||
virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
|
||||
{
|
||||
Base::ReleaseMatricesAfterBackprop(matrixPool);
|
||||
ReleaseMatrixToPool(m_innerproduct, matrixPool);
|
||||
ReleaseMatrixToPool(m_rightGradient, matrixPool);
|
||||
}
|
||||
|
||||
private:
|
||||
shared_ptr<Matrix<ElemType>> m_innerproduct;
|
||||
shared_ptr<Matrix<ElemType>> m_rightGradient;
|
||||
};
|
||||
|
||||
template class DiagTimesNode<float>;
|
||||
template class DiagTimesNode<double>;
|
||||
|
||||
}}}
|
||||
|
|
|
@@ -18,24 +18,20 @@
namespace Microsoft { namespace MSR { namespace CNTK {

// -----------------------------------------------------------------------
// ErrorPredictionNode (label, prediction) or ErrorPredictionNode (prediction, label)
// ClassificationErrorNode (label, prediction) or ClassificationErrorNode (prediction, label)
// Performs classification and error counting.
// Result is an error rate, lower = better.
// -----------------------------------------------------------------------

template <class ElemType>
class ErrorPredictionNode : public ComputationNodeNonLooping /*ComputationNode*/<ElemType>
class ClassificationErrorNode : public ComputationNodeNonLooping /*ComputationNode*/<ElemType>
{
typedef ComputationNodeNonLooping<ElemType> Base;
UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName()
{
return L"ErrorPrediction";
}
typedef ComputationNodeNonLooping<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName() { return L"ClassificationError"; }

public:
DeclareConstructorFromConfig(ErrorPredictionNode);
ErrorPredictionNode(DEVICEID_TYPE deviceId, const wstring& name)
DeclareConstructorFromConfig(ClassificationErrorNode);
ClassificationErrorNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name)
{
}

@@ -63,10 +59,10 @@ public:
MaskMissingColumnsToZero(*m_maxIndexes1, Input(1)->GetMBLayout(), fr);
Value().AssignNumOfDiff(*m_maxIndexes0, *m_maxIndexes1, m_topK > 1);
#if NANCHECK
Value().HasNan("ErrorPrediction");
Value().HasNan("ClassificationError");
#endif
#if DUMPOUTPUT
Value().Print("ErrorPredictionNode");
Value().Print("ClassificationErrorNode");
#endif
}

@@ -100,7 +96,7 @@ public:
Base::CopyTo(nodeP, newName, flags);
if (flags & CopyNodeFlags::copyNodeValue)
{
auto node = dynamic_pointer_cast<ErrorPredictionNode<ElemType>>(nodeP);
auto node = dynamic_pointer_cast<ClassificationErrorNode<ElemType>>(nodeP);
node->m_maxIndexes0->SetValue(*m_maxIndexes0);
node->m_maxIndexes1->SetValue(*m_maxIndexes1);
node->m_maxValues->SetValue(*m_maxValues);

@@ -131,8 +127,8 @@ private:
int m_topK;
};

template class ErrorPredictionNode<float>;
template class ErrorPredictionNode<double>;
template class ClassificationErrorNode<float>;
template class ClassificationErrorNode<double>;

#ifdef COMING_SOON
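For orientation, a standalone sketch of what the renamed node computes in the top-1 case: the fraction of samples whose argmax prediction disagrees with the argmax label. The node itself accumulates the raw mismatch count via AssignNumOfDiff; the rate shown here is simply that count divided by the number of samples, and the sketch is independent of the ComputationNode machinery:

// Standalone sketch of top-1 classification error (illustrative only).
#include <cstdio>
#include <vector>
#include <algorithm>

// Each inner vector is one column (one sample) of class scores or one-hot labels.
static size_t ArgMax(const std::vector<float>& col)
{
    return std::max_element(col.begin(), col.end()) - col.begin();
}

float ClassificationError(const std::vector<std::vector<float>>& labels,
                          const std::vector<std::vector<float>>& predictions)
{
    size_t errors = 0;
    for (size_t j = 0; j < labels.size(); j++)
        if (ArgMax(labels[j]) != ArgMax(predictions[j]))
            errors++; // count samples where the predicted class differs from the labeled class
    return labels.empty() ? 0.0f : (float)errors / labels.size(); // error rate, lower = better
}

int main()
{
    std::vector<std::vector<float>> labels      = {{1, 0, 0}, {0, 1, 0}};
    std::vector<std::vector<float>> predictions = {{0.7f, 0.2f, 0.1f}, {0.6f, 0.3f, 0.1f}};
    std::printf("error rate: %.2f\n", ClassificationError(labels, predictions)); // prints 0.50
}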
@ -26,7 +26,7 @@ void LearnableParameter<ElemType>::InitShape(const TensorShape& shape)
|
|||
Value().Invalidate();
|
||||
}
|
||||
|
||||
static pair<bool/*uniform*/, double/*stddev or range*/> ParseRandomizationType(const std::wstring& type, size_t fanOut = 1, size_t fanIn = 1);
|
||||
static pair<bool/*uniform*/, double/*stddev or range*/> ParseRandomizationType(const wstring& type, size_t fanOut = 1, size_t fanIn = 1);
|
||||
|
||||
// constructor from config
|
||||
// Parameterization is a little wicked. An older version required to specify the type of initialization
|
||||
|
@ -41,6 +41,11 @@ static pair<bool/*uniform*/, double/*stddev or range*/> ParseRandomizationType(c
|
|||
// - init="fixedValue", value from 'value' --deprecated in favor of just specifying initValue
|
||||
// - init="fromFile", value from 'initFromFilePath' --deprecated in favor of just specifying 'initFromFilePath'
|
||||
// - init="fromLiteral", value from 'initFromLiteral' --deprecated in favor of initValue=array expression
|
||||
// Random initialization takes an additional optional parameter initOutputRank, default 1.
|
||||
// All dimensions that are not amongst the first 'initOutputRank' are considered inputs.
|
||||
// This is necessary e.g. for convolution.
|
||||
// 'initOutputRank' can also be negative to denote output dims on the right, to cater to the needs
|
||||
// of convolution kernels where the output rank is the right-most axis (initOutputRank=-1).
|
||||
// The forms that infer the dimensions have different BrainScript names. TODO: need one for fromFile
|
||||
// TODO: All forms that require specified dimensions but contain zeroes (to be updated by graph)
|
||||
// will need to do deferred initialization, or have a way to repeat it.
|
||||
|
@ -91,7 +96,8 @@ LearnableParameter<ElemType>::LearnableParameter(const ScriptableObjects::IConfi
|
|||
int forcedRandomSeed = configp->Get(L"randomSeed"); // forcing a specific random seed is useful for testing to get repeatable initialization independent of evaluation order
|
||||
m_randomSeed = forcedRandomSeed < 0 ? randomSeed++ : (unsigned long)forcedRandomSeed;
|
||||
m_initValueScale = configp->Get(L"initValueScale");
|
||||
m_initOnCPUOnly = configp->Get(L"initOnCPUOnly");
|
||||
m_initOutputRank = configp->Get(L"initOutputRank");
|
||||
m_initOnCPUOnly = configp->Get(L"initOnCPUOnly");
|
||||
}
|
||||
else if (initString == L"zero")
|
||||
{
|
||||
|
@ -114,6 +120,13 @@ LearnableParameter<ElemType>::LearnableParameter(const ScriptableObjects::IConfi
|
|||
InitFromFile(initFromFilePath);
|
||||
m_initString.clear();
|
||||
}
|
||||
else if (initString == L"bilinear")
|
||||
{
|
||||
const size_t kernelWidth = configp->Get(L"kernelWidth");
|
||||
const size_t kernelHeight = configp->Get(L"kernelHeight");
|
||||
InitBilinear(kernelWidth, kernelHeight);
|
||||
m_initString.clear();
|
||||
}
|
||||
// legacy
|
||||
else if (initString == L"fixedValue") // deprecated. Use initValue=... instead
|
||||
{
|
||||
|
@ -155,6 +168,7 @@ void LearnableParameter<ElemType>::PostInitParameters(const wstring& initString,
|
|||
m_initString = initString;
|
||||
m_randomSeed = randomSeed;
|
||||
m_initValueScale = initValue;
|
||||
m_initOutputRank = 1; // default. NDL (deprecated) cannot specify a different value.
|
||||
m_initOnCPUOnly = initOnCPUOnly;
|
||||
}
|
||||
else if (initString == L"fixedValue") // from constant value
|
||||
|
@ -182,7 +196,7 @@ void LearnableParameter<ElemType>::PostInitParameters(const wstring& initString,
|
|||
// heNormal: sqrt(2 / fanin)
|
||||
// heUniform: sqrt(6 / fanin)
|
||||
// returns (*,0) for unrecognized string
|
||||
static pair<bool/*uniform*/,double/*stddev or range*/> ParseRandomizationType(const std::wstring& type, size_t fanOut /* = 1*/, size_t fanIn /*= 1*/)
|
||||
static pair<bool/*uniform*/,double/*stddev or range*/> ParseRandomizationType(const wstring& type, size_t fanOut /* = 1*/, size_t fanIn /*= 1*/)
|
||||
{
|
||||
if (type == L"uniform") return make_pair( true, 0.05f);
|
||||
else if (type == L"gaussian") return make_pair(false, 0.2 / sqrt(fanIn));
|
||||
|
@ -197,26 +211,33 @@ static pair<bool/*uniform*/,double/*stddev or range*/> ParseRandomizationType(co
|
|||
// initialize with random numbers
|
||||
// if 'initOnCPUOnly' then always init on CPU, making initialization consistent across both (for testing)
|
||||
template <class ElemType>
|
||||
void LearnableParameter<ElemType>::InitRandom(const std::wstring& type,
|
||||
void LearnableParameter<ElemType>::InitRandom(const wstring& type,
|
||||
const unsigned long randomSeed,
|
||||
const ElemType initValueScale,
|
||||
bool initOnCPUOnly)
|
||||
const int initOutputRank,
|
||||
const bool initOnCPUOnly)
|
||||
{
|
||||
// fprintf(stderr, "%d x %d: %d %ls\n", (int)GetNumRows(), (int)GetNumCols(), (int)randomSeed, NodeName().c_str());
|
||||
|
||||
let& sampleLayout = GetSampleLayout();
|
||||
#if 1 // this more complex version is needed to repro test cases generated with an older version
|
||||
auto& value = sampleLayout.GetRank() > 2 ? Value() : ValueAsMatrix();
|
||||
#else
|
||||
auto& value = Value();
|
||||
#endif
|
||||
|
||||
let numElements = sampleLayout.GetNumElements();
|
||||
if (numElements == 0)
|
||||
return;
|
||||
// We assume that the matrix row dimension is the output dimension. This is wrong in case of ND biases, convolution filters, and BatchNorm.
|
||||
size_t fanIn = value.GetNumCols(); // fan-in
|
||||
size_t fanOut = numElements / fanIn; // remaining dimensions
|
||||
// determine fan-in and fan-out
|
||||
// This is controlled by initOutputRank.
|
||||
// For a normal matrix [I x J], fanOut = I, fanIn = J=inDim --> initOutputRank = +1
|
||||
// For a convolution kernel [w x h x C x K], fanOut = K, fanIn = w*h*C. --> initOutputRank = -1, meaning count from back
|
||||
if (abs(initOutputRank) > sampleLayout.GetRank())
|
||||
InvalidArgument("InitRandom: initOutputRank=%d exceeds sampleLayout rank %d", initOutputRank, (int)sampleLayout.GetRank());
|
||||
// fanIn is determined by multiplying a range of dimensions:
|
||||
// - initOutputRank >= 0: [ initOutputRank, rank )
|
||||
// - initOutputRank < 0: [ 0, rank-abs(initOutputRank) )
|
||||
let inDimsBegin = (initOutputRank >= 0) ? (size_t)initOutputRank : 0;
|
||||
let inDimsEnd = (initOutputRank >= 0) ? sampleLayout.GetRank() : (size_t)((int)sampleLayout.GetRank() + initOutputRank);
|
||||
size_t fanIn = 1;
|
||||
for (size_t k = inDimsBegin; k < inDimsEnd; k++)
|
||||
fanIn *= sampleLayout[k];
|
||||
let fanOut = numElements / fanIn; // remaining dimensions
|
||||
let opts = ParseRandomizationType(type, fanOut, fanIn);
|
||||
let isUniform = opts.first;
|
||||
ElemType range = (ElemType)opts.second;
|
||||
|
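As a sketch of the fan-in/fan-out split that initOutputRank controls in the comments above (assumed semantics, plain vectors instead of TensorShape): the first initOutputRank axes, or the last abs(initOutputRank) axes when negative, are treated as outputs, and the product of the remaining axes is the fan-in.

// Standalone sketch of the fan-in/fan-out split described above; not the actual TensorShape API.
#include <cstdio>
#include <cstdlib>
#include <vector>

void SplitFan(const std::vector<size_t>& dims, int initOutputRank, size_t& fanIn, size_t& fanOut)
{
    size_t rank = dims.size();
    if ((size_t)std::abs(initOutputRank) > rank)
    {
        std::fprintf(stderr, "initOutputRank exceeds rank\n");
        std::exit(1);
    }
    // fan-in axes: [initOutputRank, rank) for non-negative, [0, rank - |initOutputRank|) for negative
    size_t inBegin = initOutputRank >= 0 ? (size_t)initOutputRank : 0;
    size_t inEnd   = initOutputRank >= 0 ? rank : rank - (size_t)(-initOutputRank);
    fanIn = 1;
    for (size_t k = inBegin; k < inEnd; k++)
        fanIn *= dims[k];
    size_t numElements = 1;
    for (size_t d : dims)
        numElements *= d;
    fanOut = numElements / fanIn; // remaining dimensions
}

int main()
{
    size_t fanIn, fanOut;
    SplitFan({128, 512}, 1, fanIn, fanOut);     // dense [I x J]: fanOut = I = 128, fanIn = J = 512
    std::printf("dense:  fanOut=%zu fanIn=%zu\n", fanOut, fanIn);
    SplitFan({5, 5, 3, 64}, -1, fanIn, fanOut); // kernel [w x h x C x K]: fanOut = K = 64, fanIn = w*h*C = 75
    std::printf("kernel: fanOut=%zu fanIn=%zu\n", fanOut, fanIn);
}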
@ -224,18 +245,74 @@ void LearnableParameter<ElemType>::InitRandom(const std::wstring& type,
|
|||
LogicError("InitRandom: Invalid initialization type '%ls'", type.c_str());
|
||||
|
||||
// the random seed offset is set via the "randomSeedOffset" parameter in config
|
||||
fprintf(stderr, "%ls: Initializing Parameter[%s] <- %ls(seed=%d, range=%f*%f, onCPU=%s).\n", NodeDescription().c_str(), string(GetSampleLayout()).c_str(), m_initString.c_str(), (int)m_randomSeed, range, m_initValueScale, m_initOnCPUOnly ? "true" : "false");
|
||||
fprintf(stderr, "%ls: Initializing Parameter[%s] <- %ls(seed=%d, init dims=[%d x %d], range=%f*%f, onCPU=%s).\n",
|
||||
NodeDescription().c_str(), string(GetSampleLayout()).c_str(), m_initString.c_str(),
|
||||
(int)m_randomSeed, (int)fanOut, (int)fanIn, range, m_initValueScale, m_initOnCPUOnly ? "true" : "false");
|
||||
range *= initValueScale;
|
||||
if (initOnCPUOnly)
|
||||
Value().TransferToDeviceIfNotThere(CPUDEVICE, true);
|
||||
if (isUniform)
|
||||
value.SetUniformRandomValue(-range, range, randomSeed);
|
||||
Value().SetUniformRandomValue(-range, range, randomSeed);
|
||||
else
|
||||
value.SetGaussianRandomValue(0, range, randomSeed);
|
||||
Value().SetGaussianRandomValue(0, range, randomSeed);
|
||||
if (initOnCPUOnly)
|
||||
Value().TransferToDeviceIfNotThere(m_deviceId, true);
|
||||
}
|
||||
|
||||
// Initialize with bilinear interpolation coefficients (useful for deconvolution layer).
|
||||
template <class ElemType>
|
||||
void LearnableParameter<ElemType>::InitBilinear(size_t kernelWidth, size_t kernelHeight)
|
||||
{
|
||||
if (kernelHeight != kernelWidth)
|
||||
LogicError("Filter for bilinear interpolation must be square.");
|
||||
|
||||
// Transfer to CPU as GPU initialization is still not supported.
|
||||
Value().TransferToDeviceIfNotThere(CPUDEVICE, true);
|
||||
|
||||
const SmallVector<size_t>& dims = GetSampleLayout().GetDims();
|
||||
assert(dims.size() == 2);
|
||||
const size_t kernelCount = dims[0];
|
||||
const size_t kernelWeightCount = dims[1];
|
||||
assert(kernelWeightCount % (kernelWidth * kernelHeight) == 0);
|
||||
const size_t channels = kernelWeightCount / (kernelWidth * kernelHeight);
|
||||
if (kernelCount != channels)
|
||||
LogicError("Number of input and output channels of filter for bilinear interpolation must be equal.");
|
||||
|
||||
ElemType* data = Value().Data();
|
||||
const size_t factor = (kernelWidth + 1) / 2;
|
||||
const float center = (kernelWidth - 1) / 2.0f;
|
||||
int count = 0;
|
||||
// Filter dimensions are [W x H x C x K] or ARRAY[1..K] OF ARRAY[1..C] OF ARRAY[1..H] OF ARRAY[1..W], where:
|
||||
// W = width, H = height, C = input channels, K = output channels.
|
||||
// In deconvolution, output channel should be upsampled version of corresponding input channel.
|
||||
// 2D filter for bilinear interpolation where height=width=3 contains the following values:
|
||||
// |0.25, 0.50, 0.25|
|
||||
// |0.50, 1.00, 0.50|
|
||||
// |0.25, 0.50, 0.25|
|
||||
// So, output kernel with dimensions [3 x 3 x C] will contain all zeros except for the channel which we want to
|
||||
// upsample. For that channel it will contain values above.
|
||||
for (size_t kernel = 0; kernel < kernelCount; ++kernel)
|
||||
{
|
||||
for (size_t channel = 0; channel < channels; ++channel)
|
||||
{
|
||||
for (size_t h = 0; h < kernelHeight; ++h)
|
||||
{
|
||||
for (size_t w = 0; w < kernelWidth; ++w)
|
||||
{
|
||||
float val = 0;
|
||||
if (kernel == channel)
|
||||
{
|
||||
val = (1 - fabs(w - center) / factor) * (1 - fabs(h - center) / factor);
|
||||
}
|
||||
data[count++] = val;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Value().TransferToDeviceIfNotThere(m_deviceId, true);
|
||||
}
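A small standalone check of the per-weight formula used above: for a 3 x 3 kernel it reproduces the 0.25 / 0.50 / 1.00 pattern quoted in the comment (illustrative only, independent of LearnableParameter).

// Prints the kernelWidth x kernelWidth bilinear filter from the formula above.
#include <cstdio>
#include <cmath>

int main()
{
    const int kernelWidth = 3;                     // square kernel; 3x3 matches the comment above
    const int factor = (kernelWidth + 1) / 2;      // 2
    const float center = (kernelWidth - 1) / 2.0f; // 1.0
    for (int h = 0; h < kernelWidth; ++h)
    {
        for (int w = 0; w < kernelWidth; ++w)
        {
            float val = (1 - std::fabs(w - center) / factor) * (1 - std::fabs(h - center) / factor);
            std::printf("%.2f ", val);             // 0.25 0.50 0.25 / 0.50 1.00 0.50 / 0.25 0.50 0.25
        }
        std::printf("\n");
    }
}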
|
||||
|
||||
// initialize by reading a matrix from a text file
|
||||
template <class ElemType>
|
||||
void LearnableParameter<ElemType>::InitFromFile(const wstring& initFromFilePath)
|
||||
|
@ -247,7 +324,7 @@ void LearnableParameter<ElemType>::InitFromFile(const wstring& initFromFilePath)
|
|||
|
||||
// initialize by reading a matrix from a text file
|
||||
template <class ElemType>
|
||||
void LearnableParameter<ElemType>::InitFromArray(const std::vector<ElemType>& array, size_t numRows, size_t numCols)
|
||||
void LearnableParameter<ElemType>::InitFromArray(const vector<ElemType>& array, size_t numRows, size_t numCols)
|
||||
{
|
||||
// infer tensor dimensions from input file if not set
|
||||
// Note: The mapping of dimensions of the input matrix to tensor dimensions are somewhat confusing.
|
||||
|
@ -295,13 +372,13 @@ void LearnableParameter<ElemType>::InitFromArray(const std::vector<ElemType>& ar
|
|||
|
||||
// TODO: Move this error check there, since this is called only from one place.
|
||||
template <class ElemType>
|
||||
void LearnableParameter<ElemType>::ReviseFromFile(const std::wstring& reviseFromFilePath)
|
||||
void LearnableParameter<ElemType>::ReviseFromFile(const wstring& reviseFromFilePath)
|
||||
{
|
||||
try
|
||||
{
|
||||
InitFromFile(reviseFromFilePath);
|
||||
}
|
||||
catch (const std::exception & e)
|
||||
catch (const exception & e)
|
||||
{
|
||||
RuntimeError("ReviseFromFile: Failed to reload %ls %ls operation from file %ls: %s", NodeName().c_str(), OperationName().c_str(), reviseFromFilePath.c_str(), e.what());
|
||||
}
|
||||
|
@ -356,7 +433,7 @@ void LearnableParameter<ElemType>::Load(File& fstream, size_t modelVersion) /*ov
|
|||
}
|
||||
|
||||
template <class ElemType>
|
||||
/*virtual*/ void LearnableParameter<ElemType>::CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const /*override*/
|
||||
/*virtual*/ void LearnableParameter<ElemType>::CopyTo(ComputationNodeBasePtr nodeP, const wstring& newName, const CopyNodeFlags flags) const /*override*/
|
||||
{
|
||||
Base::CopyTo(nodeP, newName, flags);
|
||||
if (flags & CopyNodeFlags::copyNodeValue)
|
||||
|
@ -365,6 +442,7 @@ template <class ElemType>
|
|||
node->m_initString = m_initString;
|
||||
node->m_randomSeed = m_randomSeed;
|
||||
node->m_initValueScale = m_initValueScale;
|
||||
node->m_initOutputRank = m_initOutputRank;
|
||||
node->m_initOnCPUOnly = m_initOnCPUOnly;
|
||||
node->m_initValue = m_initValue;
|
||||
}
|
||||
|
@ -439,7 +517,7 @@ void LearnableParameter<ElemType>::LazyInitParameters()
|
|||
}
|
||||
else if (ParseRandomizationType(m_initString).second != 0)
|
||||
{
|
||||
InitRandom(m_initString, m_randomSeed, m_initValueScale, m_initOnCPUOnly);
|
||||
InitRandom(m_initString, m_randomSeed, m_initValueScale, m_initOutputRank, m_initOnCPUOnly);
|
||||
}
|
||||
else
|
||||
LogicError("LearnableParameter: Invalid value of m_initString '%ls' for deferred initialization for %ls.", m_initString.c_str(), NodeDescription().c_str());
|
||||
|
|
|
@ -56,13 +56,16 @@ public:
|
|||
unsigned long randomSeed = 0,
|
||||
bool initOnCPUOnly = false);
|
||||
|
||||
// Initialize with bilinear interpolation coefficients (useful for deconvolution layer).
|
||||
void InitBilinear(size_t kernelWidth, size_t kernelHeight);
|
||||
|
||||
// initialize by reading a matrix from a text file
|
||||
void InitFromFile(const std::wstring& initFromFilePath);
|
||||
|
||||
private:
|
||||
// initialize with random numbers
|
||||
// If 'initOnCPUOnly' then always init on CPU, making initialization consistent across both (for testing).
|
||||
void InitRandom(const std::wstring& type, const unsigned long randomSeed, const ElemType initValueScale, bool initOnCPUOnly);
|
||||
void InitRandom(const std::wstring& type, const unsigned long randomSeed, const ElemType initValueScale, const int initOutputRank, const bool initOnCPUOnly);
|
||||
|
||||
// helper to initialize from a matrix read from a text file or a string literal
|
||||
void InitFromArray(const std::vector<ElemType>& array, size_t numRows, size_t numCols);
|
||||
|
@ -103,6 +106,7 @@ private:
|
|||
std::wstring m_initString; // if non-empty then deferred initialization is needed. Gets cleared upon completion of deferred init.
|
||||
unsigned long m_randomSeed;
|
||||
ElemType m_initValueScale;
|
||||
int m_initOutputRank;
|
||||
bool m_initOnCPUOnly;
|
||||
ElemType m_initValue;
|
||||
};
|
||||
|
|
|
@ -238,8 +238,8 @@ class TimesNodeBase : public ComputationNode<ElemType>, public NumInputs<2>
|
|||
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembers; using Base::OperationName; \
|
||||
|
||||
public:
|
||||
TimesNodeBase(DEVICEID_TYPE deviceId, const wstring& name, size_t outputRank = 1)
|
||||
: Base(deviceId, name), m_outputRank(outputRank)
|
||||
TimesNodeBase(DEVICEID_TYPE deviceId, const wstring& name, size_t outputRank = 1, int inferInputRankToMap = -1)
|
||||
: Base(deviceId, name), m_outputRank(outputRank), m_inferInputRankToMap(inferInputRankToMap)
|
||||
{
|
||||
}
|
||||
|
||||
|
@ -249,7 +249,8 @@ public:
|
|||
if (flags & CopyNodeFlags::copyNodeValue)
|
||||
{
|
||||
auto node = dynamic_pointer_cast<TimesNodeBase<ElemType, m_transpose>>(nodeP);
|
||||
node->m_outputRank = m_outputRank;
|
||||
node->m_outputRank = m_outputRank;
|
||||
node->m_inferInputRankToMap = m_inferInputRankToMap;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -257,6 +258,7 @@ public:
|
|||
{
|
||||
Base::Save(fstream);
|
||||
fstream << m_outputRank;
|
||||
fstream << m_inferInputRankToMap;
|
||||
}
|
||||
|
||||
virtual void Load(File& fstream, size_t modelVersion) override
|
||||
|
@ -266,6 +268,10 @@ public:
|
|||
fstream >> m_outputRank;
|
||||
else
|
||||
m_outputRank = 1;
|
||||
if (modelVersion >= CNTK_MODEL_VERSION_11)
|
||||
fstream >> m_inferInputRankToMap;
|
||||
else
|
||||
m_inferInputRankToMap = -1;
|
||||
}
|
||||
|
||||
private:
|
||||
|
@ -420,19 +426,33 @@ public:
|
|||
if (dimsA[k] == 0)
|
||||
InvalidArgument("%ls %ls operation: The outputRank (%d) dimensions in left argument's shape [%s] must not be 0.", NodeName().c_str(), OperationName().c_str(), (int)m_outputRank, dimsAstring.c_str());
|
||||
|
||||
// if the last dimension of A is 0, then extend it to fully match B
|
||||
// E.g. [I x 0] * [X x Y x Z x K] => infer as [I x X x Y x Z], not as [I x X].
|
||||
// I.e. we cannot use inference to infer a matrix product on a part of an input tensor.
|
||||
// We default to inferring the whole, as part of a tensor is a special use case.
|
||||
assert (dimsA.size() == m_outputRank + numReductionDims);
|
||||
while (numReductionDims < dimsB.size() && dimsA.back() == 0)
|
||||
// infer rank of dimsA
|
||||
// For purpose of dimension inference, Times() accepts an optional parameter inferInputRankToMap (default -1=unspecified).
|
||||
// The last 'inferInputRankToMap' axes are considered those that the matrix product should keep (Times()
|
||||
// is applied one by one, like a "map" operation) rather than reducing over.
|
||||
// Specifically, inferInputRankToMap=0 means to reduce over all input axes, e.g. for an image input that
|
||||
// should be flattened.
|
||||
// Examples:
|
||||
// [I x Inferred] * [J x K], inferInputRankToMap=n/a --> Inferred := J, result is [I x K]
|
||||
// [I x Inferred] * [W x H x C], inferInputRankToMap=n/a --> Inferred := W, result is [I x H x C] (not desired)
|
||||
// [I x Inferred x Inferred] * [W x H x C], inferInputRankToMap=n/a --> Inf x Inf := [W x H], result is [I x C]
|
||||
// [I x Inferred] * [W x H x C], inferInputRankToMap=0 --> Inferred := W x H x C, result is [I] (desired)
|
||||
// [I x Inferred] * [W x H x C x R], inferInputRankToMap=1 --> Inferred := W x H x C, result is [I x R] (desired)
|
||||
// If W's shape is too short, it will be padded with 0 (i.e. inferred in a subsequent step).
|
||||
if (m_inferInputRankToMap >= 0) // if given, we pad if needed
|
||||
{
|
||||
dimsA.push_back(0);
|
||||
numReductionDims++;
|
||||
if ((size_t)m_inferInputRankToMap >= dimsB.size() && isFinalValidationPass) // at least one axis must be left to reduce over
|
||||
InvalidArgument("%ls %ls operation: 'inferInputRankToMap' argument %d must be less than rank of second operand [%s].", NodeName().c_str(), OperationName().c_str(), m_inferInputRankToMap, dimsBstring.c_str());
|
||||
assert(dimsA.size() == m_outputRank + numReductionDims);
|
||||
while (numReductionDims + (size_t)m_inferInputRankToMap < dimsB.size())
|
||||
{
|
||||
dimsA.push_back(0);
|
||||
numReductionDims++;
|
||||
}
|
||||
}
|
||||
|
||||
// fill in the missing ones
|
||||
// We fill in dimensions given as 0. The tensor rank is not inferred.
|
||||
// We fill in dimensions given as 0. The tensor rank is not inferred here (that is done above).
|
||||
for (size_t k = m_outputRank; k < dimsA.size(); k++)
|
||||
{
|
||||
auto& dimA = dimsA[k];
|
||||
|
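As a sketch of the padding rule described in the comments above (plain vectors standing in for TensorShape, semantics assumed from those comments): trailing zero axes are appended to the left operand until only inferInputRankToMap axes of the right operand remain un-reduced.

// Standalone sketch of the shape padding described above.
#include <cstdio>
#include <vector>

// dimsA: left operand; its first outputRank axes are outputs, trailing zeros are inferred later.
// dimsB: right operand; inferInputRankToMap of its trailing axes are kept ("mapped"), the rest reduced.
std::vector<size_t> PadLeftOperand(std::vector<size_t> dimsA, const std::vector<size_t>& dimsB,
                                   size_t outputRank, int inferInputRankToMap)
{
    size_t numReductionDims = dimsA.size() - outputRank;
    if (inferInputRankToMap >= 0) // only pad when the parameter was given
        while (numReductionDims + (size_t)inferInputRankToMap < dimsB.size())
        {
            dimsA.push_back(0); // 0 = "infer this dimension from B in a later step"
            numReductionDims++;
        }
    return dimsA;
}

int main()
{
    // [I x Inferred] * [W x H x C x R] with inferInputRankToMap=1:
    // A is padded to [I x 0 x 0 x 0], so W, H, C are reduced over and R is kept -> result [I x R].
    auto padded = PadLeftOperand({128, 0}, {7, 7, 3, 10}, /*outputRank=*/1, /*inferInputRankToMap=*/1);
    for (size_t d : padded)
        std::printf("%zu ", d); // prints: 128 0 0 0
    std::printf("\n");
}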
@ -478,6 +498,7 @@ public:
|
|||
|
||||
private:
|
||||
size_t m_outputRank;
|
||||
int m_inferInputRankToMap; // -1 (not specified) or says how to expand shape of W, to keep this many mapping dims
|
||||
};
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
|
@ -504,12 +525,12 @@ class TimesNode : public TimesNodeBase<ElemType, false>
|
|||
static const std::wstring TypeName() { return L"Times"; }
|
||||
|
||||
public:
|
||||
TimesNode(DEVICEID_TYPE deviceId, const wstring& name, size_t outputRank = 1)
|
||||
: Base(deviceId, name, outputRank)
|
||||
TimesNode(DEVICEID_TYPE deviceId, const wstring& name, size_t outputRank = 1, int inferInputRankToMap = -1)
|
||||
: Base(deviceId, name, outputRank, inferInputRankToMap)
|
||||
{
|
||||
}
|
||||
TimesNode(const ScriptableObjects::IConfigRecordPtr configp)
|
||||
: TimesNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"outputRank"))
|
||||
: TimesNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"outputRank"), configp->Get(L"inferInputRankToMap"))
|
||||
{
|
||||
AttachInputsFromConfig(configp, this->GetExpectedNumInputs());
|
||||
}
|
||||
|
@ -537,7 +558,7 @@ class TransposeTimesNode : public TimesNodeBase<ElemType, true>
|
|||
public:
|
||||
DeclareConstructorFromConfigWithNumInputs(TransposeTimesNode);
|
||||
TransposeTimesNode(DEVICEID_TYPE deviceId, const wstring& name, size_t outputRank = 1)
|
||||
: Base(deviceId, name, outputRank)
|
||||
: Base(deviceId, name, outputRank, /*inferInputRankToMap=*/-1)
|
||||
{
|
||||
}
|
||||
};
|
||||
|
@ -545,134 +566,6 @@ public:
|
|||
template class TransposeTimesNode<float>;
|
||||
template class TransposeTimesNode<double>;
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// DiagTimesNode (vector representing the diagonal of a square matrix, data)
|
||||
// TODO: This is redundant with ElementTimes and should be removed (with a compat stub).
|
||||
// -----------------------------------------------------------------------
|
||||
|
||||
template <class ElemType>
|
||||
class DiagTimesNode : public ComputationNode<ElemType>, public NumInputs<2>
|
||||
{
|
||||
typedef ComputationNode<ElemType> Base;
|
||||
UsingComputationNodeMembersBoilerplate;
|
||||
static const std::wstring TypeName()
|
||||
{
|
||||
return L"DiagTimes";
|
||||
}
|
||||
|
||||
public:
|
||||
DeclareConstructorFromConfigWithNumInputs(DiagTimesNode);
|
||||
DiagTimesNode(DEVICEID_TYPE deviceId, const wstring& name)
|
||||
: Base(deviceId, name)
|
||||
{
|
||||
}
|
||||
|
||||
virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override
|
||||
{
|
||||
if (inputIndex == 0) // left derivative
|
||||
{
|
||||
Matrix<ElemType> sliceOutputGrad = MaskedGradientFor(fr); // use Masked- version since this is reducing over frames
|
||||
Matrix<ElemType> sliceInput1Value = Input(1)->MaskedValueFor(fr);
|
||||
m_innerproduct->AssignInnerProductOf(sliceOutputGrad, sliceInput1Value, false);
|
||||
Input(0)->GradientAsMatrix() += *m_innerproduct;
|
||||
}
|
||||
else // right derivative
|
||||
{
|
||||
Matrix<ElemType> sliceOutputGrad = GradientFor(fr);
|
||||
Matrix<ElemType> sliceInput1Grad = Input(1)->GradientFor(fr);
|
||||
m_rightGradient->SetValue(sliceOutputGrad);
|
||||
m_rightGradient->ColumnElementMultiplyWith(Input(0)->ValueAsMatrix());
|
||||
sliceInput1Grad += *m_rightGradient;
|
||||
}
|
||||
}
|
||||
|
||||
virtual bool OutputUsedInComputingInputNodesGradients() const override
|
||||
{
|
||||
// The DiagTimesNode does not require its output value for computing
|
||||
// the gradients of its input nodes
|
||||
return false;
|
||||
}
|
||||
|
||||
virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
|
||||
{
|
||||
Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
|
||||
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
|
||||
|
||||
sliceOutputValue.AssignValuesOf(sliceInput1Value);
|
||||
sliceOutputValue.ColumnElementMultiplyWith(Input(0)->ValueAsMatrix());
|
||||
}
|
||||
|
||||
virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
|
||||
{
|
||||
Base::Validate(isFinalValidationPass);
|
||||
InferMBLayoutFromInputsForStandardCase(isFinalValidationPass);
|
||||
|
||||
size_t rows0 = Input(0)->GetAsMatrixNumRows();
|
||||
size_t rows1 = Input(1)->HasMBLayout() ? Input(1)->GetSampleMatrixNumRows() : Input(1)->GetAsMatrixNumRows();
|
||||
|
||||
// if dimension not specified we assume two operands' dimensions should match
|
||||
Input(0)->ValidateInferInputDimsFrom(TensorShape(rows1));
|
||||
|
||||
if (Input(1)->HasMBLayout())
|
||||
{
|
||||
// infer rows1 as rows0
|
||||
Input(1)->ValidateInferInputDimsFrom(TensorShape(rows0));
|
||||
SetDims(TensorShape(rows0), true);
|
||||
}
|
||||
else // multiplying two straight matrices
|
||||
{
|
||||
size_t cols1 = Input(1)->GetAsMatrixNumCols();
|
||||
// infer rows1 as rows0
|
||||
Input(1)->ValidateInferInputDimsFrom(TensorShape(rows0, cols1));
|
||||
SetDims(TensorShape(rows0, cols1), false);
|
||||
}
|
||||
|
||||
// update after inference
|
||||
rows0 = Input(0)->GetAsMatrixNumRows();
|
||||
rows1 = Input(1)->HasMBLayout() ? Input(1)->GetSampleMatrixNumRows() : Input(1)->GetAsMatrixNumRows();
|
||||
if (isFinalValidationPass && rows0 != rows1)
|
||||
InvalidArgument("The inner matrix dimension in the %ls %ls operation does not match (%d vs. %d).", NodeName().c_str(), OperationName().c_str(), (int) rows1, (int) rows0);
|
||||
size_t cols0 = Input(0)->GetAsMatrixNumCols();
|
||||
if (isFinalValidationPass && cols0 != 1)
|
||||
InvalidArgument("The first matrix should be a column vector representing the diagonal of a square matrix in the DiagTimes operation.");
|
||||
|
||||
SetDims(Input(1));
|
||||
}
|
||||
|
||||
virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
|
||||
{
|
||||
Base::CopyTo(nodeP, newName, flags);
|
||||
if (flags & CopyNodeFlags::copyNodeValue)
|
||||
{
|
||||
auto node = dynamic_pointer_cast<DiagTimesNode<ElemType>>(nodeP);
|
||||
node->m_innerproduct->SetValue(*m_innerproduct);
|
||||
node->m_rightGradient->SetValue(*m_rightGradient);
|
||||
}
|
||||
}
|
||||
// request matrices that are needed for gradient computation
|
||||
virtual void RequestMatricesBeforeBackprop(MatrixPool& matrixPool)
|
||||
{
|
||||
Base::RequestMatricesBeforeBackprop(matrixPool);
|
||||
RequestMatrixFromPool(m_innerproduct, matrixPool);
|
||||
RequestMatrixFromPool(m_rightGradient, matrixPool);
|
||||
}
|
||||
|
||||
// release gradient and temp matrices that no longer needed after all the children's gradients are computed.
|
||||
virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
|
||||
{
|
||||
Base::ReleaseMatricesAfterBackprop(matrixPool);
|
||||
ReleaseMatrixToPool(m_innerproduct, matrixPool);
|
||||
ReleaseMatrixToPool(m_rightGradient, matrixPool);
|
||||
}
|
||||
|
||||
private:
|
||||
shared_ptr<Matrix<ElemType>> m_innerproduct;
|
||||
shared_ptr<Matrix<ElemType>> m_rightGradient;
|
||||
};
|
||||
|
||||
template class DiagTimesNode<float>;
|
||||
template class DiagTimesNode<double>;
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
// SumElementsNode (input)
|
||||
// Sums up all elements in the input across all samples into a single scalar.
|
||||
|
|
|
@ -37,6 +37,7 @@ template <class ElemType>
|
|||
node->m_axis = m_axis;
|
||||
node->m_operation = m_operation;
|
||||
node->m_reductionOp = m_reductionOp;
|
||||
node->m_scale = m_scale;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -64,8 +65,8 @@ template <class ElemType>
|
|||
auto input = Input(0)->ValueTensorFor(rank, fr);
|
||||
|
||||
// the actual operation is a Copy with reduction, where the magic is in the reduction op
|
||||
result.DoUnaryOpOf(0, input, 1, ElementWiseOperator::opCopy, m_reductionOp);
|
||||
// note: we can implement "Mean" by passing 1/dim for alpha
|
||||
// For "Mean", m_scale is 1/#elements, and 1 otherwise.
|
||||
result.DoUnaryOpOf(0, input, m_scale, ElementWiseOperator::opCopy, m_reductionOp);
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
|
@ -82,8 +83,9 @@ template <class ElemType>
|
|||
switch (m_reductionOp)
|
||||
{
|
||||
case ElementWiseOperator::opSum:
|
||||
// "Sum": broadcast the gradient
|
||||
sliceInputGrad.AddCopyOf(sliceOutputGrad);
|
||||
// "Sum": broadcast the gradient
|
||||
// "Mean": same as "Sum" with scaling by 1/#dims
|
||||
sliceInputGrad.AddCopyOf(sliceOutputGrad, m_scale);
|
||||
break;
|
||||
|
||||
case ElementWiseOperator::opLogSum:
|
||||
|
@ -95,7 +97,7 @@ template <class ElemType>
|
|||
// df / dx = exp(x)/exp(f)
|
||||
// = exp(x – f)
|
||||
sliceInputGrad.AddElementwiseProductWithExpOfDiffOf(sliceOutputGrad, input, output);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case ElementWiseOperator::opMin:
|
||||
|
@ -120,12 +122,6 @@ template <class ElemType>
|
|||
break;
|
||||
|
||||
// more coming
|
||||
|
||||
// "LogPlus": softmax
|
||||
// f(x) = log(sum_i exp x_i), hence gradient is:
|
||||
// df / dx_i = 1 / (sum_j exp x_j) * exp x_i = (Softmax(x))_i = exp(x_i - ReduceLogPlus(x))
|
||||
// targetGradient = gradientFromTop .* Exp (inputValue - outputValue) --TODO: verify
|
||||
// i.e. compute dfference if input and output, then Exp in-place. No, would need temp memory. So needs its own opcode AddScaledExpOfDiff(). Ternary.
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -164,6 +160,7 @@ void ReduceElementsNode<ElemType>::ValidateOp()
|
|||
else
|
||||
#endif
|
||||
if (m_operation == L"Sum") m_reductionOp = ElementWiseOperator::opSum;
|
||||
else if (m_operation == L"Mean") m_reductionOp = ElementWiseOperator::opSum;
|
||||
else if (m_operation == L"LogSum") m_reductionOp = ElementWiseOperator::opLogSum;
|
||||
else if (m_operation == L"Min") m_reductionOp = ElementWiseOperator::opMin;
|
||||
else if (m_operation == L"Max") m_reductionOp = ElementWiseOperator::opMax;
|
||||
|
@ -183,13 +180,26 @@ template <class ElemType>
|
|||
|
||||
let shape = Input(0)->GetSampleLayout();
|
||||
auto dims = shape.GetDims();
|
||||
size_t reducedDim = 0; // (init to keep compiler happy)
|
||||
if (m_axis == 0)
|
||||
{
|
||||
reducedDim = shape.GetNumElements();
|
||||
dims = { 1 }; // entire sample is reduced to a scalar
|
||||
}
|
||||
else if (m_axis - 1 >= 0 && m_axis - 1 < dims.size())
|
||||
{
|
||||
reducedDim = dims[m_axis - 1];
|
||||
dims[m_axis - 1] = 1; // one axis is reduced to a scalar
|
||||
}
|
||||
else if (isFinalValidationPass)
|
||||
InvalidArgument("The shape of %ls [%s] has no axis %d", NodeDescription().c_str(), string(shape).c_str(), m_axis);
|
||||
|
||||
// for "Mean", we must divide by #elements
|
||||
if (isFinalValidationPass && m_operation == L"Mean")
|
||||
m_scale = (ElemType)(1.0 / reducedDim);
|
||||
else
|
||||
m_scale = (ElemType)1;
|
||||
|
||||
SetDims(TensorShape(dims), Input(0)->HasMBLayout());
|
||||
}
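A tiny standalone illustration of the scale derived above: with m_scale = 1/reducedDim, the same copy-with-reduction kernel that implements "Sum" yields a mean in the forward pass, and the backward pass scales the broadcast gradient by the same factor (illustrative only).

// "Mean" = "Sum" with scale = 1/#reduced elements, forward and backward.
#include <cstdio>

int main()
{
    const int reducedDim = 4;
    const float x[reducedDim] = {1.0f, 2.0f, 3.0f, 6.0f};
    const float scale = 1.0f / reducedDim; // m_scale for "Mean"; 1 for "Sum"

    float sum = 0;
    for (float v : x)
        sum += v;
    std::printf("mean = %.2f\n", scale * sum); // 3.00: forward is a scaled copy-with-reduction

    // backward: every input element receives the output gradient times the same scale
    float outputGrad = 1.0f;
    std::printf("dMean/dx_i = %.2f\n", scale * outputGrad); // 0.25 for each element
}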
|
||||
|
||||
|
|
|
@ -176,10 +176,10 @@ template class ReshapeNode<double>;
|
|||
// The optional axis can be 0 (meaning all elements) or a specific axis.
|
||||
// Allowed operations:
|
||||
// - "Sum"
|
||||
// - "LogSum" --not implemented yet
|
||||
// - "Mean" --not implemented yet
|
||||
// - "Max" --not implemented yet
|
||||
// - "Min" --not implemented yet
|
||||
// - "LogSum"
|
||||
// - "Mean"
|
||||
// - "Max"
|
||||
// - "Min"
|
||||
// - "All" --not implemented yet
|
||||
// - "Any" --not implemented yet
|
||||
// TODO:
|
||||
|
@ -196,7 +196,7 @@ class ReduceElementsNode : public ComputationNode<ElemType>, public NumInputs<1>
|
|||
void ValidateOp();
|
||||
public:
|
||||
ReduceElementsNode(DEVICEID_TYPE deviceId, const wstring& name, const std::wstring& operation = std::wstring(), int axis = 0) :
|
||||
Base(deviceId, name), m_operation(operation), m_axis(axis), m_reductionOp((ElementWiseOperator)-1/*invalid*/)
|
||||
Base(deviceId, name), m_operation(operation), m_axis(axis), m_reductionOp((ElementWiseOperator)-1/*invalid*/), m_scale(0/*invalid*/)
|
||||
{
|
||||
if (!m_operation.empty()) // verify validity already here out of courtesy (would otherwise be caught in Validate())
|
||||
ValidateOp();
|
||||
|
@ -221,9 +221,13 @@ public:
|
|||
int ReductionAxis() const { return m_axis; }
|
||||
|
||||
private:
|
||||
// operation attributes
|
||||
int m_axis;
|
||||
std::wstring m_operation; // the operation as a string, e.g. "Sum", see ValidateOp()
|
||||
std::wstring m_operation; // the operation as a string, e.g. "Sum", see ValidateOp()
|
||||
|
||||
// things cached during validation
|
||||
ElementWiseOperator m_reductionOp; // the reduction operation mapped to our internal opCode
|
||||
ElemType m_scale; // 1 or, for Mean, 1/number of elements we are reducing over
|
||||
};
|
||||
|
||||
// -----------------------------------------------------------------------
|
||||
|
|
|
@ -1629,7 +1629,7 @@ public:
|
|||
fstream >> m_normTimeConst;
|
||||
fstream >> m_blendTimeConst;
|
||||
fstream >> m_imageLayoutKind;
|
||||
if (modelVersion >= CNTK_MODEL_VERSION_12)
|
||||
if (modelVersion >= CNTK_MODEL_VERSION_13)
|
||||
fstream >> m_samplesSeen;
|
||||
else
|
||||
fstream >> mbCount; // converted below
|
||||
|
@ -1677,7 +1677,7 @@ public:
|
|||
}
|
||||
}
|
||||
|
||||
if (modelVersion < CNTK_MODEL_VERSION_12)
|
||||
if (modelVersion < CNTK_MODEL_VERSION_13)
|
||||
{
|
||||
// Prior to version 12, minibatch count was stored instead of samples seen.
|
||||
// Approximate by assuming minibatch size 16, inform about that.
|
||||
|
@ -1779,7 +1779,7 @@ public:
|
|||
LogicError("%ls: Failed to convert running variance until forward prop", NodeName().c_str());
|
||||
FrameRange fr(Input(0)->GetMBLayout());
|
||||
|
||||
Matrix<ElemType> sliceInputValue = Input(0)->ValueFor(fr);
|
||||
Matrix<ElemType> sliceInputValue = Input(0)->MaskedValueFor(fr);
|
||||
const Matrix<ElemType>& scale = Input(1)->Value();
|
||||
const Matrix<ElemType>& bias = Input(2)->Value();
|
||||
Matrix<ElemType>& runMean = Input(3)->Value();
|
||||
|
@ -1828,10 +1828,10 @@ public:
|
|||
|
||||
if (inputIndex == 0) // derivative with respect to the input.
|
||||
{
|
||||
auto sliceOutputGrad = GradientFor(fr);
|
||||
auto sliceInputValue = Input(0)->ValueFor(fr);
|
||||
const Matrix<ElemType>& scale = Input(1)->Value();
|
||||
const Matrix<ElemType>& bias = Input(2)->Value();
|
||||
auto sliceOutputGrad = MaskedGradientFor(fr);
|
||||
auto sliceInputValue = Input(0)->ValueFor(fr);
|
||||
const Matrix<ElemType>& scale = Input(1)->Value();
|
||||
const Matrix<ElemType>& bias = Input(2)->Value();
|
||||
|
||||
auto sliceInputGrad = Input(0)->GradientFor(fr);
|
||||
m_dScale->Resize(scale); // gradients for scale and bias get stored here
|
||||
|
|
|
@ -127,13 +127,13 @@ std::unique_ptr<BatchNormEngine<ElemType>> BatchNormEngine<ElemType>::Create(DEV
|
|||
// Use CNTK as default batch norm engine.
|
||||
if (HasFlag(enabledEngines, BatchNormEngineKind::Cntk))
|
||||
{
|
||||
fprintf(stderr, "\nUsing CNTK batch normalization engine.\n");
|
||||
fprintf(stderr, "Using CNTK batch normalization engine.\n");
|
||||
return std::make_unique<CntkBatchNormEngine<ElemType>>(deviceId, inOutT, spatial, imageLayout);
|
||||
}
|
||||
|
||||
if (HasFlag(enabledEngines, BatchNormEngineKind::CuDnn))
|
||||
{
|
||||
fprintf(stderr, "\nUsing cuDNN batch normalization engine.\n");
|
||||
fprintf(stderr, "Using cuDNN batch normalization engine.\n");
|
||||
return CuDnnBatchNormEngineFactory<ElemType>::Create(deviceId, inOutT, spatial, imageLayout);
|
||||
}
|
||||
|
||||
|
|
|
@ -81,12 +81,7 @@ public:
|
|||
static cudacode void ComputeRangeStatColj(const ElemType* inMat, const ElemType* inResidual, long M, size_t j, size_t bits, ElemType& lower, ElemType& upper)
|
||||
{
|
||||
/*dummy reducers do nothing in linear CPU version*/
|
||||
ComputeRangeStatColjSubset<ZeroThresholdFor1Bit>(inMat, inResidual, M, j, bits, lower, upper, 0, 1, [](ElemType&)
|
||||
{
|
||||
},
|
||||
[](unsigned int&)
|
||||
{
|
||||
});
|
||||
ComputeRangeStatColjSubset<ZeroThresholdFor1Bit>(inMat, inResidual, M, j, bits, lower, upper, 0, 1, [](ElemType&){}, [](unsigned int&){});
|
||||
}
|
||||
|
||||
public:
|
||||
|
@ -231,9 +226,9 @@ public:
|
|||
// i.e.
|
||||
// - do not symmetrize/pool the quantization values for 0 and 1
|
||||
// - but hard-code the quantization threshold to be 0 instead of the mean of the two bounds
|
||||
// This should give us the best of all--fast operation yet ability to be asymmetric within a column
|
||||
// This should give us the best of all--fast operation yet ability to be asymmetric within a column.
|
||||
ElemType mean = 0.0f;
|
||||
if (!ZeroThresholdFor1Bit || (bits != 1))
|
||||
if (!ZeroThresholdFor1Bit && (bits == 1))
|
||||
{
|
||||
ElemType meanacc = 0.0f;
|
||||
// (subset: compute subset sum)
|
||||
|
@ -320,7 +315,7 @@ public:
|
|||
}
|
||||
else
|
||||
{
|
||||
ElemType stddevs = 5.0f;
|
||||
ElemType stddevs = 4.0f; // TODO: make this a parameter
|
||||
// >1 bit:
|
||||
// We linearly quantize between 'stddevs' standard deviations.
|
||||
ElemType varacc = 0.0f;
|
||||
|
@ -349,7 +344,6 @@ private:
|
|||
template <typename T>
|
||||
friend class QuantizedMatrix;
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}}}
|
||||
#endif
|
||||
|
|
|
@ -866,7 +866,7 @@ std::unique_ptr<ConvolutionEngine<ElemType>> ConvolutionEngine<ElemType>::Create
|
|||
if (!isEnabled(ConvolutionEngineKind::Legacy))
|
||||
RuntimeError("Trying to use Legacy convolution engine when it's disabled.");
|
||||
// REVIEW alexeyk: should honor m_traceLevel here.
|
||||
fprintf(stderr, "\n%lsusing legacy convolution engine for geometry: %s.\n", logPrefix.c_str(), engStr.c_str());
|
||||
fprintf(stderr, "%lsusing legacy convolution engine for geometry: %s.\n", logPrefix.c_str(), engStr.c_str());
|
||||
return std::make_unique<LegacyConvolutionEngine<ElemType>>(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind);
|
||||
}
|
||||
|
||||
|
@ -874,19 +874,19 @@ std::unique_ptr<ConvolutionEngine<ElemType>> ConvolutionEngine<ElemType>::Create
|
|||
if (isEnabled(ConvolutionEngineKind::CuDnn) &&
|
||||
CuDnnConvolutionEngineFactory<ElemType>::IsSupported(deviceId, geometry, poolKind))
|
||||
{
|
||||
fprintf(stderr, "\n%lsusing cuDNN convolution engine for geometry: %s.\n", logPrefix.c_str(), engStr.c_str());
|
||||
fprintf(stderr, "%lsusing cuDNN convolution engine for geometry: %s.\n", logPrefix.c_str(), engStr.c_str());
|
||||
return CuDnnConvolutionEngineFactory<ElemType>::Create(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind);
|
||||
}
|
||||
|
||||
if (isEnabled(ConvolutionEngineKind::Gemm) && GemmConvolutionEngine<ElemType>::IsSupported(deviceId, geometry))
|
||||
{
|
||||
fprintf(stderr, "\n%lsusing GEMM convolution engine for geometry: %s.\n", logPrefix.c_str(), engStr.c_str());
|
||||
fprintf(stderr, "%lsusing GEMM convolution engine for geometry: %s.\n", logPrefix.c_str(), engStr.c_str());
|
||||
return std::make_unique<GemmConvolutionEngine<ElemType>>(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind);
|
||||
}
|
||||
|
||||
if (!isEnabled(ConvolutionEngineKind::Reference))
|
||||
RuntimeError("Reference convolution is disabled and no other engine supports such configuratin (or disabled).");
|
||||
fprintf(stderr, "\n%lsusing reference convolution engine for geometry: %s.\n", logPrefix.c_str(), engStr.c_str());
|
||||
fprintf(stderr, "%lsusing reference convolution engine for geometry: %s.\n", logPrefix.c_str(), engStr.c_str());
|
||||
return std::make_unique<ReferenceConvolutionEngine<ElemType>>(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind);
|
||||
}
|
||||
|
||||
|
|
|
@ -14,7 +14,9 @@ GPURNGHandle::GPURNGHandle(int deviceId, unsigned long seed)
|
|||
: RNGHandle(deviceId)
|
||||
{
|
||||
unsigned long long cudaSeed = seed;
|
||||
#ifdef _DEBUG
|
||||
fprintf(stderr, "(GPU): creating curand object with seed %llu\n", cudaSeed);
|
||||
#endif
|
||||
|
||||
CURAND_CALL(curandCreateGenerator(&m_generator, CURAND_RNG_PSEUDO_XORWOW));
|
||||
CURAND_CALL(curandSetPseudoRandomGeneratorSeed(m_generator, cudaSeed));
|
||||
|
|
|
@ -32,33 +32,33 @@ void MatrixQuantizerCPU<ElemType>::QuantizeAsync(const Matrix<ElemType>& inMatri
|
|||
#else
|
||||
for (size_t j = 0; j < nCol; j++)
|
||||
#endif
|
||||
{
|
||||
auto& qcol = *(outQMatrix.GetQuantizedColumn(j));
|
||||
if (zeroThresholdFor1Bit)
|
||||
{
|
||||
// Explicit use of 'template' keyword is needed to compile with GCC
|
||||
ColumnQuantizer<ElemType>::template ComputeRangeStatColj<true>(inMatrix.Data(), inResidual.Data(), (long) nRow, j, nBits, qcol.lower, qcol.upper);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Explicit use of 'template' keyword is needed to compile with GCC
|
||||
ColumnQuantizer<ElemType>::template ComputeRangeStatColj<false>(inMatrix.Data(), inResidual.Data(), (long) nRow, j, nBits, qcol.lower, qcol.upper);
|
||||
}
|
||||
{
|
||||
auto& qcol = *(outQMatrix.GetQuantizedColumn(j));
|
||||
if (zeroThresholdFor1Bit)
|
||||
{
|
||||
// Explicit use of 'template' keyword is needed to compile with GCC
|
||||
ColumnQuantizer<ElemType>::template ComputeRangeStatColj<true>(inMatrix.Data(), inResidual.Data(), (long) nRow, j, nBits, qcol.lower, qcol.upper);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Explicit use of 'template' keyword is needed to compile with GCC
|
||||
ColumnQuantizer<ElemType>::template ComputeRangeStatColj<false>(inMatrix.Data(), inResidual.Data(), (long) nRow, j, nBits, qcol.lower, qcol.upper);
|
||||
}
|
||||
|
||||
ColumnQuantizer<ElemType> q(ldNbits, qcol.lower, qcol.upper);
|
||||
if (zeroThresholdFor1Bit)
|
||||
{
|
||||
// Explicit use of 'template' keyword is needed to compile with GCC
|
||||
q.template Quantize<true>(inMatrix.Data(), inResidual.Data(), (long) nRow, j, qcol.bits, outResidual.Data());
|
||||
}
|
||||
else
|
||||
{
|
||||
// Explicit use of 'template' keyword is needed to compile with GCC
|
||||
q.template Quantize<false>(inMatrix.Data(), inResidual.Data(), (long) nRow, j, qcol.bits, outResidual.Data());
|
||||
}
|
||||
}
|
||||
ColumnQuantizer<ElemType> q(ldNbits, qcol.lower, qcol.upper);
|
||||
if (zeroThresholdFor1Bit)
|
||||
{
|
||||
// Explicit use of 'template' keyword is needed to compile with GCC
|
||||
q.template Quantize<true>(inMatrix.Data(), inResidual.Data(), (long) nRow, j, qcol.bits, outResidual.Data());
|
||||
}
|
||||
else
|
||||
{
|
||||
// Explicit use of 'template' keyword is needed to compile with GCC
|
||||
q.template Quantize<false>(inMatrix.Data(), inResidual.Data(), (long) nRow, j, qcol.bits, outResidual.Data());
|
||||
}
|
||||
}
|
||||
#ifdef QUANTUSEPPL
|
||||
);
|
||||
);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -89,13 +89,13 @@ void MatrixQuantizerCPU<ElemType>::UnquantizeAsync(QuantizedMatrix<ElemType>& in
|
|||
#else
|
||||
for (size_t j = 0; j < nCol; j++)
|
||||
#endif
|
||||
{
|
||||
const auto& qcol = *(inQMatrix.GetQuantizedColumn(j));
|
||||
ColumnQuantizer<ElemType> q(ldNbits, qcol.lower, qcol.upper);
|
||||
q.Unquantize(outMatrix.Data(), (long) nRow, j, qcol.bits, add);
|
||||
}
|
||||
{
|
||||
const auto& qcol = *(inQMatrix.GetQuantizedColumn(j));
|
||||
ColumnQuantizer<ElemType> q(ldNbits, qcol.lower, qcol.upper);
|
||||
q.Unquantize(outMatrix.Data(), (long) nRow, j, qcol.bits, add);
|
||||
}
|
||||
#ifdef QUANTUSEPPL
|
||||
);
|
||||
);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -108,4 +108,5 @@ void MatrixQuantizerCPU<ElemType>::WaitUnquantizeAsyncDone()
|
|||
//The explicit instantiation part will make the linker happy
|
||||
template class MatrixQuantizerCPU<float>;
|
||||
template class MatrixQuantizerCPU<double>;
|
||||
} } }
|
||||
|
||||
}}}
|
||||
|
|
|
@ -182,4 +182,5 @@ void QuantizedMatrix<ElemType>::Print(const char* matrixName, size_t rowStart, s
|
|||
// Explicit instantiation
|
||||
template class QuantizedMatrix<float>;
|
||||
template class QuantizedMatrix<double>;
|
||||
} } }
|
||||
|
||||
}}}
|
||||
|
|
|
@ -119,4 +119,5 @@ private:
|
|||
template <typename T>
|
||||
friend class MatrixQuantizer;
|
||||
};
|
||||
} } }
|
||||
|
||||
}}}
|
||||
|
|
|
@ -83,10 +83,12 @@ public:
|
|||
}
|
||||
else
|
||||
{
|
||||
// make the range asymmetrical, so we get a 0 slot
|
||||
size_t usedrangeend = rangeend - (Nbits > 1); // TODO: make this a parameter
|
||||
// precompute this for quantize() (see comment there)
|
||||
qfactor = rangeend / (quantimax - quantimin);
|
||||
qfactor = usedrangeend / (quantimax - quantimin);
|
||||
// and for unquantize()
|
||||
ufactor = (quantimax - quantimin) / rangeend;
|
||||
ufactor = (quantimax - quantimin) / usedrangeend;
|
||||
}
|
||||
|
||||
// set the quantization threshold for the special case of 1-bit
|
||||
|
@ -127,6 +129,7 @@ public:
|
|||
// unquantize one value
|
||||
cudasharedcode ElemType Unquantize(QWordVal u) const
|
||||
{
|
||||
// special branch that does not quantize at all, for testing
|
||||
if (Nbits == QWordNumBits)
|
||||
{
|
||||
return *(ElemType*) &u;
|
||||
|
|
|
@@ -311,10 +311,10 @@ public:
wstring key;
if (!labels.empty()) // empty means unsupervised mode (don't load any)
{
#ifdef _MSC_VER
#ifdef _WIN32
key = regex_replace((wstring) ppath, wregex(L"\\.[^\\.\\\\/:]*$"), wstring()); // delete extension (or not if none)
#else
key = removeExtension(basename(ppath));
key = removeExtension(ppath);
#endif
if (labels.find(key) == labels.end())
{

@@ -630,9 +630,8 @@ public:
{
#ifdef _WIN32
key = regex_replace((wstring) ppath, wregex(L"\\.[^\\.\\\\/:]*$"), wstring()); // delete extension (or not if none)
#endif
#ifdef __unix__
key = removeExtension(basename(ppath));
#else
key = removeExtension(ppath);
#endif
if (labels[0].find(key) == labels[0].end())
{
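Both branches above derive the label-lookup key by stripping only the extension from the path. StripExtension below is an illustrative stand-in for that behavior (not the actual removeExtension helper), matching the regex's rule that a dot inside a directory name does not count as an extension:

// Illustrative sketch of the key derivation above.
#include <cstdio>
#include <string>

std::wstring StripExtension(const std::wstring& path)
{
    size_t dot = path.find_last_of(L'.');
    size_t sep = path.find_last_of(L"/\\");
    if (dot == std::wstring::npos || (sep != std::wstring::npos && dot < sep))
        return path;              // no extension after the last path separator
    return path.substr(0, dot);   // drop ".ext"
}

int main()
{
    std::wprintf(L"%ls\n", StripExtension(L"data/speech/utt0001.mfc").c_str());  // data/speech/utt0001
    std::wprintf(L"%ls\n", StripExtension(L"data/archive.v2/utt0002").c_str());  // unchanged (no extension)
}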
@@ -74,7 +74,7 @@
</ItemDefinitionGroup>
<ItemDefinitionGroup>
<ClCompile>
<AdditionalIncludeDirectories>$(SolutionDir)Source\Common\Include;$(SolutionDir)Source\Math;$(OpenCvInclude);$(ZipInclude);$(SolutionDir)Source\Readers\ReaderLib</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>$(SolutionDir)Source\Common\Include;$(SolutionDir)Source\Math;$(OpenCvInclude);$(ZipInclude);$(SolutionDir)Source\Readers\ReaderLib;$(BOOST_INCLUDE_PATH)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<AdditionalLibraryDirectories>$(OutDir);$(OpenCvLibPath);$(ZipLibPath)</AdditionalLibraryDirectories>

@@ -127,10 +127,11 @@
<ClCompile Include="ZipByteReader.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<Target Name="Build" Condition="$(HasOpenCv)" Outputs="$(TargetPath)" DependsOnTargets="$(BuildDependsOn)" />
<Target Name="Build" Condition="$(HasOpenCv) And $(HasBoost)" Outputs="$(TargetPath)" DependsOnTargets="$(BuildDependsOn)" />
<ImportGroup Label="ExtensionTargets" />
<Target Name="CheckDependencies">
<Warning Condition="!$(HasBoost)" Text="ImageReader requires the Boost library to build. Please see https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#boost for installation instructions." />
<Warning Condition="!$(HasOpenCv)" Text="ImageReader requires the OpenCV library to build. Please see https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#opencv for installation instructions." />
<Warning Condition="!$(UseZip)" Text="zlib and libzip libraries were not found, ImageReader will be built without zip container support. Please see https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#libzip for installation instructions." />
</Target>
</Project>
</Project>