xiaohuliu 2016-08-08 15:23:32 -07:00
Parent commits: eccd3d0571 595c7abe72
Commit: 9db12ddf3d
286 changed files with 74533 additions and 57215 deletions

.gitattributes (vendored)

@ -6,6 +6,7 @@ Dockerfile-GPU text
*.counts text
*.labels text
*.feats text
*.ctf text
*.post text
*.cpu text
*.gpu text
@ -19,6 +20,7 @@ Dockerfile-GPU text
*.md text
*.txt text
*.TXT text
*.html text
*.lyx text
*.bib text
@ -44,6 +46,9 @@ make_binary_drop_linux text eol=lf
Tests/EndToEndTests/Examples/Speech/TIMIT/WriteBottleneck/expected_output_md5sum.*.txt eol=lf
Tests/EndToEndTests/Examples/Speech/TIMIT/WriteScaledLogLike/expected_output_md5sum.*.txt eol=lf
# Used by reader unit test, needs to keep LF line endings.
Tests/UnitTests/ReaderTests/Data/CNTKTextFormatReader/invalid_inputs.txt eol=lf
Makefile text
*.sln text
*.vcxproj text
@ -106,6 +111,10 @@ TIMIT*.statelist text
TIMIT*.tfsa text
TIMIT*.transitions text
Examples/Text/ATIS/data/ATIS.* text
Examples/SequenceToSequence/CMUDict/Data/cmudict-0.7b* text
# Binary extensions:
*.vsdm binary
*.pdf binary

.gitignore (vendored)

@ -65,6 +65,7 @@ ipch/
*.opensdf
*.sdf
*.cachefile
*.userosscache
# Visual Studio profiler
*.psess


@ -934,7 +934,7 @@ EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Data", "Data", "{D11F76CC-DB6D-4CB4-B3B7-AB139DE2F5FA}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\Text\SequenceClassification\Data\embeddingmatrix.txt = Tests\EndToEndTests\Text\SequenceClassification\Data\embeddingmatrix.txt
Tests\EndToEndTests\Text\SequenceClassification\Data\Train.txt = Tests\EndToEndTests\Text\SequenceClassification\Data\Train.txt
Tests\EndToEndTests\Text\SequenceClassification\Data\Train.ctf = Tests\EndToEndTests\Text\SequenceClassification\Data\Train.ctf
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "SLU", "SLU", "{181664AC-4C95-4798-A923-09B879215B33}"
@ -1120,6 +1120,7 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKv2LibraryDll", "Source\
{928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {928ABD1B-4D3B-4017-AEF1-0FA1B4467513}
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
{F0A9637C-20DA-42F0-83D4-23B4704DE602} = {F0A9637C-20DA-42F0-83D4-23B4704DE602}
{EAD17188-072C-4726-B840-A769C36DAD1B} = {EAD17188-072C-4726-B840-A769C36DAD1B}
{DE3C54E5-D7D0-47AF-A783-DFDCE59E7937} = {DE3C54E5-D7D0-47AF-A783-DFDCE59E7937}
EndProjectSection
@ -1147,6 +1148,11 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CPPEvalClient", "Examples\E
{482999D1-B7E2-466E-9F8D-2119F93EAFD9} = {482999D1-B7E2-466E-9F8D-2119F93EAFD9}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "BrainScriptTests", "Tests\UnitTests\BrainScriptTests\BrainScriptTests.vcxproj", "{9F999212-AFC5-4EAC-AA78-F7247D46C456}"
ProjectSection(ProjectDependencies) = postProject
{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
EndProjectSection
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug_CpuOnly|x64 = Debug_CpuOnly|x64
@ -1425,6 +1431,14 @@ Global
{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E}.Release|x64.ActiveCfg = Release|x64
{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E}.Release|x64.Build.0 = Release|x64
{9F999212-AFC5-4EAC-AA78-F7247D46C456}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64
{9F999212-AFC5-4EAC-AA78-F7247D46C456}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64
{9F999212-AFC5-4EAC-AA78-F7247D46C456}.Debug|x64.ActiveCfg = Debug|x64
{9F999212-AFC5-4EAC-AA78-F7247D46C456}.Debug|x64.Build.0 = Debug|x64
{9F999212-AFC5-4EAC-AA78-F7247D46C456}.Release_CpuOnly|x64.ActiveCfg = Release_CpuOnly|x64
{9F999212-AFC5-4EAC-AA78-F7247D46C456}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
{9F999212-AFC5-4EAC-AA78-F7247D46C456}.Release|x64.ActiveCfg = Release|x64
{9F999212-AFC5-4EAC-AA78-F7247D46C456}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
@ -1583,5 +1597,6 @@ Global
{3385EBEA-5F97-4B2B-9F30-0E6D7F91B9CA} = {47755F2E-D674-4175-9E38-8EA053455072}
{1C6E6C53-1AA7-4B69-913E-B97BB5A872CF} = {3385EBEA-5F97-4B2B-9F30-0E6D7F91B9CA}
{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E} = {3385EBEA-5F97-4B2B-9F30-0E6D7F91B9CA}
{9F999212-AFC5-4EAC-AA78-F7247D46C456} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE}
EndGlobalSection
EndGlobal


@ -863,38 +863,27 @@ The dimension reduced matrix consisting of the maximum value within each pooling
This function is often associated with Convolution() operations.
### Delay
### PastValue, FutureValue
Delay node used in recurrent networks, allows creation of a loop in the convolutional network that will repeat a specified number of times.
PastValue and FutureValue nodes are used in recurrent networks; they allow creation of a loop in the computational network that will repeat a specified number of times. PastValue retrieves the value of a node from several steps in the past, while FutureValue retrieves the value of a node from the future.
`Delay(rows, [cols], delayNode, delayTime=1, needGradient=true, defaultHiddenActivity=0.1)`
`PastValue(rows, [cols], node, timeStep=1, defaultHiddenActivity=0.1)`
`FutureValue(rows, [cols], node, timeStep=1, defaultHiddenActivity=0.1)`
#### Parameters
`cvweight` – convolution weight matrix, it has the dimensions of \[outputChannels, kernelWidth \* kernelHeight \* inputChannels\]
`rows` – number of rows in the node
`kernelWidth` – width of the kernel
`cols` – number of columns in the node. This value is often omitted, since the length of a sequence varies
`kernelHeight` – height of the kernel
`timeStep` – \[default = 1\] number of time steps toward the past or future
`outputChannels` – number of output channels
`horizontalSubsample` – subsamples in the horizontal direction
`verticalSubsample` – subsamples in the vertical direction
#### Optional Parameters
`delayTime` – \[default = 1\] the amount of delay that will be introduced (number of times the loop will happen)
`needGradient` – \[default = true\] does the gradient need to be computed for this node
`defaultHiddenActivity` – \[default = 0.1\] the numerical amount for the defaultHiddenActivity
`defaultHiddenActivity` – \[default = 0.1\] default value to use when crossing the sequence boundary or when the value is missing.
#### Returns
The results of the completed Delay loop
Either the past or future value of a node
#### Notes
This node is used in recurrent networks, where a delay is introduced to examine values from a previous time, such as the prior value (t-1). This has the affect of creating a loop in the computational network that will repeat delayTime number of iterations.
This node is used in recurrent networks, where a past value is introduced to examine values from a previous time step, such as the prior value (t-1). This has the effect of creating a loop in the computational network.
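As a minimal sketch (not part of this commit), a one-step recurrence built with `PastValue` could look as follows; `W`, `R`, `x`, and `hiddenDim` are illustrative names assumed to be defined elsewhere in the network description:
```
# feed the output of the previous time step back into the current step
prevH = PastValue(hiddenDim, h, timeStep=1, defaultHiddenActivity=0.1);
h = Sigmoid(Plus(Times(W, x), Times(R, prevH)));
```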


@ -37,19 +37,22 @@ int main(int argc, char* argv[])
std::string app = argv[0];
std::string path;
IEvaluateModel<float> *model;
size_t pos;
#ifdef _WIN32
path = app.substr(0, app.rfind("\\"));
pos = app.rfind("\\");
path = (pos == std::string::npos) ? "." : app.substr(0, pos);
// This relative path assumes launching from CNTK's binary folder, e.g. x64\Release
const std::string modelWorkingDirectory = path + "/../../Examples/Image/MNIST/Data/";
#else // on Linux
path = app.substr(0, app.rfind("/"));
pos = app.rfind("/");
path = (pos == std::string::npos) ? "." : app.substr(0, pos);
// This relative path assumes launching from CNTK's binary folder, e.g. build/release/bin/
const std::string modelWorkingDirectory = path + "/../../../Examples/Image/MNIST/Data/";
#endif
GetEvalF(&model);
const std::string modelFilePath = modelWorkingDirectory + "../Output/Models/01_OneHidden";


@ -110,9 +110,15 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
public static bool Evaluate(string record)
{
var model = Models.Take();
var outcome = model.EvaluateRecord(record);
Models.Add(model);
return outcome;
try
{
var outcome = model.EvaluateRecord(record);
return outcome;
}
finally
{
Models.Add(model);
}
}
/// <summary>
@ -123,9 +129,15 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
public static List<float> Evaluate(List<float> inputs)
{
var model = Models.Take();
var outcome = model.EvaluateInput(inputs);
Models.Add(model);
return outcome;
try
{
var outcome = model.EvaluateInput(inputs);
return outcome;
}
finally
{
Models.Add(model);
}
}
/// <summary>


@ -1,69 +1,69 @@
'
</s>
<s/>
<s>
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
~AA
~AE
~AH
~AO
~AW
~AY
~B
~CH
~D
~DH
~EH
~ER
~EY
~F
~G
~HH
~IH
~IY
~JH
~K
~L
~M
~N
~NG
~OW
~OY
~P
~R
~S
~SH
~T
~TH
~UH
~UW
~V
~W
~Y
~Z
~ZH
'
</s>
<s/>
<s>
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
~AA
~AE
~AH
~AO
~AW
~AY
~B
~CH
~D
~DH
~EH
~ER
~EY
~F
~G
~HH
~IH
~IY
~JH
~K
~L
~M
~N
~NG
~OW
~OY
~P
~R
~S
~SH
~T
~TH
~UH
~UW
~V
~W
~Y
~Z
~ZH


@ -18,9 +18,9 @@ ndlMacroDefine = [
]
LSTMPComponent(inputDim, outputDim, cellDim, inputx, cellDimX2, cellDimX3, cellDimX4) = [
wx = Parameter(cellDimX4, inputDim, init="uniform", initValueScale=1);
b = Parameter(cellDimX4, 1, init="fixedValue", value=0.0);
Wh = Parameter(cellDimX4, outputDim, init="uniform", initValueScale=1);
wx = Parameter(cellDimX4, 0, init="uniform", initValueScale=1);
b = Parameter(cellDimX4, 1, init="fixedValue", value=0.0);
Wh = Parameter(cellDimX4, 0, init="uniform", initValueScale=1);
Wci = Parameter(cellDim, init="uniform", initValueScale=1);
Wcf = Parameter(cellDim, init="uniform", initValueScale=1);
@ -63,9 +63,9 @@ ndlMacroDefine = [
]
LSTMPComponentBetter(inputDim, outputDim, cellDim, inputx, cellDimX2, cellDimX3, cellDimX4) = [
wx = Parameter(cellDimX4, inputDim, init="uniform", initValueScale=1);
b = Parameter(cellDimX4, 1, init="fixedValue", value=0.0);
Wh = Parameter(cellDimX4, outputDim, init="uniform", initValueScale=1);
wx = Parameter(cellDimX4, 0, init="uniform", initValueScale=1);
b = Parameter(cellDimX4, 1, init="fixedValue", value=0.0);
Wh = Parameter(cellDimX4, 0, init="uniform", initValueScale=1);
Wci = Parameter(cellDim, init="uniform", initValueScale=1);
Wcf = Parameter(cellDim, init="uniform", initValueScale=1);
@ -112,26 +112,26 @@ ndlMacroDefine = [
]
LSTMPComponentNaive(inputDim, outputDim, cellDim, inputx) = [
Wxo = Parameter(cellDim, inputDim, init="uniform", initValueScale=1);
Wxi = Parameter(cellDim, inputDim, init="uniform", initValueScale=1);
Wxf = Parameter(cellDim, inputDim, init="uniform", initValueScale=1);
Wxc = Parameter(cellDim, inputDim, init="uniform", initValueScale=1);
Wxo = Parameter(cellDim, 0, init="uniform", initValueScale=1);
Wxi = Parameter(cellDim, 0, init="uniform", initValueScale=1);
Wxf = Parameter(cellDim, 0, init="uniform", initValueScale=1);
Wxc = Parameter(cellDim, 0, init="uniform", initValueScale=1);
bo = Parameter(cellDim, init="fixedValue", value=0.0);
bc = Parameter(cellDim, init="fixedValue", value=0.0);
bi = Parameter(cellDim, init="fixedValue", value=0.0);
bf = Parameter(cellDim, init="fixedValue", value=0.0);
Whi = Parameter(cellDim, outputDim, init="uniform", initValueScale=1);
Whi = Parameter(cellDim, 0, init="uniform", initValueScale=1);
Wci = Parameter(cellDim, init="uniform", initValueScale=1);
Whf = Parameter(cellDim, outputDim, init="uniform", initValueScale=1);
Whf = Parameter(cellDim, 0, init="uniform", initValueScale=1);
Wcf = Parameter(cellDim, init="uniform", initValueScale=1);
Who = Parameter(cellDim, outputDim, init="uniform", initValueScale=1);
Who = Parameter(cellDim, 0, init="uniform", initValueScale=1);
Wco = Parameter(cellDim, init="uniform", initValueScale=1);
Whc = Parameter(cellDim, outputDim, init="uniform", initValueScale=1);
Whc = Parameter(cellDim, 0, init="uniform", initValueScale=1);
dh = PastValue(outputDim, output, timeStep=1);
dc = PastValue(cellDim, ct, timeStep=1);
@ -194,8 +194,8 @@ ndlCreateNetwork_LSTMP_c1024_p256_x3 = [
# layer 3
LSTMoutput3 = LSTMPComponent(hiddenDim, hiddenDim, cellDim, LSTMoutput2, cellDimX2, cellDimX3, cellDimX4);
W = Parameter(labelDim, hiddenDim, init="uniform", initValueScale=1);
b = Parameter(labelDim, 1, init="fixedValue", value=0);
W = Parameter(labelDim, 0, init="uniform", initValueScale=1);
b = Parameter(labelDim, 1, init="fixedValue", value=0);
LSTMoutputW = Plus(Times(W, LSTMoutput3), b);
ce = CrossEntropyWithSoftmax(labels, LSTMoutputW);

Makefile

@ -17,8 +17,10 @@
# version for the CNTK custom MKL installation
# MKL_THREADING=parallel|sequential
# only needed if MATHLIB=mkl
# GDK_PATH= path to cuda gdk installation, so $(GDK_PATH)/include/nvidia/gdk/nvml.h exists
# defaults to /usr
# GDK_INCLUDE_PATH= path to CUDA GDK include path, so $(GDK_INCLUDE_PATH)/nvml.h exists
# defaults to /usr/include/nvidia/gdk
# GDK_NVML_LIB_PATH= path to CUDA GDK (stub) library path, so $(GDK_NVML_LIB_PATH)/libnvidia-ml.so exists
# defaults to /usr/src/gdk/nvml/lib
# MATHLIB= One of acml or mkl
# defaults to acml
# CUDA_PATH= Path to CUDA
@ -29,10 +31,12 @@
# If not specified, CNTK will be built without cuDNN.
# KALDI_PATH= Path to Kaldi
# If not specified, Kaldi plugins will not be built
# OPENCV_PATH= path to OpenCV 3.0.0 installation, so $(OPENCV_PATH) exists
# defaults to /usr/local/opencv-3.0.0
# OPENCV_PATH= path to OpenCV 3.1.0 installation, so $(OPENCV_PATH) exists
# defaults to /usr/local/opencv-3.1.0
# LIBZIP_PATH= path to libzip installation, so $(LIBZIP_PATH) exists
# defaults to /usr/local/
# BOOST_PATH= path to Boost installation, so $(BOOST_PATH)/include/boost/test/unit_test.hpp exists
# defaults to /usr/local/boost-1.60.0
# These can be overridden on the command line, e.g. make BUILDTYPE=debug
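# For example, a build overriding the GDK and Boost locations could be invoked as
# follows (a sketch, not part of this commit; the paths shown are simply the
# defaults listed above and should be adjusted to the local installation):
#   make BUILDTYPE=release GDK_INCLUDE_PATH=/usr/include/nvidia/gdk \
#        GDK_NVML_LIB_PATH=/usr/src/gdk/nvml/lib BOOST_PATH=/usr/local/boost-1.60.0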
# TODO: Build static libraries for common dependencies that are shared by multiple
@ -71,7 +75,7 @@ INCLUDEPATH:= $(addprefix $(SOURCEDIR)/, Common/Include CNTKv2LibraryDll CNTKv2L
# COMMON_FLAGS include settings that are passed both to NVCC and C++ compilers.
COMMON_FLAGS:= -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K -std=c++11
CPPFLAGS:=
CXXFLAGS:= -msse3 -mssse3 -std=c++0x -fopenmp -fpermissive -fPIC -Werror -fcheck-new
CXXFLAGS:= -msse4.1 -mssse3 -std=c++0x -fopenmp -fpermissive -fPIC -Werror -fcheck-new
LIBPATH:=
LIBS:=
LDFLAGS:=
@ -93,9 +97,14 @@ all : buildall
CUFLAGS = -m 64
ifdef CUDA_PATH
ifndef GDK_PATH
$(info defaulting GDK_PATH to /usr)
GDK_PATH=/usr
ifndef GDK_INCLUDE_PATH
GDK_INCLUDE_PATH=/usr/include/nvidia/gdk
$(info defaulting GDK_INCLUDE_PATH to $(GDK_INCLUDE_PATH))
endif
ifndef GDK_NVML_LIB_PATH
GDK_NVML_LIB_PATH=/usr/src/gdk/nvml/lib
$(info defaulting GDK_NVML_LIB_PATH to $(GDK_NVML_LIB_PATH))
endif
ifndef CUB_PATH
@ -107,10 +116,8 @@ ifdef CUDA_PATH
NVCC = $(CUDA_PATH)/bin/nvcc
# This is a suggested/default location for NVML
INCLUDEPATH+=$(GDK_PATH)/include/nvidia/gdk
INCLUDEPATH+=$(GDK_INCLUDE_PATH)
INCLUDEPATH+=$(CUB_PATH)
NVMLLIBPATH=$(GDK_PATH)/src/gdk/nvml/lib
# Set up CUDA includes and libraries
INCLUDEPATH += $(CUDA_PATH)/include
@ -328,7 +335,7 @@ $(CNTKMATH_LIB): $(MATH_OBJ)
@echo $(SEPARATOR)
@echo creating $@ for $(ARCH) with build type $(BUILDTYPE)
@mkdir -p $(dir $@)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBPATH) $(NVMLLIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -fopenmp
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -fopenmp
########################################
# CNTKLibrary
@ -368,13 +375,17 @@ SEQUENCE_TRAINING_LIB_SRC +=\
endif
CNTKLIBRARY_SRC =\
$(SOURCEDIR)/CNTKv2LibraryDll/BackCompat.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/Common.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/Function.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/MinibatchSource.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/NDArrayView.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/NDMask.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/Trainer.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/Utils.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/Value.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/Variable.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/Learner.cpp \
CNTKLIBRARY_SRC+=$(CNTK_COMMON_SRC)
CNTKLIBRARY_SRC+=$(COMPUTATION_NETWORK_LIB_SRC)
@ -393,7 +404,7 @@ $(CNTKLIBRARY_LIB): $(CNTKLIBRARY_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building output for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(NVMLLIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH)
########################################
# CNTKLibrary tests
@ -405,6 +416,8 @@ CNTKLIBRARY_TESTS_SRC =\
Tests/UnitTests/V2LibraryTests/NDArrayViewTests.cpp \
Tests/UnitTests/V2LibraryTests/RecurrentFunctionTests.cpp \
Tests/UnitTests/V2LibraryTests/TensorTests.cpp \
Tests/UnitTests/V2LibraryTests/TrainerTests.cpp \
Tests/UnitTests/V2LibraryTests/CifarResNet.cpp \
CNTKLIBRARY_TESTS:=$(BINDIR)/v2librarytests
CNTKLIBRARY_TESTS_OBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(patsubst %.cpp, $(OBJDIR)/%.o, $(CNTKLIBRARY_TESTS_SRC)))
@ -416,7 +429,7 @@ $(CNTKLIBRARY_TESTS): $(CNTKLIBRARY_TESTS_OBJ) | $(CNTKLIBRARY_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building output for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(NVMLLIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKLIBRARY) -l$(CNTKMATH)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKLIBRARY) -l$(CNTKMATH)
########################################
# LibEval
@ -437,7 +450,7 @@ EVAL_SRC=\
$(SOURCEDIR)/ActionsLib/NetworkFactory.cpp \
$(SOURCEDIR)/ActionsLib/NetworkDescriptionLanguage.cpp \
$(SOURCEDIR)/ActionsLib/SimpleNetworkBuilder.cpp \
$(SOURCEDIR)/ActionsLib/NDLNetworkBuilder.cpp
$(SOURCEDIR)/ActionsLib/NDLNetworkBuilder.cpp \
EVAL_SRC+=$(SGDLIB_SRC)
EVAL_SRC+=$(COMPUTATION_NETWORK_LIB_SRC)
@ -450,11 +463,11 @@ EVAL_LIB:=$(LIBDIR)/lib$(EVAL).so
ALL+=$(EVAL_LIB)
SRC+=$(EVAL_SRC)
$(EVAL_LIB): $(EVAL_OBJ)
$(EVAL_LIB): $(EVAL_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo Building $(EVAL_LIB) for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(NVMLLIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH)
########################################
# Eval Sample client
@ -469,11 +482,11 @@ EVAL_SAMPLE_CLIENT_OBJ:=$(patsubst %.cpp, $(OBJDIR)/%.o, $(EVAL_SAMPLE_CLIENT_SR
ALL+=$(EVAL_SAMPLE_CLIENT)
SRC+=$(EVAL_SAMPLE_CLIENT_SRC)
$(EVAL_SAMPLE_CLIENT): $(EVAL_SAMPLE_CLIENT_OBJ) | $(EVAL_LIB) $(CNTKMATH_LIB)
$(EVAL_SAMPLE_CLIENT): $(EVAL_SAMPLE_CLIENT_OBJ) | $(EVAL_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $(EVAL_SAMPLE_CLIENT) for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ -l$(EVAL) -l$(CNTKMATH)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(EVAL) -l$(CNTKMATH)
########################################
# BinaryReader plugin
@ -770,7 +783,6 @@ CNTK_SRC =\
$(SOURCEDIR)/ActionsLib/NDLNetworkBuilder.cpp \
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptEvaluator.cpp \
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptParser.cpp \
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptTest.cpp \
CNTK_SRC+=$(SGDLIB_SRC)
CNTK_SRC+=$(CNTK_COMMON_SRC)
@ -787,7 +799,7 @@ $(CNTK): $(CNTK_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building output for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(NVMLLIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH) -fopenmp
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH) -fopenmp
# deployable resources: standard library of BS
CNTK_CORE_BS:=$(BINDIR)/cntk.core.bs
@ -797,6 +809,151 @@ $(CNTK_CORE_BS): $(SOURCEDIR)/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs
@echo bin-placing deployable resource files
cp -f $^ $@
########################################
# Unit Tests
########################################
# only build unit tests when Boost is available
ifdef BOOST_PATH
INCLUDEPATH += $(BOOST_PATH)/include
BOOSTLIB_PATH = $(BOOST_PATH)/lib
BOOSTLIBS := -lboost_unit_test_framework -lboost_filesystem -lboost_system
UNITTEST_EVAL_SRC = \
$(SOURCEDIR)/../Tests/UnitTests/EvalTests/EvalExtendedTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/EvalTests/stdafx.cpp
UNITTEST_EVAL_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_EVAL_SRC))
UNITTEST_EVAL := $(BINDIR)/evaltests
ALL += $(UNITTEST_EVAL)
SRC += $(UNITTEST_EVAL_SRC)
$(UNITTEST_EVAL) : $(UNITTEST_EVAL_OBJ) | $(EVAL_LIB) $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(LIBS) -l$(EVAL) -l$(CNTKMATH)
#TODO: create project specific makefile or rules to avoid adding project specific path to the global path
INCLUDEPATH += $(SOURCEDIR)/Readers/CNTKTextFormatReader
UNITTEST_READER_SRC = \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/CNTKTextFormatReaderTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/HTKLMFReaderTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/ImageReaderTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/ReaderLibTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/UCIFastReaderTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/stdafx.cpp \
$(SOURCEDIR)/Readers/CNTKTextFormatReader/Indexer.cpp \
$(SOURCEDIR)/Readers/CNTKTextFormatReader/TextParser.cpp \
UNITTEST_READER_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_READER_SRC))
UNITTEST_READER := $(BINDIR)/readertests
ALL += $(UNITTEST_READER)
SRC += $(UNITTEST_READER_SRC)
$(UNITTEST_READER): $(UNITTEST_READER_OBJ) | $(HTKMLFREADER) $(HTKDESERIALIZERS) $(UCIFASTREADER) $(COMPOSITEDATAREADER) $(IMAGEREADER) $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) -l$(CNTKMATH) -ldl
UNITTEST_NETWORK_SRC = \
$(SOURCEDIR)/../Tests/UnitTests/NetworkTests/OperatorEvaluation.cpp \
$(SOURCEDIR)/../Tests/UnitTests/NetworkTests/stdafx.cpp \
$(SOURCEDIR)/CNTK/ModelEditLanguage.cpp \
$(SOURCEDIR)/ActionsLib/TrainActions.cpp \
$(SOURCEDIR)/ActionsLib/EvalActions.cpp \
$(SOURCEDIR)/ActionsLib/OtherActions.cpp \
$(SOURCEDIR)/ActionsLib/SpecialPurposeActions.cpp \
$(SOURCEDIR)/ActionsLib/NetworkFactory.cpp \
$(SOURCEDIR)/ActionsLib/NetworkDescriptionLanguage.cpp \
$(SOURCEDIR)/ActionsLib/SimpleNetworkBuilder.cpp \
$(SOURCEDIR)/ActionsLib/NDLNetworkBuilder.cpp \
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptEvaluator.cpp \
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptParser.cpp \
UNITTEST_NETWORK_SRC += $(COMPUTATION_NETWORK_LIB_SRC)
UNITTEST_NETWORK_SRC += $(CNTK_COMMON_SRC)
UNITTEST_NETWORK_SRC += $(SEQUENCE_TRAINING_LIB_SRC)
UNITTEST_NETWORK_SRC += $(SGDLIB_SRC)
UNITTEST_NETWORK_OBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_NETWORK_SRC)))
UNITTEST_NETWORK := $(BINDIR)/networktests
ALL += $(UNITTEST_NETWORK)
SRC += $(UNITTEST_NETWORK_SRC)
$(UNITTEST_NETWORK): $(UNITTEST_NETWORK_OBJ) | $(CNTKMATH_LIB) $(CNTKTEXTFORMATREADER)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(LIBS) -l$(CNTKMATH) -fopenmp
UNITTEST_MATH_SRC = \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/BatchNormalizationEngineTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/BlockMultiplierTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/constants.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/ConvolutionEngineTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/CPUMatrixTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/CPUSparseMatrixTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/fixtures.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/GPUMatrixCudaBlasTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/GPUMatrixTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/GPUSparseMatrixTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixBlasTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixDataSynchronizationTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixFileWriteReadTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixQuantizerTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixSparseDenseInteractionsTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/stdafx.cpp \
UNITTEST_MATH_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_MATH_SRC))
UNITTEST_MATH := $(BINDIR)/mathtests
ALL += $(UNITTEST_MATH)
SRC += $(UNITTEST_MATH_SRC)
$(UNITTEST_MATH): $(UNITTEST_MATH_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(LIBS) -l$(CNTKMATH) -ldl -fopenmp
UNITTEST_BRAINSCRIPT_SRC = \
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptEvaluator.cpp \
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptParser.cpp \
$(SOURCEDIR)/../Tests/UnitTests/BrainScriptTests/ParserTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/BrainScriptTests/stdafx.cpp
UNITTEST_BRAINSCRIPT_SRC+=$(COMMON_SRC)
UNITTEST_BRAINSCRIPT_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_BRAINSCRIPT_SRC))
UNITTEST_BRAINSCRIPT := $(BINDIR)/brainscripttests
ALL += $(UNITTEST_BRAINSCRIPT)
SRC += $(UNITTEST_BRAINSCRIPT_SRC)
$(UNITTEST_BRAINSCRIPT): $(UNITTEST_BRAINSCRIPT_OBJ)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(LIBS) -ldl
unittests: $(UNITTEST_EVAL) $(UNITTEST_READER) $(UNITTEST_NETWORK) $(UNITTEST_MATH) $(UNITTEST_BRAINSCRIPT)
endif
########################################
# General compile and dependency rules
########################################
@ -821,13 +978,13 @@ $(OBJDIR)/%.o : %.cu $(BUILD_CONFIGURATION)
@mkdir -p $(dir $@)
$(NVCC) -c $< -o $@ $(COMMON_FLAGS) $(CUFLAGS) $(INCLUDEPATH:%=-I%) -Xcompiler "-fPIC -Werror"
$(OBJDIR)/%.o : %.cpp $(BUILD_CONFIGURATION)
$(OBJDIR)/%.o : %.cpp $(BUILD_CONFIGURATION)
@echo $(SEPARATOR)
@echo creating $@ for $(ARCH) with build type $(BUILDTYPE)
@mkdir -p $(dir $@)
$(CXX) -c $< -o $@ $(COMMON_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(INCLUDEPATH:%=-I%) -MD -MP -MF ${@:.o=.d}
.PHONY: clean buildall all
.PHONY: clean buildall all unittests
clean:
@echo $(SEPARATOR)


@ -1,6 +1,11 @@
# CNTK
## Latest news
*2016-07-15.* V 1.6 Binary release
CNTK v.1.6 binaries are on the [CNTK Releases page](https://github.com/Microsoft/CNTK/releases)
*2016-07-12.* We have further expanded the licensing options for CNTK 1bit-SGD and related components. See the details on the [Wiki page](https://github.com/microsoft/cntk/wiki/CNTK-1bit-SGD-License). These new options are an extension of the CNTK 1bit-SGD License that we announced on Jun 23, 2016.
*2016-07-05.* CNTK now supports *Deconvolution* and *Unpooling*. See the usage example in Network number 4 in the [MNIST Sample](https://github.com/Microsoft/CNTK/blob/master/Examples/Image/MNIST/README.md).
*2016-06-23.* New License Terms for CNTK 1bit-SGD and related components.
@ -8,12 +13,6 @@ Effective immediately the License Terms for CNTK 1bit-SGD and related components
*2016-06-20.* A [post](http://itpeernetwork.intel.com/accelerating-the-computational-network-tool-kit-with-intel-mkl/) on Intel MKL and CNTK is published in the [Intel IT Peer Network](http://itpeernetwork.intel.com/accelerating-the-computational-network-tool-kit-with-intel-mkl/)
*2016-06-16.* V 1.5 Binary release. NuGet Package with CNTK Model Evaluation Libraries.
NuGet Package is added to CNTK v.1.5 binaries. See [CNTK Releases page](https://github.com/Microsoft/CNTK/releases) and [NuGet Package description](https://github.com/Microsoft/CNTK/wiki/Nuget-Package-for-Evaluation).
*2016-06-15.* CNTK now supports building against a custom Intel® Math Kernel Library (MKL).
See [setup instructions](https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-your-machine) on how to set this up for your platform.
See [all news](https://github.com/Microsoft/CNTK/wiki/News).
## What is CNTK

Scripts/README.md (new file)

@ -0,0 +1,24 @@
This directory contains scripts that help with using different components of CNTK.
### CNTK Text format Converters
Two Python scripts for converting data to the CNTK Text format for use as input to the CNTK Text Format Reader (see https://github.com/microsoft/cnTK/wiki/CNTKTextFormat-Reader).
```
txt2ctf.py
```
Converts a set of dictionary files and a plain text file to CNTK Text format. Run ```python txt2ctf.py -h``` to see usage instructions. See the comments in the beginning of the script file for the specific usage example.
```
uci2ctf.py
```
Converts data stored in a UCI-format text file to the CNTK Text format. Run ```python uci2ctf.py -h``` to see usage instructions, or see the usage example below:
```
python Scripts/uci2ctf.py --input_file Examples/Image/MNIST/Data/Train-28x28.txt --features_start 1 --features_dim 784 --labels_start 0 --labels_dim 1 --num_labels 10 --output_file Examples/Image/MNIST/Data/Train-28x28_cntk_text.txt
```
```input_file``` – original dataset in the (columnar) UCI format
```features_start``` – index of the first feature column (start parameter in the UCIFastReader config, see https://github.com/Microsoft/CNTK/wiki/UCI-Fast-Reader)
```features_dim``` – number of feature columns (dim parameter in the UCIFastReader config)
```labels_start``` - index of the first label column
```labels_dim``` – number of label columns
```num_labels``` – number of possible label values (labelDim parameter in the UCIFastReader config)
```output_file``` – path and filename of the resulting dataset.

@ -1 +1 @@
Subproject commit c9821dd5565d4654841eaba819b655c9db2fe85b
Subproject commit f7afb8c6a08a6652d84de1b62377175788be5284


@ -149,11 +149,11 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
int forcedRandomSeed = node->GetOptionalParameter("randomSeed", "-1" /*disabled*/);
if (EqualCI(initString, L"fixedValue"))
nodePtr->Value().SetValue(value);
m_net->InitLearnableParameters(nodePtr, L"fixedValue", value);
else if (EqualCI(initString, L"uniform"))
m_net->InitLearnableParameters(nodePtr, true, forcedRandomSeed < 0 ? randomSeed++ : (unsigned long) forcedRandomSeed, initValueScale, initOnCPUOnly);
m_net->InitLearnableParameters(nodePtr, L"uniform", initValueScale, forcedRandomSeed < 0 ? randomSeed++ : (unsigned long)forcedRandomSeed, initOnCPUOnly);
else if (EqualCI(initString, L"gaussian"))
m_net->InitLearnableParameters(nodePtr, false, forcedRandomSeed < 0 ? randomSeed++ : (unsigned long) forcedRandomSeed, initValueScale, initOnCPUOnly);
m_net->InitLearnableParameters(nodePtr, L"gaussian", initValueScale, forcedRandomSeed < 0 ? randomSeed++ : (unsigned long)forcedRandomSeed, initOnCPUOnly);
else if (EqualCI(initString, L"fromFile"))
{
std::string initFromFilePath = node->GetOptionalParameter("initFromFilePath", "");
@ -167,7 +167,7 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
dynamic_pointer_cast<LearnableParameter<ElemType>>(nodePtr)->InitFromFile(msra::strfun::utf16(initFromFilePath));
}
else
RuntimeError("'init' must be one of the values of [ uniform | gaussian | fixedValue ]");
RuntimeError("'init' must be one of the values of [ uniform | gaussian | fixedValue | fromFile ]");
}
}
else if (cnNodeType == L"Constant")
@ -186,7 +186,7 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
else if (pass == ndlPassFinal || nodePtr->Value().GetNumElements() != 0)
{
ElemType val = parameter[0]->GetScalar();
nodePtr->Value().SetValue(val);
m_net->InitLearnableParameters(nodePtr, L"fixedValue", val);
}
}
else if (cnNodeType == L"RowSlice") // Note: This now maps onto SliceNode which specifies the end differently.
@ -304,7 +304,7 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
"1. 2D convolution which takes 7 fixed parameters [weightNodeName, inputValueNodeName, kernelWidth, kernelHeight, outputChannels, horizontalSubsample, verticalSubsample] \n"
"and two optional parameters [zeroPadding = [false|yourvalue], maxTempMemSizeInSamples = [0|yourvalue], imageLayout = \"HWC\"|\"cudnn\"]. \n"
"2. ND convolution which takes 3 fixed parameters [weightNodeName, inputValueNodeName, kernelShape] and \n"
"10 optional parameters [mapCount = [1|yourvalue], stride = [1|yourvalue], sharing = [true|yourvalue], autoPadding = [true|yourvalue], lowerPad = [0|yourvalue], upperPad = [0|yourvalue], bool transpose = [false|yourvalue], maxTempMemSizeInSamples = [0|yourvalue], imageLayout = \"cudnn\"|\"HWC\"]. \n"
"10 optional parameters [mapCount = [0|yourvalue], stride = [1|yourvalue], sharing = [true|yourvalue], autoPadding = [true|yourvalue], lowerPad = [0|yourvalue], upperPad = [0|yourvalue], bool transpose = [false|yourvalue], maxTempMemSizeInSamples = [0|yourvalue], imageLayout = \"cudnn\"|\"HWC\"]. \n"
"For ND convolution, parameters kernelShape, mapCount, stride, sharing, autoPadding, lowerPad, upperPad can be arrays, e.g. kernelShape={5, 5, 3}",
cnNodeType.c_str(), cnNodeType.c_str());
}
@ -380,7 +380,7 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
};
auto kernelShape = paramGetter(reqParams.size() - 1);
auto mapCount = paramResolver("mapCount", 1);
auto mapCount = paramResolver("mapCount", 0);
auto stride = paramResolver("stride", 1);
auto sharing = boolParamResolver("sharing", true);
auto autoPad = boolParamResolver("autoPadding", true);


@ -158,12 +158,12 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable)
else if (EqualInsensitive(nodeType, OperationNameOf(CRFNode), L"CRF")) ret = true;
#endif
else if (EqualInsensitive(nodeType, OperationNameOf(ClassBasedCrossEntropyWithSoftmaxNode), L"CBCEWithSM")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(EqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(GreaterEqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(GreaterNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LessEqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LessNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(NotEqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(EqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(GreaterEqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(GreaterNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LessEqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LessNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(NotEqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(ClipNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(ConvolutionNode), L"Convolve")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(PoolingNode))) ret = true;
@ -263,4 +263,5 @@ template class NDLNode<double>;
template class NDLScript<float>;
template class NDLScript<double>;
} } }
}}}


@ -95,8 +95,9 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildFFDNNFromDescription(
if (numHiddenLayers > 0)
{
w = builder.CreateLearnableParameter(L"W0", m_layerSizes[1], m_layerSizes[0]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
b = builder.CreateLearnableParameter(L"B0", m_layerSizes[1], 1);
m_net->InitLearnableParameters(b, L"fixedValue", 0);
output = ApplyNonlinearFunction(builder.Plus(builder.Times(w, input, 1, L"W0*features"), b, L"W0*features+B0"), 0, L"H1");
if (m_addDropoutNodes)
@ -114,8 +115,9 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildFFDNNFromDescription(
wstring nameOfH = msra::strfun::wstrprintf(L"H%d", i + 1);
w = builder.CreateLearnableParameter(nameOfW, m_layerSizes[i + 1], m_layerSizes[i]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
b = builder.CreateLearnableParameter(nameOfB, m_layerSizes[i + 1], 1);
m_net->InitLearnableParameters(b, L"fixedValue", 0);
output = ApplyNonlinearFunction(builder.Plus(builder.Times(w, input, 1, nameOfTimes), b, nameOfPlus), i, nameOfH);
if (m_addDropoutNodes)
@ -132,8 +134,9 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildFFDNNFromDescription(
wstring nameOfPlus = nameOfTimes + L"+" + nameOfB;
w = builder.CreateLearnableParameter(nameOfW, m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
b = builder.CreateLearnableParameter(nameOfB, m_layerSizes[numHiddenLayers + 1], 1);
m_net->InitLearnableParameters(b, L"fixedValue", 0);
output = builder.Plus(builder.Times(w, input, 1, nameOfTimes), b, nameOfPlus);
m_net->RenameNode(output, L"HLast");
@ -198,12 +201,12 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildRNNFromDescription()
{
// TODO: to figure out sparse matrix size
u = builder.CreateLearnableParameter(L"U0", m_layerSizes[1], m_layerSizes[0]);
m_net->InitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == 1)
{
w = builder.CreateLearnableParameter(L"W0", m_layerSizes[1], m_layerSizes[1]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[1], 1);
// unless there is a good algorithm to detect loops, use this explicit setup
@ -230,12 +233,12 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildRNNFromDescription()
{
// TODO: to figure out sparse matrix size
u = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"U%d", i), m_layerSizes[i + 1], m_layerSizes[i]);
m_net->InitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == i + 1)
{
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", i), m_layerSizes[i + 1], m_layerSizes[i + 1]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, (size_t) m_layerSizes[i + 1], 1);
// unless there is a good algorithm to detect loops, use this explicit setup
@ -259,7 +262,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildRNNFromDescription()
}
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
/*m_net->MatrixL2Reg(w , L"L1w");*/
label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1]);
@ -311,12 +314,12 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildClassEntropyRNNFromDe
if (numHiddenLayers > 0)
{
u = builder.CreateLearnableParameter(L"U0", m_layerSizes[1], m_layerSizes[0]);
m_net->InitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == 1)
{
w = builder.CreateLearnableParameter(L"W0", m_layerSizes[1], m_layerSizes[1]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[1], 1);
// unless there is a good algorithm to detect loops, use this explicit setup
@ -330,7 +333,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildClassEntropyRNNFromDe
else
{
b = builder.CreateLearnableParameter(L"B0", m_layerSizes[1], 1);
m_net->InitLearnableParameters(b, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(b, m_uniformInit, randomSeed++, m_initValueScale);
output = SimpleNetworkBuilder<ElemType>::ApplyNonlinearFunction(builder.Plus(builder.Times(u, input), b), 0);
}
@ -342,11 +345,11 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildClassEntropyRNNFromDe
for (int i = 1; i < numHiddenLayers; i++)
{
u = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"U%d", i), m_layerSizes[i + 1], m_layerSizes[i]);
m_net->InitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == i + 1)
{
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", i), m_layerSizes[i + 1], m_layerSizes[i + 1]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, (size_t) m_layerSizes[i + 1], 1);
// unless there is a good algorithm to detect loops, use this explicit setup
@ -373,13 +376,13 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildClassEntropyRNNFromDe
// e.g., [200 x 10000], where 10000 is the vocabulary size
// this is for speed-up issue as per word matrix can be simply obtained using column slice
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers], m_layerSizes[numHiddenLayers + 1]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
// the label is a dense matrix. each element is the word index
label = builder.CreateInputNode(L"labels", 4);
clsweight = builder.CreateLearnableParameter(L"WeightForClassPostProb", m_nbrCls, m_layerSizes[numHiddenLayers]);
m_net->InitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
clslogpostprob = builder.Times(clsweight, input, 1, L"ClassPostProb");
output = AddTrainAndEvalCriterionNodes(input, label, w, L"TrainNodeClassBasedCrossEntropy", L"EvalNodeClassBasedCrossEntrpy",
@ -428,7 +431,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildConditionalLSTMNetwor
if (m_lookupTableOrder > 0)
{
e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"E%d", 0), m_layerSizes[1], m_layerSizes[0] / m_lookupTableOrder);
m_net->InitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
output = builder.LookupTable(e, input, L"LookupTable");
if (m_addDropoutNodes)
@ -464,9 +467,8 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildConditionalLSTMNetwor
// serve as a global bias term
gt = builder.CreateInputNode(L"binaryFeature", m_auxFeatDim);
m_net->AddToNodeGroup(L"feature", gt);
e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"AuxTrans%d", 0),
m_layerSizes[numHiddenLayers], m_auxFeatDim);
m_net->InitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"AuxTrans%d", 0), m_layerSizes[numHiddenLayers], m_auxFeatDim);
m_net->RandomInitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
u = ApplyNonlinearFunction(builder.Times(e, gt), numHiddenLayers, L"TimesToGetGlobalBias");
output = builder.Plus(input, u, L"PlusGlobalBias");
input = output;
@ -475,13 +477,13 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildConditionalLSTMNetwor
// e.g., [200 x 10000], where 10000 is the vocabulary size
// this is for speed-up issue as per word matrix can be simply obtained using column slice
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers], m_layerSizes[numHiddenLayers + 1]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
// the label is a dense matrix. each element is the word index
label = builder.CreateInputNode(L"labels", 4);
clsweight = builder.CreateLearnableParameter(L"WeightForClassPostProb", m_nbrCls, m_layerSizes[numHiddenLayers]);
m_net->InitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
clslogpostprob = builder.Times(clsweight, input, 1, L"ClassPostProb");
output = AddTrainAndEvalCriterionNodes(input, label, w, L"TrainNodeClassBasedCrossEntropy", L"EvalNodeClassBasedCrossEntrpy",
@ -535,7 +537,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLogBilinearNetworkFro
if (m_lookupTableOrder > 0)
{
e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"E%d", 0), m_layerSizes[1], m_layerSizes[0] / m_lookupTableOrder);
m_net->InitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
output = builder.LookupTable(e, input, L"Lookuptatble");
if (m_addDropoutNodes)
@ -556,7 +558,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLogBilinearNetworkFro
pastValueXI->AttachInputs({ input });
// TODO: to figure out sparse matrix size
Wxi = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"DD%d", ik), m_layerSizes[0], m_layerSizes[0]);
m_net->InitLearnableParameters(Wxi, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Wxi, m_uniformInit, randomSeed++, m_initValueScale);
it = builder.Plus(output, builder.Times(Wxi, pastValueXI));
output = it;
@ -572,13 +574,13 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLogBilinearNetworkFro
for (int i = m_lookupTableOrder > 0 ? 1 : 0; i < numHiddenLayers; i++)
{
u = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"U%d", i), m_layerSizes[i + 1], m_layerSizes[i] * (m_lookupTableOrder > 0 ? m_lookupTableOrder : 1));
m_net->InitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
output = builder.Times(u, input);
input = output;
if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == i + 1)
{
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"R%d", i + 1), m_layerSizes[i + 1], m_layerSizes[i + 1]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[i + 1], 1);
output = builder.Plus(builder.Times(w, pastValue), input);
@ -588,6 +590,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLogBilinearNetworkFro
}
bi = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"bi%d", i), m_layerSizes[i + 1], 1);
m_net->InitLearnableParameters(bi, L"fixedValue", 0);
output = builder.Plus(input, bi);
if (m_addDropoutNodes)
@ -597,7 +600,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLogBilinearNetworkFro
}
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1]);
AddTrainAndEvalCriterionNodes(input, label, w);
@ -650,6 +653,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildDNNLMNetworkFromDescr
if (numHiddenLayers > 0)
{
bi = builder.CreateLearnableParameter(L"bi0", m_layerSizes[1], 1);
m_net->InitLearnableParameters(bi, L"fixedValue", 0);
pastValueXI = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], 1);
pastValueXII = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[0], 2);
@ -664,19 +668,19 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildDNNLMNetworkFromDescr
{
// TODO: to figure out sparse matrix size
Wxi2 = builder.CreateLearnableParameter(L"WXI2", m_layerSizes[1], m_layerSizes[0]);
m_net->InitLearnableParameters(Wxi2, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Wxi2, m_uniformInit, randomSeed++, m_initValueScale);
// TODO: to figure out sparse matrix size
Wxi3 = builder.CreateLearnableParameter(L"WXI3", m_layerSizes[1], m_layerSizes[0]);
m_net->InitLearnableParameters(Wxi3, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Wxi3, m_uniformInit, randomSeed++, m_initValueScale);
// TODO: to figure out sparse matrix size
Wxi4 = builder.CreateLearnableParameter(L"WXI4", m_layerSizes[1], m_layerSizes[0]);
m_net->InitLearnableParameters(Wxi4, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Wxi4, m_uniformInit, randomSeed++, m_initValueScale);
// TODO: to figure out sparse matrix size
Wxi1 = builder.CreateLearnableParameter(L"WXI1", m_layerSizes[1], m_layerSizes[0]);
m_net->InitLearnableParameters(Wxi1, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Wxi1, m_uniformInit, randomSeed++, m_initValueScale);
// TODO: to figure out sparse matrix size
Wxi = builder.CreateLearnableParameter(L"WXI", m_layerSizes[1], m_layerSizes[0]);
m_net->InitLearnableParameters(Wxi, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Wxi, m_uniformInit, randomSeed++, m_initValueScale);
// unless there is a good algorithm to detect loops, use this explicit setup
it = builder.Plus(
@ -711,11 +715,11 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildDNNLMNetworkFromDescr
for (int i = 1; i < numHiddenLayers; i++)
{
u = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"U%d", i), m_layerSizes[i + 1], m_layerSizes[i]);
m_net->InitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
if (m_recurrentLayers.size() > 0 && m_recurrentLayers[recur_idx] == i + 1)
{
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", i), m_layerSizes[i + 1], m_layerSizes[i + 1]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
std::list<ComputationNodeBasePtr> recurrent_loop;
pastValue = builder.PastValue(NULL, m_defaultHiddenActivity, m_layerSizes[i + 1], 1);
output = SimpleNetworkBuilder<ElemType>::ApplyNonlinearFunction(builder.Plus(builder.Times(u, input), builder.Times(w, pastValue)), i);
@ -736,8 +740,9 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildDNNLMNetworkFromDescr
// TODO: to figure out sparse matrix size
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
// b = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"B%d", numHiddenLayers), m_layerSizes[numHiddenLayers+1], 1);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
//b = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"B%d", numHiddenLayers), m_layerSizes[numHiddenLayers+1], 1);
//m_net->InitLearnableParameters(b, L"fixedValue", 0);
label = builder.CreateSparseInputNode(L"labels", m_layerSizes[numHiddenLayers + 1]);
AddTrainAndEvalCriterionNodes(input, label, w);
@ -766,11 +771,11 @@ shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilde
if (m_directConnect[i] == iLayer)
{
ComputationNodePtr directWIO = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"D%d", i), outputDim, inputDim);
m_net->InitLearnableParameters(directWIO, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(directWIO, m_uniformInit, randomSeed++, m_initValueScale);
directOutput = ApplyNonlinearFunction(builder.Times(directWIO, input), i);
ComputationNodePtr scalar = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"SV%d", i), 1, 1);
scalar->Value().SetValue((ElemType) 0.01);
m_net->InitLearnableParameters(scalar, L"fixedValue", (ElemType) 0.01);
ComputationNodePtr scaled = builder.ElementTimes(scalar, directOutput, msra::strfun::wstrprintf(L"S%d", i));
mergedNode = builder.Plus(toNode, scaled);
@ -801,39 +806,38 @@ shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilde
Wxf = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WXF%d", iLayer), outputDim, inputDim);
Wxc = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WXC%d", iLayer), outputDim, inputDim);
m_net->InitLearnableParameters(Wxo, m_uniformInit, randomSeed++, m_initValueScale);
m_net->InitLearnableParameters(Wxi, m_uniformInit, randomSeed++, m_initValueScale);
m_net->InitLearnableParameters(Wxf, m_uniformInit, randomSeed++, m_initValueScale);
m_net->InitLearnableParameters(Wxc, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Wxo, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Wxi, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Wxf, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Wxc, m_uniformInit, randomSeed++, m_initValueScale);
bo = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"bo%d", iLayer), outputDim, 1);
bc = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"bc%d", iLayer), outputDim, 1);
bi = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"bi%d", iLayer), outputDim, 1);
bf = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"bf%d", iLayer), outputDim, 1);
// if (m_forgetGateInitVal > 0)
bf->Value().SetValue(m_forgetGateInitVal);
// if (m_inputGateInitVal > 0)
bi->Value().SetValue(m_inputGateInitVal);
// if (m_outputGateInitVal > 0)
bo->Value().SetValue(m_outputGateInitVal);
m_net->InitLearnableParameters(bi, L"fixedValue", m_inputGateInitVal);
m_net->InitLearnableParameters(bc, L"fixedValue", 0);
m_net->InitLearnableParameters(bo, L"fixedValue", m_outputGateInitVal);
m_net->InitLearnableParameters(bf, L"fixedValue", m_forgetGateInitVal);
Whi = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WHI%d", iLayer), outputDim, outputDim);
m_net->InitLearnableParameters(Whi, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Whi, m_uniformInit, randomSeed++, m_initValueScale);
Wci = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WCI%d", iLayer), outputDim, 1);
m_net->InitLearnableParameters(Wci, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Wci, m_uniformInit, randomSeed++, m_initValueScale);
Whf = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WHF%d", iLayer), outputDim, outputDim);
m_net->InitLearnableParameters(Whf, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Whf, m_uniformInit, randomSeed++, m_initValueScale);
Wcf = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WCF%d", iLayer), outputDim, 1);
m_net->InitLearnableParameters(Wcf, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Wcf, m_uniformInit, randomSeed++, m_initValueScale);
Who = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WHO%d", iLayer), outputDim, outputDim);
m_net->InitLearnableParameters(Who, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Who, m_uniformInit, randomSeed++, m_initValueScale);
Wco = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WCO%d", iLayer), outputDim, 1);
m_net->InitLearnableParameters(Wco, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Wco, m_uniformInit, randomSeed++, m_initValueScale);
Whc = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WHC%d", iLayer), outputDim, outputDim);
m_net->InitLearnableParameters(Whc, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(Whc, m_uniformInit, randomSeed++, m_initValueScale);
size_t layer1 = outputDim;
@ -848,8 +852,8 @@ shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilde
if (m_constInputGateValue)
{
// it = builder.CreateLearnableParameter(msra::strfun::wstrprintf (L"CONSTIT%d", iLayer), outputDim);
// m_net->InitLearnableParameters(it, L"fixedValue", m_constInputGateValue);
// it->SetLearningRateMultiplier(0);
// it->Value().SetValue(m_constInputGateValue);
it = nullptr;
}
else
@ -988,7 +992,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildCRFLSTMNetworkFromDes
if (m_lookupTableOrder > 0)
{
e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"E%d", 0), m_layerSizes[1], m_layerSizes[0] / m_lookupTableOrder);
m_net->InitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
output = builder.LookupTable(e, input, L"LookupTable");
if (m_addDropoutNodes)
@ -1017,8 +1021,9 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildCRFLSTMNetworkFromDes
else
{
u = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"U%d", i), m_layerSizes[i + 1], m_layerSizes[i] * (offset ? m_lookupTableOrder : 1));
m_net->InitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
b = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"B%d", i), m_layerSizes[i + 1], 1);
m_net->InitLearnableParameters(b, L"fixedValue", 0);
output = ApplyNonlinearFunction(builder.Plus(builder.Times(u, input), b), i);
}
@ -1030,13 +1035,13 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildCRFLSTMNetworkFromDes
}
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"TimesBeforeSoftMax%d", numHiddenLayers), m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
output = builder.Times(w, input, L"outputsBeforeSoftmax");
trans = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"TransProb%d", numHiddenLayers), m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers + 1]);
trans->Value().SetValue((ElemType) 1.0 / m_layerSizes[numHiddenLayers + 1]);
// m_net->InitLearnableParameters(trans, m_uniformInit, randomSeed++, m_initValueScale);
m_net->InitLearnableParameters(trans, L"fixedValue", (ElemType) 1.0 / m_layerSizes[numHiddenLayers + 1]);
// m_net->RandomInitLearnableParameters(trans, m_uniformInit, randomSeed++, m_initValueScale);
trans->SetLearningRateMultiplier(1.0f);
label = builder.CreateInputNode(L"labels", m_layerSizes[numHiddenLayers + 1]);
AddTrainAndEvalCriterionNodes(output, label, nullptr, L"CRFTrainCriterion", L"CRFEvalCriterion", nullptr, trans);
@ -1085,7 +1090,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildClassLSTMNetworkFromD
if (m_lookupTableOrder > 0)
{
e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"E%d", 0), m_layerSizes[1], m_layerSizes[0] / m_lookupTableOrder);
m_net->InitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
output = builder.LookupTable(e, input, L"LookupTable");
if (m_addDropoutNodes)
@ -1122,13 +1127,13 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildClassLSTMNetworkFromD
// e.g., [200 x 10000], where 10000 is the vocabulary size
// this is for speed-up: the per-word matrix can simply be obtained using a column slice
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers], m_layerSizes[numHiddenLayers + 1]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
// the label is a dense matrix. each element is the word index
label = builder.CreateInputNode(L"labels", 4);
clsweight = builder.CreateLearnableParameter(L"WeightForClassPostProb", m_nbrCls, m_layerSizes[numHiddenLayers]);
m_net->InitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(clsweight, m_uniformInit, randomSeed++, m_initValueScale);
clslogpostprob = builder.Times(clsweight, input, 1, L"ClassPostProb");
output = AddTrainAndEvalCriterionNodes(input, label, w, L"TrainNodeClassBasedCrossEntropy", L"EvalNodeClassBasedCrossEntrpy",
@ -1164,16 +1169,16 @@ shared_ptr<ComputationNode<ElemType>> /*ComputationNodePtr*/ SimpleNetworkBuilde
input = inputObs;
size_t nDim = inputDim + outputDim + 2;
wInputGate = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WINPUTGATE%d", iLayer), outputDim, nDim);
m_net->InitLearnableParameters(wInputGate, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(wInputGate, m_uniformInit, randomSeed++, m_initValueScale);
wInputGate->Value().ColumnSlice(0, 1).SetValue(m_inputGateInitVal); // init to input gate bias
wForgetGate = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WFORGETGATE%d", iLayer), outputDim, nDim);
m_net->InitLearnableParameters(wForgetGate, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(wForgetGate, m_uniformInit, randomSeed++, m_initValueScale);
wForgetGate->Value().ColumnSlice(0, 1).SetValue(m_forgetGateInitVal); // init to forget gate bias
wOutputGate = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WOUTPUTGATE%d", iLayer), outputDim, nDim);
m_net->InitLearnableParameters(wOutputGate, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(wOutputGate, m_uniformInit, randomSeed++, m_initValueScale);
wOutputGate->Value().ColumnSlice(0, 1).SetValue(m_outputGateInitVal); // init to output gate bias
wMemoryCellMatrix = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"WMEMORYCELLWEIGHT%d", iLayer), outputDim, inputDim + outputDim + 1);
m_net->InitLearnableParameters(wMemoryCellMatrix, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(wMemoryCellMatrix, m_uniformInit, randomSeed++, m_initValueScale);
wMemoryCellMatrix->Value().ColumnSlice(0, 1).SetValue(0); // init to memory cell bias
output = builder.LSTM(inputObs, wInputGate, wForgetGate, wOutputGate, wMemoryCellMatrix, msra::strfun::wstrprintf(L"LSTM%d", iLayer));
@ -1234,7 +1239,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLSTMNetworkFromDescri
if (m_lookupTableOrder > 0)
{
e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"E%d", 0), m_layerSizes[1], m_layerSizes[0] / m_lookupTableOrder);
m_net->InitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
output = builder.LookupTable(e, input, L"LookupTable");
#ifdef DEBUG_DECODER
e->Value().SetValue((ElemType) 0.01);
@ -1275,8 +1280,9 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLSTMNetworkFromDescri
else
{
u = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"U%d", i), m_layerSizes[i + 1], m_layerSizes[i]);
m_net->InitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
b = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"B%d", i), m_layerSizes[i + 1], 1);
m_net->InitLearnableParameters(b, L"fixedValue", 0);
output = ApplyNonlinearFunction(builder.Plus(builder.Times(u, input), b), i);
}
@ -1290,7 +1296,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildLSTMNetworkFromDescri
}
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers + 1], m_layerSizes[numHiddenLayers]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
#ifdef DEBUG_DECODER
w->Value().SetValue((ElemType) 0.01);
#endif
@ -1349,7 +1355,7 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildNCELSTMNetworkFromDes
if (m_lookupTableOrder > 0)
{
e = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"E%d", 0), m_layerSizes[1], m_layerSizes[0] / m_lookupTableOrder);
m_net->InitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(e, m_uniformInit, randomSeed++, m_initValueScale);
output = builder.LookupTable(e, input, L"LookupTable");
if (m_addDropoutNodes)
@ -1381,8 +1387,9 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildNCELSTMNetworkFromDes
else
{
u = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"U%d", i), m_layerSizes[i + 1], m_layerSizes[i]);
m_net->InitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(u, m_uniformInit, randomSeed++, m_initValueScale);
b = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"B%d", i), m_layerSizes[i + 1], 1);
m_net->InitLearnableParameters(b, L"fixedValue", 0);
output = ApplyNonlinearFunction(builder.Plus(builder.Times(u, input), b), i);
}
@ -1407,14 +1414,14 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildNCELSTMNetworkFromDes
// e.g., [200 x 10000], where 10000 is the vocabulary size
// this is for speed-up: the per-word matrix can simply be obtained using a column slice
w = builder.CreateLearnableParameter(msra::strfun::wstrprintf(L"W%d", numHiddenLayers), m_layerSizes[numHiddenLayers], m_layerSizes[numHiddenLayers + 1]);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
// the label is a dense matrix. each element is the word index
label = builder.CreateInputNode(L"labels", 2 * (this->nce_noises + 1));
bias = builder.CreateLearnableParameter(L"BiasVector", 1, m_layerSizes[m_layerSizes.size() - 1]);
bias->Value().SetValue((ElemType) -std::log(m_layerSizes[m_layerSizes.size() - 1]));
// m_net->InitLearnableParameters(bias, m_uniformInit, randomSeed++, std::log(m_layerSizes[m_layerSizes.size() - 1])* m_initValueScale);
m_net->InitLearnableParameters(bias, L"fixedValue", (ElemType) -std::log(m_layerSizes[m_layerSizes.size() - 1]));
// m_net->RandomInitLearnableParameters(bias, m_uniformInit, randomSeed++, std::log(m_layerSizes[m_layerSizes.size() - 1])* m_initValueScale);
// clslogpostprob = builder.Times(clsweight, input, 1, L"ClassPostProb");
output = AddTrainAndEvalCriterionNodes(input, label, w, L"TrainNodeNCEBasedCrossEntropy", L"EvalNodeNCEBasedCrossEntrpy", bias);
@ -1525,10 +1532,12 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildNetworkFromDbnFile(co
wstring nameOfH = msra::strfun::wstrprintf(L"H%d", i + 1);
w = builder.CreateLearnableParameter(nameOfW, wts.GetNumRows(), wts.GetNumCols());
w->Value().SetValue(wts);
m_net->InitLearnableParameters(w, L"fixedValue", 0); // follow protocol
w->Value().SetValue(wts); // and overwrite
b = builder.CreateLearnableParameter(nameOfB, bias.GetNumRows(), 1);
b->Value().SetValue(bias);
m_net->InitLearnableParameters(b, L"fixedValue", 0); // follow protocol
b->Value().SetValue(bias); // and overwrite
if (layerType == "perceptron")
{
@ -1588,8 +1597,9 @@ ComputationNetworkPtr SimpleNetworkBuilder<ElemType>::BuildNetworkFromDbnFile(co
wstring nameOfH = msra::strfun::wstrprintf(L"H%d", i + 1);
w = builder.CreateLearnableParameter(nameOfW, outputLayerSize, penultimateSize);
m_net->InitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
m_net->RandomInitLearnableParameters(w, m_uniformInit, randomSeed++, m_initValueScale);
b = builder.CreateLearnableParameter(nameOfB, outputLayerSize, 1);
m_net->InitLearnableParameters(b, L"fixedValue", 0);
output = builder.Plus(builder.Times(w, input, 1, nameOfTimes), b, nameOfPlus);
m_net->RenameNode(output, L"HLast");
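The pattern throughout this file routes every learnable parameter through an explicit initialization call: RandomInitLearnableParameters for randomly initialized weights, and InitLearnableParameters with the "fixedValue" protocol for biases and other constants (even when the value is overwritten right afterwards). The same two paths are exposed at the BrainScript level by ParameterTensor further down in this commit; a minimal sketch, assuming placeholder dimensions hiddenDim and featDim:

    W = ParameterTensor {(hiddenDim : featDim), init = 'uniform', initValueScale = 1}   # random init
    b = ParameterTensor {(hiddenDim), initValue = 0}                                    # fixed-value init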

View file

@ -53,7 +53,6 @@ public:
__declspec_noreturn static inline void EvaluationError(const wstring &msg, TextLocation where)
{
//Microsoft::MSR::CNTK::DebugUtil::PrintCallStack();
throw EvaluationException(msg, where);
}
@ -536,8 +535,13 @@ static ConfigValuePtr Evaluate(const ExpressionPtr &e, const IConfigRecordPtr &s
}
return ConfigValuePtr(make_shared<ConfigLambda>(move(paramNames), move(namedParams), f), MakeFailFn(e->location), exprPath);
}
else if (e->op == L"(") // === apply a function to its arguments
else if (e->op == L"(" || e->op == L"{") // === apply a function to its arguments
{
// Note: "{" is experimental and currently ignored as a distinction. To do it more completely, we need
// - remember how a function was declared (currently not possible for lambdas)
// - make sure the invocation matches declaration
// - disallow calling Parameter() or any other creating functions as "()"
// - disallow calling "{}"-declared functions from inside a "()"
let &lambdaExpr = e->args[0]; // [0] = function
let &argsExpr = e->args[1]; // [1] = arguments passed to the function ("()" expression of expressions)
let lambda = AsPtr<ConfigLambda>(Evaluate(lambdaExpr, scope, exprPath, L"" /*macros are not visible in expression names*/), lambdaExpr, L"function");
@ -848,8 +852,8 @@ static wstring FormatConfigValue(ConfigValuePtr arg, const wstring &how)
{
let arr = arg.AsPtr<ConfigArray>();
wstring result;
let range = arr->GetIndexRange();
for (int i = range.first; i <= range.second; i++)
let range = arr->GetIndexBeginEnd();
for (int i = range.first; i < range.second; i++)
{
if (i > range.first)
result.append(L"\n");
@ -890,20 +894,20 @@ public:
else // otherwise expect an array
{
let & arr = arg.AsRef<ConfigArray>();
let range = arr.GetIndexRange();
us = (double)(range.second + 1 - range.first);
let range = arr.GetSize(arg.GetFailFn());
us = (double)range;
}
}
}
else if (what == L"Mod" || what == L"IntDiv") //two-arg int functions
else if (what == L"Mod" || what == L"IntDiv") // two-arg int functions
{
let argsArg = config[L"args"];
let& args = argsArg.AsRef<ConfigArray>();
auto range = args.GetIndexRange();
if (range.second != range.first + 1)
auto range = args.GetIndexBeginEnd();
if (range.second != range.first + 2)
argsArg.Fail(L"Mod/IntDiv expects two arguments");
let arg1 = (int)args.At(range.first);
let arg2 = (int)args.At(range.second);
let arg2 = (int)args.At(range.first + 1);
if (what == L"Mod")
us = (int)(arg1 % arg2);
@ -918,6 +922,7 @@ public:
// CompareFunctions
// - IsSameObject()
// - IsArray()
class CompareFunction : public BoxOf<Bool>
{
public:
@ -932,13 +937,17 @@ public:
if (what == L"IsSameObject")
{
let& args = argsArg.AsRef<ConfigArray>();
auto range = args.GetIndexRange();
if (range.second != range.first+1)
auto range = args.GetIndexBeginEnd();
if (range.second != range.first + 2)
argsArg.Fail(L"IsSameObject expects two arguments");
let arg1 = args.At(range.first ).AsPtr<Object>();
let arg2 = args.At(range.second).AsPtr<Object>();
let arg1 = args.At(range.first ).AsPtr<Object>();
let arg2 = args.At(range.first + 1).AsPtr<Object>();
us = arg1.get() == arg2.get();
}
else if (what == L"IsArray")
{
us = argsArg.Is<ConfigArray>();
}
else
whatArg.Fail(L"Unknown 'what' value to CompareFunction: " + what);
}
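The evaluator now reads array bounds with the half-open GetIndexBeginEnd() instead of the inclusive GetIndexRange(), and fetches the second argument of two-argument functions at range.first + 1; IsArray is added alongside IsSameObject. An illustrative BrainScript use of the affected built-ins, assuming the Mod, IntDiv, and IsSameObject definitions from CNTK.core.bs below are in scope:

    arr = (1 : 2 : 3)
    do = Print (Mod (7, 3)) : Print (IntDiv (7, 3)) : Print (IsSameObject (arr, arr))   # should print 1, 2, and true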

View file

@ -22,6 +22,4 @@ ConfigValuePtr Evaluate(ExpressionPtr); // evaluat
void Do(ExpressionPtr e); // evaluate e.do
shared_ptr<Object> EvaluateField(ExpressionPtr e, const wstring& id); // for experimental CNTK integration
// some simple tests
void SomeTests();
} } } // end namespaces
}}} // end namespaces

View file

@ -13,6 +13,7 @@
#include <set>
#include <stdexcept>
#include <algorithm>
#include <iomanip>
#ifndef let
#define let const auto
@ -89,9 +90,18 @@ struct Issue
// Because it is often hard to recognize an issue only from the point where it occurred, we also report the history in compact visual form.
// Since often multiple contexts are on the same source line, we only print each source line once in a consecutive row of contexts.
/*static*/ void TextLocation::PrintIssue(const vector<TextLocation>& locations, const wchar_t* errorKind, const wchar_t* kind, const wchar_t* what)
{
wstring error = CreateIssueMessage(locations, errorKind, kind, what);
fprintf(stderr, "%ls", error.c_str());
fflush(stderr);
}
/*static*/ wstring TextLocation::CreateIssueMessage(const vector<TextLocation>& locations, const wchar_t* errorKind, const wchar_t* kind, const wchar_t* what)
{
vector<Issue> issues; // tracing the error backwards
size_t symbolIndex = 0;
wstring message;
for (size_t n = 0; n < locations.size(); n++)
{
let& location = locations[n];
@ -125,20 +135,23 @@ struct Issue
if (!locations.empty()) // (be resilient to some throwers not having a TextLocation; to be avoided)
{
let& firstLoc = issues.front().location;
fprintf(stderr, "[CALL STACK]\n");
message += wstrprintf(L"[CALL STACK]\n");
for (auto i = issues.rbegin(); i != issues.rend(); i++)
{
let& issue = *i;
auto& where = issue.location;
const auto& lines = where.GetSourceFile().lines;
const auto line = (where.lineNo == lines.size()) ? L"(end)" : lines[where.lineNo].c_str();
fprintf(stderr, " %ls\n %ls\n", line, issue.markup.c_str());
message += wstrprintf(L" %ls\n %ls\n", line, issue.markup.c_str());
}
fprintf(stderr, "%ls while %ls: %ls(%d)", errorKind, kind, firstLoc.GetSourceFile().path.c_str(), (int)firstLoc.lineNo + 1 /*report 1-based*/);
message += wstrprintf(L"%ls while %ls: %ls(%d)", errorKind, kind, firstLoc.GetSourceFile().path.c_str(), (int)firstLoc.lineNo + 1 /*report 1-based*/);
}
else
fprintf(stderr, "%ls while %ls", errorKind, kind);
fprintf(stderr, ": %ls\n", what), fflush(stderr);
{
message += wstrprintf(L"%ls while %ls", errorKind, kind);
}
message += wstrprintf(L": %ls\n", what);
return message;
}
/*static*/ vector<SourceFile> TextLocation::sourceFileMap;
@ -286,7 +299,7 @@ public:
};
punctuations = set<wstring>{
L"=", L";", L",", L"\n",
L"[", L"]", L"(", L")",
L"[", L"]", L"(", L")", L"{", L"}", L"[|", L"|]",
L"+", L"-", L"*", L"/", L"**", L".*", L"%", L"||", L"&&", L"^",
L"!",
L"==", L"!=", L"<", L"<=", L">", L">=",
@ -557,37 +570,43 @@ public:
// ---------------------------------------------------------------------------
// diagnostics helper: print the content
void Expression::Dump(int indent) const
void Expression::DumpToStream(wstringstream & treeStream, int indent)
{
fprintf(stderr, "%*s", indent, "");
treeStream << std::setfill(L' ') << std::setw(indent) << L" ";
treeStream << std::setw(0);
if (op == L"s")
fprintf(stderr, "'%ls' ", s.c_str());
treeStream << "'" << s.c_str() << "'";
else if (op == L"d")
fprintf(stderr, "%.f ", d);
treeStream << std::fixed << std::setprecision(0) << d;
else if (op == L"b")
fprintf(stderr, "%s ", b ? "true" : "false");
treeStream << (b ? "true" : "false");
else if (op == L"id")
fprintf(stderr, "%ls ", id.c_str());
treeStream << id.c_str();
else if (op == L"new" || op == L"array" || op == L".")
fprintf(stderr, "%ls %ls ", op.c_str(), id.c_str());
treeStream << op.c_str() << " " << id.c_str();
else
fprintf(stderr, "%ls ", op.c_str());
treeStream << op.c_str();
if (!args.empty())
{
fprintf(stderr, "\n");
treeStream << std::endl;
for (const auto& arg : args)
arg->Dump(indent + 2);
{
arg->DumpToStream(treeStream, indent + 1);
}
}
if (!namedArgs.empty())
{
fprintf(stderr, "\n");
treeStream << std::endl;
for (const auto& arg : namedArgs)
{
fprintf(stderr, "%*s%ls =\n", indent + 2, "", arg.first.c_str());
arg.second.second->Dump(indent + 4);
treeStream << std::setfill(L' ') << std::setw(indent + 1) << L"";
treeStream << arg.first.c_str() << L" =" << std::endl;
arg.second.second->DumpToStream(treeStream, indent + 2);
}
}
fprintf(stderr, "\n");
treeStream << std::endl;
}
class Parser : public Lexer
@ -647,13 +666,15 @@ class Parser : public Lexer
return id;
}
map<wstring, int> infixPrecedence; // precedence level of infix operators
map<wstring, int> infixPrecedence; // precedence level of infix operators
static const int unaryPrecedence = 90; // for unary "-" and "!". 90 is below x., x[, x(, and x{, but above all others
// TODO: Would be more direct to fold this into the table below as well.
public:
Parser(SourceFile&& sourceFile, vector<wstring>&& includePaths)
: Lexer(move(includePaths))
{
infixPrecedence = map<wstring, int>{
{L".", 99}, {L"[", 99}, {L"(", 99}, // also sort-of infix operands...
{L".", 99}, {L"[", 99}, {L"(", 99}, {L"{", 99}, // (with LHS) these are also sort-of infix operands...
{L"*", 10}, {L"/", 10}, {L".*", 10}, {L"**", 10}, {L"%", 10},
{L"+", 9}, {L"-", 9}, {L"with", 9}, {L"==", 8},
{L"!=", 8}, {L"<", 8}, {L"<=", 8}, {L">", 8}, {L">=", 8},
@ -700,7 +721,7 @@ public:
{
operand = make_shared<Expression>(tok.beginLocation, tok.symbol + L"("); // encoded as +( -( !(
ConsumeToken();
operand->args.push_back(ParseExpression(100, stopAtNewline));
operand->args.push_back(ParseExpression(unaryPrecedence, stopAtNewline));
}
else if (tok.symbol == L"new") // === new class instance
{
@ -723,13 +744,34 @@ public:
operand = ParseExpression(0, false /*go across newlines*/); // just return the content of the parens (they do not become part of the expression tree)
ConsumePunctuation(L")");
}
else if (tok.symbol == L"[") // === dictionary constructor
else if (tok.symbol == L"{" || tok.symbol == L"["/*soon to be deprecated*/) // === record constructor
{
let* closeSymbol = tok.symbol == L"{" ? L"}" : L"]";
operand = make_shared<Expression>(tok.beginLocation, L"[]");
ConsumeToken();
operand->namedArgs = ParseRecordMembers();
ConsumePunctuation(L"]");
ConsumePunctuation(closeSymbol);
}
#if 1 // the F# syntax is a stop-gap and meant for experimentation, and we will not recommend to use it
// Rather, we must find a way to parse both Python-like array literals and BS dictionaries jointly,
// and eventually deprecate [] for records.
else if (tok.symbol == L"[|") // === array literal using F# syntax [| a; b; c |] (same as a:b:c, but also allows for 0- and 1-element arrays)
{
operand = make_shared<Expression>(tok.beginLocation, L":");
ConsumeToken();
if (GotToken().symbol != L"|]") // {} defines an empty array
{
for (;;)
{
operand->args.push_back(ParseExpression(0, false)); // item. Precedence 0 means go until comma or closing parenthesis.
if (GotToken().symbol != L";")
break;
ConsumeToken();
}
}
ConsumePunctuation(L"|]");
}
#endif
else if (tok.symbol == L"array") // === array constructor
{
operand = OperandFromTokenSymbol(tok);
@ -780,18 +822,18 @@ public:
if (left->op != L"id") // currently only allow for a single argument
Expected(L"identifier");
ConsumeToken();
let macroArgs = make_shared<Expression>(left->location, L"()", left); // wrap identifier in a '()' macro-args expression
let macroArgs = make_shared<Expression>(left->location, L"()", left); // wrap identifier in a "()" macro-args expression
// TODO: test parsing of i => j => i*j
let body = ParseExpression(opPrecedence, stopAtNewline); // pass same precedence; this makes '=>' right-associative, e.g. i => j => i*j
operation->args[0] = macroArgs; // [0]: parameter list
operation->args.push_back(body); // [1]: right operand
}
else if (op == L"(") // === macro application
else if (op == L"(" || op == L"{") // === macro application
{
// op = "(" means 'apply'
// op = "(" and "{" mean 'apply', where {} refers to experimental constructor syntax
// args[0] = lambda expression (lambda: op="=>", args[0] = param list, args[1] = expression with unbound vars)
// args[1] = arguments (arguments: op="(), args=vector of expressions, one per arg; and namedArgs)
operation->args.push_back(ParseMacroArgs(false)); // [1]: all arguments
// args[1] = arguments (arguments: op="()", args=vector of expressions, one per arg; and namedArgs)
operation->args.push_back(ParseMacroArgs(false, op)); // [1]: all arguments
}
else if (op == L"[") // === array index
{
@ -829,11 +871,12 @@ public:
// In case of macro definition, all arguments must be of type "id". Pass 'defining' to check for that.
// namedArgs = dictionary of optional args
// In case of macro definition, dictionary values are default values that are used if the argument is not given
ExpressionPtr ParseMacroArgs(bool defining)
ExpressionPtr ParseMacroArgs(bool defining, wstring openSymbol)
{
ConsumePunctuation(L"(");
ConsumePunctuation(openSymbol.c_str());
auto macroArgs = make_shared<Expression>(GotToken().beginLocation, L"()");
if (GotToken().symbol != L")") // x() defines an empty argument list
let* closeSymbol = openSymbol == L"(" ? L")" : L"}";
if (GotToken().symbol != closeSymbol) // x() defines an empty argument list
{
for (;;)
{
@ -856,7 +899,7 @@ public:
ConsumeToken();
}
}
ConsumePunctuation(L")");
ConsumePunctuation(closeSymbol);
return macroArgs;
}
map<wstring, pair<TextLocation, ExpressionPtr>> ParseRecordMembers()
@ -865,7 +908,7 @@ public:
// member identifier -> expression
// Macro declarations are translated into lambdas, e.g.
// F(A,B) = expr(A,B)
// gets represented in the dictionary as
// (and likewise F{A,B}) gets represented in the dictionary as
// F = (A,B) => expr(A,B)
// where a lambda expression has this structure:
// op="=>"
@ -897,7 +940,8 @@ public:
ConsumePunctuation(L"]");
}
// optional macro args
let parameters = (GotToken().symbol == L"(") ? ParseMacroArgs(true /*defining*/) : ExpressionPtr(); // optionally, macro arguments
let& openParen = GotToken().symbol;
let parameters = (openParen == L"(" || openParen == L"{") ? ParseMacroArgs(true /*defining*/, openParen) : ExpressionPtr(); // optionally, macro arguments
ConsumePunctuation(L"=");
auto rhs = ParseExpression(0, true /*can end at newline*/); // and the right-hand side
// if macro then rewrite it as an assignment of a lambda expression
@ -907,7 +951,8 @@ public:
if (arrayIndexExpr)
{
// create a lambda expression over the index variable
let macroArgs = make_shared<Expression>(arrayIndexExpr->location, L"()", arrayIndexExpr); // wrap identifier in a '()' macro-args expression
// BUGBUG: For {} constructor functions--we cannot declare constructor lambdas for now.
let macroArgs = make_shared<Expression>(arrayIndexExpr->location, L"()", arrayIndexExpr); // wrap identifier in a "()" macro-args expression
let initLambdaExpr = make_shared<Expression>(arrayIndexExpr->location, L"=>", macroArgs, rhs); // [0] is id, [1] is body
rhs = make_shared<Expression>(location, L"array");
rhs->args.push_back(fromExpr); // [0] first index
@ -939,12 +984,6 @@ public:
topDict->namedArgs = topMembers;
return topDict;
}
// simple test function for use during development
static void Test()
{
let parserTest = L"a=1\na1_=13;b=2 // cmt\ndo = (print\n:train:eval) ; x = array[1..13] (i=>1+i*print.message==13*42) ; print = new PrintAction [ message = 'Hello World' ]";
ParseConfigDictFromString(parserTest, L"Test", vector<wstring>())->Dump();
}
};
// globally exported functions to execute the parser
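Taken together, the parser changes above accept the following surface forms: '{ }' as a record constructor (alongside the older '[ ]'), '{ }' for declaring and applying constructor-style macros, and the experimental F#-style array literal '[| ... |]'. A hedged BrainScript sketch with made-up identifiers:

    opts = { hiddenDim = 512 ; numLayers = 3 }    # record constructor with braces (same meaning as [ ... ])
    Scale {factor} = { f(x) = factor * x }.f      # constructor-style macro declared with { }
    double = Scale {2}                            # ... and instantiated with { }
    primes = [| 2; 3; 5 |]                        # F#-style array literal, same as 2:3:5 but also allows 0- or 1-element arrays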

View file

@ -37,6 +37,7 @@ struct TextLocation // position in the text. Lightweight value struct that we ca
// helpers for pretty-printing errors: Show source-code line with ...^ under it to mark up the point of error
static void PrintIssue(const vector<TextLocation>& locations, const wchar_t* errorKind, const wchar_t* kind, const wchar_t* what);
static std::wstring CreateIssueMessage(const vector<TextLocation>& locations, const wchar_t* errorKind, const wchar_t* kind, const wchar_t* what);
static void Trace(TextLocation, const wchar_t* traceKind, const wchar_t* op, const wchar_t* exprPath);
// construction
@ -77,8 +78,12 @@ public:
} // where the error happened
virtual const wchar_t* kind() const = 0; // e.g. "warning" or "error"
wstring GetError(const std::wstring& linePrefix) const override
{
return TextLocation::CreateIssueMessage(locations, linePrefix.c_str(), kind(), msra::strfun::utf16(what()).c_str());
}
// pretty-print this as an error message
void /*ScriptingException::*/ PrintError(const std::wstring& linePrefix) const
void /*ScriptingException::*/ PrintError(const std::wstring& linePrefix) const override
{
TextLocation::PrintIssue(locations, linePrefix.c_str(), kind(), msra::strfun::utf16(what()).c_str());
}
@ -129,7 +134,7 @@ struct Expression
args.push_back(arg2);
}
// diagnostics helper: print the content
void Dump(int indent = 0) const;
void DumpToStream(wstringstream & treeStream, int indent = 0);
};
typedef Expression::ExpressionPtr ExpressionPtr; // circumvent some circular definition problem

View file

@ -1,194 +0,0 @@
// BrainScriptTest.cpp -- some tests
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
#include "Basics.h"
#include "BrainScriptEvaluator.h"
#include "BrainScriptParser.h"
#ifndef let
#define let const auto
#endif
namespace Microsoft { namespace MSR { namespace BS {
using namespace std;
using namespace msra::strfun;
// Note: currently this seems to be the master copy; got to check whether the other one was also changed
//extern wstring standardFunctions, computationNodes, commonMacros;
#if 1 // TODO: these may be newer, merge into Experimentalthingy
static wstring standardFunctions =
L"Print(value, format='') = new PrintAction [ what = value /*; how = format*/ ] \n"
L"Fail(msg) = new FailAction [ what = msg ] \n"
L"RequiredParameter(message) = Fail('RequiredParameter: ' + message) \n"
L"Format(value, format) = new StringFunction [ what = 'Format' ; arg = value ; how = format ] \n"
L"Replace(s, from, to) = new StringFunction [ what = 'Replace' ; arg = s ; replacewhat = from ; withwhat = to ] \n"
L"Substr(s, begin, num) = new StringFunction [ what = 'Substr' ; arg = s ; pos = begin ; chars = num ] \n"
L"Chr(c) = new StringFunction [ what = 'Chr' ; arg = c ] \n"
L"Floor(x) = new NumericFunction [ what = 'Floor' ; arg = x ] \n"
L"Length(x) = new NumericFunction [ what = 'Length' ; arg = x ] \n"
L"Ceil(x) = -Floor(-x) \n"
L"Round(x) = Floor(x+0.5) \n"
L"Abs(x) = if x >= 0 then x else -x \n"
L"Sign(x) = if x > 0 then 1 else if x < 0 then -1 else 0 \n"
L"Min(a,b) = if a < b then a else b \n"
L"Max(a,b) = if a > b then a else b \n"
L"Fac(n) = if n > 1 then Fac(n-1) * n else 1 \n";
static wstring computationNodes = // BUGBUG: optional args not working yet, some scope problem causing a circular reference
L"Mean(z, tag='') = new ComputationNode [ class = 'MeanNode' ; inputs = z /* ; tag = tag */ ]\n"
L"InvStdDev(z, tag='') = new ComputationNode [ class = 'InvStdDevNode' ; inputs = z /* ; tag = tag */ ]\n"
L"PerDimMeanVarNormalization(feat,mean,invStdDev, tag='') = new ComputationNode [ class = 'PerDimMeanVarNormalizationNode' ; inputs = feat:mean:invStdDev /* ; tag = tag */ ]\n"
L"Parameter(outD, inD, tag='parameter') = new ComputationNode [ class = 'LearnableParameterNode' ; outDim = outD ; inDim = inD /*; tag = tag*/ ]\n"
L"Input(dim,tag='features') = Parameter(dim,1,tag=tag) // TODO: for now \n"
L"RowSlice(firstRow, rows, features, tag='') = new ComputationNode [ class = 'RowSliceNode' ; inputs = features ; first = firstRow ; num = rows /* ; tag = tag */ ]\n"
L"Delay(in, delay, tag='') = new RecurrentComputationNode [ class = 'DelayNode' ; inputs = in ; deltaT = -delay /* ; tag = tag */ ]\n"
L"Sigmoid(z, tag='') = new ComputationNode [ class = 'SigmoidNode' ; inputs = z /* ; tag = tag */ ]\n"
L"Log(z, tag='') = new ComputationNode [ class = 'LogNode' ; inputs = z /* ; tag = tag */ ]\n"
L"CrossEntropyWithSoftmax(labels, outZ, tag='') = new ComputationNode [ class = 'CrossEntropyWithSoftmaxNode' ; inputs = labels:outZ /* ; tag = tag */ ]\n"
L"ErrorPrediction(labels, outZ, tag='') = new ComputationNode [ class = 'ErrorPredictionNode' ; inputs = labels:outZ /* ; tag = tag */ ]\n";
static wstring commonMacros = // TODO: rename rows and cols to inDim and outDim or vice versa, whichever it is
L"BFF(in, rows, cols) = [ B = Parameter(rows, 1/*init = fixedvalue, value = 0*/) ; W = Parameter(rows, cols) ; z = W*in+B ] \n"
L"SBFF(in, rows, cols) = [ Eh = Sigmoid(BFF(in, rows, cols).z) ] \n "
L"MeanVarNorm(feat) = PerDimMeanVarNormalization(feat, Mean(feat), InvStdDev(feat)) \n"
L"LogPrior(labels) = Log(Mean(labels)) \n";
#endif
void SomeTests()
{
try
{
// collecting all sorts of test cases here
const wchar_t* parserTests[] =
{
L"do = Parameter(13,42) * Input(42) + Parameter(13,1)",
L"do = Print(array [1..10] (i=>i*i))",
L"do = new PrintAction [ what = 'abc' ]",
L"do = Print(new StringFunction [ x = 13 ; y = 42 ; what = 'Format' ; how = '.2' ; arg = x*y ])",
L"do = Print(\"new StringFunction [ what = 'Format' ; how = '.2' ; arg = '13 > 42' ]\")",
L"do = new PrintAction [ what = if 13 > 42 || 12 > 1 then 'Hello World' + \"!\" else 'Oops?']",
L"i2s(i) = new StringFunction [ what = 'Format' ; arg = i ; how = '.2' ] ; do = Print('result=' + i2s((( [ v = (i => i + delta) ].v(5)))+13)) ; delta = 42 ",
L"do = Print(1+2*3) : Print('hello'+' world')",
L"do = Print(Format( (13:(fortytwo:1):100), '')) ; fortytwo=42 ",
L"do = Print(val) ; val=if !false then 42 else -+-++-13:[a='a';b=42]:+14; arr = array [1..10] (i => 2*i)",
L"do = Print(arg) ; N = 5 ; arr = array [1..N] (i => if i < N then arr[i+1]*i else N) ; arg = arr ",
L"do = Print(val) ; val = [ v = (i => i + offset) ].v(42) ; offset = 13 ",
// #12: DNN with recursion
L"do = Print(val) \n"
L"val = new NDLComputationNetwork [\n"
L" featDim=40*31 ; labelDim=9000 ; hiddenDim=2048 ; numHiddenLayers = 3 \n"
L" myFeatures = Input(featDim) ; myLabels = Input(labelDim) \n"
L" featNorm = MeanVarNorm(myFeatures) \n"
L" HiddenStack(layer) = if layer > 1 then SBFF(HiddenStack(layer - 1).Eh, hiddenDim, hiddenDim) else SBFF(featNorm, hiddenDim, featDim) \n"
L" outLayer = BFF(HiddenStack(numHiddenLayers).Eh, labelDim, hiddenDim) \n"
L" outZ = outLayer.z \n"
L" CE = CrossEntropyWithSoftmax(myLabels, outZ) \n"
L" Err = ErrorPrediction(myLabels, outZ) \n"
L" logPrior = LogPrior(myLabels) \n"
L" ScaledLogLikelihood = outZ - logPrior \n"
L"]\n",
// #13: factorial
L"do = Print(fac(5)) ; fac(i) = if i > 1 then fac(i-1)*i else 1 ",
// #14: Fibonacci sequence with memoization
L"do = Print(fibs(10)) ; fibs(n) = [ vals = array[1..n] (i => if i < 3 then i-1 else vals[i-1]+vals[i-2]) ].vals[n] ",
// #15: DNN with array
L"do = Print(val) \n"
L"val = new NDLComputationNetwork [\n"
L" featDim=40*31 ; labelDim=9000 ; hiddenDim=2048 ; numHiddenLayers = 3 \n"
L" myFeatures = Input(featDim, tag='features') ; myLabels = Input(labelDim, tag='labels') \n"
L" featNorm = MeanVarNorm(myFeatures) \n"
L" layers[layer:1..numHiddenLayers] = if layer > 1 then SBFF(layers[layer-1].Eh, hiddenDim, hiddenDim) else SBFF(featNorm, hiddenDim, featDim) \n"
L" outLayer = BFF(layers[numHiddenLayers].Eh, labelDim, hiddenDim) \n"
L" outZ = outLayer.z + Delay(outZ, 1) \n"
L" CE = CrossEntropyWithSoftmax(myLabels, outZ) \n"
L" Err = ErrorPrediction(myLabels, outZ) \n"
L" logPrior = LogPrior(myLabels) \n"
L" ScaledLogLikelihood = outZ - logPrior \n"
L"]\n",
// #16: windowed RNN
L"do = Print(val) \n"
L"val = new NDLComputationNetwork [ \n"
L" hiddenDim = 512 \n"
L" numHiddenLayers = 2 \n"
L" T = 3 // total context window \n"
L" \n"
L" // data sources \n"
L" featDim = 40 ; labelDim = 9000 \n"
L" myFeatures = Input(featDim) ; myLabels = Input(labelDim) \n"
L" \n"
L" // split the augmented input vector into individual frame vectors \n"
L" subframes[t:0..T - 1] = RowSlice(t * featDim, featDim, myFeatures) \n"
L" \n"
L" // hidden layers \n"
L" layers[layer:1..numHiddenLayers] = [ // each layer stores a dict that stores its hidden fwd and bwd state vectors \n"
L" // model parameters \n"
L" W_fwd = Parameter(hiddenDim, featDim) // Parameter(outdim, indim) \n"
L" W_bwd = if layer > 1 then Parameter(hiddenDim, hiddenDim) else Fail('no W_bwd') // input-to-hidden \n"
L" H_fwd = Parameter(hiddenDim, hiddenDim) // hidden-to-hidden \n"
L" H_bwd = Parameter(hiddenDim, hiddenDim) \n"
L" b = Parameter(hiddenDim, 1) // bias \n"
L" // shared part of activations (input connections and bias) \n"
L" z_shared[t:0..T-1] = (if layer > 1 \n"
L" then W_fwd * layers[layer - 1].h_fwd[t] + W_bwd * layers[layer - 1].h_bwd[t] \n"
L" else W_fwd * subframes[t] \n"
L" ) + b \n"
L" // recurrent part and non-linearity \n"
L" step(H, h, dt, t) = Sigmoid(if (t + dt >= 0 && t + dt < T) \n"
L" then z_shared[t] + H * h[t + dt] \n"
L" else z_shared[t]) \n"
L" h_fwd[t:0..T-1] = step(H_fwd, h_fwd, -1, t) \n"
L" h_bwd[t:0..T-1] = step(H_bwd, h_bwd, 1, t) \n"
L" ] \n"
L" // output layer --linear only at this point; Softmax is applied later \n"
L" outLayer = [ \n"
L" // model parameters \n"
L" W_fwd = Parameter(labelDim, hiddenDim) \n"
L" W_bwd = Parameter(labelDim, hiddenDim) \n"
L" b = Parameter(labelDim, 1) \n"
L" // output \n"
L" topHiddenLayer = layers[numHiddenLayers] \n"
L" centerT = Floor(T/2) \n"
L" z = W_fwd * topHiddenLayer.h_fwd[centerT] + W_bwd * topHiddenLayer.h_bwd[centerT] + b \n"
L" ] \n"
L" outZ = outLayer.z // we only want this one & don't care about the rest of this dictionary \n"
L" \n"
L" // define criterion nodes \n"
L" CE = CrossEntropyWithSoftmax(myLabels, outZ) \n"
L" Err = ErrorPrediction(myLabels, outZ) \n"
L" \n"
L" // define output node for decoding \n"
L" logPrior = LogPrior(myLabels) \n"
L" ScaledLogLikelihood = outZ - logPrior // before: Minus(CE.BFF.FF.P,logPrior,tag=Output) \n"
L"]\n",
L" \n" // this fails because dict is outside val; expression name is not local to it
L"do = Print(val) \n"
L"dict = [ outY = Input(13) ] ; val = new NDLComputationNetwork [ outZ = dict.outY \n"
L"]\n",
L"f(x,option='default') = Print(option); do = f(42,option='value')",
NULL};
let first = 0; // 0 for all
bool oneOnly = first > 0;
for (size_t i = first; parserTests[i]; i++)
{
fprintf(stderr, "\n### Test %d ###\n\n", (int) i), fflush(stderr);
let parserTest = parserTests[i];
let expr = ParseConfigDictFromString(standardFunctions + computationNodes + commonMacros + parserTest, L"Test", vector<wstring>());
//expr->Dump();
Do(expr);
if (oneOnly)
break;
}
}
catch (const ConfigException& err)
{
err.PrintError(L"error");
}
}
}}} // namespaces

View file

@ -6,24 +6,175 @@
//
##############################################################################
# standard functions
# Layer constructors
#
# A layer constructor is a stateful function that creates and returns an instance
# of a 'learnable function'. A learnable function is a function object that has
# learnable parameters baked into it, which get trained by SGD.
# Calling a layer constructor twice creates two instances with independent parameters.
#
# Learnable function instances can be applied to data or composed directly into
# more complex models. For example:
# // immediate usage:
# z = LinearLayer{9000}(h) # LinearLayer{9000} returns a new function object
# // composing multiple layers into a model
# model = Sequential ( DenseLayer{2048, activation=Sigmoid} : LinearLayer {9000} )
# z = model (features)
# // applying the same model to two inputs, with shared, jointly updated parameters
# f = DenseLayer{2048, activation=ReLU}
# z1 = f (feat1) ; z2 = f (feat2)
# The names are intentionally kept similar to other toolkits.
#
# Note that functions without parameters can be used as layers directly, e.g. Sigmoid.
##############################################################################
Print(value, format='') = new PrintAction [ what = value /*; how = format*/ ]
Fail(what) = new FailAction [ /*what*/ ]
Format(value, format) = new StringFunction [ what = 'Format' ; arg = value ; how = format ]
Replace(s, from, to) = new StringFunction [ what = 'Replace' ; arg = s ; replacewhat = from ; withwhat = to ]
Substr(s, begin, num) = new StringFunction [ what = 'Substr' ; arg = s ; pos = begin ; chars = num ]
Chr(c) = new StringFunction [ what = 'Chr' ; arg = c ]
Length(x) = new NumericFunction [ what = 'Length' ; arg = x ]
Sign(x) = if x > 0 then 1 else if x < 0 then -1 else 0
Min(a,b) = if a < b then a else b
Max(a,b) = if a > b then a else b
Fac(n) = if n > 1 then Fac(n-1) * n else 1
IsSameObject(a,b) = new CompareFunction [ what = 'IsSameObject' ; args = (a : b) ]
Mod(x, y) = new NumericFunction [ what = 'Mod' ; args = (x:y) ]
IntDiv(x, y) = new NumericFunction [ what = 'IntDiv' ; args = (x:y) ]
# LinearLayer -- create a fully-connected linear projection layer
# Note: outDim may describe a tensor as well.
LinearLayer {outDim} =
{
W = ParameterTensor {_ConcatArrays (outDim, 0), init='uniform'}
b = ParameterTensor {outDim, initValue=0}
outRank = Length (_AsArray (outDim)) # support outputs with tensor layouts
f(x) = Times (W, x, outputRank = outRank) + b
}.f
# DenseLayer -- create a fully-connected layer with optional non-linearity
DenseLayer{outDim, activation=(x=>x)} = Sequential ( LinearLayer{outDim} : activation )
# EmbeddingLayer -- create a linear embedding layer
EmbeddingLayer {outDim, # dimension of embedding
embeddingPath = '', transpose = false} = # load a fixed embedding from a path instead
{
shape = if transpose then (0 : outDim) else (outDim : 0)
E = if embeddingPath == ''
then ParameterTensor {shape, init='uniform'} # learnable
else ParameterTensor {shape, initFromFilePath = embeddingPath, learningRateMultiplier = 0} # fixed from file
TimesOp = if transpose then TransposeTimes else Times
f(x) = TimesOp (E, x) # x is expected to be sparse one-hot
}.f
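# Example usage (an illustrative sketch; 'glove.txt' is only a placeholder path):
#   embed    = EmbeddingLayer {300}                                # learnable embedding
#   embedFix = EmbeddingLayer {300, embeddingPath = 'glove.txt'}   # fixed embedding loaded from a file
#   e = embed (oneHotWord)                                         # input is expected to be sparse one-hot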
# ConvolutionalLayer -- create a convolution layer with optional non-linearity
#            [ (shifting dims) | (reduction dim) | (output dim) | (sample dims) ]
#   in     : [ (shifting dims) | (reduction dim) |              | (sample dims) ]
#   kernel : [ (filter dims)   | (reduction dim) | (output dim) |               ]
#   out    : [ (shifting dims) |                 | (output dim) | (sample dims) ]
ConvolutionalLayer {numOutputChannels, # e.g. (1) or BS.Constants.None
filterShape, # e.g. (3:3)
init = "uniform",
#reductionRank = 1, # TODO: support this
stride = 1, autoPadding = true,
#lowerPad = 0, upperPad = 0, # TODO: support this
#transpose = false, # TODO: support this
maxTempMemSizeInSamples = 0} =
{
reductionRank = 1 # TODO: shall become an optional parameter
outputChannelsShape = Repeat (1, numOutputChannels) # Repeat(1) turns a scalar into a 1-element array
outputRank = Length (outputChannelsShape)
kernelShape = _ConcatArrays (filterShape, Repeat (reductionRank, 0)) # append reduction dims to filter dims
W = ParameterTensor{_ConcatArrays (kernelShape, outputChannelsShape), init=init}
autoPaddingPadded = _ConcatArrays (_ForceResizeArray (Length (kernelShape), autoPadding), Repeat (reductionRank, false)) # set padding flags for reduction dims to false
sharing = false # TODO: support this
f(x) = Convolution (W, x, kernelShape, mapDims = numOutputChannels, stride = stride, sharing = sharing, autoPadding = autoPaddingPadded, maxTempMemSizeInSamples = maxTempMemSizeInSamples) # lowerPad/upperPad/transpose use the Convolution defaults until supported here
}.f
# MaxPoolingLayer, AveragePoolingLayer -- create a max- or average-pooling layer
_PoolingLayer {poolKind,      # "max" or "average"
filterShape,   # e.g. (3:3)
stride = 1, autoPadding = true,
lowerPad = 0, upperPad = 0} = # TODO: support lowerPad/upperPad
{
f(x) = Pooling (x, poolKind, filterShape, stride = stride, autoPadding = autoPadding, lowerPad = lowerPad, upperPad = upperPad)
}.f
MaxPoolingLayer {filterShape, stride = 1, autoPadding = true /*, lowerPad = 0, upperPad = 0*/} =
_PoolingLayer {"max", filterShape, stride = stride, autoPadding = autoPadding}
AveragePoolingLayer {filterShape, stride = 1, autoPadding = true /*, lowerPad = 0, upperPad = 0*/} =
_PoolingLayer {"average", filterShape, stride = stride, autoPadding = autoPadding}
# RecurrentLSTMLayer -- create an LSTM layer
RecurrentLSTMLayer {outDim,
cellDim = BS.Constants.None, # if set then use a projection
goBackwards = false,
enableSelfStabilization = false} =
{
cellShape = if BS.Constants.IsNone (cellDim) then outDim else cellDim
# BUGBUG: Calling f(x) twice will create a second set of parameters. Needs to refactor LSTM for this.
f(x) = BS.RNNs.RecurrentLSTMP (outDim, cellDim = cellShape,
x, inputDim = 0,
previousHook = if goBackwards then BS.RNNs.NextHC else BS.RNNs.PreviousHC,
enableSelfStabilization = enableSelfStabilization).h
}.f
# DropoutLayer -- create a drop-out layer
DropoutLayer {prob = BS.Constants.None} = if !BS.Constants.IsNone (prob) then Fail ("DropoutLayer: Dropout probability can currently not be specified per-layer.") else
{
f(x) = Dropout (x)
}.f
# BatchNormalizationLayer -- create a batch-normalization layer
BatchNormalizationLayer {spatialRank = 0, # reduce over these dims. E.g. 2 to reduce over (w,h) in a [W x H x C]-shaped input
initialScale = 1,
normalizationTimeConstant = 0, blendTimeConstant = 0,
epsilon = 0.00001, useCntkEngine = true} =
{
normShape = _ConcatArrays (Repeat (spatialRank, 1), 0) # spatial dims get a dimension of 1 (broadcasting, while all others are inferred from input)
scale = ParameterTensor{normShape, initValue = initialScale}
bias = ParameterTensor{normShape, initValue = 0}
runMean = ParameterTensor{normShape, initValue = 0, learningRateMultiplier = 0} # note: disable learning since these are updated differently
runInvStdDev = ParameterTensor{normShape, initValue = 0, learningRateMultiplier = 0}
f(x) = BatchNormalization (x, scale, bias, runMean, runInvStdDev, spatialRank > 0, normalizationTimeConstant = normalizationTimeConstant, blendTimeConstant = blendTimeConstant, epsilon = epsilon, useCntkEngine = useCntkEngine)
}.f
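# Example usage (illustrative): normalize a [W x H x C] convolution output over its two spatial dims:
#   bn = BatchNormalizationLayer {spatialRank = 2}
#   z  = bn (ConvolutionalLayer {64, (3:3)} (x))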
# LayerNormalizationLayer -- create a layer-normalization layer
LayerNormalizationLayer {dim = BS.Constants.None, initScale = 1, initBias = 0} = if BS.Constants.IsNone (dim) then Fail ("LayerNormalizationLayer: 'dim' parameter is currently required.") else
{
gain = ParameterTensor{(1), initValue = initScale}
bias = ParameterTensor{(1), initValue = initBias}
f(x) = {
div = Constant (1.0 / dim)
# normalize w.r.t. actual sample statistics
mean = div .* ReduceSum (x)
x0 = x - mean;
std = Sqrt (div .* ReduceSum (x0 .* x0))
xHat = ElementDivide (x0, std)
# denormalize with learned parameters
val = xHat .* gain + bias
}.val
}.f
# StabilizerLayer -- create a scalar stabilizer [J. Droppo, 2014 -- TODO: get the reference]
StabilizerLayer {} =
{
# BUGBUG: Calling f(x) twice will create a second set of parameters. Needs to refactor Stabilize() for this.
f(x) = Stabilize (x)
}.f
# FeatureMVNLayer -- create a corpus-level feature-normalization layer
# This can only be applied to features. Statistics are not shared across invocations,
# which is semantically OK because the values are the same. However, it is not efficient.
FeatureMVNLayer {} = MeanVarNorm
# Layers that exist in other tools that we will not have:
# FlattenLayer{}: Not needed since DenseLayer() can handle tensors just fine.
# Activation{}: Not needed since functions can be used directly.
##############################################################################
# Composing layers or models into more complex models
##############################################################################
# Sequential -- composite that applies a sequence of functions onto an input
Sequential (arrayOfFunctions) =
{
fs = _AsArray (arrayOfFunctions) # make sure it works with a single function that is not an array
Apply (x, N) = if N == 0 then x else fs[N-1](Apply (x, N-1)) # we do that recursively
f(x) = Apply (x, Length (fs))
}.f
Merge (arrayOfFunctions, combineFunction) =
if Length (arrayOfFunctions) != 2 then Fail ("Merge() is currently limited to binary functions.") else
{
f(x,y) = combineFunction (arrayOfFunctions[0](x), arrayOfFunctions[1](y))
}.f
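# Example usage (an illustrative sketch, mirroring the layer-constructor examples at the top of this file):
#   model = Sequential (DenseLayer{2048, activation=ReLU} : DenseLayer{2048, activation=ReLU} : LinearLayer{9000})
#   z = model (features)
#   combine = Merge ((DenseLayer{512} : DenseLayer{512}), Plus)   # binary merge: adds the two transformed inputs
#   zxy = combine (x, y)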
##############################################################################
# aliases
@ -51,9 +202,13 @@ Log = CNTK2.Log
Minus = CNTK2.Minus
Pass = CNTK2.Identity
Plus = CNTK2.Plus
RectifiedLinear = CNTK2.Relu
RectifiedLinear = CNTK2.ReLU # deprecated
ReLU = CNTK2.ReLU
ReduceSum = CNTK2.ReduceSum
ReduceLogSum = CNTK2.ReduceLogSum
ReduceMin = CNTK2.ReduceMin
ReduceMax = CNTK2.ReduceMax
Round = CNTK2.Round
Sigmoid = CNTK2.Sigmoid
@ -89,7 +244,7 @@ CNTK2 = [
// TODO: The API for Parameter is different in current 2.0 design, getting a constant as input for the initial values.
// This needs to be fixed to follow the way the Constant() is exposed in Python
// Making this an internal node with "_" until we agree on the final interface:
_Parameter(shape, value = 0, learningRateMultiplier = 1.0, init = 'uniform'/*|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*shape */ ] /*plus the function args*/ ]
_Parameter(shape, value = 0, initValue = '', learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*shape */ ] /*plus the function args*/ ]
// 3. Shape operations
// Changes: NewReshape -> Reshape, input -> _, dims -> shape
@ -142,10 +297,12 @@ CNTK2 = [
Tanh(_, tag='') = new ComputationNode [ operation = 'Tanh' ; inputs = _ /*plus the function args*/ ]
// 6. Reductions
# the following is a temporary workaround until we have the C++ version
ReduceLogSum (_, axis=0, tag='') = if axis != 0 then Fail("ReduceLogSum for now only supports axis=0.")
else [ tag1=tag ; axis1=axis ; out = RowSlice (0, 1, _ - LogSoftmax (_), tag=tag1) ].out
ReduceSum (_, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; reductionOp = "Sum" /*plus the function args*/ ]
ReduceSum (_, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; reductionOp = "Sum" /*plus the function args*/ ]
ReduceLogSum(_, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; reductionOp = "LogSum" /*plus the function args*/ ]
ReduceMin (_, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; reductionOp = "Min" /*plus the function args*/ ]
ReduceMax (_, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; reductionOp = "Max" /*plus the function args*/ ]
#ReduceMean (_, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; reductionOp = "Mean" /*plus the function args*/ ]
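# e.g. (illustrative): with ReduceLogSum available, a softmax cross-entropy can be written as
#   ce = Minus (ReduceLogSum (z), TransposeTimes (labels, z))   # cf. CrossEntropyWithSoftmax_new below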
// 7. Control flow (if, composite etc.)
// None so far
@ -158,8 +315,9 @@ CNTK2 = [
PastValue(_, shape, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'PastValue' ; inputs = _ ; shape = new TensorShape [ /*shape*/ ] /*plus the function args*/ ]
// 10. NN-specific operations
// Changes: input -> _, RectifiedLinear -> Relu. [Use Relu to arrive at relu() in snake_case]
Relu(_, tag='') = new ComputationNode [ operation = 'RectifiedLinear' ; inputs = _ /*plus the function args*/ ]
// Changes: input -> _, RectifiedLinear -> ReLU
ReLU(_, tag='') = new ComputationNode [ operation = 'RectifiedLinear' ; inputs = _ /*plus the function args*/ ]
Relu = ReLU // [Use Relu to arrive at relu() in snake_case]
Sigmoid(_, tag='') = new ComputationNode [ operation = 'Sigmoid' ; inputs = _ /*plus the function args*/ ]
Softmax(_, tag='') = new ComputationNode [ operation = 'Softmax' ; inputs = _ /*plus the function args*/ ]
Dropout(_, tag='') = new ComputationNode [ operation = 'Dropout' ; inputs = _ /*plus the function args*/ ]
@ -169,6 +327,10 @@ CNTK2 = [
// empirical sequence is compared to. Keeping this for now.
CrossEntropyWithSoftmax(_, outProbVectorSequence, tag='') = new ComputationNode [ operation = 'CrossEntropyWithSoftmax' ; inputs = (_ : outProbVectorSequence) /*plus the function args*/ ]
ErrorPrediction(_, outVectorSequence, topN=1, tag='') = new ComputationNode [ operation = 'ErrorPrediction' ; inputs = if topN == 1 then (_ : outVectorSequence) else (_ : outVectorSequence : Constant (topN)) /*plus the function args*/ ]
# TODO: replace with this (need to deal with topN thing):
# (_new will be removed once the change is made)
CrossEntropyWithSoftmax_new (L, z, tag='') = Minus (ReduceLogSum (z), TransposeTimes (L, z), tag=tag)
ErrorPrediction_new (L, z, tag='') = Minus (BS.Constants.One, TransposeTimes (L, Hardmax (z)), tag=tag)
// 12. Comparison nodes
Less(_, y, tag='') = new ComputationNode [ operation = 'Less' ; inputs = (_ : y) /*plus the function args*/ ]
@ -182,11 +344,21 @@ CNTK2 = [
Identity(_, tag='') = new ComputationNode [ operation = 'Pass' ; inputs = _ /*plus the function args*/ ]
]
LearnableParameter (outputDim, inputDim, learningRateMultiplier = 1.0, init = 'uniform'/*|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0, initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ dims = (outputDim : inputDim) ] /*plus the function args*/ ]
Parameter = LearnableParameter // deprecated
# Parameter{} can do several forms of initialization. It is no longer required to say 'init="kind"', so we can clean these up a bit.
# - initValue=scalar, value=array --> initialize from this value --array form not implemented yet
# - initFromFilePath="..." --> read from a data file
# - init="uniform|gaussian" (random init scaled by initValueScale). Warning: This has magic scaling factors. TODO: document them here
# - init="zero"
# deprecated:
# - initFromLiteral="..." (deprecated) --> parse a string literal (obsolete with value=array form)
# - init="fixedValue", value from 'value'
# Warning: Current config will behave unexpectedly if the user mistypes 'initValue' as 'value' (which will be ignored, defaulting to "uniform" init)
Parameter {outputDim, inputDim, learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0/*deprecated*/, initValue = '', initFromFilePath = '', initFromLiteral = ''/*deprecated*/, initOnCPUOnly=true, randomSeed=-1, tag=''} = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ dims = (outputDim : inputDim) ] /*plus the function args*/ ]
LearnableParameter = Parameter // deprecated
# TODO: make Parameter take tensor dims?
ParameterTensor(dims, learningRateMultiplier = 1.0, init = 'uniform'/*|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0, initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]
ConstantFromString(literal, tag='') = ParameterTensor((0)/*dim, will be inferred*/, init = 'fromLiteral', initFromLiteral = literal, learningRateMultiplier = 0.0)
ParameterTensor {dims, learningRateMultiplier = 1.0, init = ''/*|uniform|fixedValue|gaussian|fromFile|fromLiteral*/, initValueScale = 1, value = 0, initValue = '', initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag=''} = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]
ConstantFromString(literal, tag='') = ParameterTensor((0)/*dim, will be inferred*/, initFromLiteral = literal, learningRateMultiplier = 0.0)
# TODO: Deprecate ConstantFromString() in favor of Constant(array expression)
DynamicAxis(tag='') = new ComputationNode [ operation = 'DynamicAxis' ; /*plus the function args*/ ]
Input(dims, dynamicAxis='', tag='feature') = new ComputationNode [ operation = 'InputValue' ; shape = new TensorShape [ /*dims*/ ] ; isImage = false /*plus the function args*/ ]
# TODO: change from dynamicAxis by name to dynamicAxis being an actual object
@@ -195,8 +367,8 @@ ImageInput(imageWidth, imageHeight, imageChannels, imageLayout='CHW', dynamicAxi
SparseImageInput(imageWidth, imageHeight, imageChannels, imageLayout='CHW', dynamicAxis='', tag='feature') = new ComputationNode [ operation = 'SparseInputValue' ; isImage = true /*plus the function args*/ ]
EnvironmentInput(propertyName, tag='') = new ComputationNode [ operation = 'EnvironmentInput' /*plus the function args*/ ]
# TODO: make 'dims' the first parameter, think ConstantTensor<dims> (val)
ConstantTensor(val, dims, tag='') = ParameterTensor(dims, learningRateMultiplier = 0, init = 'fixedValue', value = val)
Constant(val, rows = 1, cols = 1, tag='') = Parameter(rows, cols, learningRateMultiplier = 0, init = 'fixedValue', value = val)
ConstantTensor(val, dims, tag='') = ParameterTensor(dims, learningRateMultiplier = 0, initValue = val)
Constant(val, rows = 1, cols = 1, tag='') = Parameter(rows, cols, learningRateMultiplier = 0, initValue = val)
PastValue(dims, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'PastValue' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]
FutureValue(dims, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'FutureValue' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]
Shift(input, fromOffset, boundaryValue, boundaryMode=-1/*context*/, dim=-1, tag='') = new ComputationNode [ operation = 'Shift' ; inputs = (input : boundaryValue) /*plus the function args*/ ]
@@ -227,7 +399,7 @@ WeightedLogistic(label, probability, instanceWeight, tag='') = new ComputationNo
ReconcileDynamicAxis(dataInput, layoutInput, tag='') = new ComputationNode [ operation = 'ReconcileDynamicAxis' ; inputs = (dataInput : layoutInput) /*plus the function args*/ ]
ReconcileMBLayout = ReconcileDynamicAxis # back compat
CastAs (type, data) = ReconcileDynamicAxis (data, type) # read as CastAs<type>(data) where the cast may consist of rearranging the data w.r.t. MBLayout or broadcasting across sequence items
Convolution(weightNode, inputValueNode, kernelDims, mapDims = 1, stride = 1, sharing = true, autoPadding = true, lowerPad = 0, upperPad = 0, transpose=false, imageLayout='CHW', maxTempMemSizeInSamples = 0, tag='') = new ComputationNode [ operation = 'Convolution' ; inputs = (weightNode : inputValueNode); kernelShape = new TensorShape [ dims = kernelDims ] ; mapCount = new TensorShape [ dims = mapDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimSharing = new BoolVector [ items = sharing ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
Convolution(weightNode, inputValueNode, kernelDims, mapDims = 0, stride = 1, sharing = true, autoPadding = true, lowerPad = 0, upperPad = 0, transpose=false, imageLayout='CHW', maxTempMemSizeInSamples = 0, tag='') = new ComputationNode [ operation = 'Convolution' ; inputs = (weightNode : inputValueNode); kernelShape = new TensorShape [ dims = kernelDims ] ; mapCount = new TensorShape [ dims = mapDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimSharing = new BoolVector [ items = sharing ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
# ND pooling/unpooling
Pooling(input, poolKind/*'max'|'average'*/, kernelDims, stride=1, autoPadding = true, lowerPad = 0, upperPad = 0, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'Pooling' ; inputs = (input); pool = poolKind ; kernelShape = new TensorShape [ dims = kernelDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
MaxUnpooling(unpoolInput, poolInput, kernelDims, stride=1, autoPadding = true, lowerPad = 0, upperPad = 0, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'MaxUnpooling' ; inputs = (unpoolInput : poolInput); kernelShape = new TensorShape [ dims = kernelDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
@@ -264,13 +436,10 @@ Mean(dataVectorSequence, tag='') = new ComputationNode [ operation = 'Mean' ; in
Negate(input, tag='') = new ComputationNode [ operation = 'Negate' ; inputs = input /*plus the function args*/ ]
PackedIndex(targetObject, indexSequence, tag='') = new ComputationNode [ operation = 'PackedIndex' ; inputs = (targetObject : indexSequence) /*plus the function args*/ ]
PerDimMeanVarDeNormalization(dataVectorSequence, meanVector, invStdDevVector, tag='') = new ComputationNode [ operation = 'PerDimMeanVarDeNormalization' ; inputs = (dataVectorSequence : meanVector : invStdDevVector) /*plus the function args*/ ]
PerDimMeanVarNormalization(dataVectorSequence, meanVector, invStdDevVector, tag='') = new ComputationNode [ operation = 'PerDimMeanVarNormalization' ; inputs = (dataVectorSequence : meanVector : invStdDevVector) /*plus the function args*/ ]
#PerDimMeanVarNormalization(dataVectorSequence, meanVector, invStdDevVector, tag='') = new ComputationNode [ operation = 'PerDimMeanVarNormalization' ; inputs = (dataVectorSequence : meanVector : invStdDevVector) /*plus the function args*/ ]
PerDimMeanVarNormalization (x, mean, invStdDev) = (x - mean) .* invStdDev
Reciprocal(z, tag='') = new ComputationNode [ operation = 'Reciprocal' ; inputs = z /*plus the function args*/ ]
//# the following is a temporary workaround until we have the C++ version
#ReduceLogSum (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "LogSum" /*plus the function args*/ ]
#ReduceMean (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "Mean" /*plus the function args*/ ]
#ReduceMax (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "Max" /*plus the function args*/ ]
#ReduceMin (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "Min" /*plus the function args*/ ]
Scale(scalarScalingFactor, matrix, tag='') = new ComputationNode [ operation = 'Scale' ; inputs = (scalarScalingFactor : matrix) /*plus the function args*/ ]
# TODO: Scale = ElementTimes
ScatterPacked(cond, indexSequence, sourceData, tag='') = new ComputationNode [ operation = 'ScatterPacked' ; inputs = (cond : indexSequence : sourceData) /*plus the function args*/ ]
@@ -300,15 +469,52 @@ TransposeTimes(leftMatrix, rightMatrix, tag='') = new ComputationNode [ operatio
Where(cond, tag='') = new ComputationNode [ operation = 'Where' ; inputs = cond /*plus the function args*/ ]
##############################################################################
# common macros
# non-neural-network functions
##############################################################################
BFF(in, rows, cols) = [ B = Parameter(rows, 1, init = 'fixedValue', value = 0) ; W = Parameter(rows, cols) ; z = W*in+B ]
Print(value, format='') = new PrintAction [ what = value /*; how = format*/ ]
Fail(what) = new FailAction [ /*what*/ ]
Format(value, format) = new StringFunction [ what = 'Format' ; arg = value ; how = format ]
Replace(s, from, to) = new StringFunction [ what = 'Replace' ; arg = s ; replacewhat = from ; withwhat = to ]
Substr(s, begin, num) = new StringFunction [ what = 'Substr' ; arg = s ; pos = begin ; chars = num ]
Chr(c) = new StringFunction [ what = 'Chr' ; arg = c ]
Length(x) = new NumericFunction [ what = 'Length' ; arg = x ]
Repeat (N, what) = if N <= 0 then BS.Constants.None else (Repeat (N-1, what) : what) # can also be used to turn a scalar into a 1-element array
_ForceResizeArray (N, arrayOrScalar) = { # bring an array to a given length, either by chopping or by duplicating its last value
arr = _AsArray (arrayOrScalar)
L = Length (arr)
res = if N < L then array[0..N-1] (i => arr[i]) # chop to length
else if L == 0 then Fail ("_ForceResizeArray(): needs at least one element to expand.")
else _ConcatArrays (arr, Repeat (N-L, arr[L-1])) # append copies of the last value
}.res
_AsArray (x) = if IsArray (x) then x else [| x |] # helper to allow dimensions to describe scalars (42) or tensors (13:42)
_ConcatArrays (aOrScalar, bOrScalar) = {
a = _AsArray (aOrScalar) ; b = _AsArray (bOrScalar)
newLen = Length (a)+Length(b)
res = if newLen == 0 then BS.Constants.None else array[0..newLen-1] (i => if i < Length (a) then a[i] else b[i-Length (a)])
}.res
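# (Illustrative examples of the helpers above: _ForceResizeArray (4, (2:3)) yields (2:3:3:3), _ForceResizeArray (1, (2:3)) yields (2),
#  and _ConcatArrays ((1:2), 3) yields (1:2:3); scalars are first wrapped into 1-element arrays via _AsArray.)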
Sign(x) = if x > 0 then 1 else if x < 0 then -1 else 0
Min(a,b) = if a < b then a else b
Max(a,b) = if a > b then a else b
Fac(n) = if n > 1 then Fac(n-1) * n else 1
IsSameObject(a,b) = new CompareFunction [ what = 'IsSameObject' ; args = (a : b) ]
IsArray(a) = new CompareFunction [ what = 'IsArray' ; args = a ]
Mod(x, y) = new NumericFunction [ what = 'Mod' ; args = (x:y) ]
IntDiv(x, y) = new NumericFunction [ what = 'IntDiv' ; args = (x:y) ]
##############################################################################
# macros from NDL book
##############################################################################
BFF(in, rows, cols) = [ B = Parameter(rows, 1, initValue = 0) ; W = Parameter(rows, cols) ; z = W*in+B ]
SBFF(in, rows, cols) = [ Eh = Sigmoid(BFF(in, rows, cols).z) ]
MeanVarNorm(feat) = PerDimMeanVarNormalization(feat, Mean(feat), InvStdDev(feat))
LogPrior(labels) = Log(Mean(labels))
Embedding (embeddingDim, input, inputDim=input.dim, initFrom='fromFile'/*|gaussian|uniform*/, embeddingPath = '', sparseInput = false, learningRateWeight = 0.0) = [
# specify one of these two for initialization:
# - initFrom = "uniform"|"gaussian"
# - embeddingPath = PATHNAME
Embedding (embeddingDim, input, inputDim=input.dim, initFrom=''/*|fromFile|gaussian|uniform*/, embeddingPath = '', sparseInput = false, learningRateWeight = 0.0) = [
embedding = Transpose (LearnableParameter (inputDim, embeddingDim, learningRateMultiplier = learningRateWeight, init = initFrom, initFromFilePath = embeddingPath))
lookup = if sparseInput then embedding * input
else GatherPacked (input, embedding)
@@ -341,7 +547,7 @@ Constants = [
# is this like Sequences.Repeat?
True = 1
False = 0
None = ConstantTensor (42, (1))
None = [| |] # doubles up as an empty array. Note: only use [| |] syntax inside here, as it may change in the future
IsNone (x) = IsSameObject (x, None)
]
@@ -553,7 +759,7 @@ Parameters =
[
WeightParam (outputDim, inputDim) = Parameter (outputDim, inputDim, init='uniform', initValueScale=1, initOnCPUOnly=true, randomSeed=1)
DiagWeightParam (outputDim) = ParameterTensor ((outputDim), init='uniform', initValueScale=1, initOnCPUOnly=true, randomSeed=1) # meant to be applied elementwise
BiasParam (dim) = ParameterTensor ((dim), init='fixedValue', value=0.0)
BiasParam (dim) = ParameterTensor ((dim), initValue=0.0)
ScalarParam() = BiasParam (1)
# route input through an extra weight, for stabilization
@@ -561,16 +767,16 @@ Parameters =
if enabled
then [
#beta = Exp (BiasParam ((inputDim))) # init value is 0
#beta = ParameterTensor ((inputDim), init='fixedValue', value=1.0) # init value is 1
#beta = ParameterTensor ((inputDim), initValue=1.0) # init value is 1
# or SoftPlus: ln(1+e^beta)
#beta = Log (Constants.One + Exp (ParameterTensor ((inputDim), init='fixedValue', value=0.54132485/*ln (e-1)*/))) # init value is 1
#beta = Log (Constants.One + Exp (ParameterTensor ((inputDim), initValue=0.54132485/*ln (e-1)*/))) # init value is 1
# sharpened Softplus: 1/f ln(1+e^{f*beta})
# this behaves linear for weights around 1, yet guarantees positiveness
f = ConstantTensor (4, (1))
fInv = Reciprocal (f)
beta = fInv .* Log (Constants.One + Exp (f .* ParameterTensor ((inputDim), init='fixedValue', value=0.99537863/* 1/f*ln (e^f-1) */))) # init value is 1
beta = fInv .* Log (Constants.One + Exp (f .* ParameterTensor ((inputDim), initValue=0.99537863/* 1/f*ln (e^f-1) */))) # init value is 1
TraceDense (h, what) = h # delete h and uncomment Trace to trace the beta values. They are a valuable indicator.
//Trace (h, say=what, logFirst=10, logFrequency=100, logGradientToo=false, onlyUpToRow=9, onlyUpToT=25, format=[ type = "real" ; transpose = false ; precisionFormat = ".6" ])
@@ -1033,6 +1239,7 @@ Seq2Seq =
Network = [
Load(pathName) = new ComputationNetworkFromFile [ /*pathName; also needs 'precision' somewhere*/ ]
CloneFunction (inputNodes, outputNodes, parameters="learnable" /*|"constant"|"shared"*/) = new CloneFunctionConfigLambda [ /*args*/ ]
Edit(inputModel, editFunctions, additionalRoots) = new ComputationNetworkWithEdits [ /*inputModel, editFunctions, additionalRoots*/ ]
Editing = [

@@ -8,6 +8,10 @@
#define _CRT_NONSTDC_NO_DEPRECATE // make VS accept POSIX functions without _
#include "stdafx.h"
#ifdef _WIN32
#include <crtdbg.h>
#endif
#include "Basics.h"
#include "Actions.h"
#include "ComputationNetwork.h"
@@ -18,6 +22,7 @@
#include "NDLNetworkBuilder.h"
#include "ModelEditLanguage.h"
#include "CPUMatrix.h" // used for SetNumThreads()
#include "GPUMatrix.h" // used for SyncGuard::EnableSync()
#include "CommonMatrix.h"
#include "SGD.h"
#include "MPIWrapper.h"
@@ -440,11 +445,6 @@ static wstring PathToBSStringLiteral(const wstring& path) // quote a pathname fo
return L'"' + path + L'"';
}
// TODO: decide where these should go. Also, do we need three variables?
//extern wstring standardFunctions;
//extern wstring commonMacros;
//extern wstring computationNodes;
int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapper that catches & reports Win32 exceptions
{
vector<wstring> args(argv, argv + argc);
@@ -488,7 +488,6 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
bs += L"include \'cntk.core.bs'"; // start with including the standard macros
// Note: Using lowercase ^^ here to match the Linux name of the CNTK exe.
//bs += standardFunctions + computationNodes + commonMacros + L"\n";
for (const auto& sourceFile : sourceFiles)
bs += L"include " + PathToBSStringLiteral(sourceFile) + L"\n";
bs += L"\n]\n";
@@ -538,6 +537,10 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
TracingGPUMemoryAllocator::SetTraceLevel(config(L"traceGPUMemoryAllocations", 0));
bool synchronizeCUDAKernelExecutions = config(L"synchronizeCUDAKernelExecutions", false);
if (synchronizeCUDAKernelExecutions)
SyncGuard::EnableSync();
// logging
wstring logpath = config(L"stderr", L"");
if (logpath != L"")
@@ -581,13 +584,11 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
if (actionsVal.Is<ScriptableObjects::ConfigArray>())
{
const ScriptableObjects::ConfigArray& actions = actionsVal;
for (int i = actions.GetIndexRange().first; i <= actions.GetIndexRange().second; i++)
for (int i = actions.GetIndexBeginEnd().first; i < actions.GetIndexBeginEnd().second; i++)
{
// TODO: When running in parallel with MPI, only commands in 'commandstoRunOnAllRanks' should
// be run in parallel across multiple ranks. Others should only run on rank 0
actions.At(i, [](const wstring&)
{
}); // this will evaluate and thus execute the action
actions.At(i, [](const wstring&){}); // this will evaluate and thus execute the action
}
}
// else action has already been executed, see comment above
@@ -823,15 +824,38 @@ static void LogDelayLoadError(PEXCEPTION_POINTERS pExcPointers)
}
}
#if _DEBUG
// in case of asserts in debug mode, print the message into stderr and throw exception
int HandleDebugAssert(int, // reportType - ignoring reportType, printing message and aborting for all reportTypes
char *message, // message - fully assembled debug user message
int * returnValue) // returnValue - retVal value of zero continues execution
{
fprintf(stderr, "C-Runtime: %s\n", message);
if (returnValue) {
*returnValue = 0; // return value of 0 will continue operation and NOT start the debugger
}
return TRUE; // returning TRUE will make sure no message box is displayed
}
#endif
int wmain(int argc, wchar_t* argv[]) // wmain wrapper that reports Win32 exceptions
{
set_terminate(TerminateThis); // insert a termination handler to ensure stderr gets flushed before actually terminating
_set_error_mode(_OUT_TO_STDERR); // make sure there are no CRT prompts when CNTK is executing
// Note: this does not seem to work--processes with this seem to just hang instead of terminating
__try
{
return wmain1(argc, argv);
// in case of asserts in debug mode, print the message into stderr and throw exception
if (_CrtSetReportHook2(_CRT_RPTHOOK_INSTALL, HandleDebugAssert) == -1) {
LOGPRINTF(stderr, "CNTK: _CrtSetReportHook2 failed.\n");
return -1;
}
int mainReturn = wmain1(argc, argv);
_CrtSetReportHook2(_CRT_RPTHOOK_REMOVE, HandleDebugAssert);
return mainReturn;
}
__except (LogDelayLoadError(GetExceptionInformation()), EXCEPTION_EXECUTE_HANDLER)
{

@@ -81,7 +81,7 @@
<StackReserveSize>100000000</StackReserveSize>
</Link>
<PreBuildEvent>
<Command>prebuild.bat "$(Configuration)" "$(CudaPath)"</Command>
<Command>prebuild.bat "$(Configuration)" "$(CNTK_MKL)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)"</Command>
</PreBuildEvent>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(ReleaseBuild)">
@@ -109,7 +109,7 @@
<StackReserveSize>100000000</StackReserveSize>
</Link>
<PreBuildEvent>
<Command>prebuild.bat "$(Configuration)" "$(CudaPath)"</Command>
<Command>prebuild.bat "$(Configuration)" "$(CNTK_MKL)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)"</Command>
</PreBuildEvent>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(CpuOnlyBuild)">
@@ -144,6 +144,7 @@
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\Common\CrossProcessMutex.h" />
<ClInclude Include="..\Common\Include\basetypes.h" />
<ClInclude Include="..\Common\Include\Basics.h" />
<ClInclude Include="..\Common\Include\BestGpu.h" />
<ClInclude Include="..\Common\Include\DataReader.h" />
@@ -199,7 +200,6 @@
<ItemGroup>
<ClCompile Include="BrainScript\BrainScriptEvaluator.cpp" />
<ClCompile Include="BrainScript\BrainScriptParser.cpp" />
<ClCompile Include="BrainScript\BrainScriptTest.cpp" />
<ClCompile Include="CNTK.cpp" />
<ClCompile Include="ModelEditLanguage.cpp" />
<ClCompile Include="stdafx.cpp" />
@@ -222,4 +222,4 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" />
</Project>
</Project>

@@ -1,18 +1,6 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<ClCompile Include="..\Common\DataReader.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\Common\DataWriter.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\Common\File.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\Common\fileutil.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="ModelEditLanguage.cpp">
<Filter>Model Editing</Filter>
</ClCompile>
@@ -22,34 +10,13 @@
<ClCompile Include="tests.cpp">
<Filter>Misc</Filter>
</ClCompile>
<ClCompile Include="..\Common\TimerUtility.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="CNTK.cpp" />
<ClCompile Include="..\Common\MPIWrapper.cpp">
<Filter>MPI Interfacing</Filter>
</ClCompile>
<ClCompile Include="..\Common\Include\ConcStack.h">
<Filter>Common\Include</Filter>
</ClCompile>
<ClCompile Include="..\Common\Config.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="BrainScript\BrainScriptEvaluator.cpp">
<Filter>BrainScript</Filter>
</ClCompile>
<ClCompile Include="BrainScript\BrainScriptParser.cpp">
<Filter>BrainScript</Filter>
</ClCompile>
<ClCompile Include="BrainScript\BrainScriptTest.cpp">
<Filter>BrainScript</Filter>
</ClCompile>
<ClCompile Include="..\Common\ExceptionWithCallStack.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\Common\CompositeDataReader.cpp">
<Filter>Common</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\Common\Include\fileutil.h">
@@ -205,9 +172,8 @@
<ClInclude Include="..\Readers\ReaderLib\Transformer.h">
<Filter>from ReaderLib</Filter>
</ClInclude>
<ClInclude Include="..\Common\Include\CompositeDataReader.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\Common\Include\basetypes.h" />
<ClInclude Include="..\Readers\CompositeDataReader\CompositeDataReader.h" />
</ItemGroup>
<ItemGroup>
<Text Include="modelEditor.txt">

@@ -591,7 +591,7 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa
std::string paramPath = params[1];
NetNdl<ElemType>* netNdl;
vector<ComputationNodeBasePtr> nodes = FindSymbols(params[0], netNdl);
vector<ComputationNodeBasePtr> nodes = FindSymbols(nodeName, netNdl);
for (auto& pNodes : nodes)
{

@@ -180,7 +180,7 @@ public:
auto nodePtr = builder.CreateLearnableParameter(name, 1, 1);
ndlNode->SetEvalValue(nodePtr.get());
ElemType val = ndlNode->GetScalar();
nodePtr->Value().SetValue(val);
cn->InitLearnableParameters(nodePtr, L"fixedValue", val);
}
}
}

@@ -7,10 +7,23 @@ setlocal enableDelayedexpansion
::: for full license information.
::: ==============================================================================
:::
::: This is called as a pre-build step for the CNTK executable.
::: It receives the build's configuration, $(Configuration), as the first parameter.
::: This is called as a pre-build step for the CNTK executable, taking parameters below.
::: It creates buildinfo.h, which makes version information available to the executable itself.
:: Grab the parameters
::
:: Note: don't rely on environment variables, since properties may have been
:: overridden at msbuild invocation. By convention, we let parameters start with p_, locals with l_.
:: A Vim search for [%!]\([lp]_\)\@!\w\+[%!:] should only match
:: well-known (non-CNTK-specific) environment variables.
set p_Configuration=%~1
set p_CNTK_MKL=%~2
set p_CNTK_MKL_SEQUENTIAL=%~3
set p_CNTK_ENABLE_1BitSGD=%~4
set p_CudaPath=%~5
set p_CUDNN_PATH=%~6
set p_CUB_PATH=%~7
echo #ifndef _BUILDINFO_H > buildinfo.h$$
echo #define _BUILDINFO_H >> buildinfo.h$$
@@ -23,19 +36,19 @@ if not errorlevel 1 (
call git --version > NUL 2>&1
if not errorlevel 1 (
echo #define _GIT_EXIST >> buildinfo.h$$
FOR /F %%i IN ('call git rev-parse --abbrev-ref HEAD') DO SET BRANCH=%%i
FOR /F %%i IN ('call git rev-parse HEAD') DO SET COMMIT=%%i
set STATUS=
FOR /F %%i IN ('call git rev-parse --abbrev-ref HEAD') DO SET l_BRANCH=%%i
FOR /F %%i IN ('call git rev-parse HEAD') DO SET l_COMMIT=%%i
set l_STATUS=
call git diff --quiet --cached
if not errorlevel 1 call git diff --quiet
if errorlevel 1 set STATUS= ^(modified^)
echo #define _BUILDBRANCH_ "!BRANCH!" >> buildinfo.h$$
echo #define _BUILDSHA1_ "!COMMIT!!STATUS!">> buildinfo.h$$
if errorlevel 1 set l_STATUS= ^(modified^)
echo #define _BUILDBRANCH_ "!l_BRANCH!" >> buildinfo.h$$
echo #define _BUILDSHA1_ "!l_COMMIT!!l_STATUS!">> buildinfo.h$$
)
)
if "%CNTK_MKL%" == "1" (
if "%CNTK_MKL_SEQUENTIAL%" == "1" (
if "%p_CNTK_MKL%" == "1" (
if "%p_CNTK_MKL_SEQUENTIAL%" == "1" (
echo #define _MATHLIB_ "mkl-sequential">> buildinfo.h$$
) else (
echo #define _MATHLIB_ "mkl">> buildinfo.h$$
@@ -49,42 +62,40 @@ echo #define _BUILDER_ "%USERNAME%" >> buildinfo.h$$
echo #define _BUILDER_ "%USERNAME%" >> buildinfo.h$$
echo #define _BUILDMACHINE_ "%HOST%" >> buildinfo.h$$
set scriptpath=%~dp0
set buildpath="%scriptpath:\=\\%"
echo #define _BUILDPATH_ %buildpath% >> buildinfo.h$$
set l_scriptpath=%~dp0
set l_buildpath="%l_scriptpath:\=\\%"
echo #define _BUILDPATH_ %l_buildpath% >> buildinfo.h$$
set build_type=Unknown
set build_target=Unknown
set l_build_type=Unknown
set l_build_target=Unknown
:: Configuration property provided by CNTK.vcxproj
if /i "%~1" == "Debug" set build_type=Debug&set build_target=GPU
if /i "%~1" == "Debug_CpuOnly" set build_type=Debug&set build_target=CPU-only
if /i "%~1" == "Release" set build_type=Release&set build_target=GPU
if /i "%~1" == "Release_CpuOnly" set build_type=Release&set build_target=CPU-only
if /i "%p_Configuration%" == "Debug" set l_build_type=Debug&set l_build_target=GPU
if /i "%p_Configuration%" == "Debug_CpuOnly" set l_build_type=Debug&set l_build_target=CPU-only
if /i "%p_Configuration%" == "Release" set l_build_type=Release&set l_build_target=GPU
if /i "%p_Configuration%" == "Release_CpuOnly" set l_build_type=Release&set l_build_target=CPU-only
echo #define _BUILDTYPE_ "%build_type%">> buildinfo.h$$
echo #define _BUILDTARGET_ "%build_target%">> buildinfo.h$$
echo #define _BUILDTYPE_ "%l_build_type%">> buildinfo.h$$
echo #define _BUILDTARGET_ "%l_build_target%">> buildinfo.h$$
if "%CNTK_ENABLE_1BitSGD%" == "true" (
if "%p_CNTK_ENABLE_1BitSGD%" == "true" (
echo #define _WITH_1BITSGD_ "yes">>buildinfo.h$$
) else (
echo #define _WITH_1BITSGD_ "no">>buildinfo.h$$
)
if not %build_target% == CPU-only (
:: CudaPath property provided by CNTK.vcxproj
if "%~2%" == "" (
if not %l_build_target% == CPU-only (
if "%p_CudaPath%" == "" (
echo #define _CUDA_PATH_ "NOT_DEFINED" >> buildinfo.h$$
) else (
set cudaPathTemp=%~2
echo #define _CUDA_PATH_ "!cudaPathTemp:\=\\!" >> buildinfo.h$$
echo #define _CUDA_PATH_ "!p_CudaPath:\=\\!" >> buildinfo.h$$
)
if not "%cudnn_path%" == "" (
echo #define _CUDNN_PATH_ "%cudnn_path:\=\\%" >> buildinfo.h$$
if not "%p_CUDNN_PATH%" == "" (
echo #define _CUDNN_PATH_ "%p_CUDNN_PATH:\=\\%" >> buildinfo.h$$
)
if not "%cub_path%" == "" (
echo #define _CUB_PATH_ "%cub_path:\=\\%" >> buildinfo.h$$
if not "%p_CUB_PATH%" == "" (
echo #define _CUB_PATH_ "%p_CUB_PATH:\=\\%" >> buildinfo.h$$
)
)

@@ -153,6 +153,11 @@ namespace CNTK
static const size_t InferredDimension = (size_t)-1;
public:
///
/// Construct an NDShape with 0 axes, which denotes a scalar.
///
NDShape() {}
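/// E.g. (illustrative): NDShape scalarShape; // scalarShape.NumAxes() == 0, i.e. a scalar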
///
/// Construct an NDShape instance with the specified number of axes and the dimensionality of each axis.
///
@@ -285,6 +290,7 @@
class NDArrayView final : public std::enable_shared_from_this<NDArrayView>
{
friend class CompositeFunction;
friend class LearnerBase;
template <typename T, typename ...CtorArgTypes>
friend inline std::shared_ptr<T> MakeSharedObject(CtorArgTypes&& ...ctorArgs);
@@ -429,6 +435,16 @@
///
bool IsReadOnly() const { return m_isReadOnly; }
// TODO: The set methods should be offered in template from
///
/// Fill 'this' NDArrayView with the specified value. The underlying DataType of 'this' view should be DataType::Float.
///
CNTK_API void SetValue(float value);
///
/// Fill 'this' NDArrayView with the specified value. The underlying DataType of 'this' view should be DataType::Double.
///
CNTK_API void SetValue(double value);
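/// E.g. (illustrative): for an NDArrayViewPtr 'view' whose DataType is Float, view->SetValue(0.0f) fills every element with 0.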
///
/// Creates a new NDArrayView with newly allocated storage on the same device as 'this' view and copies 'this' view's contents into the newly allocated view.
///
@@ -467,8 +483,6 @@
private:
CNTK_API NDArrayView(CNTK::DataType dataType, const DeviceDescriptor& device, CNTK::StorageFormat storageType, const NDShape& viewShape, bool readOnly, void* tensorView);
CNTK_API void SetValue(float value);
CNTK_API void SetValue(double value);
template <typename ElementType>
static std::shared_ptr<Microsoft::MSR::CNTK::Matrix<ElementType>> GetMatrixImpl(const Microsoft::MSR::CNTK::TensorView<ElementType>* tensorView, size_t rowColSplitPoint);
@@ -526,6 +540,11 @@
///
CNTK_API void Clear();
///
/// Returns the number of masked/invalid values
///
CNTK_API size_t MaskedCount() const;
///
/// Returns the descriptor of the device that 'this' mask resides on
///
@@ -536,6 +555,11 @@
///
const NDShape& Shape() const { return m_maskShape; }
///
/// Returns a read-only pointer to the data buffer underlying 'this' Mask object
///
CNTK_API const char* DataBuffer() const;
///
/// Creates a new NDMask with newly allocated storage on the same device as 'this' mask and copies 'this' mask's contents into the newly allocated mask.
///
@@ -760,7 +784,21 @@
///
/// Create an 'Input' Variable.
///
Variable(const NDShape& shape, CNTK::DataType dataType, const std::wstring& name = L"")
Variable(const NDShape& shape, CNTK::DataType dataType)
: Variable(shape, dataType, L"")
{}
///
/// Create an 'Input' Variable.
///
Variable(const NDShape& shape, CNTK::DataType dataType, const wchar_t* name)
: Variable(shape, dataType, std::wstring(name))
{}
///
/// Create an 'Input' Variable.
///
Variable(const NDShape& shape, CNTK::DataType dataType, const std::wstring& name)
: Variable(shape, VariableKind::Input, dataType, nullptr, nullptr, false, { Axis::DefaultDynamicAxis() }, false, name)
{}
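/// Usage sketch (illustrative; assumes NDShape can be brace-initialized from its dimensions, as the default arguments elsewhere in this header suggest):
///   Variable a({ 784 }, DataType::Float);                            // unnamed
///   Variable b({ 784 }, DataType::Float, L"features");               // from a wide string literal
///   Variable c({ 784 }, DataType::Float, std::wstring(L"features")); // from a std::wstring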
@@ -919,6 +957,10 @@
return first.m_dataFields == second.m_dataFields;
}
inline bool operator!=(const Variable& first, const Variable& second)
{
return !(first == second);
}
///
/// Denotes Parameter inputs of a Function.
///
@@ -1146,7 +1188,7 @@
/// and the user is responsible for ensuring that the contents of the inputs and outputs are unchanged until after any uses of the BackPropState instance
/// for backpropagating gradients through this function.
///
CNTK_API virtual BackPropStatePtr Forward(const std::unordered_map<Variable, const ValuePtr>& arguments,
CNTK_API virtual BackPropStatePtr Forward(const std::unordered_map<Variable, ValuePtr>& arguments,
std::unordered_map<Variable, ValuePtr>& outputs,
const DeviceDescriptor& computeDevice = DeviceDescriptor::DefaultDevice(),
const std::unordered_set<Variable>& outputsToRetainBackwardStateFor = {}) = 0;
@@ -1161,7 +1203,7 @@
/// computation that this gradient backpropagation corresponds to.
///
CNTK_API virtual void Backward(const BackPropStatePtr& state,
const std::unordered_map<Variable, const ValuePtr>& rootGradientValues,
const std::unordered_map<Variable, ValuePtr>& rootGradientValues,
std::unordered_map<Variable, ValuePtr>& backPropagatedGradientValuesForInputs) = 0;
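/// Usage sketch (illustrative; 'func', 'inputVar', 'inputValue', 'lossVar', 'lossValue', 'inputGradientValue' and 'rootGradient' are placeholders defined elsewhere):
///   std::unordered_map<Variable, ValuePtr> outputs = { { lossVar, lossValue } };
///   BackPropStatePtr state = func->Forward({ { inputVar, inputValue } }, outputs, DeviceDescriptor::DefaultDevice(), { lossVar });
///   std::unordered_map<Variable, ValuePtr> inputGradients = { { inputVar, inputGradientValue } };
///   func->Backward(state, { { lossVar, rootGradient } }, inputGradients);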
public:
@@ -1330,10 +1372,74 @@
};
///
/// Create an instance of the CNTK built-in matrix multiplication operation with the specified input operands.
/// TODO: Specify the constraints on the shapes of the operands.
/// Create an instance of the CNTK built-in elementwise negate operation with the specified input operand.
///
CNTK_API FunctionPtr Times(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");
CNTK_API FunctionPtr Negate(const Variable& operand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise sigmoid operation with the specified input operand.
///
CNTK_API FunctionPtr Sigmoid(const Variable& operand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise tanh operation with the specified input operand.
///
CNTK_API FunctionPtr Tanh(const Variable& operand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise linear rectifier operation with the specified input operand.
///
CNTK_API FunctionPtr ReLU(const Variable& operand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise exp operation with the specified input operand.
///
CNTK_API FunctionPtr Exp(const Variable& operand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise log operation with the specified input operand.
///
CNTK_API FunctionPtr Log(const Variable& operand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise square operation with the specified input operand.
///
CNTK_API FunctionPtr Square(const Variable& operand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise square-root operation with the specified input operand.
///
CNTK_API FunctionPtr Sqrt(const Variable& operand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise round operation with the specified input operand.
///
CNTK_API FunctionPtr Round(const Variable& operand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise floor operation with the specified input operand.
///
CNTK_API FunctionPtr Floor(const Variable& operand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise ceil operation with the specified input operand.
///
CNTK_API FunctionPtr Ceil(const Variable& operand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise abs operation with the specified input operand.
///
CNTK_API FunctionPtr Abs(const Variable& operand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise reciprocal operation with the specified input operand.
///
CNTK_API FunctionPtr Reciprocal(const Variable& operand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in softmax operation on specified tensor input operand
///
CNTK_API FunctionPtr Softmax(const Variable& operand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise tensor addition operation with the specified input operands.
@@ -1341,30 +1447,71 @@
CNTK_API FunctionPtr Plus(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise sigmoid operation with the specified input operand.
/// Create an instance of the CNTK built-in elementwise tensor subtraction operation with the specified input operands.
///
CNTK_API FunctionPtr Sigmoid(const Variable& operand, const std::wstring& name = L"");
CNTK_API FunctionPtr Minus(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise tanh operation with the specified input operand.
/// Create an instance of the CNTK built-in elementwise multiplication operation on specified tensor input operands.
///
CNTK_API FunctionPtr Tanh(const Variable& operand, const std::wstring& name = L"");
CNTK_API FunctionPtr ElementTimes(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise division operation on specified tensor input operands.
///
CNTK_API FunctionPtr ElementDivide(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise equality comparison operation on specified tensor input operands.
///
CNTK_API FunctionPtr Equal(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise not-equal comparison operation on specified tensor input operands.
///
CNTK_API FunctionPtr NotEqual(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise less than comparison operation on specified tensor input operands.
///
CNTK_API FunctionPtr Less(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise less than or equal to comparison operation on specified tensor input operands.
///
CNTK_API FunctionPtr LessEqual(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise greater than comparison operation on specified tensor input operands.
///
CNTK_API FunctionPtr Greater(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise greater than or equal to comparison operation on specified tensor input operands.
///
CNTK_API FunctionPtr GreaterEqual(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in matrix multiplication operation with the specified input operands.
/// TODO: Specify the constraints on the shapes of the operands.
///
CNTK_API FunctionPtr Times(const Variable& leftOperand, const Variable& rightOperand, size_t numOutputAxes = 1, const std::wstring& name = L"");
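/// Usage sketch (illustrative): for a weight Variable W of shape (outDim x inDim) and an input Variable x of shape (inDim),
///   FunctionPtr z = Times(W, x);   // numOutputAxes defaults to 1, i.e. an ordinary matrix-vector product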
///
/// Create an instance of the CNTK built-in operation to compute squared-error for specified input operands.
///
CNTK_API FunctionPtr SquaredError(const Variable& prediction, const Variable& targets, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in operation to compute cross-entropy with softmax for specified input operands.
///
CNTK_API FunctionPtr CrossEntropyWithSoftmax(const Variable& output, const Variable& labels, const std::wstring& name = L"");
CNTK_API FunctionPtr CrossEntropyWithSoftmax(const Variable& prediction, const Variable& labels, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in operation for computing the classification prediction error for specified operands.
///
CNTK_API FunctionPtr ClassificationError(const Variable& prediction, const Variable& labels, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise exp operation with the specified input operand.
///
CNTK_API FunctionPtr Exp(const Variable& operand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in operation for getting the past value along the lone dynamic axis of the specified operand.
/// Throws an exception if the operand has more than one dynamic axis.
@@ -1379,21 +1526,582 @@
///
CNTK_API FunctionPtr FutureValue(const Variable& initialState, const Variable& operand, size_t stepSize, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise multiplication operation on specified tensor input operands.
///
CNTK_API FunctionPtr ElementTimes(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in sum reduction operation on specified tensor input operand along all the axes
///
CNTK_API FunctionPtr ReduceSum(const Variable& operand, const std::wstring& name = L"");
///
/// Per dimension mean-variance normalization of the specified input operand.
///
CNTK_API FunctionPtr PerDimMeanVarianceNormalize(const Variable& operand, const NDArrayViewPtr& mean, const NDArrayViewPtr& invStdDev, const std::wstring& name = L"");
///
/// TODO:
///
CNTK_API FunctionPtr Convolution(const Variable& convolutionMap,
const Variable& operand,
const NDShape& strides = {1},
const std::vector<bool>& sharing = {true},
const std::vector<bool>& autoPadding = {true},
const NDShape& lowerPad = {0},
const NDShape& upperPad = {0},
bool transpose = false,
size_t maxTempMemSizeInSamples = 0,
const std::wstring& name = L"");
///
/// TODO:
///
enum class PoolingType
{
Max,
Average,
};
///
/// TODO:
///
CNTK_API FunctionPtr Pooling(const Variable& operand,
PoolingType poolingType,
const NDShape& poolingWindowShape,
const NDShape& strides = {1},
const std::vector<bool>& autoPadding = {false},
const NDShape& lowerPad = {0},
const NDShape& upperPad = {0},
const std::wstring& name = L"");
///
/// TODO:
///
CNTK_API FunctionPtr BatchNormalization(const Variable& operand,
const Variable& scale,
const Variable& bias,
const Variable& runningMean,
const Variable& runningInvStd,
bool spacial,
double normalizationTimeConstant = 0,
double blendTimeConstant = 0,
double epsilon = 0.00001,
bool useCuDNNEngine = false,
const std::wstring& name = L"");
///
/// Create a new Function instance which just combines the outputs of the specified list of 'operands' Functions such that the 'Outputs' of the
/// new 'Function' are union of the 'Outputs' of each of the specified 'operands' Functions.
/// E.g. When creating a classification model, typically the CrossEntropy loss Function and the ClassificationError Function comprise the two roots
/// of the computation graph which can be "Combine"d to create a single Function with 2 outputs; viz. CrossEntropy loss and ClassificationError output.
///
CNTK_API FunctionPtr Combine(const std::initializer_list<FunctionPtr>& operands, const std::wstring& name = L"");
CNTK_API FunctionPtr Combine(const std::vector<FunctionPtr>& operands, const std::wstring& name = L"");
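/// E.g. (illustrative; 'trainingLossFunction' and 'classificationErrorFunction' are FunctionPtrs built elsewhere):
///   FunctionPtr criterion = Combine({ trainingLossFunction, classificationErrorFunction });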
///
/// Load a legacy CNTK v1 format model
///
template <typename ElementType>
CNTK_API FunctionPtr LoadLegacyModel(const std::wstring& modelFile, const DeviceDescriptor& computeDevice = DeviceDescriptor::DefaultDevice());
///
/// Save a Composite Function instance to a file in CNTK legacy model format
///
template <typename ElementType>
CNTK_API void SaveAsLegacyModel(const FunctionPtr& rootFunction, const std::wstring& modelFile);
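/// E.g. (illustrative, with hypothetical file names):
///   FunctionPtr model = LoadLegacyModel<float>(L"model.dnn");
///   SaveAsLegacyModel<float>(model, L"model.v1.dnn");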
///
/// A serializable value represents one of:
/// a) Boolean
/// b) Signed long integer
/// c) Single and double precision floating point values
/// d) NDShape
/// e) vector<DictionaryValue>
///
/// TODO: We need to have native support for DictionaryValue<vector> and DictionaryValue<NDArrayView>.
class DictionaryValue final
{
public:
enum class Type : unsigned int
{
None,
Bool,
SizeT,
Float,
Double,
String,
NDShape,
Vector,
Dictionary,
};
static const char* TypeName(Type type)
{
switch (type)
{
case Type::None:
return "None";
case Type::Bool:
return "Bool";
case Type::SizeT:
return "SizeT";
case Type::Float:
return "Float";
case Type::Double:
return "Double";
case Type::String:
return "String";
case Type::NDShape:
return "NDShape";
case Type::Vector:
return "Vector";
case Type::Dictionary:
return "Dictionary";
default:
LogicError("Unknown DictionaryValue::Type");
}
}
public:
DictionaryValue() : m_valueType(Type::None)
{
}
DictionaryValue(bool value) : m_valueType(GetValueType<bool>())
{
m_data.m_boolean = value;
}
DictionaryValue(size_t value) : m_valueType(GetValueType<size_t>())
{
m_data.m_sizeT = value;
}
DictionaryValue(float value) : m_valueType(GetValueType<float>())
{
m_data.m_float = value;
}
DictionaryValue(double value) : m_valueType(GetValueType<double>())
{
m_data.m_double = value;
}
DictionaryValue(const wchar_t* value)
: DictionaryValue(std::wstring(value))
{}
template <typename T>
DictionaryValue(const T& value) : m_valueType(GetValueType<T>())
{
static_assert(std::is_same<T, NDShape>::value ||
std::is_same<T, std::wstring>::value ||
std::is_same<T, std::vector<DictionaryValue>>::value ||
std::is_same<T, Dictionary>::value,
"Unsupported ValueType");
AllocateDataPtr(value);
}
DictionaryValue(const DictionaryValue& other) : m_valueType(Type::Bool)
{
// The m_valueType must have been set to a non-ptr type to prevent an attempt to interpret
// the underlying uninitialized value as a ptr and free it.
*this = other;
}
DictionaryValue& operator=(const DictionaryValue& other)
{
if (this != &other)
{
FreeDataPtr();
m_valueType = other.m_valueType;
m_data = other.m_data;
if (other.m_valueType == Type::String)
AllocateDataPtr(other.GetValue<std::wstring>());
else if (other.m_valueType == Type::NDShape)
AllocateDataPtr(other.GetValue<NDShape>());
else if (other.m_valueType == Type::Vector)
AllocateDataPtr(other.GetValue<std::vector<DictionaryValue>>());
else if (other.m_valueType == Type::Dictionary)
AllocateDataPtr(other.GetValue<Dictionary>());
}
return *this;
}
~DictionaryValue()
{
FreeDataPtr();
}
template <typename T, typename std::enable_if<std::is_same<T, bool>::value>::type* = nullptr>
const T& GetValue() const
{
VerifyType<T>();
return m_data.m_boolean;
}
template <typename T, typename std::enable_if<std::is_same<T, size_t>::value>::type* = nullptr>
const T& GetValue() const
{
VerifyType<T>();
return m_data.m_sizeT;
}
template <typename T, typename std::enable_if<std::is_same<T, float>::value>::type* = nullptr>
const T& GetValue() const
{
VerifyType<T>();
return m_data.m_float;
}
template <typename T, typename std::enable_if<std::is_same<T, double>::value>::type* = nullptr>
const T& GetValue() const
{
VerifyType<T>();
return m_data.m_double;
}
template <typename T, typename std::enable_if<std::is_same<T, NDShape>::value ||
std::is_same<T, std::wstring>::value ||
std::is_same<T, std::vector<DictionaryValue>>::value ||
std::is_same<T, Dictionary>::value>::type* = nullptr>
const T& GetValue() const
{
VerifyType<T>();
return *(reinterpret_cast<T*>(m_data.m_ptr));
}
bool HasValue() const
{
return m_valueType != Type::None;
}
Type ValueType() const
{
return m_valueType;
}
friend CNTK_API Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, DictionaryValue& us);
friend CNTK_API Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const DictionaryValue& us);
private:
template <typename T>
static Type GetValueType()
{
static_assert(std::is_same<T, bool>::value ||
std::is_same<T, size_t>::value ||
std::is_same<T, float>::value ||
std::is_same<T, double>::value ||
std::is_same<T, std::wstring>::value ||
std::is_same<T, NDShape>::value ||
std::is_same<T, std::vector<DictionaryValue>>::value ||
std::is_same<T, Dictionary>::value,
"Unsupported ValueType");
if (std::is_same<T, bool>::value) return Type::Bool;
if (std::is_same<T, size_t>::value) return Type::SizeT;
if (std::is_same<T, float>::value) return Type::Float;
if (std::is_same<T, double>::value) return Type::Double;
if (std::is_same<T, std::wstring>::value) return Type::String;
if (std::is_same<T, NDShape>::value) return Type::NDShape;
if (std::is_same<T, std::vector<DictionaryValue>>::value) return Type::Vector;
if (std::is_same<T, Dictionary>::value) return Type::Dictionary;
}
template <typename T>
void VerifyType() const
{
if (GetValueType<T>() != m_valueType)
RuntimeError("Reading a DictionaryValue as the wrong type; Reading as type %s when actual type is %s", typeid(T).name(), DictionaryValue::TypeName(m_valueType));
}
template <typename T>
CNTK_API void AllocateDataPtr(const T& value);
template <typename T>
CNTK_API void FreePtrAsType();
CNTK_API void FreeDataPtr()
{
if (m_valueType == Type::String)
FreePtrAsType<std::wstring>();
else if (m_valueType == Type::NDShape)
FreePtrAsType<NDShape>();
else if (m_valueType == Type::Vector)
FreePtrAsType<std::vector<DictionaryValue>>();
else if (m_valueType == Type::Dictionary)
FreePtrAsType<Dictionary>();
}
Type m_valueType;
union ValueData
{
bool m_boolean;
size_t m_sizeT;
float m_float;
double m_double;
void* m_ptr;
} m_data;
const size_t version = 1;
};
///
/// A type denoting a dictionary (keyed by Unicode strings) of serializable values (dynamically typed).
///
class Dictionary final
{
friend inline void AddConfigString(std::wstringstream& s, const DictionaryValue& value, size_t numIndentationSpaces);
friend class CompositeMinibatchSource;
public:
CNTK_API Dictionary();
CNTK_API ~Dictionary();
CNTK_API Dictionary(const Dictionary&);
CNTK_API Dictionary& operator=(const Dictionary&);
CNTK_API Dictionary(Dictionary&& other);
CNTK_API Dictionary& operator=(Dictionary&& other);
CNTK_API DictionaryValue& operator[](const wchar_t* key);
DictionaryValue& operator[](const std::wstring& key)
{
return operator[](key.c_str());
}
CNTK_API DictionaryValue operator[](const wchar_t* key) const;
DictionaryValue operator[](const std::wstring& key) const
{
return operator[](key.c_str());
}
CNTK_API bool Contains(const wchar_t* key) const;
bool Contains(const std::wstring& key) const
{
return Contains(key.c_str());
}
friend CNTK_API Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, Dictionary& us);
friend CNTK_API Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const Dictionary& us);
private:
std::shared_ptr<std::unordered_map<std::wstring, DictionaryValue>> m_dictionaryData;
const size_t version = 1;
};
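// --- Illustrative sketch (not part of the original header): building a nested configuration with the
// --- Dictionary/DictionaryValue API above. Key names and values are hypothetical.
inline Dictionary MakeReaderConfigSketch()
{
    Dictionary config;
    config[L"epochSize"] = (size_t)50000;        // stored as Type::SizeT
    config[L"filePath"] = L"train.ctf";          // stored as Type::String
    config[L"randomize"] = true;                 // stored as Type::Bool
    std::vector<DictionaryValue> sampleShape = { DictionaryValue((size_t)28), DictionaryValue((size_t)28) };
    config[L"sampleShape"] = sampleShape;        // stored as Type::Vector
    if (config.Contains(L"epochSize"))
    {
        size_t epochSize = config[L"epochSize"].GetValue<size_t>(); // typed read-back
        (void)epochSize;
    }
    return config;
}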
///
/// Abstraction for learning a subset of parameters of a learnable function using first order gradient values
/// E.g., momentum, AdaGrad, RMSProp, etc. are different types of learners, each with its own algorithm for
/// learning parameter values using first order gradients.
///
class Learner : public std::enable_shared_from_this<Learner>
{
public:
//
// Method to update the parameters associated with this learner. By returning false, this method indicates that
// learning has stopped for all of the parameters associated with this learner
//
CNTK_API virtual bool Update(const std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, size_t trainingSampleCount) = 0;
///
/// Returns the set of parameters associated with this learner.
///
const std::unordered_set<Parameter>& Parameters() const { return m_parameters; }
///
/// Optionally overridable method to checkpoint the learner's state.
///
// TODO: move the following two methods into ISerializable interface, make
// Learner (and all other entities that need checkpointing capability) implement it.
CNTK_API virtual Dictionary GetCheckpointState() const { return Dictionary(); }
///
/// Optionally overridable method to restore the learner's state from a previous checkpoint.
///
CNTK_API virtual void RestoreFromCheckpoint(const Dictionary& /*checkpoint*/) {}
virtual ~Learner() {}
protected:
Learner(const std::unordered_set<Parameter>& parameters)
: m_parameters(parameters)
{}
std::unordered_set<Parameter> m_parameters;
};
///
/// Create an instance of the CNTK built-in SGD learner.
///
/// TODO: add additional SGD parameters here (a collection of learning rate values)
CNTK_API LearnerPtr SGDLearner(const std::unordered_set<Parameter>& parameters, double learningRatePerSample);
///
/// Create an instance of the CNTK built-in Momentum SGD learner.
///
/// TODO: add additional Momentum parameters here (a collection of momentum rate values)
CNTK_API LearnerPtr MomentumSGDLearner(const std::unordered_set<Parameter>& parameters);
///
/// Create an instance of the CNTK built-in Nesterov's accelerated SGD learner.
///
CNTK_API LearnerPtr NesterovLearner(const std::unordered_set<Parameter>& parameters);
///
/// Create an instance of the CNTK built-in AdaGrad learner.
///
CNTK_API LearnerPtr AdaGradLearner(const std::unordered_set<Parameter>& parameters, bool needAveMultiplier = true);
///
/// Create an instance of the CNTK built-in FSAdaGrad (improved AdaGrad) learner.
///
CNTK_API LearnerPtr FSAdaGradLearner(const std::unordered_set<Parameter>& parameters);
///
/// Create an instance of the CNTK built-in RMSProp learner.
///
CNTK_API LearnerPtr RMSPropLearner(const std::unordered_set<Parameter>& parameters,
double gamma,
double inc,
double dec,
double max,
double min,
bool needAveMultiplier = true);
///
/// Trainer is the top-level abstraction responsible for the orchestration of the training of a model
/// using the specified learners and training data, either explicitly supplied as Value objects or obtained from
/// a MinibatchSource object.
///
class Trainer
{
public:
///
/// Construct a Trainer to train the specified 'model' with the specified 'trainingLoss' Variable as the training criterion
/// and using the specified set of 'parameterLearners' for updating the model's parameters using computed gradients.
///
CNTK_API Trainer(const FunctionPtr& model, const Variable& trainingLoss, const std::unordered_set<LearnerPtr>& parameterLearners);
///
/// Optimize model parameters using the specified 'arguments' minibatch of training samples.
/// Returns false if all parameter learners indicate end of learning (through their Update method's return value).
///
CNTK_API bool TrainMinibatch(const std::unordered_map<Variable, ValuePtr>& arguments, const DeviceDescriptor& computeDevice = DeviceDescriptor::DefaultDevice());
///
/// Model being trained by 'this' Trainer.
///
FunctionPtr Model() const { return m_model; }
///
/// Variable of the Trainer's model representing the training loss that is used as the optimization
/// criterion for learning the model's parameters.
///
Variable TrainingLossVariable() const { return m_trainingLossVar; }
///
/// Returns the Value of the training loss variable of the model corresponding to the last minibatch trained with
///
ValuePtr PreviousMinibatchTrainingLossValue() const { return m_prevMinibatchTrainingLossValue; }
///
/// Learners associated with this Trainer for updating the model's parameters using computed gradients.
///
const std::unordered_set<LearnerPtr>& ParameterLearners() const { return m_parameterLearners; }
private:
FunctionPtr m_model;
Variable m_trainingLossVar;
ValuePtr m_prevMinibatchTrainingLossValue;
std::unordered_set<LearnerPtr> m_parameterLearners;
};
///
/// Describes an input stream: its name, element type, storage, etc.
///
struct StreamInfo
{
std::wstring m_name; // Unique name of the stream
size_t m_id; // Unique identifier of the stream
StorageFormat m_storageFormat; // Storage format of the stream
DataType m_elementType; // Element type of the stream
NDShape m_sampleLayout; // Layout of the sample for the stream
};
inline bool operator==(const StreamInfo& left, const StreamInfo& right)
{
return ((left.m_id == right.m_id) &&
(left.m_name == right.m_name) &&
(left.m_storageFormat == right.m_storageFormat) &&
(left.m_elementType == right.m_elementType) &&
(left.m_sampleLayout == right.m_sampleLayout));
}
}
namespace std {
template <> struct hash<CNTK::StreamInfo>
{
size_t operator()(const CNTK::StreamInfo& x) const
{
return std::hash<size_t>()(x.m_id);
}
};
}
namespace CNTK
{
struct MinibatchData
{
size_t m_numSequences;
size_t m_numSamples;
ValuePtr m_data;
};
///
/// Abstraction for generating minibatches of samples for training/evaluation.
///
class MinibatchSource : public std::enable_shared_from_this<MinibatchSource>
{
public:
///
/// Describes the streams 'this' MinibatchSource produces.
///
virtual const std::unordered_set<StreamInfo>& StreamInfos() = 0;
///
/// Reads a minibatch that contains data across all input streams.
/// The perStreamMBSizeLimits argument specifies the desired minibatch size for each stream of the reader, either in terms of #sequences or
/// #samples or both. If the size is specified in terms of both #sequences and #samples, the smaller of the two is taken. The actual
/// returned size of the minibatch is the min across all streams. Also, the requested MB size fields in the maps are updated by the
/// MinibatchSource to contain the actual #sequences and #samples in the returned minibatch for the corresponding stream.
/// The return value indicates whether the MinibatchSource will return any further data in subsequent calls of this function.
///
virtual std::unordered_map<StreamInfo, MinibatchData> GetNextMinibatch(const std::unordered_map<StreamInfo, std::pair<size_t, size_t>>& perStreamMBSizeLimits,
const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice()) = 0;
// TODO: Methods to save and restore from checkpoints
// Disallow copy and move construction and assignment
MinibatchSource(const MinibatchSource&) = delete; MinibatchSource(MinibatchSource&&) = delete; MinibatchSource& operator=(const MinibatchSource&) = delete; MinibatchSource& operator=(MinibatchSource&&) = delete;
protected:
MinibatchSource() {}
};
///
/// Instantiate the CNTK built-in composite minibatch source.
///
CNTK_API MinibatchSourcePtr CreateCompositeMinibatchSource(const Dictionary& configuration);
///
/// Compute the per dimension means and variances for each of the specified streams using data from the specified minibatchSource.
///
CNTK_API void ComputeInputPerDimMeansAndInvStdDevs(const MinibatchSourcePtr& minibatchSource,
std::unordered_map<StreamInfo, std::pair<NDArrayViewPtr, NDArrayViewPtr>>& computedMeanAndVariances,
const DeviceDescriptor& device = DeviceDescriptor::CPUDevice());
}
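A minimal consumption sketch for the minibatch-source API above (not part of the original header); the helper name, the per-stream limit semantics, and the end-of-data convention are assumptions of this sketch.
// Hypothetical sketch: read one minibatch from an already created MinibatchSource.
// Assumes CNTKLibrary.h and <unordered_map> are available; the pair passed per stream is
// interpreted as (#sequences limit, #samples limit), following the GetNextMinibatch comment above.
inline bool ReadOneMinibatchSketch(const CNTK::MinibatchSourcePtr& source,
                                   size_t maxSamplesPerStream,
                                   const CNTK::DeviceDescriptor& device)
{
    // Request at most 'maxSamplesPerStream' sequences/samples for every stream the source exposes.
    std::unordered_map<CNTK::StreamInfo, std::pair<size_t, size_t>> perStreamLimits;
    for (const auto& streamInfo : source->StreamInfos())
        perStreamLimits.insert({ streamInfo, { maxSamplesPerStream, maxSamplesPerStream } });
    auto minibatch = source->GetNextMinibatch(perStreamLimits, device);
    // Each entry carries the actual #sequences and #samples returned for that stream plus the data itself.
    for (const auto& streamDataPair : minibatch)
    {
        const CNTK::MinibatchData& data = streamDataPair.second;
        (void)data.m_numSequences;
        (void)data.m_numSamples;
        (void)data.m_data;
    }
    // Treating an empty result as "no more data" is an assumption of this sketch.
    return !minibatch.empty();
}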

View file

@ -47,6 +47,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template <typename ElementType>
class ComputationNode;
class File;
}}}
// TODO: The following should be reconciled with the equivalent code in the CNTK implementation
@ -100,9 +102,15 @@ namespace CNTK
// RuntimeError - throw a std::runtime_error with a formatted error string
#ifndef _MSC_VER // gcc __attribute__((format(printf())) does not percolate through variadic templates; so must go the macro route
#ifndef RuntimeError
#define RuntimeError ThrowFormatted<std::runtime_error>
#endif
#ifndef LogicError
#define LogicError ThrowFormatted<std::logic_error>
#endif
#ifndef InvalidArgument
#define InvalidArgument ThrowFormatted<std::invalid_argument>
#endif
#else
template <class... _Types>
__declspec_noreturn inline void RuntimeError(const char* format, _Types&&... _Args)
@ -158,4 +166,12 @@ namespace CNTK
class Function;
typedef std::shared_ptr<Function> FunctionPtr;
class Learner;
typedef std::shared_ptr<Learner> LearnerPtr;
class Dictionary;
class MinibatchSource;
typedef std::shared_ptr<MinibatchSource> MinibatchSourcePtr;
}

View file

@ -0,0 +1,274 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#include "CNTKLibrary.h"
#include "Function.h"
#include "ComputationNetworkBuilder.h"
#include "Utils.h"
#include "ComputationNode.h"
#include "InputAndParamNodes.h"
#include "NonlinearityNodes.h"
#include "LinearAlgebraNodes.h"
#include "RecurrentNodes.h"
#include "EvaluationNodes.h"
#include "TrainingNodes.h"
using namespace Microsoft::MSR::CNTK;
namespace CNTK
{
template <typename ElementType>
Variable GetVariable(const ComputationNodeBasePtr& node,
std::unordered_map<ComputationNodeBasePtr, Variable>& nodeToVariableMap,
std::unordered_map<Placeholder, Variable>& placeholderReplacements,
std::unordered_set<FunctionPtr>& allPrimitiveFunctions)
{
auto iter = nodeToVariableMap.find(node);
if (iter != nodeToVariableMap.end())
return iter->second;
Variable var;
NDShape varShape = AsNDShape(node->GetSampleLayout());
// The CNTK sample layouts may have trailing axes with dimension size of 1 which are automatically
// added when converting from NDShape to CNTK internal TensorShapes and are not present in the original
// shapes specified by the user. These should be truncated.
if (varShape.NumAxes() <= 2)
{
size_t numTrailingDimsToRemove = 0;
for (int i = varShape.NumAxes() - 1; i >= 0; --i)
{
if (varShape[i] == 1)
numTrailingDimsToRemove++;
else
break;
}
varShape = varShape.SubShape(0, varShape.NumAxes() - numTrailingDimsToRemove);
}
if (node->IsLeaf())
{
if (node->Is<InputValueBase<ElementType>>())
{
auto inputNode = node->As<InputValueBase<ElementType>>();
bool isSparse = node->Is<SparseInputValue<ElementType>>();
if (node->HasMBLayout())
{
// TODO: Currently only default dynamic axis is supported
const std::wstring defaultCNTKDynamicAxisName = L"";
if (inputNode->GetRequestedDynamicAxis() != defaultCNTKDynamicAxisName)
LogicError("Found dynamic axis named '%S' while currently only default dynamic axis named '%S' is supported!", node->GetMBLayout()->GetAxisName(), defaultCNTKDynamicAxisName.c_str());
var = Variable(varShape, isSparse, AsDataType<ElementType>(), node->GetLearningRateMultiplier() != 0, node->GetName());
}
else
{
// TODO: Allow creating inputs without a dynamic axis
LogicError("Found InputNode with no dynamic axis which is currently unsupported");
}
}
else if (node->Is<LearnableParameter<ElementType>>())
{
bool isConstant = (node->GetLearningRateMultiplier() == 0);
auto& matrix = node->As<ComputationNode<ElementType>>()->Value();
auto tensorView = new TensorView<ElementType>(std::make_shared<Matrix<ElementType>>(matrix.AsReference()), node->GetSampleLayout());
NDArrayViewPtr parameterValue = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), AsStorageFormat(matrix.GetFormat()), varShape, false, tensorView);
if (isConstant)
var = Constant(parameterValue, node->GetName());
else
var = Parameter(parameterValue, node->GetName());
}
else
LogicError("CNTK::LoadLegacyModel: Unsupported legacy CNTK node named '%S'", node->NodeName().c_str());
}
else
{
// This is a non-leaf node and maps to a primitive Function
auto placeholderVar = Placeholder(varShape);
nodeToVariableMap[node] = placeholderVar;
std::vector<Variable> inputVars(node->GetNumInputs());
for (size_t i = 0; i < inputVars.size(); ++i)
{
inputVars[i] = GetVariable<ElementType>(node->Input(i), nodeToVariableMap, placeholderReplacements, allPrimitiveFunctions);
if (inputVars[i].IsPlaceholder())
placeholderReplacements[Placeholder(inputVars[i])] = Variable();
}
PrimitiveOpType opType;
Dictionary primitiveFunctionConfigParameters;
if (node->OperationName() == OperationNameOf(NegateNode))
opType = PrimitiveOpType::Negate;
else if (node->OperationName() == OperationNameOf(SigmoidNode))
opType = PrimitiveOpType::Sigmoid;
else if (node->OperationName() == OperationNameOf(TanhNode))
opType = PrimitiveOpType::Tanh;
else if (node->OperationName() == OperationNameOf(RectifiedLinearNode))
opType = PrimitiveOpType::ReLU;
else if (node->OperationName() == OperationNameOf(ExpNode))
opType = PrimitiveOpType::Exp;
else if (node->OperationName() == OperationNameOf(LogNode))
opType = PrimitiveOpType::Log;
else if (node->OperationName() == OperationNameOf(SqrtNode))
opType = PrimitiveOpType::Sqrt;
else if (node->OperationName() == OperationNameOf(FloorNode))
opType = PrimitiveOpType::Floor;
else if (node->OperationName() == OperationNameOf(AbsNode))
opType = PrimitiveOpType::Abs;
else if (node->OperationName() == OperationNameOf(ReciprocalNode))
opType = PrimitiveOpType::Reciprocal;
else if (node->OperationName() == OperationNameOf(SoftmaxNode))
opType = PrimitiveOpType::Softmax;
else if (node->OperationName() == OperationNameOf(PlusNode))
opType = PrimitiveOpType::Plus;
else if (node->OperationName() == OperationNameOf(MinusNode))
opType = PrimitiveOpType::Minus;
else if (node->OperationName() == OperationNameOf(ElementTimesNode))
opType = PrimitiveOpType::ElementTimes;
else if (node->OperationName() == OperationNameOf(EqualNode))
opType = PrimitiveOpType::Equal;
else if (node->OperationName() == OperationNameOf(NotEqualNode))
opType = PrimitiveOpType::NotEqual;
else if (node->OperationName() == OperationNameOf(LessNode))
opType = PrimitiveOpType::Less;
else if (node->OperationName() == OperationNameOf(LessEqualNode))
opType = PrimitiveOpType::LessEqual;
else if (node->OperationName() == OperationNameOf(GreaterNode))
opType = PrimitiveOpType::Greater;
else if (node->OperationName() == OperationNameOf(GreaterEqualNode))
opType = PrimitiveOpType::GreaterEqual;
else if (node->OperationName() == OperationNameOf(TimesNode))
{
primitiveFunctionConfigParameters[L"numOutputAxes"] = DictionaryValue((size_t)node->As<TimesNode<ElementType>>()->OutputRank());
opType = PrimitiveOpType::Times;
}
else if (node->OperationName() == OperationNameOf(PastValueNode))
{
if (inputVars.size() == 1)
{
auto initialStateVar = Constant({}, node->As<PastValueNode<ElementType>>()->InitialActivationValue(), AsDeviceDescriptor(node->GetDeviceId()));
inputVars.insert(inputVars.begin(), initialStateVar);
}
primitiveFunctionConfigParameters[L"stepSize"] = DictionaryValue((size_t)node->As<PastValueNode<ElementType>>()->TimeStep());
opType = PrimitiveOpType::PastValue;
}
else if (node->OperationName() == OperationNameOf(FutureValueNode))
{
if (inputVars.size() == 1)
{
auto initialStateVar = Constant({}, node->As<FutureValueNode<ElementType>>()->InitialActivationValue(), AsDeviceDescriptor(node->GetDeviceId()));
inputVars.insert(inputVars.begin(), initialStateVar);
}
primitiveFunctionConfigParameters[L"stepSize"] = DictionaryValue((size_t)node->As<FutureValueNode<ElementType>>()->TimeStep());
opType = PrimitiveOpType::FutureValue;
}
else if (node->OperationName() == OperationNameOf(SquareErrorNode))
opType = PrimitiveOpType::SquaredError;
else if (node->OperationName() == OperationNameOf(CrossEntropyWithSoftmaxNode))
{
std::swap(inputVars[0], inputVars[1]);
opType = PrimitiveOpType::CrossEntropyWithSoftmax;
}
else if (node->OperationName() == OperationNameOf(ErrorPredictionNode))
{
std::swap(inputVars[0], inputVars[1]);
opType = PrimitiveOpType::ClassificationError;
}
else if (node->OperationName() == OperationNameOf(SumElementsNode))
opType = PrimitiveOpType::ReduceSum;
else if (node->OperationName() == OperationNameOf(ConvolutionNode))
{
auto convolutionNode = node->As<ConvolutionNode<ElementType>>();
primitiveFunctionConfigParameters[L"strides"] = AsNDShape(convolutionNode->Strides());
primitiveFunctionConfigParameters[L"sharing"] = AsDictionaryValueVector(convolutionNode->Sharing());
primitiveFunctionConfigParameters[L"autoPadding"] = AsDictionaryValueVector(convolutionNode->AutoPad());
primitiveFunctionConfigParameters[L"lowerPad"] = AsNDShape(convolutionNode->LowerPad());
primitiveFunctionConfigParameters[L"upperPad"] = AsNDShape(convolutionNode->UpperPad());
primitiveFunctionConfigParameters[L"transpose"] = convolutionNode->Transpose();
primitiveFunctionConfigParameters[L"maxTempMemSizeInSamples"] = convolutionNode->MaxTempMemSizeInSamples();
opType = PrimitiveOpType::Convolution;
}
else if (node->OperationName() == OperationNameOf(PoolingNode))
{
auto poolingNode = node->As<PoolingNode<ElementType>>();
primitiveFunctionConfigParameters[L"poolingType"] = (size_t)(AsPoolingType(poolingNode->PoolingKind()));
primitiveFunctionConfigParameters[L"poolingWindowShape"] = AsNDShape(poolingNode->KernelShape());
primitiveFunctionConfigParameters[L"strides"] = AsNDShape(poolingNode->Strides());
primitiveFunctionConfigParameters[L"autoPadding"] = AsDictionaryValueVector(poolingNode->AutoPad());
primitiveFunctionConfigParameters[L"lowerPad"] = AsNDShape(poolingNode->LowerPad());
primitiveFunctionConfigParameters[L"upperPad"] = AsNDShape(poolingNode->UpperPad());
opType = PrimitiveOpType::Pooling;
}
else if (node->OperationName() == OperationNameOf(BatchNormalizationNode))
{
auto batchNormalizationNode = node->As<BatchNormalizationNode<ElementType>>();
primitiveFunctionConfigParameters[L"spacial"] = batchNormalizationNode->Spatial();
primitiveFunctionConfigParameters[L"normalizationTimeConstant"] = batchNormalizationNode->NormalizationTimeConstant();
primitiveFunctionConfigParameters[L"blendTimeConstant"] = batchNormalizationNode->BlendTimeConstant();
primitiveFunctionConfigParameters[L"epsilon"] = batchNormalizationNode->Epsilon();
primitiveFunctionConfigParameters[L"useCuDNNEngine"] = !batchNormalizationNode->UseCNTKEngine();
opType = PrimitiveOpType::BatchNormalization;
}
else
LogicError("Unsupported ComputationNode with OperationName='%S' found when loading legacy CNTK model", node->OperationName().c_str());
FunctionPtr primitiveFunction = MakeSharedObject<PrimitiveFunction>(opType, inputVars, std::move(primitiveFunctionConfigParameters), node->GetName());
allPrimitiveFunctions.insert(primitiveFunction);
var = primitiveFunction->Output();
if (placeholderReplacements.find(placeholderVar) != placeholderReplacements.end())
placeholderReplacements[placeholderVar] = var;
}
nodeToVariableMap[node] = var;
return var;
}
template <typename ElementType>
FunctionPtr LoadLegacyModel(const std::wstring& modelFile, const DeviceDescriptor& computeDevice /*= DeviceDescriptor::DefaultDevice()*/)
{
ComputationNetworkPtr net = make_shared<ComputationNetwork>(AsCNTKImplDeviceId(computeDevice));
net->Load<ElementType>(modelFile);
// Now traverse the model and construct the Function graph
std::unordered_map<ComputationNodeBasePtr, Variable> nodeToVariableMap;
std::unordered_map<Placeholder, Variable> placeholderReplacements;
std::unordered_set<FunctionPtr> allPrimitiveFunctions;
std::vector<FunctionPtr> rootFunctions;
auto& networkRoots = net->RootNodes();
for (auto& rootNode : networkRoots)
{
if (rootNode->IsLeaf())
continue;
rootFunctions.push_back(GetVariable<ElementType>(rootNode, nodeToVariableMap, placeholderReplacements, allPrimitiveFunctions).Owner());
}
auto rootComposite = Combine(rootFunctions);
rootComposite->ReplacePlaceholders(placeholderReplacements);
return rootComposite;
}
template <typename ElementType>
void SaveAsLegacyModel(const FunctionPtr& rootFunction, const std::wstring& modelFile)
{
CompositeFunction* compositeFunction = dynamic_cast<CompositeFunction*>(rootFunction.get());
if (compositeFunction == nullptr)
InvalidArgument("Primitive (aka non-composite) Function instances cannot be saved");
auto computationNetwork = compositeFunction->GetComputationNetwork<ElementType>(DeviceDescriptor::CPUDevice(), {});
computationNetwork->Save(modelFile);
}
// Template instantiations
template CNTK_API FunctionPtr LoadLegacyModel<float>(const std::wstring& modelFile, const DeviceDescriptor& computeDevice /*= DeviceDescriptor::DefaultDevice()*/);
template CNTK_API FunctionPtr LoadLegacyModel<double>(const std::wstring& modelFile, const DeviceDescriptor& computeDevice /*= DeviceDescriptor::DefaultDevice()*/);
template CNTK_API void SaveAsLegacyModel<float>(const FunctionPtr& rootFunction, const std::wstring& modelFile);
template CNTK_API void SaveAsLegacyModel<double>(const FunctionPtr& rootFunction, const std::wstring& modelFile);
}
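A round-trip sketch for the conversion functions defined above (not part of the original source); the file paths are placeholders and error handling is omitted.
// Hypothetical sketch: load a legacy (V1) model into a V2 Function graph and write it back out.
void LegacyModelRoundTripSketch(const std::wstring& inputModelPath, const std::wstring& outputModelPath)
{
    // Rebuilds a composite Function from the legacy ComputationNetwork stored on disk.
    CNTK::FunctionPtr model = CNTK::LoadLegacyModel<float>(inputModelPath, CNTK::DeviceDescriptor::CPUDevice());
    // Serializes the Function graph back through a ComputationNetwork in the legacy format.
    CNTK::SaveAsLegacyModel<float>(model, outputModelPath);
}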

View file

@ -56,7 +56,7 @@
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<AdditionalIncludeDirectories>.\API;$(SolutionDir)Source\SGDLib;$(SolutionDir)Source\ComputationNetworkLib;$(SolutionDir)Source\SequenceTrainingLib;$(SolutionDir)Source\Math;$(SolutionDir)Source\Common\Include;$(SolutionDir)Source\CNTK\BrainScript;$(SolutionDir)Source\ActionsLib;$(MSMPI_INC);$(NvmlInclude)</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>.\API;$(SolutionDir)Source\SGDLib;$(SolutionDir)Source\Readers\ReaderLib;$(SolutionDir)Source\ComputationNetworkLib;$(SolutionDir)Source\SequenceTrainingLib;$(SolutionDir)Source\Math;$(SolutionDir)Source\Common\Include;$(SolutionDir)Source\CNTK\BrainScript;$(SolutionDir)Source\ActionsLib;$(MSMPI_INC);$(NvmlInclude)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<AdditionalLibraryDirectories>$(SolutionDir)Source\ComputationNetworkLib;$(SolutionDir)Source\Math;$(MSMPI_LIB64);$(SolutionDir)$(Platform)\$(Configuration);$(NvmlLibPath)</AdditionalLibraryDirectories>
@ -75,7 +75,7 @@
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>ComputationNetworkLib.lib; Math.lib; Common.lib; SequenceTrainingLib.lib; kernel32.lib; user32.lib; shell32.lib; %(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>ComputationNetworkLib.lib; Math.lib; Common.lib; SequenceTrainingLib.lib; ReaderLib.lib; kernel32.lib; user32.lib; shell32.lib; %(AdditionalDependencies)</AdditionalDependencies>
<DelayLoadDLLs>Math.dll; nvml.dll; $(CudaRuntimeDll)</DelayLoadDLLs>
</Link>
</ItemDefinitionGroup>
@ -99,7 +99,7 @@
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>ComputationNetworkLib.lib; Math.lib; Common.lib; kernel32.lib; user32.lib; shell32.lib; SequenceTrainingLib.lib; %(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>ComputationNetworkLib.lib; Math.lib; Common.lib; ReaderLib.lib; kernel32.lib; user32.lib; shell32.lib; SequenceTrainingLib.lib; %(AdditionalDependencies)</AdditionalDependencies>
<Profile>true</Profile>
<DelayLoadDLLs>Math.dll; nvml.dll; $(CudaRuntimeDll)</DelayLoadDLLs>
</Link>
@ -128,11 +128,14 @@
<ClInclude Include="API\CNTKLibrary.h" />
<ClInclude Include="API\CNTKLibraryInternals.h" />
<ClInclude Include="Function.h" />
<ClInclude Include="Learner.h" />
<ClInclude Include="MinibatchSource.h" />
<ClInclude Include="Utils.h" />
<ClInclude Include="stdafx.h" />
<ClInclude Include="targetver.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="BackCompat.cpp" />
<ClCompile Include="Common.cpp" />
<ClCompile Include="dllmain.cpp">
<CompileAsManaged>false</CompileAsManaged>
@ -140,11 +143,14 @@
</PrecompiledHeader>
</ClCompile>
<ClCompile Include="Function.cpp" />
<ClCompile Include="Learner.cpp" />
<ClCompile Include="MinibatchSource.cpp" />
<ClCompile Include="NDArrayView.cpp" />
<ClCompile Include="NDMask.cpp" />
<ClCompile Include="stdafx.cpp">
<PrecompiledHeader>Create</PrecompiledHeader>
</ClCompile>
<ClCompile Include="Trainer.cpp" />
<ClCompile Include="Utils.cpp" />
<ClCompile Include="Value.cpp" />
<ClCompile Include="Variable.cpp" />

View file

@ -10,6 +10,10 @@
<ClCompile Include="Variable.cpp" />
<ClCompile Include="Utils.cpp" />
<ClCompile Include="NDMask.cpp" />
<ClCompile Include="Learner.cpp" />
<ClCompile Include="BackCompat.cpp" />
<ClCompile Include="Trainer.cpp" />
<ClCompile Include="MinibatchSource.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="stdafx.h" />
@ -22,6 +26,8 @@
<Filter>API</Filter>
</ClInclude>
<ClInclude Include="Function.h" />
<ClInclude Include="Learner.h" />
<ClInclude Include="MinibatchSource.h" />
</ItemGroup>
<ItemGroup>
<Filter Include="API">

View file

@ -117,6 +117,7 @@ namespace CNTK
if (variable.IsParameter() || variable.IsConstant())
{
computationNodePtr = builder.CreateLearnableParameter(variable.Name(), AsTensorShape(variable.Shape()));
network->InitLearnableParameters(computationNodePtr, L"fixedValue", 0); // must call this to follow protocol; can overwrite later
if (!variable.NeedsGradient())
computationNodePtr->SetLearningRateMultiplier(0.0);
@ -126,7 +127,13 @@ namespace CNTK
}
else if (variable.IsInput())
{
// TODO: Specify dynamic axis
// TODO: Support inputs with > 1 dynamic axes
if (variable.DynamicAxes().size() != 1)
LogicError("Currently only Input variables with one dynamic axis are supported");
auto dynamicAxis = variable.DynamicAxes()[0];
if (dynamicAxis != Axis::DefaultDynamicAxis())
LogicError("Currently only Input variables with DefaultDynamicAxis are supported");
if (IsSparseInput(variable))
computationNodePtr = builder.CreateSparseInputNode(variable.Name(), AsTensorShape(variable.Shape()));
else
@ -164,6 +171,7 @@ namespace CNTK
if (dynamic_cast<PrimitiveFunction*>(function))
{
PrimitiveFunction* primitiveFunction = dynamic_cast<PrimitiveFunction*>(function);
auto functionConfig = primitiveFunction->FunctionConfig();
// Create the nodes corresponding to the inputs
auto functionInputs = primitiveFunction->Inputs();
@ -180,12 +188,8 @@ namespace CNTK
PrimitiveOpType op = primitiveFunction->OpType();
switch (op)
{
case PrimitiveOpType::Plus:
computationNodePtr = builder.Plus(input0Node, input1Node, function->Name());
break;
case PrimitiveOpType::Times:
// TODO: The output rank of the times operation is currently hardcoded to 1
computationNodePtr = builder.Times(input0Node, input1Node, 1, function->Name());
case PrimitiveOpType::Negate:
computationNodePtr = builder.Negate(input0Node, function->Name());
break;
case PrimitiveOpType::Sigmoid:
computationNodePtr = builder.Sigmoid(input0Node, function->Name());
@ -193,15 +197,100 @@ namespace CNTK
case PrimitiveOpType::Tanh:
computationNodePtr = builder.Tanh(input0Node, function->Name());
break;
case PrimitiveOpType::ReLU:
computationNodePtr = builder.RectifiedLinear(input0Node, function->Name());
break;
case PrimitiveOpType::Exp:
computationNodePtr = builder.Exp(input0Node, function->Name());
break;
case PrimitiveOpType::Log:
computationNodePtr = builder.Log(input0Node, function->Name());
break;
case PrimitiveOpType::Sqrt:
computationNodePtr = builder.Sqrt(input0Node, function->Name());
break;
case PrimitiveOpType::Floor:
computationNodePtr = builder.Floor(input0Node, function->Name());
break;
case PrimitiveOpType::Abs:
computationNodePtr = builder.Abs(input0Node, function->Name());
break;
case PrimitiveOpType::Reciprocal:
computationNodePtr = builder.Reciprocal(input0Node, function->Name());
break;
case PrimitiveOpType::Softmax:
if (functionInputs[0].Shape().NumAxes() > 1)
InvalidArgument("Softmax operation can only be applied to a 1D input");
computationNodePtr = builder.Softmax(input0Node, function->Name());
break;
case PrimitiveOpType::Pooling:
{
PoolingType poolingType = (PoolingType)(functionConfig[L"poolingType"].GetValue<size_t>());
auto poolingWindowsShape = functionConfig[L"poolingWindowShape"].GetValue<NDShape>();
auto strides = functionConfig[L"strides"].GetValue<NDShape>();
auto lowerPad = functionConfig[L"lowerPad"].GetValue<NDShape>();
auto upperPad = functionConfig[L"upperPad"].GetValue<NDShape>();
auto autoPadding = AsBasicElementTypeVector<bool>(functionConfig[L"autoPadding"].GetValue<std::vector<DictionaryValue>>());
computationNodePtr = builder.Pooling(input0Node, AsCNTKPoolKind(poolingType), AsTensorShape(poolingWindowsShape, true), AsTensorShape(strides, true), autoPadding, AsTensorShape(lowerPad, true), AsTensorShape(upperPad, true), ImageLayoutKind::CHW, function->Name());
break;
}
case PrimitiveOpType::Plus:
computationNodePtr = builder.Plus(input0Node, input1Node, function->Name());
break;
case PrimitiveOpType::Minus:
computationNodePtr = builder.Minus(input0Node, input1Node, function->Name());
break;
case PrimitiveOpType::ElementTimes:
computationNodePtr = builder.ElementTimes(input0Node, input1Node, function->Name());
break;
case PrimitiveOpType::Equal:
computationNodePtr = builder.Equal(input0Node, input1Node, function->Name());
break;
case PrimitiveOpType::NotEqual:
computationNodePtr = builder.NotEqual(input0Node, input1Node, function->Name());
break;
case PrimitiveOpType::Less:
computationNodePtr = builder.Less(input0Node, input1Node, function->Name());
break;
case PrimitiveOpType::LessEqual:
computationNodePtr = builder.LessEqual(input0Node, input1Node, function->Name());
break;
case PrimitiveOpType::Greater:
computationNodePtr = builder.Greater(input0Node, input1Node, function->Name());
break;
case PrimitiveOpType::GreaterEqual:
computationNodePtr = builder.GreaterEqual(input0Node, input1Node, function->Name());
break;
case PrimitiveOpType::Times:
{
size_t numOutputAxes = functionConfig[L"numOutputAxes"].GetValue<size_t>();
computationNodePtr = builder.Times(input0Node, input1Node, numOutputAxes, function->Name());
break;
}
case PrimitiveOpType::Convolution:
{
NDShape outputMapCount, kernelShape;
std::tie(outputMapCount, kernelShape) = GetConvolutionOutputMapCountAndKernelShape(functionInputs[0].Shape(), functionInputs[1].Shape());
auto strides = functionConfig[L"strides"].GetValue<NDShape>();
auto lowerPad = functionConfig[L"lowerPad"].GetValue<NDShape>();
auto upperPad = functionConfig[L"upperPad"].GetValue<NDShape>();
auto sharing = AsBasicElementTypeVector<bool>(functionConfig[L"sharing"].GetValue<std::vector<DictionaryValue>>());
auto autoPadding = AsBasicElementTypeVector<bool>(functionConfig[L"autoPadding"].GetValue<std::vector<DictionaryValue>>());
auto transpose = functionConfig[L"transpose"].GetValue<bool>();
auto maxTempMemSizeInSamples = functionConfig[L"maxTempMemSizeInSamples"].GetValue<size_t>();
computationNodePtr = builder.Convolution(input0Node, input1Node, AsTensorShape(kernelShape, true), AsTensorShape(outputMapCount, true), AsTensorShape(strides, true), sharing, autoPadding, AsTensorShape(lowerPad, true), AsTensorShape(upperPad, true), transpose, ImageLayoutKind::CHW, maxTempMemSizeInSamples, function->Name());
break;
}
case PrimitiveOpType::SquaredError:
computationNodePtr = builder.SquareError(input0Node, input1Node, function->Name());
break;
case PrimitiveOpType::CrossEntropyWithSoftmax:
computationNodePtr = builder.CrossEntropyWithSoftmax(input1Node, input0Node, function->Name());
break;
case PrimitiveOpType::ClassificationError:
computationNodePtr = builder.ErrorPrediction(input1Node, input0Node, function->Name());
break;
case PrimitiveOpType::Exp:
computationNodePtr = builder.Exp(input0Node, function->Name());
break;
case PrimitiveOpType::PastValue:
case PrimitiveOpType::FutureValue:
{
@ -231,9 +320,6 @@ namespace CNTK
break;
}
case PrimitiveOpType::ElementTimes:
computationNodePtr = builder.ElementTimes(input0Node, input1Node, function->Name());
break;
case PrimitiveOpType::ReduceSum:
{
// TODO: Use the new ReduceElements node instead of the legacy SumElements node for reduction. Currently ReduceElements has incorrect MBLayout inference.
@ -241,6 +327,23 @@ namespace CNTK
computationNodePtr = builder.Sum(input0Node, function->Name());
break;
}
case PrimitiveOpType::BatchNormalization:
{
auto spacial = functionConfig[L"spacial"].GetValue<bool>();
auto normalizationTimeConstant = functionConfig[L"normalizationTimeConstant"].GetValue<double>();
auto blendTimeConstant = functionConfig[L"blendTimeConstant"].GetValue<double>();
auto epsilon = functionConfig[L"epsilon"].GetValue<double>();
auto useCuDNNEngine = functionConfig[L"useCuDNNEngine"].GetValue<bool>();
std::vector<std::shared_ptr<ComputationNode<ElementType>>> inputNodes;
for (auto inputVar : functionInputs)
{
auto baseNodePtr = GetNode(inputVar, network, builder, variableToNodeMap, isVariableRootMap);
inputNodes.push_back((baseNodePtr != nullptr) ? baseNodePtr->template As<ComputationNode<ElementType>>()->shared_from_this() : nullptr);
}
computationNodePtr = builder.BatchNormalization(inputNodes[0], inputNodes[1], inputNodes[2], inputNodes[3], inputNodes[4], spacial, normalizationTimeConstant, blendTimeConstant, epsilon, !useCuDNNEngine, ImageLayoutKind::CHW, function->Name());
break;
}
case PrimitiveOpType::Combine:
// This operation is just a no-op and is a means to combine multiple functions to create a single Function
// whose outputs are a union of the outputs of the Functions being combined.
@ -351,7 +454,7 @@ namespace CNTK
auto outputShape = outputVar.Shape();
auto computationNodeSampleLayout = computationNodePtr->GetSampleLayout();
if (((outputShape.NumAxes() == 0) && (computationNodeSampleLayout[0] != 1)) ||
((outputShape.NumAxes() != 0) && (computationNodeSampleLayout != AsTensorShape(outputShape))))
((outputShape.NumAxes() != 0) && (computationNodeSampleLayout != AsTensorShape(outputShape)) && (computationNodeSampleLayout != AsTensorShape(outputShape, true))))
{
LogicError("The output Variable shape %s does not match the SampleLayout shape %s of the corresponding ComputationNode in the network", AsString(outputShape).c_str(), ((std::string)computationNodeSampleLayout).c_str());
}
@ -486,18 +589,9 @@ namespace CNTK
}
template <typename ElementType>
/*static*/ ValuePtr CompositeFunction::GetValueObjectFromCNTKImplMatrixAndMBLayout(Variable var, const Matrix<ElementType>& matrix, const MBLayoutPtr& layout)
/*static*/ ValuePtr CompositeFunction::GetValueObjectFromCNTKImplMatrixAndMBLayout(const NDShape& sampleShape, const Matrix<ElementType>& matrix, const MBLayoutPtr& layout, bool readOnly /*= true*/)
{
if (var.DynamicAxes().size() > 1)
LogicError("More than one dynamic axis for a variable is currently unsupported");
if (AsDataType<ElementType>() != var.GetDataType())
LogicError("The specified ElementType %s does not match the DataType %s", typeid(ElementType).name(), DataTypeName(var.GetDataType()));
if ((layout != nullptr) && (matrix.GetNumRows() != var.Shape().TotalSize()))
LogicError("Unexpected matrix layout: The number of rows in the matrix does not match the sample size of the Variable");
NDShape valueDataShape = var.Shape();
NDShape valueDataShape = sampleShape;
if (layout != nullptr)
valueDataShape = valueDataShape.AppendShape({ layout->GetNumTimeSteps(), layout->GetNumSequences() });
@ -506,7 +600,7 @@ namespace CNTK
{
// Just create a view over the existing matrix itself
auto tensorView = new TensorView<ElementType>(std::make_shared<Matrix<ElementType>>(matrix.AsReference()), AsTensorShape(valueDataShape));
auto data = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), AsStorageFormat(matrix.GetFormat()), valueDataShape, true, tensorView);
auto data = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), AsStorageFormat(matrix.GetFormat()), valueDataShape, readOnly, tensorView);
return MakeSharedObject<Value>(data);
}
@ -565,10 +659,25 @@ namespace CNTK
}
auto tensorView = new TensorView<ElementType>(shuffledMatrixData, AsTensorShape(valueDataShape));
auto data = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), StorageFormat::Dense, valueDataShape, true, tensorView);
auto data = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), StorageFormat::Dense, valueDataShape, readOnly, tensorView);
return MakeSharedObject<Value>(data, mask);
}
template <typename ElementType>
/*static*/ ValuePtr CompositeFunction::GetValueObjectFromCNTKImplMatrixAndMBLayout(Variable var, const Matrix<ElementType>& matrix, const MBLayoutPtr& layout, bool readOnly /*= true*/)
{
if (var.DynamicAxes().size() > 1)
LogicError("More than one dynamic axis for a variable is currently unsupported");
if (AsDataType<ElementType>() != var.GetDataType())
LogicError("The specified ElementType %s does not match the DataType %s", typeid(ElementType).name(), DataTypeName(var.GetDataType()));
if ((layout != nullptr) && (matrix.GetNumRows() != var.Shape().TotalSize()))
LogicError("Unexpected matrix layout: The number of rows in the matrix does not match the sample size of the Variable");
return GetValueObjectFromCNTKImplMatrixAndMBLayout(var.Shape(), matrix, layout, readOnly);
}
template <typename ElementType>
/*static*/ void CompositeFunction::PopulateComputationNodeValue(const std::pair<Variable, ValuePtr>& variableValue, ComputationNodeBasePtr& computationNode)
{
@ -583,7 +692,7 @@ namespace CNTK
computationNode->GetMBLayout()->CopyFrom(layout);
}
void CompositeFunction::PopulateNetworkInputs(const std::unordered_map<Variable, const ValuePtr>& arguments)
void CompositeFunction::PopulateNetworkInputs(const std::unordered_map<Variable, ValuePtr>& arguments)
{
auto functionArguments = this->Arguments();
std::vector<ComputationNodeBasePtr> inputNodes;
@ -628,7 +737,7 @@ namespace CNTK
}
// Assign the supplied gradients corresponding to the root(s) of the network to be backpropagated through the graph
void CompositeFunction::PopulateNetworkGradients(const std::unordered_map<Variable, const ValuePtr>& gradients)
void CompositeFunction::PopulateNetworkGradients(const std::unordered_map<Variable, ValuePtr>& gradients)
{
auto functionOutputs = this->Outputs();
for (auto gradientVarValuePair : gradients)
@ -676,45 +785,48 @@ namespace CNTK
return NDShape(outputShapeDims);
}
/*static*/ void CompositeFunction::GetNodeOutputOrGradient(Variable var, ValuePtr& varValue, Microsoft::MSR::CNTK::ComputationNodeBasePtr& computationNode, bool getGradient)
{
auto valueShape = GetValueShape(var, computationNode);
if (varValue != nullptr)
{
// TODO: The shape of the specified output Value object must match the actual output shape
if (varValue->Data()->Shape() != valueShape)
InvalidArgument("The shape %s of the specified Value object for %s does not match the actual shape %s", AsString(varValue->Data()->Shape()).c_str(), getGradient ? "gradient" : "output", AsString(valueShape).c_str());
}
ValuePtr nodeValue;
switch (var.GetDataType())
{
case DataType::Float:
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(var,
getGradient ? computationNode->As<ComputationNode<float>>()->Gradient() : computationNode->As<ComputationNode<float>>()->Value(),
computationNode->GetMBLayout());
break;
case DataType::Double:
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(var,
getGradient ? computationNode->As<ComputationNode<double>>()->Gradient() : computationNode->As<ComputationNode<double>>()->Value(),
computationNode->GetMBLayout());
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(var.GetDataType()));
break;
}
if (varValue == nullptr)
{
auto data = MakeSharedObject<NDArrayView>(var.GetDataType(), valueShape, AsDeviceDescriptor(computationNode->ValuePtr()->GetDeviceId()));
auto mask = (nodeValue->Mask() != nullptr) ? MakeSharedObject<NDMask>(nodeValue->Mask()->Shape(), nodeValue->Mask()->Device()) : nullptr;
varValue = MakeSharedObject<Value>(data, mask);
}
varValue->CopyFrom(*nodeValue);
}
void CompositeFunction::GetNetworkOutputs(std::unordered_map<Variable, ValuePtr>& outputs)
{
// Now copy the Forward values of output nodes from the network to outputs' Value objects
for (auto outputVarValuePair : outputs)
{
auto computationNodePtr = m_variableToNodeMap[outputVarValuePair.first];
auto outputValuePtr = outputVarValuePair.second;
auto outputShape = GetValueShape(outputVarValuePair.first, computationNodePtr);
if (outputValuePtr != nullptr)
{
// TODO: The shape of the specified output Value object must match the actual output shape
if (outputValuePtr->Data()->Shape() != outputShape)
InvalidArgument("The shape %s of the specified Value object for output does not match the actual output shape %s", AsString(outputValuePtr->Data()->Shape()).c_str(), AsString(outputShape).c_str());
}
ValuePtr nodeValue;
switch (outputVarValuePair.first.GetDataType())
{
case DataType::Float:
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(outputVarValuePair.first, computationNodePtr->As<ComputationNode<float>>()->Value(), computationNodePtr->GetMBLayout());
break;
case DataType::Double:
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(outputVarValuePair.first, computationNodePtr->As<ComputationNode<double>>()->Value(), computationNodePtr->GetMBLayout());
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(outputVarValuePair.first.GetDataType()));
break;
}
if (outputValuePtr == nullptr)
{
auto data = MakeSharedObject<NDArrayView>(outputVarValuePair.first.GetDataType(), outputShape, AsDeviceDescriptor(computationNodePtr->ValuePtr()->GetDeviceId()));
auto mask = (nodeValue->Mask() != nullptr) ? MakeSharedObject<NDMask>(nodeValue->Mask()->Shape(), nodeValue->Mask()->Device()) : nullptr;
outputValuePtr = MakeSharedObject<Value>(data, mask);
}
outputValuePtr->CopyFrom(*nodeValue);
outputs[outputVarValuePair.first] = outputValuePtr;
}
GetNodeOutputOrGradient(outputVarValuePair.first, outputs[outputVarValuePair.first], m_variableToNodeMap[outputVarValuePair.first], false /*getGradient*/);
}
void CompositeFunction::GetNetworkGradients(std::unordered_map<Variable, ValuePtr>& gradients)
@ -732,46 +844,15 @@ namespace CNTK
InvalidArgument("Gradient value incorrectly requested for an Output or Constant Variable, or an Input Variable with NeedsGradient setting of false");
auto computationNodePtr = m_variableToNodeMap[gradientVarValuePair.first];
auto gradientValuePtr = gradientVarValuePair.second;
auto gradientShape = GetValueShape(gradientVarValuePair.first, computationNodePtr);
if (gradientValuePtr != nullptr)
{
// TODO: The shape of the specified output Value object must match the actual output shape
if (gradientValuePtr->Data()->Shape() != gradientShape)
InvalidArgument("The shape %s of the specified Value object for gradient does not match the actual gradient shape %s", AsString(gradientValuePtr->Data()->Shape()).c_str(), AsString(gradientShape).c_str());
}
if (!computationNodePtr->NeedsGradient())
LogicError("Backpropagated gradient value cannot be read from a ComputationNode that has NeedsGradient set to false");
ValuePtr nodeValue;
switch (gradientVarValuePair.first.GetDataType())
{
case DataType::Float:
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(gradientVarValuePair.first, computationNodePtr->As<ComputationNode<float>>()->Gradient(), computationNodePtr->GetMBLayout());
break;
case DataType::Double:
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(gradientVarValuePair.first, computationNodePtr->As<ComputationNode<double>>()->Gradient(), computationNodePtr->GetMBLayout());
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(gradientVarValuePair.first.GetDataType()));
break;
}
if (gradientValuePtr == nullptr)
{
auto data = MakeSharedObject<NDArrayView>(gradientVarValuePair.first.GetDataType(), gradientShape, AsDeviceDescriptor(computationNodePtr->ValuePtr()->GetDeviceId()));
auto mask = (nodeValue->Mask() != nullptr) ? MakeSharedObject<NDMask>(nodeValue->Mask()->Shape(), nodeValue->Mask()->Device()) : nullptr;
gradientValuePtr = MakeSharedObject<Value>(data, mask);
}
gradientValuePtr->CopyFrom(*nodeValue);
gradients[gradientVarValuePair.first] = gradientValuePtr;
GetNodeOutputOrGradient(gradientVarValuePair.first, gradients[gradientVarValuePair.first], computationNodePtr, true /*getGradient*/);
}
}
/*virtual*/ BackPropStatePtr CompositeFunction::Forward(const std::unordered_map<Variable, const ValuePtr>& arguments,
/*virtual*/ BackPropStatePtr CompositeFunction::Forward(const std::unordered_map<Variable, ValuePtr>& arguments,
std::unordered_map<Variable, ValuePtr>& outputs,
const DeviceDescriptor& computeDevice,
const std::unordered_set<Variable>& outputsToRetainBackwardStateFor)
@ -809,6 +890,8 @@ namespace CNTK
outputsToEvaluate.push_back(m_variableToNodeMap[rootVarForBackprop]);
}
ScopedNetworkOperationMode modeGuard(m_computationNetwork, outputsToRetainBackwardStateFor.empty() ? NetworkOperationMode::inferring : NetworkOperationMode::training);
m_computationNetwork->ForwardProp(outputsToEvaluate);
GetNetworkOutputs(outputs);
@ -819,7 +902,7 @@ namespace CNTK
}
/*virtual*/ void CompositeFunction::Backward(const BackPropStatePtr& state,
const std::unordered_map<Variable, const ValuePtr>& rootGradientValues,
const std::unordered_map<Variable, ValuePtr>& rootGradientValues,
std::unordered_map<Variable, ValuePtr>& backPropagatedGradientValuesForInputs)
{
auto backpropState = dynamic_cast<const CNTKBackPropState*>(state.get());
@ -844,6 +927,8 @@ namespace CNTK
PopulateNetworkGradients(rootGradientValues);
// Backpropagate through the network
ScopedNetworkOperationMode modeGuard(m_computationNetwork, NetworkOperationMode::training);
auto rootComputationNodePtr = m_variableToNodeMap[rootGradientValues.begin()->first];
m_computationNetwork->GetNestedNetwork(rootComputationNodePtr)->Backprop(FrameRange(nullptr), true, true);
@ -852,27 +937,261 @@ namespace CNTK
// TODO: How to deal with the specified 'computeDevice'
}
FunctionPtr Times(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
FunctionPtr UnaryOp(PrimitiveOpType op, const Variable& operand, Dictionary&& opConfig, const std::wstring& name)
{
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Times, std::vector<Variable>({ leftOperand, rightOperand }), Dictionary(), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(op, std::vector<Variable>({ operand }), std::move(opConfig), name), name);
}
FunctionPtr Plus(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
FunctionPtr Negate(const Variable& operand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Plus, std::vector<Variable>({ leftOperand, rightOperand }), Dictionary(), name), name);
return UnaryOp(PrimitiveOpType::Negate, operand, Dictionary(), name);
}
FunctionPtr Sigmoid(const Variable& operand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Sigmoid, std::vector<Variable>({ operand }), Dictionary(), name), name);
return UnaryOp(PrimitiveOpType::Sigmoid, operand, Dictionary(), name);
}
FunctionPtr Tanh(const Variable& operand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Tanh, std::vector<Variable>({ operand }), Dictionary(), name), name);
return UnaryOp(PrimitiveOpType::Tanh, operand, Dictionary(), name);
}
FunctionPtr Combine(const std::initializer_list<FunctionPtr>& operands, const std::wstring& name/* = L""*/)
FunctionPtr ReLU(const Variable& operand, const std::wstring& name/* = L""*/)
{
return UnaryOp(PrimitiveOpType::ReLU, operand, Dictionary(), name);
}
FunctionPtr Exp(const Variable& operand, const std::wstring& name/* = L""*/)
{
return UnaryOp(PrimitiveOpType::Exp, operand, Dictionary(), name);
}
FunctionPtr Log(const Variable& operand, const std::wstring& name/* = L""*/)
{
return UnaryOp(PrimitiveOpType::Log, operand, Dictionary(), name);
}
FunctionPtr Square(const Variable& operand, const std::wstring& name/* = L""*/)
{
return ElementTimes(operand, operand, name);
}
FunctionPtr Sqrt(const Variable& operand, const std::wstring& name/* = L""*/)
{
return UnaryOp(PrimitiveOpType::Sqrt, operand, Dictionary(), name);
}
FunctionPtr Round(const Variable& operand, const std::wstring& name/* = L""*/)
{
return Floor(Plus(operand, Constant(NDShape({}), 0.5f)), name);
}
FunctionPtr Floor(const Variable& operand, const std::wstring& name/* = L""*/)
{
return UnaryOp(PrimitiveOpType::Floor, operand, Dictionary(), name);
}
FunctionPtr Ceil(const Variable& operand, const std::wstring& name/* = L""*/)
{
return Negate(Floor(Negate(operand)), name);
}
FunctionPtr Abs(const Variable& operand, const std::wstring& name/* = L""*/)
{
return UnaryOp(PrimitiveOpType::Abs, operand, Dictionary(), name);
}
FunctionPtr Reciprocal(const Variable& operand, const std::wstring& name/* = L""*/)
{
return UnaryOp(PrimitiveOpType::Reciprocal, operand, Dictionary(), name);
}
FunctionPtr Softmax(const Variable& operand, const std::wstring& name/* = L""*/)
{
return UnaryOp(PrimitiveOpType::Softmax, operand, Dictionary(), name);
}
FunctionPtr BinaryOp(PrimitiveOpType op, const Variable& leftOperand, const Variable& rightOperand, Dictionary&& opConfig, const std::wstring& name)
{
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(op, std::vector<Variable>({ leftOperand, rightOperand }), std::move(opConfig), name), name);
}
FunctionPtr Plus(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::Plus, leftOperand, rightOperand, Dictionary(), name);
}
FunctionPtr Minus(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::Minus, leftOperand, rightOperand, Dictionary(), name);
}
FunctionPtr ElementTimes(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::ElementTimes, leftOperand, rightOperand, Dictionary(), name);
}
FunctionPtr ElementDivide(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return ElementTimes(leftOperand, Reciprocal(rightOperand), name);
}
FunctionPtr Equal(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::Equal, leftOperand, rightOperand, Dictionary(), name);
}
FunctionPtr NotEqual(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::NotEqual, leftOperand, rightOperand, Dictionary(), name);
}
FunctionPtr Less(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::Less, leftOperand, rightOperand, Dictionary(), name);
}
FunctionPtr LessEqual(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::LessEqual, leftOperand, rightOperand, Dictionary(), name);
}
FunctionPtr Greater(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::Greater, leftOperand, rightOperand, Dictionary(), name);
}
FunctionPtr GreaterEqual(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::GreaterEqual, leftOperand, rightOperand, Dictionary(), name);
}
FunctionPtr Times(const Variable& leftOperand, const Variable& rightOperand, size_t numOutputAxes /*= 1*/, const std::wstring& name/* = L""*/)
{
auto additionalProperties = Dictionary();
additionalProperties[L"numOutputAxes"] = numOutputAxes;
return BinaryOp(PrimitiveOpType::Times, leftOperand, rightOperand, std::move(additionalProperties), name);
}
FunctionPtr SquaredError(const Variable& prediction, const Variable& targets, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::SquaredError, prediction, targets, Dictionary(), name);
}
FunctionPtr CrossEntropyWithSoftmax(const Variable& prediction, const Variable& labels, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::CrossEntropyWithSoftmax, prediction, labels, Dictionary(), name);
}
FunctionPtr ClassificationError(const Variable& prediction, const Variable& labels, const std::wstring& name/* = L""*/)
{
return BinaryOp(PrimitiveOpType::ClassificationError, prediction, labels, Dictionary(), name);
}
FunctionPtr PastValue(const Variable& initialState, const Variable& operand, size_t stepSize, const std::wstring& name/* = L""*/)
{
if (operand.DynamicAxes().size() != 1)
InvalidArgument("PastValue overload that does not explicitly specify a dynamic axis can only be used for operands with exactly one dynamic axis");
auto additionalProperties = Dictionary();
additionalProperties[L"stepSize"] = DictionaryValue(stepSize);
return BinaryOp(PrimitiveOpType::PastValue, initialState, operand, std::move(additionalProperties), name);
}
FunctionPtr FutureValue(const Variable& initialState, const Variable& operand, size_t stepSize, const std::wstring& name/* = L""*/)
{
if (operand.DynamicAxes().size() != 1)
InvalidArgument("FutureValue overload that does not explicitly specify a dynamic axis can only be used for operands with exactly one dynamic axis");
auto additionalProperties = Dictionary();
additionalProperties[L"stepSize"] = DictionaryValue(stepSize);
return BinaryOp(PrimitiveOpType::FutureValue, initialState, operand, std::move(additionalProperties), name);
}
FunctionPtr ReduceSum(const Variable& operand, const std::wstring& name/* = L""*/)
{
return UnaryOp(PrimitiveOpType::ReduceSum, operand, Dictionary(), name);
}
FunctionPtr PerDimMeanVarianceNormalize(const Variable& operand, const NDArrayViewPtr& mean, const NDArrayViewPtr& invStdDev, const std::wstring& name /*= L""*/)
{
Constant meanVar(mean);
Constant invStdDevVar(invStdDev);
return ElementTimes(Minus(operand, meanVar), invStdDevVar);
}
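// --- Illustrative usage sketch (not part of the original source) ---
// Ties PerDimMeanVarianceNormalize above to ComputeInputPerDimMeansAndInvStdDevs from the reader API;
// the pair is assumed here to hold (mean, inverse standard deviation) in that order.
FunctionPtr NormalizeInputSketch(const Variable& input,
                                 const std::pair<NDArrayViewPtr, NDArrayViewPtr>& meanAndInvStdDev)
{
    return PerDimMeanVarianceNormalize(input, meanAndInvStdDev.first, meanAndInvStdDev.second);
}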
FunctionPtr Convolution(const Variable& convolutionMap,
const Variable& operand,
const NDShape& strides,
const std::vector<bool>& sharing,
const std::vector<bool>& autoPadding,
const NDShape& lowerPad,
const NDShape& upperPad,
bool transpose,
size_t maxTempMemSizeInSamples,
const std::wstring& name)
{
auto additionalProperties = Dictionary();
additionalProperties[L"strides"] = strides;
additionalProperties[L"sharing"] = AsDictionaryValueVector(sharing);
additionalProperties[L"autoPadding"] = AsDictionaryValueVector(autoPadding);
additionalProperties[L"lowerPad"] = lowerPad;
additionalProperties[L"upperPad"] = upperPad;
additionalProperties[L"transpose"] = transpose;
additionalProperties[L"maxTempMemSizeInSamples"] = maxTempMemSizeInSamples;
return BinaryOp(PrimitiveOpType::Convolution, convolutionMap, operand, std::move(additionalProperties), name);
}
FunctionPtr Pooling(const Variable& operand,
PoolingType poolingType,
const NDShape& poolingWindowShape,
const NDShape& strides,
const std::vector<bool>& autoPadding,
const NDShape& lowerPad,
const NDShape& upperPad,
const std::wstring& name)
{
auto additionalProperties = Dictionary();
additionalProperties[L"poolingType"] = (size_t)poolingType;
additionalProperties[L"poolingWindowShape"] = poolingWindowShape;
additionalProperties[L"strides"] = strides;
additionalProperties[L"autoPadding"] = AsDictionaryValueVector(autoPadding);
additionalProperties[L"lowerPad"] = lowerPad;
additionalProperties[L"upperPad"] = upperPad;
return UnaryOp(PrimitiveOpType::Pooling, operand, std::move(additionalProperties), name);
}
FunctionPtr BatchNormalization(const Variable& operand,
const Variable& scale,
const Variable& bias,
const Variable& runningMean,
const Variable& runningInvStd,
bool spacial,
double normalizationTimeConstant,
double blendTimeConstant,
double epsilon,
bool useCuDNNEngine,
const std::wstring& name)
{
auto additionalProperties = Dictionary();
additionalProperties[L"spacial"] = spacial;
additionalProperties[L"normalizationTimeConstant"] = normalizationTimeConstant;
additionalProperties[L"blendTimeConstant"] = blendTimeConstant;
additionalProperties[L"epsilon"] = epsilon;
additionalProperties[L"useCuDNNEngine"] = useCuDNNEngine;
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::BatchNormalization,
std::vector<Variable>({ operand, scale, bias, runningMean, runningInvStd }),
std::move(additionalProperties),
name),
name);
}
FunctionPtr Combine(const std::vector<FunctionPtr>& operands, const std::wstring& name/* = L""*/)
{
std::unordered_set<FunctionPtr> uniqueOperands;
std::vector<Variable> inputs;
@ -888,49 +1207,4 @@ namespace CNTK
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Combine, inputs, Dictionary(), name), name);
}
FunctionPtr CrossEntropyWithSoftmax(const Variable& output, const Variable& labels, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::CrossEntropyWithSoftmax, std::vector<Variable>({ output, labels }), Dictionary(), name), name);
}
FunctionPtr ClassificationError(const Variable& prediction, const Variable& labels, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::ClassificationError, std::vector<Variable>({ prediction, labels }), Dictionary(), name), name);
}
FunctionPtr Exp(const Variable& operand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Exp, std::vector<Variable>({ operand }), Dictionary(), name), name);
}
FunctionPtr PastValue(const Variable& initialState, const Variable& operand, size_t stepSize, const std::wstring& name/* = L""*/)
{
if (operand.DynamicAxes().size() != 1)
InvalidArgument("PastValue overload that does not explicitly specify a dynamic axis can only be used for operands with exactly one dynamic axis");
auto additionalProperties = Dictionary();
additionalProperties[L"stepSize"] = DictionaryValue(stepSize);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::PastValue, std::vector<Variable>({ initialState, operand }), std::move(additionalProperties), name), name);
}
FunctionPtr FutureValue(const Variable& initialState, const Variable& operand, size_t stepSize, const std::wstring& name/* = L""*/)
{
if (operand.DynamicAxes().size() != 1)
InvalidArgument("FutureValue overload that does not explicitly specify a dynamic axis can only be used for operands with exactly one dynamic axis");
auto additionalProperties = Dictionary();
additionalProperties[L"stepSize"] = DictionaryValue(stepSize);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::FutureValue, std::vector<Variable>({ initialState, operand }), std::move(additionalProperties), name), name);
}
FunctionPtr ElementTimes(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::ElementTimes, std::vector<Variable>({ leftOperand, rightOperand }), Dictionary(), name), name);
}
FunctionPtr ReduceSum(const Variable& operand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::ReduceSum, std::vector<Variable>({ operand }), Dictionary(), name), name);
}
}
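A small composition sketch using the operator factories defined above (not part of the original source). It assumes the input and parameter Variables are created elsewhere, that each intermediate Function exposes its single output via Output(), and that the default arguments indicated in the definitions above exist in the declarations.
// Hypothetical sketch: wire a one-layer classifier out of the primitive-op factories above.
CNTK::FunctionPtr OneLayerClassifierSketch(const CNTK::Variable& features,
                                           const CNTK::Variable& labels,
                                           const CNTK::Variable& weights,
                                           const CNTK::Variable& bias)
{
    // z = Sigmoid(W * x + b); Output() extracts the single output Variable of each intermediate Function.
    auto affine = CNTK::Plus(CNTK::Times(weights, features)->Output(), bias);
    auto z = CNTK::Sigmoid(affine->Output());
    // Training criterion and evaluation metric on the prediction.
    auto loss = CNTK::CrossEntropyWithSoftmax(z->Output(), labels);
    auto error = CNTK::ClassificationError(z->Output(), labels);
    // A single composite root exposing the prediction, loss, and error as its outputs.
    return CNTK::Combine({ z, loss, error });
}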

View file

@ -10,65 +10,110 @@
#include <iterator>
#include "ComputationNetwork.h"
#include "Utils.h"
#include "ConvolveGeometry.h"
namespace CNTK
{
enum class PrimitiveOpType
enum class PrimitiveOpType : unsigned int
{
Plus,
Times,
Negate,
Sigmoid,
Tanh,
Combine,
ReLU,
Exp,
Log,
Sqrt,
Floor,
Abs,
Reciprocal,
Softmax,
Pooling,
Plus,
Minus,
ElementTimes,
Equal,
NotEqual,
Less,
LessEqual,
Greater,
GreaterEqual,
Times,
Convolution,
SquaredError,
CrossEntropyWithSoftmax,
ClassificationError,
Exp,
PastValue,
FutureValue,
ElementTimes,
ReduceSum
ReduceSum,
BatchNormalization,
Combine,
};
}
namespace std
{
template <> struct hash<CNTK::PrimitiveOpType>
{
size_t operator()(const CNTK::PrimitiveOpType& x) const
{
return std::hash<unsigned int>()((unsigned int)x);
}
};
}
namespace CNTK
{
inline const char* PrimitiveOpTypeName(PrimitiveOpType opType)
{
// TODO: Put these in table form
if (opType == PrimitiveOpType::Plus)
return "Plus";
else if (opType == PrimitiveOpType::Times)
return "Times";
else if (opType == PrimitiveOpType::Sigmoid)
return "Sigmoid";
else if (opType == PrimitiveOpType::Tanh)
return "Tanh";
else if (opType == PrimitiveOpType::Combine)
return "Combine";
else if (opType == PrimitiveOpType::CrossEntropyWithSoftmax)
return "CrossEntropyWithSoftmax";
else if (opType == PrimitiveOpType::ClassificationError)
return "ClassificationError";
else if (opType == PrimitiveOpType::Exp)
return "Exp";
else if (opType == PrimitiveOpType::PastValue)
return "PastValue";
else if (opType == PrimitiveOpType::FutureValue)
return "FutureValue";
else if (opType == PrimitiveOpType::ElementTimes)
return "ElementTimes";
else if (opType == PrimitiveOpType::ReduceSum)
return "ReduceSum";
else
static std::unordered_map<PrimitiveOpType, const char*> primitiveOpNames = {
{ PrimitiveOpType::Negate, "Negate" },
{ PrimitiveOpType::Sigmoid, "Sigmoid" },
{ PrimitiveOpType::Tanh, "Tanh" },
{ PrimitiveOpType::ReLU, "ReLU" },
{ PrimitiveOpType::Exp, "Exp" },
{ PrimitiveOpType::Log, "Log" },
{ PrimitiveOpType::Sqrt, "Sqrt" },
{ PrimitiveOpType::Floor, "Floor" },
{ PrimitiveOpType::Abs, "Abs" },
{ PrimitiveOpType::Reciprocal, "Reciprocal" },
{ PrimitiveOpType::Softmax, "Softmax" },
{ PrimitiveOpType::Pooling, "Pooling" },
{ PrimitiveOpType::Plus, "Plus" },
{ PrimitiveOpType::Minus, "Minus" },
{ PrimitiveOpType::ElementTimes, "ElementTimes" },
{ PrimitiveOpType::Equal, "Equal" },
{ PrimitiveOpType::NotEqual, "NotEqual" },
{ PrimitiveOpType::Less, "Less" },
{ PrimitiveOpType::LessEqual, "LessEqual" },
{ PrimitiveOpType::Greater, "Greater" },
{ PrimitiveOpType::GreaterEqual, "GreaterEqual" },
{ PrimitiveOpType::Times, "Times" },
{ PrimitiveOpType::Convolution, "Convolution" },
{ PrimitiveOpType::SquaredError, "SquaredError" },
{ PrimitiveOpType::CrossEntropyWithSoftmax, "CrossEntropyWithSoftmax" },
{ PrimitiveOpType::ClassificationError, "ClassificationError" },
{ PrimitiveOpType::PastValue, "PastValue" },
{ PrimitiveOpType::FutureValue, "FutureValue" },
{ PrimitiveOpType::ReduceSum, "ReduceSum" },
{ PrimitiveOpType::BatchNormalization, "BatchNormalization" },
{ PrimitiveOpType::Combine, "Combine" }
};
if (primitiveOpNames.find(opType) == primitiveOpNames.end())
LogicError("Unknown PrimitiveOpType");
return primitiveOpNames.find(opType)->second;
}
class PrimitiveFunction final : public Function
{
public:
PrimitiveFunction(PrimitiveOpType op, const std::vector<Variable>& inputs, Dictionary&& functionConfig, const std::wstring& functionName = L"")
: Function(inputs, GetOutputVariables(op, inputs, this), nullptr, functionName), m_op(op), m_functionConfig(std::move(functionConfig))
: Function(inputs, GetOutputVariables(op, inputs, this, functionConfig), nullptr, functionName), m_op(op), m_functionConfig(std::move(functionConfig))
{
}
virtual BackPropStatePtr Forward(const std::unordered_map<Variable, const ValuePtr>& /*arguments*/,
virtual BackPropStatePtr Forward(const std::unordered_map<Variable, ValuePtr>& /*arguments*/,
std::unordered_map<Variable, ValuePtr>& /*outputs*/,
const DeviceDescriptor& /*computeDevice*/,
const std::unordered_set<Variable>& /*outputsToRetainBackwardStateFor*/) override
@ -77,7 +122,7 @@ namespace CNTK
}
virtual void Backward(const BackPropStatePtr& /*state*/,
const std::unordered_map<Variable, const ValuePtr>& /*rootGradientValues*/,
const std::unordered_map<Variable, ValuePtr>& /*rootGradientValues*/,
std::unordered_map<Variable, ValuePtr>& /*backPropagatedGradientValuesForInputs*/) override
{
NOT_IMPLEMENTED;
@ -131,25 +176,28 @@ namespace CNTK
return NDShape(std::move(outputDims));
}
static NDShape TimesOpOutputShape(const NDShape& leftOperandShape, const NDShape& rightOperandShape)
static NDShape TimesOpOutputShape(const NDShape& leftOperandShape, const NDShape& rightOperandShape, size_t numOutputAxes)
{
if (rightOperandShape.NumAxes() > 2)
RuntimeError("The right operand of a times operation can have at most 2 axes");
if (numOutputAxes == 0)
InvalidArgument("Output #axes of times operation should be at least one");
size_t numOutputAxes = rightOperandShape.NumAxes();
if (numOutputAxes > leftOperandShape.NumAxes())
InvalidArgument("Output #axes of times operation can at most be the #axes of the left operand");
if (leftOperandShape.NumAxes() != 2)
RuntimeError("The left operand of a times operation must have 2 axes");
size_t numReductionAxes = leftOperandShape.NumAxes() - numOutputAxes;
std::vector<size_t> outputDims(numOutputAxes);
outputDims[0] = leftOperandShape[0];
if (numOutputAxes > 1)
outputDims[1] = rightOperandShape[1];
// The 'numReductionAxes' trailing dimensions of the left operand's shape must match the corresponding leading
// dimensions of the right operand
if (leftOperandShape[1] != rightOperandShape[0])
RuntimeError("Left operand's shape %s is not compatible with right operand's shape %s for the times operation", AsString(leftOperandShape).c_str(), AsString(rightOperandShape).c_str());
if (rightOperandShape.NumAxes() != numReductionAxes)
RuntimeError("The right operand's #axes in a times operation should equal #axes being reduced over!");
return NDShape(std::move(outputDims));
if (leftOperandShape.SubShape(numOutputAxes) != rightOperandShape)
InvalidArgument("The trailing dimensions of the left operand (%s) do not match the right operand's dimensions (%s)",
AsString(leftOperandShape.SubShape(numOutputAxes)).c_str(),
AsString(rightOperandShape).c_str());
return leftOperandShape.SubShape(0, numOutputAxes);
}
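// Shape-inference example for the above: a left operand [512 x 784] times a right operand [784] with
// numOutputAxes = 1 reduces over the single trailing axis (SubShape(1) == [784] matches the right operand)
// and yields [512]; a left operand [512 x 28 x 28] times a right operand [28 x 28] likewise yields [512],
// reducing over the two trailing axes.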
static NDShape ReductionOpOutputShape(PrimitiveOpType op, const NDShape& operandShape, const std::vector<size_t>& reductionAxes)
@ -171,8 +219,22 @@ namespace CNTK
return NDShape(std::move(outputDims));
}
static NDShape ConvolutionOpOutputShape(const NDShape& operandShape, const NDShape& kernelShape, const NDShape& outputMapCount, const NDShape& strides,
const std::vector<bool>& sharing,
std::vector<bool>& autoPad, const NDShape& lowerPad, const NDShape& upperPad,
bool transpose)
{
decltype(&Microsoft::MSR::CNTK::ConvolveGeometry::ComputeOutputShape) computeOutputShapeFunc;
if (!transpose)
computeOutputShapeFunc = &Microsoft::MSR::CNTK::ConvolveGeometry::ComputeOutputShape;
else
computeOutputShapeFunc = &Microsoft::MSR::CNTK::ConvolveGeometry::ComputeInputShape;
return AsNDShape(computeOutputShapeFunc(AsTensorShape(operandShape, true), AsTensorShape(kernelShape, true), AsTensorShape(outputMapCount, true), AsTensorShape(strides, true), sharing, autoPad, AsTensorShape(lowerPad, true), AsTensorShape(upperPad, true)));
}
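// For transpose == false this forwards to ConvolveGeometry::ComputeOutputShape (the usual per-axis
// convolution geometry derived from kernel, stride and padding); for a transposed convolution the same
// geometry is inverted via ComputeInputShape, since the deconvolution output is the shape whose forward
// convolution would have produced the given operand.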
// TODO: Reconcile this with the ComputationNode::Validate functionality in core CNTK to avoid duplication of inference logic
static std::vector<Variable> GetOutputVariables(PrimitiveOpType op, const std::vector<Variable>& inputs, Function* owner)
static std::vector<Variable> GetOutputVariables(PrimitiveOpType op, const std::vector<Variable>& inputs, Function* owner, const Dictionary& functionConfig)
{
std::vector<Variable> outputs;
@ -195,32 +257,79 @@ namespace CNTK
switch (op)
{
case PrimitiveOpType::Negate:
case PrimitiveOpType::Sigmoid:
case PrimitiveOpType::Tanh:
case PrimitiveOpType::ReLU:
case PrimitiveOpType::Exp:
case PrimitiveOpType::Log:
case PrimitiveOpType::Sqrt:
case PrimitiveOpType::Floor:
case PrimitiveOpType::Abs:
case PrimitiveOpType::Reciprocal:
case PrimitiveOpType::Softmax:
assert(inputs.size() == 1);
outputs.push_back(Variable(UnaryElementwiseOpOutputShape(inputs[0].Shape()), outputDataType, owner, outputDynamicAxes));
break;
case PrimitiveOpType::PastValue:
case PrimitiveOpType::FutureValue:
assert(inputs.size() == 2);
outputs.push_back(Variable(UnaryElementwiseOpOutputShape(inputs[1].Shape()), outputDataType, owner, outputDynamicAxes));
case PrimitiveOpType::Pooling:
{
assert(inputs.size() == 1);
auto poolingWindowsShape = functionConfig[L"poolingWindowShape"].GetValue<NDShape>();
auto strides = functionConfig[L"strides"].GetValue<NDShape>();
auto lowerPad = functionConfig[L"lowerPad"].GetValue<NDShape>();
auto upperPad = functionConfig[L"upperPad"].GetValue<NDShape>();
auto autoPadding = AsBasicElementTypeVector<bool>(functionConfig[L"autoPadding"].GetValue<std::vector<DictionaryValue>>());
outputs.push_back(Variable(ConvolutionOpOutputShape(inputs[0].Shape(), poolingWindowsShape, { 1 }, strides, { true }, autoPadding, lowerPad, upperPad, false), outputDataType, owner, outputDynamicAxes));
break;
}
case PrimitiveOpType::Plus:
case PrimitiveOpType::Minus:
case PrimitiveOpType::ElementTimes:
case PrimitiveOpType::Equal:
case PrimitiveOpType::NotEqual:
case PrimitiveOpType::Less:
case PrimitiveOpType::LessEqual:
case PrimitiveOpType::Greater:
case PrimitiveOpType::GreaterEqual:
assert(inputs.size() == 2);
outputs.push_back(Variable(BinaryElementwiseOpOutputShape(op, inputs[0].Shape(), inputs[1].Shape()), outputDataType, owner, outputDynamicAxes));
break;
case PrimitiveOpType::Times:
{
assert(inputs.size() == 2);
outputs.push_back(Variable(TimesOpOutputShape(inputs[0].Shape(), inputs[1].Shape()), outputDataType, owner, outputDynamicAxes));
// TODO: Support dynamic axes on the left operand
if (!inputs[0].DynamicAxes().empty())
LogicError("Dynamic axes are currently unsupported for left operand of a Times operation");
size_t numOutputAxes = functionConfig[L"numOutputAxes"].GetValue<size_t>();
outputs.push_back(Variable(TimesOpOutputShape(inputs[0].Shape(), inputs[1].Shape(), numOutputAxes), outputDataType, owner, outputDynamicAxes));
break;
}
case PrimitiveOpType::Convolution:
{
assert(inputs.size() == 2);
auto strides = functionConfig[L"strides"].GetValue<NDShape>();
auto lowerPad = functionConfig[L"lowerPad"].GetValue<NDShape>();
auto upperPad = functionConfig[L"upperPad"].GetValue<NDShape>();
auto sharing = AsBasicElementTypeVector<bool>(functionConfig[L"sharing"].GetValue<std::vector<DictionaryValue>>());
auto autoPadding = AsBasicElementTypeVector<bool>(functionConfig[L"autoPadding"].GetValue<std::vector<DictionaryValue>>());
bool transpose = functionConfig[L"transpose"].GetValue<bool>();
if (inputs[0].Shape().NumAxes() < inputs[1].Shape().NumAxes())
InvalidArgument("The convolution map should have at least as many axes as the shape of the input it operates on!");
NDShape outputMapCount, kernelShape;
std::tie(outputMapCount, kernelShape) = GetConvolutionOutputMapCountAndKernelShape(inputs[0].Shape(), inputs[1].Shape());
outputs.push_back(Variable(ConvolutionOpOutputShape(inputs[1].Shape(), kernelShape, outputMapCount, strides, sharing, autoPadding, lowerPad, upperPad, transpose), outputDataType, owner, outputDynamicAxes));
break;
}
case PrimitiveOpType::SquaredError:
case PrimitiveOpType::CrossEntropyWithSoftmax:
case PrimitiveOpType::ClassificationError:
{
assert(inputs.size() == 2);
if (inputs[0].Shape().NumAxes() > 1)
if ((inputs[0].Shape().NumAxes() > 2) || ((inputs[0].Shape().NumAxes() > 1) && (inputs[0].Shape()[1] != 1)))
InvalidArgument("The shape of input operands for the %s operation should have at most one axis", PrimitiveOpTypeName(op));
auto predictionShape = inputs[0].Shape();
@ -235,6 +344,11 @@ namespace CNTK
outputs.push_back(Variable(ReductionOpOutputShape(op, predictionShape, reductionAxes), outputDataType, owner, {}));
break;
}
case PrimitiveOpType::PastValue:
case PrimitiveOpType::FutureValue:
assert(inputs.size() == 2);
outputs.push_back(Variable(UnaryElementwiseOpOutputShape(inputs[1].Shape()), outputDataType, owner, outputDynamicAxes));
break;
case PrimitiveOpType::ReduceSum:
{
assert(inputs.size() == 1);
@ -249,6 +363,9 @@ namespace CNTK
outputs.push_back(Variable(ReductionOpOutputShape(op, inputs[0].Shape(), reductionAxes), outputDataType, owner, reductionOutputDynamicAxes));
break;
}
case PrimitiveOpType::BatchNormalization:
outputs.push_back(Variable(UnaryElementwiseOpOutputShape(inputs[0].Shape()), outputDataType, owner, outputDynamicAxes));
break;
case PrimitiveOpType::Combine:
outputs = inputs;
break;
@ -288,10 +405,18 @@ namespace CNTK
class CompositeFunction final : public Function
{
friend class Function;
friend class CompositeMinibatchSource;
template <typename T, typename ...CtorArgTypes>
friend inline std::shared_ptr<T> MakeSharedObject(CtorArgTypes&& ...ctorArgs);
template <typename ElementType>
friend void SaveAsLegacyModel(const FunctionPtr& rootFunction, const std::wstring& modelFile);
friend void ComputeInputPerDimMeansAndInvStdDevs(const MinibatchSourcePtr& minibatchSource,
std::unordered_map<StreamInfo, std::pair<NDArrayViewPtr, NDArrayViewPtr>>& computedMeanAndInvStdDevs,
const DeviceDescriptor& device /*= DeviceDescriptor::CPUDevice()*/);
public:
static CompositeFunctionPtr Create(const FunctionPtr& rootFunction, const std::wstring& name = L"")
{
@ -303,13 +428,13 @@ namespace CNTK
return MakeSharedObject<CompositeFunction>(rootFunction, std::move(visitedFunctions), name);
}
virtual BackPropStatePtr Forward(const std::unordered_map<Variable, const ValuePtr>& arguments,
virtual BackPropStatePtr Forward(const std::unordered_map<Variable, ValuePtr>& arguments,
std::unordered_map<Variable, ValuePtr>& outputs,
const DeviceDescriptor& computeDevice,
const std::unordered_set<Variable>& outputsToRetainBackwardStateFor) override;
virtual void Backward(const BackPropStatePtr& state,
const std::unordered_map<Variable, const ValuePtr>& rootGradientValues,
const std::unordered_map<Variable, ValuePtr>& rootGradientValues,
std::unordered_map<Variable, ValuePtr>& backPropagatedGradientValuesForInputs) override;
private:
@ -361,12 +486,13 @@ namespace CNTK
template <typename ElementType>
static void PopulateComputationNodeValue(const std::pair<Variable, ValuePtr>& variableValue, Microsoft::MSR::CNTK::ComputationNodeBasePtr& computationNode);
void PopulateNetworkInputs(const std::unordered_map<Variable, const ValuePtr>& arguments);
void PopulateNetworkInputs(const std::unordered_map<Variable, ValuePtr>& arguments);
template <typename ElementType>
static void PopulateComputationNodeGradient(const std::pair<Variable, ValuePtr>& variableGradient, Microsoft::MSR::CNTK::ComputationNodeBasePtr& computationNode);
void PopulateNetworkGradients(const std::unordered_map<Variable, const ValuePtr>& gradients);
void PopulateNetworkGradients(const std::unordered_map<Variable, ValuePtr>& gradients);
static void GetNodeOutputOrGradient(Variable var, ValuePtr& varValue, Microsoft::MSR::CNTK::ComputationNodeBasePtr& computationNode, bool getGradient);
void GetNetworkOutputs(std::unordered_map<Variable, ValuePtr>& outputs);
void GetNetworkGradients(std::unordered_map<Variable, ValuePtr>& gradients);
@ -374,7 +500,9 @@ namespace CNTK
static std::pair<std::shared_ptr<const Microsoft::MSR::CNTK::Matrix<ElementType>>, Microsoft::MSR::CNTK::MBLayoutPtr> GetCNTKImplMatrixAndMBLayoutFromValueObject(Variable var, const ValuePtr& value);
template <typename ElementType>
static ValuePtr GetValueObjectFromCNTKImplMatrixAndMBLayout(Variable var, const Microsoft::MSR::CNTK::Matrix<ElementType>& matrix, const Microsoft::MSR::CNTK::MBLayoutPtr& layout);
static ValuePtr GetValueObjectFromCNTKImplMatrixAndMBLayout(const NDShape& sampleShape, const Microsoft::MSR::CNTK::Matrix<ElementType>& matrix, const Microsoft::MSR::CNTK::MBLayoutPtr& layout, bool readOnly = true);
template <typename ElementType>
static ValuePtr GetValueObjectFromCNTKImplMatrixAndMBLayout(Variable var, const Microsoft::MSR::CNTK::Matrix<ElementType>& matrix, const Microsoft::MSR::CNTK::MBLayoutPtr& layout, bool readOnly = true);
private:


@ -0,0 +1,451 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "Learner.h"
#include "TensorView.h"
#include "Utils.h"
#define UPDATE_FUNCTION \
switch (smoothedGradientValue->GetDataType()) \
{ \
case DataType::Float: \
Update<float>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount); \
break; \
case DataType::Double: \
Update<double>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount); \
break; \
default: \
NOT_IMPLEMENTED; \
}
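// The UPDATE_FUNCTION macro dispatches to the templatized Update<ElementType>() overload of the enclosing
// learner class based on the element type of the smoothed gradient, so each concrete learner only needs to
// provide the float/double template implementations plus a one-line virtual override.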
using namespace Microsoft::MSR::CNTK;
using namespace std;
namespace CNTK
{
template <typename ElementType>
/*static*/ shared_ptr<const Matrix<ElementType>> LearnerBase::GetMatrix(const NDArrayViewPtr& arrayView)
{
return arrayView->GetMatrix<ElementType>();
}
template <typename ElementType>
/*static*/ shared_ptr<Matrix<ElementType>> LearnerBase::GetWritableMatrix(const NDArrayViewPtr& arrayView)
{
return arrayView->GetWritableMatrix<ElementType>();
}
template <typename ElementType>
/*static*/ const TensorView<ElementType>* LearnerBase::GetTensorView(const NDArrayViewPtr& arrayView)
{
return arrayView->GetTensorView<ElementType>();
}
/*static*/ bool LearnerBase::HasNan(const NDArrayViewPtr& value, const char* name)
{
switch (value->GetDataType())
{
case DataType::Float:
return value->GetMatrix<float>()->HasNan(name);
case DataType::Double:
return value->GetMatrix<double>()->HasNan(name);
default:
LogicError("Unsupported DataType %s", DataTypeName(value->GetDataType()));
}
}
/*static*/ void LearnerBase::Print(const NDArrayViewPtr& value, const char* msg)
{
switch (value->GetDataType())
{
case DataType::Float:
value->GetMatrix<float>()->Print(msg);
break;
case DataType::Double:
value->GetMatrix<double>()->Print(msg);
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(value->GetDataType()));
}
}
// Clip gradients to prevent outliers.
template <typename ElementType>
void LearnerBase::ClipGradient(Matrix<ElementType>& gradient, size_t actualMBSize) const
{
if (m_additionalOptions.gradientClippingThresholdPerSample != numeric_limits<double>::infinity())
{
double maxGradientPerMB = m_additionalOptions.gradientClippingThresholdPerSample * actualMBSize;
if (m_additionalOptions.gradientClippingWithTruncation)
gradient.InplaceTruncate(ElementType(maxGradientPerMB));
else
{
// norm2 normalized
double gradientNorm = gradient.FrobeniusNorm();
if (gradientNorm > maxGradientPerMB)
{
double normFactor = maxGradientPerMB / gradientNorm;
gradient *= ElementType(normFactor);
}
}
}
}
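// Worked example: with gradientClippingThresholdPerSample = 0.001 and actualMBSize = 256,
// maxGradientPerMB = 0.256. With truncation, every gradient element is clamped to [-0.256, 0.256];
// without truncation, a gradient with Frobenius norm 1.28 is rescaled by 0.256 / 1.28 = 0.2 so that its
// norm equals the per-minibatch cap.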
// Performs additional preprocessing before calling the update method
// (gradient clipping and L2 regularization depending on the additional learning parameters).
template <typename ElementType>
void LearnerBase::PreProcess(const NDArrayViewPtr& parameterValue, const NDArrayViewPtr& gradientValue, size_t actualMBSize) const
{
const auto& gradientMatrix = gradientValue->GetWritableMatrix<ElementType>();
// clipping gradients to prevent outliers
ClipGradient<ElementType>(*gradientMatrix, actualMBSize);
// L2 regularizer
if (m_additionalOptions.l2RegularizationWeight > 0)
{
// multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample
auto weight = ElementType(m_additionalOptions.l2RegularizationWeight * actualMBSize);
const auto& parameterMatrix = parameterValue->GetWritableMatrix<ElementType>();
Matrix<ElementType>::ScaleAndAdd(weight, *parameterMatrix, *gradientMatrix);
}
}
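// The L2 term is folded into the gradient as g += (l2RegularizationWeight * actualMBSize) * w; combined
// with the per-sample learning rate, the weight-decay contribution is therefore applied once per sample,
// independently of the minibatch size.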
// Performs additional postprocessing after the update method has been executed
// (noise injection and L1 regularization specified by the additional learning parameters).
template <typename ElementType>
void LearnerBase::PostProcess(const Parameter& parameter, const NDArrayViewPtr& gradientValue, size_t actualMBSize) const
{
const auto& parameterValue = parameter.Value();
const auto& parameterMatrix = parameterValue->GetWritableMatrix<ElementType>();
if (m_additionalOptions.gaussianNoiseInjectionStdDev > 0)
{
const auto& gradientMatrix = gradientValue->GetWritableMatrix<ElementType>();
Matrix<ElementType> sgdUpdateNoise((DEVICEID_TYPE)parameterMatrix->GetDeviceId());
// get the gradient structure since gradient is sparse
sgdUpdateNoise.SetValue(*gradientMatrix);
auto noiseStdDev = ElementType(m_additionalOptions.gaussianNoiseInjectionStdDev);
// reset its value to random
sgdUpdateNoise.SetGaussianRandomValue(ElementType(0.0), noiseStdDev);
Matrix<ElementType>::ScaleAndAdd(ElementType(1.0), sgdUpdateNoise, *parameterMatrix);
}
// L1 regularizer with proximal gradient descent method
if (m_additionalOptions.l1RegularizationWeight > 0)
{
auto learningRate = ElementType(ParameterDependentLearningRate(parameter));
// multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample
auto weight = ElementType(learningRate * m_additionalOptions.l1RegularizationWeight * actualMBSize);
parameterValue->GetWritableMatrix<ElementType>()->InplaceSoftThreshold(weight);
}
}
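// InplaceSoftThreshold is the proximal step for the L1 penalty: each parameter element is shrunk towards
// zero by 'weight' and clipped at zero, i.e. w_i <- sign(w_i) * max(|w_i| - weight, 0), which drives small
// weights exactly to zero and yields sparse parameters.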
template <typename ElementType>
/*static*/ TensorView<ElementType>* LearnerBase::GetWritableTensorView(const NDArrayViewPtr& arrayView)
{
return arrayView->GetWritableTensorView<ElementType>();
}
LearnerBase::LearnerBase(const unordered_set<Parameter>& parameters)
: Learner(parameters),
m_learningRatePerSample(0.0),
m_sampleCount(0)
{
const unordered_set<Parameter>& parameterSet = parameters;
for (const auto& parameter : parameterSet)
{
// TODO: using the same device to allocate data for all smoothed gradients. Is this correct?
// Should the device be specified on a per-parameter basis?
NDArrayViewPtr view;
if (parameter.GetDataType() == DataType::Float)
{
view = MakeSharedObject<NDArrayView>(0.0f, parameter.Shape(), parameter.Value()->Device());
}
else
{
view = MakeSharedObject<NDArrayView>(0.0, parameter.Shape(), parameter.Value()->Device());
}
m_smoothedGradientValues.insert(make_pair(parameter, view));
m_additionalOptions.learningRateMultipliers.insert(make_pair(parameter, 1.0));
}
}
void LearnerBase::ResetSmoothedGradients()
{
for (const auto& parameter : Parameters())
{
const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
const auto& data = smoothedGradientValue;
switch (data->GetDataType())
{
case DataType::Float:
data->SetValue(0.0f);
break;
case DataType::Double:
data->SetValue(0.0);
break;
default:
LogicError("Unsupported DataType %s", ::CNTK::DataTypeName(data->GetDataType()));
}
}
}
/*virtual*/ bool LearnerBase::Update(const unordered_map<Parameter, NDArrayViewPtr>& gradientValues, size_t trainingSampleCount) /*override*/
{
// make sure trainingSampleCount is a valid value
assert(trainingSampleCount > 0);
for (const auto& parameter : Parameters())
{
const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
const auto& gradientValue = gradientValues.at(parameter);
// TODO: make this a runtime parameter.
#if DUMPOUTPUT
LOGPRINTF(stderr, "Update_%ls\n", parameter.Name().c_str());
#endif
#ifdef _DEBUG
if (HasNan(smoothedGradientValue, "TrainOneEpoch/UpdateWeights/Learner::Update(): "))
LogicError("%ls has NaNs in smoothedGradient.", parameter.Name().c_str());
#endif
#if DUMPOUTPUT
LOGPRINTF(stderr, "learnRatePerSample=%0.8f, momentum=%0.8f, actualMBSize=%ld\n",
m_learningRatePerSample, m_momentumPerSample, trainingSampleCount);
LOGPRINTF(stderr, "GradUpdateType()=%s, GradientUpdateNoiseStd()=%0.8f\n",
LearnerType().c_str(), m_GaussianNoiseInjectStd);
Print(gradientValue, "Gradient Update");
Print(smoothedGradientValue, "Smoothed Gradient Input");
#endif
UPDATE_FUNCTION;
#if DUMPOUTPUT
Print(parameterValue, "Parameter Update");
#endif
#ifdef _DEBUG
const auto& parameterValue = parameter.Value();
if (HasNan(parameterValue, "TrainOneEpoch/UpdateWeights/Learner::Update(): "))
LogicError("%ls has NaNs in parameter values after parameter update.", parameter.Name().c_str());
#endif
}
m_sampleCount += trainingSampleCount;
return false;
}
template <typename ElementType>
void LearnerBase::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
{
const auto& parameterValue = parameter.Value();
PreProcess<ElementType>(parameterValue, gradientValue, trainingSampleCount);
Update(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
PostProcess<ElementType>(parameter, gradientValue, trainingSampleCount);
}
string LearnerBase::LearnerType() const
{
auto name = typeid(*this).name();
if (strncmp(name, "class ", 6) == 0)
{
// On Windows, the type name contains "class" prefix.
// Return the actual name, omitting the prefix.
return &name[6];
}
return name;
}
/*virtual*/ Dictionary LearnerBase::GetCheckpointState() const /*override*/
{
NOT_IMPLEMENTED; // Until the new checkpointing is fully fleshed out, nobody should be calling this.
Dictionary checkpoint;
for (const auto& parameter : Parameters())
{
// TODO: parameter name is not guaranteed to be unique. Instead, all serializable objects
// need to expose "UId" property -- a persistent unique internal name.
// Switch to UId as soon as it's available.
if (checkpoint.Contains(parameter.Name()))
{
LogicError("Parameter names must be unique");
}
const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
// Potentially, could store things like dimensions, element size, format, etc., but
// that seems to be redundant, since all of that is passed in the constructor.
checkpoint[parameter.Name()] = SerializeToVector(smoothedGradientValue);
}
return checkpoint;
}
/*virtual*/ void LearnerBase::RestoreFromCheckpoint(const Dictionary& checkpoint) /*override*/
{
NOT_IMPLEMENTED; // Until the new checkpointing is fully fleshed out, nobody should be calling this.
for (const auto& parameter : Parameters())
{
if (!checkpoint.Contains(parameter.Name()))
{
LogicError("Checkpoint does not contain state for parameter %ls", parameter.Name().c_str());
}
const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
const DictionaryValue& state = checkpoint[parameter.Name()];
const auto& data = smoothedGradientValue;
DeserializeFromVector(data, state.GetValue<vector<DictionaryValue>>());
}
}
/*virtual*/ void LearnerSGD::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
{
UPDATE_FUNCTION;
}
template <typename ElementType>
void LearnerSGD::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
{
UNUSED(trainingSampleCount);
const auto& parameterValue = parameter.Value();
const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue);
const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);
const auto& learningRate = ElementType(ParameterDependentLearningRate(parameter));
// TODO: break up the NormalGrad into 3 different functions, each with its own set of parameters
// (one for vanilla SGD, the other for momentum SGD, and the third one for NAG).
smoothedGradientMatrix->NormalGrad(*gradientMatrix, *parameterMatrix,
learningRate, ElementType(m_momentumPerSample), m_useNesterovAcceleration);
}
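// Sketch of the update delegated to Matrix::NormalGrad (the exact semantics live in the matrix library):
// classical momentum SGD, roughly v <- momentum * v + learningRate * g; w <- w - v, with the Nesterov
// lookahead correction applied when m_useNesterovAcceleration is set.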
LearnerAdaGrad::LearnerAdaGrad(const unordered_set<Parameter>& parameters, bool needAveMultiplier)
: LearnerBase(parameters), m_needAveMultiplier(needAveMultiplier)
{
}
/*virtual*/ void LearnerAdaGrad::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
{
UPDATE_FUNCTION;
}
template <typename ElementType>
void LearnerAdaGrad::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
{
UNUSED(trainingSampleCount);
const auto& parameterValue = parameter.Value();
const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue);
const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);
auto learningRate = ElementType(ParameterDependentLearningRate(parameter));
auto aveMultiplier = smoothedGradientMatrix->Adagrad(*gradientMatrix, m_needAveMultiplier);
Matrix<ElementType>::ScaleAndAdd(ElementType(-learningRate / aveMultiplier), *gradientMatrix, *parameterMatrix);
}
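// Sketch of the AdaGrad step: Matrix::Adagrad accumulates squared gradients in the smoothed-gradient
// buffer and rescales the gradient (roughly by 1 / sqrt(accumulated squared gradients)), returning an
// averaged multiplier when requested; the ScaleAndAdd above then applies
// w <- w - (learningRate / aveMultiplier) * rescaledGradient.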
LearnerFSAdaGrad::LearnerFSAdaGrad(const unordered_set<Parameter>& parameters)
: LearnerMomentumSGD(parameters)
{
}
/*virtual*/ void LearnerFSAdaGrad::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
{
UPDATE_FUNCTION;
}
template <typename ElementType>
void LearnerFSAdaGrad::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
{
UNUSED(trainingSampleCount);
const auto& parameterValue = parameter.Value();
const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue);
const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);
//const double momentum = MomentumPerMB(m_momentumPerSample, trainingSampleCount);
auto learningRate = ElementType(ParameterDependentLearningRate(parameter));
smoothedGradientMatrix->FSAdagrad(trainingSampleCount, *gradientMatrix, *parameterMatrix,
learningRate, ElementType(m_momentumPerSample));
}
LearnerRMSProp::LearnerRMSProp(const unordered_set<Parameter>& parameters,
double gamma, double inc, double dec, double max, double min, bool needAveMultiplier)
: LearnerBase(parameters),
m_gamma(gamma), m_inc(inc), m_dec(dec), m_max(max), m_min(min),
m_needAveMultiplier(needAveMultiplier)
{
}
/*virtual*/ void LearnerRMSProp::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
{
UPDATE_FUNCTION;
}
template <typename ElementType>
void LearnerRMSProp::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
{
UNUSED(trainingSampleCount);
const auto& parameterValue = parameter.Value();
const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue);
const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);
auto learningRate = ElementType(ParameterDependentLearningRate(parameter));
auto aveMultiplier = smoothedGradientMatrix->RmsProp(*gradientMatrix,
ElementType(m_gamma), ElementType(m_inc),
ElementType(m_max), ElementType(m_dec),
ElementType(m_min), m_needAveMultiplier);
Matrix<ElementType>::ScaleAndAdd(ElementType(-learningRate / aveMultiplier), *gradientMatrix, *parameterMatrix);
}
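// Sketch of the RMSProp step: Matrix::RmsProp maintains a leaky average of squared gradients governed by
// gamma and the inc/dec/max/min adaptation bounds and rescales the gradient by the inverse root of that
// average; the ScaleAndAdd above then applies w <- w - (learningRate / aveMultiplier) * rescaledGradient,
// analogous to the AdaGrad case.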
// Explicit template instantiations
template shared_ptr<Matrix<float>> LearnerBase::GetWritableMatrix<float>(const NDArrayViewPtr& arrayView);
template shared_ptr<Matrix<double>> LearnerBase::GetWritableMatrix<double>(const NDArrayViewPtr& arrayView);
LearnerPtr SGDLearner(const unordered_set<Parameter>& parameters, double learningRatePerSample)
{
return MakeSharedObject<LearnerSGD>(parameters, learningRatePerSample);
}
LearnerPtr MomentumSGDLearner(const unordered_set<Parameter>& parameters)
{
return MakeSharedObject<LearnerMomentumSGD>(parameters);
}
LearnerPtr NesterovLearner(const unordered_set<Parameter>& parameters)
{
return MakeSharedObject<LearnerNesterov>(parameters);
}
LearnerPtr AdaGradLearner(const unordered_set<Parameter>& parameters, bool needAveMultiplier)
{
return MakeSharedObject<LearnerAdaGrad>(parameters, needAveMultiplier);
}
LearnerPtr FSAdaGradLearner(const unordered_set<Parameter>& parameters)
{
return MakeSharedObject<LearnerFSAdaGrad>(parameters);
}
LearnerPtr RMSPropLearner(const unordered_set<Parameter>& parameters,
double gamma, double inc, double dec, double max, double min, bool needAveMultiplier)
{
return MakeSharedObject<LearnerRMSProp>(parameters, gamma, inc, dec, max, min, needAveMultiplier);
}
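// Illustrative usage sketch (assumes 'model' is a FunctionPtr built with this API; the learning rate is
// arbitrary):
//
//   std::unordered_set<Parameter> parameters = model->Parameters();
//   LearnerPtr learner = SGDLearner(parameters, /*learningRatePerSample=*/ 0.005);
//
// The resulting learner is handed to a Trainer, which routes the per-parameter gradients into Update();
// momentum and learning-rate schedules are set through the concrete learner classes declared in Learner.h.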
}


@ -0,0 +1,201 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#include "CNTKLibrary.h"
#include <numeric>
namespace CNTK
{
// A collection of additional options that are applicable for all standard learners
// (after these options are set, they retain their value for the entire lifespan of a learner).
struct AdditionalLearningOptions
{
double l1RegularizationWeight = 0.0;
double l2RegularizationWeight = 0.0;
double gaussianNoiseInjectionStdDev = 0.0;
bool gradientClippingWithTruncation = true;
double gradientClippingThresholdPerSample = std::numeric_limits<double>::infinity();
std::unordered_map<Parameter, double> learningRateMultipliers;
};
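// Illustrative usage sketch (values are arbitrary; 'learner' stands for any LearnerBase-derived object):
//
//   AdditionalLearningOptions options;
//   options.l2RegularizationWeight = 0.0005;
//   options.gradientClippingThresholdPerSample = 0.001;
//   learner.SetAdditionalOptions(options);
//
// Note that SetAdditionalOptions below replaces the whole struct, including learningRateMultipliers,
// which the LearnerBase constructor otherwise initializes to 1.0 for every parameter.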
// An abstract base class at the root of the standard learners hierarchy
// It implements most of the learner functionality, except for the actual update function,
// and adds a few pre-/postprocessing methods (which are invoked before and after the update).
class LearnerBase : public Learner
{
public:
virtual bool Update(const std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, size_t trainingSampleCount) override final;
virtual Dictionary GetCheckpointState() const override final;
virtual void RestoreFromCheckpoint(const Dictionary& checkpoint) override final;
void SetAdditionalOptions(const AdditionalLearningOptions& additionalOptions)
{
m_additionalOptions = additionalOptions;
}
// TODO: should this be called ResetMomentum?
// needed for BlockMomentumSGD to reset SGD momentum after aggregation.
void ResetSmoothedGradients();
// TODO: move learning rate and momentum scheduling and adjustment functionality
// inside the learner and drop these setters.
void SetLearningRate(double value) { m_learningRatePerSample = value; }
protected:
LearnerBase(const std::unordered_set<Parameter>& parameters);
virtual void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const = 0;
double ParameterDependentLearningRate(const Parameter& parameter) const
{
return m_learningRatePerSample * m_additionalOptions.learningRateMultipliers.at(parameter);
}
std::string LearnerType() const;
double m_learningRatePerSample;
AdditionalLearningOptions m_additionalOptions;
std::unordered_map<Parameter, NDArrayViewPtr> m_smoothedGradientValues;
// The following four static protected methods expose private methods of NDArrayView class
// (which declares LearnerBase as friend class), so that they are available to subclasses.
template <typename ElementType>
static std::shared_ptr<const Microsoft::MSR::CNTK::Matrix<ElementType>> GetMatrix(const NDArrayViewPtr& arrayView);
template <typename ElementType>
static std::shared_ptr<Microsoft::MSR::CNTK::Matrix<ElementType>> GetWritableMatrix(const NDArrayViewPtr& arrayView);
template <typename ElementType>
static const Microsoft::MSR::CNTK::TensorView<ElementType>* GetTensorView(const NDArrayViewPtr& arrayView);
template <typename ElementType>
static Microsoft::MSR::CNTK::TensorView<ElementType>* GetWritableTensorView(const NDArrayViewPtr& arrayView);
template <typename ElementType>
void ClipGradient(Microsoft::MSR::CNTK::Matrix<ElementType>& gradient, size_t actualMBSize) const;
// Performs additional preprocessing before calling the update method
// (gradient clipping and L2 regularization depending on the additional learning parameters).
template <typename ElementType>
void PreProcess(const NDArrayViewPtr& parameterValue, const NDArrayViewPtr& gradientValue, size_t actualMBSize) const;
// Performs additional postprocessing after the update method has been executed
// (noise injection and L1 regularization specified by the additional learning parameters).
template <typename ElementType>
void PostProcess(const Parameter& parameter, const NDArrayViewPtr& gradientValue, size_t actualMBSize) const;
private:
// Templatized update function, it invokes preprocess and postprocess using the provided
// template parameter and also invokes virtual Update method implemented in one of the subclasses.
template <typename ElementType>
void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;
// TODO: make these functions friends of NDArrayView and move to Utils?
static bool HasNan(const NDArrayViewPtr& value, const char* name);
static void Print(const NDArrayViewPtr& value, const char* msg);
size_t m_sampleCount;
};
// Vanilla gradient descent optimization algorithm.
class LearnerSGD : public LearnerBase
{
public:
LearnerSGD(const std::unordered_set<Parameter>& parameters, double learningRatePerSample = 0)
: LearnerBase(parameters), m_momentumPerSample(0.0), m_useNesterovAcceleration(false)
{
SetLearningRate(learningRatePerSample);
}
protected:
virtual void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const override;
template <typename ElementType>
void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;
double m_momentumPerSample;
bool m_useNesterovAcceleration;
};
// SGD optimization with momentum.
class LearnerMomentumSGD : public LearnerSGD
{
public:
LearnerMomentumSGD(const std::unordered_set<Parameter>& parameters)
: LearnerSGD(parameters)
{}
void SetMomentum(double value) { m_momentumPerSample = value; }
};
// Nesterov's accelerated gradient descent.
class LearnerNesterov : public LearnerSGD
{
public:
LearnerNesterov(const std::unordered_set<Parameter>& parameters)
: LearnerSGD(parameters)
{
m_useNesterovAcceleration = true;
}
};
class LearnerAdaGrad : public LearnerBase
{
public:
LearnerAdaGrad(const std::unordered_set<Parameter>& parameters, bool needAveMultiplier);
protected:
bool m_needAveMultiplier;
virtual void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const override;
template <typename ElementType>
void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;
};
class LearnerFSAdaGrad : public LearnerMomentumSGD
{
public:
LearnerFSAdaGrad(const std::unordered_set<Parameter>& parameters);
protected:
virtual void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const override;
template <typename ElementType>
void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;
};
class LearnerRMSProp : public LearnerBase
{
public:
LearnerRMSProp(const std::unordered_set<Parameter>& parameters,
double gamma, double inc, double dec, double max, double min, bool needAveMultiplier);
protected:
double m_gamma;
double m_inc;
double m_dec;
double m_max;
double m_min;
bool m_needAveMultiplier;
virtual void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const override;
template <typename ElementType>
void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;
};
}


@ -0,0 +1,246 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#include "CNTKLibrary.h"
#include "Utils.h"
#include "Config.h"
#include "MinibatchSource.h"
#include "HeapMemoryProvider.h"
#include "ReaderShim.h"
#include "Function.h"
#include <tuple>
#include "ComputationNetworkBuilder.h"
using namespace Microsoft::MSR::CNTK;
namespace CNTK
{
MinibatchSourcePtr CreateCompositeMinibatchSource(const Dictionary& configuration)
{
return MinibatchSourcePtr(new CompositeMinibatchSource(configuration));
}
CompositeMinibatchSource::CompositeMinibatchSource(const Dictionary& configuration)
: m_epochEndReached(false), m_prevMinibatchSize(0), m_epochSize(SIZE_MAX)
{
ConfigParameters config;
std::wstringstream s;
for (const auto& keyValuePair : *(configuration.m_dictionaryData))
AddConfigString(s, keyValuePair.first, keyValuePair.second, 0);
config.Parse(msra::strfun::utf8(s.str()));
const wchar_t* epochSizeConfigurationKey = L"epochSize";
if (configuration.Contains(epochSizeConfigurationKey))
m_epochSize = configuration[epochSizeConfigurationKey].GetValue<size_t>();
if (m_epochSize == 0)
m_epochSize = Microsoft::MSR::CNTK::requestDataSize;
typedef Reader*(*CreateCompositeDataReaderProc)(const ConfigParameters* parameters);
CreateCompositeDataReaderProc createReaderProc = (CreateCompositeDataReaderProc)Plugin().Load(L"CompositeDataReader", "CreateCompositeDataReader");
m_compositeDataReader.reset(createReaderProc(&config));
auto compositeDataReaderStreamDescs = m_compositeDataReader->GetStreamDescriptions();
for (auto streamDesc : compositeDataReaderStreamDescs)
m_streamInfos.insert({ streamDesc->m_name, streamDesc->m_id, AsStorageFormat(streamDesc->m_storageType), AsDataType(streamDesc->m_elementType), AsNDShape(*(streamDesc->m_sampleLayout)) });
}
/*virtual*/ std::unordered_map<StreamInfo, MinibatchData> CompositeMinibatchSource::GetNextMinibatch(const std::unordered_map<StreamInfo, std::pair<size_t, size_t>>& perStreamMBSizeLimits,
const DeviceDescriptor& device /*= DeviceDescriptor::DefaultDevice()*/) /*override*/
{
std::unordered_map<StreamInfo, MinibatchData> minibatchData;
if (!m_epochEndReached)
{
// TODO: Support different minibatch sizes for different streams
size_t requestedMinibatchSizeInSamples = 0;
for (const auto& val : perStreamMBSizeLimits)
{
size_t maxNumSequencesRequested = val.second.first;
size_t maxNumSamplesRequested = val.second.second;
// TODO: Specifying minibatch size in #sequences is currently unsupported
if (maxNumSequencesRequested != 0)
LogicError("Specifying minibatch size in #sequences is currently unsupported");
if (requestedMinibatchSizeInSamples == 0)
requestedMinibatchSizeInSamples = maxNumSamplesRequested;
else
{
if (requestedMinibatchSizeInSamples != maxNumSamplesRequested)
LogicError("Different minibatch sizes across different input streams is currently unsupported!");
}
}
if (requestedMinibatchSizeInSamples == 0)
InvalidArgument("GetNextMinibatch: Requested minibatch sizes must be > 0");
if (m_prevMinibatchSize == 0)
{
// TODO: Add support for distributed reading
EpochConfiguration epochConfig = { 1, 0, requestedMinibatchSizeInSamples, m_epochSize, 0, 0 };
m_compositeDataReader->StartEpoch(epochConfig);
m_prevMinibatchSize = requestedMinibatchSizeInSamples;
}
if (requestedMinibatchSizeInSamples != m_prevMinibatchSize)
LogicError("GetNextMinibatch: Changing minibatch sizes across calls is currently unsupported");
auto compositeReaderMinibatchData = m_compositeDataReader->ReadMinibatch();
m_epochEndReached = compositeReaderMinibatchData.m_endOfEpoch;
auto compositeDataReaderStreamDescs = m_compositeDataReader->GetStreamDescriptions();
size_t numStreams = compositeDataReaderStreamDescs.size();
for (size_t i = 0; i < numStreams; ++i)
{
auto currentStreamDesc = compositeDataReaderStreamDescs[i];
auto iter = std::find_if(perStreamMBSizeLimits.begin(), perStreamMBSizeLimits.end(), [currentStreamDesc](const std::pair<StreamInfo, std::pair<size_t, size_t>>& entry) {
return entry.first.m_id == currentStreamDesc->m_id;
});
if (iter == perStreamMBSizeLimits.end())
continue;
auto& currentStreamInfo = iter->first;
auto sampleShape = AsNDShape(*(currentStreamDesc->m_sampleLayout));
ValuePtr minibatchValuePtr;
if (compositeReaderMinibatchData.m_data.empty())
{
minibatchValuePtr = MakeSharedObject<Value>(MakeSharedObject<NDArrayView>(currentStreamInfo.m_elementType, sampleShape.AppendShape({ 0, 0 }), DeviceDescriptor::CPUDevice()));
continue;
}
auto currentStreamMinibatchData = compositeReaderMinibatchData.m_data[i];
if (currentStreamDesc->m_elementType == ElementType::tfloat)
{
auto dataMatrix = std::make_shared<Matrix<float>>(CPUDEVICE);
size_t sampleSize = currentStreamDesc->m_sampleLayout->GetNumElements();
// TODO: Eliminate the unnecessary CPU to CPU copy
ReaderShim<float>::FillMatrixFromStream(currentStreamDesc->m_storageType, dataMatrix.get(), sampleSize, currentStreamMinibatchData);
minibatchValuePtr = CompositeFunction::GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(sampleShape, *dataMatrix, currentStreamMinibatchData->m_layout, false);
size_t numSamples = currentStreamMinibatchData->m_layout->GetActualNumSamples();
size_t numSequences = currentStreamMinibatchData->m_layout->GetNumSequences();
minibatchData[currentStreamInfo] = { numSequences, numSamples, minibatchValuePtr };
}
else
LogicError("Input data of type other than DataType::Float is currently unsupported by the CNTK built-in composite MinibatchSource!");
}
}
return minibatchData;
}
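// Illustrative usage sketch (the 256-sample limit is arbitrary; per-sequence limits must be 0, see above):
//
//   auto source = CreateCompositeMinibatchSource(readerConfig);
//   std::unordered_map<StreamInfo, std::pair<size_t, size_t>> limits;
//   for (const auto& stream : source->StreamInfos())
//       limits[stream] = { 0 /*#sequences*/, 256 /*#samples*/ };
//   auto minibatch = source->GetNextMinibatch(limits);
//
// Each MinibatchData entry carries the number of sequences, the number of samples and the ValuePtr
// holding the data, in the initialization order used above.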
void ComputeInputPerDimMeansAndInvStdDevs(const MinibatchSourcePtr& minibatchSource,
std::unordered_map<StreamInfo, std::pair<NDArrayViewPtr, NDArrayViewPtr>>& computedMeanAndInvStdDevs,
const DeviceDescriptor& device /*= DeviceDescriptor::CPUDevice()*/)
{
typedef std::shared_ptr<ComputationNode<float>> ComputationNodePtr;
const auto& minibatchSourceStreams = minibatchSource->StreamInfos();
auto computationNetwork = std::make_shared<ComputationNetwork>(AsCNTKImplDeviceId(device));
ComputationNetworkBuilder<float> builder(*computationNetwork);
std::vector<ComputationNodeBasePtr> allInputNodes;
std::unordered_map<StreamInfo, ComputationNodeBasePtr> streamToInputNodeMap;
std::unordered_map<StreamInfo, Variable> streamToDummyInputVariableMap;
std::unordered_map<StreamInfo, ComputationNodeBasePtr> streamToMeanNodeMap;
std::unordered_map<StreamInfo, ComputationNodeBasePtr> streamToInvStdDevNodeMap;
size_t totalSizePerSample = 0;
for (auto& currentStreamKV : computedMeanAndInvStdDevs)
{
auto currentStreamInfo = currentStreamKV.first;
if (minibatchSourceStreams.find(currentStreamInfo) == minibatchSourceStreams.end())
InvalidArgument("ComputeMeanAndVariance: Stream for which mean and variance is to be computed is not supported by the specified minibatchSource");
if (currentStreamInfo.m_elementType != DataType::Float)
LogicError("Input data of type other than DataType::Float is currently unsupported by the CNTK built-in composite MinibatchSource!");
auto inputVariableShape = currentStreamInfo.m_sampleLayout;
auto inputTensorShape = AsTensorShape(inputVariableShape);
totalSizePerSample += (inputVariableShape.TotalSize() * sizeof(float));
ComputationNodePtr inputNode;
Variable inputVariable;
if (currentStreamInfo.m_storageFormat != StorageFormat::Dense)
{
inputNode = builder.CreateSparseInputNode(currentStreamInfo.m_name, inputTensorShape);
inputVariable = Variable(inputVariableShape, true, DataType::Float, currentStreamInfo.m_name);
}
else
{
inputNode = builder.CreateInputNode(currentStreamInfo.m_name, inputTensorShape);
inputVariable = Variable(inputVariableShape, DataType::Float, currentStreamInfo.m_name);
}
allInputNodes.push_back(inputNode);
streamToInputNodeMap[currentStreamInfo] = inputNode;
streamToDummyInputVariableMap[currentStreamInfo] = inputVariable;
streamToMeanNodeMap[currentStreamInfo] = builder.Mean(inputNode);
streamToInvStdDevNodeMap[currentStreamInfo] = builder.InvStdDev(inputNode);
}
computationNetwork->CompileNetwork();
computationNetwork->AllocateAllMatrices(computationNetwork->RootNodes(), {}, nullptr);
ScopedNetworkOperationMode modeGuard(computationNetwork, NetworkOperationMode::preComputing);
// initialize
auto preComputeNodes = computationNetwork->GetNodesRequiringPreComputation();
for (auto & preComputeNode : preComputeNodes)
dynamic_pointer_cast<IPreComputeNode>(preComputeNode)->MarkComputed(false /*begin accumulating*/);
const size_t maxMinibatchDataSize = (1 << 27); // 128 MB
const size_t minibatchSize = maxMinibatchDataSize / totalSizePerSample;
std::unordered_map<StreamInfo, std::pair<size_t, size_t>> minibatchSizeLimits;
for (auto& currentStreamKV : computedMeanAndInvStdDevs)
minibatchSizeLimits.insert(std::make_pair(currentStreamKV.first, std::make_pair((size_t)0, minibatchSize)));
for (;;)
{
auto minibatchData = minibatchSource->GetNextMinibatch(minibatchSizeLimits, device);
if (minibatchData.empty())
break;
for (auto& currentStreamKV : computedMeanAndInvStdDevs)
CompositeFunction::PopulateComputationNodeValue<float>({ streamToDummyInputVariableMap[currentStreamKV.first], minibatchData[currentStreamKV.first].m_data }, streamToInputNodeMap[currentStreamKV.first]);
ComputationNetwork::BumpEvalTimeStamp(allInputNodes);
computationNetwork->ForwardProp(preComputeNodes);
}
// finalize
for (auto & preComputeNode : preComputeNodes)
dynamic_pointer_cast<IPreComputeNode>(preComputeNode)->MarkComputed(true /*done accumulating*/);
// Copy out the results
for (auto& currentStreamKV : computedMeanAndInvStdDevs)
{
ValuePtr mean, invStdDev;
if (computedMeanAndInvStdDevs[currentStreamKV.first].first != nullptr)
mean = MakeSharedObject<Value>(computedMeanAndInvStdDevs[currentStreamKV.first].first);
if (computedMeanAndInvStdDevs[currentStreamKV.first].second != nullptr)
invStdDev = MakeSharedObject<Value>(computedMeanAndInvStdDevs[currentStreamKV.first].second);
CompositeFunction::GetNodeOutputOrGradient(streamToDummyInputVariableMap[currentStreamKV.first], mean, streamToMeanNodeMap[currentStreamKV.first], false /*getGradient*/);
CompositeFunction::GetNodeOutputOrGradient(streamToDummyInputVariableMap[currentStreamKV.first], invStdDev, streamToInvStdDevNodeMap[currentStreamKV.first], false /*getGradient*/);
if (computedMeanAndInvStdDevs[currentStreamKV.first].first == nullptr)
computedMeanAndInvStdDevs[currentStreamKV.first].first = mean->Data();
if (computedMeanAndInvStdDevs[currentStreamKV.first].second == nullptr)
computedMeanAndInvStdDevs[currentStreamKV.first].second = invStdDev->Data();
}
}
}


@ -0,0 +1,32 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "stdafx.h"
#include "CNTKLibrary.h"
#include "Utils.h"
#include "Reader.h"
namespace CNTK
{
class CompositeMinibatchSource final : public MinibatchSource
{
public:
CompositeMinibatchSource(const Dictionary& configuration);
virtual const std::unordered_set<StreamInfo>& StreamInfos() override { return m_streamInfos; }
virtual std::unordered_map<StreamInfo, MinibatchData> GetNextMinibatch(const std::unordered_map<StreamInfo, std::pair<size_t, size_t>>& perStreamMBSizeLimits,
const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice()) override;
private:
std::unordered_set<StreamInfo> m_streamInfos;
std::shared_ptr<Microsoft::MSR::CNTK::Reader> m_compositeDataReader;
bool m_epochEndReached;
size_t m_prevMinibatchSize;
size_t m_epochSize;
};
}


@ -316,7 +316,17 @@ namespace CNTK
}
template <typename ElementType>
NDArrayViewPtr NDArrayView::RandomUniform(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/)
/*static*/ NDArrayViewPtr NDArrayView::RandomNormal(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device /*= DeviceDescriptor::DefaultDevice()*/)
{
auto matrixDims = GetMatrixDimensions(shape);
auto randomNormalMatrix = std::make_shared<Matrix<ElementType>>(Matrix<ElementType>::RandomGaussian(matrixDims.first, matrixDims.second, AsCNTKImplDeviceId(device), (ElementType)mean, (ElementType)stdDev, seed));
auto tensorView = new TensorView<ElementType>(randomNormalMatrix, AsTensorShape(shape));
return MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), device, StorageFormat::Dense, shape, false, tensorView);
}
template <typename ElementType>
/*static*/ NDArrayViewPtr NDArrayView::RandomUniform(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/)
{
auto matrixDims = GetMatrixDimensions(shape);
auto randomUniformMatrix = std::make_shared<Matrix<ElementType>>(Matrix<ElementType>::RandomUniform(matrixDims.first, matrixDims.second, AsCNTKImplDeviceId(device), (ElementType)rangeBegin, (ElementType)rangeEnd, seed));
@ -329,6 +339,9 @@ namespace CNTK
template CNTK_API NDArrayViewPtr NDArrayView::RandomUniform<float>(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/);
template CNTK_API NDArrayViewPtr NDArrayView::RandomUniform<double>(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/);
template CNTK_API NDArrayViewPtr NDArrayView::RandomNormal<float>(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/);
template CNTK_API NDArrayViewPtr NDArrayView::RandomNormal<double>(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/);
template CNTK_API const float* NDArrayView::DataBuffer<float>() const;
template CNTK_API const double* NDArrayView::DataBuffer<double>() const;
@ -338,8 +351,10 @@ namespace CNTK
template std::shared_ptr<const Matrix<float>> NDArrayView::GetMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const;
template std::shared_ptr<const Matrix<double>> NDArrayView::GetMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const;
template std::shared_ptr<Matrix<float>> NDArrayView::GetWritableMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
template std::shared_ptr<Matrix<double>> NDArrayView::GetWritableMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
template std::shared_ptr<Matrix<float>> NDArrayView::GetWritableMatrix<float>(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
template std::shared_ptr<Matrix<double>> NDArrayView::GetWritableMatrix<double>(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
template TensorView<float>* NDArrayView::GetWritableTensorView<float>();
template TensorView<double>* NDArrayView::GetWritableTensorView<double>();
template CNTK_API NDArrayView::NDArrayView(const NDShape& viewShape, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const float* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly/* = false*/);
template CNTK_API NDArrayView::NDArrayView(const NDShape& viewShape, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const double* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly/* = false*/);


@ -81,6 +81,24 @@ namespace CNTK
GetMatrix()->SetValue(1);
}
size_t NDMask::MaskedCount() const
{
auto maskMatrix = GetMatrix();
std::unique_ptr<char[]> maskData(maskMatrix->CopyToArray());
return std::count_if(maskData.get(), maskData.get() + maskMatrix->GetNumElements(), [](const char& val) {
return val == 0;
});
}
// TODO: This could actually be strided?
const char* NDMask::DataBuffer() const
{
// First make sure that the underlying matrix is on the right device
auto matrix = GetMatrix();
matrix->TransferToDeviceIfNotThere(AsCNTKImplDeviceId(m_device), true);
return matrix->Data();
}
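// The mask is a Matrix<char> in which a value of 0 marks a masked (invalid) entry (MaskedCount above
// simply counts the zeros); DataBuffer returns the raw char buffer after transferring the matrix to the
// mask's own device.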
Matrix<char>* NDMask::GetMatrix() const
{
return m_matrixView.get();


@ -0,0 +1,78 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#include "CNTKLibrary.h"
#include "Utils.h"
namespace CNTK
{
Trainer::Trainer(const FunctionPtr& model, const Variable& trainingLoss, const std::unordered_set<LearnerPtr>& parameterLearners)
: m_model(model), m_trainingLossVar(trainingLoss), m_parameterLearners(parameterLearners)
{
auto modelParameters = model->Parameters();
std::unordered_set<Parameter> learnerParameters;
for (const auto& learner : parameterLearners)
{
const auto& currentLearnerParameters = learner->Parameters();
for (const auto& parameter : currentLearnerParameters)
{
auto insertRetVal = learnerParameters.insert(parameter);
if (!insertRetVal.second)
InvalidArgument("Trainer::Trainer: Parameter named %S is covered by 2 different learners", parameter.Name().c_str());
}
}
if (modelParameters != learnerParameters)
InvalidArgument("Trainer::Trainer: Union of the parameters covered by the specified parameterLearnes should match the specified model's parameters");
}
bool Trainer::TrainMinibatch(const std::unordered_map<Variable, ValuePtr>& arguments, const DeviceDescriptor& computeDevice /*= DeviceDescriptor::DefaultDevice()*/)
{
std::unordered_map<Variable, ValuePtr> outputs = { { m_trainingLossVar, nullptr } };
auto backPropState = m_model->Forward(arguments, outputs, computeDevice, { m_trainingLossVar });
m_prevMinibatchTrainingLossValue = outputs.begin()->second;
ValuePtr rootGradientValue = MakeSharedObject<Value>(MakeSharedObject<NDArrayView>(m_trainingLossVar.GetDataType(), outputs.at(m_trainingLossVar)->Data()->Shape(), computeDevice), outputs.at(m_trainingLossVar)->Mask());
if (m_trainingLossVar.GetDataType() == DataType::Float)
rootGradientValue->Data()->SetValue(1.0f);
else
rootGradientValue->Data()->SetValue(1.0);
auto modelParameters = m_model->Parameters();
std::unordered_map<Variable, ValuePtr> parameterGradients;
for (const auto& parameter : modelParameters)
parameterGradients[parameter] = nullptr;
m_model->Backward(backPropState, { { m_trainingLossVar, rootGradientValue } }, parameterGradients);
bool anyUpdatesPerformed = false;
for (auto learner : m_parameterLearners)
{
std::unordered_map<Parameter, NDArrayViewPtr> learnerParameterGradients;
const auto& learnerParameters = learner->Parameters();
for (const auto& parameter : learnerParameters)
{
learnerParameterGradients[parameter] = parameterGradients[parameter]->Data();
if (parameterGradients[parameter]->Mask())
LogicError("The gradient value for a Parameter cannot have an associated mask!");
}
auto trainingLossArguments = m_trainingLossVar.Owner()->Arguments();
auto labelsVar = *(std::find_if(trainingLossArguments.begin(), trainingLossArguments.end(), [](const Variable& var) {
return var.IsInput();
}));
auto argumentValue = arguments.at(labelsVar);
auto argumentData = argumentValue->Data();
auto argumentDataShape = argumentData->Shape();
auto mask = argumentValue->Mask();
size_t numSamples = argumentDataShape[argumentDataShape.NumAxes() - 1] - ((mask != nullptr) ? mask->MaskedCount() : 0);
anyUpdatesPerformed |= learner->Update(learnerParameterGradients, numSamples);
}
return anyUpdatesPerformed;
}
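// Illustrative training-loop sketch (the model/loss construction and the variable/stream names are
// assumed to exist elsewhere):
//
//   Trainer trainer(classifierOutput, trainingLoss, { SGDLearner(classifierOutput->Parameters(), 0.005) });
//   for (size_t i = 0; i < numMinibatches; ++i)
//   {
//       auto minibatch = minibatchSource->GetNextMinibatch(limits);
//       trainer.TrainMinibatch({ { featuresVar, minibatch[featureStreamInfo].m_data },
//                                { labelsVar, minibatch[labelStreamInfo].m_data } });
//   }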
}


@ -6,31 +6,162 @@
#include "stdafx.h"
#include "CNTKLibrary.h"
#include "Utils.h"
#include "File.h"
using namespace std;
namespace CNTK
{
template <typename T>
void DictionaryValue::AllocateDataPtr(const T& value)
{
static_assert(is_same<T, NDShape>::value ||
is_same<T, wstring>::value ||
is_same<T, vector<DictionaryValue>>::value ||
is_same<T, Dictionary>::value, "AllocateDataPtr called with invalid type");
m_data.m_ptr = new T(value);
}
template <typename T>
void DictionaryValue::FreePtrAsType()
{
T* typedPtr = reinterpret_cast<T*>(m_data.m_ptr);
delete typedPtr;
m_data.m_ptr = nullptr;
}
Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, DictionaryValue& us)
{
size_t version;
stream >> version;
stream >> us.m_valueType;
switch (us.ValueType())
{
case DictionaryValue::Type::Bool:
stream >> us.m_data.m_boolean;
break;
case DictionaryValue::Type::SizeT:
stream >> us.m_data.m_sizeT;
break;
case DictionaryValue::Type::Float:
stream >> us.m_data.m_float;
break;
case DictionaryValue::Type::Double:
stream >> us.m_data.m_double;
break;
case DictionaryValue::Type::NDShape:
{
size_t size;
stream >> size;
vector<size_t> dims(size);
for (auto i = 0; i < size; i++)
{
stream >> dims[i];
}
us.AllocateDataPtr(NDShape(dims));
break;
}
case DictionaryValue::Type::Vector:
{
size_t size;
stream >> size;
vector<DictionaryValue> values(size);
for (auto i = 0; i < size; i++)
{
stream >> values[i];
}
us.AllocateDataPtr(values);
break;
}
default:
NOT_IMPLEMENTED;
}
return stream;
}
Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const DictionaryValue& us)
{
stream << us.version;
stream << us.ValueType();
switch (us.ValueType())
{
case DictionaryValue::Type::Bool:
stream << us.m_data.m_boolean;
break;
case DictionaryValue::Type::SizeT:
stream << us.m_data.m_sizeT;
break;
case DictionaryValue::Type::Float:
stream << us.m_data.m_float;
break;
case DictionaryValue::Type::Double:
stream << us.m_data.m_double;
break;
case DictionaryValue::Type::NDShape:
{
NDShape* shapePtr = reinterpret_cast<NDShape*>(us.m_data.m_ptr);
auto size = shapePtr->NumAxes();
stream << size;
for (auto i = 0; i < size; i++)
{
stream << shapePtr->operator[](i);
}
break;
}
case DictionaryValue::Type::Vector:
{
vector<DictionaryValue>* vectorPtr =
reinterpret_cast<vector<DictionaryValue>*>(us.m_data.m_ptr);
auto size = vectorPtr->size();
stream << size;
for (auto i = 0; i < size; i++)
{
stream << vectorPtr->operator[](i);
}
break;
}
default:
NOT_IMPLEMENTED;
}
return stream;
}
Dictionary::Dictionary()
: m_dictionaryData(new std::unordered_map < std::wstring, DictionaryValue>)
: m_dictionaryData(new unordered_map <wstring, DictionaryValue>)
{
}
Dictionary::~Dictionary()
{
delete m_dictionaryData;
}
Dictionary::Dictionary(const Dictionary& other)
{
*this = other;
}
Dictionary& Dictionary::operator=(const Dictionary& other)
{
assert(this != &other);
m_dictionaryData.reset(new std::unordered_map<std::wstring, DictionaryValue>(*(other.m_dictionaryData)));
return *this;
}
Dictionary::Dictionary(Dictionary&& other)
: m_dictionaryData(nullptr)
{
*this = std::move(other);
*this = move(other);
}
Dictionary& Dictionary::operator=(Dictionary&& other)
{
assert(this != &other);
delete m_dictionaryData;
m_dictionaryData = other.m_dictionaryData;
other.m_dictionaryData = nullptr;
@ -51,4 +182,137 @@ namespace CNTK
{
return (m_dictionaryData->find(key) != m_dictionaryData->end());
}
Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const Dictionary& us)
{
stream << us.version;
stream << us.m_dictionaryData->size();
for (auto it = us.m_dictionaryData->begin(); it != us.m_dictionaryData->end(); ++it)
{
stream << it->first;
stream << it->second;
}
return stream;
}
Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, Dictionary& us)
{
size_t version;
stream >> version;
size_t size;
stream >> size;
us.m_dictionaryData->reserve(size);
for (auto i = 0; i < size; i++)
{
wstring key;
stream >> key;
DictionaryValue value;
stream >> value;
us.m_dictionaryData->insert(make_pair(key, value));
}
return stream;
}
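A minimal round-trip sketch for the Dictionary stream operators above (the file name and options are illustrative, and only value types covered by the switch statements, such as SizeT and Double, survive serialization):
// Not part of the commit: persist a dictionary and read it back.
Dictionary dict;
dict[L"epochSize"] = (size_t)4096;
dict[L"dropoutRate"] = 0.5;
{
    Microsoft::MSR::CNTK::File stream(L"checkpoint.bin", Microsoft::MSR::CNTK::FileOptions::fileOptionsBinary | Microsoft::MSR::CNTK::FileOptions::fileOptionsWrite);
    stream << dict;     // writes version, size, then each key/value pair
}
Dictionary restored;
{
    Microsoft::MSR::CNTK::File stream(L"checkpoint.bin", Microsoft::MSR::CNTK::FileOptions::fileOptionsBinary | Microsoft::MSR::CNTK::FileOptions::fileOptionsRead);
    stream >> restored; // restored now contains epochSize and dropoutRate
}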
template <typename T>
vector<DictionaryValue> SerializeToVector(const NDArrayViewPtr& viewPtr)
{
if (viewPtr->IsSparse())
{
LogicError("Sparse NDArrayView cannot be serialized into a vector.");
}
auto numElements = viewPtr->Shape().TotalSize();
vector<DictionaryValue> values(numElements);
NDArrayViewPtr cpuDataViewPtr = viewPtr;
if ((viewPtr->Device().Type() != DeviceKind::CPU))
{
cpuDataViewPtr = MakeSharedObject<NDArrayView>(viewPtr->GetDataType(), viewPtr->Shape(), DeviceDescriptor::CPUDevice());
cpuDataViewPtr->CopyFrom(*viewPtr);
}
const T* buffer = cpuDataViewPtr->DataBuffer<T>();
for (auto i = 0; i < numElements; ++i)
{
T v = buffer[i];
values[i] = DictionaryValue(v);
}
return values;
}
template <typename T>
void DeserializeFromVector(const NDArrayViewPtr& viewPtr, const vector<DictionaryValue>& values)
{
if (viewPtr->IsSparse())
{
LogicError("Sparse NDArrayView cannot be deserialized from a vector.");
}
auto numElements = viewPtr->Shape().TotalSize();
if (values.size() != numElements)
{
LogicError("Number of elements (%lu) in the deserialized representation does not match the expected value (%lu)",
values.size(), numElements);
}
NDArrayViewPtr cpuDataViewPtr = viewPtr;
if ((viewPtr->Device().Type() != DeviceKind::CPU))
{
cpuDataViewPtr = MakeSharedObject<NDArrayView>(viewPtr->GetDataType(), viewPtr->Shape(), DeviceDescriptor::CPUDevice());
}
T* buffer = cpuDataViewPtr->WritableDataBuffer<T>();
for (auto i = 0; i < numElements; ++i)
{
buffer[i] = values[i].GetValue<T>();
}
if ((viewPtr->Device().Type() != DeviceKind::CPU))
{
viewPtr->CopyFrom(*cpuDataViewPtr);
}
}
// TODO: we store the type info for every element in the vector, which is extremely redundant.
// Instead, it'd be nice to introduce some sort of DictionaryValueVector.
vector<DictionaryValue> SerializeToVector(const NDArrayViewPtr& viewPtr)
{
switch (viewPtr->GetDataType())
{
case DataType::Float:
return SerializeToVector<float>(viewPtr);
case DataType::Double:
return SerializeToVector<double>(viewPtr);
default:
LogicError("Unsupported DataType %s", DataTypeName(viewPtr->GetDataType()));
}
}
void DeserializeFromVector(const NDArrayViewPtr& viewPtr, const vector<DictionaryValue>& values)
{
switch (viewPtr->GetDataType())
{
case DataType::Float:
DeserializeFromVector<float>(viewPtr, values);
break;
case DataType::Double:
DeserializeFromVector<double>(viewPtr, values);
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(viewPtr->GetDataType()));
}
}
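A minimal sketch of how the two dispatchers above might be used together (shape and fill value invented for the example):
// Not part of the commit: round-trip a small dense CPU NDArrayView through DictionaryValues.
auto view = MakeSharedObject<NDArrayView>(DataType::Float, NDShape(std::vector<size_t>{ 2, 3 }), DeviceDescriptor::CPUDevice());
view->SetValue(1.0f);                                         // fill the 2 x 3 view with ones
std::vector<DictionaryValue> flat = SerializeToVector(view);  // six Float-typed entries
DeserializeFromVector(view, flat);                            // writes the same six values back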
template void DictionaryValue::AllocateDataPtr<NDShape>(const NDShape& value);
template void DictionaryValue::AllocateDataPtr<vector<DictionaryValue>>(const vector<DictionaryValue>& value);
template void DictionaryValue::AllocateDataPtr<wstring>(const wstring& value);
template void DictionaryValue::AllocateDataPtr<Dictionary>(const Dictionary& value);
template void DictionaryValue::FreePtrAsType<NDShape>();
template void DictionaryValue::FreePtrAsType<vector<DictionaryValue>>();
template void DictionaryValue::FreePtrAsType<wstring>();
template void DictionaryValue::FreePtrAsType<Dictionary>();
}

Просмотреть файл

@ -9,251 +9,15 @@
#include "CommonMatrix.h"
#include "TensorShape.h"
#include <string>
#include "Config.h"
#include "Reader.h"
#include "ConvolutionEngine.h"
namespace CNTK
{
// Forward declarations
class Dictionary;
class DictionaryValue
{
public:
enum class Type : unsigned int
{
None,
Bool,
SizeT,
Double,
NDShape,
Vector
};
static const char* TypeName(Type type)
{
if (type == Type::None)
return "None";
else if (type == Type::Bool)
return "Bool";
else if (type == Type::SizeT)
return "SizeT";
else if (type == Type::Double)
return "Double";
else if (type == Type::NDShape)
return "NDShape";
else if (type == Type::Vector)
return "Vector";
else
LogicError("Unknown DictionaryValue::Type");
}
public:
DictionaryValue()
: m_valueType(Type::None)
{
}
DictionaryValue(bool value)
: m_valueType(GetValueType<bool>())
{
m_data.m_boolean = value;
}
DictionaryValue(size_t value)
: m_valueType(GetValueType<size_t>())
{
m_data.m_sizeT = value;
}
DictionaryValue(double value)
: m_valueType(GetValueType<double>())
{
m_data.m_double = value;
}
template <typename T>
DictionaryValue(const T& value)
: m_valueType(GetValueType<T>())
{
static_assert(std::is_same<T, NDShape>::value ||
std::is_same<T, std::vector<DictionaryValue>>::value,
"Unsupported ValueType");
AllocateDataPtr(value);
}
DictionaryValue(const DictionaryValue& other)
: m_valueType(Type::Bool)
{
// The m_valueType must have been set to a non-ptr type to prevent an attempt to interpret
// the underlying uninitialized value as a ptr and free it.
*this = other;
}
DictionaryValue& operator=(const DictionaryValue& other)
{
if (this != &other)
{
FreeDataPtr();
m_valueType = other.m_valueType;
m_data = other.m_data;
if (other.m_valueType == Type::NDShape)
AllocateDataPtr(other.GetValue<NDShape>());
else if (other.m_valueType == Type::Vector)
AllocateDataPtr(other.GetValue<std::vector<DictionaryValue>>());
}
return *this;
}
~DictionaryValue()
{
FreeDataPtr();
}
template <typename T, typename std::enable_if<std::is_same<T, bool>::value>::type* = nullptr>
const T& GetValue() const
{
VerifyType<T>();
return m_data.m_boolean;
}
template <typename T, typename std::enable_if<std::is_same<T, size_t>::value>::type* = nullptr>
const T& GetValue() const
{
VerifyType<T>();
return m_data.m_sizeT;
}
template <typename T, typename std::enable_if<std::is_same<T, double>::value>::type* = nullptr>
const T& GetValue() const
{
VerifyType<T>();
return m_data.m_double;
}
template <typename T, typename std::enable_if<std::is_same<T, NDShape>::value || std::is_same<T, std::vector<DictionaryValue>>::value>::type* = nullptr>
const T& GetValue() const
{
VerifyType<T>();
return *(reinterpret_cast<T*>(m_data.m_ptr));
}
bool HasValue() const
{
return m_valueType != Type::None;
}
Type ValueType() const
{
return m_valueType;
}
private:
template <typename T>
static Type GetValueType()
{
static_assert(std::is_same<T, bool>::value ||
std::is_same<T, size_t>::value ||
std::is_same<T, double>::value ||
std::is_same<T, NDShape>::value ||
std::is_same<T, std::vector<DictionaryValue>>::value ||
std::is_same<T, CNTK::Dictionary>::value,
"Unsupported ValueType");
if (std::is_same<T, bool>::value)
return Type::Bool;
else if (std::is_same<T, size_t>::value)
return Type::SizeT;
else if (std::is_same<T, double>::value)
return Type::Double;
else if (std::is_same<T, NDShape>::value)
return Type::NDShape;
else if (std::is_same<T, std::vector<DictionaryValue>>::value)
return Type::Vector;
}
template <typename T>
void VerifyType() const
{
if (GetValueType<T>() != m_valueType)
RuntimeError("Reading a DictionaryValue as the wrong type; Reading as type %s when actual type is %s", typeid(T).name(), DictionaryValue::TypeName(m_valueType));
}
template <typename T>
void AllocateDataPtr(const T& value)
{
static_assert(std::is_same<T, NDShape>::value || std::is_same<T, std::vector<DictionaryValue>>::value, "AllocateDataPtr called with invalid type");
m_data.m_ptr = new T(value);
}
template <typename T>
void FreePtrAsType()
{
T* typedPtr = reinterpret_cast<T*>(m_data.m_ptr);
delete typedPtr;
m_data.m_ptr = nullptr;
}
void FreeDataPtr()
{
if (m_valueType == Type::NDShape)
FreePtrAsType<NDShape>();
else if (m_valueType == Type::Vector)
FreePtrAsType<std::vector<DictionaryValue>>();
}
private:
Type m_valueType;
union ValueData
{
bool m_boolean;
size_t m_sizeT;
double m_double;
void* m_ptr;
} m_data;
};
class Dictionary
{
public:
Dictionary();
~Dictionary();
// Disallow copy construction and assignment
Dictionary(const Dictionary&) = delete; Dictionary& operator=(const Dictionary&) = delete;
Dictionary(Dictionary&& other);
Dictionary& operator=(Dictionary&& other);
DictionaryValue& operator[](const std::wstring& key)
{
return operator[](key.c_str());
}
DictionaryValue& operator[](const wchar_t* key);
DictionaryValue operator[](const std::wstring& key) const
{
return operator[](key.c_str());
}
DictionaryValue operator[](const wchar_t* key) const;
bool Contains(const std::wstring& key) const
{
return Contains(key.c_str());
}
bool Contains(const wchar_t* key) const;
private:
std::unordered_map<std::wstring, DictionaryValue>* m_dictionaryData;
};
// Helper to get the size of an element of the specified DataType
inline size_t ElementSize(DataType dataType)
{
@ -317,14 +81,53 @@ namespace CNTK
LogicError("Unknown DataType");
}
inline Microsoft::MSR::CNTK::TensorShape AsTensorShape(const NDShape& viewShape)
inline NDShape AsNDShape(const Microsoft::MSR::CNTK::TensorShape& tensorShape)
{
// The TensorShape should be flattenable to 1D
for (size_t i = 1; i < tensorShape.GetRank(); ++i)
{
if (!tensorShape.CanFlatten(i))
InvalidArgument("AsNDShape() can only be called for TensorShapes that can be flattened to 1D");
}
return std::vector<size_t>(tensorShape.GetDims().begin(), tensorShape.GetDims().end());
}
inline DataType AsDataType(Microsoft::MSR::CNTK::ElementType readerDataType)
{
switch (readerDataType)
{
case Microsoft::MSR::CNTK::ElementType::tfloat:
return DataType::Float;
case Microsoft::MSR::CNTK::ElementType::tdouble:
return DataType::Double;
default:
LogicError("Unsupported ElementType from CNTK Reader");
}
}
inline StorageFormat AsStorageFormat(Microsoft::MSR::CNTK::StorageType readerStorageType)
{
switch (readerStorageType)
{
case Microsoft::MSR::CNTK::StorageType::dense:
return StorageFormat::Dense;
case Microsoft::MSR::CNTK::StorageType::sparse_csc:
return StorageFormat::SparseCSC;
default:
LogicError("Unsupported StorageType from CNTK Reader");
}
}
inline Microsoft::MSR::CNTK::TensorShape AsTensorShape(const NDShape& viewShape, bool preserveRank = false)
{
const size_t maxNumAxesSupportedByTensorView = 12;
if (viewShape.NumAxes() > maxNumAxesSupportedByTensorView)
LogicError("The number of requested axes exceeds the currently supported limit");
// TensorShape is required to be at least 2D
Microsoft::MSR::CNTK::SmallVector<size_t> tensorViewShape(std::max<size_t>(2, viewShape.NumAxes()));
size_t minRankSize = preserveRank ? viewShape.NumAxes() : 2;
Microsoft::MSR::CNTK::SmallVector<size_t> tensorViewShape(std::max<size_t>(minRankSize, viewShape.NumAxes()));
for (size_t i = 0; i < tensorViewShape.size(); ++i)
tensorViewShape[i] = (i < viewShape.NumAxes()) ? viewShape[i] : 1;
@ -363,4 +166,151 @@ namespace CNTK
{
return var.IsInput() && var.IsSparse();
}
std::vector<DictionaryValue> SerializeToVector(const NDArrayViewPtr& viewPtr);
void DeserializeFromVector(const NDArrayViewPtr& viewPtr, const std::vector<DictionaryValue>& values);
inline void AddIndentation(std::wstringstream& s, size_t numIndentationSpaces)
{
for (size_t i = 0; i < numIndentationSpaces; ++i)
s << L" ";
}
static const size_t perLevelIndentSize = 4;
inline void AddConfigString(std::wstringstream& s, const std::wstring& key, const DictionaryValue& value, size_t numIndentationSpaces);
inline void AddConfigString(std::wstringstream& s, const DictionaryValue& value, size_t numIndentationSpaces)
{
switch (value.ValueType())
{
case DictionaryValue::Type::Bool:
s << value.GetValue<bool>();
break;
case DictionaryValue::Type::Float:
s << value.GetValue<float>();
break;
case DictionaryValue::Type::Double:
s << value.GetValue<double>();
break;
case DictionaryValue::Type::String:
s << value.GetValue<std::wstring>();
break;
case DictionaryValue::Type::SizeT:
s << value.GetValue<size_t>();
break;
case DictionaryValue::Type::Vector:
{
const auto& valueVector = value.GetValue<std::vector<DictionaryValue>>();
s << L"(" << std::endl;
AddIndentation(s, numIndentationSpaces + perLevelIndentSize);
bool isFirst = true;
for (const auto& val : valueVector)
{
if (!isFirst)
s << L":";
else
isFirst = false;
AddConfigString(s, val, numIndentationSpaces + perLevelIndentSize);
}
AddIndentation(s, numIndentationSpaces);
s << L")";
break;
}
case DictionaryValue::Type::Dictionary:
{
const auto& valueDictionary = value.GetValue<Dictionary>();
s << L"[" << std::endl;
for (const auto& keyValuePair : *(valueDictionary.m_dictionaryData))
{
AddConfigString(s, keyValuePair.first, keyValuePair.second, numIndentationSpaces + perLevelIndentSize);
}
AddIndentation(s, numIndentationSpaces);
s << L"]";
break;
}
default:
LogicError("Unsupported DictionaryValue type");
}
}
inline void AddConfigString(std::wstringstream& s, const std::wstring& key, const DictionaryValue& value, size_t numIndentationSpaces)
{
static const size_t perLevelIndentSize = 4;
AddIndentation(s, numIndentationSpaces);
s << key << L" = ";
AddConfigString(s, value, numIndentationSpaces);
s << std::endl;
}
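As a hypothetical illustration of the text these helpers emit (not part of the commit), a dictionary holding the entries rank = 2 and dims = (3:4) would be rendered roughly as:
rank = 2
dims = (
    3:4)
with nested dictionaries wrapped in [ ... ] and each nesting level indented by four more spaces.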
template <typename T>
inline std::vector<DictionaryValue> AsDictionaryValueVector(const std::vector<T>& basicElementTypeVector)
{
static_assert(std::is_same<T, bool>::value ||
std::is_same<T, size_t>::value ||
std::is_same<T, float>::value ||
std::is_same<T, double>::value, "Unsupported ValueType");
std::vector<DictionaryValue> dictionaryValueVector;
for (auto value : basicElementTypeVector)
dictionaryValueVector.push_back(value);
return dictionaryValueVector;
}
template <typename T>
inline std::vector<T> AsBasicElementTypeVector(const std::vector<DictionaryValue>& dictionaryValueVector)
{
static_assert(std::is_same<T, bool>::value ||
std::is_same<T, size_t>::value ||
std::is_same<T, float>::value ||
std::is_same<T, double>::value, "Unsupported ValueType");
std::vector<T> basicElementTypeVector;
for (auto value : dictionaryValueVector)
basicElementTypeVector.push_back(value.GetValue<T>());
return basicElementTypeVector;
}
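A minimal round-trip sketch for the two conversion helpers above (the values are illustrative):
std::vector<double> momentums = { 0.9, 0.95, 0.99 };
std::vector<DictionaryValue> packed = AsDictionaryValueVector(momentums);  // each element becomes a Double-typed DictionaryValue
std::vector<double> unpacked = AsBasicElementTypeVector<double>(packed);   // equal to momentums again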
inline PoolingType AsPoolingType(Microsoft::MSR::CNTK::PoolKind cntkPoolingKind)
{
switch (cntkPoolingKind)
{
case Microsoft::MSR::CNTK::PoolKind::Average:
return PoolingType::Average;
case Microsoft::MSR::CNTK::PoolKind::Max:
return PoolingType::Max;
default:
LogicError("Unknown pooling type");
}
}
inline Microsoft::MSR::CNTK::PoolKind AsCNTKPoolKind(PoolingType poolingType)
{
switch (poolingType)
{
case PoolingType::Average:
return Microsoft::MSR::CNTK::PoolKind::Average;
case PoolingType::Max:
return Microsoft::MSR::CNTK::PoolKind::Max;
default:
LogicError("Unknown pooling type");
}
}
inline std::pair<NDShape, NDShape> GetConvolutionOutputMapCountAndKernelShape(const NDShape& convolutionMapShape, const NDShape& operandShape)
{
auto outputMapCount = convolutionMapShape.SubShape(0, convolutionMapShape.NumAxes() - operandShape.NumAxes());
NDShape paddedOutputMapCount(operandShape.NumAxes(), 1);
for (size_t i = 0; i < outputMapCount.NumAxes(); ++i)
paddedOutputMapCount[paddedOutputMapCount.NumAxes() - 1 - i] = outputMapCount[outputMapCount.NumAxes() - 1 - i];
//for (size_t i = 0; i < outputMapCount.NumAxes(); ++i)
// paddedOutputMapCount[i] = outputMapCount[i];
NDShape kernelShape = convolutionMapShape.SubShape(outputMapCount.NumAxes());
return{ paddedOutputMapCount, kernelShape };
}
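A hypothetical worked example for the helper above (shapes invented; the layout assumed here follows the function itself, with the leading axes giving the output map count and the trailing axes the kernel):
// convolutionMapShape = [64 x 3 x 3 x 3] (rank 4), operandShape = [224 x 224 x 3] (rank 3)
// outputMapCount       = SubShape(0, 4 - 3) -> [64]
// kernelShape          = SubShape(1)        -> [3 x 3 x 3]
// paddedOutputMapCount = [1 x 1 x 64]       (the map count lands on the last of the operand's axes)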
}

Просмотреть файл

@ -84,9 +84,15 @@ __declspec_noreturn static inline void ThrowFormatted(const char* format, ...)
// RuntimeError - throw a std::runtime_error with a formatted error string
#ifndef _MSC_VER // gcc __attribute__((format(printf())) does not percolate through variadic templates; so must go the macro route
#ifndef RuntimeError
#define RuntimeError ThrowFormatted<std::runtime_error>
#endif
#ifndef LogicError
#define LogicError ThrowFormatted<std::logic_error>
#endif
#ifndef InvalidArgument
#define InvalidArgument ThrowFormatted<std::invalid_argument>
#endif
#else
template <class... _Types>
__declspec_noreturn static inline void RuntimeError(const char* format, _Types&&... _Args)
@ -127,13 +133,11 @@ static inline void Warning(const string& message)
\
{ \
fprintf(stderr, "Inside File: %s Line: %d Function: %s -> Feature Not Implemented.\n", __FILE__, __LINE__, __FUNCTION__); \
LogicError("Inside File: %s Line: %d Function: %s -> Feature Not Implemented.\n", __FILE__, __LINE__, __FUNCTION__); \
LogicError("Inside File: %s Line: %d Function: %s -> Feature Not Implemented.", __FILE__, __LINE__, __FUNCTION__); \
\
}
#endif
}
}
}
}}}
#ifndef _MSC_VER
using Microsoft::MSR::CNTK::ThrowFormatted;
@ -579,6 +583,60 @@ struct nocase_compare
// random collection of stuff we needed at some place
// ----------------------------------------------------------------------------
// Array class
template <class T>
class ArrayRef
{
T* elements; // Array of type T
size_t count;
public:
ArrayRef(T* elementsIn, size_t sizeIn)
{
elements = elementsIn;
count = sizeIn;
}
// TODO: Copy Constructor
ArrayRef(const ArrayRef& other) = delete;
// TODO: Move Constructor
ArrayRef(ArrayRef&& other) = delete;
// TODO: Assignment operator
ArrayRef& operator=(const ArrayRef& rhs) = delete;
// TODO: Move assignment operator
ArrayRef& operator=(ArrayRef&& rhs) = delete;
size_t size() const { return count; }
T* data() const { return elements; }
T operator[](size_t i) const
{
if (i >= size())
LogicError("ArrayRef: index overflow");
return elements[i];
}
T& operator[](size_t i)
{
if (i >= count)
LogicError("ArrayRef: index overflow");
return elements[i];
}
const T* begin() const
{
return data();
}
const T* end() const
{
return data() + size();
}
};
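A minimal usage sketch for ArrayRef (the buffer is illustrative; ArrayRef is a non-owning view, so the vector must outlive it):
std::vector<float> activations(128, 0.0f);
ArrayRef<float> view(activations.data(), activations.size());
float total = 0.0f;
for (float v : view)   // begin()/end() above make range-for work
    total += v;
view[0] = 1.0f;        // bounds-checked element access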
// TODO: maybe change to type id of an actual thing we pass in
// TODO: is this header appropriate?
template <class C>

Просмотреть файл

@ -988,11 +988,10 @@ public:
return defaultValue;
}
ConfigValue Find(const std::string& name,
const char* defaultvalue = NULL) const
// Look up a variable through the nested hierarchy. If not found, return false, and 'result' is untouched.
bool TryFind(const std::string& name, ConfigValue& result, const char* defaultvalue = NULL) const
{
auto iter = find(name);
ConfigValue result;
// if we aren't found, or they want the default value
// TODO: What the hell is this?
@ -1002,13 +1001,15 @@ public:
if (iter == end() && m_parent != NULL)
{
result = m_parent->Find(name, defaultvalue);
return true;
}
else if (defaultvalue != NULL)
{
// no parent, so use default value
std::string fullName = m_configName + ":" + name;
result = ConfigValue(defaultvalue, fullName, this);
}
return true;
}
}
else
{
@ -1016,10 +1017,19 @@ public:
rhs = this->ResolveVariables(rhs);
std::string fullName = m_configName + ":" + name;
result = ConfigValue(rhs, fullName, this);
}
return result;
return true;
}
return false; // not found
}
// Look up a variable using TryFind() above. If not found, return empty string.
ConfigValue Find(const std::string& name, const char* defaultvalue = NULL) const
{
ConfigValue result;
TryFind(name, result, defaultvalue); // (if returns false, we return an empty ConfigValue)
return result;
}
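A small sketch of how the two lookups above differ (the config instance and key are illustrative):
ConfigValue lr;
bool found = config.TryFind("learningRate", lr);  // explicit hit/miss signal, nothing thrown
if (!found)
    lr = config.Find("learningRate", "0.01");     // legacy path: empty result or the supplied default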
// ResolveVariablesInSingleLine - In this method we replace all substrings of 'configLine' of the form "$varName$"
// (where varName is a variable name), with the value of the "varName" variable in config.
// We search up the config tree for the value, and we throw an error if we don't find it.
@ -1037,10 +1047,7 @@ public:
{
// ensure that this method was called on a single line (eg, no newline characters exist in 'configLine').
if (configLine.find_first_of("\n") != std::string::npos)
{
LogicError(
"\"ResolveVariablesInSingleLine\" shouldn't be called with a string containing a newline character");
}
LogicError("ResolveVariablesInSingleLine() should not be called with a string containing a newline character");
std::string newConfigLine = StripComments(configLine);
std::size_t start = newConfigLine.find_first_of(openBraceVar);
@ -1073,27 +1080,25 @@ public:
// in nested dictionaries, this is not working.
if (varName.empty())
{
RuntimeError("$$ is not allowed. Parsing of string failed: %s:%s",
RuntimeError("$$ is not allowed. Parsing of string failed: %s:%s",
m_configName.c_str(),
newConfigLine.c_str());
}
// Note that this call to "Find" can trigger further substitutions of the form $varName2$ -> varValue2,
// thus making this search process recursive.
std::string varValue = this->Find(varName);
if (varValue.empty())
ConfigValue varConfigValue;
const bool foundValue = this->TryFind(varName, varConfigValue);
if (!foundValue)
{
RuntimeError("No variable found with the name %s. Parsing of string failed: %s:%s",
RuntimeError("No variable found with the name %s. Parsing of string failed: %s:%s",
varName.c_str(), m_configName.c_str(),
newConfigLine.c_str());
}
if (varValue.find_first_of("\n") != std::string::npos)
{
LogicError(
"Newline character cannot be contained in the value of a variable which is resolved using $varName$ feature");
}
std::string varValue = varConfigValue;
if (varValue.find_first_of("\n") != std::string::npos)
LogicError("Newline characters are not allowed in the value of a variable which is resolved using $varName$ feature");
// Replace $varName$ with 'varValue'. Then continue the search for
// other variables in 'newConfigLine' string, starting at the point

Просмотреть файл

@ -282,7 +282,7 @@ class VariableSchema : public std::vector<VariableLayout>
Values<ElemType> CreateBuffers(const std::vector<size_t>& maxLengths)
{
if (maxLengths.size() != size())
throw std::exception("Expected max lengths for all variables.");
throw std::runtime_error("Expected max lengths for all variables.");
Values<ElemType> buffers(size());
for (size_t i = 0; i < size(); ++i)

Просмотреть файл

@ -134,4 +134,5 @@ public:
return randomizationrange == randomizeDisable;
}
};
} } }
}}}

Просмотреть файл

@ -29,7 +29,8 @@ public:
runtime_error(msg)
{
}
virtual void PrintError(const std::wstring& linePrefix) const = 0;
virtual std::wstring GetError(const std::wstring& /*linePrefix*/) const = 0;
virtual void PrintError(const std::wstring& /*linePrefix*/) const = 0;
};
// -----------------------------------------------------------------------
@ -619,9 +620,9 @@ public:
{
}
// ConfigArray(ConfigValuePtr && val) : firstIndex(0), values(std::vector<ConfigValuePtr>{ move(val) }) { }
pair<int, int> GetIndexRange() const
pair<int, int> GetIndexBeginEnd() const
{
return make_pair(firstIndex, firstIndex + (int) values.size() - 1);
return make_pair(firstIndex, firstIndex + (int)values.size());
}
// for use as a plain array: get size and verify that index range starts with 0
template <typename FAILFN>

Просмотреть файл

@ -411,7 +411,7 @@ static inline void byteswap(V &v) throw()
// execute a block with retry
// Block must be restartable.
// Use this when writing small files to those unreliable Windows servers.
// Use this when writing/reading small files to those unreliable Windows servers.
// TODO: This will fail to compile under VS 2008--we need an #ifdef around this
template <typename FUNCTION>
static void attempt(int retries, const FUNCTION &body)

Просмотреть файл

@ -592,7 +592,8 @@ void fgetfile(const std::wstring& pathname, std::vector<char>& buffer);
void fgetfile(FILE* f, std::vector<char>& buffer);
namespace msra { namespace files {
void fgetfilelines(const std::wstring& pathname, std::vector<char>& readbuffer, std::vector<std::string>& lines);
void fgetfilelines(const std::wstring& pathname, std::vector<char>& readbuffer, std::vector<std::string>& lines, int numberOfTries = 1);
static inline std::vector<std::string> fgetfilelines(const std::wstring& pathname)
{
std::vector<char> buffer;
@ -600,7 +601,7 @@ static inline std::vector<std::string> fgetfilelines(const std::wstring& pathnam
fgetfilelines(pathname, buffer, lines);
return lines;
}
std::vector<char*> fgetfilelines(const std::wstring& pathname, std::vector<char>& readbuffer);
std::vector<char*> fgetfilelines(const std::wstring& pathname, std::vector<char>& readbuffer, int numberOfTries = 1);
}}

Просмотреть файл

@ -1251,7 +1251,7 @@ public:
// BUGBUG: we only really support one archive file at this point
// read the TOC in one swoop
std::vector<char> textbuffer;
auto toclines = msra::files::fgetfilelines(tocpath, textbuffer);
auto toclines = msra::files::fgetfilelines(tocpath, textbuffer, 3);
// parse it one by one
size_t archiveindex = SIZE_MAX; // its index

Просмотреть файл

@ -16,6 +16,7 @@
#endif
#define _CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES 1
#include "Basics.h"
#include "basetypes.h" //for attemp()
#include "fileutil.h"
#include "ProgressTracing.h"
@ -1632,6 +1633,11 @@ static size_t fgetfilechars(const std::wstring& path, vector<char>& buffer)
return len;
}
static void fgetfilechars(const std::wstring& path, vector<char>& buffer, size_t& len)
{
len = fgetfilechars(path, buffer);
}
template <class LINES>
static void strtoklines(char* s, LINES& lines)
{
@ -1639,10 +1645,14 @@ static void strtoklines(char* s, LINES& lines)
lines.push_back(p);
}
void msra::files::fgetfilelines(const std::wstring& path, vector<char>& buffer, std::vector<std::string>& lines)
void msra::files::fgetfilelines(const std::wstring& path, vector<char>& buffer, std::vector<std::string>& lines, int numberOfTries)
{
// load it into RAM in one huge chunk
const size_t len = fgetfilechars(path, buffer);
size_t len = 0;
msra::util::attempt(numberOfTries, [&]() // (can be reading from network)
{
// load it into RAM in one huge chunk
fgetfilechars(path, buffer, len);
});
// parse into lines
lines.resize(0);
@ -1651,11 +1661,15 @@ void msra::files::fgetfilelines(const std::wstring& path, vector<char>& buffer,
}
// same as above but returning const char* (avoiding the memory allocation)
vector<char*> msra::files::fgetfilelines(const wstring& path, vector<char>& buffer)
vector<char*> msra::files::fgetfilelines(const wstring& path, vector<char>& buffer, int numberOfTries)
{
// load it into RAM in one huge chunk
const size_t len = fgetfilechars(path, buffer);
size_t len = 0;
msra::util::attempt(numberOfTries, [&]() // (can be reading from network)
{
// load it into RAM in one huge chunk
fgetfilechars(path, buffer, len);
});
// parse into lines
vector<char*> lines;
lines.reserve(len / 20);

Просмотреть файл

@ -18,6 +18,7 @@
#include "PreComputeNodes.h"
#include "EvaluationNodes.h"
#include "SpecialPurposeNodes.h"
#include "DeprecatedNodes.h" // (for SaveToDbnFile(), which is also deprecated)
#include "MPIWrapper.h" // TODO: does not belong here
#include <string>
#include <vector>
@ -391,13 +392,38 @@ void ComputationNetwork::Read(const wstring& fileName)
// node construction
// -----------------------------------------------------------------------
// non-static version needed because it accesses m_randomSeedOffset
// Excessively used by SimpleNetworkBuilder, but always after CreateLearnableParameter(), so we should really absorb it there
template <class ElemType>
void ComputationNetwork::InitLearnableParameters(const ComputationNodeBasePtr& node, const bool uniformInit, const unsigned long randomSeed, const ElemType initValueScale, bool initOnCPUOnly)
// helper of InitLearnableParameters()
// Note: This should really be done through an interface without <ElemType> that LearnableParameter would derive from.
// However, this is only for NDL (which is deprecated), so I'd rather not pollute the code with more interfaces just for a deprecated cause.
template<class ElemType>
static bool TryPostInitParameters(const ComputationNodeBasePtr& node, const wchar_t* initString, double initValue, unsigned long randomSeed, bool initOnCPUOnly)
{
auto learnableParameterNode = dynamic_pointer_cast<LearnableParameter<ElemType>>(node);
learnableParameterNode->InitRandom(uniformInit, randomSeed + GetRandomSeedOffset(), initValueScale, initOnCPUOnly);
if (!learnableParameterNode)
return false;
learnableParameterNode->PostInitParameters(initString, (ElemType) initValue, randomSeed, initOnCPUOnly);
return true;
}
// non-static version needed because it accesses m_randomSeedOffset
void ComputationNetwork::InitLearnableParameters(const ComputationNodeBasePtr& node,
const wchar_t* initString, // "uniform"|"gaussian"|"fixedValue"
double initValue, // scale | scale | value
unsigned long randomSeed /*= 0*/,
bool initOnCPUOnly /*= false*/) const
{
randomSeed += GetRandomSeedOffset();
if (TryPostInitParameters<float> (node, initString, initValue, randomSeed, initOnCPUOnly) ||
TryPostInitParameters<double>(node, initString, initValue, randomSeed, initOnCPUOnly))
return;
LogicError("InitLearnableParameters: Input node is not a LearnableParameter<float or double>");
}
// non-static version needed because it accesses m_randomSeedOffset
// Legacy version that is for random only.
void ComputationNetwork::RandomInitLearnableParameters(const ComputationNodeBasePtr& node, const bool uniformInit, const unsigned long randomSeed, const double initValueScale, bool initOnCPUOnly) const
{
InitLearnableParameters(node, uniformInit ? L"uniform" : L"gaussian", initValueScale, randomSeed, initOnCPUOnly);
}
bool ComputationNetwork::IsTypicalCriterionNode(ComputationNodeBasePtr nodePtr)
@ -714,35 +740,22 @@ void ComputationNetwork::DescribeNetworkUsingDot(list<ComputationArc>& arcs,
File fstream(outFile, FileOptions::fileOptionsText | FileOptions::fileOptionsWrite);
// get precompute node
vector<ComputationNodeBasePtr> PreComputedNodes;
vector<ComputationNodeBasePtr> preComputedNodes;
vector<ComputationNodeBasePtr> pastValueNodes;
vector<ComputationNodeBasePtr> futureValueNodes;
vector<ComputationNodeBasePtr> learnableParameters;
vector<ComputationNodeBasePtr> allnodes = GetAllNodes();
for (const auto& n : allnodes)
{
if (n->RequiresPreCompute())
PreComputedNodes.push_back(n);
}
preComputedNodes.push_back(n);
// get PastValue node
vector<ComputationNodeBasePtr> pastValueNodes;
for (const auto& n : allnodes)
{
if (n->OperationName() == OperationNameOf(PastValueNode) || n->OperationName() == L"Delay")
const auto operationName = n->OperationName();
if (operationName == OperationNameOf(PastValueNode) || operationName == L"Delay"/*legacy*/)
pastValueNodes.push_back(n);
}
// get FuturetValue node
vector<ComputationNodeBasePtr> futureValueNodes;
for (const auto& n : allnodes)
{
if (n->OperationName() == OperationNameOf(FutureValueNode))
else if (operationName == OperationNameOf(FutureValueNode))
futureValueNodes.push_back(n);
}
// get learnableParameters
vector<ComputationNodeBasePtr> learnableParameters;
for (const auto& n : allnodes)
{
if (n->OperationName() == OperationNameOf(LearnableParameter))
else if (operationName == OperationNameOf(LearnableParameter))
learnableParameters.push_back(n);
}
@ -763,7 +776,7 @@ void ComputationNetwork::DescribeNetworkUsingDot(list<ComputationArc>& arcs,
// criteria
fstream << FormSpecialNodes(dotcfg.m_CriteriaStyle, m_criterionNodes);
// pre-compute nodes
fstream << FormSpecialNodes(dotcfg.m_PrecomputingNodeStyle, PreComputedNodes);
fstream << FormSpecialNodes(dotcfg.m_PrecomputingNodeStyle, preComputedNodes);
// PastValue nodes
fstream << FormSpecialNodes(dotcfg.m_pastValueNodeStyle, pastValueNodes);
// FutureValue nodes
@ -1062,10 +1075,12 @@ void ComputationNetwork::PerformSVDecomposition(const map<wstring, float>& SVDCo
wstring rightChildName = name + L"_V";
shared_ptr<ComputationNode<ElemType>> pLeft = AddNodeToNetWithElemType(New<LearnableParameter<ElemType>>(m_deviceId, leftChildName, m, r));
shared_ptr<ComputationNode<ElemType>> pRight = AddNodeToNetWithElemType(New<LearnableParameter<ElemType>>(m_deviceId, rightChildName, r, n));
InitLearnableParameters(pLeft, L"fixedValue", 0); // follow the protocol; otherwise deferred initialization will overwrite the SVD values in validation
InitLearnableParameters(pRight, L"fixedValue", 0);
// TODO: We should be able to move instead of copy but it currently isn't straightforward
// due to redU and redVT being slices
pLeft->ValueAsMatrix() = redU.DeepClone();
pLeft->ValueAsMatrix() = redU.DeepClone();
pRight->ValueAsMatrix() = redVT.DeepClone();
// Step 3. Change the network hierarchy to include the SVD nodes
@ -1111,7 +1126,7 @@ public:
~DbnLayer() {};
};
// Save network in the format of the Microsoft-internal legacy "DBN.exe" tool (this function is not useful outside of Microsoft)
// Save network in the format of the Microsoft-internal legacy "DBN.exe" tool (this function is not useful outside of Microsoft).
template <class ElemType>
void ComputationNetwork::SaveToDbnFile(ComputationNetworkPtr net, const std::wstring& fileName) const
{
@ -1463,7 +1478,6 @@ void ComputationNetwork::SaveToDbnFile(ComputationNetworkPtr net, const std::wst
PutTag("EDBN");
}
template void ComputationNetwork::InitLearnableParameters<float>(const ComputationNodeBasePtr& node, const bool uniformInit, const unsigned long randomSeed, const float initValueScale, bool initOnCPUOnly);
template void ComputationNetwork::Read<float>(const wstring& fileName);
template void ComputationNetwork::ReadPersistableParameters<float>(File& fstream, bool create);
template void ComputationNetwork::PerformSVDecomposition<float>(const map<wstring, float>& SVDConfig, size_t alignedsize);
@ -1473,7 +1487,6 @@ template void ComputationNetwork::SetSeqParam<float>(ComputationNetworkPtr net,
const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR);
template void ComputationNetwork::SaveToDbnFile<float>(ComputationNetworkPtr net, const std::wstring& fileName) const;
template void ComputationNetwork::InitLearnableParameters<double>(const ComputationNodeBasePtr& node, const bool uniformInit, const unsigned long randomSeed, const double initValueScale, bool initOnCPUOnly);
template void ComputationNetwork::Read<double>(const wstring& fileName);
template void ComputationNetwork::ReadPersistableParameters<double>(File& fstream, bool create);
template void ComputationNetwork::PerformSVDecomposition<double>(const map<wstring, float>& SVDConfig, size_t alignedsize);

Просмотреть файл

@ -332,14 +332,15 @@ public:
// node construction
// -----------------------------------------------------------------------
// non-static version needed because it accesses m_randomSeedOffset
// Excessively used by SimpleNetworkBuilder, but always after CreateLearnableParameter(), so we should really absorb it there
template <class ElemType>
// this function is only for use by NDL (deprecated)
void InitLearnableParameters(const ComputationNodeBasePtr& node,
const bool uniformInit,
const unsigned long randomSeed,
const ElemType initValueScale,
bool initOnCPUOnly = false);
const wchar_t* initString, // "uniform"|"gaussian"|"fixedValue"
double initValue, // scale | scale | value
unsigned long randomSeed = 0,
bool initOnCPUOnly = false) const;
// non-static version needed because it accesses m_randomSeedOffset
// Legacy version that is for random only.
void RandomInitLearnableParameters(const ComputationNodeBasePtr& node, const bool uniformInit, const unsigned long randomSeed, const double initValueScale, bool initOnCPUOnly = false) const;
template <typename N>
static shared_ptr<N> AsNodePtr(const ComputationNodeBasePtr& inode)
@ -522,6 +523,8 @@ public:
}
const std::vector<ComputationNodeBasePtr>& RootNodes() const { return m_allRoots; }
// these are specified as such by the user
const std::vector<ComputationNodeBasePtr>& FeatureNodes() const { return m_featureNodes ; }
const std::vector<ComputationNodeBasePtr>& LabelNodes() const { return m_labelNodes ; }
@ -751,7 +754,7 @@ public:
while (!result.second/*if already there*/ && result.first->second != node)
{
if (!makeUniqueName || node->NodeName().find_first_of(L".[]") == wstring::npos)
RuntimeError("AddNodeToNetIfNotYet: Duplicated name for %ls %ls operation.", node->NodeName().c_str(), node->OperationName().c_str());
RuntimeError("AddNodeToNetIfNotYet: Duplicated name for %ls %ls operation (%d vs. %d).", node->NodeName().c_str(), node->OperationName().c_str(), (int)node->m_uniqueNumericId, (int)result.first->second->m_uniqueNumericId);
node->SetName(L"_" + node->NodeName());
result = m_nameToNodeMap.insert(make_pair(node->NodeName(), node));
}
@ -1034,7 +1037,7 @@ public:
// data members
// -----------------------------------------------------------------------
unsigned long GetRandomSeedOffset()
unsigned long GetRandomSeedOffset() const
{
return m_randomSeedOffset;
}

Просмотреть файл

@ -106,13 +106,13 @@ void ComputationNetwork::FormRecurrentLoops(const ComputationNodeBasePtr& rootNo
assert(node->m_numNonDelayedParentsInLoop == 0); // (in PurgeStateForFormingRecurrentLoops())
}
for (let& node : nestedNodes)
{
{
for (auto& input : node->GetInputs())
{
{
if (input->m_loopId == node->m_loopId && GetRecurrenceSteppingDirection(node) == 0/*not a Delay node*/)
input->m_numNonDelayedParentsInLoop++; // count #parents of 'input' that are not delay nodes
}
}
}
// re-traverse the graph for all nestedNodes, starting with the first
// Then update m_nestedNodes with the re-traversed order.

Просмотреть файл

@ -76,7 +76,7 @@ void ComputationNetwork::CopySubTree(const ComputationNetwork& fromNet,
ComputationNodeBasePtr fromRoot = fromNet.GetNodeFromName(fromName);
for (const auto& fromNode : GetEvalOrder(fromRoot)) // BUGBUG: This probably will fail because the precomputed eval orders are invalid at this point.
for (const auto& fromNode : fromNet.GetEvalOrder(fromRoot)) // BUGBUG: This probably will fail because the precomputed eval orders are invalid at this point.
{
wstring fromNodeName = fromNode->NodeName();
wstring toNodeName = toNamePrefix + fromNodeName;

Просмотреть файл

@ -885,9 +885,9 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
if (performingBackPropagation)
{
if (outputValueNeededDuringBackProp.find(input) == outputValueNeededDuringBackProp.end())
outputValueNeededDuringBackProp[input] = input->OutputUsedInComputingInputNodesGradients();
outputValueNeededDuringBackProp[input] = input->NeedsGradient() && input->OutputUsedInComputingInputNodesGradients();
outputValueNeededDuringBackProp[input] |= node->InputUsedInComputingInputNodesGradients(i);
outputValueNeededDuringBackProp[input] |= (node->NeedsGradient() && node->InputUsedInComputingInputNodesGradients(i));
}
else
{

Просмотреть файл

@ -1,21 +1,12 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<ClCompile Include="..\Common\File.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\Common\fileutil.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="ComputationNode.cpp">
<Filter>Nodes</Filter>
</ClCompile>
<ClCompile Include="stdafx.cpp">
<Filter>Misc</Filter>
</ClCompile>
<ClCompile Include="..\Common\TimerUtility.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\Common\BestGpu.cpp">
<Filter>GPU Interfacing</Filter>
</ClCompile>

Просмотреть файл

@ -380,4 +380,295 @@ public:
ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<ComputationNetworkWithEdits> registerComputationNetworkWithEdits(L"ComputationNetworkWithEdits");
// ===================================================================
// CloneFunctionConfigLambda -- lambda to produce a clone of a network
// - creates a BrainScript function that carbon-copies a subsection of an existing network
// - the copy can be shallow or deep, where a deep copy gets its own copy of LearnableParameters
// - a shallow copy (parameters="shared") is a copy of all nodes that depend on the specified input(s),
// while all other nodes are shared from the original network section
// - a deep copy (parameters="lernable" or "constant") also copies all reachable LearnableParameters and their dependents
// - Input() nodes not listed as `inputNodes` are always shared
// - the source network may be a different network, e.g. loaded with BS.Network.Load()
// - a deep copy can be read-only (parameters="constant")
// - Note: multiple uses of the lambda will not share read-only parameters. This is trickier to implement than one might expect.
// - example use cases:
// - adaptation (KL): a frozen read-only copy of the starting model is used as a KL-regularizer
// - adaptation (DLR): an injected input transform is trained while the network is fixed
// - image: lower layers of ImageNet networks serve as immutable feature extractors for another image task
// - DSSM: applying the same network subsection to two inputs
// Usage:
// f = CloneFunction (inputNodes, outputNodes, parameters="learnable" /*|"constant"|"shared"*/)
// Parameters:
// - inputNodes: single node or array of nodes that will become parameters of the function.
// Commonly, this list will include all Input()s that the outputNode(s) depend on.
// - outputNodes: single node or dictionary of nodes that the function will emit
// Example:
// # create a BS function by copying a piece of network
// net = CloneFunction (network.features, network.logP)
// # apply the copy to a new input
// out = net (myFeatures)
// # This will create a copy of the subsection from network.features to network.logP
// # where all links to network.features get replaced by links to myFeatures.
// Example with multiple input and output nodes:
// # create a BS function by copying a piece of network
// # This specific example converts a network back into a BrainScript function.
// # It passes two input nodes --> the BS function will have 2 inputs;
// # and it passes a record of output nodes --> the BS function will return a record with the same member names
// network = BS.Network.Load ("some.dnn")
// net = CloneFunction ((network.features:network.labels), [ ce = network.ce ; errs = network.errs ])
// # create a network from the BS function
// features = Input (13)
// labels = Input (42)
// out = net (features, labels)
// criterionNodes = (out.ce)
// evaluationNodes = (out.errs)
// A specific example: Adapting a network, while using the original network as a regularizer (KLD)
// # load network
// network = BS.Network.Load ("some.dnn")
// # create a trainable clone and a read-only reference clone
// adaptNet = CloneFunction (network.features, [ z = network.z ], readOnly=false)
// # create a read-only clone
// refNet = CloneFunction (network.features, [ z = network.z ], readOnly=true)
// # create the main network
// features = Input (42)
// labels = Input (9000)
// z = adaptNet (features).z
// zRef = refNet (features).z
// # training criterion
// refWeight = 0.9
// kldLabels = labels * (1-refWeight) + Softmax (zRef) * refWeight # interpolate with ref output
// ce = CrossEntropyWithSoftmax (z, kldLabels)
// errs = ErrorPrediction (z, labels)
// criterionNodes = (ce)
// evaluationNodes = (errs)
// ===================================================================
class CloneFunctionConfigLambda : public ConfigLambda
{
// how we treat the parameters in the clone
enum class ParameterTreatment
{
learnable, // parameters are copied and kept trainable
constant, // parameters are copied and made immutable (e.g. for use of this as a fixed feature extractor)
shared // parameters are shared with where they came from (e.g. for parallel identical paths through a network)
};
public:
// -----------------------------------------------------------------------
// construction
// -----------------------------------------------------------------------
// Executing this function from BrainScript merely sets up a lambda, but does not actually create any clone.
// This is so that the function can be called multiple times in order to create multiple clones.
CloneFunctionConfigLambda(const IConfigRecordPtr configp) :
ConfigLambda(CreateParamNames(*configp), NamedParams(), [this](vector<ConfigValuePtr> &&args, NamedParams &&namedArgs, const std::wstring &exprName){ return this->DoClone(args, exprName); })
{
let& config = *configp;
// input nodes
inputNodes = GetInputNodes(config);
// output nodes
let outputNodesParam = config[L"outputNodes"]; // can be a node or a record
if (outputNodesParam.Is<ComputationNodeBase>()) // scalar case: result is a single node
outputNodes[L""] = outputNodesParam.AsPtr<ComputationNodeBase>(); // indicated by a "" node name in outputNodes[]
else // multi-valued case: result is a record of nodes
{
let& outputNodesRecord = outputNodesParam.AsRef<IConfigRecord>();
for (let& nodeName : outputNodesRecord.GetMemberIds())
outputNodes[nodeName] = outputNodesRecord[nodeName].AsPtr<ComputationNodeBase>();
if (outputNodes.empty())
InvalidArgument("CloneFunction: At least one output nodes must be specified.");
}
// treatment of parameters
wstring parametersOption = config[L"parameters"];
if (parametersOption == L"learnable") parameterTreatment = ParameterTreatment::learnable;
else if (parametersOption == L"constant") parameterTreatment = ParameterTreatment::constant;
else if (parametersOption == L"shared") parameterTreatment = ParameterTreatment::shared;
else InvalidArgument("CloneFunction: 'parameters' option must be 'learnable', 'constant', or 'shared'.");
// determine which nodes must be cloned
// - intersection of:
// - all indirect inputs of the specified outputs
// - all dependents of leaves
// - where leaves are:
// - specified inputs
// - unless parameters="shared": all parameters the specified outputs depend on
// determine all indirect inputs of the specified outputs
vector<ComputationNodeBasePtr> roots;
for (let& outputNodeKV : outputNodes)
roots.push_back(outputNodeKV.second);
let allInputs = ComputationNodeBase::EnumerateNodes(roots);
// take the chance to validate inputNodes
let allInputsSet = set<ComputationNodeBasePtr>(allInputs.begin(), allInputs.end());
for (let& input : inputNodes)
if (allInputsSet.find(input) == allInputsSet.end())
InvalidArgument("CloneFunction: No specified output depends on the specified input %ls.", input->NodeDescription().c_str());
// TODO: Is this really always an error? Are there valid cases where one would over-specify possible input nodes, even if they are not used/needed?
// determine all leaves and their dependents
dependentSet = set<ComputationNodeBasePtr>(inputNodes.begin(), inputNodes.end()); // start with the specified inputs
// determine all leaves and their dependents
for (let& node : allInputs)
{
// add parameters that are to be cloned to dependent set
if (parameterTreatment != ParameterTreatment::shared && node->Is<IFreezable>())
dependentSet.insert(node);
// if at least one input is in the dependent set then this node is, too
else
for (let& input : node->GetInputs())
if (dependentSet.find(input) != dependentSet.end())
dependentSet.insert(node);
}
#if 0
for (let& node : dependentSet)
fprintf(stderr, "CloneFunction: cloning %ls\n", node->NodeDescription().c_str());
#endif
// ensure none of the specified inputs reference back into the cloned set
// The function we extract must be separable.
for (let& input : inputNodes)
for (let& node : ComputationNodeBase::EnumerateNodes(vector<ComputationNodeBasePtr>{input})) // check all indirect inputs of each specified input
{
let iter = dependentSet.find(input);
if (iter != dependentSet.end() && *iter != input)
InvalidArgument("CloneFunction: specified function input %ls recursively depends on %ls inside the function.", input->NodeDescription().c_str(), node->NodeDescription().c_str());
}
}
private:
// get the input nodes from the config
static vector<ComputationNodeBasePtr> GetInputNodes(const IConfigRecord& config)
{
return ScriptableObjects::ConfigArray::FlattenedVectorFrom<ComputationNodeBasePtr>(config[L"inputNodes"]);
}
// create an array of parameter names for all inputs
// These names are never actually used, but required by the ConfigLambda constructor, and maybe useful for debugging.
static vector<wstring> CreateParamNames(const IConfigRecord& config)
{
let inputNodes = GetInputNodes(config);
vector<wstring> paramNames(inputNodes.size());
for (size_t i = 0; i < paramNames.size(); i++)
paramNames[i] = msra::strfun::wstrprintf(L"input_%d", (int)i);
return paramNames;
}
private:
// -----------------------------------------------------------------------
// the cloning operation itself
// -----------------------------------------------------------------------
// execute the lambda
// This will clone all nodes that the outputNodes depend on, and rewire inputs matching inputNodes to inputArgs.
ConfigValuePtr DoClone(const vector<ConfigValuePtr>& inputValues, const std::wstring& exprName)
{
// resolve the input arguments
vector<ComputationNodeBasePtr> inputs;
for (let& inputValue : inputValues)
inputs.push_back(inputValue.ResolveValue());
assert(inputValues.size() == inputNodes.size()); // (this should have been checked by BrainScript)
// do some logging
fprintf(stderr, "CloneFunction: ");
for (size_t i = 0; i < inputs.size(); i++)
fprintf(stderr, "%s%ls : %ls", i == 0 ? "(" : ", ", inputs[i]->NodeName().c_str(), inputs[i]->OperationName().c_str());
fprintf(stderr, ") -> ");
let singleOutput = outputNodes.size() == 1 && outputNodes.begin()->first.empty();
if (singleOutput)
fprintf(stderr, "%ls\n", outputNodes.begin()->second->NodeDescription().c_str());
else
{
fprintf(stderr, "[\n");
for (let& outputNodesKV : outputNodes)
fprintf(stderr, " %ls = %ls : %ls\n", outputNodesKV.first.c_str(), outputNodesKV.second->NodeName().c_str(), outputNodesKV.second->OperationName().c_str());
fprintf(stderr, "]\n");
}
// clone everything in the dependent set
// - specified inputs get mapped to actual parameters
// - all others get duplicated
// Note that at this point, the "shared" option has already been considered,
// and is reflected in whether parameters are included or not in 'dependentSet'.
map<ComputationNodeBasePtr, ComputationNodeBasePtr> clonedNodes;
size_t numCloned = 0;
for (size_t i = 0; i < inputNodes.size(); i++)
clonedNodes[inputNodes[i]] = inputs[i];
for (let& node : dependentSet)
{
// if already there then it's an input that we just mapped above
if (clonedNodes.find(node) != clonedNodes.end())
continue;
// clone
ComputationNodeBasePtr newNode;
let newName = exprName + L"." + node->GetName();
newNode = node->Duplicate(newName, CopyNodeFlags::copyNodeAll);
// make it read-only if desired
if (parameterTreatment == ParameterTreatment::constant && newNode->Is<IFreezable>())
newNode->As<IFreezable>()->FreezeParameters();
// and that's our cloned node
clonedNodes[node] = newNode;
numCloned++;
}
#if 0
for (let& nodeKV : clonedNodes)
fprintf(stderr, "CloneFunction: cloning %ls -> %ls (%d -> %d)\n", nodeKV.first->NodeDescription().c_str(), nodeKV.second->NodeDescription().c_str(), (int)nodeKV.first->m_uniqueNumericId, (int)nodeKV.second->m_uniqueNumericId);
#endif
// all cloned nodes' inputs must be redirected if they reference a node that has been cloned as well
size_t numRelinks = 0; // (statistics: how many inputs have we relinked?)
for (let& clonedNodesKV : clonedNodes)
{
let& node = clonedNodesKV.second;
let& inputs = node->GetInputs();
for (size_t i = 0; i < inputs.size(); i++)
{
fprintf(stderr, "%ls.inputs[%d] = %ls (%d)", node->NodeName().c_str(), (int)i, inputs[i]->NodeName().c_str(), (int)inputs[i]->m_uniqueNumericId);
let iter = clonedNodes.find(inputs[i]);
if (iter == clonedNodes.end())
continue;
// input is also a cloned node: relink
node->SetInput(i, iter->second);
fprintf(stderr, " ==> %ls (%d)\n", inputs[i]->NodeName().c_str(), (int)inputs[i]->m_uniqueNumericId);
numRelinks++;
}
}
fprintf(stderr, "CloneFunction: Cloned %d nodes and relinked %d inputs.\n", (int)numCloned, (int)numRelinks);
// return the result
// - if outputNodes was specified as a single node, return a single node
// - if specified as a record, then return a record with the specified names
if (singleOutput)
{
return NodeToConfigValuePtr(clonedNodes.find(outputNodes.begin()->second)->second);
}
else
{
auto record = make_shared<ConfigRecord>(nullptr, [](const std::wstring & msg){ RuntimeError("CloneFunction: %ls", msg.c_str()); });
for (let& outputNodesKV : outputNodes)
record->Add(outputNodesKV.first, [](const wstring&){}, move(NodeToConfigValuePtr(clonedNodes.find(outputNodesKV.second)->second)));
auto valuep = ConfigValuePtr(record, [](const std::wstring &) { LogicError("CloneFunction: Unexpected failure."); }, exprName);
return valuep;
}
}
ConfigValuePtr NodeToConfigValuePtr(ComputationNodeBasePtr node)
{
assert(node);
auto valuep = ConfigValuePtr(node, [](const std::wstring &) { LogicError("CloneFunction: Unexpected failure."); }, node->NodeName());
return valuep;
}
private:
// parameters
vector<ComputationNodeBasePtr> inputNodes;
map<wstring, ComputationNodeBasePtr> outputNodes;
ParameterTreatment parameterTreatment;
// other
set<ComputationNodeBasePtr> dependentSet; // set of nodes that outputNodes depend on
};
ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<CloneFunctionConfigLambda> registerCloneFunctionConfigLambda(L"CloneFunctionConfigLambda");
}}}

Просмотреть файл

@ -38,7 +38,8 @@
#define CNTK_MODEL_VERSION_7 7 // ElemType tag in model file
#define CNTK_MODEL_VERSION_8 8 // DynamicAxis for inputs
#define CNTK_MODEL_VERSION_9 9 // Transpose flag in ConvolutionNode to support deconvolution.
#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_9
#define CNTK_MODEL_VERSION_10 10 // Learning rate multiplier for input nodes.
#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_10
extern bool g_shareNodeValueMatrices;
@ -184,7 +185,7 @@ protected: // TODO: should be fully encapsulated here
bool m_needsGradient; // true if this node or any children need a gradient to be computed (for own consumption or propagation to somewhere in the child tree)
bool m_valueSharable; // a flag is needed for memory share.
// If it is false (e.g., learnableParameters/InputValue and those nodes are solely induced by learnableParameters),
// If it is false (e.g., LearnableParameters/InputValue and those nodes are solely induced by LearnableParameters),
// it will never be released to memory pool
private:
bool m_isPartOfLoop; // true if this loop is part of a recurrent loop
@ -289,6 +290,9 @@ public:
m_gradientInitialized(false), m_nodeName(name == L"" ? CreateUniqNodeName() : name)
{
// TODO: should m_learningRateMultiplier be set to 0? Or should every node have a way to add its own say on the learning rate for all its inputs?
// we store a unique numeric number for every node that is constructed, as a debugging aid
static size_t uniqueNumericId = 0;
m_uniqueNumericId = uniqueNumericId++;
}
virtual ~ComputationNodeBase()
{
@ -429,7 +433,18 @@ private:
{
if (HasMBLayout())
LogicError("%ls: Minibatch data cannot be interpreted as a single 2D tensor.", NodeDescription().c_str());
else if (m_sampleLayout.GetRank() < 1 || m_sampleLayout.GetRank() > 2) // note: scalars are not stored as tensors of rank 0, but rather as 1-dim vectors. TODO: clean this up some day
bool notFlattenableTo2D = false;
for (size_t i = 2; i < m_sampleLayout.GetRank(); ++i)
{
if (!m_sampleLayout.CanFlatten(i))
{
notFlattenableTo2D = true;
break;
}
}
if (m_sampleLayout.GetRank() < 1 || ((m_sampleLayout.GetRank() > 2) && notFlattenableTo2D)) // note: scalars are not stored as tensors of rank 0, but rather as 1-dim vectors. TODO: clean this up some day
LogicError("%ls: Sample [%s] is not a column vector or matrix (1D or 2D tensor).", NodeDescription().c_str(), string(m_sampleLayout).c_str());
}
public:
@ -441,7 +456,11 @@ public:
size_t GetAsMatrixNumCols() const
{
CheckTensorIsMatrix();
return m_sampleLayout.GetRank() > 1 ? m_sampleLayout[1] : 1; // a column vector is also a Matrix
auto flattenedLayout = m_sampleLayout;
if (flattenedLayout.GetRank() > 2)
flattenedLayout.FlattenTo2DInPlace(1, "GetAsMatrixNumCols()");
return flattenedLayout.GetRank() > 1 ? flattenedLayout[1] : 1; // a column vector is also a Matrix
}
// setting/updating the dimensions of the node
@ -574,8 +593,8 @@ public:
else // a whole vector
{
ScriptableObjects::ConfigArrayPtr inputsArray = *inputsArg;
const auto range = inputsArray->GetIndexRange();
for (int i = range.first; i <= range.second; i++) // pull them. This will resolve all of them.
const auto range = inputsArray->GetIndexBeginEnd();
for (int i = range.first; i < range.second; i++) // pull them. This will resolve all of them.
inputs.push_back(inputsArray->At(i, [](const wstring&) { LogicError("GetInputs: out of bounds index while iterating??"); }));
}
}
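
The change from GetIndexRange to GetIndexBeginEnd switches the loop from an inclusive [first, last] range (hence the old <=) to the usual half-open [begin, end) convention (hence <). A trivial stand-alone illustration with a plain vector instead of a ConfigArrayPtr:

#include <cstdio>
#include <vector>

int main()
{
    std::vector<int> inputs = { 10, 20, 30 };

    // inclusive range [first, last], as with the old GetIndexRange(): note the <=
    for (int i = 0; i <= (int)inputs.size() - 1; i++)
        std::printf("%d ", inputs[i]);
    std::printf("\n");

    // half-open range [begin, end), as with GetIndexBeginEnd(): note the <
    for (int i = 0; i < (int)inputs.size(); i++)
        std::printf("%d ", inputs[i]);
    std::printf("\n");
}
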
@ -833,6 +852,8 @@ public:
// Helper that returns [a x b x c], including dynamic axes.
const std::string ShapeDescription() const;
// debugging helper
size_t m_uniqueNumericId; // (a unique handle for debugging)
protected:
// -----------------------------------------------------------------------
@ -1891,6 +1912,13 @@ public:
struct IRecurrentNode { virtual int GetRecurrenceSteppingDirection() const = 0; };
// =======================================================================
// IFreezable -- nodes that have parameters that can be frozen
// e.g. if a trained model is to be used as a fixed feature extractor for another
// =======================================================================
struct IFreezable { virtual void FreezeParameters() { } };
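
A hedged, stand-alone sketch of how the IFreezable hook is meant to be used. FakeParameterNode is hypothetical; the real LearnableParameter::FreezeParameters() later in this commit freezes by setting the learning-rate multiplier to 0:

#include <cstdio>
#include <memory>
#include <vector>

struct IFreezable { virtual void FreezeParameters() { } virtual ~IFreezable() { } };

// Hypothetical parameter-holding node: freezing just disables its updates.
class FakeParameterNode : public IFreezable
{
public:
    void FreezeParameters() override { m_learningRateMultiplier = 0; }
    float LearningRateMultiplier() const { return m_learningRateMultiplier; }
private:
    float m_learningRateMultiplier = 1;
};

int main()
{
    auto param = std::make_shared<FakeParameterNode>();
    std::vector<std::shared_ptr<IFreezable>> cloned = { param };
    for (auto& node : cloned)       // e.g. what CloneFunction(..., parameters="constant") would do
        node->FreezeParameters();
    std::printf("lr multiplier after freeze: %g\n", param->LearningRateMultiplier()); // prints 0
}
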
// =======================================================================
// PreComputedNodeBase -- interface implemented by ComputationNodes that precompute
// TODO: We can use this interface in more places.

View file

@ -139,6 +139,16 @@ public:
fstream << "PoolKind: " << (int)m_poolKind << "\n";
}
TensorShape KernelShape() const { return m_kernelShape; }
TensorShape Strides() const { return m_stride; }
std::vector<bool> Sharing() const { return m_sharing; }
std::vector<bool> AutoPad() const { return m_autoPad; }
TensorShape LowerPad() const { return m_lowerPad; }
TensorShape UpperPad() const { return m_upperPad; }
bool Transpose() const { return m_transpose; }
size_t MaxTempMemSizeInSamples() const { return m_maxTempMemSizeInSamples; }
PoolKind PoolingKind() const { return m_poolKind; }
protected:
TensorShape m_kernelShape;
TensorShape m_mapCount;
@ -148,7 +158,7 @@ protected:
TensorShape m_lowerPad;
TensorShape m_upperPad;
PoolKind m_poolKind;
bool m_transpose;
bool m_transpose; // means de-convolution ...I think
ImageLayoutKind m_imageLayout;
size_t m_maxTempMemSizeInSamples;
@ -339,6 +349,10 @@ public:
size_t mapCount = m_mapCount.GetNumElements();
size_t weightCols = kW * kH * inDims.m_numChannels;
// if mapCount is 0 then take it from the input matrix
if (mapCount == 0)
mapCount = Input(0)->GetAsMatrixNumRows();
// check/infer input [0] (weights)
// BUGBUG: For now, we treat the weights as a 2D matrix. They should be a tensor proper.
Input(0)->ValidateInferInputDimsFrom(TensorShape(mapCount, weightCols));

View file

@ -61,4 +61,109 @@ public:
template class SumColumnElementsNode<float>;
template class SumColumnElementsNode<double>;
// -----------------------------------------------------------------------
// (deprecated) PerDimMeanVarNormalizationNode (feature, mean, invStdDev)
// Computes
// output = (feature - mean) .* invStdDev
// where mean and invStdDev are meant to be single elements while features
// is minibatch data.
// Deprecated since it can be trivially expressed in BrainScript.
// -----------------------------------------------------------------------
template <class ElemType>
class PerDimMeanVarNormalizationNode : public ComputationNode<ElemType>, public NumInputs<3>
{
typedef ComputationNode<ElemType> Base;
UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName()
{
return L"PerDimMeanVarNormalization";
}
public:
DeclareConstructorFromConfigWithNumInputs(PerDimMeanVarNormalizationNode);
PerDimMeanVarNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name)
{
}
virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange&) override
{
InvalidArgument("PerDimMeanVarNormalizationNode should only be called in the evaluation stage. Is any of its descendents a learnable parameter that requires gradient?");
}
virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
{
size_t rank = DetermineElementwiseTensorRank();
auto output = ValueTensorFor(rank, fr);
auto input = Input(0)->ValueTensorFor(rank, fr);
auto mean = Input(1)->ValueTensorFor(rank, fr.AllowBroadcast());
auto invStdDev = Input(2)->ValueTensorFor(rank, fr.AllowBroadcast());
output.AssignDifferenceOf(input, mean); // output = input - mean
output.AssignElementwiseProductOf(output, invStdDev); // output *= invStdDev
}
virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase(isFinalValidationPass);
Input(1)->ValidateInferInputDimsFrom(Input(0)->GetSampleLayout());
Input(2)->ValidateInferInputDimsFrom(Input(0)->GetSampleLayout());
#if 1
// support for legacy models when the mean and variance vectors were stored as column vectors (N,1)
// This code will copy the shape of Input(0) (source) to Input(1) and Input(2) (target) if:
// 1. The source is a 3-tensor with shape 1x1xM
// 2. The target is a vector (i.e., a 2-tensor with shape Nx1)
// 3. Both targets have the same number of elements
// 4. The number of elements in the target (N) is the same as the number of elements in the source (M)
// Note: This is somewhat ugly [Jasha Droppo].
auto dimsA = Input(0)->GetSampleLayout().GetDims();
auto dimsB = Input(1)->GetSampleLayout().GetDims();
auto dimsC = Input(2)->GetSampleLayout().GetDims();
if (
// Test condition 1.
(dimsA.size() == 3 && dimsA[0] == 1 && dimsA[1] == 1) &&
// Test condition 2.
(dimsB.size() == 2 && dimsB[1] == 1) &&
(dimsC.size() == 2 && dimsC[1] == 1) &&
// Test condition 3. and condition 4.
(dimsB[0] == dimsC[0] && dimsB[0] == dimsA[2])
)
{
// for error messages
string dimsBstring = string(Input(1)->GetSampleLayout());
string dimsCstring = string(Input(2)->GetSampleLayout());
// reshape Input(1)
Input(1)->SetDims(TensorShape(dimsA), false);
fprintf(stderr, "\n%ls %ls operation: For legacy compatibility, the sample layout of second input (%ls %ls operation) was patched to [%s] (from [%s])\n",
NodeName().c_str(), OperationName().c_str(), Input(1)->NodeName().c_str(), Input(1)->OperationName().c_str(), string(Input(1)->GetSampleLayout()).c_str(), dimsBstring.c_str());
// reshape Input(2)
Input(2)->SetDims(TensorShape(dimsA), false);
fprintf(stderr, "\n%ls %ls operation: For legacy compatibility, the sample layout of third input (%ls %ls operation) was patched to [%s] (from [%s])\n",
NodeName().c_str(), OperationName().c_str(), Input(2)->NodeName().c_str(), Input(2)->OperationName().c_str(), string(Input(2)->GetSampleLayout()).c_str(), dimsCstring.c_str());
}
#endif
if (isFinalValidationPass)
{
if (!Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(1)->GetSampleLayout()) || !Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(2)->GetSampleLayout()))
InvalidArgument("PerDimMeanVarNormalizationNode: All inputs should have same sample layout.");
}
SetDims(Input(0));
}
};
template class PerDimMeanVarNormalizationNode<float>;
template class PerDimMeanVarNormalizationNode<double>;
}}}

View file

@ -18,47 +18,107 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// TODO: add -Node to the class name
// -----------------------------------------------------------------------
// BUGBUG: If called after random init, this will reset to 0.
// TODO: Need to remember the init parameters, and do it here.
template <class ElemType>
void LearnableParameter<ElemType>::InitShape(const TensorShape& shape)
{
SetDims(shape, false);
UpdateFunctionValuesSize(); // this allocates the matrix
Value().SetValue(0); // TODO: invalidate instead
Value().Invalidate();
}
// constructor from config
// Parameterization is a little wicked. An older version required specifying the type of initialization
// ("uniform|fixedValue|gaussian|fromFile|fromLiteral") and then a parameter with a matching name.
// Now the matching parameter alone is sufficient, making it less verbose.
// - init="uniform|gaussian" (random init, scaled by arg initValueScale)
// - init="zero"
// - initValue=scalar --> initialize from this value
// - initValue=array or nested array --> initialize from this value, infer dimensions --TODO: not implemented yet
// - initFromFilePath="..." --> read from a data file. This infers the dimensions from the file.
// deprecated:
// - init="fixedValue", value from 'value' --deprecated in favor of just specifying initValue
// - init="fromFile", value from 'initFromFilePath' --deprecated in favor of just specifying 'initFromFilePath'
// - init="fromLiteral", value from 'initFromLiteral' --deprecated in favor of initValue=array expression
// The forms that infer the dimensions have different BrainScript names. TODO: need one for fromFile
// TODO: All forms that require specified dimensions but contain zeroes (to be updated by graph)
// will need to do deferred initialization, or have a way to repeat it.
template <class ElemType>
LearnableParameter<ElemType>::LearnableParameter(const ScriptableObjects::IConfigRecordPtr configp) :
LearnableParameter(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"shape"))
{
// TODO: Change dimensions to take a generic tensor instead. That will be a (minor) breaking change that will require fix-ups when converting from NDL to BrainScript.
AttachInputsFromConfig(configp, this->GetExpectedNumInputs());
// parameters[rows, [cols=1]] plus other optional parameters (learningRateMultiplier=[1|0|float], init=[uniform|gaussian|fixedvalue], initValueScale=[1|float], value=[0|float])
AttachInputsFromConfig(configp, this->GetExpectedNumInputs()); // (we have none; this checks that none are provided)
// Parameter{dims, other optional parameters: learningRateMultiplier=[1|0|float], init=[uniform|gaussian|], initValueScale=[1|float], initValue=[''|float], initFromFilePath=[''|string]}
// constant vs. parameter (with optional LR scaling)
if (configp->Exists(L"learningRateMultiplier"))
SetLearningRateMultiplier(configp->Get(L"learningRateMultiplier"));
else if (configp->Exists(L"needsGradient") || configp->Exists(L"needGradient") || configp->Exists(L"computeGradient"))
InvalidArgument("Deprecated parameter names needsGradient|needGradient|computeGradient are not supported in BrainScript. Use learningRateMultiplier instead.");
// initialization
wstring initString = configp->Get(L"init");
if (initString == L"fixedValue")
Value().SetValue((ElemType) configp->Get(L"value"));
else if (initString == L"uniform" || initString == L"gaussian")
wstring initFromFilePath = configp->Get(L"initFromFilePath");
let& initValue = configp->Get(L"initValue"); // may be empty string, scalar, or array
// infer the type of the initial value from what other optional args are given
if (initString.empty())
{
// TODO: add these options also to old NDL
if (!initFromFilePath.empty()) // 'initFromFilePath' given --> initialize from file
initString = L"fromFile"; // (note: this is only used internally; external use is deprecated)
else if (!initValue.Is<ScriptableObjects::String>()) // 'initValue' given (not an empty string) --> initialize from value
{
if (initValue.Is<ScriptableObjects::Double>())
initString = L"fromValue"; // (note: this is only used internally)
else if (initValue.Is<ScriptableObjects::ConfigArray>())
initString = L"fromValueArray"; // (note: this is only used internally)
else
InvalidArgument("'initValue' must be numerical");
}
else if (!initValue.AsRef<ScriptableObjects::String>().empty()) // it's a string: must be empty
InvalidArgument("LearnableParameter: 'initValue' must be an empty string or not a string.");
else // no pertinent optional arguments given: default to 'uniform'
initString = L"uniform"; // default is uniform
}
// deferred variants
// Deferred means that this kind of initialization is allowed when some dimensions are unspecified, and thus happens during Validate().
if (initString == L"uniform" || initString == L"gaussian") // random init
{
m_initString = initString;
// TODO: add more randomization types, and use a more meaningful scaling
// Keras uses "normal" instead of "gaussian". We can use that here too to denote the one with sane scaling, and deprecate "gaussian" with a warning.
static unsigned long randomSeed = 1;
int forcedRandomSeed = configp->Get(L"randomSeed"); // forcing a specific random seed is useful for testing to get repeatable initialization independent of evaluation order
InitRandom((initString == L"uniform"), forcedRandomSeed < 0 ? randomSeed++ : (unsigned long) forcedRandomSeed, configp->Get(L"initValueScale"), configp->Get(L"initOnCPUOnly"));
m_randomSeed = forcedRandomSeed < 0 ? randomSeed++ : (unsigned long)forcedRandomSeed;
m_initValueScale = configp->Get(L"initValueScale");
m_initOnCPUOnly = configp->Get(L"initOnCPUOnly");
}
else if (initString == L"fromFile")
else if (initString == L"zero")
{
m_initString = L"fromValue";
m_initValue = 0;
}
else if (initString == L"fromValue") // from 'initValue'
{
m_initString = initString;
m_initValue = initValue;
}
// non-deferred variants
// Here the dimensions are always known at this point, so we don't need/want to save all those parameters.
else if (initString == L"fromValueArray") // from 'initValue' which has array form
InvalidArgument("'initValue' for arrays not yet implemented"); // array not yet implemented
else if (initString == L"fromFile") // load from 'iniFromFilePath'
{
wstring initFromFilePath = configp->Get(L"initFromFilePath");
if (initFromFilePath.empty())
RuntimeError("initFromFilePath parameter must be provided when using \"fromFile\" initialization method");
InitFromFile(initFromFilePath);
m_initString.clear();
}
else if (initString == L"fromLiteral")
// legacy
else if (initString == L"fixedValue") // deprecated. Use initValue=... instead
{
m_initString = L"fromValue";
m_initValue = (ElemType)configp->Get(L"value");
}
else if (initString == L"fromLiteral") // deprecated. Use initValue=array instead
{
wstring initFromLiteral = configp->Get(L"initFromLiteral");
if (initFromLiteral.empty())
@ -66,9 +126,49 @@ LearnableParameter<ElemType>::LearnableParameter(const ScriptableObjects::IConfi
size_t numRows, numCols;
auto array = File::LoadMatrixFromStringLiteral<ElemType>(msra::strfun::utf8(initFromLiteral), numRows, numCols);
InitFromArray(array, numRows, numCols);
m_initString.clear();
}
else
RuntimeError("init must be one of the values of [ uniform | gaussian | fixedValue | fromFile ]");
// initialize
// This will be repeated if the matrix gets resized due to dimension inference.
LazyInitParameters();
if (!m_initString.empty())
fprintf(stderr, "%ls: Initializating Parameter[%s] as %ls later when dimensions are fully known.\n", NodeDescription().c_str(), string(GetSampleLayout()).c_str(), m_initString.c_str());
}
// variant of above from NDL. Must be called right after plain constructor.
// This overwrites any pending deferred initialization with a new one.
// Initialization is done immediately if all dimensions are already known, otherwise kept pending.
template <class ElemType>
void LearnableParameter<ElemType>::PostInitParameters(const wstring& initString, // "uniform"|"gaussian"|"fixedValue"
ElemType initValue, // scale | scale | value
unsigned long randomSeed /*= 0*/,
bool initOnCPUOnly /*= false*/)
{
if (initString == L"uniform" || initString == L"gaussian") // random init
{
m_initString = initString;
m_randomSeed = randomSeed;
m_initValueScale = initValue;
m_initOnCPUOnly = initOnCPUOnly;
}
else if (initString == L"fixedValue") // from constant value
{
m_initString = L"fromValue";
m_initValue = initValue;
}
else
LogicError("PostInitParameters: invalid init string '%ls'", m_initString.c_str());
// initialize
// This will be repeated if the matrix gets resized due to dimension inference.
LazyInitParameters();
if (!m_initString.empty())
fprintf(stderr, "%ls: Initializating Parameter[%s] as %ls later when dimensions are fully known.\n", NodeDescription().c_str(), string(GetSampleLayout()).c_str(), m_initString.c_str());
}
// initialize with random numbers
@ -162,9 +262,25 @@ void LearnableParameter<ElemType>::InitFromArray(const std::vector<ElemType>& ar
VerifyDataSize(Value()); // sanity check
}
// TODO: Move this error check there, since this is called only from one place.
template <class ElemType>
void LearnableParameter<ElemType>::ReviseFromFile(const std::wstring& reviseFromFilePath)
{
try
{
InitFromFile(reviseFromFilePath);
}
catch (const std::exception & e)
{
RuntimeError("ReviseFromFile: Failed to reload %ls %ls operation from file %ls: %s", NodeName().c_str(), OperationName().c_str(), reviseFromFilePath.c_str(), e.what());
}
}
template <class ElemType>
void LearnableParameter<ElemType>::Save(File& fstream) const /*override*/
{
if (!m_initString.empty())
LogicError("LearnableParameter: Cannot Save() before deferred initialization has completed.");
Base::Save(fstream);
fstream << m_learningRateMultiplier;
m_sampleLayout.Save(fstream);
@ -204,12 +320,31 @@ void LearnableParameter<ElemType>::Load(File& fstream, size_t modelVersion) /*ov
LoadValue(fstream);
SetDims(sampleLayout, false); // note: call this after LoadValue() since LoadValue() overwrites m_sampleLayout
VerifyDataSize(Value()); // sanity check
m_initString.clear(); // deferred initialization not possible after loading
}
template <class ElemType>
/*virtual*/ void LearnableParameter<ElemType>::CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const /*override*/
{
Base::CopyTo(nodeP, newName, flags);
if (flags & CopyNodeFlags::copyNodeValue)
{
auto node = dynamic_pointer_cast<LearnableParameter<ElemType>>(nodeP);
node->m_initString = m_initString;
node->m_randomSeed = m_randomSeed;
node->m_initValueScale = m_initValueScale;
node->m_initOnCPUOnly = m_initOnCPUOnly;
node->m_initValue = m_initValue;
}
}
// computation functions don't do anything for parameter nodes
template <class ElemType>
/*virtual*/ void LearnableParameter<ElemType>::UpdateFunctionMBSize() /*override*/
{
if (!m_initString.empty())
LogicError("LearnableParameter: Deferred initialization has not been completed until first call to UpdateFunctionMBSize().");
}
template <class ElemType>
@ -226,18 +361,70 @@ template <class ElemType>
template <class ElemType>
/*virtual*/ void LearnableParameter<ElemType>::Validate(bool isFinalValidationPass) /*override*/
{
//fprintf(stderr, "Validate %ls: called in init state '%ls' with dims [%s]\n", NodeDescription().c_str(), m_initString.c_str(), string(GetSampleLayout()).c_str());
Base::Validate(isFinalValidationPass);
m_pMBLayout = nullptr; // this node does not hold mini-batch data
// lazy init if we got a dimension now
#if 0 // fake old buggy behavior before deferred initialization
if (isFinalValidationPass && !m_initString.empty() && (m_initString != L"fromValue" || m_initValue != 0))
{
fprintf(stderr, "Validate: deferred '%ls' initialization patched to fromValue 0 for back compat\n", m_initString.c_str());
m_initString = L"fromValue";
m_initValue = 0;
}
#endif
#if 0
// We call this here and in Validate(true), since we don't know which gets called first.
// TODO: Actually this should never be needed, because each time dimensions change, we init.
// So if we get here without fully-known dimensions, this call won't do anything either.
if (isFinalValidationPass)
LazyInitParameters();
#endif
}
// deferred initialization
// We support a feature that some dimensions can be specified as 0, and get inferred.
// This is only possible for initialization methods that do not come with their own dimensions
// (such as initialization from an array literal).
// When initialization succeeded (all dimensions known), the pending initialization is cleared.
// This is called from constructor and InferInputDimsFrom().
// BUGBUG: We cannot really enforce the calling sequence. Save() verifies that this has been cleared.
// Note that this may be called AFTER Validate(true) (still during validation, but after final validation of this node).
template <class ElemType>
void LearnableParameter<ElemType>::LazyInitParameters()
{
// if no lazy init pending then we are done
if (m_initString.empty())
return;
// if not all dimensions are known yet, we cannot proceed: keep it pending
if (GetSampleLayout().GetNumElements() == 0)
return;
// OK, proceed
if (m_initString == L"fromValue")
{
fprintf(stderr, "%ls: Initializing Parameter[%s] <- %f.\n", NodeDescription().c_str(), string(GetSampleLayout()).c_str(), m_initValue);
Value().SetValue(m_initValue);
}
else if (m_initString == L"uniform" || m_initString == L"gaussian")
{
fprintf(stderr, "%ls: Initializing Parameter[%s] <- %ls(seed=%d, scale=%f, onCPU=%s).\n", NodeDescription().c_str(), string(GetSampleLayout()).c_str(), m_initString.c_str(), (int)m_randomSeed, m_initValueScale, m_initOnCPUOnly ? "true" : "false");
InitRandom((m_initString == L"uniform"), m_randomSeed, m_initValueScale, m_initOnCPUOnly);
}
else
LogicError("LearnableParameter: Invalid value of m_initString '%ls' for deferred initialization for %ls.", m_initString.c_str(), NodeDescription().c_str());
// and remember that we are done
m_initString.clear();
}
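
A self-contained sketch of the deferred-initialization protocol described above, with hypothetical names (LazyParam, RequestInit, InferShape): the pending request is remembered as a string, replayed whenever the shape changes, and cleared once every dimension is known. CNTK's real code follows the same protocol through m_initString and LazyInitParameters():

#include <cstdio>
#include <string>
#include <utility>
#include <vector>

class LazyParam
{
public:
    void RequestInit(std::string how)         { m_pendingInit = std::move(how); TryInit(); }
    void InferShape(std::vector<size_t> dims) { m_dims = std::move(dims);       TryInit(); }
private:
    void TryInit()
    {
        if (m_pendingInit.empty())
            return;                              // nothing pending
        size_t numElements = m_dims.empty() ? 0 : 1;
        for (size_t d : m_dims)
            numElements *= d;
        if (numElements == 0)
            return;                              // some dimension still unknown: keep it pending
        std::printf("initializing %zu elements as '%s'\n", numElements, m_pendingInit.c_str());
        m_pendingInit.clear();                   // done; a Save() could now assert this is empty
    }
    std::string m_pendingInit;
    std::vector<size_t> m_dims;
};

int main()
{
    LazyParam p;
    p.RequestInit("uniform");    // shape unknown -> stays pending
    p.InferShape({ 0, 128 });    // a 0 dimension remains -> still pending
    p.InferShape({ 512, 128 });  // fully known -> initialization runs now
}
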
// called from ComputationNode::ValidateInferInputDimsFrom()
// In case of an error, this function just backs out without updating.
// The caller must verify the dimensions.
// This is a bit weird since it is called after this node has been Validated once.
// BUGBUG: This will clear out any random initialization to 0. So currently this is not usable for most cases.
template <class ElemType>
void LearnableParameter<ElemType>::InferInputDimsFrom(const TensorShape& otherShape)
{
//fprintf(stderr, "InferInputDimsFrom %ls: called in init state '%ls' with dims [%s], offered new dims [%s]\n", NodeDescription().c_str(), m_initString.c_str(), string(GetSampleLayout()).c_str(), string(otherShape).c_str());
const auto& thisShape = GetSampleLayout();
// see where we stand with our shape
@ -248,7 +435,10 @@ void LearnableParameter<ElemType>::InferInputDimsFrom(const TensorShape& otherSh
// infer at least one dimension
if (otherShape.GetRank() == 0 || otherShape.GetNumElements() == 0)
return; // LogicError("ValidateInferInputDimsFrom: Inferred dimensions must not be empty.");
if (m_initString.empty())
LogicError("InferInputDimsFrom: Attempted to infer dimensions, with initialization completed or no deferred initialization pending.");
// if no dimensions have been set at all, copy otherShape
// Don't verify dimensions in this case, because the node may have explicitly been defined as a vector of 0 elements.
bool hasAnyDim = false;
@ -266,7 +456,20 @@ void LearnableParameter<ElemType>::InferInputDimsFrom(const TensorShape& otherSh
newDims[i] = otherShape[i];
InitShape(TensorShape(newDims));
}
fprintf(stderr, "%ls %ls operation: Tensor shape was inferred as [%s].\n", NodeName().c_str(), OperationName().c_str(), string(GetSampleLayout()).c_str());
fprintf(stderr, "%ls operation: Tensor shape was inferred as [%s].\n", NodeDescription().c_str(), string(GetSampleLayout()).c_str());
// initialize the values
// We call this here and in Validate(true), since we don't know which gets called first.
// Note: It seems that this is not necessary, and that Validate(true) is only called after inference.
#if 0 // fake old buggy behavior before deferred initialization
if (m_initString != L"fromValue" || m_initValue != 0)
{
fprintf(stderr, "InferInputDimsFrom: deferred '%ls' initialization patched to fromValue 0 for back compat\n", m_initString.c_str());
m_initString = L"fromValue";
m_initValue = 0;
}
#endif
LazyInitParameters();
}
template <class ElemType>
@ -286,6 +489,12 @@ template <class ElemType>
PrintNodeValuesToFile(printValues, printMetadata, fstream);
}
template <class ElemType>
/*virtual*/ void LearnableParameter<ElemType>::FreezeParameters() /*override*/ // from IFreezable
{
SetLearningRateMultiplier(0);
}
template class LearnableParameter<float>;
template class LearnableParameter<double>;

View file

@ -21,7 +21,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
template <class ElemType>
class LearnableParameter : public ComputationNode<ElemType>, public NumInputs<0>
class LearnableParameter : public ComputationNode<ElemType>, public NumInputs<0>, public IFreezable
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName() { return L"LearnableParameter"; }
@ -29,69 +29,57 @@ class LearnableParameter : public ComputationNode<ElemType>, public NumInputs<0>
void InitShape(const TensorShape& shape);
public:
LearnableParameter(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name)
// this constructor is always run (all other constructors call this one)
LearnableParameter(DEVICEID_TYPE deviceId, const wstring& name) :
Base(deviceId, name)
{
SetLearningRateMultiplier(1.0f); // enable normal learning by default
MarkValueNonSharable();
m_initString = L"fromValue"; // default init is with 0; typically overwritten
m_initValue = 0;
}
LearnableParameter(DEVICEID_TYPE deviceId, const wstring& name, const TensorShape& shape)
: Base(deviceId, name)
LearnableParameter(DEVICEID_TYPE deviceId, const wstring& name, const TensorShape& shape) :
LearnableParameter(deviceId, name)
{
SetLearningRateMultiplier(1.0f);
MarkValueNonSharable();
InitShape(shape);
LazyInitParameters();
}
LearnableParameter(DEVICEID_TYPE deviceId, const wstring& name, size_t rows, size_t cols)
: LearnableParameter(deviceId, name, TensorShape(rows, cols))
LearnableParameter(DEVICEID_TYPE deviceId, const wstring& name, size_t rows, size_t cols) :
LearnableParameter(deviceId, name, TensorShape(rows, cols))
{
}
LearnableParameter(const ScriptableObjects::IConfigRecordPtr configp);
// initialize with random numbers
// if 'initOnCPUOnly' then always init on CPU, making initialization consistent across both (for testing)
void InitRandom(const bool uniformInit, const unsigned long randomSeed, const ElemType initValueScale, bool initOnCPUOnly);
// initialize after plain constructor; for use by NDL
void PostInitParameters(const std::wstring& initString, // "uniform"|"gaussian"|"fixedValue"
ElemType initValue, // scale | scale | value
unsigned long randomSeed = 0,
bool initOnCPUOnly = false);
// initialize by reading a matrix from a text file
void InitFromFile(const std::wstring& initFromFilePath);
private:
// initialize with random numbers
// If 'initOnCPUOnly' then always init on CPU, making initialization consistent across both (for testing).
void InitRandom(const bool uniformInit, const unsigned long randomSeed, const ElemType initValueScale, bool initOnCPUOnly);
// helper to initialize from a matrix read from a text file or a string literal
void InitFromArray(const std::vector<ElemType>& array, size_t numRows, size_t numCols);
// deferred initialization
void LazyInitParameters();
public:
// reload parameters from file
// This is called from MEL.
// TODO: Move this error check there, since this is called only from one place.
void ReviseFromFile(const std::wstring& reviseFromFilePath)
{
#if 1
try
{
InitFromFile(reviseFromFilePath);
}
catch(const std::exception & e)
{
RuntimeError("ReviseFromFile: Failed to reload %ls %ls operation from file %ls: %s", NodeName().c_str(), OperationName().c_str(), reviseFromFilePath.c_str(), e.what());
}
#else
size_t numRows, numCols;
auto array = File::LoadMatrixFromTextFile<ElemType>(reviseFromFilePath, numRows, numCols);
size_t nRows, nCols;
DetermineDataSize(nRows, nCols); // BUGBUG: private
if (numRows != nRows || numCols != nCols)
{
RuntimeError("Error in ReviseFromFile for node %ls using file %ls: original size (%d x %d) vs current size (%d x %d)",
m_nodeName.c_str(), reviseFromFilePath.c_str(), (int) nRows, (int) nCols, (int) numRows, (int) numCols);
}
Value().SetValue(numRows, numCols, m_deviceId, array.data(), matrixFlagNormal);
VerifyDataSize(Value()); // sanity check
#endif
}
void ReviseFromFile(const std::wstring& reviseFromFilePath);
virtual void Save(File& fstream) const override;
virtual void Load(File& fstream, size_t modelVersion) override;
virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override;
// computation functions don't do anything for parameter nodes
virtual void UpdateFunctionMBSize() override;
virtual void /*ComputationNode::*/ ForwardProp(const FrameRange&) override;
@ -106,6 +94,17 @@ public:
void InferInputDimsFrom(const TensorShape& otherShape);
virtual void DumpNodeInfo(const bool printValues, const bool printMetadata, File& fstream) const override;
// called from CloneFunction(..., parameters="constant")
virtual void FreezeParameters() override; // from IFreezable
private:
// init parameters for deferred initialization (which happens in Validate())
std::wstring m_initString; // if non-empty then deferred initialization is needed. Gets cleared upon completion of deferred init.
unsigned long m_randomSeed;
ElemType m_initValueScale;
bool m_initOnCPUOnly;
ElemType m_initValue;
};
// -----------------------------------------------------------------------
@ -162,7 +161,7 @@ class InputValueBase : public ComputationNode<ElemType>, public NumInputs<0>, pu
typedef ComputationNode<ElemType> Base;
UsingComputationNodeMembers;
void Init(const TensorShape& sampleLayout, bool isSparse, const std::wstring axisName)
void Init(const TensorShape& sampleLayout, bool isSparse, const std::wstring axisName, float learningRateMultiplier = 0)
{
m_isSparse = isSparse;
MarkValueNonSharable();
@ -171,7 +170,7 @@ class InputValueBase : public ComputationNode<ElemType>, public NumInputs<0>, pu
SetDims(sampleLayout, HasMBLayout()); // also called when reloading a file. Then we have an MBLayout, otherwise not yet
UpdateFunctionValuesSize(); // we must allocate the matrix so that the readers get objects with valid row dimensions (some readers expect that)
SetLearningRateMultiplier(0);
SetLearningRateMultiplier(learningRateMultiplier);
m_dynamicAxisNodeName = axisName;
}
@ -225,9 +224,9 @@ protected:
Init(ImageDimensions::AsTensorShape(configp->Get(L"imageWidth"), configp->Get(L"imageHeight"), configp->Get(L"imageChannels"), ImageLayoutKindFrom(configp->Get(L"imageLayout"))), isSparse, axisName);
}
public:
virtual const std::wstring GetRequestedDynamicAxis() const { return m_dynamicAxisNodeName; }
public:
virtual void Save(File& fstream) const override
{
Base::Save(fstream);
@ -239,6 +238,8 @@ public:
unsigned int nrAxes = 1;
fstream << nrAxes;
fstream << m_dynamicAxisNodeName;
fstream << m_learningRateMultiplier;
}
virtual void Load(File& fstream, size_t modelVersion) override
@ -268,7 +269,12 @@ public:
}
else
m_dynamicAxisNodeName = L""; // Use default
Init(sampleLayout, m_isSparse, m_dynamicAxisNodeName);
float learningRateMultiplier = 0;
if (modelVersion >= CNTK_MODEL_VERSION_10)
fstream >> learningRateMultiplier;
Init(sampleLayout, m_isSparse, m_dynamicAxisNodeName, learningRateMultiplier);
}
// InputValue must not resize its inputs because that might destroy it. It should already have the correct size.

View file

@ -463,6 +463,8 @@ public:
Base::AllocateGradientMatricesForInputs(matrixPool);
}
size_t OutputRank() const { return m_outputRank; }
private:
size_t m_outputRank;
};

View file

@ -376,117 +376,12 @@ private:
template class InvStdDevNode<float>;
template class InvStdDevNode<double>;
// -----------------------------------------------------------------------
// PerDimMeanVarNormalizationNode (feature, mean, invStdDev)
// Computes
// output = (feature - mean) .* invStdDev
// where mean and invStdDev are meant to be single elements while features
// is minibatch data.
// TODO: Why do we need this? Why not use Plus and ElementTimes?
// -----------------------------------------------------------------------
template <class ElemType>
class PerDimMeanVarNormalizationNode : public ComputationNode<ElemType>, public NumInputs<3>
{
typedef ComputationNode<ElemType> Base;
UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName()
{
return L"PerDimMeanVarNormalization";
}
public:
DeclareConstructorFromConfigWithNumInputs(PerDimMeanVarNormalizationNode);
PerDimMeanVarNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name)
{
}
virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange&) override
{
InvalidArgument("PerDimMeanVarNormalizationNode should only be called in the evaluation stage. Is any of its descendents a learnable parameter that requires gradient?");
}
virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
{
size_t rank = DetermineElementwiseTensorRank();
auto output = ValueTensorFor(rank, fr);
auto input = Input(0)->ValueTensorFor(rank, fr);
auto mean = Input(1)->ValueTensorFor(rank, fr.AllowBroadcast());
auto invStdDev = Input(2)->ValueTensorFor(rank, fr.AllowBroadcast());
output.AssignDifferenceOf(input, mean); // output = input - mean
output.AssignElementwiseProductOf(output, invStdDev); // output *= invStdDev
}
virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase(isFinalValidationPass);
Input(1)->ValidateInferInputDimsFrom(Input(0)->GetSampleLayout());
Input(2)->ValidateInferInputDimsFrom(Input(0)->GetSampleLayout());
#if 1
// support for legacy models when the mean and variance vectors were stored as column vectors (N,1)
// This code will copy the shape of Input(0) (source) to Input(1) and Input(2) (target) if:
// 1. The source is a 3-tensor with shape 1x1xM
// 2. The target is a vector (i.e., a 2-tensor with shape Nx1)
// 3. Both targets have the same number of elements
// 4. The number of elements in the target (N) is the same as the number of elements in the source (M)
// Note: This is somewhat ugly [Jasha Droppo].
auto dimsA = Input(0)->GetSampleLayout().GetDims();
auto dimsB = Input(1)->GetSampleLayout().GetDims();
auto dimsC = Input(2)->GetSampleLayout().GetDims();
if (
// Test condition 1.
(dimsA.size() == 3 && dimsA[0] == 1 && dimsA[1] == 1) &&
// Test condition 2.
(dimsB.size() == 2 && dimsB[1] == 1) &&
(dimsC.size() == 2 && dimsC[1] == 1) &&
// Test condition 3. and condition 4.
(dimsB[0] == dimsC[0] && dimsB[0] == dimsA[2])
)
{
// for error messages
string dimsBstring = string(Input(1)->GetSampleLayout());
string dimsCstring = string(Input(2)->GetSampleLayout());
// reshape Input(1)
Input(1)->SetDims(TensorShape(dimsA), false);
fprintf(stderr, "\n%ls %ls operation: For legacy compatibility, the sample layout of second input (%ls %ls operation) was patched to [%s] (from [%s])\n",
NodeName().c_str(), OperationName().c_str(), Input(1)->NodeName().c_str(), Input(1)->OperationName().c_str(), string(Input(1)->GetSampleLayout()).c_str(), dimsBstring.c_str());
// reshape Input(2)
Input(2)->SetDims(TensorShape(dimsA), false);
fprintf(stderr, "\n%ls %ls operation: For legacy compatibility, the sample layout of third input (%ls %ls operation) was patched to [%s] (from [%s])\n",
NodeName().c_str(), OperationName().c_str(), Input(2)->NodeName().c_str(), Input(2)->OperationName().c_str(), string(Input(2)->GetSampleLayout()).c_str(), dimsCstring.c_str());
}
#endif
if (isFinalValidationPass)
{
if (!Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(1)->GetSampleLayout()) || !Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(2)->GetSampleLayout()))
InvalidArgument("PerDimMeanVarNormalizationNode: All inputs should have same sample layout.");
}
SetDims(Input(0));
}
};
template class PerDimMeanVarNormalizationNode<float>;
template class PerDimMeanVarNormalizationNode<double>;
// -----------------------------------------------------------------------
// PerDimMeanVarDeNormalizationNode (feature, mean, invStdDev)
// Computes
// output = feature ./ invStdDev + mean
// with parameters the same as PerDimMeanVarNormalizationNode.
// TODO: Why do we need this? Why not use Plus and ElementDividedBy?
// TODO: Deprecate like PerDimMeanVarNormalizationNode as soon as we have a test case. Or just delete it.
// -----------------------------------------------------------------------
template <class ElemType>

View file

@ -464,6 +464,9 @@ public:
LogicError("Unrecognized direction in DelayedValueNodeBase");
}
int TimeStep() const { return m_timeStep; }
ElemType InitialActivationValue() const { return m_initialActivationValue; }
protected:
ElemType m_initialActivationValue; // starting value for hidden activation vector at boundary
Matrix<ElemType> m_delayedValue; // saves the activation of the previous step that this node points to

View file

@ -34,9 +34,9 @@ template <class ElemType>
if (flags & CopyNodeFlags::copyNodeValue)
{
auto node = dynamic_pointer_cast<ReduceElementsNode<ElemType>>(nodeP);
node->m_axis = m_axis;
node->m_operation = m_operation;
node->m_op = m_op;
node->m_axis = m_axis;
node->m_operation = m_operation;
node->m_reductionOp = m_reductionOp;
}
}
@ -64,7 +64,7 @@ template <class ElemType>
auto input = Input(0)->ValueTensorFor(rank, fr);
// the actual operation is a Copy with reduction, where the magic is in the reduction op
result.DoUnaryOpOf(0, input, 1, ElementWiseOperator::opCopy, m_op);
result.DoUnaryOpOf(0, input, 1, ElementWiseOperator::opCopy, m_reductionOp);
// note: we can implement "Mean" by passing 1/dim for alpha
}
@ -79,13 +79,46 @@ template <class ElemType>
auto sliceInputGrad = Input(0)->GradientTensorFor(rank, fr); // ...to this one
// gradients are not as simple as passing an op-code, unfortunately
switch (m_op)
switch (m_reductionOp)
{
case ElementWiseOperator::opSum:
// "Sum": broadcast the gradient
sliceInputGrad.AddCopyOf(sliceOutputGrad);
break;
case ElementWiseOperator::opLogSum:
{
auto input = Input(inputIndex)->ValueTensorFor(rank, fr);
auto output = ValueTensorFor(rank, fr.AllowBroadcast());
// Let: f(x, y, z) = log(exp x + exp y + exp z)
// For the derivative we get:
// df / dx = exp(x)/exp(f)
// = exp(x - f)
sliceInputGrad.AddElementwiseProductWithExpOfDiffOf(sliceOutputGrad, input, output);
}
break;
case ElementWiseOperator::opMin:
case ElementWiseOperator::opMax:
auto input = Input(inputIndex)->ValueTensorFor(rank, fr);
auto output = ValueTensorFor(rank, fr.AllowBroadcast());
// POTENTIAL PROBLEM:
// For ReduceMin/Max there are combinations of input values where the gradient is not defined because the function has an edge at these points.
// E.g. for ReduceMin this is the case when the minimum input value is attained by several inputs at the same time.
// In these cases there is no correct gradient. The question is whether this could lead to any problems.
// Let's look at two scenarios where this might happen:
//
// * Scenario 1: The input comes from a layer of nodes like e.g. ReLU and some of them might operate in the regime where they clip to a constant value.
// In this case it's not a problem that the input gradient is somewhat ill-defined, as the derivative of the input nodes concerned will be zero anyway.
//
// * Scenario 2: The input data comes directly from the training data. Here bad gradients don't matter, as we wouldn't want to propagate gradients to the training data.
//
// So, as we don't have a better solution yet and it probably has no impact, let's stay with the current solution.
// Also note that for Clip, Min, Max, and ReLU we have the same kind of problem.
sliceInputGrad.AddCopyIfEqualOf(input, output, sliceOutputGrad);
break;
// more coming
// "LogPlus": softmax
@ -93,18 +126,18 @@ template <class ElemType>
// df / dx_i = 1 / (sum_j exp x_j) * exp x_i = (Softmax(x))_i = exp(x_i - ReduceLogPlus(x))
// targetGradient = gradientFromTop .* Exp (inputValue - outputValue) --TODO: verify
// i.e. compute difference of input and output, then Exp in-place. No, would need temp memory. So needs its own opcode AddScaledExpOfDiff(). Ternary.
// "Max": Copy the gradient only to the max value. targetGradient += gradientFromTop .* (outputValue == inputValue). Needs its own opcode. --TODO : verify
}
}
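
The opLogSum branch relies on the identity d/dx_i log(sum_j exp x_j) = exp(x_i - f), i.e. the softmax of the inputs. A stand-alone sanity check of that identity on plain scalars (no CNTK tensors involved), comparing the analytic gradient against a central finite difference:

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

static double LogSumExp(const std::vector<double>& x)
{
    double m = *std::max_element(x.begin(), x.end()); // subtract the max for numerical stability
    double s = 0;
    for (double v : x)
        s += std::exp(v - m);
    return m + std::log(s);
}

int main()
{
    std::vector<double> x = { 0.5, -1.0, 2.0 };
    const double f = LogSumExp(x);
    const double eps = 1e-6;
    for (size_t i = 0; i < x.size(); ++i)
    {
        double analytic = std::exp(x[i] - f);          // the gradient factor the node propagates
        std::vector<double> xp = x, xm = x;
        xp[i] += eps;
        xm[i] -= eps;
        double numeric = (LogSumExp(xp) - LogSumExp(xm)) / (2 * eps);
        std::printf("i=%zu  analytic=%.6f  numeric=%.6f\n", i, analytic, numeric);
    }
}
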
template <class ElemType>
/*virtual*/ bool ReduceElementsNode<ElemType>::OutputUsedInComputingInputNodesGradients() const /*override*/
{
switch (m_op)
switch (m_reductionOp)
{
case ElementWiseOperator::opSum: return false;
// will be different e.g. for LogPlus, Max, and Min
case ElementWiseOperator::opSum: return false;
case ElementWiseOperator::opLogSum: return true;
case ElementWiseOperator::opMin: return true;
case ElementWiseOperator::opMax: return true;
}
LogicError("Should not get here.");
}
@ -112,25 +145,31 @@ template <class ElemType>
template <class ElemType>
/*virtual*/ bool ReduceElementsNode<ElemType>::InputUsedInComputingInputNodesGradients(size_t inputIndex) const /*override*/
{
switch (m_op)
switch (m_reductionOp)
{
case ElementWiseOperator::opSum: return false;
// will be different for LogPlus, Max, and Min
case ElementWiseOperator::opSum: return false;
case ElementWiseOperator::opLogSum: return true;
case ElementWiseOperator::opMin: return true;
case ElementWiseOperator::opMax: return true;
}
LogicError("Should not get here.");
}
// map the operation specific as a string to an ElementWiseOperator to pass to
// map the operation specified as a string to an ElementWiseOperator value.
template <class ElemType>
void ReduceElementsNode<ElemType>::ValidateOp()
{
#if 1 // legacy with initial experiments, delete this soon
if (m_operation == L"Plus") m_op = ElementWiseOperator::opSum;
if (m_operation == L"Plus") m_reductionOp = ElementWiseOperator::opSum;
else
#endif
if (m_operation == L"Sum") m_op = ElementWiseOperator::opSum;
if (m_operation == L"Sum") m_reductionOp = ElementWiseOperator::opSum;
else if (m_operation == L"LogSum") m_reductionOp = ElementWiseOperator::opLogSum;
else if (m_operation == L"Min") m_reductionOp = ElementWiseOperator::opMin;
else if (m_operation == L"Max") m_reductionOp = ElementWiseOperator::opMax;
// more here
else InvalidArgument("%ls was given an invalid operation code '%ls'. Allowed are: 'Sum'. And a few more soon.", NodeDescription().c_str(), m_operation.c_str());
else InvalidArgument("%ls was given an invalid operation code '%ls'. Allowed are: 'Sum', 'Max', 'Min'.", NodeDescription().c_str(), m_operation.c_str());
}
template <class ElemType>

View file

@ -196,7 +196,7 @@ class ReduceElementsNode : public ComputationNode<ElemType>, public NumInputs<1>
void ValidateOp();
public:
ReduceElementsNode(DEVICEID_TYPE deviceId, const wstring& name, const std::wstring& operation = std::wstring(), int axis = 0) :
Base(deviceId, name), m_operation(operation), m_axis(axis), m_op((ElementWiseOperator)-1/*invalid*/)
Base(deviceId, name), m_operation(operation), m_axis(axis), m_reductionOp((ElementWiseOperator)-1/*invalid*/)
{
if (!m_operation.empty()) // verify validity already here out of courtesy (would otherwise be caught in Validate())
ValidateOp();
@ -220,7 +220,7 @@ public:
private:
int m_axis;
std::wstring m_operation; // the operation as a string, e.g. "Sum", see ValidateOp()
ElementWiseOperator m_op; // the operation mapped to our internal opCode
ElementWiseOperator m_reductionOp; // the reduction operation mapped to our internal opCode
};
// -----------------------------------------------------------------------

View file

@ -28,7 +28,7 @@ TraceNode<ElemType>::TraceNode(const ScriptableObjects::IConfigRecordPtr configp
m_message = (const std::wstring&)configp->Get(L"say");
m_logFirst = configp->Get(L"logFirst");
m_logFrequency = configp->Get(L"logFrequency");
m_logGradientToo = false; // configp->Get(L"logGradientToo"); not yet implemented
m_logGradientToo = configp->Get(L"logGradientToo");
m_formattingOptions = WriteFormattingOptions(*configp);
m_onlyUpToRow = configp->Get(L"onlyUpToRow");
m_onlyUpToT = configp->Get(L"onlyUpToT");
@ -75,7 +75,31 @@ template <class ElemType>
auto result = ValueTensorFor(rank, fr);
auto input = Input(0)->ValueTensorFor(rank, fr);
result.AssignCopyOf(input);
// log the content
// do the tracing
Log(fr, false/*means log value*/);
}
template <class ElemType>
/*virtual*/ void TraceNode<ElemType>::BackpropTo(const size_t inputIndex, const FrameRange& fr) /*override*/
{
assert(inputIndex == 0); inputIndex;
size_t rank = DetermineElementwiseTensorRank();
auto sliceOutputGrad = GradientTensorFor(rank, fr); // propagate from this one...
auto sliceInputGrad = Input(0)->GradientTensorFor(rank, fr); // ...to this one
sliceInputGrad.AddCopyOf(sliceOutputGrad);
// do the tracing
if (m_logGradientToo)
Log(fr, true/*means log gradient*/);
}
// log value or gradient
template <class ElemType>
/*virtual*/ void TraceNode<ElemType>::Log(const FrameRange& fr, bool logGradientInstead) const
{
if (m_numMBsRun == 1)
{
const auto prologue = m_formattingOptions.Processed(NodeName(), m_formattingOptions.prologue, m_numMBsRun);
@ -94,30 +118,18 @@ template <class ElemType>
let timeRange = fr.GetTimeRange();
fprintf(stderr, "------- Trace["); // --- for better visual separability from actual content
if (fr.IsAllFrames())
fprintf(stderr, "*");
else if (timeRange.second == timeRange.first+1)
fprintf(stderr, "%d", (int)timeRange.first);
;
else if (timeRange.second == timeRange.first + 1)
fprintf(stderr, "%d", (int)timeRange.first);
else if (timeRange.second > timeRange.first + 1)
fprintf(stderr, "%d..%d", (int)timeRange.first, (int)timeRange.second-1);
fprintf(stderr, "] %ls --> %s\n", m_message.c_str(), Input(0)->FormatOperationPrototype("").c_str());
fprintf(stderr, "] %ls %s--> %s\n", m_message.c_str(), logGradientInstead ? "(gradient) " : "", Input(0)->FormatOperationPrototype("").c_str());
Input(0)->WriteMinibatchWithFormatting(stderr, fr, m_onlyUpToRow, m_onlyUpToT, m_formattingOptions.transpose, m_formattingOptions.isCategoryLabel, m_formattingOptions.isSparse, m_labelMapping,
sequenceSeparator, sequencePrologue, sequenceEpilogue, elementSeparator, sampleSeparator,
valueFormatString, /*outputGradient=*/false);
valueFormatString, logGradientInstead);
}
}
template <class ElemType>
/*virtual*/ void TraceNode<ElemType>::BackpropTo(const size_t inputIndex, const FrameRange& fr) /*override*/
{
assert(inputIndex == 0); inputIndex;
size_t rank = DetermineElementwiseTensorRank();
auto sliceOutputGrad = GradientTensorFor(rank, fr); // propagate from this one...
auto sliceInputGrad = Input(0)->GradientTensorFor(rank, fr); // ...to this one
sliceInputGrad.AddCopyOf(sliceOutputGrad);
}
template <class ElemType>
/*virtual*/ void TraceNode<ElemType>::Validate(bool isFinalValidationPass) // override
{

View file

@ -47,6 +47,9 @@ public:
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; }
private:
void Log(const FrameRange& fr, bool logGradientInstead) const;
private:
// configuration
std::wstring m_message;

View file

@ -1534,8 +1534,8 @@ template class DropoutNode<float>;
template class DropoutNode<double>;
// -----------------------------------------------------------------------
// BatchNormalizationNode (input, scale, bias, runMean, runInvStdDev, spatial,
// normalizationTimeConstant = 0, blendTimeConstant = 0,
// BatchNormalizationNode (input, scale, bias, runMean, runInvStdDev,
// spatial, normalizationTimeConstant = 0, blendTimeConstant = 0,
// epsilon = 0.00001,
// useCntkEngine = true, imageLayout = 'cudnn')
//
@ -1553,51 +1553,48 @@ template class DropoutNode<double>;
// where gamma and beta are trainable parameters(represented as LearnableParameter).
//
// * input is the input of the batch normalization node
// * scale is a LearnableParameter that stores scale vector(gamma term in the equation above).
// * bias is a LearnableParameter that stores bias vector(beta term). scale and bias must have the same dimensions which must be equal
// * scale is a LearnableParameter that stores scale vector (gamma term in the equation above).
// * bias is a LearnableParameter that stores bias vector (beta term). scale and bias must have the same dimensions which must be equal
// to the input dimensions in case of spatial = false or number of output convolution feature maps in case of spatial = true.
// * runMean is the running mean which is used during evaluation phase and might be used during training as well.
// It is represented as a LearnableParameter with the same dimensions as scale and bias.
// * runInvStdDev is the running inverse square root of variance (so InvStdDev = 1 / sqrt(var + epsilon)).
// It is represented as a LearnableParameter with the same dimensions as scale and bias.
// * spatial is a flag that specifies whether to compute mean / var for each feature in a minibatch independently or, in case of convolutional layers, per feature map.
// TODO: This must be configured in a generic fashion where tensor axes are chosen along which parameters are tied.
// * normalizationTimeConstant is the time constant which is used to compute running average of mean and variance.
// Value 0 (default) means there will be no exponential smoothing and running mean / variance will always have values computed for the last seen mininbatch.
// Value 1#INF (infinity)means running values are "frozen" (i.e.will not be updated).
// Value 0 (default) means there will be no exponential smoothing and running mean/variance will always have values computed for the last seen minibatch.
// Value 1#INF (infinity) means running values are "frozen" (i.e. will not be updated).
// * blendTimeConstant is the time constant which allows specifying how much of the running mean / var should be "blended" into the mean / var of the current minibatch.
// Value 0 (default) means no blending will happen and only the current minibatch statistics will be used.
// Value 1#INF (infinity)means only running mean / var will be used(this is used, for example, in evaluation phase).
// Value 1#INF (infinity) means only running mean / var will be used (this is used, for example, in the evaluation phase).
// * epsilon is a conditioner constant used in computing InvStdDev
// * useCntkEngine is a boolean flag that specifies which batch normalization implementation to use : CNTK or cuDNN - based.
// * imageLayout is the image layout.Only cudnn is supported.
// * useCntkEngine is a boolean flag that specifies which batch normalization implementation to use: CNTK or cuDNN-based.
// * imageLayout is the image layout. Only cudnn is supported at present.
// -----------------------------------------------------------------------
template <class ElemType>
class BatchNormalizationNode : public ComputationNode<ElemType>, public NumInputs<5>
class BatchNormalizationNode : public ComputationNodeNonLooping<ElemType>, public NumInputs<5>, public IFreezable
{
typedef ComputationNode<ElemType> Base;
UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName()
{
return L"BatchNormalization";
}
typedef ComputationNodeNonLooping<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName() { return L"BatchNormalization"; }
public:
BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name), m_spatial(false), m_normTimeConst(0), m_blendTimeConst(0), m_epsilon(0), m_useCntkEngine(true),
BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name) :
Base(deviceId, name), m_spatial(false), m_normTimeConst(0), m_blendTimeConst(0), m_epsilon(0), m_useCntkEngine(true),
m_mbCount(0), m_imageLayoutKind(ImageLayoutKind::CHW)
{
}
BatchNormalizationNode(DEVICEID_TYPE deviceId, const wstring& name, bool spatial, double normalizationTimeConstant, double blendTimeConstant,
double epsilon, bool useCntkEngine, ImageLayoutKind imageLayoutKind)
: Base(deviceId, name), m_spatial(spatial), m_normTimeConst(normalizationTimeConstant), m_blendTimeConst(blendTimeConstant),
m_epsilon(epsilon), m_useCntkEngine(useCntkEngine), m_imageLayoutKind(imageLayoutKind), m_mbCount(0)
double epsilon, bool useCntkEngine, ImageLayoutKind imageLayoutKind) :
Base(deviceId, name), m_spatial(spatial), m_normTimeConst(normalizationTimeConstant), m_blendTimeConst(blendTimeConstant),
m_epsilon(epsilon), m_useCntkEngine(useCntkEngine), m_imageLayoutKind(imageLayoutKind), m_mbCount(0)
{
}
BatchNormalizationNode(const ScriptableObjects::IConfigRecordPtr configp)
: BatchNormalizationNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"spatial"),
configp->Get(L"normalizationTimeConstant"), configp->Get(L"blendTimeConstant"),
configp->Get(L"epsilon"), configp->Get(L"useCntkEngine"),
ImageLayoutKindFrom(configp->Get(L"imageLayout")))
BatchNormalizationNode(const ScriptableObjects::IConfigRecordPtr configp) :
BatchNormalizationNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"spatial"),
configp->Get(L"normalizationTimeConstant"), configp->Get(L"blendTimeConstant"),
configp->Get(L"epsilon"), configp->Get(L"useCntkEngine"),
ImageLayoutKindFrom(configp->Get(L"imageLayout")))
{
AttachInputsFromConfig(configp, this->GetExpectedNumInputs());
}
@ -1689,46 +1686,110 @@ public:
}
}
void BackpropTo(const size_t inputIndex, const FrameRange& fr) override
private: // time-constant conversions
// map time constants to exp avg factor
// This is the factor for the current MB's estimate (1-factor is used for the previous value of the running stats).
double ComputeExpAvgFactor() const
{
// in inference mode, only use long-term mean and do not update running estimates
if (!Environment().IsTraining())
return 0; // (m_normTimeConst == infinity) no new contribution from current minibatch
// REVIEW alexeyk: hack, m_normTimeConst < 0 is used to denote corpus-level statistics (without forgetting factor).
if (m_normTimeConst < 0)
return 1.0 / (1.0 + m_mbCount); // (this is the hack case) TODO: verify this formula; shouldn't we use #samples instead of MB count?
// Convert to per-minibatch factor. The limit, positive infinity, means that running mean/var parameters are "frozen"
// that is, do not require updates.
// The code below special-cases two boundary cases, but those are just the limit cases of the main formula.
double numSamples = (double)GetMBLayout()->GetActualNumSamples();
if (!isfinite(m_normTimeConst)) // infinite
return 0; // no new contribution from current minibatch (infinitely long memory)
else if (m_normTimeConst > 0) // not zero
return 1.0 - exp(-numSamples / m_normTimeConst); // interpolate expAvgFactor * MB stats + (1-expAvgFactor) * prev running stats
else // zero
return 1.0; // don't use running stats at all
}
// map sample count to blend factor
// This is the interpolation weight for the running statistics (the current MB statistics are weighted with 1-this).
double ComputeBlendFactor() const
{
// in inference mode, only use long-term mean and do not update running estimates
if (!Environment().IsTraining())
return 1.0; // (m_blendTimeConst == infinity) estimate is taken 100% from the long-term running estimate
// convert to blend factor (= weight for running stats)
// The code below special-cases two boundary cases, but those are just the limit cases of the main formula.
double numSamples = (double)GetMBLayout()->GetActualNumSamples();
if (!isfinite(m_blendTimeConst)) // infinite weight for prior stats
return 1.0; // only use running statistics
else if (m_blendTimeConst > 0) // not zero
return m_blendTimeConst / (m_blendTimeConst + numSamples); // interpolate blendFactor * running stats + (1-blendFactor) * MB stats
else // zero
return 0; // no weight for prior stats, only use MB stats
}
public:
// Note: This function assumes that inputIndex=0 is called before the others.
// BUGBUG: The node should not make assumptions in which order the inputs' derivates are computed. It currently assumes to start with 0.
// BUGBUG: If the input has no learnables (e.g. using BN instead of corpus mean/var norm), this will not be called for inputIndex=0 at all.
virtual void BackpropToNonLooping(size_t inputIndex) override
{
FrameRange fr(Input(0)->GetMBLayout());
if (inputIndex == 0) // derivative with respect to the input.
{
auto sliceOutputGrad = GradientFor(fr);
auto sliceInputValue = Input(0)->ValueFor(fr);
const Matrix<ElemType>& scale = Input(1)->Value();
const Matrix<ElemType>& bias = Input(2)->Value();
const Matrix<ElemType>& runMean = Input(3)->Value();
const Matrix<ElemType>& runInvStdDev = Input(4)->Value();
auto sliceInputGrad = Input(0)->GradientFor(fr);
m_dScale->Resize(scale);
// The mean used in Forward() is either saveMean or runMean.
// This is decided by the engine, which communicates back the decision by returning
// an empty saveMean in case runMean should be used. Likewise for stddev.
let& actualMean = !m_saveMean->IsEmpty() ? *m_saveMean : runMean; // empty if only the running mean is used
let& actualInvStdDev = !m_saveInvStdDev->IsEmpty() ? *m_saveInvStdDev : runInvStdDev;
m_dScale->Resize(scale); // gradients for scale and bias get stored here
m_dBias->Resize(bias);
double blendFactor = ComputeBlendFactor(); // interpolation weight for the running statistics (the current MB statistics are weighted with 1-this)
// Compute all derivatives in one step. Save derivatives with respect to scale and bias in temp matrices.
m_bnEng->Backward(sliceInputValue, sliceOutputGrad, sliceInputGrad, scale,
*m_saveMean, *m_saveInvStdDev, *m_dScale, *m_dBias);
m_bnEng->Backward(sliceInputValue, sliceOutputGrad, // (in) input from below, gradient from above
sliceInputGrad, // (out) gradient for data input goes here
scale, // (in) out of scale and bias, only scale is needed in gradient propagation
blendFactor, // (in) smoothing weight for running stats (1=use only running stats)
actualMean, actualInvStdDev, // (in) actual mean/stddev values used in ForwardProp()
*m_dScale, *m_dBias); // (out) gradients for scale and bias
}
else if (inputIndex == 1) // derivative with respect to the scale
{
// Derivative with respect to the scale was precomputed during input derivative computation.
Matrix<ElemType>& grad = Input(1)->Gradient();
grad.SetValue(grad.GetNumRows(), grad.GetNumCols(), grad.GetDeviceId(), m_dScale->Data());
// BUGBUG: ^^ This should add the gradient, not overwrite it.
}
else if (inputIndex == 2) // derivative with respect to the bias
{
// Derivative with respect to the bias was precomputed during input derivative computation.
Matrix<ElemType>& grad = Input(2)->Gradient();
grad.SetValue(grad.GetNumRows(), grad.GetNumCols(), grad.GetDeviceId(), m_dBias->Data());
// BUGBUG: ^^ Also here, this should add the gradient, not overwrite it.
}
// No derivatives with respect to running mean and InvStdDev.
}
virtual bool OutputUsedInComputingInputNodesGradients() const override
{
// The BatchNormalizationNode does not require its output value for computing
// the gradients of its input nodes
return false;
}
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
void ForwardProp(const FrameRange& fr) override
virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override
{
FrameRange fr(Input(0)->GetMBLayout());
Matrix<ElemType> sliceInputValue = Input(0)->ValueFor(fr);
const Matrix<ElemType>& scale = Input(1)->Value();
@ -1744,42 +1805,16 @@ public:
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
double expAvgFactor;
double blendFactor;
if (!Environment().IsTraining())
{
expAvgFactor = 0;
blendFactor = 1.0;
// determine the factors from the time constants
double expAvgFactor = ComputeExpAvgFactor(); // weight for the new MB statistics in the running estimate. The previous value of the running statistics is kept with weight (1-this)
double blendFactor = ComputeBlendFactor(); // interpolation weight for the running statistics (the current MB statistics are weighted with 1-this)
m_saveMean->Resize(0, 0);
m_saveInvStdDev->Resize(0, 0);
}
else
{
double numSamples = (double)GetMBLayout()->GetActualNumSamples();
if (m_normTimeConst > 0)
{
// Convert to per-minibatch factor. Treat positivie infinity as if running mean/var parameters are "frozen"
// that is, do not require updates.
expAvgFactor = !isfinite(m_normTimeConst) ? 0 : (1.0 - exp(-numSamples / m_normTimeConst));
}
else
{
// REVIEW alexeyk: hack, m_normTimeConst < 0 is used to compute CMA.
expAvgFactor = (m_normTimeConst < 0) ? (1.0 / (1.0 + m_mbCount)) : 1.0;
}
if (!isfinite(m_blendTimeConst))
blendFactor = 1.0;
else
blendFactor = m_blendTimeConst > 0 ? (m_blendTimeConst / (m_blendTimeConst + numSamples)) : 0;
m_saveMean->Resize(runMean);
m_saveInvStdDev->Resize(runMean);
}
m_bnEng->Forward(sliceInputValue, scale, bias, expAvgFactor, blendFactor, runMean, runInvStdDev,
sliceOutputValue, m_epsilon, *m_saveMean, *m_saveInvStdDev);
m_bnEng->Forward(/*in=*/ sliceInputValue, scale, bias, // (in)
expAvgFactor, blendFactor,
runMean, runInvStdDev, // (in/out) running estimates, updated from the current MB mean/stddev
/*out=*/ sliceOutputValue, // (out) batch-normalized output value
m_epsilon,
*m_saveMean, *m_saveInvStdDev); // (out) actual interpolated mean/stddev values. Note: unused/empty for blendFactor==1 for CNTK engine
m_mbCount++;
}
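// Editorial sketch (not in the original source) of what the engine's Forward() is expected to compute, per the comments above:
//   runMean  <- expAvgFactor * mbMean + (1 - expAvgFactor) * runMean     // running-estimate update
//   usedMean <- blendFactor * runMean + (1 - blendFactor) * mbMean       // statistics actually used (likewise for stddev)
//   y        <- scale .* (x - usedMean) .* usedInvStdDev + bias          // per-element normalization
// saveMean/saveInvStdDev return the "used" statistics so that backprop can reuse them (empty if only running stats were used).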
@ -1820,25 +1855,25 @@ public:
void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override
{
Base::RequestMatricesBeforeForwardProp(matrixPool);
RequestMatrixFromPool(m_saveMean, matrixPool);
RequestMatrixFromPool(m_saveInvStdDev, matrixPool);
}
RequestMatrixFromPool(m_saveMean, matrixPool);
RequestMatrixFromPool(m_saveInvStdDev, matrixPool);
}
void RequestMatricesBeforeBackprop(MatrixPool& matrixPool) override
{
Base::RequestMatricesBeforeBackprop(matrixPool);
RequestMatrixFromPool(m_dScale, matrixPool);
RequestMatrixFromPool(m_dBias, matrixPool);
}
RequestMatrixFromPool(m_dScale, matrixPool);
RequestMatrixFromPool(m_dBias, matrixPool);
}
void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) override
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
ReleaseMatrixToPool(m_saveMean, matrixPool);
ReleaseMatrixToPool(m_saveInvStdDev, matrixPool);
ReleaseMatrixToPool(m_dScale, matrixPool);
ReleaseMatrixToPool(m_dBias, matrixPool);
}
ReleaseMatrixToPool(m_saveMean, matrixPool);
ReleaseMatrixToPool(m_saveInvStdDev, matrixPool);
ReleaseMatrixToPool(m_dScale, matrixPool);
ReleaseMatrixToPool(m_dBias, matrixPool);
}
void SetNormalizationTimeConstants(double normalizationTimeConstant, double prevNormalizationTimeConstant,
double blendTimeConstant, double prevBlendTimeConstant)
@ -1851,6 +1886,20 @@ public:
m_blendTimeConst = blendTimeConstant;
}
// called from CloneFunction(..., parameters="constant")
// Once called, this node is put into inference mode.
virtual void FreezeParameters() override // from IFreezable
{
m_normTimeConst = std::numeric_limits<double>::infinity();
m_blendTimeConst = std::numeric_limits<double>::infinity();
}
double NormalizationTimeConstant() const { return m_normTimeConst; }
double BlendTimeConstant() const { return m_blendTimeConst; }
bool Spatial() const { return m_spatial; }
double Epsilon() const { return m_epsilon; }
bool UseCNTKEngine() const { return m_useCntkEngine; }
private:
// Old versioning - do not use. Do not remove until we're sure there are no old models around.
struct VersionInfo
@ -1865,36 +1914,51 @@ private:
VersionInfo m_version;
private:
// --- configuration parameters
// Determines whether to use per-activation (used after non-convolutional layers like fully connected)
// or spatial (used after convolutional layers).
// TODO: This should not be a config option, but rather inferred from dimensions of the Parameters.
bool m_spatial;
// Time constant for running mean and variance.
// Time constant for estimating the running mean and variance.
// This is the time constant of a low-pass filter.
// If 0, running mean and variance just remember the last minibatch.
// If infinity, running mean and variance are not updated, like in inference mode.
double m_normTimeConst;
// Time constant for blending running mean/var and current minibatch mean/var.
// The main idea is to represent current minibatch statistics as MAP estimate, linear interpolation
// of smoothed and minibatch statistics.
// Equivalent sample count for blending running mean/var and current minibatch mean/var.
// Roughly, this specifies how many samples the running statistics are "worth",
// relative to the current minibatch statistics.
// If 0, only use the current MB statistics. If infinity, use only the running mean, like in inference mode.
// The main idea is to estimate the mean/variance as a MAP estimate using the running mean/var as a prior.
// This should make the method more robust to the case of very small minibatches,
// and also provides a meaningful interpretation of inference mode, where only the prior is used.
// Effectively, this ends up in a linear interpolation of running and minibatch statistics.
// The idea is due to Frank Seide et al.
// It should also work well in data parallelism scenario
// as opposed to plain vanilla BN implementation which would require aggregation of statistics
// from all nodes.
// It should also work well in a data-parallelism scenario, as opposed to a plain vanilla BN implementation,
// which would require aggregation of statistics from all nodes.
// REVIEW alexeyk: if this works, document it properly in Wiki.
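// (Editorial note: with n = #samples in the minibatch, the blended estimate is
//    (m_blendTimeConst * runningStat + n * mbStat) / (m_blendTimeConst + n),
//  which equals blendFactor * runningStat + (1 - blendFactor) * mbStat with
//  blendFactor = m_blendTimeConst / (m_blendTimeConst + n), i.e. a MAP estimate with the running
//  statistics acting as a prior worth m_blendTimeConst samples.)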
double m_blendTimeConst;
// Epsilon used to compute inverse std deviation.
double m_epsilon;
// Whether to use CNTK or cuDNN BN implementation.
bool m_useCntkEngine;
// Layout (e.g. CHW).
ImageLayoutKind m_imageLayoutKind;
// --- working variables
// Minibatch count, used to compute cumulative moving average.
size_t m_mbCount;
// Stores pre-computed on forward pass mean values that are used in gradient computation.
// Interpolated actual mean/stddev values. Pre-computed on forward pass, also used in gradient computation.
shared_ptr<Matrix<ElemType>> m_saveMean;
// Stores pre-computed on forward pass InvStdDev values that are used in gradient computation.
shared_ptr<Matrix<ElemType>> m_saveInvStdDev;
// Stores scale derivatives
// Temp buffer for scale and bias derivatives. Only used in BackpropTo(), carrying info from first call to subsequent calls.
// Not used for blendFactor=1 in CNTK engine.
shared_ptr<Matrix<ElemType>> m_dScale;
// Stores bias derivatives.
shared_ptr<Matrix<ElemType>> m_dBias;
std::unique_ptr<BatchNormEngine<ElemType>> m_bnEng;

View file

@ -321,15 +321,17 @@ void CNTKEvalExtended<ElemType>::ForwardPassT(const std::vector<ValueBuffer<Elem
RuntimeError("Expected %d outputs, but got %d.", (int)m_outputNodes.size(), (int)outputs.size());
size_t i = 0;
for (auto& input : m_inputMatrices)
for (auto& inputNode : m_inputNodes)
{
// const cast: The matrix class takes this over without copying and could theoretically change the contents,
// though it doesn't in this case.
auto& buffer = const_cast<ValueBuffer<ElemType, ValueContainer>&>(inputs[i]);
shared_ptr<Matrix<ElemType>> matrix = dynamic_pointer_cast<Matrix<ElemType>>(input.second.matrix);
auto matrix = dynamic_pointer_cast<Matrix<ElemType>>(inputNode->ValuePtr());
auto type = matrix->GetMatrixType();
size_t numRows = input.second.sampleLayout.GetNumElements();
size_t numRows = inputNode->GetSampleLayout().GetNumElements();
if (buffer.m_buffer.data() == nullptr)
RuntimeError("Input %ls: Buffer is not allocated.", m_inputNodes[i]->GetName().c_str());
if (type == MatrixType::DENSE)
{
if (buffer.m_buffer.size() % numRows != 0)
@ -340,8 +342,12 @@ void CNTKEvalExtended<ElemType>::ForwardPassT(const std::vector<ValueBuffer<Elem
}
else if (type == MatrixType::SPARSE)
{
if (buffer.m_colIndices.data() == nullptr)
RuntimeError("Input %ls: Due to sparse input format, expected colIndices array, but was nullptr.", m_inputNodes[i]->GetName().c_str());
if (buffer.m_indices.data() == nullptr)
RuntimeError("Input %ls: Due to sparse input format, expected Indices array, but was nullptr.", m_inputNodes[i]->GetName().c_str());
if (buffer.m_colIndices.size() < 2)
RuntimeError("Input %ls: Expected at least one element.", m_inputNodes[i]->GetName().c_str());
RuntimeError("Input %ls: Expected at least one element (2 entries in colIndices array).", m_inputNodes[i]->GetName().c_str());
if (buffer.m_colIndices[0] != 0)
RuntimeError("Input %ls: First element of column indices must be 0", m_inputNodes[i]->GetName().c_str());
if (buffer.m_colIndices[buffer.m_colIndices.size() - 1] != buffer.m_indices.size())
@ -352,8 +358,8 @@ void CNTKEvalExtended<ElemType>::ForwardPassT(const std::vector<ValueBuffer<Elem
int numCols = type == MatrixType::DENSE ? buffer.m_buffer.size() / numRows : buffer.m_colIndices.size() - 1;
assert(numCols >= 1);
input.second.pMBLayout->Init(1, numCols);
input.second.pMBLayout->AddSequence(0, 0, 0, numCols);
inputNode->GetMBLayout()->Init(1, numCols);
inputNode->GetMBLayout()->AddSequence(0, 0, 0, numCols);
if (type == MatrixType::DENSE)
matrix->SetValue(numRows, numCols, matrix->GetDeviceId(), buffer.m_buffer.data(), matrixFlagNormal);
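// Editorial illustration (not part of this commit) of the buffer layout the validation above expects,
// for one input with numRows = 3; the field names are the ones used in this function:
//   dense:  m_buffer     = { 1, 2, 3,  4, 5, 6 }   // numCols = m_buffer.size() / numRows = 2
//   sparse (CSC), same 3x2 content with two nonzeros (1 at row 0/col 0, 6 at row 2/col 1):
//           m_buffer     = { 1, 6 }                // nonzero values
//           m_indices    = { 0, 2 }                // row index of each nonzero
//           m_colIndices = { 0, 1, 2 }             // numCols + 1 entries; starts at 0, last equals m_indices.size()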

View file

@ -14,6 +14,11 @@
#include <msclr\marshal_cppstd.h>
#include "CNTKException.h"
#pragma warning(push)
#pragma warning(disable : 4793) // Function compiled as native
#include "Basics.h"
#include "ScriptableObjects.h"
#pragma warning(pop)
#include "EvalCommon.h"
#include "Eval.h"
@ -250,7 +255,14 @@ public:
outputNodeNames.push_back(context.marshal_as<std::wstring>(output));
}
m_eval->StartForwardEvaluation(outputNodeNames);
try
{
m_eval->StartForwardEvaluation(outputNodeNames);
}
catch (const exception& ex)
{
throw GetCustomException(ex);
}
}
//
@ -354,6 +366,11 @@ private:
{
return gcnew CNTKBadAllocException(gcnew System::String(ex.what()));
}
else if (dynamic_cast<const ScriptableObjects::ScriptingException*>(&ex) != nullptr) // Includes derived classes
{
const auto& err = dynamic_cast<const ScriptableObjects::ScriptingException&>(ex);
return gcnew CNTKLogicErrorException(gcnew System::String(wstrprintf(L"%ls\n%ls", utf16(err.what()).c_str(), err.GetError(L"").c_str()).c_str()), nullptr);
}
else
{
return gcnew CNTKException(gcnew System::String(ex.what()));

View file

@ -56,6 +56,8 @@
</ClCompile>
<Link>
<AdditionalLibraryDirectories>$(OutDir)</AdditionalLibraryDirectories>
<AdditionalDependencies>EvalDLL.lib;Common.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<DelayLoadDLLs>EvalDll.dll</DelayLoadDLLs>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(DebugBuild)">
@ -66,10 +68,6 @@
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<DelayLoadDLLs>
</DelayLoadDLLs>
<AdditionalDependencies Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;EvalDLL.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies Condition="'$(Configuration)|$(Platform)'=='Debug_CpuOnly|x64'">kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;EvalDLL.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(ReleaseBuild)">
@ -79,10 +77,6 @@
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<DelayLoadDLLs>
</DelayLoadDLLs>
<AdditionalDependencies Condition="'$(Configuration)|$(Platform)'=='Release|x64'">kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;EvalDLL.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies Condition="'$(Configuration)|$(Platform)'=='Release_CpuOnly|x64'">kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;EvalDLL.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemGroup>

View file

@ -25,8 +25,6 @@ void BatchNormEngine<ElemType>::Forward(const Mat& in, const Mat& scale, const M
assert(m_inOutT.GetNumElements() == bias.GetNumRows());
assert(m_inOutT.GetNumElements() == runMean.GetNumRows());
assert(m_inOutT.GetNumElements() == runInvStdDev.GetNumRows());
assert(saveMean.GetNumElements() == 0 || m_inOutT.GetNumElements() == saveMean.GetNumRows());
assert(saveInvStdDev.GetNumElements() == 0 || m_inOutT.GetNumElements() == saveInvStdDev.GetNumRows());
}
else
{
@ -34,26 +32,35 @@ void BatchNormEngine<ElemType>::Forward(const Mat& in, const Mat& scale, const M
assert((m_inOutT.GetNumElements() % bias.GetNumRows()) == 0);
assert((m_inOutT.GetNumElements() % runMean.GetNumRows()) == 0);
assert((m_inOutT.GetNumElements() % runInvStdDev.GetNumRows()) == 0);
assert(saveMean.GetNumElements() == 0 || (m_inOutT.GetNumElements() % saveMean.GetNumRows()) == 0);
assert(saveInvStdDev.GetNumElements() == 0 || (m_inOutT.GetNumElements() % saveInvStdDev.GetNumRows()) == 0);
}
assert(scale.GetNumCols() == 1);
assert(bias.GetNumCols() == 1);
assert(runMean.GetNumCols() == 1);
assert(runInvStdDev.GetNumCols() == 1);
assert(saveMean.GetNumElements() == 0 || saveMean.GetNumCols() == 1);
assert(saveInvStdDev.GetNumElements() == 0 || saveInvStdDev.GetNumCols() == 1);
EnsureCompatible();
ForwardCore(in, scale, bias, expAvgFactor, blendFactor, runMean, runInvStdDev, out, epsilon, saveMean, saveInvStdDev);
if (!m_spatial)
{
assert(saveMean.GetNumElements() == 0 || m_inOutT.GetNumElements() == saveMean.GetNumRows());
assert(saveInvStdDev.GetNumElements() == 0 || m_inOutT.GetNumElements() == saveInvStdDev.GetNumRows());
}
else
{
assert(saveMean.GetNumElements() == 0 || (m_inOutT.GetNumElements() % saveMean.GetNumRows()) == 0);
assert(saveInvStdDev.GetNumElements() == 0 || (m_inOutT.GetNumElements() % saveInvStdDev.GetNumRows()) == 0);
}
assert(saveMean.GetNumElements() == 0 || saveMean.GetNumCols() == 1);
assert(saveInvStdDev.GetNumElements() == 0 || saveInvStdDev.GetNumCols() == 1);
}
template <class ElemType>
void BatchNormEngine<ElemType>::Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale,
void BatchNormEngine<ElemType>::Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor,
const Mat& saveMean, const Mat& saveInvStdDev, Mat& scaleGrad, Mat& biasGrad)
{
EnsureCompatible();
BackwardCore(in, srcGrad, grad, scale, saveMean, saveInvStdDev, scaleGrad, biasGrad);
BackwardCore(in, srcGrad, grad, scale, blendFactor, saveMean, saveInvStdDev, scaleGrad, biasGrad);
}
template <class ElemType>
@ -88,10 +95,10 @@ protected:
in.BatchNormalizationForward(scale, bias, expAvgFactor, blendFactor, runMean, runInvStdDev, out, epsilon, saveMean, saveInvStdDev);
}
void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, const Mat& saveMean, const Mat& saveInvStdDev,
void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev,
Mat& scaleGrad, Mat& biasGrad) override
{
srcGrad.BatchNormalizationBackward(in, grad, scale, saveMean, saveInvStdDev, scaleGrad, biasGrad);
srcGrad.BatchNormalizationBackward(in, grad, scale, blendFactor, saveMean, saveInvStdDev, scaleGrad, biasGrad);
}
};
@ -128,4 +135,4 @@ std::unique_ptr<BatchNormEngine<ElemType>> BatchNormEngine<ElemType>::Create(DEV
template class BatchNormEngine<float>;
template class BatchNormEngine<double>;
} } }
}}}

View file

@ -37,7 +37,7 @@ public:
void Forward(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runInvStdDev,
Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev);
void Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, const Mat& saveMean, const Mat& saveInvStdDev,
void Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev,
Mat& scaleGrad, Mat& biasGrad);
static std::unique_ptr<BatchNormEngine<ElemType>> Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
@ -55,10 +55,11 @@ protected:
virtual void EnsureCompatible() = 0;
// saveMean/saveInvStdDev return the actual mean/stddev used for normalization, except for blendFactor=1, these are unused and untouched
virtual void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runInvStdDev,
Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) = 0;
virtual void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, const Mat& saveMean, const Mat& saveInvStdDev,
virtual void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev,
Mat& scaleGrad, Mat& biasGrad) = 0;
protected:
@ -70,4 +71,4 @@ protected:
#pragma warning(pop)
} } }
}}}

View file

@ -9,6 +9,7 @@
#include <emmintrin.h>
#include <tmmintrin.h>
#include <immintrin.h>
#include <smmintrin.h>
#include <assert.h>
#include <cstdint>
#include <iostream>

View file

@ -32,8 +32,10 @@
#include <vld.h>
#endif
#pragma warning(disable : 4100) // unreferenced formal parameter; "struct TensorOpReduction<ElemType, OPFN, typename ReductionOp, N, -1>" triggers this
#pragma warning(disable : 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this
#pragma warning(disable : 4702) // unreachable code; triggered for unknown reasons
#pragma warning(disable : 4244) // conversion from 'double' to 'float', possible loss of data
#pragma warning(disable : 4702) // unreachable code; triggered for unknown reasons
#ifdef USE_ACML
// Download ACML 5.3.1 (e.g., acml5.3.1-ifort64.exe) or above
@ -4418,13 +4420,16 @@ void CPUMatrix<ElemType>::BatchNormalizationForward(const CPUMatrix<ElemType>& s
CPUMatrix<ElemType>& runMean, CPUMatrix<ElemType>& runInvStdDev, CPUMatrix<ElemType>& out, double epsilon,
CPUMatrix<ElemType>& saveMean, CPUMatrix<ElemType>& saveInvStdDev) const
{
UNUSED(epsilon); UNUSED(saveMean); UNUSED(saveInvStdDev);
UNUSED(epsilon);
assert((GetNumRows() % scale.GetNumRows()) == 0);
if (expAvgFactor != 0 || blendFactor != 1)
RuntimeError("Batch normalization training on CPU is not yet implemented.");
saveMean.Resize(0, 0); // only doing inference: these two are not produced
saveInvStdDev.Resize(0, 0);
bool spatial = GetNumRows() != scale.GetNumRows();
if (spatial)
{
@ -4453,10 +4458,11 @@ void CPUMatrix<ElemType>::BatchNormalizationForward(const CPUMatrix<ElemType>& s
}
template <class ElemType>
void CPUMatrix<ElemType>::BatchNormalizationBackward(const CPUMatrix<ElemType>& in, CPUMatrix<ElemType>& grad, const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& saveMean, const CPUMatrix<ElemType>& saveInvStdDev,
void CPUMatrix<ElemType>::BatchNormalizationBackward(const CPUMatrix<ElemType>& in, CPUMatrix<ElemType>& grad, const CPUMatrix<ElemType>& scale, double blendFactor,
const CPUMatrix<ElemType>& saveMean, const CPUMatrix<ElemType>& saveInvStdDev,
CPUMatrix<ElemType>& scaleGrad, CPUMatrix<ElemType>& biasGrad) const
{
UNUSED(in); UNUSED(grad); UNUSED(scale); UNUSED(saveMean); UNUSED(saveInvStdDev); UNUSED(scaleGrad); UNUSED(biasGrad);
UNUSED(in); UNUSED(grad); UNUSED(scale); UNUSED(blendFactor); UNUSED(saveMean); UNUSED(saveInvStdDev); UNUSED(scaleGrad); UNUSED(biasGrad);
RuntimeError("Batch normalization training on CPU is not yet implemented.");
}
@ -6042,35 +6048,38 @@ int CPUMatrix<ElemType>::SetNumThreads(int numThreads)
// perform loop over reduction index m
// This function is declared inside a wrapper struct to allow partial specialization (m = -1).
template <class ElemType, typename OPFN, size_t N, int m>
template <class ElemType, typename OPFN, typename ReductionOp, size_t N, int m>
struct TensorOpReduction
{
// reduction case (non-reduction case is specialized)
static inline ElemType Loop(array<ElemType*, N> pointers, const OPFN& opfn,
static inline ElemType Loop(array<ElemType*, N> pointers, const OPFN& opfn, const ReductionOp& reductionOp,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{
array<ptrdiff_t, N - 1> strides; // N-1 because last one is the result pointer, which is unused in reduction
for (size_t i = 0; i < N - 1; i++) // N = a small constant, this will be unrolled
strides[i] = reducingStrides[i][(size_t) m];
double /*ElemType*/ aggregate = 0;
for (size_t dim = reducingOpDims[(size_t) m]; dim-- > 0;)
double aggregate = TensorOpReduction<ElemType, OPFN, ReductionOp, N, m - 1>::Loop(pointers, opfn, reductionOp, reducingOpDims, reducingStrides);
for (size_t dim = reducingOpDims[(size_t)m] - 1; dim-- > 0;)
{
// need to descend into one loop deeper
aggregate += TensorOpReduction<ElemType, OPFN, N, m - 1>::Loop(pointers, opfn, reducingOpDims, reducingStrides);
// advance the pointers
for (size_t i = 0; i < N - 1; i++)
pointers[i] += strides[i]; // note: last pointer (result) is unused and untouched here
// need to descend into one loop deeper
aggregate = reductionOp(aggregate, TensorOpReduction<ElemType, OPFN, ReductionOp, N, m - 1>::Loop(pointers, opfn, reductionOp, reducingOpDims, reducingStrides));
}
return (ElemType) aggregate;
// Actually it would be nicer to return double, but we keep ElemType so that tests don't return different numbers than the previous implementation.
return static_cast<ElemType>(aggregate);
}
};
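// Editorial illustration (not part of this commit): the recursion above is the strided, N-operand generalization
// of the following scalar reduction over a single dimension. 'ReduceOneDimExample' is a hypothetical helper that
// simplifies the real signature (here opfn takes a single pointer instead of an array of pointers); 'reductionOp'
// is any associative pairwise op (sum, log-sum, min, max). Assumes dim >= 1, like the rewritten loop above.
template <class ElemType, typename OPFN, typename ReductionOp>
static ElemType ReduceOneDimExample(const ElemType* p, ptrdiff_t stride, size_t dim, const OPFN& opfn, const ReductionOp& reductionOp)
{
    double aggregate = opfn(p);                                    // seed the aggregate with the first element
    for (size_t i = 1; i < dim; i++)                               // fold the remaining elements pairwise
        aggregate = reductionOp(aggregate, opfn(p + i * stride));
    return static_cast<ElemType>(aggregate);
}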
// perform loop over reduction index m
// This is the specialized version for m = -1, which terminates the recursion.
template <class ElemType, typename OPFN, size_t N>
struct TensorOpReduction<ElemType, OPFN, N, -1>
template <class ElemType, typename OPFN, typename ReductionOp, size_t N>
struct TensorOpReduction<ElemType, OPFN, ReductionOp, N, -1>
{
static inline ElemType Loop(array<ElemType*, N> pointers, const OPFN& opfn,
static inline ElemType Loop(array<ElemType*, N> pointers, const OPFN& opfn, const ReductionOp& reductionOp,
const SmallVector<size_t>&, const array<SmallVector<ptrdiff_t>, N>&)
{
return opfn(pointers); // finally we are doing some work!!!
@ -6082,10 +6091,10 @@ struct TensorOpReduction<ElemType, OPFN, N, -1>
// -----------------------------------------------------------------------
// perform loop over regular index k and reducing index m for N operands (counting the output)
template <class ElemType, typename OPFN, size_t N, bool vectorizable, int m, int k>
template <class ElemType, typename OPFN, typename ReductionOp, size_t N, bool vectorizable, int m, int k>
struct TensorOpIteration
{
static inline void Loop(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn,
static inline void Loop(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{
@ -6096,7 +6105,7 @@ struct TensorOpIteration
for (size_t dim = regularOpDims[(size_t) k]; dim-- > 0;)
{
// need to descend into one loop deeper
TensorOpIteration<ElemType, OPFN, N, vectorizable, m, k - 1>::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
TensorOpIteration<ElemType, OPFN, ReductionOp, N, vectorizable, m, k - 1>::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
// advance the pointers
for (size_t i = 0; i < N; i++)
pointers[i] += strides[i];
@ -6106,10 +6115,10 @@ struct TensorOpIteration
// Special version for innermost loop with strides all being 1 and no further reduction. Compiler can use SSE.
// This is a very common case, e.g. adding vectors or computing the Sigmoid.
template <class ElemType, typename OPFN>
struct TensorOpIteration<ElemType, OPFN, 3, true /*vectorizable*/, -1 /*no reduction*/, 0 /*innermost loop*/>
template <class ElemType, typename OPFN, typename ReductionOp>
struct TensorOpIteration<ElemType, OPFN, ReductionOp, 3, true /*vectorizable*/, -1 /*no reduction*/, 0 /*innermost loop*/>
{
static inline void Loop(ElemType beta, array<ElemType*, 3> pointers, ElemType alpha, const OPFN& opfn,
static inline void Loop(ElemType beta, array<ElemType*, 3> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides)
{
@ -6121,25 +6130,25 @@ struct TensorOpIteration<ElemType, OPFN, 3, true /*vectorizable*/, -1 /*no reduc
if (beta != 0)
#pragma omp parallel for
for (int k = 0; k < (int) K; k++)
TensorOpIteration<ElemType, OPFN, 3, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(beta, array<ElemType*, 3>{pa + k, pb + k, pc + k}, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
TensorOpIteration<ElemType, OPFN, ReductionOp, 3, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(beta, array<ElemType*, 3>{pa + k, pb + k, pc + k}, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
else if (alpha != 1)
#pragma omp parallel for
for (int k = 0; k < (int) K; k++)
TensorOpIteration<ElemType, OPFN, 3, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 3>{pa + k, pb + k, pc + k}, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
TensorOpIteration<ElemType, OPFN, ReductionOp, 3, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 3>{pa + k, pb + k, pc + k}, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
else
#pragma omp parallel for
for (int k = 0; k < (int) K; k++)
TensorOpIteration<ElemType, OPFN, 3, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 3>{pa + k, pb + k, pc + k}, 1, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
TensorOpIteration<ElemType, OPFN, ReductionOp, 3, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 3>{pa + k, pb + k, pc + k}, 1, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
// TODO: According to Amit, the VS compiler is not able to vectorize into lambdas. Solution: change the lambda to take an N, or to implement the loop inside (with 1 element by default).
// TODO: The signedness of k (required for omp) causes an extra sign-extend.
// TODO: OMP adds LOTS of overhead. Do we need a guard, a min size when to use it?
}
};
// and unary
template <class ElemType, typename OPFN>
struct TensorOpIteration<ElemType, OPFN, 2, true /*vectorizable*/, -1 /*no reduction*/, 0 /*innermost loop*/>
template <class ElemType, typename OPFN, typename ReductionOp>
struct TensorOpIteration<ElemType, OPFN, ReductionOp, 2, true /*vectorizable*/, -1 /*no reduction*/, 0 /*innermost loop*/>
{
static inline void Loop(ElemType beta, array<ElemType*, 2> pointers, ElemType alpha, const OPFN& opfn,
static inline void Loop(ElemType beta, array<ElemType*, 2> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
{
@ -6150,27 +6159,27 @@ struct TensorOpIteration<ElemType, OPFN, 2, true /*vectorizable*/, -1 /*no reduc
if (beta != 0)
#pragma omp parallel for
for (int k = 0; k < (int) K; k++)
TensorOpIteration<ElemType, OPFN, 2, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(beta, array<ElemType*, 2>{pa + k, pb + k}, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
TensorOpIteration<ElemType, OPFN, ReductionOp, 2, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(beta, array<ElemType*, 2>{pa + k, pb + k}, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
else if (alpha != 1)
#pragma omp parallel for
for (int k = 0; k < (int) K; k++)
TensorOpIteration<ElemType, OPFN, 2, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 2>{pa + k, pb + k}, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
TensorOpIteration<ElemType, OPFN, ReductionOp, 2, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 2>{pa + k, pb + k}, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
else
#pragma omp parallel for
for (int k = 0; k < (int) K; k++)
TensorOpIteration<ElemType, OPFN, 2, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 2>{pa + k, pb + k}, 1, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
TensorOpIteration<ElemType, OPFN, ReductionOp, 2, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 2>{pa + k, pb + k}, 1, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
};
template <class ElemType, typename OPFN, size_t N, bool vectorizable, int m>
struct TensorOpIteration<ElemType, OPFN, N, vectorizable, m, -1>
template <class ElemType, typename OPFN, typename ReductionOp, size_t N, bool vectorizable, int m>
struct TensorOpIteration<ElemType, OPFN, ReductionOp, N, vectorizable, m, -1>
{
static inline void Loop(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn,
static inline void Loop(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
const SmallVector<size_t>&, const array<SmallVector<ptrdiff_t>, N>&,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{
// we are at element level for the result: perform the op (there may still be reduction)
ElemType val = TensorOpReduction<ElemType, OPFN, N, m>::Loop(pointers, opfn, reducingOpDims, reducingStrides);
ElemType val = TensorOpReduction<ElemType, OPFN, ReductionOp, N, m>::Loop(pointers, opfn, reductionOp, reducingOpDims, reducingStrides);
// scale
val *= alpha;
// combine with previous value in target matrix, then write it out
@ -6188,8 +6197,8 @@ struct TensorOpIteration<ElemType, OPFN, N, vectorizable, m, -1>
// -----------------------------------------------------------------------
// tensor operation with k+1 dimensions (-1 means scalar)
template <class ElemType, typename OPFN, size_t N, int k>
static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N>& pointers, ElemType alpha, const OPFN& opfn,
template <class ElemType, typename OPFN, typename ReductionOp, size_t N, int k>
static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N>& pointers, ElemType alpha, const OPFN& opfn, ReductionOp reductionOp,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{
@ -6197,9 +6206,9 @@ static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N>& po
switch (dims)
{
case 2:
return TensorOpIteration<ElemType, OPFN, N, false /*vectorizable*/, 1, k>::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpIteration<ElemType, OPFN, ReductionOp, N, false /*vectorizable*/, 1, k>::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 1:
return TensorOpIteration<ElemType, OPFN, N, false /*vectorizable*/, 0, k>::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpIteration<ElemType, OPFN, ReductionOp, N, false /*vectorizable*/, 0, k>::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 0:
{
// if all leading dimensions are 1, we can let the compiler do some unrolling
@ -6207,9 +6216,9 @@ static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N>& po
for (size_t i = 0; i < N; i++)
leadingAllOne &= k >= 0 && regularStrides[i][0] == 1;
if (leadingAllOne) // special version that uses a hard-coded increment of 1 for all leading dimensions
return TensorOpIteration<ElemType, OPFN, N, true /*vectorizable*/, -1, k>::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpIteration<ElemType, OPFN, ReductionOp, N, true /*vectorizable*/, -1, k>::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
else
return TensorOpIteration<ElemType, OPFN, N, false /*vectorizable*/, -1, k>::Loop(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpIteration<ElemType, OPFN, ReductionOp, N, false /*vectorizable*/, -1, k>::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
default:
LogicError("TensorOp: %d non-flattened reduction dimensions are not supported.", (int) dims);
@ -6218,11 +6227,11 @@ static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N>& po
// tensor operation, generalized in number of arguments, operation already provided as a lambda
// This function now expands into different k.
template <class ElemType, typename OPFN, size_t N>
static void TensorOpWithFn(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn,
const array<size_t, N>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
template <class ElemType, typename OPFN, typename ReductionOp, size_t N>
static void TensorOpWithFnAndReduction(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
const array<size_t, N>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{
for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled
pointers[i] += offsets[i];
@ -6230,17 +6239,50 @@ static void TensorOpWithFn(ElemType beta, array<ElemType*, N> pointers, ElemType
switch (dims)
{
case 4:
return TensorOpWithRegularLoop<ElemType, OPFN, N, 3>(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 3>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 3:
return TensorOpWithRegularLoop<ElemType, OPFN, N, 2>(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 2>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 2:
return TensorOpWithRegularLoop<ElemType, OPFN, N, 1>(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 1>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 1:
return TensorOpWithRegularLoop<ElemType, OPFN, N, 0>(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 0>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 0:
return TensorOpWithRegularLoop<ElemType, OPFN, N, -1>(beta, pointers, alpha, opfn, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, -1>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
default:
LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (int) dims);
LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (int)dims);
}
}
// tensor operation, generalized in number of arguments, operation already provided as a lambda
// This function now expands into different reductionOps
template <class ElemType, typename OPFN, size_t N>
static void TensorOpWithFn(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn, ElementWiseOperator reductionOp,
const array<size_t, N>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{
// BUGBUG: Always using 'double' as the aggregator type, even for ElemType==float. Reason: otherwise some e2e tests would fail, as historically we
// used double as the aggregator for sum. But:
// * For min and max reductions this is meaningless.
// * It is not consistent with what we do on the GPU, where we aggregate on ElemType.
// * It costs performance.
// TODO: adapt e2e tests to run with an aggregator of type ElemType.
#define CaseTensorOpWithFnAndReduction(oper) \
case ElementWiseOperator::op##oper: \
return TensorOpWithFnAndReduction(beta, pointers, alpha, opfn, [](double a, double b) \
{ \
return Op##oper(a, b); \
}, \
offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
switch (reductionOp)
{
CaseTensorOpWithFnAndReduction(Sum);
CaseTensorOpWithFnAndReduction(LogSum);
CaseTensorOpWithFnAndReduction(Min);
CaseTensorOpWithFnAndReduction(Max);
default:
LogicError("Specified ElementWiseOperator op %d not suported as reduction operation.", (int)reductionOp);
}
}
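// Editorial illustration (not part of this commit): the pairwise reduction ops dispatched above are assumed to
// have roughly the following shape (the real definitions are CNTK's Op* functions and may differ in detail,
// e.g. in how log-sum handles -infinity):
//   OpSum(a, b)    -> a + b
//   OpMin(a, b)    -> min(a, b)
//   OpMax(a, b)    -> max(a, b)
//   OpLogSum(a, b) -> log(exp(a) + exp(b)), computed stably as max(a, b) + log1p(exp(-|a - b|))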
@ -6256,8 +6298,11 @@ void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
{
if (reductionOp != ElementWiseOperator::opSum) // TODO: enable the reduction ops
InvalidArgument("TensorOp: Unary reduction operations other than opSum not yet implemented.");
if (reductionOp != ElementWiseOperator::opSum &&
reductionOp != ElementWiseOperator::opLogSum &&
reductionOp != ElementWiseOperator::opMin &&
reductionOp != ElementWiseOperator::opMax)
InvalidArgument("TensorOp: Unary reduction operations other than opMax, opMin, opSum, and opLogSum are not implemented.");
// TODO: Change the lambda to take a pointer and a number of elements, so that we can pass it 1 or 4 elements, in order for it to SSE-vectorize.
#define CaseUnaryTensorOp(oper) \
@ -6266,7 +6311,7 @@ void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a,
{ \
return Op##oper((*(pp[0]))); \
}, \
offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
array<ElemType*, 2> pointers = {a.Data(), Data()};
switch (op)
@ -6294,7 +6339,7 @@ void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a,
{ \
return Op##oper((*(pp[0])), (*(pp[1]))); \
}, \
offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
array<ElemType*, 3> pointers = {a.Data(), b.Data(), Data()};
switch (op)
@ -6322,7 +6367,7 @@ void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a,
{ \
return Op##oper((*(pp[0])), (*(pp[1])), (*(pp[2]))); \
}, \
offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
array<ElemType*, 4> pointers = {a.Data(), b.Data(), c.Data(), Data()};
switch (op)
@ -6359,11 +6404,33 @@ template void CPUMatrix<char>::SetValue(CPUMatrix<char> const&);
template void CPUMatrix<char>::RequireSize(const size_t numRows, const size_t numCols, bool growOnly);
template void CPUMatrix<char>::Resize(const size_t numRows, const size_t numCols, bool growOnly);
template char* CPUMatrix<char>::CopyToArray(void) const;
template void CPUMatrix<char>::CopySection(size_t numRows, size_t numCols, char* dst, size_t colStride) const;
template void CPUMatrix<char>::Reshape(const size_t, const size_t);
// Support <short>
template CPUMatrix<short>::CPUMatrix(const size_t numRows, const size_t numCols);
template CPUMatrix<short>::CPUMatrix(const size_t numRows, const size_t numCols, short* pArray, const size_t matrixFlags);
template CPUMatrix<short>::CPUMatrix();
template CPUMatrix<short>::CPUMatrix(CPUMatrix<short> const&);
template CPUMatrix<short>::CPUMatrix(CPUMatrix<short>&&);
template size_t CPUMatrix<short>::LocateElement(size_t, size_t) const;
template CPUMatrix<short>::~CPUMatrix();
template CPUMatrix<short> CPUMatrix<short>::ColumnSlice(size_t startColumn, size_t numCols) const;
template CPUMatrix<short>& CPUMatrix<short>::operator=(CPUMatrix<short>&&);
template void CPUMatrix<short>::SetValue(const short);
template void CPUMatrix<short>::SetValue(const size_t numRows, const size_t numCols, short* pArray, size_t matrixFlags);
template void CPUMatrix<short>::SetValue(CPUMatrix<short> const&);
//template void CPUMatrix<short>::SetValue(GPUMatrix<short> const&);
//template void CPUMatrix<short>::SetValue(CPUSparseMatrix<short> const&);
//template void CPUMatrix<short>::SetValue(GPUSparseMatrix<short> const&);
template void CPUMatrix<short>::RequireSize(const size_t numRows, const size_t numCols, bool growOnly);
template void CPUMatrix<short>::Resize(const size_t numRows, const size_t numCols, bool growOnly);
template short* CPUMatrix<short>::CopyToArray(void) const;
template void CPUMatrix<short>::CopySection(size_t numRows, size_t numCols, short* dst, size_t colStride) const;
template void CPUMatrix<short>::Reshape(const size_t, const size_t);
template CPUMatrix<int>::CPUMatrix(const size_t, const size_t, int*, const size_t);
template CPUMatrix<int>::~CPUMatrix();
}}}

View file

@ -375,7 +375,7 @@ public:
void BatchNormalizationForward(const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor, CPUMatrix<ElemType>& runMean, CPUMatrix<ElemType>& runInvStdDev,
CPUMatrix<ElemType>& out, double epsilon, CPUMatrix<ElemType>& saveMean, CPUMatrix<ElemType>& saveInvStdDev) const;
void BatchNormalizationBackward(const CPUMatrix<ElemType>& in, CPUMatrix<ElemType>& grad, const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& saveMean, const CPUMatrix<ElemType>& saveInvStdDev,
void BatchNormalizationBackward(const CPUMatrix<ElemType>& in, CPUMatrix<ElemType>& grad, const CPUMatrix<ElemType>& scale, double blendFactor, const CPUMatrix<ElemType>& saveMean, const CPUMatrix<ElemType>& saveInvStdDev,
CPUMatrix<ElemType>& scaleGrad, CPUMatrix<ElemType>& biasGrad) const;
public:

View file

@ -781,6 +781,7 @@ void CPUSparseMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPU
}
}
// TODO: Implement CSR as a transposition of b, like we do for GPU.
if (rhs.GetFormat() != matrixFormatSparseCSC)
NOT_IMPLEMENTED;
@ -820,13 +821,42 @@ void CPUSparseMatrix<ElemType>::MultiplyAndWeightedAdd(ElemType alpha, const CPU
}
}
}
// the transposeA case is copy-paste from above with rows/cols of lhs swapped
else if (transposeA && !transposeB)
{
NOT_IMPLEMENTED;
for (size_t j = 0; j < rhs.GetNumCols(); j++)
{
size_t start = rhs.SecondaryIndexLocation()[j]; // ColLocation
size_t end = rhs.SecondaryIndexLocation()[j + 1];
for (size_t p = start; p < end; p++)
{
size_t i = rhs.MajorIndexLocation()[p]; // RowLocation
ElemType val = rhs.Buffer()[p];
for (size_t h = 0; h < lhs.GetNumCols(); h++)
{
c(h, j) += alpha * lhs(i, h) * val;
}
}
}
}
else
else if (transposeA && transposeB)
{
NOT_IMPLEMENTED;
for (size_t j = 0; j < rhs.GetNumCols(); j++)
{
size_t start = rhs.SecondaryIndexLocation()[j];
size_t end = rhs.SecondaryIndexLocation()[j + 1];
for (size_t p = start; p < end; p++)
{
size_t i = rhs.MajorIndexLocation()[p];
ElemType val = rhs.Buffer()[p];
for (size_t h = 0; h < lhs.GetNumCols(); h++)
{
c(h, i) += alpha * lhs(j, h) * val;
}
}
}
}
}
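// Editorial note (not part of this commit): both new branches above rely on the CSC layout of 'rhs':
// SecondaryIndexLocation() holds the column pointers (GetNumCols() + 1 entries), so the nonzeros of column j
// live at positions [SecondaryIndexLocation()[j], SecondaryIndexLocation()[j + 1]) of Buffer(), and
// MajorIndexLocation()[p] gives the row index of the nonzero stored at position p.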
@ -1475,6 +1505,29 @@ template CPUMatrix<char> CPUSparseMatrix<char>::CopyColumnSliceToDense(size_t st
template void CPUSparseMatrix<char>::AssignColumnSliceToDense(CPUMatrix<char>&, size_t startColumn, size_t numCols) const;
template CPUSparseMatrix<char>& CPUSparseMatrix<char>::operator=(const CPUSparseMatrix<char>& deepCopyFrom);
// Support <short>
template CPUSparseMatrix<short>::CPUSparseMatrix(const MatrixFormat format, const size_t numRows, const size_t numCols, const size_t size);
template CPUSparseMatrix<short>::CPUSparseMatrix(MatrixFormat);
template CPUSparseMatrix<short>::CPUSparseMatrix(CPUSparseMatrix<short> const&);
template CPUSparseMatrix<short>::CPUSparseMatrix(CPUSparseMatrix<short>&&);
template CPUSparseMatrix<short>& CPUSparseMatrix<short>::operator=(CPUSparseMatrix<short>&& moveFrom);
template void CPUSparseMatrix<short>::SetValue(size_t, size_t, short);
//template void CPUSparseMatrix<short>::SetValue(CPUMatrix<short> const&);
//template void CPUSparseMatrix<short>::SetValue(GPUMatrix<short> const&);
template void CPUSparseMatrix<short>::SetValue(CPUSparseMatrix<short> const&);
//template void CPUSparseMatrix<short>::SetValue(GPUSparseMatrix<short> const&);
template short* CPUSparseMatrix<short>::Data() const;
template short* CPUSparseMatrix<short>::Data();
template void CPUSparseMatrix<short>::Reset(void);
template void CPUSparseMatrix<short>::Resize(const size_t, const size_t, const size_t, const bool);
template void CPUSparseMatrix<short>::RequireSizeAndAllocate(const size_t, const size_t, const size_t, const bool, bool);
template void CPUSparseMatrix<short>::RequireSizeAndAllocate(const size_t, const size_t, const size_t, const MatrixFormat, const bool, bool);
template CPUSparseMatrix<short>::~CPUSparseMatrix();
template CPUSparseMatrix<short> CPUSparseMatrix<short>::ColumnSlice(size_t startColumn, size_t numCols) const;
template CPUMatrix<short> CPUSparseMatrix<short>::CopyColumnSliceToDense(size_t startColumn, size_t numCols) const;
template void CPUSparseMatrix<short>::AssignColumnSliceToDense(CPUMatrix<short>&, size_t startColumn, size_t numCols) const;
template CPUSparseMatrix<short>& CPUSparseMatrix<short>::operator=(const CPUSparseMatrix<short>& deepCopyFrom);
template CPUSparseMatrix<int>::CPUSparseMatrix(const MatrixFormat, const size_t, const size_t, const size_t);
template CPUSparseMatrix<int>::~CPUSparseMatrix();

View file

@ -110,11 +110,11 @@ __device__ __forceinline__ T Shuffle(T input, int srcLane)
{
#ifdef __CUDA_ARCH__
// shfl is supported only on Kepler+
static_assert(__CUDA_ARCH__ >= 300, "CNTK only supports only Kepler GPU architecture or newer");
static_assert(__CUDA_ARCH__ >= 300, "CNTK only supports the Kepler GPU architecture or newer.");
return cub::ShuffleIndex(input, srcLane);
#else
assert(false);
return input;
return input; // keep compiler happy
#endif
}
@ -163,8 +163,12 @@ void Call(size_t vectorSize, Targs... args)
// As a result, each block has 2 * blockDim.x (mean and inverse stddev) values to write at the end.
//
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
__global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, const ElemType* x, double expAvgFactor, ElemType* runMean, ElemType* runInvStdDev,
double epsilon, ElemType* xMean, ElemType* xInvStdDev)
__global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
const ElemType* x, // (in) input data
double expAvgFactor,
ElemType* runMean, ElemType* runInvStdDev, // (in/out) running mean/stddev, gets updated with current minibatch
double epsilon,
ElemType* xMean, ElemType* xInvStdDev) // (out) this minibatch's mean and inverse stddev
{
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
@ -182,9 +186,12 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, con
return;
assert(irowSrcBase + U <= vectorSize);
// --- estimate this minibatch's mean/stddev
// first estimate mean over all data for this thread
int n = 0;
ElemType mean[U];
ElemType m2[U];
ElemType mean[U]; // this thread's part of the mean vector (stored as a normalized mean also during accumulation)
ElemType m2[U]; // likewise for the stddev: the running sum of squared deviations, converted to inverse stddev at the end
#pragma unroll
for (int k = 0; k < U; k++)
{
@ -207,12 +214,13 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, con
ElemType d = curVal[k] - mean[k];
// REVIEW alexeyk: we enabled fast CUDA math in CNTK so division below will be approximate, is this a problem?
// Using precise math slows down the code by about 40%.
mean[k] += d / n;
mean[k] += d / n; // mean_n = [mean_{n-1} * (n-1) + curVal] / n = mean_{n-1} * n/n - mean_{n-1} / n + curVal / n
m2[k] += d * (curVal[k] - mean[k]);
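// (Editorial note: this pair of updates is Welford's online algorithm; after the loop, m2[k] / n is the
//  biased variance estimate for the samples this thread has seen so far.)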
}
psrc += vectorSize * BlockDimY;
}
// now reduce minibatch mean/stddev across threads
const int tid = threadIdx.y * BlockDimX + threadIdx.x;
const int laneId = tid & 0x1f;
// First, reduce within warp using shuffle.
@ -259,6 +267,8 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, con
}
__syncthreads();
// --- final reduction and update of running mean/stddev
// Accumulate and write final results.
// REVIEW alexeyk: see if atomicAdd can be used instead, do perf comparison.
if (threadIdx.y == 0)
@ -283,7 +293,10 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, con
size_t idxDstBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
// Store mean and running mean.
StoreValues<U>(mean, xMean + idxDstBase);
if (expAvgFactor == 1)
// at this point, minibatch mean has been saved into xMean[]
// accumulate running mean
if (expAvgFactor == 1) // 100% comes from current minibatch, nothing from history
StoreValues<U>(mean, runMean + idxDstBase);
else
{
@ -294,6 +307,8 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, con
run[k] = expAvgFactor * mean[k] + (1.0 - expAvgFactor) * run[k];
StoreValues<U>(run, runMean + idxDstBase);
}
// at this point, runMean[] has been updated
// Store inv std dev and its running version.
#pragma unroll
for (int k = 0; k < U; k++)
@ -301,6 +316,8 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, con
m2[k] = Operations::RSqrt(static_cast<ElemType>(m2[k] / batchSize + epsilon));
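// (Editorial note: this converts the accumulated sum of squared deviations into the inverse stddev,
//  1 / sqrt(variance + epsilon), which is what gets stored and also folded into the running estimate below.)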
}
StoreValues<U>(m2, xInvStdDev + idxDstBase);
// at this point, minibatch stddev has been saved into xInvStdDev[]
if (expAvgFactor == 1)
StoreValues<U>(m2, runInvStdDev + idxDstBase);
else
@ -312,6 +329,7 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, con
run[k] = expAvgFactor * m2[k] + (1.0 - expAvgFactor) * run[k];
StoreValues<U>(run, runInvStdDev + idxDstBase);
}
// at this point, runInvStdDev[] has been updated
}
}
@ -467,8 +485,13 @@ template <int U>
struct ComputeBatchMeanAndInvStdDev
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t batchSize, const ElemType* x, double expAvgFactor, ElemType* runMean, ElemType* runInvStdDev,
double epsilon, ElemType* xMean, ElemType* xInvStdDev, cudaStream_t stream)
static void Call(size_t vectorSize, size_t batchSize,
const ElemType* x, // (in) input data
double expAvgFactor,
ElemType* runMean, ElemType* runInvStdDev, // (in/out) running mean/stddev, gets updated with current minibatch
double epsilon,
ElemType* xMean, ElemType* xInvStdDev, // (out) actual interpolated mean/stddev that are used to normalize. Returned since needed in backprop.
cudaStream_t stream)
{
assert((vectorSize % U) == 0);
@ -594,8 +617,11 @@ template <int U>
struct NormalizeBatchTraining
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, bool spatial, const ElemType* x, ElemType* y,
const ElemType* bnScale, const ElemType* bnBias, const ElemType* batchMean, const ElemType* batchInvStdDev, cudaStream_t stream)
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, bool spatial,
const ElemType* x, ElemType* y, // (in, out) data to normalize -> normalized data
const ElemType* bnScale, const ElemType* bnBias, // (in) scale/bias to denormalize with
const ElemType* batchMean, const ElemType* batchInvStdDev, // (in) actual mean/stddev to normalize with
cudaStream_t stream)
{
assert((vectorSize % U) == 0);
@ -839,7 +865,7 @@ struct ComputeSpatialScaleAndBiasGradients
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, const ElemType* x, const ElemType* dy,
ElemType* dScale, ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream)
ElemType* dScale, ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream)
{
assert((spatialSize % U) == 0);
assert((vectorSize % spatialSize) == 0);
@ -854,9 +880,10 @@ struct ComputeSpatialScaleAndBiasGradients
}
};
// mbStatsWeight is the weight with which current MB's stats were used (0 means not at all, locked model).
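// (callers pass mbStatsWeight = 1 - blendFactor; see GPUMatrix<ElemType>::BatchNormalizationBackward() in GPUMatrix.cu)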
template <int BlockDimX, int BlockDimY, bool Spatial, int U, typename ElemType>
__global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize, int batchSize, const ElemType* x, const ElemType* dy, ElemType* dx,
const ElemType* bnScale, const ElemType* dScale, const ElemType* dBias,
const ElemType* bnScale, ElemType mbStatsWeight, const ElemType* dScale, const ElemType* dBias,
const ElemType* saveMean, const ElemType* saveInvStdDev)
{
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
@ -917,18 +944,29 @@ __global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize
LoadValues<U>(pdy, dyCur);
LoadValues<U>(pdx, dxCur);
// From the BN paper, dL/dxi is a sum of three terms: dL/dxi = t1 + t2 + t3
// After simplification, they become the following:
// 1. t1 = scale * dL/dyi * invStdDev
// 2. t2 = (-scale / m) * invStdDev * xHat * dL/dScale
// 3. t3 = (-scale / m) * invStdDev * dL/dBias (for this one note that Sum(xHat) == 0)
// The formulas for dBias and dScale happen to occur as subexpressions in this gradient as well.
// Leveraging this, this gradient can be simplified to:
// t1 = scale * dL/dyi * invStdDev
// t2 = mbStatsWeight * (-scale / m) * invStdDev * xHat * dL/dScale
// t3 = mbStatsWeight * (-scale / m) * invStdDev * dL/dBias (for this one note that Sum(xHat) == 0)
// with
// dBias = Reduce(dy)
// dScale = Reduce(dy * xHat)
// Simplifying this a bit more, we get the formula below.
ElemType val[U];
int m = Spatial ? batchSize * spatialSize : batchSize;
#pragma unroll
for (int k = 0; k < U; k++)
{
ElemType xNorm = (xCur[k] - mean[k]) * invStdDev[k];
val[k] = dxCur[k] + (scale[k] * invStdDev[k]) * (dyCur[k] - (xNorm * ds[k] + db[k]) / m);
ElemType xNorm = (xCur[k] - mean[k]) * invStdDev[k]; // xHat
// scale * invStdDev * (
// dL/dyi
// - mbStatsWeight * (xHat * dL/dScale + dL/dBias) / m
// )
val[k] = dxCur[k] // (adding to gradient)
+ (scale[k] * invStdDev[k]) * (
dyCur[k]
- mbStatsWeight * (xNorm * ds[k] + db[k]) / m);
}
StoreValues<U>(val, pdx);
}
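// For reference, a compact sketch of how the three terms above collapse (same notation as the comment):
//   dBias  = Sum_i dL/dyi
//   dScale = Sum_i xHat_i * dL/dyi
//   dL/dxi += scale * invStdDev * (dL/dyi - mbStatsWeight * (xHat_i * dScale + dBias) / m)
// For mbStatsWeight == 1 this is the textbook BN gradient; for mbStatsWeight == 0 (locked model) the statistics are
// treated as constants and only the first term survives.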
@ -939,25 +977,26 @@ struct BackpropagateBatchNormGradients
{
template <typename ElemType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, bool spatial, const ElemType* x, const ElemType* dy, ElemType* dx,
const ElemType* bnScale, const ElemType* dScale, const ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream)
const ElemType* bnScale, ElemType mbStatsWeight, const ElemType* dScale,
const ElemType* dBias, const ElemType* saveMean, const ElemType* saveInvStdDev, cudaStream_t stream)
{
assert((vectorSize % U) == 0);
const int BlockDimX = 32 / U;
const int BlockDimY = 4 * U;
auto bdim = dim3(BlockDimX, BlockDimY);
auto gdim = dim3(static_cast<unsigned int>(RoundUpToMultiple(vectorSize, BlockDimX * U)),
static_cast<unsigned int>(RoundUpToMultiple(batchSize, BlockDimY)));
static_cast<unsigned int>(RoundUpToMultiple(batchSize, BlockDimY)));
if (spatial)
{
kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, true, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, dScale, dBias, saveMean, saveInvStdDev);
kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, true/*spatial*/, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, mbStatsWeight, dScale, dBias, saveMean, saveInvStdDev);
}
else
{
kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, false, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, dScale, dBias, saveMean, saveInvStdDev);
kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, false/*not spatial*/, U><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, mbStatsWeight, dScale, dBias, saveMean, saveInvStdDev);
}
}
};
} } }
}}}

Просмотреть файл

@ -96,7 +96,8 @@ enum ElementWiseOperator
opCond /*a ? b : c*/,
opClip, /*clip a within interval b..c*/
opElementwiseProductWithLogSumDerivative,
opCopyIfEqual
opCopyIfEqual,
opElementwiseProductWithExpOfDiff, /* a * exp(b - c) */
// Note: not all that's implemented in CNTK ComputationNodes has an opcode yet.
};
@ -157,7 +158,8 @@ enum ElementWiseOperator
Macro(Cond); \
Macro(CopyIfEqual); \
Macro(Clip); \
Macro(ElementwiseProductWithLogSumDerivative);
Macro(ElementwiseProductWithLogSumDerivative); \
Macro(ElementwiseProductWithExpOfDiff);
// -----------------------------------------------------------------------
// various enums to describe

Просмотреть файл

@ -53,32 +53,37 @@ protected:
cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
// cuDNN will fail with BAD_PARAM if epsilon < CUDNN_BN_MIN_EPSILON.
epsilon = max(epsilon, CUDNN_BN_MIN_EPSILON);
// expAvgFactor == 0 && blendFactor == 1 means we are in eval mode.
// expAvgFactor == 0 && blendFactor == 1 means we are in inference mode.
if (expAvgFactor == 0 && blendFactor == 1)
{
saveMean.Resize(0, 0); // (these are not produced in this case)
saveInvStdDev.Resize(0, 0);
CUDNN_CALL(cudnnBatchNormalizationForwardInference(*m_cudnn, mode, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(out),
m_scaleBiasCuDnnT, ptr(scale), ptr(bias), ptr(runMean), ptr(runInvStdDev), epsilon));
m_scaleBiasCuDnnT, ptr(scale), ptr(bias), ptr(runMean), ptr(runInvStdDev), epsilon));
}
else
{
saveMean.Resize(runMean);
saveInvStdDev.Resize(runMean);
CUDNN_CALL(cudnnBatchNormalizationForwardTraining(*m_cudnn, mode, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in),
m_inOutCuDnnT, ptr(out), m_scaleBiasCuDnnT, ptr(scale), ptr(bias), expAvgFactor, ptr(runMean), ptr(runInvStdDev),
epsilon, ptr(saveMean), ptr(saveInvStdDev)));
m_inOutCuDnnT, ptr(out), m_scaleBiasCuDnnT, ptr(scale), ptr(bias), expAvgFactor, ptr(runMean), ptr(runInvStdDev),
epsilon, ptr(saveMean), ptr(saveInvStdDev)));
}
}
void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, const Mat& saveMean, const Mat& saveInvStdDev,
void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev,
Mat& scaleGrad, Mat& biasGrad) override
{
UNUSED(blendFactor); // BUGBUG: It should be used.
m_inOutCuDnnT.UpdateBatchSize(srcGrad.GetNumCols());
cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
// REVIEW alexeyk: remove once Philly is upgraded to prod version. Also change betaParamDiff to 1 and update CNTK BN engine.
#if CUDNN_MAJOR >= 5 || (CUDNN_MAJOR == 4 && CUDNN_PATCHLEVEL >= 7)
CUDNN_CALL(cudnnBatchNormalizationBackward(*m_cudnn, mode, &C::One, &C::One, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(srcGrad), m_inOutCuDnnT, ptr(grad),
m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev)));
m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev)));
#else
CUDNN_CALL(cudnnBatchNormalizationBackward(*m_cudnn, mode, &C::One, &C::One, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(srcGrad), m_inOutCuDnnT, ptr(grad),
m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev)));
CUDNN_CALL(cudnnBatchNormalizationBackward(*m_cudnn, mode, &C::One, &C::One, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(srcGrad), m_inOutCuDnnT, ptr(grad),
m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), CUDNN_BN_MIN_EPSILON, ptr(saveMean), ptr(saveInvStdDev)));
#endif
}

Просмотреть файл

@ -116,6 +116,44 @@ const char* CudaErrString<curandStatus>(curandStatus)
namespace Microsoft { namespace MSR { namespace CNTK {
/*static*/ bool SyncGuard::s_isSyncEnabled = false;
/*static*/ void SyncGuard::EnableSync()
{
s_isSyncEnabled = true;
}
SyncGuard::SyncGuard(bool forceSync /*= false*/)
: m_forceSync(forceSync)
{
m_done = nullptr;
if (m_forceSync || s_isSyncEnabled)
{
CUDA_CALL(cudaGetLastError());
CUDA_CALL(cudaEventCreate(&m_done));
}
}
SyncGuard::~SyncGuard()
{
if (m_forceSync || s_isSyncEnabled)
{
// The regular use of this destructor is to synchronize the GPU, but also
// to check for errors. So this destructor is where CUDA errors would be thrown.
// If this destructor runs during stack unwinding, then a different error has
// already happened that should be reported; so we only clean up the resource.
if (std::uncaught_exception())
cudaEventDestroy(m_done);
else
{
// failures in a prior launch might be reported here
CUDA_CALL(cudaEventRecord(m_done));
CUDA_CALL(cudaEventSynchronize(m_done));
CUDA_CALL(cudaEventDestroy(m_done));
}
}
}
template <typename AllocatedElemType>
AllocatedElemType* TracingGPUMemoryAllocator::Allocate(int deviceId, size_t numRows, size_t numCols)
{
@ -1911,7 +1949,8 @@ void GPUMatrix<ElemType>::AssignNoiseContrastiveEstimation(const GPUMatrix<ElemT
while (p / 2 > width)
p = p / 2;
_computeNceOutput<ElemType><<<GetNumElements() / 2, p>>>(
// note: kernel has hard-coded dimension of 512
_computeNceOutputMax512Threads<ElemType><<<GetNumElements() / 2, p>>>(
Data(),
sampleCount,
m_numRows / 2,
@ -1925,7 +1964,8 @@ void GPUMatrix<ElemType>::AssignNoiseContrastiveEstimation(const GPUMatrix<ElemT
while (p / 2 > GetNumElements() / 2)
p = p / 2;
// summing up objective must be done in one block
_assignNoiseContrastiveEstimation<ElemType><<<1, p>>>(
// note: kernel has hard-coded dimension of 512
_assignNoiseContrastiveEstimationMax512Threads<ElemType><<<1, p>>>(
Data(),
sampleCount,
m_numRows / 2,
@ -1970,7 +2010,8 @@ void GPUMatrix<ElemType>::AssignSoftmaxSum(const GPUMatrix<ElemType>& a, GPUMatr
while (p / 2 > width)
p = p / 2;
_assignSoftmaxSum<ElemType><<<1, p>>>(
// note: kernel has hard-coded dimension of 512
_assignSoftmaxSumMax512Threads<ElemType><<<1, p>>>(
my_a.Data(),
width,
Data(),
@ -2046,7 +2087,8 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignLogSoftmaxOf(const GPUMatrix<Ele
CUDA_LONG N = (CUDA_LONG) GetNumCols();
CUDA_LONG M = (CUDA_LONG) GetNumRows();
SyncGuard syncGuard;
_assignColumnwiseLogSoftmaxOf<<<N, 512, 0, t_stream>>>(a.Data(), Data(), N, M);
// note: kernel uses hard-coded thread dimension
_assignColumnwiseLogSoftmaxOf512Threads<<<N, 512, 0, t_stream>>>(a.Data(), Data(), N, M);
}
else
{
@ -2072,7 +2114,8 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignHardmaxOf(const GPUMatrix<ElemTy
CUDA_LONG N = (CUDA_LONG) GetNumCols();
CUDA_LONG M = (CUDA_LONG) GetNumRows();
SyncGuard syncGuard;
_assignColumnwiseHardmaxOf<<<N, 512, 0, t_stream>>>(a.Data(), Data(), N, M);
// note: kernel uses hard-coded thread dimension
_assignColumnwiseHardmaxOf512Threads<<<N, 512, 0, t_stream>>>(a.Data(), Data(), N, M);
}
else
{
@ -2224,7 +2267,8 @@ ElemType GPUMatrix<ElemType>::SumOfElements() const
ElemType h_sum;
// WARNING: THIS kernel is not the most efficient way!
_reductionSum<ElemType><<<1, 1024, 0, t_stream>>>(Data(), d_sum, (CUDA_LONG) GetNumElements());
// note: kernel has hard-coded dimension of 1024
_reductionSum1024Threads<ElemType><<<1, 1024, 0, t_stream>>>(Data(), d_sum, (CUDA_LONG)GetNumElements());
CUDA_CALL(cudaMemcpy(&h_sum, d_sum, sizeof(ElemType), cudaMemcpyDeviceToHost));
TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), d_sum);
return h_sum;
@ -2241,7 +2285,8 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignSumOfElements(const GPUMatrix<El
PrepareDevice();
SyncGuard syncGuard;
// WARNING: THIS kernel is not the most efficient way!
_reductionSumAndAssign<ElemType><<<1, 1024>>>(Data(), a.Data(), (CUDA_LONG) a.GetNumElements(), (CUDA_LONG) GetNumElements());
// note: kernel has hard-coded dimension of 1024
_reductionSumAndAssign1024Threads<ElemType><<<1, 1024>>>(Data(), a.Data(), (CUDA_LONG)a.GetNumElements(), (CUDA_LONG)GetNumElements());
return (*this);
}
@ -2253,7 +2298,8 @@ DeviceBoundNumber<ElemType> GPUMatrix<ElemType>::Sum_AsDeviceBoundNum() const
ElemType* d_sum = TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), 1);
// WARNING: THIS kernel is not the most efficient way!
_reductionSum<ElemType><<<1, 1024, 0, t_stream>>>(Data(), d_sum, (CUDA_LONG) GetNumElements());
// note: kernel has hard-coded dimension of 1024
_reductionSum1024Threads<ElemType><<<1, 1024, 0, t_stream>>>(Data(), d_sum, (CUDA_LONG)GetNumElements());
DeviceBoundNumber<ElemType> result;
result.ShallowCopyFrom(d_sum, GetComputeDeviceId());
return result;
@ -2555,7 +2601,8 @@ ElemType GPUMatrix<ElemType>::FrobeniusNorm() const
ElemType h_sum = 0;
// WARNING: THIS kernel is not the most efficient way!
_reductionSum2<ElemType><<<1, 1024, 0, t_stream>>>(Data(), d_sum, (CUDA_LONG) GetNumElements(), true);
// note: kernel has hard-coded dimension of 1024
_reductionSum21024Threads<ElemType><<<1, 1024, 0, t_stream>>>(Data(), d_sum, (CUDA_LONG)GetNumElements(), true);
CUDA_CALL(cudaMemcpy(&h_sum, d_sum, sizeof(ElemType), cudaMemcpyDeviceToHost));
TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), d_sum);
@ -2572,7 +2619,8 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignFrobeniusNormOf(const GPUMatrix<
PrepareDevice();
// WARNING: THIS kernel is not the most efficient way!
_reductionSum2<ElemType><<<1, 1024, 0, t_stream>>>(a.Data(), Data(), (CUDA_LONG) a.GetNumElements(), true);
// note: kernel has hard-coded dimension of 1024
_reductionSum21024Threads<ElemType><<<1, 1024, 0, t_stream>>>(a.Data(), Data(), (CUDA_LONG)a.GetNumElements(), true);
return *this;
}
@ -2581,13 +2629,14 @@ template <class ElemType>
ElemType GPUMatrix<ElemType>::MatrixNormInf() const
{
if (IsEmpty())
LogicError("MatrixNorm1: Matrix is empty.");
LogicError("MatrixNormInf: Matrix is empty.");
ElemType* d_maxAbs = TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), 1);
ElemType h_maxAbs = 0;
// WARNING: THIS kernel is not the most efficient way!
_reductionMatrixNormInf<ElemType><<<1, 1024, 0, t_stream>>>(Data(), d_maxAbs, (CUDA_LONG) GetNumElements());
// note: kernel has hard-coded dimension of 1024
_reductionMatrixNormInf1024Threads<ElemType><<<1, 1024, 0, t_stream>>>(Data(), d_maxAbs, (CUDA_LONG)GetNumElements());
CUDA_CALL(cudaMemcpy(&h_maxAbs, d_maxAbs, sizeof(ElemType), cudaMemcpyDeviceToHost));
TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), d_maxAbs);
return h_maxAbs;
@ -2610,7 +2659,8 @@ ElemType GPUMatrix<ElemType>::MatrixNorm0() const
ElemType* d_nz = TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), 1);
ElemType h_nz = 0;
// WARNING: THIS kernel is not the most efficient way!
_reductionMatrixNorm0<ElemType><<<1, 1024, 0, t_stream>>>(Data(), d_nz, (CUDA_LONG) GetNumElements());
// note: kernel has hard-coded dimension of 1024
_reductionMatrixNorm01024Threads<ElemType><<<1, 1024, 0, t_stream>>>(Data(), d_nz, (CUDA_LONG)GetNumElements());
CUDA_CALL(cudaMemcpy(&h_nz, d_nz, sizeof(ElemType), cudaMemcpyDeviceToHost));
TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), d_nz);
return h_nz;
@ -2667,7 +2717,8 @@ void GPUMatrix<ElemType>::VectorMax(GPUMatrix<ElemType>& maxIndexes, GPUMatrix<E
maxIndexes.RequireSize(1, n);
int blocksPerGrid = n; // we'll have 1 block processing 1 column
_vectorMaxMinReduce<ElemType, true><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(us.Data(), maxIndexes.Data(), maxValues.Data(), m, n);
// note: kernel has hard-coded dimension of 512
_vectorMaxMinReduce512Threads<ElemType, true><<<blocksPerGrid, 512, 0, t_stream>>>(us.Data(), maxIndexes.Data(), maxValues.Data(), m, n);
/*int blocksPerGrid=(int)ceil(1.0*n/GridDim::maxThreadsPerBlock);
_vectorMax<ElemType><<<blocksPerGrid,GridDim::maxThreadsPerBlock,0,t_stream>>>(us.Data(),maxIndexes.Data(),maxValues.Data(),m,n,isColWise);*/
@ -2793,7 +2844,8 @@ void GPUMatrix<ElemType>::VectorMin(GPUMatrix<ElemType>& minIndexes, GPUMatrix<E
minIndexes.RequireSize(1, n);
int blocksPerGrid = n; // we'll have 1 block processing 1 column
_vectorMaxMinReduce<ElemType, false><<<blocksPerGrid, GridDim::maxThreadsPerBlock, 0, t_stream>>>(us.Data(), minIndexes.Data(), minValues.Data(), m, n);
// note: kernel has hard-coded dimension of 512
_vectorMaxMinReduce512Threads<ElemType, false><<<blocksPerGrid, 512, 0, t_stream>>>(us.Data(), minIndexes.Data(), minValues.Data(), m, n);
/*
int blocksPerGrid=(int)ceil(1.0*n/GridDim::maxThreadsPerBlock);
@ -2823,8 +2875,9 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignNumOfDiff(const GPUMatrix<ElemTy
if (!searchInCol)
{
// int blocksPerGrid=(int)ceil(1.0*a.GetNumElements()/GridDim::maxThreadsPerBlock);
// _assignNumOfDiff<ElemType><<<blocksPerGrid,GridDim::maxThreadsPerBlock,0,t_stream>>>(a.Data(), b.Data(), Data(), a.GetNumElements());
_assignNumOfDiff<ElemType><<<1, 1024, 0, t_stream>>>(a.Data(), b.Data(), Data(), (CUDA_LONG) a.GetNumElements());
// _assignNumOfDiff1024Threads<ElemType><<<blocksPerGrid,GridDim::maxThreadsPerBlock,0,t_stream>>>(a.Data(), b.Data(), Data(), a.GetNumElements());
// note: kernel has hard-coded dimension of 1024
_assignNumOfDiff1024Threads<ElemType><<<1, 1024, 0, t_stream>>>(a.Data(), b.Data(), Data(), (CUDA_LONG)a.GetNumElements());
}
else
{
@ -3107,6 +3160,7 @@ void GPUMatrix<ElemType>::AveragePoolingBackward(const GPUMatrix<int>& mpRowCol,
Data(), (int)GetNumRows(), grad.Data(), (int)grad.GetNumRows());
}
// returns saveMean/saveInvStdDev which are the actual values used to perform the normalization, except for blendFactor 1, in which case they are unused and set to empty
template <class ElemType>
void GPUMatrix<ElemType>::BatchNormalizationForward(const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor,
GPUMatrix<ElemType>& runMean, GPUMatrix<ElemType>& runInvStdDev, GPUMatrix<ElemType>& out, double epsilon,
@ -3122,10 +3176,13 @@ void GPUMatrix<ElemType>::BatchNormalizationForward(const GPUMatrix<ElemType>& s
assert(0 < vectorSize && vectorSize <= std::numeric_limits<int>::max());
assert(0 < batchSize && batchSize <= std::numeric_limits<int>::max());
// --- compute data mean/stddev (into saveMean/saveInvStdDev) and update running mean/stddev
SyncGuard syncGuard;
// If expAvgFactor == 0 && blendFactor == 1 then we don't need to compute current minibatch statistics.
if (expAvgFactor > 0 || blendFactor < 1)
{
saveMean.RequireSize(runMean);
saveInvStdDev.RequireSize(runMean);
if (spatial)
{
Call<ComputeSpatialBatchMeanAndInvStdDev, ElemType>(spatialSize, vectorSize, spatialSize, batchSize, Data(),
@ -3139,35 +3196,50 @@ void GPUMatrix<ElemType>::BatchNormalizationForward(const GPUMatrix<ElemType>& s
saveMean.Data(), saveInvStdDev.Data(), GetStream());
}
}
else // not computing new statistics
{
saveMean.RequireSize(0, 0);
saveInvStdDev.RequireSize(0, 0);
}
// --- apply MAP estimates of mean/stddev (interpolation of data and running mean/stddev) to data
// When:
// blendFactor == 1 - use running mean/var instead of the current minibatch mean/var.
// blendFactor == 1 - use running mean/var instead of the current minibatch mean/var. Note: saveMean/saveInvStdDev are NOT produced.
// 0 < blendFactor < 1 - blend running mean/var with mean/var of the current minibatch: saveMean = (1 - blendFactor) * saveMean + blendFactor * runMean
// blendFactor == 0 - use mean/var of the current minibatch.
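// e.g. blendFactor = 0.25: saveMean <- 0.75 * minibatch mean + 0.25 * runMean (and likewise for saveInvStdDev)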
if (blendFactor < 1)
{
// non-zero blendFactor: interpolate minibatch mean/stddev in-place with running mean/stddev
if (blendFactor > 0)
{
// REVIEW alexeyk: can be rolled into NormalizeBatchTraining to save bandwidth.
// TODO: add a 'beta' parameter to ScaleAndAdd()
Scale((ElemType)(1 - blendFactor), saveMean);
ScaleAndAdd((ElemType)blendFactor, runMean, saveMean);
ScaleAndAdd((ElemType)blendFactor, /*in*/ runMean, /*in/out*/ saveMean);
Scale((ElemType)(1 - blendFactor), saveInvStdDev);
ScaleAndAdd((ElemType)blendFactor, runInvStdDev, saveInvStdDev);
}
Call<NormalizeBatchTraining, ElemType>(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize,
spatial, Data(), out.Data(), scale.Data(), bias.Data(),
saveMean.Data(), saveInvStdDev.Data(), GetStream());
// normalize
Call<NormalizeBatchTraining, ElemType>(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize, spatial,
Data(), out.Data(), // (in, out) data to be normalized -> normalized data
scale.Data(), bias.Data(), // (in) scale/bias to denormalize with
/*(in)*/saveMean.Data(), saveInvStdDev.Data(), // (in) actual mean/stddev to normalize with
GetStream());
}
else
else // blendFactor == 1: use running mean/stddev only
{
Call<NormalizeBatchTraining, ElemType>(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize,
spatial, Data(), out.Data(), scale.Data(), bias.Data(),
Call<NormalizeBatchTraining, ElemType>(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize, spatial,
Data(), out.Data(),
scale.Data(), bias.Data(),
runMean.Data(), runInvStdDev.Data(), GetStream());
// CNTK engine returns saveMean and saveInvStdDev empty, but cuDNN engine does not.
}
}
// saveMean/saveInvStdDev are the interpolated mean/stddev as used in ForwardProp().
// For blendFactor=1, they are not used and can be uninitialized or empty.
template <class ElemType>
void GPUMatrix<ElemType>::BatchNormalizationBackward(const GPUMatrix<ElemType>& in, GPUMatrix<ElemType>& grad, const GPUMatrix<ElemType>& scale,
void GPUMatrix<ElemType>::BatchNormalizationBackward(const GPUMatrix<ElemType>& in, GPUMatrix<ElemType>& grad, const GPUMatrix<ElemType>& scale, double blendFactor,
const GPUMatrix<ElemType>& saveMean, const GPUMatrix<ElemType>& saveInvStdDev,
GPUMatrix<ElemType>& scaleGrad, GPUMatrix<ElemType>& biasGrad) const
{
@ -3192,8 +3264,9 @@ void GPUMatrix<ElemType>::BatchNormalizationBackward(const GPUMatrix<ElemType>&
Call<ComputeScaleAndBiasGradients, ElemType>(vectorSize, vectorSize, batchSize, in.Data(), Data(), scaleGrad.Data(), biasGrad.Data(),
saveMean.Data(), saveInvStdDev.Data(), GetStream());
}
ElemType mbStatsWeight = (ElemType)(1 - blendFactor); // weight for contribution from actual MB stats (0 if none, e.g. locked BN node)
Call<BackpropagateBatchNormGradients, ElemType>(spatial ? spatialSize : vectorSize, vectorSize, spatialSize, batchSize, spatial,
in.Data(), Data(), grad.Data(), scale.Data(), scaleGrad.Data(), biasGrad.Data(), saveMean.Data(), saveInvStdDev.Data(), GetStream());
in.Data(), Data(), grad.Data(), scale.Data(), mbStatsWeight, scaleGrad.Data(), biasGrad.Data(), saveMean.Data(), saveInvStdDev.Data(), GetStream());
}
#pragma region Static BLAS Functions
@ -3990,7 +4063,8 @@ ElemType GPUMatrix<ElemType>::GetLearnRateForBlock_Helper(const GPUMatrix<ElemTy
}
// d_res[0] should now contain inner product of matrices
// Compute squared Frobenius norms (squared sums of elements)
_lrHelper<ElemType><<<1, 512, 0, t_stream>>>(Gradients.Data(), SmoothedGradients.Data(), (CUDA_LONG) Gradients.GetNumElements(), d_res);
// note: kernel has hard-coded dimension of 512
_lrHelper512Threads<ElemType><<<1, 512, 0, t_stream>>>(Gradients.Data(), SmoothedGradients.Data(), (CUDA_LONG)Gradients.GetNumElements(), d_res);
ElemType res;
CUDA_CALL(cudaMemcpy(&res, d_res, sizeof(ElemType), cudaMemcpyDeviceToHost));
TracingGPUMemoryAllocator::Free<ElemType>(Gradients.GetComputeDeviceId(), d_res);
@ -4214,16 +4288,21 @@ void GPUMatrix<ElemType>::RCRFBackwardCompute(
ElemType* d_zeta = TracingGPUMemoryAllocator::Allocate<ElemType>(alpha.GetComputeDeviceId(), iNumLab);
CUDA_LONG N = iNumLab;
int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock);
// TODO: change all three '512' to 'GridDim::maxThreadsPerBlock' (not doing this now since I cannot test it)
int blocksPerGrid = (int) ceil(1.0 * N / 512);
size_t szMemSize;
for (int t = iNumPos - 1; t >= 0; t--)
{
szMemSize = sizeof(ElemType) * iNumLab;
_rcrfBackwardComputeZeta<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock, szMemSize>>>(t, iNumPos, alpha.Data(), d_zeta, pair_scores.Data(), iNumLab, shift);
// This function assumes iNumLab <= 1024 and that shared memory == total (!) number of threads == iNumLab.
assert(iNumLab <= 1024);
_rcrfBackwardComputeZetaMax1024Labels<ElemType><<<blocksPerGrid, 512, szMemSize>>>(t, iNumPos, alpha.Data(), d_zeta, pair_scores.Data(), iNumLab, shift);
szMemSize = iNumLab * 3;
szMemSize *= sizeof(ElemType);
_rcrfBackwardCompute<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock, szMemSize>>>(t, iNumPos, alpha.Data(), beta.Data(),
d_zeta, pair_scores.Data(), iNumLab, shift);
// This function assumes iNumLab <= 1024 and that shared memory == total (!) number of threads == 3 * iNumLab.
assert(iNumLab <= 1024);
_rcrfBackwardComputeMax1024Labels<ElemType><<<blocksPerGrid, 512, szMemSize>>>(t, iNumPos, alpha.Data(), beta.Data(),
d_zeta, pair_scores.Data(), iNumLab, shift);
}
/*
error = cudaGetErrorString(cudaPeekAtLastError());
@ -4255,16 +4334,22 @@ void GPUMatrix<ElemType>::RCRFTransGrdCompute(const GPUMatrix<ElemType>& lbls,
ElemType* d_zeta = TracingGPUMemoryAllocator::Allocate<ElemType>(alpha.GetComputeDeviceId(), iNumLab);
CUDA_LONG N = iNumLab;
int blocksPerGrid = (int) ceil(1.0 * N / GridDim::maxThreadsPerBlock);
// TODO: change all three '512' to 'GridDim::maxThreadsPerBlock' (not doing this now since I cannot test it)
int blocksPerGrid = (int)ceil(1.0 * N / 512);
size_t szMemSize;
for (int t = 0; t < iNumPos; t++)
{
szMemSize = sizeof(ElemType) * iNumLab;
_rcrfTransGrdComputeZeta<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock, szMemSize>>>(t - 1, iNumPos, alpha.Data(), d_zeta, pair_scores.Data(), iNumLab, startLbl, shift);
// This function assumes iNumLab <= 1024 and that shared memory == total (!) number of threads == iNumLab.
assert(iNumLab <= 1024);
// BUGBUG: This is launched with 512 threads per block, but allocates shared mem as if there is only one block. Likewise for all 4 of these functions.
_rcrfTransGrdComputeZetaMax1024Labels<ElemType><<<blocksPerGrid, 512, szMemSize>>>(t - 1, iNumPos, alpha.Data(), d_zeta, pair_scores.Data(), iNumLab, startLbl, shift);
szMemSize = iNumLab * 3;
szMemSize *= sizeof(ElemType);
_rcrfTransGrdCompute<ElemType><<<blocksPerGrid, GridDim::maxThreadsPerBlock, szMemSize>>>(t, startLbl, alpha.Data(), beta.Data(),
d_zeta, pair_scores.Data(), lbls.Data(), grd.Data(), iNumPos, iNumLab, shift);
// This function assumes iNumLab <= 1024 and that shared memory == total (!) number of threads == iNumLab.
assert(iNumLab <= 1024);
_rcrfTransGrdComputeMax1024Labels<ElemType><<<blocksPerGrid, 512, szMemSize>>>(t, startLbl, alpha.Data(), beta.Data(),
d_zeta, pair_scores.Data(), lbls.Data(), grd.Data(), iNumPos, iNumLab, shift);
}
TracingGPUMemoryAllocator::Free<ElemType>(alpha.GetComputeDeviceId(), d_zeta);
};
@ -4278,11 +4363,16 @@ void GPUMatrix<ElemType>::RCRFTransGrdCompute(const GPUMatrix<ElemType>& lbls,
template <class ElemType>
static shared_ptr<GPUMatrix<ElemType>> GetOnesVector(size_t N, DEVICEID_TYPE deviceId)
{
// using an array of shared_ptrs because those are thread-safe. The objects themselves are immutable.
// And using a plain array so this will never get freed, avoiding free-after-DLL-unload issues.
static shared_ptr<GPUMatrix<ElemType>> onesCache[32]; // cache of objects
if (deviceId >= _countof(onesCache))
LogicError("GetOnesVector: onesCache[] too small (%d entries), increase (you need %d) and recompile.", (int) _countof(onesCache), (int) deviceId + 1);
// using a dynamically allocated array so this will never get freed, avoiding free-after-DLL-unload issues.
// and using shared_ptrs since we don't want to leak more than CacheSize elements
// (with a plain static array we would have to manage the objects' lifetime ourselves, and its destructors would run for every element at DLL unload)
const int CacheSize = 32;
static shared_ptr<GPUMatrix<ElemType>> * onesCache = new shared_ptr<GPUMatrix<ElemType>>[CacheSize]; // cache of objects
if (deviceId >= CacheSize)
LogicError("GetOnesVector: onesCache[] too small (%d entries), increase (you need %d) and recompile.", CacheSize, (int)deviceId + 1);
auto p = onesCache[deviceId];
if (!p || p->GetNumRows() < N) // must (re-)allocate
{
@ -4300,8 +4390,11 @@ void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
{
if (reductionOp != ElementWiseOperator::opSum) // TODO: enable the reduction ops
InvalidArgument("TensorOp: Unary reduction operations other than opSum not yet implemented.");
if (reductionOp != ElementWiseOperator::opSum &&
reductionOp != ElementWiseOperator::opLogSum &&
reductionOp != ElementWiseOperator::opMin &&
reductionOp != ElementWiseOperator::opMax)
InvalidArgument("TensorOp: Unary reduction operations other than opMax, opMin, opSum, and opLogSum are not implemented.");
a.PrepareDevice();
if (a.GetComputeDeviceId() != GetComputeDeviceId())
@ -4322,10 +4415,11 @@ void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a,
return LaunchUnaryTensorOp<ElemType>(beta, a.Data()+ offsets[0], Data()+ offsets[1], alpha, op, regularOpDims[0]);
}
// special case: reducing a matrix onto a column vector; can be done with SGEMM
// special case: sum-reducing a matrix onto a column vector; can be done with SGEMM
// Note: A minor risk is that with this, our own reduction function will rarely be used.
// That function was tested to give the same results with 'double', and nearly the same with 'float' (different summation order matters).
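// (conceptually: this = beta * this + alpha * a * ones, with the ones vector cached by GetOnesVector() above)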
else if (op == ElementWiseOperator::opCopy && // we are just adding to target without any further operation
reductionOp == ElementWiseOperator::opSum &&
#ifdef _DEBUG
sizeof(ElemType) == sizeof(float) && // in debug don't shortcut 'double' so we have some test of our own codepath
#endif
@ -4348,7 +4442,7 @@ void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a,
// regular case
else
return TensorOpN<ElemType, 2>(beta, array<ElemType*, 2>{a.Data(), Data()}, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpN<ElemType, 2>(beta, array<ElemType*, 2>{a.Data(), Data()}, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
// perform binary operation 'op' on a and b giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
@ -4365,7 +4459,7 @@ void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a,
if (a.GetComputeDeviceId() != GetComputeDeviceId() || b.GetComputeDeviceId() != GetComputeDeviceId())
InvalidArgument("All matrices must be on the same GPU");
return TensorOpN<ElemType, 3>(beta, array<ElemType*, 3>{a.Data(), b.Data(), Data()}, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpN<ElemType, 3>(beta, array<ElemType*, 3>{a.Data(), b.Data(), Data()}, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
// perform ternary operation 'op' on a, and c giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
@ -4381,7 +4475,7 @@ void GPUMatrix<ElemType>::TensorOp(ElemType beta, const GPUMatrix<ElemType>& a,
a.PrepareDevice();
if (a.GetComputeDeviceId() != GetComputeDeviceId() || b.GetComputeDeviceId() != GetComputeDeviceId() || c.GetComputeDeviceId() != GetComputeDeviceId())
InvalidArgument("All matrices must be on the same GPU");
return TensorOpN<ElemType, 4>(beta, array<ElemType*, 4>{a.Data(), b.Data(), c.Data(), Data()}, alpha, op, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpN<ElemType, 4>(beta, array<ElemType*, 4>{a.Data(), b.Data(), c.Data(), Data()}, alpha, op, reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
// =======================================================================
@ -4420,24 +4514,50 @@ template void GPUMatrix<char>::SetValue(const size_t numRows, const size_t numCo
template void GPUMatrix<char>::SetValue(GPUMatrix<char> const&);
//template void GPUMatrix<char>::SetValue(CPUSparseMatrix<char> const&);
//template void GPUMatrix<char>::SetValue(GPUSparseMatrix<char> const&);
template void GPUMatrix<char>::CopySection(size_t numRows, size_t numCols, char* dst, size_t colStride) const;
template void GPUMatrix<char>::Reshape(const size_t, const size_t);
template GPUMatrix<char>& GPUMatrix<char>::operator*=(char);
template DEVICEID_TYPE GPUMatrix<char>::PrepareDevice(DEVICEID_TYPE deviceId) const;
// Support <short>
template GPUMatrix<short>::GPUMatrix(const size_t numRows, const size_t numCols, int deviceId);
template GPUMatrix<short>::GPUMatrix(const size_t numRows, const size_t numCols, int deviceId, short* pArray, const size_t matrixFlags);
template GPUMatrix<short>::GPUMatrix(const GPUMatrix<short>&);
template GPUMatrix<short>::GPUMatrix(GPUMatrix<short>&&);
template short* GPUMatrix<short>::CopyToArray() const;
template void GPUMatrix<short>::ChangeDeviceTo(int);
template void GPUMatrix<short>::Resize(size_t, size_t, bool);
template void GPUMatrix<short>::RequireSize(size_t, size_t, bool);
template GPUMatrix<short>::~GPUMatrix();
template GPUMatrix<short> GPUMatrix<short>::ColumnSlice(size_t startColumn, size_t numCols) const;
template GPUMatrix<short>& GPUMatrix<short>::operator=(GPUMatrix<short>&&);
template GPUMatrix<short>::GPUMatrix(int);
template void GPUMatrix<short>::SetValue(const short);
template void GPUMatrix<short>::SetValue(const size_t numRows, const size_t numCols, int deviceId, short* pArray, size_t matrixFlags);
//template void GPUMatrix<short>::SetValue(CPUMatrix<short> const&);
template void GPUMatrix<short>::SetValue(GPUMatrix<short> const&);
//template void GPUMatrix<short>::SetValue(CPUSparseMatrix<short> const&);
//template void GPUMatrix<short>::SetValue(GPUSparseMatrix<short> const&);
template void GPUMatrix<short>::CopySection(size_t numRows, size_t numCols, short* dst, size_t colStride) const;
template void GPUMatrix<short>::Reshape(const size_t, const size_t);
template GPUMatrix<short>& GPUMatrix<short>::operator*=(short);
template DEVICEID_TYPE GPUMatrix<short>::PrepareDevice(DEVICEID_TYPE deviceId) const;
template GPUMatrix<int>::GPUMatrix(const size_t, const size_t, int, int*, const size_t);
template GPUMatrix<int>::~GPUMatrix();
template int* TracingGPUMemoryAllocator::Allocate<int>(int, size_t);
template size_t* TracingGPUMemoryAllocator::Allocate<size_t>(int, size_t);
template long* TracingGPUMemoryAllocator::Allocate<long>(int, size_t);
template short* TracingGPUMemoryAllocator::Allocate<short>(int, size_t);
template char* TracingGPUMemoryAllocator::Allocate<char>(int, size_t);
template float* TracingGPUMemoryAllocator::Allocate<float>(int, size_t);
template double* TracingGPUMemoryAllocator::Allocate<double>(int, size_t);
template void TracingGPUMemoryAllocator::Free<int>(int, int*, bool);
template void TracingGPUMemoryAllocator::Free<size_t>(int, size_t*, bool);
template void TracingGPUMemoryAllocator::Free<short>(int, short*, bool);
template void TracingGPUMemoryAllocator::Free<char>(int, char*, bool);
template void TracingGPUMemoryAllocator::Free<float>(int, float*, bool);
template void TracingGPUMemoryAllocator::Free<double>(int, double*, bool);

Просмотреть файл

@ -61,6 +61,27 @@ cudaStream_t MATH_API GetStream();
namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
// SyncGuard -- synchronize around CUDA calls
// -----------------------------------------------------------------------
class SyncGuard
{
private:
static bool s_isSyncEnabled;
bool m_forceSync;
#ifndef CPUONLY
cudaEvent_t m_done;
#endif
public:
static MATH_API void EnableSync();
SyncGuard(bool forceSync = false);
~SyncGuard();
};
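// Typical usage at a kernel launch site (minimal sketch; kernel name and arguments are placeholders):
//   {
//       SyncGuard syncGuard;  // no-op unless sync was enabled via EnableSync() or forceSync == true
//       someKernel<<<grid, block, 0, t_stream>>>(args...);
//   }  // on scope exit the guard records an event, synchronizes on it, and surfaces any pending CUDA error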
// -----------------------------------------------------------------------
// DeviceBoundNumber -- This class represents a number which resides on a particular device. Use it to avoid unnecessary transfers between CPU and GPU
// -----------------------------------------------------------------------
@ -207,18 +228,14 @@ public:
// multiple views, RequireSize will first check to see if Resize is required. If it is not, then it short-circuits and is a noop. Otherwise, RequireSize
// will call Resize, which may fail if the matrix has multiple views.
void RequireSize(const size_t numRows, const size_t numCols, bool growOnly = true); // by default we only reallocate if need to grow
void RequireSize(const GPUMatrix<ElemType>& like, bool growOnly = true) { RequireSize(like.GetNumRows(), like.GetNumCols(), growOnly); }
// Resize first checks to ensure that the caller has the authority to call Resize (i.e., it checks to ensure the underlying data is owned by only this matrix), and then
// actually resizes the underlying matrix, doing any allocation as required.
void Resize(const size_t numRows, const size_t numCols, bool growOnly = true); // by default we only reallocate if need to grow
ElemType& operator()(const size_t /*row*/, const size_t /*col*/)
{
LogicError("GPUMatrix doesn't support this");
}
const ElemType& operator()(const size_t /*row*/, const size_t /*col*/) const
{
LogicError("GPUMatrix doesn't support this");
}
ElemType& operator()(const size_t /*row*/, const size_t /*col*/) { LogicError("GPUMatrix doesn't support operator(,) on the CPU."); }
const ElemType& operator()(const size_t /*row*/, const size_t /*col*/) const { LogicError("GPUMatrix doesn't support operator(,) on the CPU."); }
ElemType Get00Element() const;
void SetValue(const ElemType v);
@ -453,7 +470,8 @@ public:
void BatchNormalizationForward(const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& bias, double expAvgFactor, double blendFactor,
GPUMatrix<ElemType>& runMean, GPUMatrix<ElemType>& runInvStdDev, GPUMatrix<ElemType>& out, double epsilon,
GPUMatrix<ElemType>& saveMean, GPUMatrix<ElemType>& saveInvStdDev) const;
void BatchNormalizationBackward(const GPUMatrix<ElemType>& in, GPUMatrix<ElemType>& grad, const GPUMatrix<ElemType>& scale, const GPUMatrix<ElemType>& saveMean, const GPUMatrix<ElemType>& saveInvStdDev,
void BatchNormalizationBackward(const GPUMatrix<ElemType>& in, GPUMatrix<ElemType>& grad, const GPUMatrix<ElemType>& scale, double blendFactor,
const GPUMatrix<ElemType>& saveMean, const GPUMatrix<ElemType>& saveInvStdDev,
GPUMatrix<ElemType>& scaleGrad, GPUMatrix<ElemType>& biasGrad) const;
public:
@ -623,51 +641,4 @@ static void CudaCall(ERRTYPE retCode, const char* exprString, const char* libNam
#define CURAND_CALL(expr) (CudaCall((expr), #expr, "CURAND", CURAND_STATUS_SUCCESS))
#define CUDNN_CALL(expr) (CudaCall((expr), #expr, "cuDNN", CUDNN_STATUS_SUCCESS))
// -----------------------------------------------------------------------
// SyncGuard -- synchronize around CUDA calls
// -----------------------------------------------------------------------
class SyncGuard
{
static bool DoSync()
{
#ifdef NO_SYNC // this strange way of writing it allows modifying this variable at runtime in the debugger
static bool do_sync = false;
#else
static bool do_sync = true;
#endif
return do_sync;
}
cudaEvent_t m_done;
public:
SyncGuard()
{
m_done = nullptr;
if (DoSync())
{
CUDA_CALL(cudaGetLastError());
CUDA_CALL(cudaEventCreate(&m_done));
}
}
~SyncGuard()
{
if (DoSync())
{
// The regular use of this destructor is to synchronize the GPU, but also
// to check for errors. So this destructor is where CUDA errors would be thrown.
// If this destructor runs during stack unwinding, then a different error has
// already happened that should be reported; so we only clean up the resource.
if (std::uncaught_exception())
cudaEventDestroy(m_done);
else
{
// failures in a prior launch might be reported here
CUDA_CALL(cudaEventRecord(m_done));
CUDA_CALL(cudaEventSynchronize(m_done));
CUDA_CALL(cudaEventDestroy(m_done));
}
}
}
};
#endif // CPUONLY

Просмотреть файл

@ -95,8 +95,8 @@ static INT CeilDiv(INT a, INT2 b) // ceil(a/b)
struct GridDim
{
static const CUDA_LONG maxThreadsPerBlock = 512; // use this many threads per block
static const CUDA_LONG maxWarpsPerBlock = 16; // use this many warps per block. This means 512 threads for warpSize=32
static const CUDA_LONG maxThreadsPerBlock = 1024; // use this many threads per block
static const CUDA_LONG maxWarpsPerBlock = 32; // use this many warps per block. This means 1024 threads for warpSize=32
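// Note: kernels that depend on a specific thread count carry it in their name (e.g. _reductionSum1024Threads,
// _vectorMaxMinReduce512Threads) and are launched with that explicit count, independent of this default.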
// use these for launching
// GridDim grid(NN);
@ -127,7 +127,7 @@ struct GridDim
}
// put it back together
m_threadsPerBlock = warpsPerProc * warpSize; // =a multiple of 32 that is as close to 512 as makes sense given NN
m_threadsPerBlock = warpsPerProc * warpSize; // =a multiple of 32 that is as close to 1024 as makes sense given NN
m_blocksPerGrid = CeilDiv(N, m_threadsPerBlock);
if (m_blocksPerGrid == 1)
m_threadsPerBlock = N; // don't launch more than necessary --TODO: Does this make a difference at all?
@ -847,7 +847,7 @@ __global__ void _logSoftMaxColWise(
// each block processes one column. There must be 512 threads in a block
template <class ElemType>
__global__ void _assignColumnwiseLogSoftmaxOf(
__global__ void _assignColumnwiseLogSoftmaxOf512Threads(
const ElemType* a,
ElemType* us,
const CUDA_LONG m_numCols,
@ -1015,7 +1015,7 @@ __global__ void _logSoftMaxRowWise(
// each block processes one column. There must be 512 threads in a block
template <class ElemType>
__global__ void _assignColumnwiseHardmaxOf(
__global__ void _assignColumnwiseHardmaxOf512Threads(
const ElemType* a,
ElemType* us,
const CUDA_LONG m_numCols,
@ -2198,7 +2198,7 @@ __global__ void _addSignOf(
// This function processes one column per block; it needs 512 threads per block.
template <class ElemType, bool IsMax>
__global__ void _vectorMaxMinReduce(
__global__ void _vectorMaxMinReduce512Threads(
const ElemType* us,
ElemType* Indexes,
ElemType* Values,
@ -2585,7 +2585,7 @@ __global__ void _addElementToElement(
}
template <class ElemType>
__global__ void _assignNumOfDiff(
__global__ void _assignNumOfDiff1024Threads(
const ElemType* a,
const ElemType* b,
ElemType* c,
@ -2664,7 +2664,7 @@ __global__ void _assignNumOfDiff(
}
/*template<class ElemType>
__global__ void _assignNumOfDiff(
__global__ void _assignNumOfDiff1024Threads(
ElemType *a,
ElemType *b,
ElemType *c,
@ -3343,8 +3343,9 @@ __global__ void _computeGradientOfInput(
}
#endif
#if 0
template <class ElemType>
__global__ void computeNCEForwardProp(
__global__ void computeNCEForwardProp512Threads(
const ElemType* val,
const int* col,
int numRows,
@ -3406,9 +3407,10 @@ __global__ void computeNCEForwardProp(
res[i] = partials[0];
}
}
#endif
template <class ElemType>
__global__ void _computeNceOutput(
__global__ void _computeNceOutputMax512Threads(
const ElemType* col,
int numRows,
int sampleCount,
@ -3477,7 +3479,7 @@ __global__ void _computeNceOutput(
}
template <class ElemType>
__global__ void _assignSoftmaxSum(
__global__ void _assignSoftmaxSumMax512Threads(
const ElemType* softmax,
int sampleCount,
const ElemType* a,
@ -3489,7 +3491,7 @@ __global__ void _assignSoftmaxSum(
// col is an array contains index of the word samples
// a is a matrix in column major format contains output from hidden layer
// b is the weight matrix for output layer
// tmp is the buffer that stores NCE output calculated from _computeNceOutput
// tmp is the buffer that stores NCE output calculated from _computeNceOutputMax512Threads
// c is the matrix to store objective
__shared__ ElemType partials[512];
@ -3529,7 +3531,7 @@ __global__ void _assignSoftmaxSum(
}
template <class ElemType>
__global__ void _assignNoiseContrastiveEstimation(
__global__ void _assignNoiseContrastiveEstimationMax512Threads(
const ElemType* val,
int numRows,
int sampleCount,
@ -3545,7 +3547,7 @@ __global__ void _assignNoiseContrastiveEstimation(
// col is an array contains index of the word samples
// a is a matrix in column major format contains output from hidden layer
// b is the weight matrix for output layer
// tmp is the buffer that stores NCE output calculated from _computeNceOutput
// tmp is the buffer that stores NCE output calculated from _computeNceOutputMax512Threads
// c is the matrix to store objective
__shared__ ElemType partials[512];
@ -3863,7 +3865,7 @@ __global__ void _normalGradForSparseBlock(
//This function should be called with 1024 threads per block and 1 block
//THIS IS NOT THE MOST EFFICIENT IMPLEMENTATION!!!
template <class ElemType>
__global__ void _reductionSum(
__global__ void _reductionSum1024Threads(
const ElemType* data,
ElemType* sum,
CUDA_LONG N)
@ -3944,7 +3946,7 @@ __global__ void _reductionSum(
//This function should be called with 1024 threads per block and 1 block
//THIS IS NOT THE MOST EFFICIENT IMPLEMENTATION!!!
template <class ElemType>
__global__ void _reductionSumAndAssign(
__global__ void _reductionSumAndAssign1024Threads(
ElemType* toAssign,
const ElemType* data,
CUDA_LONG N, // length of data
@ -4028,7 +4030,7 @@ __global__ void _reductionSumAndAssign(
//This function should be called with 1024 threads per block and 1 block
//THIS IS NOT THE MOST EFFICIENT IMPLEMENTATION!!!
template <class ElemType>
__global__ void _reductionSum2(
__global__ void _reductionSum21024Threads(
const ElemType* data,
ElemType* sum,
CUDA_LONG N,
@ -4118,7 +4120,7 @@ __global__ void _reductionSum2(
//This function should be called with 1024 threads per block and 1 block
//THIS IS NOT THE MOST EFFICIENT IMPLEMENTATION!!!
template <class ElemType>
__global__ void _reductionMatrixNormInf(
__global__ void _reductionMatrixNormInf1024Threads(
const ElemType* data,
ElemType* maxAbs,
CUDA_LONG N)
@ -4206,7 +4208,7 @@ __global__ void _reductionMatrixNormInf(
//This function should be called with 1024 threads per block and 1 block
//THIS IS NOT THE MOST EFFICIENT IMPLEMENTATION!!!
template <class ElemType>
__global__ void _reductionMatrixNorm0(
__global__ void _reductionMatrixNorm01024Threads(
const ElemType* data,
ElemType* nz,
CUDA_LONG N)
@ -4306,7 +4308,7 @@ __global__ void _getSparseVectorRepresntationForCSCMatrix(
}
template <class ElemType>
__global__ void _lrHelper(
__global__ void _lrHelper512Threads(
const ElemType* data1,
const ElemType* data2,
const CUDA_LONG N,
@ -4408,7 +4410,7 @@ __global__ void _lrHelper(
/*
template<class ElemType>
__global__ void _lrHelper(
__global__ void _lrHelper512Threads(
ElemType* d_tmp)
{
if (sizeof(ElemType)==sizeof(float))
@ -4572,83 +4574,11 @@ __global__ void _minusOneAt(
c[id] = c[id] - 1.0;
}
// the kernel function for RCRF backward computation
// the kernel function for CRFLSTMNetwork backward computation
// assume a column slice of input and output
// This function assumes iNumLab <= 1024 and that shared memory == total (!) number of threads == 3 * iNumLab.
template <class ElemType>
__global__ void _rcrfBackwardCompute(
const size_t iNumPos,
const ElemType* galpha, // column slice at current time t
ElemType* gbeta, // column slices with [row, 2] at current time t for [
const ElemType* gpair_scores,
const size_t iNumLab, const int shift)
{
int id = blockDim.x * blockIdx.x + threadIdx.x;
extern __shared__ double sh_alpha_and_beta[]; // intersting, has to use [], instead of *
// need bye size = (iNumPos * iNumLab * 2 + iNumLab * iNumLab) * sizeof(ElemType)
ElemType* alpha = (ElemType*) (sh_alpha_and_beta);
ElemType* pair_scores = alpha + iNumPos * iNumLab;
ElemType* beta = alpha + iNumPos * iNumLab + iNumLab * iNumLab;
if (id < 0 || id >= iNumLab)
return;
// copy global memory to shared memory to save time
for (int t = iNumPos - 1; t >= 0; t--)
{
alpha[IDX2C(id, t, iNumLab)] = galpha[IDX2C(id, t, iNumLab)];
}
for (int j = 0; j < iNumLab; j++)
pair_scores[IDX2C(id, j, iNumLab)] = gpair_scores[IDX2C(id, j, iNumLab)];
__syncthreads();
for (int t = iNumPos - 1; t >= 0; t--)
{
ElemType fSum;
ElemType fTmp = LZERO;
if (t == iNumPos - 1)
{
fSum = LZERO;
for (int j = 0; j < iNumLab; j++)
{
fSum = logaddk(fSum, alpha[IDX2C(j, t, iNumLab)]);
}
fTmp = alpha[IDX2C(id, t, iNumLab)] - fSum;
}
else
{
for (int j = 0; j < iNumLab; j++)
{
fSum = LZERO;
for (int m = 0; m < iNumLab; m++)
{
fSum = logaddk(fSum, alpha[IDX2C(m, t, iNumLab)] + pair_scores[IDX2C(j, m, iNumLab)]);
}
fTmp = logaddk(fTmp, beta[IDX2C(j, t + 1, iNumLab)] + alpha[IDX2C(id, t, iNumLab)] + pair_scores[IDX2C(j, id, iNumLab)] - fSum);
}
}
beta[IDX2C(id, t, iNumLab)] = fTmp;
__syncthreads();
}
// copy from shared memory to global memory to pass values
for (int t = iNumPos - 1; t >= 0; t--)
{
gbeta[IDX2C(id, t, iNumLab)] = beta[IDX2C(id, t, iNumLab)];
}
// __syncthreads();
}
/// the kernel function for CRFLSTMNetwork backward computation
/// assume a column slice of input and output
template <class ElemType>
__global__ void _rcrfBackwardCompute(
__global__ void _rcrfBackwardComputeMax1024Labels(
const size_t t, // time position
const size_t iNumPos,
const ElemType* galpha, // column slice at current time t
@ -4659,13 +4589,13 @@ __global__ void _rcrfBackwardCompute(
{
int id = blockDim.x * blockIdx.x + threadIdx.x;
extern __shared__ double sh_alpha_and_beta[]; // intersting, has to use [], instead of *
// need bye size = (iNumPos * iNumLab * 2 + iNumLab * iNumLab) * sizeof(ElemType)
extern __shared__ double sh_alpha_and_beta[]; // [id] or [id + iNumLab] or [id + 2 * iNumLab]
// need byte size = (iNumPos * iNumLab * 2 + iNumLab * iNumLab) * sizeof(ElemType)
ElemType* alpha = (ElemType*) (sh_alpha_and_beta);
ElemType* beta_t1 = (ElemType*) (alpha + iNumLab);
ElemType* zeta = (ElemType*) (beta_t1 + iNumLab);
ElemType pair_scores[1024];
ElemType pair_scores[1024]; // [j=0..iNumLab-1]
if (id < 0 || id >= iNumLab)
return;
@ -4697,9 +4627,10 @@ __global__ void _rcrfBackwardCompute(
gbeta[IDX2C(id, t, iNumLab)] = fTmp;
}
/// $\zeta_t(j) = {\sum_k exp(\delta_{t-1}(k) + a_{kj}(t))}$.
// $\zeta_t(j) = {\sum_k exp(\delta_{t-1}(k) + a_{kj}(t))}$.
// This function assumes iNumLab <= 1024 and that shared memory == total (!) number of threads == iNumLab.
template <class ElemType>
__global__ void _rcrfBackwardComputeZeta(
__global__ void _rcrfBackwardComputeZetaMax1024Labels(
const size_t t, // time position
const size_t iNumPos,
const ElemType* galpha, // column slice at current time t
@ -4709,11 +4640,11 @@ __global__ void _rcrfBackwardComputeZeta(
{
int id = blockDim.x * blockIdx.x + threadIdx.x;
extern __shared__ double sh_alpha_and_beta[]; // intersting, has to use [], instead of *
// need bye size = (iNumPos * iNumLab * 2 + iNumLab * iNumLab) * sizeof(ElemType)
extern __shared__ double sh_alpha_and_beta[]; // [id]
// need byte size = (iNumPos * iNumLab * 2 + iNumLab * iNumLab) * sizeof(ElemType)
ElemType* alpha = (ElemType*) (sh_alpha_and_beta);
ElemType pair_scores[1024];
ElemType pair_scores[1024]; // [j=0..iNumLab-1]
if (id < 0 || id >= iNumLab)
return;
@ -4739,8 +4670,9 @@ __global__ void _rcrfBackwardComputeZeta(
}
/// $\zeta_t(j) = {\sum_k exp(\delta_{t-1}(k) + a_{kj}(t))}$.
// This function assumes iNumLab <= 1024 and that shared memory == total (!) number of threads == iNumLab.
template <class ElemType>
__global__ void _rcrfTransGrdComputeZeta(
__global__ void _rcrfTransGrdComputeZetaMax1024Labels(
const int t, // time position
const size_t iNumPos,
const ElemType* galpha, // column slice at current time t
@ -4752,11 +4684,11 @@ __global__ void _rcrfTransGrdComputeZeta(
{
int id = blockDim.x * blockIdx.x + threadIdx.x;
extern __shared__ double sh_alpha_and_beta[]; // intersting, has to use [], instead of *
// need bye size = (iNumPos * iNumLab * 2 + iNumLab * iNumLab) * sizeof(ElemType)
extern __shared__ double sh_alpha_and_beta[]; // [id]
// need byte size = (iNumPos * iNumLab * 2 + iNumLab * iNumLab) * sizeof(ElemType)
ElemType* alpha = (ElemType*) (sh_alpha_and_beta);
ElemType pair_scores[1024];
ElemType pair_scores[1024]; // [j=0..iNumLab-1]
if (id < 0 || id >= iNumLab)
return;
@ -4790,8 +4722,9 @@ __global__ void _rcrfTransGrdComputeZeta(
gzeta[id] = fSum;
}
// This function assumes iNumLab <= 1024 and that shared memory == total (!) number of threads == iNumLab.
template <class ElemType>
__global__ void _rcrfTransGrdCompute(
__global__ void _rcrfTransGrdComputeMax1024Labels(
int t,
const size_t start_lbl,
const ElemType* galpha,
@ -4806,13 +4739,13 @@ __global__ void _rcrfTransGrdCompute(
{
int id = blockDim.x * blockIdx.x + threadIdx.x;
extern __shared__ double sh_alpha_and_beta[]; // intersting, has to use [], instead of *
// need bye size = (iNumPos * iNumLab * 2 + iNumLab * iNumLab) * sizeof(ElemType)
extern __shared__ double sh_alpha_and_beta[]; // [id]
// need byte size = (iNumPos * iNumLab * 2 + iNumLab * iNumLab) * sizeof(ElemType)
ElemType* alpha = (ElemType*) (sh_alpha_and_beta);
ElemType* beta = (ElemType*) (alpha + iNumLab);
ElemType* zeta = (ElemType*) (beta + iNumLab);
ElemType pair_scores[1024];
ElemType pair_scores[1024]; // [j=0..iNumLab-1]
if (id < 0 || id >= iNumLab)
return;

Просмотреть файл

@ -2290,7 +2290,7 @@ ElemType GPUSparseMatrix<ElemType>::SumOfElements() const
ElemType* d_sum = TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), 1);
ElemType h_sum;
// WARNING: THIS kernel is not the most efficient way!
_reductionSum<ElemType><<<1, 1024>>>(NzValues(), d_sum, (LONG64) GetNumNZElements());
_reductionSum1024Threads<ElemType><<<1, 1024>>>(NzValues(), d_sum, (LONG64) GetNumNZElements());
CUDA_CALL(cudaMemcpy(&h_sum, d_sum, sizeof(ElemType), cudaMemcpyDeviceToHost));
TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), d_sum);
@ -2307,7 +2307,7 @@ ElemType GPUSparseMatrix<ElemType>::FrobeniusNorm() const
ElemType* d_sum = TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), 1);
ElemType h_sum = 0;
// WARNING: THIS kernel is not the most efficient way!
_reductionSum2<ElemType><<<1, 1024>>>(NzValues(), d_sum, (int) GetNumNZElements());
_reductionSum21024Threads<ElemType><<<1, 1024>>>(NzValues(), d_sum, (int) GetNumNZElements());
CUDA_CALL(cudaMemcpy(&h_sum, d_sum, sizeof(ElemType), cudaMemcpyDeviceToHost));
TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), d_sum);
@ -2326,7 +2326,7 @@ ElemType GPUSparseMatrix<ElemType>::MatrixNormInf() const
ElemType* d_maxAbs = TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), 1);
ElemType h_maxAbs = 0;
// WARNING: THIS kernel is not the most efficient way!
_reductionMatrixNormInf<ElemType><<<1, 1024>>>(NzValues(), d_maxAbs, (int) GetNumNZElements());
_reductionMatrixNormInf1024Threads<ElemType><<<1, 1024>>>(NzValues(), d_maxAbs, (int) GetNumNZElements());
CUDA_CALL(cudaMemcpy(&h_maxAbs, d_maxAbs, sizeof(ElemType), cudaMemcpyDeviceToHost));
TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), d_maxAbs);
@ -2689,7 +2689,6 @@ template void GPUSparseMatrix<char>::CopyToCPUSparseMatrix(CPUSparseMatrix<char>
template void GPUSparseMatrix<char>::ChangeDeviceTo(int);
template void GPUSparseMatrix<char>::Resize(const size_t, const size_t, const size_t, const bool);
template void GPUSparseMatrix<char>::RequireSizeAndAllocate(const size_t, const size_t, const size_t, const bool, const bool);
template void GPUSparseMatrix<int>::RequireSizeAndAllocate(const size_t, const size_t, const size_t, const bool, const bool);
template void GPUSparseMatrix<char>::Reset();
template GPUSPARSE_INDEX_TYPE GPUSparseMatrix<char>::SecondaryIndexValueAt(size_t) const;
template GPUSparseMatrix<char>::~GPUSparseMatrix();
@ -2699,8 +2698,32 @@ template GPUSparseMatrix<char>& GPUSparseMatrix<char>::operator=(GPUSparseMatrix
template void GPUSparseMatrix<char>::Reshape(const size_t, const size_t);
template void GPUSparseMatrix<char>::ScaleAndAdd(char, GPUSparseMatrix<char> const &, GPUMatrix<char> &);
// Support <short>
template GPUSparseMatrix<short>::GPUSparseMatrix(DEVICEID_TYPE, const MatrixFormat);
template GPUSparseMatrix<short>::GPUSparseMatrix(const size_t, const size_t, const size_t, DEVICEID_TYPE, const MatrixFormat);
template GPUSparseMatrix<short>::GPUSparseMatrix(GPUSparseMatrix<short> const&);
template GPUSparseMatrix<short>::GPUSparseMatrix(GPUSparseMatrix<short>&&);
template void GPUSparseMatrix<short>::SetValue(CPUSparseMatrix<short> const&);
template void GPUSparseMatrix<short>::SetValue(GPUSparseMatrix<short> const&);
template void GPUSparseMatrix<short>::SetValue(GPUMatrix<short> const&);
//template void GPUSparseMatrix<short>::SetValue(CPUMatrix<short> const&);
template void GPUSparseMatrix<short>::CopyToDenseMatrix(GPUMatrix<short>&) const;
template void GPUSparseMatrix<short>::CopyToCPUSparseMatrix(CPUSparseMatrix<short>&) const;
template void GPUSparseMatrix<short>::ChangeDeviceTo(int);
template void GPUSparseMatrix<short>::Resize(const size_t, const size_t, const size_t, const bool);
template void GPUSparseMatrix<short>::RequireSizeAndAllocate(const size_t, const size_t, const size_t, const bool, const bool);
template void GPUSparseMatrix<short>::Reset();
template GPUSPARSE_INDEX_TYPE GPUSparseMatrix<short>::SecondaryIndexValueAt(size_t) const;
template GPUSparseMatrix<short>::~GPUSparseMatrix();
template GPUSparseMatrix<short> GPUSparseMatrix<short>::ColumnSlice(size_t, size_t) const;
template GPUMatrix<short> GPUSparseMatrix<short>::CopyColumnSliceToDense(size_t, size_t) const;
template GPUSparseMatrix<short>& GPUSparseMatrix<short>::operator=(GPUSparseMatrix<short>&&);
template void GPUSparseMatrix<short>::Reshape(const size_t, const size_t);
template void GPUSparseMatrix<short>::ScaleAndAdd(short, GPUSparseMatrix<short> const &, GPUMatrix<short> &);
template GPUSparseMatrix<int>::GPUSparseMatrix(DEVICEID_TYPE, const MatrixFormat);
template GPUSparseMatrix<int>::~GPUSparseMatrix();
template void GPUSparseMatrix<int>::RequireSizeAndAllocate(const size_t, const size_t, const size_t, const bool, const bool);
template <class ElemType>
MATH_API File& operator>>(File& stream, GPUSparseMatrix<ElemType>& us)


@ -19,6 +19,7 @@
#include <cuda_runtime.h>
#include "cublas_v2.h"
#include <assert.h>
#include <limits.h>
#ifndef let
#define let const auto
@ -47,9 +48,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// - supports general strides
// - input broadcasting is supported by stride=0
// - the operation is denoted by an opCode
// - reduction is supported, including summation (dual to broadcasting when computing gradients)
// - reduction operation is given by an opCode. Only a few specific opCodes may be used for reduction.
// Note: reduction opCodes are not implemented yet, only summation is supported.
// - reduction is supported, including summation, min, max (dual to broadcasting when computing gradients)
// - reduction operation is given by an opCode: opSum, opMin, opMax and opLogSum.
//
// This library makes extensive use of templates and macros.
// Specifically, templates are used recursively to recurse over tensor dimensions.
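As a rough illustration of the stride-0 broadcasting and reduction idea described above: the same strided loop serves both a plain summation reduction and a broadcast input. A minimal host-side sketch (BroadcastSumSketch is a hypothetical helper for illustration, not part of the CNTK sources):

#include <cstddef>

// Host-side sketch only: an input traversed with stride 0 yields the same element at every
// step (broadcasting); with a non-zero stride the identical loop performs a strided reduction.
template <typename T>
void BroadcastSumSketch(const T* in, std::ptrdiff_t inStride, std::size_t reduceDim, T* out)
{
    T aggregate = 0; // neutral element of opSum
    for (std::size_t k = 0; k < reduceDim; k++)
        aggregate += in[(std::ptrdiff_t) k * inStride]; // inStride == 0 reads in[0] every time
    *out = aggregate;
}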
@ -261,6 +261,65 @@ struct TensorOps
}
};
//----------------------------------------------------------------------------
// For reductions we need the neutral elements of the corresponding binary ops
//----------------------------------------------------------------------------
template <typename ElemType> __device__ ElemType NeutralValue(ElementWiseOperator op)
{
return 0; // error, only the explicit specializations below should be used.
};
template<> __device__ float NeutralValue<float>(ElementWiseOperator op)
{
switch (op)
{
case ElementWiseOperator::opSum: return 0;
case ElementWiseOperator::opLogSum: return -INFINITY;
case ElementWiseOperator::opMin: return FLT_MAX;
case ElementWiseOperator::opMax: return -FLT_MAX; // lowest finite float (FLT_MIN is the smallest positive value, not a valid identity for max)
default: return 0; // error
}
};
template<> __device__ double NeutralValue<double>(ElementWiseOperator op)
{
switch (op)
{
case ElementWiseOperator::opSum: return 0;
case ElementWiseOperator::opLogSum: return -INFINITY;
case ElementWiseOperator::opMin: return DBL_MAX;
case ElementWiseOperator::opMax: return -DBL_MAX; // lowest finite double
default: return 0; // error
}
};
// ----------------------------------------------------------------------------
// Function to update an aggregate value for the specified reduction operation
// ----------------------------------------------------------------------------
template<typename ReductionType, class ElemType> __device__ void UpdateAggregate(ReductionType& aggregate, ElemType val, ElementWiseOperator reductionOp)
{
switch (reductionOp)
{
case ElementWiseOperator::opSum:
aggregate += val;
break;
case ElementWiseOperator::opLogSum:
aggregate = OpLogSum(aggregate, val);
break;
case ElementWiseOperator::opMin:
if (val < aggregate)
aggregate = val;
break;
case ElementWiseOperator::opMax:
if (val > aggregate)
aggregate = val;
break;
}
};
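Taken together, NeutralValue and UpdateAggregate let one generic loop implement every supported reduction. A sketch of how they compose (illustrative only, not part of the original source; both helpers are __device__ functions, so this would have to live in device code):

// Sequential analogue of the reduction loops below.
template <typename ReduceElemType, class ElemType>
__device__ ReduceElemType ReduceSketch(const ElemType* vals, CUDA_LONG n, ElementWiseOperator reductionOp)
{
    ReduceElemType aggregate = NeutralValue<ReduceElemType>(reductionOp); // identity of the op
    for (CUDA_LONG i = 0; i < n; i++)
        UpdateAggregate<ReduceElemType, ElemType>(aggregate, vals[i], reductionOp);
    return aggregate; // sum, log-sum, min, or max of vals[0..n-1]
}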
// -----------------------------------------------------------------------
// function to compute the value for a given output location (including reduction)
// -----------------------------------------------------------------------
@ -272,12 +331,12 @@ template <class ElemType, C_size_t N, C_int M, C_int m>
struct TensorOpReduce
{
// this version for m >= 0
static __device__ ElemType Compute(FixedArray<ElemType*, N> pointers, ElementWiseOperator op,
static __device__ ElemType Compute(FixedArray<ElemType*, N> pointers, ElementWiseOperator op, ElementWiseOperator reductionOp,
const FixedArray<C_unsigned_int, M>& reducingOpDims, const FixedMatrix<C_int, N, M>& reducingStrides)
{
// start with index 0
// We may use 'double' since we are memory-bound anyway.
ReduceElemType aggregate = TensorOpReduce<ElemType, N, M, m - 1>::Compute(pointers, op, reducingOpDims, reducingStrides);
ReduceElemType aggregate = TensorOpReduce<ElemType, N, M, m - 1>::Compute(pointers, op, reductionOp, reducingOpDims, reducingStrides);
// apply this index to the pointers
C_size_t dim = reducingOpDims[m];
for (C_size_t k = 1 /*done with k=0 already*/; k < dim; k++)
@ -285,8 +344,8 @@ struct TensorOpReduce
// bump the pointers
for (C_size_t i = 0; i < N - 1; i++) // N-1 because output is not used here
pointers[i] += reducingStrides(i, (C_size_t) m);
ElemType val = TensorOpReduce<ElemType, N, M, m - 1>::Compute(pointers, op, reducingOpDims, reducingStrides);
aggregate += val;
ElemType val = TensorOpReduce<ElemType, N, M, m - 1>::Compute(pointers, op, reductionOp, reducingOpDims, reducingStrides);
UpdateAggregate<ReduceElemType, ElemType>(aggregate, val, reductionOp);
}
return (ElemType) aggregate;
}
@ -299,7 +358,7 @@ struct TensorOpReduce<ElemType, N, M, /*m=*/-1>
{
// this version for m = -1
// the pointers are pointing to the right location(s) to take the operation over
static __device__ ElemType Compute(FixedArray<ElemType*, N> pointers, ElementWiseOperator op,
static __device__ ElemType Compute(FixedArray<ElemType*, N> pointers, ElementWiseOperator op, ElementWiseOperator reductionOp,
const FixedArray<C_unsigned_int, M>& /*reducingOpDims*/, const FixedMatrix<C_int, N, M>& /*reducingStrides*/)
{
return TensorOps<ElemType>::Compute(pointers, op); // finally computing something!
@ -354,7 +413,7 @@ template <class ElemType, C_size_t N, C_int M, C_int K, bool parallelReduce, C_i
struct TensorOpElement
{
// template-recursive version loops over indices
static __device__ void Compute(CUDA_LONG id, ElemType beta, FixedArray<ElemType*, N>& pointers, ElemType alpha, ElementWiseOperator op,
static __device__ void Compute(CUDA_LONG id, ElemType beta, FixedArray<ElemType*, N>& pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const FixedArray<C_unsigned_int, K>& regularOpStrides, const FixedMatrix<C_int, N, K>& regularStrides,
const FixedArray<C_unsigned_int, M>& reducingOpDims, const FixedMatrix<C_int, N, M>& reducingStrides,
CUDA_LONG reductionBegin, CUDA_LONG reductionChunkSize)
@ -367,7 +426,7 @@ struct TensorOpElement
for (C_size_t i = 0; i < N; i++)
pointers[i] += index * regularStrides(i, (C_size_t) k); // now this dimension is taken care of
// process the previous index
TensorOpElement<ElemType, N, M, K, parallelReduce, k - 1>::Compute(id, beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, reductionBegin, reductionChunkSize);
TensorOpElement<ElemType, N, M, K, parallelReduce, k - 1>::Compute(id, beta, pointers, alpha, op, reductionOp, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, reductionBegin, reductionChunkSize);
}
};
@ -376,7 +435,7 @@ template <class ElemType, C_size_t N, C_int M, C_int K, bool parallelReduce>
struct TensorOpElement<ElemType, N, M, K, parallelReduce, /*k=*/0>
{
// template-recursive version loops over indices
static __device__ void Compute(CUDA_LONG id, ElemType beta, FixedArray<ElemType*, N>& pointers, ElemType alpha, ElementWiseOperator op,
static __device__ void Compute(CUDA_LONG id, ElemType beta, FixedArray<ElemType*, N>& pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const FixedArray<C_unsigned_int, K>& regularOpStrides, const FixedMatrix<C_int, N, K>& regularStrides,
const FixedArray<C_unsigned_int, M>& reducingOpDims, const FixedMatrix<C_int, N, M>& reducingStrides,
CUDA_LONG reductionBegin, CUDA_LONG reductionChunkSize)
@ -387,7 +446,7 @@ struct TensorOpElement<ElemType, N, M, K, parallelReduce, /*k=*/0>
for (C_size_t i = 0; i < N; i++)
pointers[i] += index * regularStrides(i, 0); // now this dimension is taken care of
// process the previous index
TensorOpElement<ElemType, N, M, K, parallelReduce, -1>::Compute(/*id*/ 0, beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, reductionBegin, reductionChunkSize);
TensorOpElement<ElemType, N, M, K, parallelReduce, -1>::Compute(/*id*/ 0, beta, pointers, alpha, op, reductionOp, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, reductionBegin, reductionChunkSize);
}
};
@ -397,13 +456,13 @@ struct TensorOpElement<ElemType, N, M, K, /*parallelReduce=*/false, /*k=*/-1>
{
// template-recursion-terminating version computes the actual value for this output location
// now the output pointers point to the right element (input pointers may still iterate for reduction)
static __device__ void Compute(CUDA_LONG /*id*/, ElemType beta, FixedArray<ElemType*, N>& pointers, ElemType alpha, ElementWiseOperator op,
static __device__ void Compute(CUDA_LONG /*id*/, ElemType beta, FixedArray<ElemType*, N>& pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const FixedArray<C_unsigned_int, K>& /*regularOpStrides*/, const FixedMatrix<C_int, N, K>& /*regularStrides*/,
const FixedArray<C_unsigned_int, M>& reducingOpDims, const FixedMatrix<C_int, N, M>& reducingStrides, CUDA_LONG /*reductionBegin*/, CUDA_LONG /*reductionChunkSize*/)
{
// compute the operation for this output coordinate
// This may still involve a reduction over inverse-broadcasting dimensions.
ElemType val = TensorOpReduce<ElemType, N, M, M - 1>::Compute(pointers, op, reducingOpDims, reducingStrides);
ElemType val = TensorOpReduce<ElemType, N, M, M - 1>::Compute(pointers, op, reductionOp, reducingOpDims, reducingStrides);
// scale
val *= alpha;
// combine with previous value in target matrix, then write it out
@ -423,7 +482,7 @@ struct TensorOpElement<ElemType, N, M, K, /*parallelReduce=*/true, /*k=*/-1>
{
// template-recursion-terminating version computes the actual value for this output location
// now the output pointers point to the right element (input pointers may still iterate for reduction)
static __device__ void Compute(CUDA_LONG /*id*/, ElemType beta, FixedArray<ElemType*, N>& pointers, ElemType alpha, ElementWiseOperator op,
static __device__ void Compute(CUDA_LONG /*id*/, ElemType beta, FixedArray<ElemType*, N>& pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const FixedArray<C_unsigned_int, K>& /*regularOpStrides*/, const FixedMatrix<C_int, N, K>& /*regularStrides*/,
const FixedArray<C_unsigned_int, M>& reducingOpDims, const FixedMatrix<C_int, N, M>& reducingStrides, CUDA_LONG reductionBegin, CUDA_LONG reductionChunkSize)
{
@ -442,22 +501,24 @@ struct TensorOpElement<ElemType, N, M, K, /*parallelReduce=*/true, /*k=*/-1>
CUDA_LONG reductionEnd = min(reductionBegin + reductionChunkSize, reductionDim);
// compute the operation for this input coordinate
ReduceElemType sum = 0;
ReduceElemType aggregate = NeutralValue<ReduceElemType>(reductionOp);
for (CUDA_LONG redId = reductionBegin + tid; redId < reductionEnd; redId += tids)
{
auto val = TensorOpParallelReduce<ElemType, N, M, M - 1>::Compute(redId, pointers, op, reducingOpDims, reducingStrides);
sum += val;
UpdateAggregate<ReduceElemType, ElemType>(aggregate, val, reductionOp);
}
// reduce --cf https://docs.nvidia.com/cuda/samples/6_Advanced/reduction/doc/reduction.pdf
__shared__ ReduceElemType volatile accumulators[GridDim::maxThreadsPerBlock /*tids*/];
accumulators[tid] = sum;
accumulators[tid] = aggregate;
__syncthreads();
static_assert(GridDim::maxThreadsPerBlock <= 512, "GridDim::maxThreadsPerBlock too large, need to add manually unrolled steps");
for (CUDA_LONG i = 256; i; i >>= 1)
static_assert(GridDim::maxThreadsPerBlock <= 1024, "GridDim::maxThreadsPerBlock too large, need to add manually unrolled steps");
for (CUDA_LONG i = 512; i; i >>= 1)
{
if (tid < i && tid + i < tids)
accumulators[tid] += accumulators[tid + i];
UpdateAggregate<volatile ReduceElemType, volatile ReduceElemType>(accumulators[tid], accumulators[tid + i], reductionOp);
if (0 + i < tids)
__syncthreads(); // sync if condition true for at least one thread
// TODO: use volatile* and then we can skip the __syncthreads() for the last 32 values. See Amit's allreduce() function implementation in MatrixQuantizer_kernel.cu.
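The loop above is the standard shared-memory tree reduction: each pass halves the number of active threads until accumulators[0] holds the block-wide aggregate. For reference, a stand-alone plain-sum version of the same pattern (BlockSumSketch is illustrative only and assumes a power-of-two block size of at most 1024 threads):

__global__ void BlockSumSketch(const float* in, float* out, int n)
{
    __shared__ float acc[1024];
    int tid = threadIdx.x;
    float sum = 0;
    for (int i = tid; i < n; i += blockDim.x) // each thread accumulates a strided slice of the input
        sum += in[i];
    acc[tid] = sum;
    __syncthreads();
    for (int i = blockDim.x / 2; i > 0; i >>= 1) // tree reduction, halving the active threads each pass
    {
        if (tid < i)
            acc[tid] += acc[tid + i];
        __syncthreads();
    }
    if (tid == 0)
        out[blockIdx.x] = acc[0]; // one partial result per block
}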
@ -496,13 +557,13 @@ struct TensorOpElement<ElemType, N, M, K, /*parallelReduce=*/true, /*k=*/-1>
// launch tensor op with CUDA
template <class ElemType, C_size_t N, C_int M, C_int K>
__global__ void _launchTensorOp(ElemType beta, FixedArray<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op,
__global__ void _launchTensorOp(ElemType beta, FixedArray<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
FixedArray<C_unsigned_int, K> regularOpStrides, FixedMatrix<C_int, N, K> regularStrides, CUDA_LONG numElements,
FixedArray<C_unsigned_int, M> reducingOpDims, FixedMatrix<C_int, N, M> reducingStrides)
{
CUDA_LONG id = GridDim::GetLinearThreadId();
if (id < numElements) // note: there are no __syncthread() calls inside
TensorOpElement<ElemType, N, M, K, false, K - 1>::Compute(id, beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, 0, 0);
TensorOpElement<ElemType, N, M, K, false, K - 1>::Compute(id, beta, pointers, alpha, op, reductionOp, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, 0, 0);
}
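// Illustrative note (not from the original source): for a 1-D launch the linear thread id used
// above amounts to the usual CUDA flattening, roughly
//   blockIdx.x * blockDim.x + threadIdx.x
// so each thread owns exactly one output element; GridDim::GetLinearThreadId() is assumed to
// generalize this to whatever grid geometry GridDim chose.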
template <class ElemType, C_size_t N, C_int K>
@ -527,7 +588,7 @@ static void LaunchTensorOp(ElemType beta, array<ElemType*, N> pointerVector, Ele
CUDA_LONG NN = (CUDA_LONG) numElements; // linear space identifying each individual input element
SyncGuard syncGuard;
GridDim grid(NN);
_launchTensorOp<ElemType, N, /*M=*/0, K><<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream>>>(beta, pointers, alpha, op, regularOpStrides, regularStrides, grid.m_N, reducingOpDims, reducingStrides);
_launchTensorOp<ElemType, N, /*M=*/0, K> <<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream >>>(beta, pointers, alpha, op, (ElementWiseOperator)(-1) /* dummy reductionOp */, regularOpStrides, regularStrides, grid.m_N, reducingOpDims, reducingStrides);
}
// -----------------------------------------------------------------------
@ -535,7 +596,7 @@ static void LaunchTensorOp(ElemType beta, array<ElemType*, N> pointerVector, Ele
// -----------------------------------------------------------------------
template <class ElemType, C_size_t N, C_int M, C_int K>
__global__ void _launchTensorOpWithReduction(ElemType beta, FixedArray<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op,
__global__ void _launchTensorOpWithReduction(ElemType beta, FixedArray<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
FixedArray<C_unsigned_int, K> regularOpStrides, FixedMatrix<C_int, N, K> regularStrides, CUDA_LONG numElements,
FixedArray<C_unsigned_int, M> reducingOpDims, FixedMatrix<C_int, N, M> reducingStrides,
CUDA_LONG reductionBegin, CUDA_LONG reductionChunkSize)
@ -546,7 +607,7 @@ __global__ void _launchTensorOpWithReduction(ElemType beta, FixedArray<ElemType*
pointers[pointers.size() - 1] += numElements * reductionBlock; // the output tensor is dense (no gaps); and there is one copy for each reduction block (those get further reduced into one later)
#endif
if (id < numElements) // note: we have __syncthread() calls but only entire blocks in sync, so this is OK
TensorOpElement<ElemType, N, M, K, true, K - 1>::Compute(id, beta, pointers, alpha, op, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, reductionBegin, reductionChunkSize);
TensorOpElement<ElemType, N, M, K, true, K - 1>::Compute(id, beta, pointers, alpha, op, reductionOp, regularOpStrides, regularStrides, reducingOpDims, reducingStrides, reductionBegin, reductionChunkSize);
}
// helper function to provide a reduction buffer
@ -582,7 +643,7 @@ static shared_ptr<ElemType> GetReductionBuffer(size_t N)
// All dimensions (N-ariness, number of input dimensions K and number of reduction dimensions M) are bound to template parameters now.
template <class ElemType, C_size_t N, C_int M, C_int K>
static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> pointerVector, ElemType alpha, ElementWiseOperator op,
static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> pointerVector, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrideVectors,
const SmallVector<size_t>& reducingOpDimVector, const array<SmallVector<ptrdiff_t>, N>& reducingStrideVectors)
{
@ -601,7 +662,7 @@ static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> point
FixedMatrix<C_int, N, M> reducingStrides(reducingStrideVectors);
// launch the kernel
CUDA_LONG NN = (CUDA_LONG) numElements; // linear space identifying each individual input element
CUDA_LONG NN = (CUDA_LONG) numElements; // linear space identifying each individual output element
SyncGuard syncGuard;
// do some optimization for reductions
@ -631,7 +692,7 @@ static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> point
{
// we got enough elements to generate: do one element per thread, and reduction inside
_launchTensorOp<ElemType, N, M, K><<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream>>>(
beta, pointers, alpha, op,
beta, pointers, alpha, op, reductionOp,
regularOpStrides, regularStrides, grid.m_N,
reducingOpDims, reducingStrides);
}
@ -684,9 +745,9 @@ static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> point
if (numReductionChunks == 1)
{
_launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, numBlocksZ), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(
beta, pointers, alpha, op,
beta, pointers, alpha, op, reductionOp,
regularOpStrides, regularStrides, NN,
reducingOpDims, reducingStrides, 0, reductionChunkSize);
reducingOpDims, reducingStrides, /*reductionBegin*/ 0, reductionChunkSize);
}
// --- case (b)
// Reduction across blocks. This is the difficult one.
@ -721,7 +782,7 @@ static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> point
ElemType beta1 = 0;
ElemType alpha1 = 1;
_launchTensorOpWithReduction<ElemType, N, M, K> << <dim3(numBlocksX, numBlocksY, numBlocksZ), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream >> >(
beta1, pointers1, alpha1, op,
beta1, pointers1, alpha1, op, reductionOp,
regularOpStrides, regularStrides1, NN,
reducingOpDims, reducingStrides, /*reductionBegin*/0, reductionChunkSize);
@ -738,14 +799,14 @@ static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> point
const array<SmallVector<ptrdiff_t>, 2> reducingStrideVectors2{ SmallVector<ptrdiff_t>{ NN }, SmallVector<ptrdiff_t>{ 0 } };
const SmallVector<size_t> reducingOpDimVector2{ (size_t)numReductionChunks };
LaunchTensorOpWithReduction<ElemType, /*N=*/2, /*M=*/1, K>(
beta, pointerVector2, alpha, ElementWiseOperator::opCopy,
beta, pointerVector2, alpha, ElementWiseOperator::opCopy, reductionOp,
regularOpDims, regularStrideVectors2,
reducingOpDimVector2, reducingStrideVectors2);
// (note: ^^this will have a nested syncGuard, which is fine)
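// Illustrative note (not part of the original source): the two launches above form a two-stage
// reduction. Stage 1 writes one partial result per reduction chunk into the temporary buffer,
// laid out as tmp[outputIndex + NN * chunkIndex]; stage 2 reuses the same launcher with opCopy
// as the element op to fold the chunk axis, i.e. conceptually
//   out[i] = reduce(reductionOp, { tmp[i + NN * c] : c = 0..numReductionChunks-1 })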
#else
_launchTensorOp<ElemType, N, M, K><<<grid.m_blocksPerGrid, grid.m_threadsPerBlock, 0, t_stream>>>(
beta, pointers, alpha, op,
beta, pointers, alpha, op, reductionOp,
regularOpStrides, regularStrides, grid.m_N,
reducingOpDims, reducingStrides);
//for (size_t z = 0; z < numBlocksZ; z++)
@ -768,16 +829,16 @@ static void LaunchTensorOpWithReduction(ElemType beta, array<ElemType*, N> point
else if (beta == 1)
{
// no need to pre-scale; just add (common for gradients)
_launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, numBlocksZ), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(beta, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize);
_launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, numBlocksZ), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(beta, pointers, alpha, op, reductionOp, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize);
return;
}
else
{
// We need more than one chunk, we will use atomicAdd().
// First reset/pre-multiply input; then do the remaining chunks using atomicAdd().
_launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, 1), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(beta, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize);
_launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, 1), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(beta, pointers, alpha, op, reductionOp, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, 0, reductionChunkSize);
// We will leave it like this for a while, but eventually need to revisit using temporary memory.
_launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, numBlocksZ - 1), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(/*beta=*/1, pointers, alpha, op, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, reductionChunkSize, reductionChunkSize);
_launchTensorOpWithReduction<ElemType, N, M, K><<<dim3(numBlocksX, numBlocksY, numBlocksZ - 1), numThreadsX, numThreadsX * sizeof(ReduceElemType), t_stream>>>(/*beta=*/1, pointers, alpha, op, reductionOp, regularOpStrides, regularStrides, NN, reducingOpDims, reducingStrides, reductionChunkSize, reductionChunkSize);
}
#endif
}
@ -856,7 +917,7 @@ void LaunchUnaryTensorOp(ElemType beta, const ElemType* pa, ElemType* pb, ElemTy
// tensor operation with k+1 dimensions (-1 means scalar)
template <class ElemType, C_size_t N, C_int K>
static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N>& pointers, ElemType alpha, ElementWiseOperator op,
static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N>& pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{
@ -864,9 +925,9 @@ static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N>& po
switch (dims)
{
case 2:
return LaunchTensorOpWithReduction<ElemType, N, 2, K>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return LaunchTensorOpWithReduction<ElemType, N, 2, K>(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 1:
return LaunchTensorOpWithReduction<ElemType, N, 1, K>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return LaunchTensorOpWithReduction<ElemType, N, 1, K>(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 0:
return LaunchTensorOp<ElemType, N, K>(beta, pointers, alpha, op, regularOpDims, regularStrides);
default:
@ -877,7 +938,7 @@ static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N>& po
// tensor operation, generalized in number of arguments
// This function now expands into different k. It also eliminates the offsets by adding them to the pointers.
template <class ElemType, C_size_t N>
void TensorOpN(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op,
void TensorOpN(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, N>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
@ -888,15 +949,15 @@ void TensorOpN(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, Elem
switch (dims)
{
case 4:
return TensorOpWithRegularLoop<ElemType, N, 4>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpWithRegularLoop<ElemType, N, 4>(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 3:
return TensorOpWithRegularLoop<ElemType, N, 3>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpWithRegularLoop<ElemType, N, 3>(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 2:
return TensorOpWithRegularLoop<ElemType, N, 2>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpWithRegularLoop<ElemType, N, 2>(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 1:
return TensorOpWithRegularLoop<ElemType, N, 1>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpWithRegularLoop<ElemType, N, 1>(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 0:
return TensorOpWithRegularLoop<ElemType, N, 0>(beta, pointers, alpha, op, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
return TensorOpWithRegularLoop<ElemType, N, 0>(beta, pointers, alpha, op, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
default:
LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (C_int) dims);
}
@ -906,27 +967,27 @@ void TensorOpN(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, Elem
// explicit instantiations--these are being called from GPUMatrix.cu
//------------------------------------------------------------------------
template void TensorOpN<float, 2>(float beta, array<float*, 2> pointers, float alpha, ElementWiseOperator op,
template void TensorOpN<float, 2>(float beta, array<float*, 2> pointers, float alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
template void TensorOpN<float, 3>(float beta, array<float*, 3> pointers, float alpha, ElementWiseOperator op,
template void TensorOpN<float, 3>(float beta, array<float*, 3> pointers, float alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides);
template void TensorOpN<float, 4>(float beta, array<float*, 4> pointers, float alpha, ElementWiseOperator op,
template void TensorOpN<float, 4>(float beta, array<float*, 4> pointers, float alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 4>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides);
template void TensorOpN<double, 2>(double beta, array<double*, 2> pointers, double alpha, ElementWiseOperator op,
template void TensorOpN<double, 2>(double beta, array<double*, 2> pointers, double alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
template void TensorOpN<double, 3>(double beta, array<double*, 3> pointers, double alpha, ElementWiseOperator op,
template void TensorOpN<double, 3>(double beta, array<double*, 3> pointers, double alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides);
template void TensorOpN<double, 4>(double beta, array<double*, 4> pointers, double alpha, ElementWiseOperator op,
template void TensorOpN<double, 4>(double beta, array<double*, 4> pointers, double alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 4>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides);


@ -18,11 +18,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
#define C_unsigned_int CUDA_LONG
template <class ElemType, C_size_t N>
void TensorOpN(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op,
void TensorOpN(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, N>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides);
template <class ElemType>
void LaunchUnaryTensorOp(ElemType beta, const ElemType* pa, ElemType* pb, ElemType alpha, ElementWiseOperator op, size_t regularOpDim);
} } }
}}}


@ -175,6 +175,7 @@
<ClInclude Include="RNGHandle.h" />
<ClInclude Include="TensorOps.h" />
<ClInclude Include="TensorView.h" />
<ClInclude Include="Quantizers.h" />
<None Include="GPUWatcher.cu" />
<None Include="GPUWatcher.h">
<FileType>CppHeader</FileType>


@ -123,6 +123,7 @@
<ClInclude Include="BlockMultiplierPlatform.h">
<Filter>CPU</Filter>
</ClInclude>
<ClInclude Include="Quantizers.h" />
</ItemGroup>
<ItemGroup>
<None Include="GPUMatrix.h">


@ -1139,7 +1139,12 @@ template <>
/*static*/ char Matrix<char>::MakeNan(size_t)
{
return 0;
} // (needed for completeness)
} // (needed for completeness and to pass unit tests)
template <>
/*static*/ short Matrix<short>::MakeNan(size_t)
{
return 0;
} // (needed for completeness and to pass unit tests)
template <class ElemType>
void Matrix<ElemType>::MaskColumnsValue(const Matrix<char>& columnsMask, ElemType val)
@ -4289,7 +4294,8 @@ void Matrix<ElemType>::BatchNormalizationForward(const Matrix<ElemType>& scale,
}
template <class ElemType>
void Matrix<ElemType>::BatchNormalizationBackward(const Matrix<ElemType>& in, Matrix<ElemType>& grad, const Matrix<ElemType>& scale, const Matrix<ElemType>& saveMean, const Matrix<ElemType>& saveInvStdDev,
void Matrix<ElemType>::BatchNormalizationBackward(const Matrix<ElemType>& in, Matrix<ElemType>& grad, const Matrix<ElemType>& scale, double blendFactor,
const Matrix<ElemType>& saveMean, const Matrix<ElemType>& saveInvStdDev,
Matrix<ElemType>& scaleGrad, Matrix<ElemType>& biasGrad) const
{
DecideAndMoveToRightDevice(*this, grad);
@ -4297,10 +4303,10 @@ void Matrix<ElemType>::BatchNormalizationBackward(const Matrix<ElemType>& in, Ma
// REVIEW alexeyk: add sparse version.
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->BatchNormalizationBackward(*(in.m_CPUMatrix), *(grad.m_CPUMatrix), *(scale.m_CPUMatrix),
m_CPUMatrix->BatchNormalizationBackward(*(in.m_CPUMatrix), *(grad.m_CPUMatrix), *(scale.m_CPUMatrix), blendFactor,
*(saveMean.m_CPUMatrix), *(saveInvStdDev.m_CPUMatrix),
*(scaleGrad.m_CPUMatrix), *(biasGrad.m_CPUMatrix)),
m_GPUMatrix->BatchNormalizationBackward(*(in.m_GPUMatrix), *(grad.m_GPUMatrix), *(scale.m_GPUMatrix),
m_GPUMatrix->BatchNormalizationBackward(*(in.m_GPUMatrix), *(grad.m_GPUMatrix), *(scale.m_GPUMatrix), blendFactor,
*(saveMean.m_GPUMatrix), *(saveInvStdDev.m_GPUMatrix),
*(scaleGrad.m_GPUMatrix), *(biasGrad.m_GPUMatrix)),
NOT_IMPLEMENTED,
@ -5401,6 +5407,7 @@ void Matrix<ElemType>::TensorOp(ElemType beta, const Matrix<ElemType>& a, const
NOT_IMPLEMENTED);
}
//template class Matrix<short>;
template class Matrix<float>;
template class Matrix<double>;
@ -5430,6 +5437,31 @@ template void Matrix<char>::Resize(const size_t numRows, const size_t numCols, c
template void Matrix<char>::Reshape(const size_t, const size_t);
template char* Matrix<char>::CopyToArray(void) const;
// Matrix<short> methods
template Matrix<short>::Matrix(DEVICEID_TYPE);
template Matrix<short>::Matrix(Matrix<short>&&);
template Matrix<short>::Matrix(const size_t numRows, const size_t numCols, DEVICEID_TYPE deviceId, const MatrixType matrixType, const MatrixFormat matrixFormat);
template Matrix<short>::Matrix(const size_t numRows, const size_t numCols, short* pArray, DEVICEID_TYPE deviceId, const size_t matrixFlags, const size_t nnz);
template Matrix<short>::~Matrix();
template Matrix<short>& Matrix<short>::operator=(Matrix<short>&& moveFrom);
template short* Matrix<short>::Data() const;
template int Matrix<short>::GetDeviceId() const;
template size_t Matrix<short>::GetNumElements() const;
template Matrix<short> Matrix<short>::ColumnSlice(size_t startColumn, size_t numCols) const;
template void Matrix<short>::_transferToDevice(int id_to, bool isBeingMoved, bool emptyTransfer) const;
template void Matrix<short>::TransferToDeviceIfNotThere(int id_to, bool isBeingMoved, bool emptyTransfer, bool updatePreferredDevice) const;
template size_t Matrix<short>::GetNumRows() const;
template size_t Matrix<short>::GetNumCols() const;
template void Matrix<short>::SetValue(const short);
template void Matrix<short>::SetValue(size_t numRows, const size_t numCols, int deviceId, short* pArray, size_t matrixFlags);
//template void Matrix<short>::SetValue(const Matrix<short>&, MatrixFormat);
template void Matrix<short>::SetValue(const Matrix<short>&);
template void Matrix<short>::AssignValuesOf(const Matrix<short>&);
template bool Matrix<short>::IsEmpty() const;
template void Matrix<short>::Resize(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, bool growOnly);
template void Matrix<short>::Reshape(const size_t, const size_t);
template short* Matrix<short>::CopyToArray(void) const;
template Matrix<int>::Matrix(const size_t, const size_t, int*, DEVICEID_TYPE, const size_t, const size_t);
}}}


@ -503,7 +503,7 @@ public:
void BatchNormalizationForward(const Matrix<ElemType>& scale, const Matrix<ElemType>& bias, double expAvgFactor, double blendFactor,
Matrix<ElemType>& runMean, Matrix<ElemType>& runInvStdDev, Matrix<ElemType>& out, double epsilon,
Matrix<ElemType>& saveMean, Matrix<ElemType>& saveInvStdDev) const;
void BatchNormalizationBackward(const Matrix<ElemType>& in, Matrix<ElemType>& grad, const Matrix<ElemType>& scale, const Matrix<ElemType>& saveMean, const Matrix<ElemType>& saveInvStdDev,
void BatchNormalizationBackward(const Matrix<ElemType>& in, Matrix<ElemType>& grad, const Matrix<ElemType>& scale, double blendFactor, const Matrix<ElemType>& saveMean, const Matrix<ElemType>& saveInvStdDev,
Matrix<ElemType>& scaleGrad, Matrix<ElemType>& biasGrad) const;
public:


@ -708,6 +708,7 @@ void GPUSparseMatrix<ElemType>::ConvertBuffer(OutType* outBuffer, const InType*
#pragma endregion Helper Functions
template class MATH_API GPUSparseMatrix<short>;
template class MATH_API GPUSparseMatrix<char>;
template class MATH_API GPUSparseMatrix<float>;
template class MATH_API GPUSparseMatrix<double>;
@ -1832,7 +1833,7 @@ void GPUMatrix<ElemType>::BatchNormalizationForward(const GPUMatrix<ElemType>& s
}
template <class ElemType>
void GPUMatrix<ElemType>::BatchNormalizationBackward(const GPUMatrix<ElemType>& in, GPUMatrix<ElemType>& grad, const GPUMatrix<ElemType>& scale,
void GPUMatrix<ElemType>::BatchNormalizationBackward(const GPUMatrix<ElemType>& in, GPUMatrix<ElemType>& grad, const GPUMatrix<ElemType>& scale, double blendFactor,
const GPUMatrix<ElemType>& saveMean, const GPUMatrix<ElemType>& saveInvStdDev,
GPUMatrix<ElemType>& scaleGrad, GPUMatrix<ElemType>& biasGrad) const
{
@ -2216,6 +2217,7 @@ GPURNGHandle::GPURNGHandle(int deviceId, unsigned long seed)
#pragma endregion GPURNGHandle functions
template class GPUMatrix<short>;
template class GPUMatrix<char>;
template class GPUMatrix<float>;
template class GPUMatrix<double>;
@ -2276,6 +2278,9 @@ float CudaTimer::Elapsed()
return 0;
}
/*static*/ void SyncGuard::EnableSync()
{
}
} } }
// define a dummy GPUWatcher class too

Source/Math/Quantizers.h (new file, 106 lines)

@ -0,0 +1,106 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "Basics.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// RawType - input type to the quantizer. Currently CNTK supports float or double as RawType.
// QuantizedType - output type of the quantizer
template <class RawType, class QuantizedType>
class QuantizerBase
{
public:
QuantizerBase()
{
rangeMax = std::numeric_limits<QuantizedType>::max();
}
virtual void Quantize(const ArrayRef<RawType>& input, ArrayRef<QuantizedType>& output) = 0;
virtual void Dequantize(const ArrayRef<QuantizedType>& input, ArrayRef<RawType>& output) = 0;
protected:
QuantizedType rangeMax;
};
// Symmetric quantizer.
// Quantization is achieved by
// 1. Finding the absolute max of values to be quantized.
// 2. Adjusting the absolute max with the extraBits parameter.
// 3. Scaling all values in the collection to fit within the symmetric range of the QuantizedType.
template <class RawType, class QuantizedType>
class SymmetricQuantizer : public QuantizerBase<RawType, QuantizedType>
{
RawType m_quantizeFactor;
RawType m_inverseQuantizerFactor;
RawType m_absMax;
public:
// elements - collection to be quantized
// extraBits decreases the quantization normalizer to prevent integer overflow during BLAS routines.
// Higher extraBits will decrease precision of quantization, but will make BLAS routines less prone to overflow.
// For quantization with shorts, recommended value of extraBits is 1-3.
// This constructor accepts a collection of RawType values to initialize the internal quantizer,
// which can then be applied to collections with a range similar to the one it was initialized with.
SymmetricQuantizer(const ArrayRef<RawType>& input, size_t extraBits)
{
m_absMax = FindAbsMax(input);
Initialize(m_absMax, extraBits);
}
// absoluteMax - the range of the quantizer (normally represents maximum absolute value of the values in the collection to be quantized).
// extraBits - see comment in another ctor
SymmetricQuantizer(RawType absoluteMax, size_t extraBits)
{
Initialize(absoluteMax, extraBits);
}
// Perform quantization of the input collection, put result into pre-allocated output collection
virtual void Quantize(const ArrayRef<RawType>& input, ArrayRef<QuantizedType>& output)
{
assert(input.size() == output.size());
for (size_t i = 0; i < input.size(); i++)
{
#ifdef _DEBUG
assert(abs(input[i]) <= m_absMax);
#endif
output[i] = (QuantizedType) round((input[i] * m_quantizeFactor));
}
}
// Accept quantized collection as input, put de-quantization result into pre-allocated output collection.
virtual void Dequantize(const ArrayRef<QuantizedType>& input, ArrayRef<RawType>& output)
{
assert(input.size() == output.size());
for (size_t i = 0; i < input.size(); i++)
{
output[i] = (RawType)(input[i] * m_inverseQuantizerFactor);
}
}
private:
// Find absolute maximum value
RawType FindAbsMax(const ArrayRef<RawType>& arrayRef)
{
RawType maxElem = *std::max_element(arrayRef.begin(), arrayRef.end());
RawType minElem = *std::min_element(arrayRef.begin(), arrayRef.end());
return std::max(maxElem, std::abs(minElem));
}
void Initialize(RawType absoluteMax, size_t extraBits)
{
RawType shiftedMax = absoluteMax * (1 << extraBits);
if (shiftedMax == 0)
{
LogicError("The absolute max element in the sequence to be quantized is 0.");
}
m_absMax = absoluteMax;
m_quantizeFactor = rangeMax / shiftedMax;
m_inverseQuantizerFactor = 1 / m_quantizeFactor;
}
};
}}}
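A worked example of the scaling performed by SymmetricQuantizer, assuming QuantizedType = short (rangeMax = 32767), an absolute max of 2.0, and extraBits = 1: shiftedMax = 4.0, m_quantizeFactor = 32767 / 4 = 8191.75, so 1.5f maps to round(1.5 * 8191.75) = 12288 and dequantizes back to 12288 / 8191.75 ≈ 1.50005. Below is a usage sketch under the assumption that ArrayRef<T> can wrap a raw pointer and a length; check the actual ArrayRef API before relying on this.

#include <vector>
#include "Quantizers.h"

using namespace Microsoft::MSR::CNTK;

// Sketch only; assumes ArrayRef<T> is constructible from (T*, size_t).
void QuantizerUsageSketch()
{
    std::vector<float> raw = { -1.0f, 0.25f, 1.5f, 2.0f };
    std::vector<short> quantized(raw.size());
    std::vector<float> restored(raw.size());
    ArrayRef<float> rawRef(raw.data(), raw.size());
    ArrayRef<short> quantizedRef(quantized.data(), quantized.size());
    ArrayRef<float> restoredRef(restored.data(), restored.size());

    SymmetricQuantizer<float, short> quantizer(rawRef, /*extraBits=*/1); // absMax = 2.0, factor = 32767 / 4
    quantizer.Quantize(rawRef, quantizedRef);        // quantized == { -8192, 2048, 12288, 16384 }
    quantizer.Dequantize(quantizedRef, restoredRef); // restored  ≈ { -1.00003, 0.25001, 1.50005, 2.00006 }
}

With extraBits = 1 the largest magnitude maps to roughly half of the short range (16384 of 32767), leaving headroom so that accumulations in int16 BLAS routines are less likely to overflow, at the cost of one bit of precision.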


@ -261,6 +261,8 @@ DefTernaryOp(Cond, a ? b : c);
DefTernaryOp(CopyIfEqual, a == b ? c : 0); // CopyIfEqual(a,b)(c) -- if a==b copy c, otherwise 0; used for gradient of clip, min, max, etc.
DefTernaryOp(Clip, c < a ? a : (c > b ? b : c)); // Clip(min,max)(data) => a=min, b=max, c=data
DefTernaryOp(ElementwiseProductWithLogSumDerivative, a * Sigmoid(c - b));
DefTernaryOp(ElementwiseProductWithExpOfDiff, a * exp_(b - c));
#pragma pop_macro("DefTernaryOp")
}}}


@ -14,6 +14,10 @@
#pragma warning(push)
#pragma warning(disable : 4251) // needs to have dll-interface to be used by clients of... caused by TensorView::m_shape which is only private. We use the same compiler everywhere.
namespace Microsoft { namespace MSR { namespace CNTK { namespace Test {
template <class ElemType> struct TensorTest;
}}}}
// This class is exported from the Math.dll.
namespace Microsoft { namespace MSR { namespace CNTK {
@ -149,6 +153,7 @@ private:
const Matrix<ElemType>& GetSOB() const { return *m_sob; }
Matrix<ElemType>& GetSOB() { return *m_sob; }
friend Test::TensorTest<ElemType>;
// -------------------------------------------------------------------
// sob members


@ -46,7 +46,7 @@ CNTKTextFormatReader::CNTKTextFormatReader(MemoryProviderPtr provider,
{
// Verbosity is a general config parameter, not specific to the text format reader.
int verbosity = config(L"verbosity", 0);
m_randomizer = make_shared<BlockRandomizer>(verbosity, window, m_deserializer);
m_randomizer = make_shared<BlockRandomizer>(verbosity, window, m_deserializer, true);
}
else
{


@ -100,7 +100,7 @@ CompositeDataReader::CompositeDataReader(const ConfigParameters& config, MemoryP
size_t randomizationWindow = config(L"randomizationWindow", requestDataSize);
// By default using STL random number generator.
bool useLegacyRandomization = config(L"useLegacyRandomization", false);
m_sequenceEnumerator = std::make_shared<BlockRandomizer>(verbosity, randomizationWindow, deserializer, BlockRandomizer::DecimationMode::chunk, useLegacyRandomization, multiThreadedDeserialization);
m_sequenceEnumerator = std::make_shared<BlockRandomizer>(verbosity, randomizationWindow, deserializer, true /* should Prefetch */, BlockRandomizer::DecimationMode::chunk, useLegacyRandomization, multiThreadedDeserialization);
}
else
{
@ -251,7 +251,7 @@ void CompositeDataReader::StartEpoch(const EpochConfiguration& cfg)
if (config.m_totalEpochSizeInSamples <= 0)
{
RuntimeError("Unsupported minibatch size '%d'.", (int)config.m_totalEpochSizeInSamples);
RuntimeError("Unsupported epoch size '%d'.", (int)config.m_totalEpochSizeInSamples);
}
m_sequenceEnumerator->StartEpoch(config);

Some files were not shown because too many files changed in this diff.