This commit is contained in:
Frank Seide 2016-07-19 16:40:51 -07:00
Parents 078e30e6ca 234662c008
Commit 39a9175097
213 changed files: 100670 additions and 501140 deletions

View file

@ -27,6 +27,11 @@
<CudaVersion Condition="Exists('$(CUDA_PATH_V7_5)') And '$(CudaVersion)' == ''">7.5</CudaVersion>
<CudaVersion Condition="Exists('$(CUDA_PATH_V7_0)') And '$(CudaVersion)' == ''">7.0</CudaVersion>
<HasOpenCv>false</HasOpenCv>
<HasOpenCv Condition="Exists('$(OPENCV_PATH)') Or Exists('$(OPENCV_PATH_V31)')">true</HasOpenCv>
<UseZip>false</UseZip>
<UseZip Condition="Exists('$(ZLIB_PATH)')">true</UseZip>
</PropertyGroup>
<Choose>
@ -70,7 +75,33 @@
<UnitTestDlls>$(OutDir)mkl_cntk_s.dll;</UnitTestDlls>
</PropertyGroup>
</When>
</Choose>
</Choose>
<PropertyGroup Condition="$(UseZip)">
<ZipInclude>$(ZLIB_PATH)\include;$(ZLIB_PATH)\lib\libzip\include;</ZipInclude>
<ZipDefine>USE_ZIP</ZipDefine>
<ZipLibPath>$(ZLIB_PATH)\lib;</ZipLibPath>
<ZipLibs>zlib.lib;zip.lib;</ZipLibs>
</PropertyGroup>
<PropertyGroup Condition="Exists('$(OPENCV_PATH)')">
<OpenCvPath>$(OPENCV_PATH)</OpenCvPath>
<OpenCvVersion>300</OpenCvVersion>
</PropertyGroup>
<PropertyGroup Condition="Exists('$(OPENCV_PATH_V31)')">
<OpenCvPath>$(OPENCV_PATH_V31)</OpenCvPath>
<OpenCvVersion>310</OpenCvVersion>
</PropertyGroup>
<PropertyGroup Condition="$(HasOpenCv)">
<OpenCvInclude>$(OpenCvPath)\include;</OpenCvInclude>
<OpenCvWorld Condition="$(ReleaseBuild)">opencv_world$(OpenCvVersion)</OpenCvWorld>
<OpenCvWorld Condition="$(DebugBuild)">opencv_world$(OpenCvVersion)d</OpenCvWorld>
<OpenCvLib>$(OpenCvWorld).lib</OpenCvLib>
<OpenCvLibPath>$(OpenCvPath)\x64\vc12\lib</OpenCvLibPath>
<OpenCvBinPath>$(OpenCvPath)\x64\vc12\bin</OpenCvBinPath>
</PropertyGroup>
<PropertyGroup Condition="'$(CudaVersion)' == '7.5'">
<CudaPath>$(CUDA_PATH_V7_5)</CudaPath>

View file

@ -685,11 +685,6 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "EvalWrapper", "Source\Exten
{482999D1-B7E2-466E-9F8D-2119F93EAFD9} = {482999D1-B7E2-466E-9F8D-2119F93EAFD9}
EndProjectSection
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CSEvalClient", "Source\Extensibility\CSEvalClient\CSEvalClient.csproj", "{41E11A59-62B2-4927-A4F8-F40B1B612C6C}"
ProjectSection(ProjectDependencies) = postProject
{EF766CAE-9CB1-494C-9153-0030631A6340} = {EF766CAE-9CB1-494C-9153-0030631A6340}
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Examples", "Examples", "{BD46CE02-3740-4526-80F6-CC7973B953E5}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Image", "Image", "{FC7E7EC7-6E6A-4518-81C6-DA60451C657A}"
@ -1051,8 +1046,6 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "ParallelBM", "ParallelBM",
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "SequenceToSequence", "SequenceToSequence", "{A1521DC4-C8EC-47BD-9E63-7BE30ED2EC26}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CPPEvalClient", "Source\Extensibility\CPPEvalClient\CPPEvalClient.vcxproj", "{578D52A0-3928-4405-A016-F016E8B49031}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "HtkDeserializers", "HtkDeserializers", "{977ECCB7-598D-4548-B95B-BACA9CC7D98B}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "DNN", "DNN", "{1DBB2575-F5C8-43F4-B982-D05D6ADC2F9B}"
@ -1140,6 +1133,18 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Scripts", "Scripts", "{6826
ProjectSection(SolutionItems) = preProject
Scripts\pytest.ini = Scripts\pytest.ini
Scripts\txt2ctf.py = Scripts\txt2ctf.py
Scripts\uci2ctf.py = Scripts\uci2ctf.py
EndProjectSection
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "ManagedEvalTests", "Tests\UnitTests\ManagedEvalTests\ManagedEvalTests.csproj", "{CC8DDDCB-D53A-4B30-8596-AEF1C493DB31}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Evaluation", "Evaluation", "{3385EBEA-5F97-4B2B-9F30-0E6D7F91B9CA}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CSEvalClient", "Examples\Evaluation\CSEvalClient\CSEvalClient.csproj", "{1C6E6C53-1AA7-4B69-913E-B97BB5A872CF}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CPPEvalClient", "Examples\Evaluation\CPPEvalClient\CPPEvalClient.vcxproj", "{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E}"
ProjectSection(ProjectDependencies) = postProject
{482999D1-B7E2-466E-9F8D-2119F93EAFD9} = {482999D1-B7E2-466E-9F8D-2119F93EAFD9}
EndProjectSection
EndProject
Global
@ -1308,14 +1313,6 @@ Global
{EF766CAE-9CB1-494C-9153-0030631A6340}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
{EF766CAE-9CB1-494C-9153-0030631A6340}.Release|x64.ActiveCfg = Release|x64
{EF766CAE-9CB1-494C-9153-0030631A6340}.Release|x64.Build.0 = Release|x64
{41E11A59-62B2-4927-A4F8-F40B1B612C6C}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64
{41E11A59-62B2-4927-A4F8-F40B1B612C6C}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64
{41E11A59-62B2-4927-A4F8-F40B1B612C6C}.Debug|x64.ActiveCfg = Debug|x64
{41E11A59-62B2-4927-A4F8-F40B1B612C6C}.Debug|x64.Build.0 = Debug|x64
{41E11A59-62B2-4927-A4F8-F40B1B612C6C}.Release_CpuOnly|x64.ActiveCfg = Release_CpuOnly|x64
{41E11A59-62B2-4927-A4F8-F40B1B612C6C}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
{41E11A59-62B2-4927-A4F8-F40B1B612C6C}.Release|x64.ActiveCfg = Release|x64
{41E11A59-62B2-4927-A4F8-F40B1B612C6C}.Release|x64.Build.0 = Release|x64
{F0A9637C-20DA-42F0-83D4-23B4704DE602}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64
{F0A9637C-20DA-42F0-83D4-23B4704DE602}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64
{F0A9637C-20DA-42F0-83D4-23B4704DE602}.Debug|x64.ActiveCfg = Debug|x64
@ -1372,14 +1369,6 @@ Global
{7B7A563D-AA8E-4660-A805-D50235A02120}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
{7B7A563D-AA8E-4660-A805-D50235A02120}.Release|x64.ActiveCfg = Release|x64
{7B7A563D-AA8E-4660-A805-D50235A02120}.Release|x64.Build.0 = Release|x64
{578D52A0-3928-4405-A016-F016E8B49031}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64
{578D52A0-3928-4405-A016-F016E8B49031}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64
{578D52A0-3928-4405-A016-F016E8B49031}.Debug|x64.ActiveCfg = Debug|x64
{578D52A0-3928-4405-A016-F016E8B49031}.Debug|x64.Build.0 = Debug|x64
{578D52A0-3928-4405-A016-F016E8B49031}.Release_CpuOnly|x64.ActiveCfg = Release_CpuOnly|x64
{578D52A0-3928-4405-A016-F016E8B49031}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
{578D52A0-3928-4405-A016-F016E8B49031}.Release|x64.ActiveCfg = Release|x64
{578D52A0-3928-4405-A016-F016E8B49031}.Release|x64.Build.0 = Release|x64
{82125DA1-1CD7-45B5-9281-E6AE7C287CB7}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64
{82125DA1-1CD7-45B5-9281-E6AE7C287CB7}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64
{82125DA1-1CD7-45B5-9281-E6AE7C287CB7}.Debug|x64.ActiveCfg = Debug|x64
@ -1412,6 +1401,30 @@ Global
{F4CC3AB2-0DB2-4281-929A-2E68E30F0F6E}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
{F4CC3AB2-0DB2-4281-929A-2E68E30F0F6E}.Release|x64.ActiveCfg = Release|x64
{F4CC3AB2-0DB2-4281-929A-2E68E30F0F6E}.Release|x64.Build.0 = Release|x64
{CC8DDDCB-D53A-4B30-8596-AEF1C493DB31}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64
{CC8DDDCB-D53A-4B30-8596-AEF1C493DB31}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64
{CC8DDDCB-D53A-4B30-8596-AEF1C493DB31}.Debug|x64.ActiveCfg = Debug|x64
{CC8DDDCB-D53A-4B30-8596-AEF1C493DB31}.Debug|x64.Build.0 = Debug|x64
{CC8DDDCB-D53A-4B30-8596-AEF1C493DB31}.Release_CpuOnly|x64.ActiveCfg = Release_CpuOnly|x64
{CC8DDDCB-D53A-4B30-8596-AEF1C493DB31}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
{CC8DDDCB-D53A-4B30-8596-AEF1C493DB31}.Release|x64.ActiveCfg = Release|x64
{CC8DDDCB-D53A-4B30-8596-AEF1C493DB31}.Release|x64.Build.0 = Release|x64
{1C6E6C53-1AA7-4B69-913E-B97BB5A872CF}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64
{1C6E6C53-1AA7-4B69-913E-B97BB5A872CF}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64
{1C6E6C53-1AA7-4B69-913E-B97BB5A872CF}.Debug|x64.ActiveCfg = Debug|x64
{1C6E6C53-1AA7-4B69-913E-B97BB5A872CF}.Debug|x64.Build.0 = Debug|x64
{1C6E6C53-1AA7-4B69-913E-B97BB5A872CF}.Release_CpuOnly|x64.ActiveCfg = Release_CpuOnly|x64
{1C6E6C53-1AA7-4B69-913E-B97BB5A872CF}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
{1C6E6C53-1AA7-4B69-913E-B97BB5A872CF}.Release|x64.ActiveCfg = Release|x64
{1C6E6C53-1AA7-4B69-913E-B97BB5A872CF}.Release|x64.Build.0 = Release|x64
{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64
{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64
{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E}.Debug|x64.ActiveCfg = Debug|x64
{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E}.Debug|x64.Build.0 = Debug|x64
{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E}.Release_CpuOnly|x64.ActiveCfg = Release_CpuOnly|x64
{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E}.Release|x64.ActiveCfg = Release|x64
{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
@ -1502,7 +1515,6 @@ Global
{E6DC3B7D-303D-4A54-B040-D8DCF8C56E17} = {8C128B1D-87E0-4643-AB93-2581589AE425}
{06D2C644-AE5F-4C30-A1F6-C78E2845AAB1} = {EF710C5A-E616-442A-889D-C997D39AF2E1}
{EF766CAE-9CB1-494C-9153-0030631A6340} = {60F87E25-BC87-4782-8E20-1621AAEBB113}
{41E11A59-62B2-4927-A4F8-F40B1B612C6C} = {60F87E25-BC87-4782-8E20-1621AAEBB113}
{BD46CE02-3740-4526-80F6-CC7973B953E5} = {6E565B48-1923-49CE-9787-9BBB9D96F4C5}
{FC7E7EC7-6E6A-4518-81C6-DA60451C657A} = {BD46CE02-3740-4526-80F6-CC7973B953E5}
{CEADE942-4077-4577-ACF9-41C04388DDC0} = {BD46CE02-3740-4526-80F6-CC7973B953E5}
@ -1552,7 +1564,6 @@ Global
{4D6F731C-4A6D-4E21-AC3C-9E1F26E5547E} = {6994C86D-A672-4254-824A-51F4DFEB807F}
{36C42845-0D48-4A46-9C67-2B593A80A09C} = {6994C86D-A672-4254-824A-51F4DFEB807F}
{A1521DC4-C8EC-47BD-9E63-7BE30ED2EC26} = {47755F2E-D674-4175-9E38-8EA053455072}
{578D52A0-3928-4405-A016-F016E8B49031} = {60F87E25-BC87-4782-8E20-1621AAEBB113}
{977ECCB7-598D-4548-B95B-BACA9CC7D98B} = {C47CDAA5-6D6C-429E-BC89-7CA0F868FDC8}
{1DBB2575-F5C8-43F4-B982-D05D6ADC2F9B} = {977ECCB7-598D-4548-B95B-BACA9CC7D98B}
{772A0DB3-4710-4281-8AA9-A9F1F7C543D3} = {977ECCB7-598D-4548-B95B-BACA9CC7D98B}
@ -1568,5 +1579,9 @@ Global
{731312A8-6DA3-4841-AFCD-57520BA1BF8E} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE}
{E5606ECE-48CA-4464-BB12-09D81D02B9EF} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
{F4CC3AB2-0DB2-4281-929A-2E68E30F0F6E} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE}
{CC8DDDCB-D53A-4B30-8596-AEF1C493DB31} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE}
{3385EBEA-5F97-4B2B-9F30-0E6D7F91B9CA} = {47755F2E-D674-4175-9E38-8EA053455072}
{1C6E6C53-1AA7-4B69-913E-B97BB5A872CF} = {3385EBEA-5F97-4B2B-9F30-0E6D7F91B9CA}
{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E} = {3385EBEA-5F97-4B2B-9F30-0E6D7F91B9CA}
EndGlobalSection
EndGlobal

View file

@ -863,38 +863,27 @@ The dimension reduced matrix consisting of the maximum value within each pooling
This function is often associated with Convolution() operations.
### Delay
### PastValue, FutureValue
The Delay node is used in recurrent networks; it allows creating a loop in the computational network that repeats a specified number of times.
PastValue and FutureValue nodes are used in recurrent networks; they allow creating a loop in the computational network that repeats a specified number of times. PastValue retrieves the value of a node several time steps in the past, while FutureValue retrieves the value of a node from the future.
`Delay(rows, [cols], delayNode, delayTime=1, needGradient=true, defaultHiddenActivity=0.1)`
`PastValue(rows, [cols], node, timeStep=1, defaultHiddenActivity=0.1)`
`FutureValue(rows, [cols], node, timeStep=1, defaultHiddenActivity=0.1)`
#### Parameters
`cvweight` – convolution weight matrix, it has the dimensions of \[outputChannels, kernelWidth \* kernelHeight \* inputChannels\]
`rows` – number of rows in the node
`kernelWidth` – width of the kernel
`cols` – number of columns in the node. This value is often omitted since the length of a sequence varies
`kernelHeight` – height of the kernel
`timeStep` – \[default = 1\] number of time steps toward the past and future
`outputChannels` – number of output channels
`horizontalSubsample` – subsamples in the horizontal direction
`verticalSubsample` – subsamples in the vertical direction
#### Optional Parameters
`delayTime` – \[default = 1\] the amount of delay that will be introduced (number of times the loop will happen)
`needGradient` – \[default = true\] does the gradient need to be computed for this node
`defaultHiddenActivity` – \[default = 0.1\] the numerical amount for the defaultHiddenActivity
`defaultHiddenActivity` – \[default = 0.1\] default value to use when crossing the sequence boundary or when the value is missing.
#### Returns
The results of the completed Delay loop
Either the past or future value of a node
#### Notes
This node is used in recurrent networks, where a delay is introduced to examine values from a previous time, such as the prior value (t-1). This has the effect of creating a loop in the computational network that will repeat delayTime number of iterations.
This node is used in recurrent networks, where a past value is introduced to examine values from a previous time, such as the prior value (t-1). This has the effect of creating a loop in the computational network.
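#### Example
A minimal recurrence sketch, for illustration only; `hiddenDim`, `features`, and the parameters `W`, `R`, and `b` are assumed to be defined elsewhere in the network description:

`prevH = PastValue(hiddenDim, h, timeStep=1)`

`h = Sigmoid(Plus(Plus(Times(W, features), Times(R, prevH)), b))`

Here `h` is defined in terms of its own PastValue, which is what creates the recurrent loop; FutureValue is used the same way to reference values from upcoming time steps.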

View file

@ -4,9 +4,10 @@
//
// CPPEvalClient.cpp : Sample application using the evaluation interface from C++
//
#include "stdafx.h"
#include "eval.h"
#include "Eval.h"
#ifdef _WIN32
#include "Windows.h"
#endif
using namespace Microsoft::MSR::CNTK;
@ -23,41 +24,38 @@ typedef std::map<std::wstring, std::vector<float>*> Layer;
/// <description>
/// This program is a native C++ client using the native evaluation interface
/// located in the <see cref="eval.h"/> file.
The CNTK evaluation DLL (EvalDLL.dll) must be found through the system's path.
The CNTK evaluation library (EvalDLL.dll on Windows, LibEval.so on Linux) must be found through the system's path.
/// The other requirement is that Eval.h be included
/// In order to run this program the model must already exist in the example. To create the model,
/// first run the example in <CNTK>/Examples/Image/MNIST. Once the model file 01_OneHidden is created,
/// you can run this client.
/// This program demonstrates the usage of the Evaluate method requiring the input and output layers as parameters.
int _tmain(int argc, _TCHAR* argv[])
int main(int argc, char* argv[])
{
// Get the binary path (current working directory)
argc = 0;
std::wstring wapp(argv[0]);
std::string app(wapp.begin(), wapp.end());
std::string path = app.substr(0, app.rfind("\\"));
// Load the eval library
auto hModule = LoadLibrary(L"evaldll.dll");
if (hModule == nullptr)
{
const std::wstring msg(L"Cannot find evaldll.dll library");
const std::string ex(msg.begin(), msg.end());
throw new std::exception(ex.c_str());
}
// Get the factory method to the evaluation engine
std::string func = "GetEvalF";
auto procAddress = GetProcAddress(hModule, func.c_str());
auto getEvalProc = (GetEvalProc<float>)procAddress;
// Native model evaluation instance
argc = 0;
std::string app = argv[0];
std::string path;
IEvaluateModel<float> *model;
getEvalProc(&model);
size_t pos;
// This relative path assumes launching from CNTK's binary folder
const std::string modelWorkingDirectory = path + "\\..\\..\\Examples\\Image\\MNIST\\Data\\";
const std::string modelFilePath = modelWorkingDirectory + "..\\Output\\Models\\01_OneHidden";
#ifdef _WIN32
pos = app.rfind("\\");
path = (pos == std::string::npos) ? "." : app.substr(0, pos);
// This relative path assumes launching from CNTK's binary folder, e.g. x64\Release
const std::string modelWorkingDirectory = path + "/../../Examples/Image/MNIST/Data/";
#else // on Linux
pos = app.rfind("/");
path = (pos == std::string::npos) ? "." : app.substr(0, pos);
// This relative path assumes launching from CNTK's binary folder, e.g. build/release/bin/
const std::string modelWorkingDirectory = path + "/../../../Examples/Image/MNIST/Data/";
#endif
GetEvalF(&model);
const std::string modelFilePath = modelWorkingDirectory + "../Output/Models/01_OneHidden";
// Load model with desired outputs
std::string networkConfiguration;
@ -97,7 +95,7 @@ int _tmain(int argc, _TCHAR* argv[])
// Output the results
fprintf(stderr, "Layer '%ls' output:\n", outputLayerName.c_str());
for each (auto& value in outputs)
for (auto& value : outputs)
{
fprintf(stderr, "%f\n", value);
}

View file

@ -19,7 +19,7 @@
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{578D52A0-3928-4405-A016-F016E8B49031}</ProjectGuid>
<ProjectGuid>{CCC07E8E-F33A-4AF7-9F60-93E2AA61C75E}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>CPPEvalClient</RootNamespace>
</PropertyGroup>
@ -69,7 +69,7 @@
<AdditionalLibraryDirectories>$(OutDir)</AdditionalLibraryDirectories>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>EvalDLL.lib;%(AdditionalDependencies)</AdditionalDependencies>
<DelayLoadDLLs>%(DelayLoadDLLs)</DelayLoadDLLs>
<Profile>true</Profile>
</Link>
@ -104,15 +104,8 @@
<LinkLibraryDependencies>true</LinkLibraryDependencies>
</ProjectReference>
</ItemDefinitionGroup>
<ItemGroup>
<ClInclude Include="stdafx.h" />
<ClInclude Include="targetver.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="CPPEvalClient.cpp" />
<ClCompile Include="stdafx.cpp">
<PrecompiledHeader>Create</PrecompiledHeader>
</ClCompile>
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" />

View file

@ -15,17 +15,6 @@
</Filter>
</ItemGroup>
<ItemGroup>
<ClInclude Include="stdafx.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="targetver.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="stdafx.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="CPPEvalClient.cpp">
<Filter>Source Files</Filter>
</ClCompile>

View file

@ -0,0 +1,84 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="12.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
<ProjectGuid>{1C6E6C53-1AA7-4B69-913E-B97BB5A872CF}</ProjectGuid>
<OutputType>Exe</OutputType>
<AppDesignerFolder>Properties</AppDesignerFolder>
<RootNamespace>Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient</RootNamespace>
<AssemblyName>CSEvalClient</AssemblyName>
<TargetFrameworkVersion>v4.5</TargetFrameworkVersion>
<FileAlignment>512</FileAlignment>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Debug|x64'">
<DebugSymbols>true</DebugSymbols>
<OutputPath>..\..\..\x64\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<DebugType>full</DebugType>
<PlatformTarget>x64</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
<Prefer32Bit>true</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Debug_CpuOnly|x64'">
<DebugSymbols>true</DebugSymbols>
<OutputPath>..\..\..\x64\Debug_CpuOnly\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<DebugType>full</DebugType>
<PlatformTarget>x64</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
<Prefer32Bit>true</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Release|x64'">
<OutputPath>..\..\..\x64\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<Optimize>true</Optimize>
<DebugType>pdbonly</DebugType>
<PlatformTarget>x64</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
<Prefer32Bit>true</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Release_CpuOnly|x64'">
<OutputPath>..\..\..\x64\Release_CpuOnly\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<Optimize>true</Optimize>
<DebugType>pdbonly</DebugType>
<PlatformTarget>x64</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
<Prefer32Bit>true</Prefer32Bit>
</PropertyGroup>
<ItemGroup>
<Reference Include="System" />
<Reference Include="System.Core" />
<Reference Include="Microsoft.CSharp" />
<Reference Include="System.Drawing" />
</ItemGroup>
<ItemGroup>
<Compile Include="CntkBitmapExtensions.cs" />
<Compile Include="ModelEvaluator.cs" />
<Compile Include="Program.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
</ItemGroup>
<ItemGroup>
<None Include="App.config" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\..\..\Source\Extensibility\EvalWrapper\EvalWrapper.vcxproj">
<Project>{ef766cae-9cb1-494c-9153-0030631a6340}</Project>
<Name>EvalWrapper</Name>
</ProjectReference>
</ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
Other similar extension points exist, see Microsoft.Common.targets.
<Target Name="BeforeBuild">
</Target>
<Target Name="AfterBuild">
</Target>
-->
</Project>

View file

@ -0,0 +1,212 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// CntkBitmapExtensions.cs -- extension methods for transforming images used in CNTK.
//
using System;
using System.Collections.Generic;
using System.Drawing;
using System.Drawing.Imaging;
using System.Linq;
using System.Runtime.InteropServices;
using System.Threading.Tasks;
namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
{
public static class CntkBitmapExtensions
{
/// <summary>
/// Resizes an image
/// </summary>
/// <param name="image">The image to resize</param>
/// <param name="width">New width in pixels</param>
/// <param name="height">New height in pixesl</param>
/// <param name="useHighQuality">Resize quality</param>
/// <returns>The resized image</returns>
public static Bitmap Resize(this Bitmap image, int width, int height, bool useHighQuality)
{
var newImg = new Bitmap(width, height);
newImg.SetResolution(image.HorizontalResolution, image.VerticalResolution);
using (var g = Graphics.FromImage(newImg))
{
g.CompositingMode = System.Drawing.Drawing2D.CompositingMode.SourceCopy;
if (useHighQuality)
{
g.InterpolationMode = System.Drawing.Drawing2D.InterpolationMode.HighQualityBicubic;
g.CompositingQuality = System.Drawing.Drawing2D.CompositingQuality.HighQuality;
g.SmoothingMode = System.Drawing.Drawing2D.SmoothingMode.HighQuality;
g.PixelOffsetMode = System.Drawing.Drawing2D.PixelOffsetMode.HighQuality;
}
else
{
g.InterpolationMode = System.Drawing.Drawing2D.InterpolationMode.Default;
g.CompositingQuality = System.Drawing.Drawing2D.CompositingQuality.Default;
g.SmoothingMode = System.Drawing.Drawing2D.SmoothingMode.Default;
g.PixelOffsetMode = System.Drawing.Drawing2D.PixelOffsetMode.Default;
}
var attributes = new ImageAttributes();
attributes.SetWrapMode(System.Drawing.Drawing2D.WrapMode.TileFlipXY);
g.DrawImage(image, new Rectangle(0, 0, width, height), 0, 0, image.Width, image.Height, GraphicsUnit.Pixel, attributes);
}
return newImg;
}
/// <summary>
/// Extracts image pixels in CHW
/// </summary>
/// <param name="image">The bitmap image to extract features from</param>
/// <returns>A list of pixels in CHW order</returns>
public static List<float> ExtractCHW(this Bitmap image)
{
var features = new List<float>(image.Width * image.Height * 3);
for (int c = 0; c < 3; c++)
{
for (int h = 0; h < image.Height; h++)
{
for (int w = 0; w < image.Width; w++)
{
var pixel = image.GetPixel(w, h);
float v = c == 0 ? pixel.B : c == 1 ? pixel.G : pixel.R;
features.Add(v);
}
}
}
return features;
}
/// <summary>
/// Extracts image pixels in CHW using parallelization
/// </summary>
/// <param name="image">The bitmap image to extract features from</param>
/// <returns>A list of pixels in CHW order</returns>
public static List<float> ParallelExtractCHW(this Bitmap image)
{
// We use local variables to avoid contention on the image object through the multiple threads.
int channelStride = image.Width * image.Height;
int imageWidth = image.Width;
int imageHeight = image.Height;
var features = new byte[imageWidth * imageHeight * 3];
var bitmapData = image.LockBits(new Rectangle(0, 0, imageWidth, imageHeight), ImageLockMode.ReadOnly, image.PixelFormat);
IntPtr ptr = bitmapData.Scan0;
int bytes = Math.Abs(bitmapData.Stride) * bitmapData.Height;
byte[] rgbValues = new byte[bytes];
int stride = bitmapData.Stride;
// Copy the RGB values into the array.
System.Runtime.InteropServices.Marshal.Copy(ptr, rgbValues, 0, bytes);
// The mapping depends on the pixel format
// The mapPixel lambda will return the right color channel for the desired pixel
Func<int, int, int, int> mapPixel = GetPixelMapper(image.PixelFormat, stride);
Parallel.For(0, imageHeight, (int h) =>
{
Parallel.For(0, imageWidth, (int w) =>
{
Parallel.For(0, 3, (int c) =>
{
features[channelStride * c + imageWidth * h + w] = rgbValues[mapPixel(h, w, c)];
});
});
});
image.UnlockBits(bitmapData);
return features.Select(b => (float)b).ToList();
}
/// <summary>
/// Extracts image pixels in HWC
/// </summary>
/// <param name="image">The bitmap image to extract features from</param>
/// <returns>A list of pixels in HWC order</returns>
public static List<float> ExtractHWC(this Bitmap image)
{
var features = new List<float>(image.Width * image.Height * 3);
for (int w = 0; w < image.Width; w++)
{
for (int h = 0; h < image.Height; h++)
{
for (int c = 0; c < 3; c++)
{
var pixel = image.GetPixel(w, h);
float v = c == 0 ? pixel.B : c == 1 ? pixel.G : pixel.R;
features.Add(v);
}
}
}
return features;
}
/// <summary>
/// Extracts image pixels in HWC using multiple threads
/// </summary>
/// <param name="image">The bitmap image to extract features from</param>
/// <returns>A list of pixels in HWC order</returns>
public static List<float> ParallelExtractHWC(this Bitmap image)
{
int heightStride = image.Width * 3;
int widthStride = image.Height * 3;
int imageWidth = image.Width;
int imageHeight = image.Height;
var features = new byte[image.Width * image.Height * 3];
var bitmapData = image.LockBits(new Rectangle(0, 0, image.Width, image.Height), ImageLockMode.ReadOnly, image.PixelFormat);
IntPtr ptr = bitmapData.Scan0;
int bytes = Math.Abs(bitmapData.Stride) * bitmapData.Height;
byte[] rgbValues = new byte[bytes];
int stride = bitmapData.Stride;
// Copy the RGB values into the array.
System.Runtime.InteropServices.Marshal.Copy(ptr, rgbValues, 0, bytes);
// The mapping depends on the pixel format
// The mapPixel lambda will return the right color channel for the desired pixel
Func<int, int, int, int> mapPixel = GetPixelMapper(image.PixelFormat, stride);
Parallel.For(0, 3, (int c) =>
{
Parallel.For(0, imageHeight, (int h) =>
{
Parallel.For(0, imageWidth, (int w) =>
{
features[w * widthStride + h * 3 + c] = rgbValues[mapPixel(h, w, c)];
});
});
});
image.UnlockBits(bitmapData);
return features.Select(b => (float)b).ToList();
}
/// <summary>
/// Returns a function for extracting the R-G-B values properly from an image based on its pixel format
/// </summary>
/// <param name="pixelFormat">The image's pixel format</param>
/// <param name="heightStride">The stride (row byte count)</param>
/// <returns>A function with signature (height, width, channel) returning the corresponding color value</returns>
private static Func<int, int, int, int> GetPixelMapper(PixelFormat pixelFormat, int heightStride)
{
switch (pixelFormat)
{
case PixelFormat.Format32bppArgb:
return (h, w, c) => h * heightStride + w * 4 + c; // bytes are B-G-R-A
case PixelFormat.Format24bppRgb:
default:
return (h, w, c) => h * heightStride + w * 3 + c; // bytes are B-G-R
}
}
}
}

View file

@ -0,0 +1,214 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// ModelEvaluator.cs -- wrapper for a network so it can be evaluated one call at a time.
//
// THIS CODE IS FOR ILLUSTRATION PURPOSES ONLY. NOT FOR PRODUCTION.
//
using System;
using System.Collections.Concurrent;
using System.Collections.Generic;
using System.Linq;
using System.Threading;
namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
{
/// <summary>
/// This class provides an Eval model wrapper to restrict model evaluation calls to one at a time.
/// </summary>
/// <remarks>
/// This class is not thread-safe except through the static methods.
/// Each ModelEvaluator instance wraps an Eval model, and exposes the Evaluate method for either
/// a vector of inputs or a record string.
/// The static interface provides the management of the concurrency of the models and restricts
/// the evaluations to a single thread.
/// </remarks>
public sealed class ModelEvaluator
{
/// <summary>
/// The cntk model evaluation instance
/// </summary>
private readonly IEvaluateModelManagedF m_model;
/// <summary>
/// The input layer key
/// </summary>
private readonly string m_inKey;
/// <summary>
/// The output layer key
/// </summary>
private readonly string m_outKey;
/// <summary>
/// The model instance number
/// </summary>
private readonly int m_modelInstance;
/// <summary>
/// The input buffer
/// </summary>
private Dictionary<string, List<float>> m_inputs;
/// <summary>
/// Indicates if the object is disposed
/// </summary>
private static bool Disposed
{
get;
set;
}
/// <summary>
/// The ModelEvaluator's models to manage
/// </summary>
private static readonly BlockingCollection<ModelEvaluator> Models = new BlockingCollection<ModelEvaluator>();
/// <summary>
/// Initializes the Model Evaluator to process multiple models concurrently
/// </summary>
/// <param name="numConcurrentModels">The number of concurrent models</param>
/// <param name="modelFilePath">The model file path to load the model from</param>
/// <param name="numThreads"></param>
public static void Initialize(int numConcurrentModels, string modelFilePath, int numThreads = 1)
{
if (Disposed)
{
throw new CNTKRuntimeException("Model Evaluator has been disposed", string.Empty);
}
for (int i = 0; i < numConcurrentModels; i++)
{
Models.Add(new ModelEvaluator(modelFilePath, numThreads, i));
}
Disposed = false;
}
/// <summary>
/// Disposes of all models
/// </summary>
public static void DisposeAll()
{
Disposed = true;
foreach (var model in Models)
{
model.Dispose();
}
Models.Dispose();
}
/// <summary>
/// Evaluates a record containing the input data and the expected outcome value
/// </summary>
/// <param name="record">A tab-delimited string with the first entry being the expected value.</param>
/// <returns>true if the outcome is as expected, false otherwise</returns>
public static bool Evaluate(string record)
{
var model = Models.Take();
try
{
var outcome = model.EvaluateRecord(record);
return outcome;
}
finally
{
Models.Add(model);
}
}
/// <summary>
/// Evaluates a vector and returns the output vector
/// </summary>
/// <param name="inputs">The input vector</param>
/// <returns>The output vector</returns>
public static List<float> Evaluate(List<float> inputs)
{
var model = Models.Take();
try
{
var outcome = model.EvaluateInput(inputs);
return outcome;
}
finally
{
Models.Add(model);
}
}
/// <summary>
/// Creates an instance of the <see cref="ModelEvaluator"/> class.
/// </summary>
/// <param name="modelFilePath">The model file path</param>
/// <param name="numThreads">The number of concurrent threads for the model</param>
/// <param name="id">A unique id for the model</param>
/// <remarks>The id is used only for debugging purposes</remarks>
private ModelEvaluator(string modelFilePath, int numThreads, int id)
{
m_modelInstance = id;
m_model = new IEvaluateModelManagedF();
// Configure the model to run with a specific number of threads
m_model.Init(string.Format("numCPUThreads={0}", numThreads));
// Load model
m_model.CreateNetwork(string.Format("modelPath=\"{0}\"", modelFilePath), deviceId: -1);
// Generate random input values in the appropriate structure and size
var inDims = m_model.GetNodeDimensions(NodeGroup.Input);
m_inKey = inDims.First().Key;
m_inputs = new Dictionary<string, List<float>>() { { m_inKey, null } };
// We request the output layer name(s) and dimensions; we'll use the first one.
var outDims = m_model.GetNodeDimensions(NodeGroup.Output);
m_outKey = outDims.First().Key;
}
/// <summary>
/// Evaluates a test record
/// </summary>
/// <param name="record">A tab-delimited string containing as the first entry the expected outcome, values after that are the input data</param>
/// <returns>true if the record's expected outcome value matches the computed value</returns>
private bool EvaluateRecord(string record)
{
// The first value in the line is the expected label index for the record's outcome
int expected = int.Parse(record.Substring(0, record.IndexOf('\t')));
m_inputs[m_inKey] =
record.Substring(record.IndexOf('\t') + 1).Split('\t').Select(float.Parse).ToList();
// We can call the evaluate method and get back the results (single layer)...
var outputs = m_model.Evaluate(m_inputs, m_outKey);
// Retrieve the outcome index (so we can compare it with the expected index)
int index = 0;
var max = outputs.Select(v => new { Value = v, Index = index++ })
.Aggregate((a, b) => (a.Value > b.Value) ? a : b)
.Index;
return (expected == max);
}
/// <summary>
/// Evaluates an input vector against the model as the first defined input layer, and returns the first defined output layer
/// </summary>
/// <param name="inputs">Input vector</param>
/// <returns>The output vector</returns>
private List<float> EvaluateInput(List<float> inputs)
{
return m_model.Evaluate(new Dictionary<string, List<float>>() { { m_inKey, inputs } }, m_outKey);
}
/// <summary>
/// Disposes of the resources
/// </summary>
private void Dispose()
{
m_model.Dispose();
}
}
}

View file

@ -7,9 +7,14 @@
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Linq.Expressions;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.MSR.CNTK.Extensibility.Managed;
namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
{
@ -20,6 +25,9 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
/// This program is a managed client using the CLIWrapper to run the model evaluator in CNTK.
/// There are four cases shown in this program related to model loading, network creation and evaluation.
///
/// To run this program from the CNTK binary drop, you must add the NuGet package for model evaluation first.
/// Refer to <see cref="https://github.com/Microsoft/CNTK/wiki/NuGet-Package"/> for information regarding the NuGet package for model evaluation.
///
/// EvaluateModelSingleLayer and EvaluateModelMultipleLayers
/// --------------------------------------------------------
/// These two cases require the 01_OneHidden model which is part of the <CNTK>/Examples/Image/MNIST example.
@ -30,6 +38,19 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
/// ----------------------------------------------------------------
/// These two cases do not require a trained model (just the network description). These cases show how to extract values from a single forward-pass
/// without any input to the model.
///
/// EvaluateMultipleModels
/// ----------------------
/// This case requires the 02_Convolution model and the Test-28x28_cntk_text.txt test file which are part of the <CNTK>/Examples/Image/MNIST example.
/// Refer to <see cref="https://github.com/Microsoft/CNTK/blob/master/Examples/Image/MNIST/README.md"/> for how to train
/// the model used in this example.
///
/// EvaluateImageClassificationModel
/// -----------------------
/// This case requires the ResNet_18 trained model which can be downloaded from <see cref="https://www.cntk.ai/resnet/ResNet_18.model"/>.
/// This case shows how to evaluate a model that was trained with the ImageReader.
/// The input for evaluation needs to be transformed in a manner similar to what the ImageReader did during training.
///
/// </description>
class Program
{
@ -42,7 +63,7 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
private static void Main(string[] args)
{
initialDirectory = Environment.CurrentDirectory;
Console.WriteLine("====== EvaluateModelSingleLayer ========");
EvaluateModelSingleLayer();
@ -55,6 +76,15 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
Console.WriteLine("\n====== EvaluateNetworkSingleLayerNoInput ========");
EvaluateNetworkSingleLayerNoInput();
Console.WriteLine("\n====== EvaluateExtendedNetworkSingleLayerNoInput ========");
EvaluateExtendedNetworkSingleLayerNoInput();
Console.WriteLine("\n====== EvaluateMultipleModels ========");
EvaluateMultipleModels();
Console.WriteLine("\n====== EvaluateModelImageInput ========");
EvaluateImageClassificationModel();
Console.WriteLine("Press <Enter> to terminate.");
Console.ReadLine();
}
@ -83,11 +113,11 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
model.CreateNetwork(string.Format("modelPath=\"{0}\"", modelFilePath), deviceId: -1);
// Generate random input values in the appropriate structure and size
var inDims = model.GetNodeDimensions(NodeGroup.nodeInput);
var inDims = model.GetNodeDimensions(NodeGroup.Input);
var inputs = GetDictionary(inDims.First().Key, inDims.First().Value, 255);
// We request the output layer name(s) and dimensions; we'll use the first one.
var outDims = model.GetNodeDimensions(NodeGroup.nodeOutput);
var outDims = model.GetNodeDimensions(NodeGroup.Output);
outputLayerName = outDims.First().Key;
// We can call the evaluate method and get back the results (single layer)...
outputs = model.Evaluate(inputs, outputLayerName);
@ -124,20 +154,20 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
using (var model = new IEvaluateModelManagedF())
{
// Desired output layers
string hiddenLayerName = "h1.z";
string outputLayerName = "ol.z";
const string hiddenLayerName = "h1.z";
const string outputLayerName = "ol.z";
// Load model
string modelFilePath = Path.Combine(Environment.CurrentDirectory, @"..\Output\Models\01_OneHidden");
List<string> desiredOutputLayers = new List<string>() { hiddenLayerName, outputLayerName };
var desiredOutputLayers = new List<string>() { hiddenLayerName, outputLayerName };
model.CreateNetwork(string.Format("modelPath=\"{0}\"", modelFilePath), deviceId: -1, outputNodeNames: desiredOutputLayers);
// Generate random input values in the appropriate structure and size
var inDims = model.GetNodeDimensions(NodeGroup.nodeInput);
var inDims = model.GetNodeDimensions(NodeGroup.Input);
var inputs = GetDictionary(inDims.First().Key, inDims.First().Value, 255);
// We request the output layer name(s) and dimensions; we'll get both the hidden layer and the output layer
var outDims = model.GetNodeDimensions(NodeGroup.nodeOutput);
var outDims = model.GetNodeDimensions(NodeGroup.Output);
// We can preallocate the output structure and pass it in (multiple output layers)
outputs = new Dictionary<string, List<float>>()
@ -187,7 +217,7 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
var inputs = new Dictionary<string, List<float>>() { { "features", new List<float>() { 1.0f } } };
// We can call the evaluate method and get back the results (single layer output)...
var outDims = model.GetNodeDimensions(NodeGroup.nodeOutput);
var outDims = model.GetNodeDimensions(NodeGroup.Output);
outputLayerName = outDims.First().Key;
outputs = model.Evaluate(inputs, outputLayerName);
}
@ -242,6 +272,209 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
}
}
/// <summary>
/// Evaluates an extended network (without a model and without input) and obtains a single layer output
/// </summary>
private static void EvaluateExtendedNetworkSingleLayerNoInput()
{
const string modelDefinition = @"precision = ""float""
traceLevel = 1
run=NDLNetworkBuilder
NDLNetworkBuilder=[
v1 = Constant(1)
v2 = Constant(2, tag=""output"")
ol = Plus(v1, v2, tag=""output"")
FeatureNodes = (v1)
]";
try
{
// The examples assume the executable is running from the data folder
// We switch the current directory to the data folder (assuming the executable is in the <CNTK>/x64/Debug|Release folder)
string workingDirectory = Path.Combine(initialDirectory, @"..\..\Examples\Other\Simple2d\Config");
Environment.CurrentDirectory = initialDirectory;
using (var model = new ModelEvaluationExtendedF())
{
// Create the network
// This network (AddOperatorConstantNoInput.cntk) is a simple network consisting of a single binary operator (Plus)
// operating over two constants; therefore no input is necessary.
model.CreateNetwork(modelDefinition);
VariableSchema outputSchema = model.GetOutputSchema();
var outputNodeNames = outputSchema.Select(s => s.Name).ToList<string>();
model.StartForwardEvaluation(outputNodeNames);
var outputBuffer = outputSchema.CreateBuffers<float>();
var inputBuffer = new ValueBuffer<float>[0];
// We can call the evaluate method and get back the results...
model.ForwardPass(inputBuffer, outputBuffer);
// We expect two outputs: the v2 constant, and the ol Plus result
var expected = new float[][] { new float[] { 2 }, new float[] { 3 } };
Console.WriteLine("Expected values: {0}", string.Join(" - ", expected.Select(b => string.Join(", ", b)).ToList<string>()));
Console.WriteLine("Actual Values : {0}", string.Join(" - ", outputBuffer.Select(b => string.Join(", ", b.Buffer)).ToList<string>()));
}
}
catch (CNTKException ex)
{
Console.WriteLine("Error: {0}\nNative CallStack: {1}\n Inner Exception: {2}", ex.Message, ex.NativeCallStack, ex.InnerException != null ? ex.InnerException.Message : "No Inner Exception");
}
catch (Exception ex)
{
Console.WriteLine("Error: {0}\nCallStack: {1}\n Inner Exception: {2}", ex.Message, ex.StackTrace, ex.InnerException != null ? ex.InnerException.Message : "No Inner Exception");
}
}
/// <summary>
/// Evaluates multiple instances of a model in the same process.
/// </summary>
/// <remarks>
/// Although all models execute concurrently (multiple tasks), each model is evaluated with a single task at a time.
/// </remarks>
private static void EvaluateMultipleModels()
{
// Specifies the number of models in memory as well as the number of parallel tasks feeding these models (1 to 1)
int numConcurrentModels = 4;
// Specifies the number of times to iterate through the test file (epochs)
int numRounds = 1;
// Counts the number of evaluations across all models
int count = 0;
// Counts the number of failed evaluations (output != expected) across all models
int errorCount = 0;
// The examples assume the executable is running from the data folder
// We switch the current directory to the data folder (assuming the executable is in the <CNTK>/x64/Debug|Release folder)
Environment.CurrentDirectory = Path.Combine(initialDirectory, @"..\..\Examples\Image\MNIST\Data\");
// Load model
string modelFilePath = Path.Combine(Environment.CurrentDirectory, @"..\Output\Models\02_Convolution");
// Initializes the model instances
ModelEvaluator.Initialize(numConcurrentModels, modelFilePath);
string testfile = Path.Combine(Environment.CurrentDirectory, @"Test-28x28_cntk_text.txt");
Stopwatch sw = new Stopwatch();
sw.Start();
try
{
for (int i = 0; i < numRounds; i++)
{
// Feed each line to a single model in parallel
Parallel.ForEach(File.ReadLines(testfile), new ParallelOptions() { MaxDegreeOfParallelism = numConcurrentModels }, (line) =>
{
Interlocked.Increment(ref count);
// The file format corresponds to the CNTK Text Format Reader format (https://github.com/Microsoft/CNTK/wiki/CNTKTextFormat-Reader)
var sets = line.Split('|');
var labels = sets[1].Trim().Split(' ').Skip(1);
var features = sets[2].Trim().Split(' ').Skip(1);
// Retrieve the 1-hot vector with the label index
var expected = labels.Select(float.Parse).Select((v, index) => new { Value = v, Index = index })
.Aggregate((a, b) => (a.Value > b.Value) ? a : b)
.Index;
// Retrieve the features
var inputs = features.Select(float.Parse).ToList();
// We can call the evaluate method and get back the results (single layer)...
var outputs = ModelEvaluator.Evaluate(inputs);
// Retrieve the outcome index (so we can compare it with the expected index)
var max = outputs.Select((v, index) => new { Value = v, Index = index })
.Aggregate((a, b) => (a.Value > b.Value) ? a : b)
.Index;
// Count the errors
if (expected != max)
{
Interlocked.Increment(ref errorCount);
}
});
}
}
catch (CNTKException ex)
{
Console.WriteLine("Error: {0}\nNative CallStack: {1}\n Inner Exception: {2}", ex.Message, ex.NativeCallStack, ex.InnerException != null ? ex.InnerException.Message : "No Inner Exception");
}
catch (Exception ex)
{
Console.WriteLine("Error: {0}\nCallStack: {1}\n Inner Exception: {2}", ex.Message, ex.StackTrace, ex.InnerException != null ? ex.InnerException.Message : "No Inner Exception");
}
sw.Stop();
ModelEvaluator.DisposeAll();
Console.WriteLine("The file {0} was processed using {1} concurrent model(s) with an error rate of: {2:P2} ({3} error(s) out of {4} record(s)), and a throughput of {5:N2} records/sec", @"Test-28x28_cntk_text.txt",
numConcurrentModels, (float)errorCount / count, errorCount, count, (count + errorCount) * 1000.0 / sw.ElapsedMilliseconds);
}
/// <summary>
/// This method shows how to evaluate a trained image classification model
/// </summary>
public static void EvaluateImageClassificationModel()
{
try
{
// This example requires the ResNet_18 model.
// The model can be downloaded from <see cref="https://www.cntk.ai/resnet/ResNet_18.model"/>
// The model is assumed to be located at: <CNTK>\Examples\Image\Miscellaneous\ImageNet\ResNet
// along with a sample image file named "zebra.jpg".
string workingDirectory = Path.Combine(initialDirectory, @"..\..\Examples\Image\Miscellaneous\ImageNet\ResNet");
Environment.CurrentDirectory = initialDirectory;
List<float> outputs;
using (var model = new IEvaluateModelManagedF())
{
string modelFilePath = Path.Combine(workingDirectory, "ResNet_18.model");
model.CreateNetwork(string.Format("modelPath=\"{0}\"", modelFilePath), deviceId: -1);
// Prepare input value in the appropriate structure and size
var inDims = model.GetNodeDimensions(NodeGroup.Input);
if (inDims.First().Value != 224 * 224 * 3)
{
throw new CNTKRuntimeException(string.Format("The input dimension for {0} is {1} which is not the expected size of {2}.", inDims.First(), inDims.First().Value, 224 * 224 * 3), string.Empty);
}
// Transform the image
string imageFileName = Path.Combine(workingDirectory, "zebra.jpg");
Bitmap bmp = new Bitmap(Bitmap.FromFile(imageFileName));
var resized = bmp.Resize(224, 224, true);
var resizedCHW = resized.ParallelExtractCHW();
var inputs = new Dictionary<string, List<float>>() { {inDims.First().Key, resizedCHW } };
// We can call the evaluate method and get back the results (single layer output)...
var outDims = model.GetNodeDimensions(NodeGroup.Output);
outputs = model.Evaluate(inputs, outDims.First().Key);
}
// Retrieve the outcome index (so we can compare it with the expected index)
var max = outputs.Select((value, index) => new { Value = value, Index = index })
.Aggregate((a, b) => (a.Value > b.Value) ? a : b)
.Index;
Console.WriteLine("Outcome: {0}", max);
}
catch (CNTKException ex)
{
Console.WriteLine("Error: {0}\nNative CallStack: {1}\n Inner Exception: {2}", ex.Message, ex.NativeCallStack, ex.InnerException != null ? ex.InnerException.Message : "No Inner Exception");
}
catch (Exception ex)
{
Console.WriteLine("Error: {0}\nCallStack: {1}\n Inner Exception: {2}", ex.Message, ex.StackTrace, ex.InnerException != null ? ex.InnerException.Message : "No Inner Exception");
}
}
/// <summary>
/// Dumps the output to the console
/// </summary>

View file

@ -49,25 +49,21 @@ train = [
maxEpochs = 10
]
# Note: this reader crashes if randomization is turned on.
reader = [
readerType = "UCIFastReader"
# To get the data (Train-28x28.txt) please run `python mnist_convert.py`
# from the 'AdditionalFiles' folder. See README.md for details.
file = "$DataDir$/Train-28x28.txt"
features = [
dim = 784
start = 1
readerType = "CNTKTextFormatReader"
# See ../README.md for details on getting the data (Train-28x28_cntk_text.txt).
file = "$DataDir$/Train-28x28_cntk_text.txt"
input = [
features = [
dim = 784
format = "dense"
]
labels = [
dim = 10
format = "dense"
]
]
labels = [
dim = 1
start = 0
labelDim = 10
labelMappingFile = "$DataDir$/labelsmap.txt"
]
]
]
]
#######################################
@ -83,19 +79,17 @@ test = [
]
reader = [
readerType = "UCIFastReader"
file = "$DataDir$/Test-28x28.txt"
features = [
dim = 784
start = 1
]
labels = [
dim = 1
start = 0
labelDim = 10
labelMappingFile = "$DataDir$/labelsmap.txt"
readerType = "CNTKTextFormatReader"
file = "$DataDir$/Test-28x28_cntk_text.txt"
input = [
features = [
dim = 784
format = "dense"
]
labels = [
dim = 10
format = "dense"
]
]
]
]

View file

@ -7,7 +7,7 @@
|Purpose |This example demonstrates usage of the NDL (Network Description Language) to define networks.
|Network |NDLNetworkBuilder, simple feed forward and convolutional networks, cross entropy with softmax.
|Training |Stochastic gradient descent both with and without momentum.
|Comments |There are two config files, details are provided below.
|Comments |There are four config files, details are provided below.
## Running the example
@ -57,7 +57,7 @@ The output folder will be created inside Image/MNIST/.
### Config files
There are three config files and corresponding network description files in the 'Config' folder:
There are four config files and the corresponding network description files in the 'Config' folder:
1. 01_OneHidden.ndl is a simple, one-hidden-layer network that achieves a 2.3% error rate.
To run the sample, navigate to the Data folder and run the following command:
@ -74,7 +74,11 @@ As a result, it achieves around 0.8% of error after training for just 2 epochs (
To run the sample, navigate to the Data folder and run the following command:
`cntk configFile=../Config/03_ConvBatchNorm.cntk`
For more details, refer to .ndl and corresponding .cntk files.
4. 04_DeConv.ndl illustrates the usage of Deconvolution and Unpooling. It is a network with one Convolution, one Pooling, one Unpooling and one Deconvolution layer. In fact it is an auto-encoder network in which the Rectified Linear Unit (ReLU) or Sigmoid layer is replaced with Convolutional ReLU (for encoding) and Deconvolutional ReLU (for decoding) layers. The goal of the network is to reconstruct the original signal, with Mean Squared Error (MSE) used to minimize the reconstruction error. Generally such networks are used in semantic segmentation.
To run the sample, navigate to the Data folder and run the following command:
`cntk configFile=../Config/04_DeConv.cntk`
For more details, refer to .ndl and the corresponding .cntk files.
### Additional files

Binary data
Examples/Image/Miscellaneous/ImageNet/ResNet/zebra.jpg (new file, 92 KiB; binary file not displayed)

View file

@ -0,0 +1,226 @@
# The configuration file to build a language understanding model with the ATIS corpus.
# An LSTM model is built to tag each word in sentences with its semantic label.
WorkDir = work
DataDir = data
modelPath = $WorkDir$/ATIS.slot.lstm
parallelTrain = true
#stderr = $WorkDir$/log
command = Train:Output:Test
precision = "float"
deviceId = "-1" # change to "auto" to use GPUs
wordCount = 944 # number of words
labelCount = 127 # number of labels
# The command to train the LSTM model
Train = [
action = train
BrainScriptNetworkBuilder = [
inputDim = $wordCount$
labelDim = $labelCount$
featDim = inputDim*3 # contextual words are used as features: previous word, current word, next word.
embDim = 150
hiddenDim = 300
maxLayer = 1
initScale = 6
featuresPW = Input(inputDim) # the previous word
featuresCW = Input(inputDim) # the current word
featuresNW = Input(inputDim) # the next word
features = RowStack(featuresPW : featuresCW : featuresNW)
labels = Input(labelDim, tag = "label")
# embedding layer
emb = Parameter(embDim, featDim)
featEmbedded = emb * features
# build the LSTM stack
lstmDims[i:0..maxLayer-1] = hiddenDim
NoAuxInputHook (input, lstmState) = BS.Constants.None
lstmStack = BS.RNNs.RecurrentLSTMPStack (lstmDims,
cellDims=lstmDims,
featEmbedded,
inputDim=embDim,
previousHook=BS.RNNs.PreviousHC,
augmentInputHook=BS.RNNs.NoAuxInputHook,
augmentInputDim=0,
enableSelfStabilization=false)
lstmOutputLayer = Length (lstmStack)-1
LSTMoutput = lstmStack[lstmOutputLayer].h
W = Parameter(labelDim, hiddenDim, init = "uniform", initValueScale=initScale)
b = Parameter(labelDim, 1, init = "fixedValue", value=0)
outputs = W * LSTMoutput + b
cr = CrossEntropyWithSoftmax(labels, outputs)
criterionNodes = (cr)
evaluationNodes = (cr)
outputNodes = (outputs)
]
SGD = [
# maximum number of epochs
maxEpochs = 1 # set to 1 so this can be added to the regression test. Increase to 20 to get good accuracy
# for each epoch, maximum number of input samples(words) is set below
epochSize = 36000
# minibatchSize should be larger than the maximum sentence length
minibatchSize = 70
learningRatesPerSample = 0.01*2:0.005*12:0.001
gradUpdateType = "FSAdaGrad"
gradientClippingWithTruncation = true
clippingThresholdPerSample = 15.0
# number of minibatches to report progress
numMBsToShowResult = 100
firstMBsToShowResult = 10
# if validation shows that the model has no improvement, then back up to the previously
# estimated model and reduce the learning rate
loadBestModel = true
parallelTrain = [
parallelizationMethod = "DataParallelSGD"
parallelizationStartEpoch = 2
distributedMBReading = true
dataParallelSGD = [
gradientBits = 1
]
]
]
reader = [
readerType = "CNTKTextFormatReader"
file = "$DataDir$/ATIS.train.cntk.sparse"
randomize = true
input = [
featuresPW = [
alias = "PW" # previous word
dim = $wordCount$
format = "sparse"
]
featuresCW = [
alias = "CW" # current word
dim = $wordCount$
format = "sparse"
]
featuresNW = [
alias = "NW" # next word
dim = $wordCount$
format = "sparse"
]
labels = [
alias = "L" # label
dim = $labelCount$
format = "sparse"
]
]
]
]
# Evaluate the model to predict labels
Output = [
action = "write"
traceLevel = 1
epochSize = 0
defaultHiddenActivity = 0.1
BrainScriptNetworkBuilder = [
modelAsTrained = BS.Network.Load ("$modelPath$")
final = Hardmax(modelAsTrained.outputs)
]
outputPath = $WorkDir$/model.writeaction
outputNodeNames = final
reader = [
readerType = "CNTKTextFormatReader"
file = "$DataDir$/ATIS.test.cntk.sparse"
randomize = false
input = [
featuresPW = [
alias = "PW" # previous word
dim = $wordCount$
format = "sparse"
]
featuresCW = [
alias = "CW" # current word
dim = $wordCount$
format = "sparse"
]
featuresNW = [
alias = "NW" # next word
dim = $wordCount$
format = "sparse"
]
labels = [
alias = "L" # label
dim = $labelCount$
format = "sparse"
]
]
]
]
# Evaluate the model's accuracy
Test = [
action = "test"
traceLevel = 1
epochSize = 0
defaultHiddenActivity = 0.1
BrainScriptNetworkBuilder = [
labels = Input($labelCount$, tag = "label")
modelAsTrained = BS.Network.Load ("$modelPath$")
final = Hardmax(modelAsTrained.outputs)
errorRate = ErrorPrediction(labels, final, tag='evaluation')
]
evalNodeNames = errorRate
reader = [
readerType = "CNTKTextFormatReader"
file = "$DataDir$/ATIS.test.cntk.sparse"
randomize = false
input = [
featuresPW = [
alias = "PW" # previous word
dim = $wordCount$
format = "sparse"
]
featuresCW = [
alias = "CW" # current word
dim = $wordCount$
format = "sparse"
]
featuresNW = [
alias = "NW" # next word
dim = $wordCount$
format = "sparse"
]
labels = [
alias = "L" # label
dim = $labelCount$
format = "sparse"
]
]
]
]

View file

@ -0,0 +1,168 @@
# Build Language Understanding Models with CNTK
This example demonstrates how to use build language understanding model with CNTK using ATIS data set. This example is similar to
[SLU example](https://github.com/Microsoft/CNTK/tree/master/Examples/Text/Miscellaneous/SLU). They are different in that
- CNTKTextFormatReader is used here, instead of LUSequenceReader
- With CNTKTextFormatReader, the input format is much more flexible. In the example setting, sparse contextual feature vectors are explored
- Sparse label input is used.
The Air Travel Information System (ATIS) corpus is used for training and testing.
## Download the example
The data and configuration are checked in to GitHub. You can get them with the command:
`git clone https://github.com/Microsoft/cntk`
The example is under the folder:
`<cntk_root>\Examples\Text\ATIS`
## Data File Format
There are four files under the `data` sub-folder:
|Files |Content |
|:----------------------|:--------|
|ATIS.train.cntk.sparse |featurized training data set
|ATIS.test.cntk.sparse |featurized test data set
|ATIS.vocab |all words extracted from training data. Vocab size: 944
|ATIS.labels |all semantic labels extracted from training data. Total labels: 127
We preprocess ATIS data by converting words into word indexes, and labels into label IDs in order to use
[CNTKTextFormatReader](https://github.com/Microsoft/CNTK/wiki/CNTKTextFormat-Reader). You can use any
script/tool to preprocess your text data files. In this example, the data is already preprocessed.
The last two files, ATIS.vocab and ATIS.labels, are not strictly required to run the example. They are included for evaluation and debugging purposes:
e.g. they can be used to convert the .sparse files back to the original text files.
To understand the data format (two .sparse files), let's start with a sample sentence:
```
BOS i would like to find a flight from charlotte to Las Vegas that makes a stop in St. Louis EOS
```
It is converted into the following text:
```
1 |PW 1:1 |CW 1:1 |NW 12:1 |L 126:1
1 |PW 1:1 |CW 12:1 |NW 39:1 |L 126:1
1 |PW 12:1 |CW 39:1 |NW 28:1 |L 126:1
1 |PW 39:1 |CW 28:1 |NW 3:1 |L 126:1
1 |PW 28:1 |CW 3:1 |NW 86:1 |L 126:1
1 |PW 3:1 |CW 86:1 |NW 15:1 |L 126:1
1 |PW 86:1 |CW 15:1 |NW 10:1 |L 126:1
1 |PW 15:1 |CW 10:1 |NW 4:1 |L 126:1
1 |PW 10:1 |CW 4:1 |NW 101:1 |L 126:1
1 |PW 4:1 |CW 101:1 |NW 3:1 |L 48:1
1 |PW 101:1 |CW 3:1 |NW 92:1 |L 126:1
1 |PW 3:1 |CW 92:1 |NW 90:1 |L 78:1
1 |PW 92:1 |CW 90:1 |NW 33:1 |L 123:1
1 |PW 90:1 |CW 33:1 |NW 338:1 |L 126:1
1 |PW 33:1 |CW 338:1 |NW 15:1 |L 126:1
1 |PW 338:1 |CW 15:1 |NW 132:1 |L 126:1
1 |PW 15:1 |CW 132:1 |NW 17:1 |L 126:1
1 |PW 132:1 |CW 17:1 |NW 72:1 |L 126:1
1 |PW 17:1 |CW 72:1 |NW 144:1 |L 71:1
1 |PW 72:1 |CW 144:1 |NW 2:1 |L 119:1
1 |PW 144:1 |CW 2:1 |NW 2:1 |L 126:1
```
Here the first column identifies the sequence (sentence) ID, which is the same for all words of the same sentence. There are four input streams: PW, CW, NW, and L.
The input "PW" carries the previous word ID, "CW" the current word, and "NW" the next word. The input "L" carries the labels. The input names can be anything you
like, and you can add more inputs as needed, e.g. words in a bigger window.
The words "BOS" and "EOS" denote the beginning and end of a sentence, respectively.
Each line above represents one sample (word). For example, the line `1 |PW 4:1 |CW 101:1 |NW 3:1 |L 48:1` means:
* the sequence ID is 1
* the current word is "charlotte" whose word ID is 101
* the previous word is "from" whose ID is 4
* the next word is "to" whose ID is 3
* the semantic label is "B-fromloc.city_name" whose label ID is 48.
All word IDs, label IDs and corresponding words and labels are stored in ATIS.vocab and ATIS.labels.
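As a quick sanity check, one can decode a line of the .sparse files back into words and a label with a few lines of Python. The sketch below is only an illustration (it is not part of the checked-in example) and assumes that a token's ID is its zero-based line number in ATIS.vocab / ATIS.labels, as in the sample above.
```python
# Illustration only: decode one CTF sparse sample using ATIS.vocab and ATIS.labels.
# Assumes a token's ID equals its zero-based line number in the corresponding file.
def load_list(path):
    with open(path, encoding="utf-8") as f:
        return [line.rstrip("\n") for line in f]

vocab  = load_list("ATIS.vocab")    # index -> word
labels = load_list("ATIS.labels")   # index -> semantic label

def decode(ctf_line):
    seq_id, *streams = ctf_line.strip().split("|")
    decoded = {"seq": seq_id.strip()}
    for stream in streams:
        name, value = stream.split(None, 1)   # e.g. name="CW", value="101:1"
        index = int(value.split(":")[0])      # sparse entry is "index:value"
        decoded[name] = labels[index] if name == "L" else vocab[index]
    return decoded

print(decode("1 |PW 4:1 |CW 101:1 |NW 3:1 |L 48:1"))
# {'seq': '1', 'PW': 'from', 'CW': 'charlotte', 'NW': 'to', 'L': 'B-fromloc.city_name'}
```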
## CNTK Configuration
In this example, we use BrainScript to create a one-layer LSTM with embedding for slot tagging. The consolidated config file is ATIS.cntk. One can check the file (which contains comments)
for details, especially how the reader is configured.
reader=[
readerType = "CNTKTextFormatReader"
file = "$DataDir$/ATIS.train.cntk.sparse"
miniBatchMode = "partial"
randomize = true
input = [
featuresPW = [
alias = "PW" # previous word
dim = $wordCount$
format = "sparse"
]
featuresCW = [
alias = "CW" # current word
dim = $wordCount$
format = "sparse"
]
featuresNW = [
alias = "NW" # next word
dim = $wordCount$
format = "sparse"
]
labels = [
alias = "L" # label
dim = $labelCount$
format = "sparse"
]
]
]
The above section tells CNTK to use CNTKTextFormatReader to read data from the file "$DataDir$/ATIS.train.cntk.sparse". The same input names (PW, CW, NW, L) are used to refer to the inputs (features and labels) provided in the data files. The input is read into different
feature vectors: featuresPW, featuresCW, featuresNW and labels. These vectors are later used to build the LSTM network with BrainScript as follows.
```
featuresPW = Input(inputDim)
featuresCW = Input(inputDim)
featuresNW = Input(inputDim)
features = RowStack(featuresPW : featuresCW : featuresNW)
labels=Input(labelDim, tag="label")
# embedding layer
emb = LearnableParameter(embDim, featDim)
featEmbedded = Times(emb, features)
# build the LSTM stack
lstmDims[i:0..maxLayer] = hiddenDim
NoAuxInputHook (input, lstmState) = BS.Constants.None
lstmStack = BS.RNNs.RecurrentLSTMPStack (lstmDims,
cellDims=lstmDims,
featEmbedded,
inputDim=embDim,
previousHook=BS.RNNs.PreviousHC,
augmentInputHook=BS.RNNs.NoAuxInputHook,
augmentInputDim=0,
enableSelfStabilization=false)
lstmOutputLayer = Length (lstmStack)-1
LSTMoutput = lstmStack[lstmOutputLayer].h
```
A few other notes about the config:
- it is important to specify that the format is "sparse".
- gradUpdateType is set to FSAdaGrad. This setting yields better model accuracy than the other update methods.
- multiple LSTM layers can be used by changing the value of maxLayer.
Three commands are configured: Train, Output and Test. The command "Train" is used to train a model, "Output" is used to evaluate the model against a test set and store
the model output, and the command "Test" calculates the model's accuracy.
## Run the example
One can run the example locally or on Philly (for Microsoft internal users).
To run locally,
```sh
> mkdir work # the default work_dir
> # edit ATIS.cntk and set deviceId: -1 for CPU, auto for GPU
> cntk.exe configFile=ATIS.cntk
```
By default, maxEpochs is set to 1 to save training time. One can change it to a larger value, such as 20, in order to get good model accuracy.
Depending on the GPU, it normally takes about 20 minutes to run 20 epochs on a single GPU, and the slot F1 score is about 93.
**For Microsoft users only**, to run the job on Philly:
- first upload the data folder to the Philly cloud, e.g. `\\storage.gcr.philly.selfhost.corp.microsoft.com\pnrsy\<your_alias>\ATIS`
- then upload the config file to the Philly cloud, e.g. `\\storage.gcr.philly.selfhost.corp.microsoft.com\pnrsy_scratch\<your_alias>\ATIS`
- go to http://philly/ to create a new job by specifying the data folder and config file, and start the job.
More details about Philly, including how to upload data to Philly and start jobs, can be found [here](https://microsoft.sharepoint.com/teams/ATISG/SitePages/Philly%20Users%20Guide.aspx)


@ -0,0 +1,127 @@
B-aircraft_code
B-airline_code
B-airline_name
B-airport_code
B-airport_name
B-arrive_date.date_relative
B-arrive_date.day_name
B-arrive_date.day_number
B-arrive_date.month_name
B-arrive_date.today_relative
B-arrive_time.end_time
B-arrive_time.period_mod
B-arrive_time.period_of_day
B-arrive_time.start_time
B-arrive_time.time
B-arrive_time.time_relative
B-booking_class
B-city_name
B-class_type
B-compartment
B-connect
B-cost_relative
B-day_name
B-day_number
B-days_code
B-depart_date.date_relative
B-depart_date.day_name
B-depart_date.day_number
B-depart_date.month_name
B-depart_date.today_relative
B-depart_date.year
B-depart_time.end_time
B-depart_time.period_mod
B-depart_time.period_of_day
B-depart_time.start_time
B-depart_time.time
B-depart_time.time_relative
B-economy
B-fare_amount
B-fare_basis_code
B-flight
B-flight_days
B-flight_mod
B-flight_number
B-flight_stop
B-flight_time
B-fromloc.airport_code
B-fromloc.airport_name
B-fromloc.city_name
B-fromloc.state_code
B-fromloc.state_name
B-meal
B-meal_code
B-meal_description
B-mod
B-month_name
B-or
B-period_of_day
B-restriction_code
B-return_date.date_relative
B-return_date.day_name
B-return_date.day_number
B-return_date.month_name
B-return_date.today_relative
B-return_time.period_mod
B-return_time.period_of_day
B-round_trip
B-state_code
B-state_name
B-stoploc.airport_code
B-stoploc.airport_name
B-stoploc.city_name
B-stoploc.state_code
B-time
B-time_relative
B-today_relative
B-toloc.airport_code
B-toloc.airport_name
B-toloc.city_name
B-toloc.country_name
B-toloc.state_code
B-toloc.state_name
B-transport_type
I-airline_name
I-airport_name
I-arrive_date.day_number
I-arrive_time.end_time
I-arrive_time.period_of_day
I-arrive_time.start_time
I-arrive_time.time
I-arrive_time.time_relative
I-city_name
I-class_type
I-cost_relative
I-depart_date.day_number
I-depart_date.today_relative
I-depart_time.end_time
I-depart_time.period_of_day
I-depart_time.start_time
I-depart_time.time
I-depart_time.time_relative
I-economy
I-fare_amount
I-fare_basis_code
I-flight_mod
I-flight_number
I-flight_stop
I-flight_time
I-fromloc.airport_name
I-fromloc.city_name
I-fromloc.state_name
I-meal_code
I-meal_description
I-restriction_code
I-return_date.date_relative
I-return_date.day_number
I-return_date.today_relative
I-round_trip
I-state_name
I-stoploc.city_name
I-time
I-today_relative
I-toloc.airport_name
I-toloc.city_name
I-toloc.state_name
I-transport_type
O

Diff not shown because of its large size.

Diff not shown because of its large size.


@ -0,0 +1,944 @@
</s>
BOS
EOS
to
from
flights
the
on
what
me
flight
show
i
boston
san
a
denver
in
and
francisco
atlanta
is
pittsburgh
dallas
all
baltimore
list
philadelphia
like
are
airlines
of
between
that
washington
pm
leaving
please
morning
would
fly
for
city
fare
wednesday
first
need
after
trip
oakland
there
ground
round
does
transportation
'd
which
cheapest
you
arriving
class
before
available
american
new
fares
milwaukee
with
give
have
afternoon
york
st.
one
dc
at
way
monday
leave
arrive
airport
thursday
how
want
tuesday
nonstop
find
am
earliest
go
vegas
miami
las
united
information
orlando
phoenix
chicago
sunday
saturday
evening
charlotte
twenty
newark
can
delta
toronto
seattle
diego
kansas
indianapolis
houston
airline
noon
any
friday
lake
salt
's
next
us
o'clock
cleveland
continental
air
angeles
los
august
worth
do
fort
july
stop
code
5
seventh
early
memphis
tell
aircraft
downtown
or
june
6
louis
montreal
cincinnati
around
tomorrow
cost
going
latest
petersburg
tampa
many
minneapolis
nashville
8
get
mean
jose
detroit
10
an
departing
stopover
tacoma
by
about
twa
much
7
leaves
may
long
type
burbank
see
expensive
ticket
international
12
travel
could
dollars
than
daily
columbus
service
beach
'm
california
9
night
least
know
economy
time
4
depart
into
meal
paul
coach
book
april
airports
northwest
la
lowest
now
december
less
westchester
day
serves
it
serve
november
okay
arrives
used
field
love
last
ontario
second
county
return
kind
september
mitchell
general
as
stops
flying
2
third
be
direct
fifth
eighth
stopping
times
breakfast
out
make
capacity
car
take
schedule
seating
sixth
1000
number
goes
cities
dinner
connecting
3
dl
fourth
airfare
possible
this
has
served
meals
ninth
looking
also
restriction
week
late
eastern
returning
back
today
interested
price
business
most
prices
1991
two
types
flies
twentieth
will
through
limousine
ua
bwi
via
tenth
using
stand
plane
ap
fifteenth
guardia
same
1
should
other
arrangements
f
only
rental
then
display
your
shortest
wednesdays
listing
canadian
classes
again
numbers
thirtieth
florida
express
midwest
tickets
where
twelfth
sixteenth
h
north
eleventh
carolina
seventeenth
under
smallest
mco
distance
lunch
either
makes
if
qx
transport
far
hp
57
october
no
my
m80
thank
arizona
jfk
colorado
jersey
q
weekday
airplane
y
planes
some
departure
use
ewr
their
ohio
thirty
nineteenth
when
fourteenth
explain
layover
alaska
march
stopovers
live
people
traveling
serving
rent
hi
offer
later
yes
january
area
logan
right
booking
sfo
midnight
yn
but
during
landings
february
dfw
abbreviation
630
both
're
230
qw
boeing
coming
passengers
arrange
hours
qo
codes
trying
tower
466
canada
each
530
over
uses
arrivals
11
southwest
281
trips
838
days
those
takeoffs
lufthansa
west
1100
arrival
757
minnesota
anywhere
america
430
thrift
let
mornings
nationair
'll
kinds
cheap
close
seats
pennsylvania
name
quebec
indiana
michigan
saturdays
different
taxi
provided
rates
utah
these
starting
sometime
costs
making
bh
eighteenth
following
another
ff
near
747
ea
1992
connect
help
choices
sa
maximum
wish
1115
six
weekdays
more
total
s
dc10
d9s
2100
snack
1245
georgia
72s
73s
f28
heading
departures
amount
825
737
813
ap57
sixteen
m
sorry
serviced
three
miles
departs
1700
requesting
718
land
nevada
100
so
tennessee
tuesdays
hello
destination
reservation
texas
rentals
co
meaning
ap80
1500
270
thursdays
philly
thirteenth
services
sundays
turboprop
stands
415
provide
cars
we
great
mondays
include
sure
't
well
2134
fn
555
ord
934
connection
296
abbreviations
755
highest
hold
720
fit
80
soon
four
ten
noontime
too
offers
options
within
difference
c
restrictions
plan
originating
describe
nw
1110
connections
dulles
21
733
say
approximately
define
852
1291
rate
who
proper
beginning
being
329
352
don
1024
such
wanted
615
mealtime
provides
prefer
1288
257
across
continent
overnight
local
route
746
off
j31
closest
19
lax
l10
be1
1994
red
eye
not
aa
dca
determine
1200
1205
dtw
airfares
capacities
200
town
lga
300
1993
database
1765
eight
up
originate
look
cp
carries
here
201
located
dinnertime
1039
lastest
1222
they
just
d
limo
3724
210
stapleton
343
1145
schedules
932
nonstops
without
landing
b
midway
217
bound
727
takeoff
324
train
along
friends
transcontinental
missouri
reservations
lives
767
269
ac
atl
month
taking
repeat
845
airplanes
buy
still
itinerary
actually
earlier
various
reaching
very
names
505
grounds
ap68
must
kennedy
operation
4400
1201
297
question
combination
basis
laying
1133
650
tonight
43
ls
sam
ap58
once
nighttime
yx
kw
212
1600
tpa
prior
good
1800
819
inform
k
dc9
305
anything
771
459
calling
designate
417
spend
hou
1220
directly
jet
reverse
staying
l1011
belong
445
515
travels
order
mci
150
110
connects
charges
minimum
intercontinental
497766
sounds
811
seat
final
phl
20
start
823
1059
271
382
able
put
locate
hartfield
scheduled
run
225
1158
equipment
begins
lands
reaches
carried
wn
bn
try
included
130
continuing
india
lester
pearson
listings
1209
everywhere
sd
whether
offered
486
1300
950
usa
1045
al
currently
enroute
visit
them
takes
55
thing
705
fridays
catch
straight
advertises
having
planning
listed
1055
405
468
equal
working
sb
hopefully
dh8
symbols
sort
cover
810
operating
320
639
seventeen
1207
608
besides
companies
've
got
somebody
else
wants
level
vicinity
1940
311
mia
instead
priced
eleven
comes
greatest
summer
economic
bay
402
gets
date
1020
730
400
doesn
toward
home
1850
1505
runs
673
723
thanks
bring
zone
yyz
afternoons
non
largest
500
come
428
98
qualify
279
137338
d10
539
fine
while
665
concerning
iah
1230
oak
preferably
twelve
3357
323
nights
229
regarding
seven
inexpensive
420
416
repeating
scenario
139
82
kindly
limousines
345
afterwards
734
place
includes
106
1026
124
fifteen
bna
supper
oh
71
thereafter
2153
year
discount
1130
1030
world
trans
including
represented
o
'hare
exceeding
815
928
163
bur
419
cvg
1017
315
842
1083
0900
longest
called
snacks
645
ever
single

Makefile

@ -35,6 +35,9 @@
# defaults to /usr/local/
# These can be overridden on the command line, e.g. make BUILDTYPE=debug
# TODO: Build static libraries for common dependencies that are shared by multiple
# targets, e.g. eval and CNTK.
ARCH=$(shell uname)
ifndef BUILD_TOP
@ -68,7 +71,7 @@ INCLUDEPATH:= $(addprefix $(SOURCEDIR)/, Common/Include CNTKv2LibraryDll CNTKv2L
# COMMON_FLAGS include settings that are passed both to NVCC and C++ compilers.
COMMON_FLAGS:= -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K -std=c++11
CPPFLAGS:=
CXXFLAGS:= -msse3 -std=c++0x -fopenmp -fpermissive -fPIC -Werror -fcheck-new
CXXFLAGS:= -msse4.1 -mssse3 -std=c++0x -fopenmp -fpermissive -fPIC -Werror -fcheck-new
LIBPATH:=
LIBS:=
LDFLAGS:=
@ -87,7 +90,7 @@ SRC:=
all : buildall
# Set up basic nvcc options and add CUDA targets from above
CUFLAGS = -m 64
CUFLAGS = -m 64
ifdef CUDA_PATH
ifndef GDK_PATH
@ -107,7 +110,7 @@ ifdef CUDA_PATH
# This is a suggested/default location for NVML
INCLUDEPATH+=$(GDK_PATH)/include/nvidia/gdk
INCLUDEPATH+=$(CUB_PATH)
NVMLPATH=$(GDK_PATH)/src/gdk/nvml/lib
NVMLLIBPATH=$(GDK_PATH)/src/gdk/nvml/lib
# Set up CUDA includes and libraries
INCLUDEPATH += $(CUDA_PATH)/include
@ -167,6 +170,10 @@ ifdef KALDI_PATH
KALDI_LIBS += -lkaldi-util -lkaldi-matrix -lkaldi-base -lkaldi-hmm -lkaldi-cudamatrix -lkaldi-nnet -lkaldi-lat
endif
ifdef SUPPORT_AVX2
CPPFLAGS += -mavx2
endif
# Set up nvcc target architectures (will generate code to support them all, i.e. fat-binary, in release mode)
# In debug mode we will rely on JIT to create code "on the fly" for the underlying architecture
GENCODE_SM30 := -gencode arch=compute_30,code=\"sm_30,compute_30\"
@ -225,6 +232,7 @@ ORIGINDIR:='$$ORIGIN'
CNTKMATH:=cntkmath
RPATH=-Wl,-rpath,
########################################
# Build info
@ -239,7 +247,6 @@ ifneq ("$(BUILDINFO_OUTPUT)","Success")
$(error Could not generate $(BUILDINFO))
endif
########################################
# Math library
########################################
@ -269,6 +276,7 @@ COMMON_SRC =\
$(SOURCEDIR)/Common/fileutil.cpp \
MATH_SRC =\
$(SOURCEDIR)/Math/BlockHandlerSSE.cpp \
$(SOURCEDIR)/Math/CPUMatrix.cpp \
$(SOURCEDIR)/Math/CPUSparseMatrix.cpp \
$(SOURCEDIR)/Math/CPURNGHandle.cpp \
@ -282,6 +290,12 @@ MATH_SRC =\
$(SOURCEDIR)/Math/ConvolutionEngine.cpp \
$(SOURCEDIR)/Math/BatchNormalizationEngine.cpp \
ifdef SUPPORT_AVX2
MATH_SRC +=\
$(SOURCEDIR)/Math/BlockHandlerAVX.cpp \
endif
ifdef CUDA_PATH
MATH_SRC +=\
$(SOURCEDIR)/Math/GPUMatrix.cu \
@ -310,14 +324,13 @@ CNTKMATH_LIB:= $(LIBDIR)/lib$(CNTKMATH).so
ALL += $(CNTKMATH_LIB)
SRC+=$(MATH_SRC)
RPATH=-Wl,-rpath,
$(CNTKMATH_LIB): $(MATH_OBJ)
@echo $(SEPARATOR)
@echo creating $@ for $(ARCH) with build type $(BUILDTYPE)
@mkdir -p $(dir $@)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBPATH) $(NVMLPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -fopenmp
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBPATH) $(NVMLLIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -fopenmp
########################################
# CNTKLibrary
########################################
@ -362,6 +375,8 @@ CNTKLIBRARY_SRC =\
$(SOURCEDIR)/CNTKv2LibraryDll/Utils.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/Value.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/Variable.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/Learner.cpp \
CNTKLIBRARY_SRC+=$(CNTK_COMMON_SRC)
CNTKLIBRARY_SRC+=$(COMPUTATION_NETWORK_LIB_SRC)
@ -376,14 +391,13 @@ CNTKLIBRARY_LIB:=$(LIBDIR)/lib$(CNTKLIBRARY).so
ALL+=$(CNTKLIBRARY_LIB)
SRC+=$(CNTKLIBRARY_SRC)
RPATH=-Wl,-rpath,
$(CNTKLIBRARY_LIB): $(CNTKLIBRARY_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building output for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(NVMLPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(NVMLLIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH)
########################################
# CNTKLibrary tests
########################################
@ -400,14 +414,70 @@ CNTKLIBRARY_TESTS_OBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(patsubst %.cpp, $(OBJ
ALL+=$(CNTKLIBRARY_TESTS)
SRC+=$(CNTKLIBRARY_TESTS_SRC)
RPATH=-Wl,-rpath,
$(CNTKLIBRARY_TESTS): $(CNTKLIBRARY_TESTS_OBJ) | $(CNTKLIBRARY_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building output for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(NVMLPATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKLIBRARY) -l$(CNTKMATH)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(NVMLLIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKLIBRARY) -l$(CNTKMATH)
########################################
# LibEval
########################################
EVAL:=eval
SGDLIB_SRC=\
$(SOURCEDIR)/SGDLib/Profiler.cpp \
$(SOURCEDIR)/SGDLib/SGD.cpp
EVAL_SRC=\
$(SOURCEDIR)/EvalDll/CNTKEval.cpp \
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptEvaluator.cpp \
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptParser.cpp \
$(SOURCEDIR)/CNTK/ModelEditLanguage.cpp \
$(SOURCEDIR)/ActionsLib/EvalActions.cpp \
$(SOURCEDIR)/ActionsLib/NetworkFactory.cpp \
$(SOURCEDIR)/ActionsLib/NetworkDescriptionLanguage.cpp \
$(SOURCEDIR)/ActionsLib/SimpleNetworkBuilder.cpp \
$(SOURCEDIR)/ActionsLib/NDLNetworkBuilder.cpp \
EVAL_SRC+=$(SGDLIB_SRC)
EVAL_SRC+=$(COMPUTATION_NETWORK_LIB_SRC)
EVAL_SRC+=$(CNTK_COMMON_SRC)
EVAL_SRC+=$(SEQUENCE_TRAINING_LIB_SRC)
EVAL_OBJ:=$(patsubst %.cu, $(OBJDIR)/%.o, $(patsubst %.cpp, $(OBJDIR)/%.o, $(EVAL_SRC)))
EVAL_LIB:=$(LIBDIR)/lib$(EVAL).so
ALL+=$(EVAL_LIB)
SRC+=$(EVAL_SRC)
$(EVAL_LIB): $(EVAL_OBJ)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo Building $(EVAL_LIB) for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(NVMLLIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS)
########################################
# Eval Sample client
########################################
EVAL_SAMPLE_CLIENT:=$(BINDIR)/cppevalclient
EVAL_SAMPLE_CLIENT_SRC=\
$(SOURCEDIR)/../Examples/Evaluation/CPPEvalClient/CPPEvalClient.cpp
EVAL_SAMPLE_CLIENT_OBJ:=$(patsubst %.cpp, $(OBJDIR)/%.o, $(EVAL_SAMPLE_CLIENT_SRC))
ALL+=$(EVAL_SAMPLE_CLIENT)
SRC+=$(EVAL_SAMPLE_CLIENT_SRC)
$(EVAL_SAMPLE_CLIENT): $(EVAL_SAMPLE_CLIENT_OBJ) | $(EVAL_LIB) $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $(EVAL_SAMPLE_CLIENT) for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ -l$(EVAL) -l$(CNTKMATH)
########################################
# BinaryReader plugin
########################################
@ -692,8 +762,6 @@ CNTK_SRC =\
$(SOURCEDIR)/CNTK/CNTK.cpp \
$(SOURCEDIR)/CNTK/ModelEditLanguage.cpp \
$(SOURCEDIR)/CNTK/tests.cpp \
$(SOURCEDIR)/SGDLib/Profiler.cpp \
$(SOURCEDIR)/SGDLib/SGD.cpp \
$(SOURCEDIR)/ActionsLib/TrainActions.cpp \
$(SOURCEDIR)/ActionsLib/EvalActions.cpp \
$(SOURCEDIR)/ActionsLib/OtherActions.cpp \
@ -706,7 +774,7 @@ CNTK_SRC =\
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptParser.cpp \
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptTest.cpp \
CNTK_SRC+=$(SGDLIB_SRC)
CNTK_SRC+=$(CNTK_COMMON_SRC)
CNTK_SRC+=$(COMPUTATION_NETWORK_LIB_SRC)
CNTK_SRC+=$(SEQUENCE_TRAINING_LIB_SRC)
@ -721,7 +789,7 @@ $(CNTK): $(CNTK_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building output for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(NVMLPATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH) -fopenmp
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(NVMLLIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH) -fopenmp
# deployable resources: standard library of BS
CNTK_CORE_BS:=$(BINDIR)/cntk.core.bs
@ -731,6 +799,127 @@ $(CNTK_CORE_BS): $(SOURCEDIR)/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs
@echo bin-placing deployable resource files
cp -f $^ $@
########################################
# Unit Tests
########################################
# use system pre-installed Boost libraries
# Todo: use our own version of boost libraries
BOOSTLIB_PATH = /usr/lib/x86_64-linux-gnu
BOOSTLIBS := boost_unit_test_framework boost_filesystem boost_system
UNITTEST_EVAL_SRC = \
$(SOURCEDIR)/../Tests/UnitTests/EvalTests/EvalExtendedTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/EvalTests/stdafx.cpp
UNITTEST_EVAL_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_EVAL_SRC))
UNITTEST_EVAL := $(BINDIR)/evaltests
# Temporarily do not build unit tests, as the docker image does not include boost.
#ALL += $(UNITTEST_EVAL)
#SRC += $(UNITTEST_EVAL_SRC)
$(UNITTEST_EVAL) : $(UNITTEST_EVAL_OBJ) | $(EVAL_LIB) $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(LIBDIR) $(BOOSTLIB_PATH)) -o $@ $^ $(patsubst %, -l%, $(BOOSTLIBS)) -l$(EVAL) -l$(CNTKMATH)
#TODO: create project specific makefile or rules to avoid adding project specific path to the global path
INCLUDEPATH += $(SOURCEDIR)/Readers/CNTKTextFormatReader
UNITTEST_READER_SRC = \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/CNTKTextFormatReaderTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/HTKLMFReaderTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/ImageReaderTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/ReaderLibTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/UCIFastReaderTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/ReaderTests/stdafx.cpp \
$(SOURCEDIR)/Readers/CNTKTextFormatReader/Indexer.cpp \
$(SOURCEDIR)/Readers/CNTKTextFormatReader/TextParser.cpp \
UNITTEST_READER_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_READER_SRC))
UNITTEST_READER := $(BINDIR)/readertests
# Temporarily do not build unit tests, as the docker image does not include boost.
#ALL += $(UNITTEST_READER)
#SRC += $(UNITTEST_READER_SRC)
$(UNITTEST_READER): $(UNITTEST_READER_OBJ) | $(HTKMLFREADER) $(HTKDESERIALIZERS) $(UCIFASTREADER) $(COMPOSITEDATAREADER) $(IMAGEREADER) $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(LIBDIR) $(BOOSTLIB_PATH)) -o $@ $^ $(patsubst %, -l%, $(BOOSTLIBS)) -l$(CNTKMATH)
UNITTEST_NETWORK_SRC = \
$(SOURCEDIR)/../Tests/UnitTests/NetworkTests/OperatorEvaluation.cpp \
$(SOURCEDIR)/../Tests/UnitTests/NetworkTests/stdafx.cpp \
$(SOURCEDIR)/CNTK/ModelEditLanguage.cpp \
$(SOURCEDIR)/ActionsLib/TrainActions.cpp \
$(SOURCEDIR)/ActionsLib/EvalActions.cpp \
$(SOURCEDIR)/ActionsLib/OtherActions.cpp \
$(SOURCEDIR)/ActionsLib/SpecialPurposeActions.cpp \
$(SOURCEDIR)/ActionsLib/NetworkFactory.cpp \
$(SOURCEDIR)/ActionsLib/NetworkDescriptionLanguage.cpp \
$(SOURCEDIR)/ActionsLib/SimpleNetworkBuilder.cpp \
$(SOURCEDIR)/ActionsLib/NDLNetworkBuilder.cpp \
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptEvaluator.cpp \
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptParser.cpp \
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptTest.cpp \
UNITTEST_NETWORK_SRC += $(COMPUTATION_NETWORK_LIB_SRC)
UNITTEST_NETWORK_SRC += $(CNTK_COMMON_SRC)
UNITTEST_NETWORK_SRC += $(SEQUENCE_TRAINING_LIB_SRC)
UNITTEST_NETWORK_SRC += $(SGDLIB_SRC)
UNITTEST_NETWORK_OBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_NETWORK_SRC)))
UNITTEST_NETWORK := $(BINDIR)/networktests
# Temporarily do not build unit tests, as the docker image does not include boost.
#ALL += $(UNITTEST_NETWORK)
#SRC += $(UNITTEST_NETWORK_SRC)
$(UNITTEST_NETWORK): $(UNITTEST_NETWORK_OBJ) | $(CNTKMATH_LIB) $(CNTKTEXTFORMATREADER)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(NVMLLIBPATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(LIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(patsubst %, -l%, $(BOOSTLIBS)) -l$(CNTKMATH) $(LIBS)
UNITTEST_MATH_SRC = \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/BatchNormalizationEngineTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/BlockMultiplierTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/constants.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/ConvolutionEngineTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/CPUMatrixTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/CPUSparseMatrixTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/fixtures.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/GPUMatrixCudaBlasTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/GPUMatrixTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/GPUSparseMatrixTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixBlasTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixDataSynchronizationTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixFileWriteReadTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixQuantizerTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixSparseDenseInteractionsTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/stdafx.cpp \
UNITTEST_MATH_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_MATH_SRC))
UNITTEST_MATH := $(BINDIR)/mathtests
# Temporarily do not build unit tests, as the docker image does not include boost.
#ALL += $(UNITTEST_MATH)
#SRC += $(UNITTEST_MATH_SRC)
$(UNITTEST_MATH): $(UNITTEST_MATH_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(NVMLLIBPATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(LIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(patsubst %, -l%, $(BOOSTLIBS)) $(LIBS) -l$(CNTKMATH)
unittests: $(UNITTEST_EVAL) $(UNITTEST_READER) $(UNITTEST_NETWORK) $(UNITTEST_MATH)
########################################
# General compile and dependency rules
########################################
@ -755,13 +944,13 @@ $(OBJDIR)/%.o : %.cu $(BUILD_CONFIGURATION)
@mkdir -p $(dir $@)
$(NVCC) -c $< -o $@ $(COMMON_FLAGS) $(CUFLAGS) $(INCLUDEPATH:%=-I%) -Xcompiler "-fPIC -Werror"
$(OBJDIR)/%.o : %.cpp $(BUILD_CONFIGURATION)
$(OBJDIR)/%.o : %.cpp $(BUILD_CONFIGURATION)
@echo $(SEPARATOR)
@echo creating $@ for $(ARCH) with build type $(BUILDTYPE)
@mkdir -p $(dir $@)
$(CXX) -c $< -o $@ $(COMMON_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(INCLUDEPATH:%=-I%) -MD -MP -MF ${@:.o=.d}
.PHONY: clean buildall all
.PHONY: clean buildall all unittests
clean:
@echo $(SEPARATOR)


@ -1,18 +1,17 @@
# CNTK
## Latest news
*2016-06-16.* V 1.5 Binary release. NuGet Package with CNTK Model Evaluation Libraries.
NuGet Package is added to CNTK v.1.5 binaries. See [CNTK Releases page](https://github.com/Microsoft/CNTK/releases) and [NuGet Package description](https://github.com/Microsoft/CNTK/wiki/Nuget-Package-for-Evaluation).
*2016-07-15.* V 1.6 Binary release
CNTK v.1.6 binaries are on the [CNTK Releases page](https://github.com/Microsoft/CNTK/releases)
*2016-06-15.* CNTK now supports building against a custom Intel® Math Kernel Library (MKL).
See [setup instructions](https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-your-machine) on how to set this up for your platform.
*2016-07-12.* We have further expanded Licensing options for CNTK 1bit-SGD and related components. See the details at the [Wiki page](https://github.com/microsoft/cntk/wiki/CNTK-1bit-SGD-License). These new options are an extension of the new CNTK 1bit-SGD License that we have announced on Jun 23, 2016.
*2016-06-10.* See CNTK v.1.5 binary release announcement in the official [Microsoft Research Blog](https://blogs.msdn.microsoft.com/msr_er/2016/06/10/microsoft-improves-programming-flexibility-of-its-ai-toolkit/)
*2016-07-05.* CNTK now supports *Deconvolution* and *Unpooling*. See the usage example in the Network number 4 in [MNIST Sample](https://github.com/Microsoft/CNTK/blob/master/Examples/Image/MNIST/README.md).
*2016-06-08.* V 1.5 Binary release
CNTK v.1.5 binaries are on the [CNTK Releases page](https://github.com/Microsoft/CNTK/releases)
*2016-06-23.* New License Terms for CNTK 1bit-SGD and related components.
Effective immediately the License Terms for CNTK 1bit-SGD and related components have changed. The new Terms provide more flexibility and enable new usage scenarios, especially in commercial environments. Read the new Terms at the [standard location](https://cntk1bitsgd.codeplex.com/license). Please note, that while the new Terms are significantly more flexible comparing to the previous ones, they are still **more restrictive** than the main CNTK License. Consequently everything described in [Enabling 1bit-SGD](https://github.com/Microsoft/CNTK/wiki/Enabling-1bit-SGD) section of the Wiki remains valid.
*2016-06-01.* An updated version of the network-description language has been made available under the new [BrainScript Network Builder](https://github.com/Microsoft/CNTK/wiki/BrainScript-Network-Builder), which features full expression parsing, recursive functions, and more.
*2016-06-20.* A [post](http://itpeernetwork.intel.com/accelerating-the-computational-network-tool-kit-with-intel-mkl/) on Intel MKL and CNTK is published in the [Intel IT Peer Network](http://itpeernetwork.intel.com/accelerating-the-computational-network-tool-kit-with-intel-mkl/)
See [all news](https://github.com/Microsoft/CNTK/wiki/News).
@ -45,3 +44,8 @@ Amit Agarwal, Eldar Akchurin, Chris Basoglu, Guoguo Chen, Scott Cyphers, Jasha D
## Disclaimer
CNTK is in active use at Microsoft and constantly evolving. There will be bugs.
## Microsoft Open Source Code of Conduct
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.

Scripts/README.md Normal file

@ -0,0 +1,24 @@
This directory contains scripts that help with using different components of CNTK.
### CNTK Text format Converters
Two Python scripts for converting data to CNTK Text format for use as input to the CNTK Text Format Reader (see https://github.com/microsoft/cnTK/wiki/CNTKTextFormat-Reader).
```
txt2ctf.py
```
Converts a set of dictionary files and a plain text file to CNTK Text format. Run ```python txt2ctf.py -h``` to see usage instructions. See the comments at the beginning of the script file for a specific usage example.
```
uci2ctf.py
```
Converts data stored in a text file in UCI format to CNTK Text format. Run ```python uci2ctf.py -h``` to see usage instructions, or see the usage example below:
```
python Scripts/uci2ctf.py --input_file Examples/Image/MNIST/Data/Train-28x28.txt --features_start 1 --features_dim 784 --labels_start 0 --labels_dim 1 --num_labels 10 --output_file Examples/Image/MNIST/Data/Train-28x28_cntk_text.txt
```
```input_file``` – original dataset in the (columnar) UCI format
```features_start``` – index of the first feature column (start parameter in the UCIFastReader config, see https://github.com/Microsoft/CNTK/wiki/UCI-Fast-Reader)
```features_dim``` – number of feature columns (dim parameter in the UCIFastReader config)
```labels_start``` - index of the first label column
```labels_dim``` – number of label columns
```num_labels``` – number of possible label values (labelDim parameter in the UCIFastReader config)
```output_file``` – path and filename of the resulting dataset.
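For reference, a minimal sketch of the transformation these parameters describe is shown below. It is an illustration only, not the actual script: the helper name `uci_to_ctf_line` is made up, the defaults mirror the MNIST command above, and the output layout (a dense one-hot `|labels` vector followed by dense `|features`) follows the MNIST CNTK text file; uci2ctf.py itself supports more options.
```python
# Illustration only (hypothetical helper, not uci2ctf.py itself): convert one UCI row
# with the label in column 0 and 784 feature columns starting at column 1 into a
# CNTK Text format line with a one-hot label vector and dense features.
def uci_to_ctf_line(row, labels_start=0, features_start=1, features_dim=784, num_labels=10):
    cols = row.split()
    label = int(cols[labels_start])
    one_hot = ["1" if i == label else "0" for i in range(num_labels)]
    features = cols[features_start:features_start + features_dim]
    return "|labels " + " ".join(one_hot) + " |features " + " ".join(features)

with open("Train-28x28.txt") as src, open("Train-28x28_cntk_text.txt", "w") as dst:
    for row in src:
        dst.write(uci_to_ctf_line(row) + "\n")
```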

@ -1 +1 @@
Subproject commit 18fcb1a9378432ae179948b0f1e281115a2c7d86
Subproject commit f7afb8c6a08a6652d84de1b62377175788be5284


@ -191,6 +191,7 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable)
else if (EqualInsensitive(nodeType, OperationNameOf(KhatriRaoProductNode), L"ColumnwiseCrossProduct")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LearnableParameter), L"Parameter")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LogNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LogPlusNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LogSoftmaxNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LogisticNode), L"Logistic")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LookupTableNode))) ret = true;


@ -53,7 +53,6 @@ public:
__declspec_noreturn static inline void EvaluationError(const wstring &msg, TextLocation where)
{
//Microsoft::MSR::CNTK::DebugUtil::PrintCallStack();
throw EvaluationException(msg, where);
}


@ -89,9 +89,18 @@ struct Issue
// Because it is often hard to recognize an issue only from the point where it occurred, we also report the history in compact visual form.
// Since often multiple contexts are on the same source line, we only print each source line once in a consecutive row of contexts.
/*static*/ void TextLocation::PrintIssue(const vector<TextLocation>& locations, const wchar_t* errorKind, const wchar_t* kind, const wchar_t* what)
{
wstring error = CreateIssueMessage(locations, errorKind, kind, what);
fprintf(stderr, "%ls", error.c_str());
fflush(stderr);
}
/*static*/ wstring TextLocation::CreateIssueMessage(const vector<TextLocation>& locations, const wchar_t* errorKind, const wchar_t* kind, const wchar_t* what)
{
vector<Issue> issues; // tracing the error backwards
size_t symbolIndex = 0;
wstring message;
for (size_t n = 0; n < locations.size(); n++)
{
let& location = locations[n];
@ -125,20 +134,23 @@ struct Issue
if (!locations.empty()) // (be resilient to some throwers not having a TextLocation; to be avoided)
{
let& firstLoc = issues.front().location;
fprintf(stderr, "[CALL STACK]\n");
message += wstrprintf(L"[CALL STACK]\n");
for (auto i = issues.rbegin(); i != issues.rend(); i++)
{
let& issue = *i;
auto& where = issue.location;
const auto& lines = where.GetSourceFile().lines;
const auto line = (where.lineNo == lines.size()) ? L"(end)" : lines[where.lineNo].c_str();
fprintf(stderr, " %ls\n %ls\n", line, issue.markup.c_str());
message += wstrprintf(L" %ls\n %ls\n", line, issue.markup.c_str());
}
fprintf(stderr, "%ls while %ls: %ls(%d)", errorKind, kind, firstLoc.GetSourceFile().path.c_str(), (int)firstLoc.lineNo + 1 /*report 1-based*/);
message += wstrprintf(L"%ls while %ls: %ls(%d)", errorKind, kind, firstLoc.GetSourceFile().path.c_str(), (int)firstLoc.lineNo + 1 /*report 1-based*/);
}
else
fprintf(stderr, "%ls while %ls", errorKind, kind);
fprintf(stderr, ": %ls\n", what), fflush(stderr);
{
message += wstrprintf(L"%ls while %ls", errorKind, kind);
}
message += wstrprintf(L": %ls\n", what);
return message;
}
/*static*/ vector<SourceFile> TextLocation::sourceFileMap;


@ -37,6 +37,7 @@ struct TextLocation // position in the text. Lightweight value struct that we ca
// helpers for pretty-printing errors: Show source-code line with ...^ under it to mark up the point of error
static void PrintIssue(const vector<TextLocation>& locations, const wchar_t* errorKind, const wchar_t* kind, const wchar_t* what);
static std::wstring CreateIssueMessage(const vector<TextLocation>& locations, const wchar_t* errorKind, const wchar_t* kind, const wchar_t* what);
static void Trace(TextLocation, const wchar_t* traceKind, const wchar_t* op, const wchar_t* exprPath);
// construction
@ -77,8 +78,12 @@ public:
} // where the error happened
virtual const wchar_t* kind() const = 0; // e.g. "warning" or "error"
wstring GetError(const std::wstring& linePrefix) const override
{
return TextLocation::CreateIssueMessage(locations, linePrefix.c_str(), kind(), msra::strfun::utf16(what()).c_str());
}
// pretty-print this as an error message
void /*ScriptingException::*/ PrintError(const std::wstring& linePrefix) const
void /*ScriptingException::*/ PrintError(const std::wstring& linePrefix) const override
{
TextLocation::PrintIssue(locations, linePrefix.c_str(), kind(), msra::strfun::utf16(what()).c_str());
}


@ -26,15 +26,36 @@ IntDiv(x, y) = new NumericFunction [ what = 'IntDiv' ; args = (x:y) ]
##############################################################################
# comparison functions
# aliases
##############################################################################
Less = CNTK2.Less
Equal = CNTK2.Equal
Greater = CNTK2.Greater
GreaterEqual = CNTK2.GreaterEqual
NotEqual = CNTK2.NotEqual
LessEqual = CNTK2.LessEqual
Less = CNTK2.Less
Equal = CNTK2.Equal
Greater = CNTK2.Greater
GreaterEqual = CNTK2.GreaterEqual
NotEqual = CNTK2.NotEqual
LessEqual = CNTK2.LessEqual
Splice = CNTK2.Splice
TransposeDimensions = CNTK2.TransposeDimensions
Times = CNTK2.Times
Abs = CNTK2.Abs
Ceil = CNTK2.Ceil
CrossEntropyWithSoftmax = CNTK2.CrossEntropyWithSoftmax
Dropout = CNTK2.Dropout
ElementTimes = CNTK2.ElementTimes
ElementDivide = CNTK2.ElementDivide
ErrorPrediction = CNTK2.ErrorPrediction
Exp = CNTK2.Exp
Floor = CNTK2.Floor
Log = CNTK2.Log
Minus = CNTK2.Minus
Pass = CNTK2.Identity
Plus = CNTK2.Plus
RectifiedLinear = CNTK2.Relu
ReduceSum = CNTK2.ReduceSum
ReduceLogSum = CNTK2.ReduceLogSum
Round = CNTK2.Round
Sigmoid = CNTK2.Sigmoid
##############################################################################
# ComputationNodes
@ -87,14 +108,14 @@ CNTK2 = [
else new ComputationNode [ operation = 'Slice' ; inputs = _ /*plus the function args*/ ] # non-time axis
Splice (_, axis=1, tag='') = # TODO: This is a workaround. RowStack itself shall interpret 'axis' and be renamed to Splice().
if axis < 1 then Fail('Splice does not yet implement splicing the time axis.')
else if axis == 1 then [tag1=tag; out = RowStack (_, tag=tag1)].out
else [ # workaround: swap 'axis' to first position, RowStack, swap back
ArrayTransposeDimensions (_, axis1, axis2) = [ # transpose each element of a BS array
inputsT[i:0..Length(_)-1] = TransposeDimensions (_[i], axis1, axis2)
].inputsT
out = [tag1=tag; out=TransposeDimensions (RowStack (ArrayTransposeDimensions (_, 1, axis)), 1, axis, tag=tag)].out
].out
if axis < 1 then Fail('Splice does not yet implement splicing the time axis.')
else if axis == 1 then [tag1=tag; out = RowStack (_, tag=tag1)].out
else [ # workaround: swap 'axis' to first position, RowStack, swap back
ArrayTransposeDimensions (_, axis1, axis2) = [ # transpose each element of a BS array
inputsT[i:0..Length(_)-1] = TransposeDimensions (_[i], axis1, axis2)
].inputsT
out = [tag1=tag; out=TransposeDimensions (RowStack (ArrayTransposeDimensions (_, 1, axis)), 1, axis, tag=tag)].out
].out
// Swap two axes of a tensor
TransposeDimensions(_, axis1, axis2, tag='') = new ComputationNode [ operation = 'TransposeDimensions' ; inputs = _ /*plus the function args*/ ]
@ -120,9 +141,11 @@ CNTK2 = [
Square(_, tag='') = ElementTimes(_, _, tag=tag)
Tanh(_, tag='') = new ComputationNode [ operation = 'Tanh' ; inputs = _ /*plus the function args*/ ]
// 6. Reductions
// None so far
// 6. Reductions
# the following is a temporary workaround until we have the C++ version
ReduceLogSum (_, axis=0, tag='') = if axis != 0 then Fail("ReduceLogSum for now only supports axis=0.")
else [ tag1=tag ; axis1=axis ; out = RowSlice (0, 1, _ - LogSoftmax (_), tag=tag1) ].out
ReduceSum (_, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = _ ; reductionOp = "Sum" /*plus the function args*/ ]
// 7. Control flow (if, composite etc.)
// None so far
@ -147,7 +170,7 @@ CNTK2 = [
CrossEntropyWithSoftmax(_, outProbVectorSequence, tag='') = new ComputationNode [ operation = 'CrossEntropyWithSoftmax' ; inputs = (_ : outProbVectorSequence) /*plus the function args*/ ]
ErrorPrediction(_, outVectorSequence, topN=1, tag='') = new ComputationNode [ operation = 'ErrorPrediction' ; inputs = if topN == 1 then (_ : outVectorSequence) else (_ : outVectorSequence : Constant (topN)) /*plus the function args*/ ]
// 13. Comparison nodes
// 12. Comparison nodes
Less(_, y, tag='') = new ComputationNode [ operation = 'Less' ; inputs = (_ : y) /*plus the function args*/ ]
Equal(_, y, tag='') = new ComputationNode [ operation = 'Equal' ; inputs = (_ : y) /*plus the function args*/ ]
Greater(_, y, tag='') = new ComputationNode [ operation = 'Greater' ; inputs = (_ : y) /*plus the function args*/ ]
@ -155,8 +178,7 @@ CNTK2 = [
NotEqual(_, y, tag='') = new ComputationNode [ operation = 'NotEqual' ; inputs = (_ : y) /*plus the function args*/ ]
LessEqual(_, y, tag='') = new ComputationNode [ operation = 'LessEqual' ; inputs = (_ : y) /*plus the function args*/ ]
// 13. Others
// 12. Others
// 13. Others
Identity(_, tag='') = new ComputationNode [ operation = 'Pass' ; inputs = _ /*plus the function args*/ ]
]
@ -181,19 +203,6 @@ Shift(input, fromOffset, boundaryValue, boundaryMode=-1/*context*/, dim=-1, tag=
RowSlice(beginIndex, numRows, input, tag='') = Slice(beginIndex, beginIndex + numRows, input, axis = 1)
RowRepeat(input, numRepeats, tag='') = new ComputationNode [ operation = 'RowRepeat' ; inputs = input /*plus the function args*/ ]
RowStack(inputs, tag='') = new ComputationNode [ operation = 'RowStack' /*plus the function args*/ ]
Splice (inputs, axis=1, tag='') = # TODO: This is a workaround. RowStack itself shall interpret 'axis' and be renamed to Splice().
if axis < 1 then Fail('Splice does not yet implement splicing the time axis.')
else if axis == 1 then [tag1=tag; out = RowStack (inputs, tag=tag1)].out
else [ # workaround: swap 'axis' to first position, RowStack, swap back
ArrayTransposeDimensions (inputs, axis1, axis2) = [ # transpose each element of a BS array
inputsT[i:0..Length(inputs)-1] = TransposeDimensions (inputs[i], axis1, axis2)
].inputsT
out = [tag1=tag; out=TransposeDimensions (RowStack (ArrayTransposeDimensions (inputs, 1, axis)), 1, axis, tag=tag)].out
].out
Reshape(input, numRows, imageWidth = 0, imageHeight = 0, imageChannels = 0, tag='') = new ComputationNode [ operation = 'LegacyReshape' ; inputs = input /*plus the function args*/ ]
NewReshape(input, dims, beginAxis=0, endAxis=0, tag='') = new ComputationNode [ operation = 'Reshape' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]
ReshapeDimension(x, axis, tensorShape) = NewReshape(x, tensorShape, beginAxis=axis, endAxis=axis + 1)
FlattenDimensions(x, axis, num) = NewReshape(x, 0, beginAxis=axis, endAxis=axis + num)
Slice(beginIndex, endIndex, input, axis=1, tag='') =
if axis < 0 then [ # time axis: specify -1
beginFlags = if beginIndex > 0 then BS.Boolean.Not (BS.Loop.IsFirstN (beginIndex, input)) else BS.Loop.IsLastN (-beginIndex, input)
@ -206,11 +215,13 @@ Slice(beginIndex, endIndex, input, axis=1, tag='') =
else BS.Sequences.Gather (flags, input)
].out
else new ComputationNode [ operation = 'Slice' ; inputs = input /*plus the function args*/ ] # non-time axis
Reshape(input, numRows, imageWidth = 0, imageHeight = 0, imageChannels = 0, tag='') = new ComputationNode [ operation = 'LegacyReshape' ; inputs = input /*plus the function args*/ ]
NewReshape(input, dims, beginAxis=0, endAxis=0, tag='') = new ComputationNode [ operation = 'Reshape' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]
ReshapeDimension(x, axis, tensorShape) = NewReshape(x, tensorShape, beginAxis=axis, endAxis=axis + 1)
FlattenDimensions(x, axis, num) = NewReshape(x, 0, beginAxis=axis, endAxis=axis + num)
SplitDimension(x, axis, N) = ReshapeDimension(x, axis, 0:N)
TransposeDimensions(input, axis1, axis2, tag='') = new ComputationNode [ operation = 'TransposeDimensions' ; inputs = input /*plus the function args*/ ]
# TODO: make input the last arg!
Transpose(x) = TransposeDimensions(x, 1, 2)
Times(A, B, outputRank=1, tag='') = new ComputationNode [ operation = 'Times' ; inputs = ( A : B ) /*plus the function args*/ ]
Logistic(label, probability, tag='') = new ComputationNode [ operation = 'Logistic' ; inputs = (label : probability) /*plus the function args*/ ]
WeightedLogistic(label, probability, instanceWeight, tag='') = new ComputationNode [ operation = 'Logistic' ; inputs = (label : probability : instanceWeight) /*plus the function args*/ ]
ReconcileDynamicAxis(dataInput, layoutInput, tag='') = new ComputationNode [ operation = 'ReconcileDynamicAxis' ; inputs = (dataInput : layoutInput) /*plus the function args*/ ]
@ -228,8 +239,6 @@ ClassificationError = ErrorPrediction
Delay = PastValue
BatchNormalization(input, scale, bias, runMean, runInvStdDev, spatial, normalizationTimeConstant = 0, blendTimeConstant = 0, epsilon = 0.00001, useCntkEngine = true, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'BatchNormalization' ; inputs = (input : scale : bias : runMean : runInvStdDev) /*plus the function args*/ ]
Abs(x, tag='') = new ComputationNode [ operation = 'Abs' ; inputs = x /*plus the function args*/ ]
Ceil(x, tag='') = Negate(Floor(Negate(x)), tag=tag)
ClassBasedCrossEntropyWithSoftmax(labelClassDescriptorVectorSequence, mainInputInfo, mainWeight, classLogProbsBeforeSoftmax, tag='') = new ComputationNode [ operation = 'ClassBasedCrossEntropyWithSoftmax' ; inputs = (labelClassDescriptorVectorSequence : mainInputInfo : mainWeight : classLogProbsBeforeSoftmax) /*plus the function args*/ ]
Clip(minValue, maxValue, x, tag='') = new ComputationNode [ operation = 'Clip' ; inputs = (minValue : maxValue : x) /* plus the function args*/ ]
ColumnElementTimes(aVectorSequence, anotherVectorSequence, tag='') = new ComputationNode [ operation = 'ColumnElementTimes' ; inputs = (aVectorSequence : anotherVectorSequence) /*plus the function args*/ ]
@ -238,50 +247,33 @@ CosDistance(aVectorSequence, anotherVectorSequence, tag='') = new ComputationNod
CosDistanceWithNegativeSamples(aVectorSequence, anotherVectorSequence, numShifts, numNegSamples, tag='') = new ComputationNode [ operation = 'CosDistanceWithNegativeSamples' ; inputs = (aVectorSequence : anotherVectorSequence : numShifts : numNegSamples) /*plus the function args*/ ]
Cosine(x, tag='') = new ComputationNode [ operation = 'Cosine' ; inputs = x /*plus the function args*/ ]
CrossEntropy(refProbVectorSequence, outProbVectorSequence, tag='') = new ComputationNode [ operation = 'CrossEntropy' ; inputs = (refProbVectorSequence : outProbVectorSequence) /*plus the function args*/ ]
CrossEntropyWithSoftmax(labelVectorSequence, outProbVectorSequence, tag='') = new ComputationNode [ operation = 'CrossEntropyWithSoftmax' ; inputs = (labelVectorSequence : outProbVectorSequence) /*plus the function args*/ ]
# once ReduceLogSum becomes proper C++, CrossEntropyWithSoftmax() will become this:
NewCrossEntropyWithSoftmax (labelSequence, z, tag='') = [ tag1 = tag; out = Minus (ReduceLogSum (z), ReduceSum (labelSequence .* z), tag=tag1) ].out
DiagTimes(diagonalMatrixAsColumnVector, matrix, tag='') = new ComputationNode [ operation = 'DiagTimes' ; inputs = (diagonalMatrixAsColumnVector : matrix) /*plus the function args*/ ]
// TODO: DiagTimes = ElementTimes
Dropout(activationVectorSequence, tag='') = new ComputationNode [ operation = 'Dropout' ; inputs = activationVectorSequence /*plus the function args*/ ]
ElementTimes(aMatrix, anotherMatrix, tag='') = new ComputationNode [ operation = 'ElementTimes' ; inputs = (aMatrix : anotherMatrix) /*plus the function args*/ ]
ElementDivide(aMatrix, anotherMatrix, tag='') = ElementTimes(aMatrix, Reciprocal(anotherMatrix), tag=tag)
ErrorPrediction = CNTK2.ErrorPrediction
Exp(x, tag='') = new ComputationNode [ operation = 'Exp' ; inputs = x /*plus the function args*/ ]
Floor(x, tag='') = new ComputationNode [ operation = 'Floor' ; inputs = x /*plus the function args*/ ]
GatherPacked(indexSequence, sourceData, tag='') = new ComputationNode [ operation = 'GatherPacked' ; inputs = (indexSequence : sourceData) /*plus the function args*/ ]
GMMLogLikelihood(unnormalizedPriorVector, meansAsRows, logStdDevAsRows, dataVectorSequence, tag='') = new ComputationNode [ operation = 'GMMLogLikelihood' ; inputs = (unnormalizedPriorVector : meansAsRows : logStdDevAsRows : dataVectorSequence) /*plus the function args*/ ]
InvStdDev(dataVectorSequence, tag='') = new ComputationNode [ operation = 'InvStdDev' ; inputs = dataVectorSequence /*plus the function args*/ ]
KhatriRaoProduct(leftMatrix, rightMatrix, tag='') = new ComputationNode [ operation = 'KhatriRaoProduct' ; inputs = (leftMatrix : rightMatrix) /*plus the function args*/ ]
Log(x, tag='') = new ComputationNode [ operation = 'Log' ; inputs = x /*plus the function args*/ ]
LogPlus(leftMatrix, rightMatrix, tag='') = new ComputationNode [ operation = 'LogPlus' ; inputs = (leftMatrix : rightMatrix) /*plus the function args*/ ]
LogSoftmax(z, tag='') = new ComputationNode [ operation = 'LogSoftmax' ; inputs = z /*plus the function args*/ ]
# TODO: ^^ along axis, like Softmax
MatrixL1Reg(matrix, tag='') = new ComputationNode [ operation = 'MatrixL1Reg' ; inputs = matrix /*plus the function args*/ ]
MatrixL2Reg(matrix, tag='') = new ComputationNode [ operation = 'MatrixL2Reg' ; inputs = matrix /*plus the function args*/ ]
Mean(dataVectorSequence, tag='') = new ComputationNode [ operation = 'Mean' ; inputs = dataVectorSequence /*plus the function args*/ ]
Minus(leftMatrix, rightMatrix, tag='') = new ComputationNode [ operation = 'Minus' ; inputs = (leftMatrix : rightMatrix) /*plus the function args*/ ]
Negate(input, tag='') = new ComputationNode [ operation = 'Negate' ; inputs = input /*plus the function args*/ ]
PackedIndex(targetObject, indexSequence, tag='') = new ComputationNode [ operation = 'PackedIndex' ; inputs = (targetObject : indexSequence) /*plus the function args*/ ]
Pass(x, tag='') = new ComputationNode [ operation = 'Pass' ; inputs = x /*plus the function args*/ ]
PerDimMeanVarDeNormalization(dataVectorSequence, meanVector, invStdDevVector, tag='') = new ComputationNode [ operation = 'PerDimMeanVarDeNormalization' ; inputs = (dataVectorSequence : meanVector : invStdDevVector) /*plus the function args*/ ]
PerDimMeanVarNormalization(dataVectorSequence, meanVector, invStdDevVector, tag='') = new ComputationNode [ operation = 'PerDimMeanVarNormalization' ; inputs = (dataVectorSequence : meanVector : invStdDevVector) /*plus the function args*/ ]
Plus(leftMatrix, rightMatrix, tag='') = new ComputationNode [ operation = 'Plus' ; inputs = (leftMatrix : rightMatrix) /*plus the function args*/ ]
Reciprocal(z, tag='') = new ComputationNode [ operation = 'Reciprocal' ; inputs = z /*plus the function args*/ ]
RectifiedLinear(z, tag='') = new ComputationNode [ operation = 'RectifiedLinear' ; inputs = z /*plus the function args*/ ]
ReduceSum (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "Sum" /*plus the function args*/ ]
# the following is a temporary workaround until we have the C++ version
ReduceLogSum (z, axis=0, tag='') = if axis != 0 then Fail("ReduceLogSum for now only supports axis=0.")
else [ tag1=tag ; axis1=axis ; out = RowSlice (0, 1, z - LogSoftmax (z), tag=tag1) ].out
//# the following is a temporary workaround until we have the C++ version
#ReduceLogSum (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "LogSum" /*plus the function args*/ ]
#ReduceMean (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "Mean" /*plus the function args*/ ]
#ReduceMax (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "Max" /*plus the function args*/ ]
#ReduceMin (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "Min" /*plus the function args*/ ]
Round(x, tag='') = Floor(Plus(x, ConstantTensor(0.5, (1))), tag=tag)
Scale(scalarScalingFactor, matrix, tag='') = new ComputationNode [ operation = 'Scale' ; inputs = (scalarScalingFactor : matrix) /*plus the function args*/ ]
# TODO: Scale = ElementTimes
ScatterPacked(cond, indexSequence, sourceData, tag='') = new ComputationNode [ operation = 'ScatterPacked' ; inputs = (cond : indexSequence : sourceData) /*plus the function args*/ ]
Sigmoid(z, tag='') = new ComputationNode [ operation = 'Sigmoid' ; inputs = z /*plus the function args*/ ]
Sin(z, tag='') = new ComputationNode [ operation = 'Sin' ; inputs = z /*plus the function args*/ ]
Softmax (z, axis=0, tag='') = # TODO: replace this with more efficient version below once we have ReduceLogSum
if axis == 0 then new ComputationNode [ operation = 'Softmax' ; inputs = z /*plus the function args*/ ]


@ -117,6 +117,23 @@ size_t GetMaxEpochs(const ConfigParameters& configParams)
return maxEpochs;
}
#ifndef CPUONLY
// abort execution if the GPU is not supported (e.g. compute capability not supported)
void CheckSupportForGpu(DEVICEID_TYPE deviceId)
{
auto gpuData = GetGpuData(deviceId);
if (gpuData.validity == GpuValidity::ComputeCapabilityNotSupported)
{
InvalidArgument("CNTK: The GPU (%s) has compute capability %d.%d. CNTK is only supported on GPUs with compute capability 3.0 or greater",
gpuData.name.c_str(), gpuData.versionMajor, gpuData.versionMinor);
}
else if (gpuData.validity == GpuValidity::UnknownDevice)
{
InvalidArgument("CNTK: Unknown GPU with Device ID %d.", gpuData.deviceId);
}
}
#endif
// special temporary function to guard against a now invalid usage of "truncated" which exists in some IPG production setups
static void DisableLegacyTruncationSettings(const ConfigParameters& TopLevelConfig, const ConfigParameters& commandConfig)
{
@ -370,6 +387,30 @@ void PrintUsageInfo()
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
}
// print gpu info for current gpu devices (e.g. Device[0]: cores = 2496; computeCapability = 5.2; type = "Quadro M4000"; memory = 8192 MB)
void PrintGpuInfo()
{
#ifndef CPUONLY
std::vector<GpuData> gpusData = GetAllGpusData();
if (gpusData.empty())
{
LOGPRINTF(stderr, "No GPUs found\n");
return;
}
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
LOGPRINTF(stderr, "GPU info:\n\n");
for (GpuData& data : gpusData)
{
LOGPRINTF(stderr, "\t\tDevice[%d]: cores = %d; computeCapability = %d.%d; type = \"%s\"; memory = %lu MB\n",
data.deviceId, data.cudaCores, data.versionMajor, data.versionMinor, data.name.c_str(), data.totalMemory);
}
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
#endif
}
// ---------------------------------------------------------------------------
// main() for use with BrainScript
// ---------------------------------------------------------------------------
@ -461,6 +502,21 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
let valp = BS::Evaluate(expr); // evaluate parse into a dictionary
let& config = valp.AsRef<ScriptableObjects::IConfigRecord>(); // this is the dictionary
#ifndef CPUONLY
auto valpp = config.Find(L"deviceId");
if (valpp)
{
auto valp = *valpp;
if (!valp.Is<ScriptableObjects::String>()) // if it's not string 'auto' or 'cpu', then it's a gpu
{
if (static_cast<int>(valp) >= 0) // gpu (id >= 0)
{
CheckSupportForGpu(valp); // throws if gpu is not supported
}
}
}
#endif
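// (For example: deviceId may be the string "auto" or "cpu", or a non-negative GPU id such as 0;
//  only the numeric form reaches CheckSupportForGpu above.)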
// legacy parameters that have changed spelling
if (config.Find(L"DoneFile")) // variables follow camel case (start with lower-case letters)
InvalidArgument("Legacy spelling of 'DoneFile' no longer allowed. Use 'doneFile'.");
@ -499,6 +555,9 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
// echo config info to log
PrintBuiltInfo();
// echo gpu info to log
PrintGpuInfo();
// execute the actions
// std::string type = config(L"precision", "float");
int numCPUThreads = config(L"numCPUThreads", 0);
@ -556,6 +615,18 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])
{
ConfigParameters config;
std::string rawConfigString = ConfigParameters::ParseCommandLine(argc, argv, config); // get the command param set they want
#ifndef CPUONLY
ConfigValue val = config("deviceId", "auto");
if (!EqualCI(val, "cpu") && !EqualCI(val, "auto"))
{
if (static_cast<int>(val) >= 0) // gpu (id >= 0)
{
CheckSupportForGpu(static_cast<int>(val)); // throws if gpu is not supported
}
}
#endif
bool timestamping = config(L"timestamping", false);
if (timestamping)
{
@ -599,6 +670,8 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])
}
PrintBuiltInfo(); // this one goes to log file
PrintGpuInfo();
std::string timestamp = TimeDateStamp();
// dump config info

View file

@ -144,6 +144,7 @@
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\Common\CrossProcessMutex.h" />
<ClInclude Include="..\Common\Include\basetypes.h" />
<ClInclude Include="..\Common\Include\Basics.h" />
<ClInclude Include="..\Common\Include\BestGpu.h" />
<ClInclude Include="..\Common\Include\DataReader.h" />
@ -222,4 +223,4 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" />
</Project>
</Project>

Diff not shown because of its large size. Load diff

View file

@ -47,12 +47,18 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template <typename ElementType>
class ComputationNode;
class File;
}}}
// TODO: The following should be reconciled with the equivalent code in the CNTK implementation
#ifndef _MSC_VER
#define _countof(_Array) (sizeof(_Array) / sizeof(_Array[0]))
static inline wchar_t* _wcsdup(const wchar_t *s)
{
return ::wcsdup(s);
}
#endif
namespace CNTK
@ -131,386 +137,30 @@ namespace CNTK
// Forward declarations
class CompositeFunction;
class Function;
class Variable;
namespace _Internal
// Similar to make_shared except that it associates a custom deleter with the shared_ptr to ensure
// that objects are deleted on the same side of the library DLL where they are allocated
template <typename T, typename ...CtorArgTypes>
inline std::shared_ptr<T> MakeSharedObject(CtorArgTypes&& ...ctorArgs)
{
// A reference counter to be used as the base class for all reference counted types.
class _ReferenceCounter
{
public:
// Constructor.
_ReferenceCounter() : m_rc(0) {}
// Destructor.
virtual ~_ReferenceCounter() {}
// Add a reference.
// Thread-safe.
size_t AddReference()
{
return ++m_rc;
}
// Remove a reference.
// Thread-safe.
size_t RemoveReference()
{
assert(m_rc.load() > 0);
return --m_rc;
}
// Return the reference count value
size_t GetReferenceCount()
{
return m_rc.load();
}
private:
std::atomic<size_t> m_rc;
};
// A smart pointer to a reference counted object
// T must be a type derived from _Reference_counter
template <class T>
class _ReferenceCounterSharedPtr final
{
typedef void(*_ReferenceCounterDeleter)(_ReferenceCounter* obj);
public:
// Constructor
_ReferenceCounterSharedPtr(T* ptr = nullptr, _ReferenceCounterDeleter deleter = nullptr) : m_objPtr(ptr), m_deleter(deleter)
{
Init();
}
// Copy constructor
_ReferenceCounterSharedPtr(const _ReferenceCounterSharedPtr& other) : m_objPtr(nullptr), m_deleter(nullptr)
{
*this = other;
}
// Move constructor
_ReferenceCounterSharedPtr(_ReferenceCounterSharedPtr&& other) : m_objPtr(nullptr), m_deleter(nullptr)
{
*this = std::move(other);
}
// Destructor
~_ReferenceCounterSharedPtr()
{
UnInitialize(m_objPtr, m_deleter);
}
// Assignment operator
_ReferenceCounterSharedPtr& operator=(const _ReferenceCounterSharedPtr& other)
{
if (this != &other)
{
T* oldPtr = m_objPtr;
_ReferenceCounterDeleter oldDeleter = m_deleter;
m_objPtr = other.m_objPtr;
m_deleter = other.m_deleter;
Init();
UnInitialize(oldPtr, oldDeleter);
}
return *this;
}
// Move-assignment operator
_ReferenceCounterSharedPtr& operator=(_ReferenceCounterSharedPtr&& other)
{
assert(this != &other);
T* oldPtr = m_objPtr;
_ReferenceCounterDeleter oldDeleter = m_deleter;
m_objPtr = other.m_objPtr;
m_deleter = other.m_deleter;
// No change to ref-count of the adopted pointer.
other.m_objPtr = nullptr;
other.m_deleter = nullptr;
UnInitialize(oldPtr, oldDeleter);
return *this;
}
// Conversion to a ReferenceCountedSharedPtr instance of a base type
template <typename Base, typename std::enable_if<std::is_base_of<Base, T>::value>::type* = nullptr>
operator _ReferenceCounterSharedPtr<Base>()
{
return _ReferenceCounterSharedPtr<Base>(m_objPtr, m_deleter);
}
T* operator->() const
{
return m_objPtr;
}
T& operator*() const
{
return *m_objPtr;
}
operator T*() const
{
return m_objPtr;
}
T* GetPtr() const
{
return m_objPtr;
}
private:
void Init()
{
static_assert(std::is_base_of<_ReferenceCounter, T>::value, "_ReferenceCounterSharedPtr<T> can only be used when _ReferenceCounter is a base type of T!");
if (m_objPtr != nullptr)
reinterpret_cast<_ReferenceCounter*>(m_objPtr)->AddReference();
}
static void UnInitialize(T* objPtr, _ReferenceCounterDeleter deleter)
{
static_assert(std::is_base_of<_ReferenceCounter, T>::value, "_ReferenceCounterSharedPtr<T> can only be used when _ReferenceCounter is a base type of T!");
if (objPtr != nullptr)
{
size_t refCountRemaining = reinterpret_cast<_ReferenceCounter*>(objPtr)->RemoveReference();
if (refCountRemaining == 0)
{
if (deleter != nullptr)
deleter(reinterpret_cast<_ReferenceCounter*>(objPtr));
else
delete objPtr;
}
}
}
private:
T* m_objPtr;
_ReferenceCounterDeleter m_deleter;
};
template <typename T>
bool operator==(const _ReferenceCounterSharedPtr<T>& first, const _ReferenceCounterSharedPtr<T>& second)
{
return first.GetPtr() == second.GetPtr();
}
// A simple vector implementation with a C ABI to allow usage across the library DLL boundary
// as STL vectors cannot be used across the DLL boundary
template <typename T>
class CNTK_API _SimpleVector final
{
template <typename ValueType>
friend CNTK_API bool operator==(const _SimpleVector<ValueType>& first, const _SimpleVector<ValueType>& second);
friend class CNTK::Function;
public:
_SimpleVector();
_SimpleVector(size_t numElements, const T& initVal = T());
~_SimpleVector();
_SimpleVector(const _SimpleVector& other);
_SimpleVector& operator=(const _SimpleVector& other);
_SimpleVector(_SimpleVector&& other);
_SimpleVector& operator=(_SimpleVector&& other);
T& operator[](size_t idx);
const T& operator[](size_t idx) const;
size_t Size() const;
T* Data();
const T* Data() const;
void PushBack(const T& value);
void PushBack(T&& value);
operator std::vector<T>() const
{
std::vector<T> retVector(Size());
for (size_t i = 0; i < Size(); ++i)
retVector[i] = this->operator[](i);
return retVector;
}
std::unordered_set<T> GetAsUnorderedSet(bool ensureUnique = true)
{
std::unordered_set<T> retSet;
for (size_t i = 0; i < Size(); ++i)
{
auto insertRet = retSet.insert(this->operator[](i));
if (ensureUnique && !insertRet.second)
RuntimeError("A _SimpleVector with duplicate elements cannot be converted to an unordered_set");
}
return retSet;
}
template <typename ContainerType, typename std::enable_if<std::is_same<ContainerType, std::vector<T>>::value ||
std::is_same<ContainerType, std::initializer_list<T>>::value ||
std::is_same<ContainerType, std::array<T, sizeof(ContainerType) / sizeof(T)>>::value>::type* = nullptr>
static _SimpleVector<T> CreateSimpleVector(const ContainerType& initList)
{
_SimpleVector<T> simpleVector(initList.size());
std::copy(initList.begin(), initList.end(), simpleVector.Data());
return simpleVector;
}
private:
std::vector<T>* m_vector;
};
template <typename ValueType>
CNTK_API bool operator==(const _SimpleVector<ValueType>& first, const _SimpleVector<ValueType>& second);
template <typename ValueType>
bool operator!=(const _SimpleVector<ValueType>& first, const _SimpleVector<ValueType>& second)
{
return !(first == second);
}
// A simple set implementation with a C ABI to allow usage across the library DLL boundary
// as STL sets cannot be used across the DLL boundary
template <typename KeyType>
class CNTK_API _SimpleSet final
{
friend class CNTK::CompositeFunction;
template <typename T>
friend CNTK_API bool operator==(const _SimpleSet<T>& first, const _SimpleSet<T>& second);
public:
_SimpleSet();
~_SimpleSet();
_SimpleSet(const _SimpleSet& other);
_SimpleSet& operator=(const _SimpleSet& other);
_SimpleSet(_SimpleSet&& other);
_SimpleSet& operator=(_SimpleSet&& other);
bool Insert(const KeyType& key);
bool Contains(const KeyType& key) const;
size_t Size() const;
operator _SimpleVector<KeyType>() const;
operator std::unordered_set<KeyType>() const
{
return ((_SimpleVector<KeyType>)(*this)).GetAsUnorderedSet();
}
static _SimpleSet<KeyType> CreateSimpleSet(const std::unordered_set<KeyType>& initSet)
{
_SimpleSet<KeyType> simpleSet;
for (auto iter = initSet.begin(); iter != initSet.end(); ++iter)
simpleSet.Insert(*iter);
return simpleSet;
}
private:
std::unordered_set<KeyType>* m_set;
};
template <typename KeyType>
CNTK_API bool operator==(const _SimpleSet<KeyType>& first, const _SimpleSet<KeyType>& second);
template <typename KeyType>
bool operator!=(const _SimpleSet<KeyType>& first, const _SimpleSet<KeyType>& second)
{
return !(first == second);
}
// A simple map implementation with a C ABI to allow usage across the library DLL boundary
// as STL maps cannot be used across the DLL boundary
template <typename KeyType, typename ValueType>
class CNTK_API _SimpleMap final
{
friend class CNTK::CompositeFunction;
friend class CNTK::Function;
public:
_SimpleMap();
~_SimpleMap();
_SimpleMap(const _SimpleMap& other);
_SimpleMap& operator=(const _SimpleMap& other);
_SimpleMap(_SimpleMap&& other);
_SimpleMap& operator=(_SimpleMap&& other);
ValueType& operator[](const KeyType& key);
const ValueType& operator[](const KeyType& key) const;
bool Insert(const KeyType& key, const ValueType& value);
bool Contains(const KeyType& key) const;
size_t Size() const;
_SimpleSet<KeyType> Keys() const;
static _SimpleMap<KeyType, ValueType> CreateSimpleMap(const std::unordered_map<KeyType, ValueType>& initMap)
{
_SimpleMap<KeyType, ValueType> simpleMap;
for (auto iter = initMap.begin(); iter != initMap.end(); ++iter)
simpleMap.Insert(iter->first, iter->second);
return simpleMap;
}
private:
std::unordered_map<KeyType, ValueType>* m_map;
};
auto objPtr = new T(std::forward<CtorArgTypes>(ctorArgs)...);
return std::shared_ptr<T>(objPtr, [](T* ptr) { delete ptr; });
}
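// A minimal usage sketch (the 'Widget' type is illustrative, not from this commit): the deleter
// lambda is captured at the MakeSharedObject call site and travels with the shared_ptr, so the
// matching 'delete' runs in the same module (and thus against the same CRT heap) as the 'new',
// even if the last reference is released inside another DLL.
static void MakeSharedObjectUsageSketch()
{
    struct Widget { int value; explicit Widget(int v) : value(v) {} };
    std::shared_ptr<Widget> w = CNTK::MakeSharedObject<Widget>(42);
    // 'w' may safely cross the DLL boundary; its deleter was baked in at the line above.
}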
// Forward declarations
class NDArrayView;
typedef _Internal::_ReferenceCounterSharedPtr<NDArrayView> NDArrayViewPtr;
typedef std::shared_ptr<NDArrayView> NDArrayViewPtr;
class NDMask;
typedef _Internal::_ReferenceCounterSharedPtr<NDMask> NDMaskPtr;
typedef std::shared_ptr<NDMask> NDMaskPtr;
class Value;
typedef _Internal::_ReferenceCounterSharedPtr<Value> ValuePtr;
typedef std::shared_ptr<Value> ValuePtr;
class Function;
typedef _Internal::_ReferenceCounterSharedPtr<Function> FunctionPtr;
typedef std::shared_ptr<Function> FunctionPtr;
inline wchar_t* CopyString(const wchar_t* source)
{
size_t len = wcslen(source) + 1;
wchar_t* copy = new wchar_t[len];
#ifdef _WIN32
wcscpy_s(copy, len, source);
#else
wcscpy(copy, source);
#endif
return copy;
}
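// A minimal usage sketch (the string literal is illustrative): CopyString returns raw
// 'new wchar_t[]' storage, so the receiver owns the buffer and must release it with delete[].
static void CopyStringUsageSketch()
{
    wchar_t* name = CNTK::CopyString(L"features");
    // ... hand 'name' across the DLL boundary ...
    delete[] name;
}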
}
namespace std {
template <typename T>
struct hash<CNTK::_Internal::_ReferenceCounterSharedPtr<T>>
{
size_t operator()(const CNTK::_Internal::_ReferenceCounterSharedPtr<T>& x) const
{
return std::hash<const void*>()(x.GetPtr());
}
};
class Learner;
typedef std::shared_ptr<Learner> LearnerPtr;
}

View file

@ -128,6 +128,7 @@
<ClInclude Include="API\CNTKLibrary.h" />
<ClInclude Include="API\CNTKLibraryInternals.h" />
<ClInclude Include="Function.h" />
<ClInclude Include="Learner.h" />
<ClInclude Include="Utils.h" />
<ClInclude Include="stdafx.h" />
<ClInclude Include="targetver.h" />
@ -140,6 +141,7 @@
</PrecompiledHeader>
</ClCompile>
<ClCompile Include="Function.cpp" />
<ClCompile Include="Learner.cpp" />
<ClCompile Include="NDArrayView.cpp" />
<ClCompile Include="NDMask.cpp" />
<ClCompile Include="stdafx.cpp">

View file

@ -10,6 +10,7 @@
<ClCompile Include="Variable.cpp" />
<ClCompile Include="Utils.cpp" />
<ClCompile Include="NDMask.cpp" />
<ClCompile Include="Learner.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="stdafx.h" />
@ -22,6 +23,7 @@
<Filter>API</Filter>
</ClInclude>
<ClInclude Include="Function.h" />
<ClInclude Include="Learner.h" />
</ItemGroup>
<ItemGroup>
<Filter Include="API">

View file

@ -14,7 +14,21 @@ namespace CNTK
return GPUDevice(0);
}
/*static*/ Axis Axis::DefaultDynamicAxis = Axis(L"defaultDynamicAxis");
/*static*/ Axis Axis::BatchAxis = Axis(L"batchAxis");
/*static*/ Axis Axis::AllAxes = Axis(L"allAxes");
/*static*/ const Axis& Axis::DefaultDynamicAxis()
{
static Axis s_defaultDynamicAxis(L"defaultDynamicAxis");
return s_defaultDynamicAxis;
}
/*static*/ const Axis& Axis::BatchAxis()
{
static Axis s_batchAxis(L"batchAxis");
return s_batchAxis;
}
/*static*/ const Axis& Axis::AllAxes()
{
static Axis s_allAxes(L"allAxes");
return s_allAxes;
}
}
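// A usage sketch: the well-known axes are now reached through accessors that return
// function-local statics rather than exported static data members (presumably to sidestep
// static initialization order and DLL export issues; that motivation is an assumption,
// the commit does not state it).
static void AxisUsageSketch()
{
    const CNTK::Axis& dynamicAxis = CNTK::Axis::DefaultDynamicAxis();
    const CNTK::Axis& batchAxis   = CNTK::Axis::BatchAxis();
    (void)dynamicAxis; (void)batchAxis; // the returned references remain valid for the program's lifetime
}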

View file

@ -17,40 +17,98 @@ bool g_shareNodeValueMatrices = true;
namespace CNTK
{
_Internal::_SimpleVector<Variable> Function::_Inputs() const
std::shared_ptr<std::vector<Variable>> Function::InputsImpl() const
{
const CompositeFunction* compositeFunction = dynamic_cast<const CompositeFunction*>(this);
std::vector<Variable> inputs;
if (compositeFunction == nullptr)
return m_inputs;
inputs = m_inputs;
else
return _Internal::_SimpleVector<Variable>::CreateSimpleVector(compositeFunction->DetermineInputs());
inputs = compositeFunction->DetermineInputs();
return std::shared_ptr<std::vector<Variable>>(new std::vector<Variable>(std::move(inputs)), [](std::vector<Variable>* ptr) { delete ptr; });
}
/*virtual*/ void Function::_ReplacePlaceholders(const _Internal::_SimpleMap<Placeholder, Variable>& placeholderReplacements, _Internal::_SimpleSet<const Function*>& visitedFunctions, _Internal::_SimpleSet<Placeholder>& replacedPlaceholders)
FunctionPtr Function::ReplacePlaceholders(const std::unordered_map<Placeholder, Variable>& placeholderReplacements)
{
visitedFunctions.Insert(this);
// Cannot be called on primitive functions
if (RootFunction() == nullptr)
InvalidArgument("ReplacePlaceholders should never be called on primitive functions");
for (auto iter = m_inputs.m_vector->begin(); iter != m_inputs.m_vector->end(); ++iter)
std::unordered_set<const Function*> visitedFunctions;
std::unordered_set<Placeholder> replacedPlaceholders;
ReplacePlaceholders(placeholderReplacements, visitedFunctions, replacedPlaceholders);
for (auto replacementPair : placeholderReplacements)
{
if (iter->IsPlaceholder())
if (replacedPlaceholders.find(replacementPair.first) == replacedPlaceholders.end())
InvalidArgument("At least one of the placeholders specified for replacement was not found in the function");
}
return this->shared_from_this();
}
// Placeholders can be replaced incrementally - i.e. not all placeholders need to be replaced in one go.
// The only requirement is that they must all be replaced before making any 'Forward' calls on the Function instance.
/*virtual*/ void Function::ReplacePlaceholders(const std::unordered_map<Placeholder, Variable>& placeholderReplacements,
std::unordered_set<const Function*>& visitedFunctions,
std::unordered_set<Placeholder>& replacedPlaceholders)
{
visitedFunctions.insert(this);
for (auto& inputVar : m_inputs)
{
if (inputVar.IsPlaceholder())
{
Placeholder placeholder(*iter);
if (placeholderReplacements.Contains(placeholder))
Placeholder placeholder(inputVar);
if (placeholderReplacements.find(placeholder) != placeholderReplacements.end())
{
*iter = placeholderReplacements[placeholder];
replacedPlaceholders.Insert(placeholder);
inputVar = placeholderReplacements.at(placeholder);
replacedPlaceholders.insert(placeholder);
}
}
else if ((iter->Kind() == VariableKind::Output) && !visitedFunctions.Contains(iter->Owner()))
iter->Owner()->_ReplacePlaceholders(placeholderReplacements, visitedFunctions, replacedPlaceholders);
else if (inputVar.IsOutput() && (visitedFunctions.find(inputVar.Owner().get()) == visitedFunctions.end()))
inputVar.Owner()->ReplacePlaceholders(placeholderReplacements, visitedFunctions, replacedPlaceholders);
}
}
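// A hypothetical sketch of incremental placeholder binding (the Placeholder/NDShape constructors
// and the implicit Placeholder-to-Variable conversion are assumptions, not taken from this commit):
// placeholders may be bound one at a time, but all of them must be bound before the first Forward call.
static void BindPlaceholderSketch(const CNTK::Variable& weights, const CNTK::Variable& actualInput)
{
    CNTK::Placeholder p(CNTK::NDShape({ 512 }));              // assumed constructor taking a shape
    auto proj = CNTK::Times(weights, p);                      // graph built against the placeholder
    proj = proj->ReplacePlaceholders({ { p, actualInput } }); // bind 'p'; returns the composite itself
}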
template <typename ElementType>
/*static*/ ComputationNodeBasePtr CompositeFunction::GetNode(const Variable& variable, Microsoft::MSR::CNTK::ComputationNetworkPtr& network, ComputationNetworkBuilder<ElementType>& builder, std::unordered_map<Variable, ComputationNodeBasePtr>& variableToNodeMap, std::unordered_map<Variable, bool>& isVariableRootMap)
// Replace any PlaceHolder Variables in the graph of Functions underlying 'this' CompositeFunction. All PlaceHolder variables
// should have been replaced before performing any Forward compute of 'this' Function.
/*virtual*/ void CompositeFunction::ReplacePlaceholders(const std::unordered_map<Placeholder, Variable>& placeholderReplacements,
std::unordered_set<const Function*>& visitedFunctions,
std::unordered_set<Placeholder>& replacedPlaceholders)
{
if (variableToNodeMap.find(variable) != variableToNodeMap.end())
return variableToNodeMap[variable];
RootFunction()->ReplacePlaceholders(placeholderReplacements, visitedFunctions, replacedPlaceholders);
// If any of the placeholders were replaced with Output variables, let's add the graph of Functions underneath each of those to the 'm_allPrimitiveFunctions' set
for (auto replacedPlaceholder : replacedPlaceholders)
{
auto replacingVariable = placeholderReplacements.at(replacedPlaceholder);
if (replacingVariable.IsOutput())
{
auto ownerFunc = replacingVariable.Owner();
std::unordered_set<FunctionPtr> visitedFunctions;
DetermineInputs(ownerFunc, visitedFunctions);
// Add the newly visited functions to 'm_allPrimitiveFunctions' set
m_allPrimitiveFunctions.insert(visitedFunctions.begin(), visitedFunctions.end());
}
}
}
// Recursively create a sub-network of ComputationNode instances corresponding to the graph of Functions
// underlying the specified 'variable' and return the ComputationNode instance that corresponds to the
// top level 'variable'
template <typename ElementType>
/*static*/ ComputationNodeBasePtr CompositeFunction::GetNode(const Variable& variable,
Microsoft::MSR::CNTK::ComputationNetworkPtr& network,
ComputationNetworkBuilder<ElementType>& builder,
std::unordered_map<Variable, ComputationNodeBasePtr>& variableToNodeMap,
std::unordered_map<Variable, bool>& isVariableRootMap)
{
auto iter = variableToNodeMap.find(variable);
if (iter != variableToNodeMap.end())
return iter->second;
// Let's add a null entry in the map for this variable, to break infinite recursion when processing recurrent graphs
variableToNodeMap[variable] = nullptr;
@ -66,10 +124,10 @@ namespace CNTK
auto matrix = variable.IsConstant() ? value->GetMatrix<ElementType>()->AsReference() : value->GetWritableMatrix<ElementType>()->AsReference();
computationNodePtr->Value() = std::move(matrix);
}
else if (variable.Kind() == VariableKind::Input)
else if (variable.IsInput())
{
// TODO: Specify dynamic axis
if (variable.IsSparseInput())
if (IsSparseInput(variable))
computationNodePtr = builder.CreateSparseInputNode(variable.Name(), AsTensorShape(variable.Shape()));
else
computationNodePtr = builder.CreateInputNode(variable.Name(), AsTensorShape(variable.Shape()));
@ -83,23 +141,27 @@ namespace CNTK
}
else
{
assert(variable.Kind() == VariableKind::Output);
assert(variable.IsOutput());
computationNodePtr = GetOutputVariableNode(variable, network, builder, variableToNodeMap, isVariableRootMap)->template As<ComputationNode<ElementType>>()->shared_from_this();
}
variableToNodeMap[variable] = computationNodePtr;
isVariableRootMap[variable] = (variable.Kind() == VariableKind::Output);
isVariableRootMap[variable] = variable.IsOutput();
return computationNodePtr;
}
template <typename ElementType>
/*static*/ ComputationNodeBasePtr CompositeFunction::GetOutputVariableNode(const Variable& variable, Microsoft::MSR::CNTK::ComputationNetworkPtr& network, ComputationNetworkBuilder<ElementType>& builder, std::unordered_map<Variable, ComputationNodeBasePtr>& variableToNodeMap, std::unordered_map<Variable, bool>& isVariableRootMap)
/*static*/ ComputationNodeBasePtr CompositeFunction::GetOutputVariableNode(const Variable& variable,
Microsoft::MSR::CNTK::ComputationNetworkPtr& network,
ComputationNetworkBuilder<ElementType>& builder,
std::unordered_map<Variable, ComputationNodeBasePtr>& variableToNodeMap,
std::unordered_map<Variable, bool>& isVariableRootMap)
{
assert(variable.Kind() == VariableKind::Output);
assert(variable.IsOutput());
Function* function = variable.Owner();
Function* function = variable.Owner().get();
ComputationNodeBasePtr computationNodePtr;
if (dynamic_cast<PrimitiveFunction*>(function) != nullptr)
if (dynamic_cast<PrimitiveFunction*>(function))
{
PrimitiveFunction* primitiveFunction = dynamic_cast<PrimitiveFunction*>(function);
@ -134,7 +196,7 @@ namespace CNTK
case PrimitiveOpType::CrossEntropyWithSoftmax:
computationNodePtr = builder.CrossEntropyWithSoftmax(input1Node, input0Node, function->Name());
break;
case PrimitiveOpType::PredictionError:
case PrimitiveOpType::ClassificationError:
computationNodePtr = builder.ErrorPrediction(input1Node, input0Node, function->Name());
break;
case PrimitiveOpType::Exp:
@ -180,8 +242,10 @@ namespace CNTK
break;
}
case PrimitiveOpType::Combine:
for (size_t i = 0; i < functionInputs.size(); ++i)
GetNode(functionInputs[i], network, builder, variableToNodeMap, isVariableRootMap);
// This operation is just a no-op and is a means to combine multiple functions to create a single Function
// whose outputs are a union of the outputs of the Functions being combined.
for (auto inputVar : functionInputs)
GetNode(inputVar, network, builder, variableToNodeMap, isVariableRootMap);
computationNodePtr = variableToNodeMap[variable];
@ -193,8 +257,8 @@ namespace CNTK
if (op != PrimitiveOpType::Combine)
{
for (size_t i = 0; i < functionInputs.size(); ++i)
isVariableRootMap[functionInputs[i]] = false;
for (auto inputVar : functionInputs)
isVariableRootMap[inputVar] = false;
}
}
else
@ -206,14 +270,14 @@ namespace CNTK
}
template <typename ElementType>
ComputationNetworkPtr CompositeFunction::GetComputationNetwork(const DeviceDescriptor& device, const _Internal::_SimpleSet<Variable>& backpropRoots)
ComputationNetworkPtr CompositeFunction::GetComputationNetwork(const DeviceDescriptor& device, const std::unordered_set<Variable>& backpropRoots)
{
if (m_computationNetwork != nullptr)
{
// TODO: We should either invalidate and re-adapt the network if the backpropRoots change compared to what was specified when the network
// was last constructed, or just recreate a new network.
// For now just disallow changing the backpropRoots after the network is created
if (m_currentBackpropRoots != *backpropRoots.m_set)
if (m_currentBackpropRoots != backpropRoots)
LogicError("Changing backprop roots across different Forward calls on a CNTK composite Function is currently unsupported");
// TODO: Support changing the device across different invocations of the forward method on a Function instance
@ -228,7 +292,7 @@ namespace CNTK
ComputationNetworkBuilder<ElementType> builder(*m_computationNetwork);
// TODO: We currently only support one backprop root
if (backpropRoots.Size() > 1)
if (backpropRoots.size() > 1)
LogicError("More than one backprop roots is currently unsupported");
ComputationNodeBasePtr backpropRootNode;
@ -237,52 +301,52 @@ namespace CNTK
auto rootFunction = RootFunction();
auto rootFunctionOutputs = rootFunction->Outputs();
std::vector<ComputationNodeBasePtr> forwardRootNodes;
for (size_t i = 0; i < rootFunctionOutputs.size(); ++i)
for (auto rootOutput : rootFunctionOutputs)
{
auto currentRootNode = GetNode(rootFunctionOutputs[i], m_computationNetwork, builder, m_variableToNodeMap, m_isVariableRootMap);
auto currentRootNode = GetNode(rootOutput, m_computationNetwork, builder, m_variableToNodeMap, m_isVariableRootMap);
forwardRootNodes.push_back(currentRootNode);
if (backpropRoots.Contains(rootFunctionOutputs[i]))
backpropRootNode = m_variableToNodeMap[rootFunctionOutputs[i]];
if (backpropRoots.find(rootOutput) != backpropRoots.end())
backpropRootNode = m_variableToNodeMap[rootOutput];
}
// If any of the function outputs is not a root node, we need to explicitly add it to the 'output' group of the ComputationNetwork
for (size_t i = 0; i < rootFunctionOutputs.size(); ++i)
for (auto rootOutput : rootFunctionOutputs)
{
if (!m_isVariableRootMap[rootFunctionOutputs[i]])
m_computationNetwork->AddToNodeGroup(L"output", m_variableToNodeMap[rootFunctionOutputs[i]]);
if (!m_isVariableRootMap[rootOutput])
m_computationNetwork->AddToNodeGroup(L"output", m_variableToNodeMap[rootOutput]);
}
m_currentBackpropRoots = backpropRoots;
// In case of recurrence, the inputs of some of the ComputationNodes are not attached due to cycles.
// Now attach those after we have created all ComputationNodes in the network
for (auto iter = m_variableToNodeMap.begin(); iter != m_variableToNodeMap.end(); ++iter)
for (auto varNodePair : m_variableToNodeMap)
{
auto currentComputationNodeInputs = iter->second->GetInputs();
auto currentComputationNodeInputs = varNodePair.second->GetInputs();
// TODO: Can any node other than a PastValue/FutureValue Function have a null input attached after the first pass is finished?
if (std::find(currentComputationNodeInputs.begin(), currentComputationNodeInputs.end(), nullptr) != currentComputationNodeInputs.end())
{
// We found a null input; this variable must correspond to a PastValue or FutureValue function
const PrimitiveFunction* primitiveFunc = dynamic_cast<const PrimitiveFunction*>(iter->first.Owner().GetPtr());
const PrimitiveFunction* primitiveFunc = dynamic_cast<const PrimitiveFunction*>(varNodePair.first.Owner().get());
if ((primitiveFunc == nullptr) || ((primitiveFunc->OpType() != PrimitiveOpType::PastValue) && (primitiveFunc->OpType() != PrimitiveOpType::FutureValue)))
InvalidArgument("Invalid Function graph detected; recurrence found at a Function that is not a PastValue/FutureValue function");
// The 2nd input of the PastValue/FutureValue function denotes the recurrent input
auto actualInput = m_variableToNodeMap[primitiveFunc->Inputs()[1]];
iter->second->AttachInputs({ actualInput });
varNodePair.second->AttachInputs({ actualInput });
}
}
m_computationNetwork->CompileNetwork();
// Verify that the shapes of the output Variables that we computed match the corresponding nodes in the ComputationNetwork
for (auto iter = m_variableToNodeMap.begin(); iter != m_variableToNodeMap.end(); ++iter)
for (auto varNodePair : m_variableToNodeMap)
{
if (iter->first.Kind() == VariableKind::Output)
if (varNodePair.first.IsOutput())
{
auto outputVar = iter->first;
auto outputVar = varNodePair.first;
auto computationNodePtr = m_variableToNodeMap[outputVar];
auto outputShape = outputVar.Shape();
auto computationNodeSampleLayout = computationNodePtr->GetSampleLayout();
@ -310,10 +374,10 @@ namespace CNTK
LogicError("The specified ElementType %s does not match the DataType %s", typeid(ElementType).name(), DataTypeName(value->Data()->GetDataType()));
// TODO: Is supplying dense data for an Input variable tagged as sparse a fatal error?
if (var.IsSparseInput() && !value->Data()->IsSparse())
if (IsSparseInput(var) && !value->Data()->IsSparse())
InvalidArgument("Dense input data supplied for a sparse input Variable");
if (var.IsSparseInput() && (value->Data()->GetStorageFormat() != StorageFormat::SparseCSC))
if (IsSparseInput(var) && (value->Data()->GetStorageFormat() != StorageFormat::SparseCSC))
InvalidArgument("Sparse Input data must be in SparseCSC format");
if (value->Data()->Shape().NumAxes() == var.Shape().NumAxes())
@ -397,7 +461,7 @@ namespace CNTK
layout->GetNumCols(),
AsCNTKImplDeviceId(value->Data()->Device()),
value->Data()->IsSparse() ? MatrixType::SPARSE : MatrixType::DENSE,
AsCNTKMatrixFormat(value->Data()->GetStorageFormat()));
AsCNTKImplMatrixFormat(value->Data()->GetStorageFormat()));
std::vector<size_t> sequencesShorterThanLongestSequence;
for (size_t i = 0; i < numSequences; ++i)
@ -442,8 +506,8 @@ namespace CNTK
{
// Just create a view over the existing matrix itself
auto tensorView = new TensorView<ElementType>(std::make_shared<Matrix<ElementType>>(matrix.AsReference()), AsTensorShape(valueDataShape));
auto data = NDArrayViewPtr(new NDArrayView(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), AsStorageFormat(matrix.GetFormat()), valueDataShape, true, tensorView), [](_ReferenceCounter* ptr) { delete ptr; });
return ValuePtr(new Value(data), [](_ReferenceCounter* ptr) { delete ptr; });
auto data = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), AsStorageFormat(matrix.GetFormat()), valueDataShape, true, tensorView);
return MakeSharedObject<Value>(data);
}
if (layout->GetNumCols() != matrix.GetNumCols())
@ -454,10 +518,10 @@ namespace CNTK
std::vector<size_t> sequenceLengths;
auto& layoutSequences = layout->GetAllSequences();
for (auto iter = layoutSequences.begin(); iter != layoutSequences.end(); ++iter)
for (auto sequenceInfo : layoutSequences)
{
if (iter->seqId != GAP_SEQUENCE_ID)
sequenceLengths.push_back(iter->GetNumTimeSteps());
if (sequenceInfo.seqId != GAP_SEQUENCE_ID)
sequenceLengths.push_back(sequenceInfo.GetNumTimeSteps());
}
// Reshuffle the data to unpack and uninterleave the CNTK-form data
@ -473,13 +537,13 @@ namespace CNTK
size_t targetColIdxForInvalidColumns = sequencesShorterThanLongestSequence.empty() ? 0 : (((sequencesShorterThanLongestSequence[0] + 1) * maxNumTimeSteps) - 1);
std::vector<ElementType> scatterIndicesVector(layout->GetNumCols(), (ElementType)targetColIdxForInvalidColumns);
size_t i = 0;
for (auto iter = layoutSequences.begin(); iter != layoutSequences.end(); ++iter)
for (auto sequenceInfo : layoutSequences)
{
if (iter->seqId != GAP_SEQUENCE_ID)
if (sequenceInfo.seqId != GAP_SEQUENCE_ID)
{
size_t targetParallelStreamIdx = iter->s;
size_t targetStartIdxInParallelStream = iter->tBegin;
for (size_t j = 0; j < iter->GetNumTimeSteps(); ++j)
size_t targetParallelStreamIdx = sequenceInfo.s;
size_t targetStartIdxInParallelStream = sequenceInfo.tBegin;
for (size_t j = 0; j < sequenceInfo.GetNumTimeSteps(); ++j)
scatterIndicesVector[((targetStartIdxInParallelStream + j) * layout->GetNumParallelSequences()) + targetParallelStreamIdx] = (ElementType)((i * maxNumTimeSteps) + j);
i++;
@ -493,106 +557,97 @@ namespace CNTK
NDMaskPtr mask;
if (!sequencesShorterThanLongestSequence.empty())
{
mask = NDMaskPtr(new NDMask({ maxNumTimeSteps, numSequences }, AsDeviceDescriptor(matrix.GetDeviceId())), [](_ReferenceCounter* ptr) { delete ptr; });
for (size_t i = 0; i < sequencesShorterThanLongestSequence.size(); ++i)
mask = MakeSharedObject<NDMask>(NDShape({ maxNumTimeSteps, numSequences }), AsDeviceDescriptor(matrix.GetDeviceId()));
for (auto shortSequenceIdx : sequencesShorterThanLongestSequence)
{
size_t shorterSequenceIdx = sequencesShorterThanLongestSequence[i];
mask->MaskSection({ sequenceLengths[shorterSequenceIdx], shorterSequenceIdx }, { NDShape::InferredDimension, 1 });
mask->MaskSection({ sequenceLengths[shortSequenceIdx], shortSequenceIdx }, { NDShape::InferredDimension, 1 });
}
}
auto tensorView = new TensorView<ElementType>(shuffledMatrixData, AsTensorShape(valueDataShape));
auto data = NDArrayViewPtr(new NDArrayView(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), StorageFormat::Dense, valueDataShape, true, tensorView), [](_ReferenceCounter* ptr) { delete ptr; });
return ValuePtr(new Value(data, mask), [](_ReferenceCounter* ptr) { delete ptr; });
auto data = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), StorageFormat::Dense, valueDataShape, true, tensorView);
return MakeSharedObject<Value>(data, mask);
}
void CompositeFunction::PopulateNetworkInputs(const _Internal::_SimpleMap<Variable, const ValuePtr>& arguments)
template <typename ElementType>
/*static*/ void CompositeFunction::PopulateComputationNodeValue(const std::pair<Variable, ValuePtr>& variableValue, ComputationNodeBasePtr& computationNode)
{
auto CNTKMatrixAndMBLayout = GetCNTKImplMatrixAndMBLayoutFromValueObject<ElementType>(variableValue.first, variableValue.second);
MBLayoutPtr layout = CNTKMatrixAndMBLayout.second;
auto& nodeData = computationNode->As<ComputationNode<ElementType>>()->Value();
// Switch the node matrix to the right matrix type
nodeData.SwitchToMatrixType(CNTKMatrixAndMBLayout.first->GetMatrixType(), CNTKMatrixAndMBLayout.first->GetFormat(), false);
nodeData.AssignValuesOf(*CNTKMatrixAndMBLayout.first);
computationNode->GetMBLayout()->CopyFrom(layout);
}
void CompositeFunction::PopulateNetworkInputs(const std::unordered_map<Variable, const ValuePtr>& arguments)
{
auto functionArguments = this->Arguments();
std::vector<ComputationNodeBasePtr> inputNodes;
for (auto iter = functionArguments.begin(); iter != functionArguments.end(); ++iter)
for (auto argument : functionArguments)
{
// Ensure we have values for all arguments of the function
if (!arguments.Contains(*iter))
if (arguments.find(argument) == arguments.end())
InvalidArgument("Value not specified for required Function Argument");
auto argumentComputationNode = m_variableToNodeMap[*iter];
auto argumentComputationNode = m_variableToNodeMap[argument];
inputNodes.push_back(argumentComputationNode);
ValuePtr argumentValue = arguments[*iter];
ValuePtr argumentValue = arguments.at(argument);
MBLayoutPtr layout;
switch (argumentValue->Data()->GetDataType())
{
case DataType::Float:
{
auto CNTKMatrixAndMBLayout = GetCNTKImplMatrixAndMBLayoutFromValueObject<float>(*iter, argumentValue);
layout = CNTKMatrixAndMBLayout.second;
auto& nodeData = argumentComputationNode->As<ComputationNode<float>>()->Value();
// Switch the node matrix to the right matrix type
nodeData.SwitchToMatrixType(CNTKMatrixAndMBLayout.first->GetMatrixType(), CNTKMatrixAndMBLayout.first->GetFormat(), false);
nodeData.AssignValuesOf(*CNTKMatrixAndMBLayout.first);
PopulateComputationNodeValue<float>({ argument, argumentValue }, argumentComputationNode);
break;
}
case DataType::Double:
{
auto CNTKMatrixAndMBLayout = GetCNTKImplMatrixAndMBLayoutFromValueObject<double>(*iter, argumentValue);
layout = CNTKMatrixAndMBLayout.second;
auto& nodeData = argumentComputationNode->As<ComputationNode<double>>()->Value();
// Switch the node matrix to the right matrix type
nodeData.SwitchToMatrixType(CNTKMatrixAndMBLayout.first->GetMatrixType(), CNTKMatrixAndMBLayout.first->GetFormat(), false);
nodeData.AssignValuesOf(*CNTKMatrixAndMBLayout.first);
PopulateComputationNodeValue<double>({ argument, argumentValue }, argumentComputationNode);
break;
}
default:
LogicError("Unsupported DataType %s", DataTypeName(argumentValue->Data()->GetDataType()));
break;
}
argumentComputationNode->GetMBLayout()->CopyFrom(layout);
}
m_computationNetwork->BumpEvalTimeStamp(inputNodes);
}
void CompositeFunction::PopulateNetworkGradients(const _Internal::_SimpleMap<Variable, const ValuePtr>& gradients)
template <typename ElementType>
/*static*/ void CompositeFunction::PopulateComputationNodeGradient(const std::pair<Variable, ValuePtr>& variableGradient, Microsoft::MSR::CNTK::ComputationNodeBasePtr& computationNode)
{
auto CNTKMatrixAndMBLayout = GetCNTKImplMatrixAndMBLayoutFromValueObject<ElementType>(variableGradient.first, variableGradient.second);
MBLayoutPtr layout = CNTKMatrixAndMBLayout.second;
auto nodeLayout = computationNode->GetMBLayout();
if (((layout == nullptr) != (nodeLayout == nullptr)) || ((layout != nullptr) && (*layout != *nodeLayout)))
InvalidArgument("The layout of the specified gradient Value in incompatible with the layout of the corresponding Variable computed during Forward call");
computationNode->As<ComputationNode<ElementType>>()->AssignGradient(*CNTKMatrixAndMBLayout.first);
}
// Assign the supplied gradients corresponding to the root(s) of the network to be backpropagated through the graph
void CompositeFunction::PopulateNetworkGradients(const std::unordered_map<Variable, const ValuePtr>& gradients)
{
auto functionOutputs = this->Outputs();
std::unordered_map<Variable, const ValuePtr>& gradientsValueMap = *gradients.m_map;
for (auto iter = gradientsValueMap.begin(); iter != gradientsValueMap.end(); ++iter)
for (auto gradientVarValuePair : gradients)
{
// Only gradients for roots of the function can be specified
if (std::find(functionOutputs.begin(), functionOutputs.end(), iter->first) == functionOutputs.end())
if (std::find(functionOutputs.begin(), functionOutputs.end(), gradientVarValuePair.first) == functionOutputs.end())
InvalidArgument("Gradients cannot be specified for a Variable that is not an Output of the Function");
auto outputComputationNode = m_variableToNodeMap[iter->first];
auto nodeLayout = outputComputationNode->GetMBLayout();
auto outputComputationNode = m_variableToNodeMap[gradientVarValuePair.first];
ValuePtr gradientValue = gradientVarValuePair.second;
ValuePtr gradientValue = iter->second;
MBLayoutPtr layout;
switch (gradientValue->Data()->GetDataType())
{
case DataType::Float:
{
auto CNTKMatrixAndMBLayout = GetCNTKImplMatrixAndMBLayoutFromValueObject<float>(iter->first, gradientValue);
layout = CNTKMatrixAndMBLayout.second;
if (((layout == nullptr) != (nodeLayout == nullptr)) || ((layout != nullptr) && (*layout != *nodeLayout)))
InvalidArgument("The layout of the specified gradient Value in incompatible with the layout of the corresponding Variable computed during Forward call");
outputComputationNode->As<ComputationNode<float>>()->ResetGradient(*CNTKMatrixAndMBLayout.first);
PopulateComputationNodeGradient<float>(gradientVarValuePair, outputComputationNode);
break;
}
case DataType::Double:
{
auto CNTKMatrixAndMBLayout = GetCNTKImplMatrixAndMBLayoutFromValueObject<double>(iter->first, gradientValue);
layout = CNTKMatrixAndMBLayout.second;
if (((layout == nullptr) != (nodeLayout == nullptr)) || ((layout != nullptr) && (*layout != *nodeLayout)))
InvalidArgument("The layout of the specified gradient Value in incompatible with the layout of the corresponding Variable computed during Forward call");
outputComputationNode->As<ComputationNode<double>>()->ResetGradient(*CNTKMatrixAndMBLayout.first);
PopulateComputationNodeGradient<double>(gradientVarValuePair, outputComputationNode);
break;
}
default:
LogicError("Unsupported DataType %s", DataTypeName(gradientValue->Data()->GetDataType()));
break;
@ -603,6 +658,8 @@ namespace CNTK
static NDShape GetValueShape(const Variable& var, const ComputationNodeBasePtr& computationNodePtr)
{
size_t outputValueNumAxes = var.Shape().NumAxes();
// Add the batch and dynamic axes if needed
if (computationNodePtr->GetMBLayout() != nullptr)
outputValueNumAxes += 2;
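// (For example: a Variable with sample shape [256] whose node carries an MBLayout of
//  20 time steps x 8 parallel sequences gets two extra axes, i.e. a Value shape of [256 x 20 x 8];
//  without an MBLayout the sample shape is used as-is.)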
@ -622,12 +679,12 @@ namespace CNTK
void CompositeFunction::GetNetworkOutputs(std::unordered_map<Variable, ValuePtr>& outputs)
{
// Now copy the Forward values of output nodes from the network to outputs' Value objects
for (auto iter = outputs.begin(); iter != outputs.end(); ++iter)
for (auto outputVarValuePair : outputs)
{
auto computationNodePtr = m_variableToNodeMap[iter->first];
auto outputValuePtr = iter->second;
auto computationNodePtr = m_variableToNodeMap[outputVarValuePair.first];
auto outputValuePtr = outputVarValuePair.second;
auto outputShape = GetValueShape(iter->first, computationNodePtr);
auto outputShape = GetValueShape(outputVarValuePair.first, computationNodePtr);
if (outputValuePtr != nullptr)
{
// TODO: The shape of the specified output Value object must match the actual output shape
@ -635,38 +692,28 @@ namespace CNTK
InvalidArgument("The shape %s of the specified Value object for output does not match the actual output shape %s", AsString(outputValuePtr->Data()->Shape()).c_str(), AsString(outputShape).c_str());
}
switch (iter->first.GetDataType())
ValuePtr nodeValue;
switch (outputVarValuePair.first.GetDataType())
{
case DataType::Float:
{
auto nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(iter->first, computationNodePtr->As<ComputationNode<float>>()->Value(), computationNodePtr->GetMBLayout());
if (outputValuePtr == nullptr)
{
auto data = NDArrayViewPtr(new NDArrayView(iter->first.GetDataType(), outputShape, AsDeviceDescriptor(computationNodePtr->ValuePtr()->GetDeviceId())), [](_ReferenceCounter* ptr) { delete ptr; });
auto mask = (nodeValue->Mask() != nullptr) ? NDMaskPtr(new NDMask(nodeValue->Mask()->Shape(), nodeValue->Mask()->Device()), [](_ReferenceCounter* ptr) { delete ptr; }) : nullptr;
outputValuePtr = ValuePtr(new Value(data, mask), [](_ReferenceCounter* ptr) { delete ptr; });
}
outputValuePtr->CopyFrom(*nodeValue);
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(outputVarValuePair.first, computationNodePtr->As<ComputationNode<float>>()->Value(), computationNodePtr->GetMBLayout());
break;
}
case DataType::Double:
{
auto nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(iter->first, computationNodePtr->As<ComputationNode<double>>()->Value(), computationNodePtr->GetMBLayout());
if (outputValuePtr == nullptr)
{
auto data = NDArrayViewPtr(new NDArrayView(iter->first.GetDataType(), outputShape, AsDeviceDescriptor(computationNodePtr->ValuePtr()->GetDeviceId())), [](_ReferenceCounter* ptr) { delete ptr; });
auto mask = (nodeValue->Mask() != nullptr) ? NDMaskPtr(new NDMask(nodeValue->Mask()->Shape(), nodeValue->Mask()->Device()), [](_ReferenceCounter* ptr) { delete ptr; }) : nullptr;
outputValuePtr = ValuePtr(new Value(data, mask), [](_ReferenceCounter* ptr) { delete ptr; });
}
outputValuePtr->CopyFrom(*nodeValue);
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(outputVarValuePair.first, computationNodePtr->As<ComputationNode<double>>()->Value(), computationNodePtr->GetMBLayout());
break;
}
default:
LogicError("Unsupported DataType %s", DataTypeName(iter->first.GetDataType()));
LogicError("Unsupported DataType %s", DataTypeName(outputVarValuePair.first.GetDataType()));
break;
}
outputs[iter->first] = outputValuePtr;
if (outputValuePtr == nullptr)
{
auto data = MakeSharedObject<NDArrayView>(outputVarValuePair.first.GetDataType(), outputShape, AsDeviceDescriptor(computationNodePtr->ValuePtr()->GetDeviceId()));
auto mask = (nodeValue->Mask() != nullptr) ? MakeSharedObject<NDMask>(nodeValue->Mask()->Shape(), nodeValue->Mask()->Device()) : nullptr;
outputValuePtr = MakeSharedObject<Value>(data, mask);
}
outputValuePtr->CopyFrom(*nodeValue);
outputs[outputVarValuePair.first] = outputValuePtr;
}
}
@ -674,20 +721,20 @@ namespace CNTK
{
auto networkInputs = this->Inputs();
// Now copy the gradient values of input nodes of the network to gradients' Value objects
for (auto iter = gradients.begin(); iter != gradients.end(); ++iter)
for (auto gradientVarValuePair : gradients)
{
// Only gradients corresponding to inputs of the network can be obtained
if (std::find(networkInputs.begin(), networkInputs.end(), iter->first) == networkInputs.end())
if (std::find(networkInputs.begin(), networkInputs.end(), gradientVarValuePair.first) == networkInputs.end())
InvalidArgument("Backpropagated gradient values can only be obtained for inputs of a Function");
// Gradients can only be obtained for parameter variables or input variables that NeedsGradient
if (!iter->first.NeedsGradient())
if (!gradientVarValuePair.first.NeedsGradient())
InvalidArgument("Gradient value incorrectly requested for an Output or Constant Variable, or an Input Variable with NeedsGradient setting of false");
auto computationNodePtr = m_variableToNodeMap[iter->first];
auto gradientValuePtr = iter->second;
auto computationNodePtr = m_variableToNodeMap[gradientVarValuePair.first];
auto gradientValuePtr = gradientVarValuePair.second;
auto gradientShape = GetValueShape(iter->first, computationNodePtr);
auto gradientShape = GetValueShape(gradientVarValuePair.first, computationNodePtr);
if (gradientValuePtr != nullptr)
{
// TODO: The shape of the specified output Value object must match the actual output shape
@ -698,50 +745,40 @@ namespace CNTK
if (!computationNodePtr->NeedsGradient())
LogicError("Backpropagated gradient value cannot be read from a ComputationNode that has NeedsGradient set to false");
switch (iter->first.GetDataType())
ValuePtr nodeValue;
switch (gradientVarValuePair.first.GetDataType())
{
case DataType::Float:
{
auto nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(iter->first, computationNodePtr->As<ComputationNode<float>>()->Gradient(), computationNodePtr->GetMBLayout());
if (gradientValuePtr == nullptr)
{
auto data = NDArrayViewPtr(new NDArrayView(iter->first.GetDataType(), gradientShape, AsDeviceDescriptor(computationNodePtr->ValuePtr()->GetDeviceId())), [](_ReferenceCounter* ptr) { delete ptr; });
auto mask = NDMaskPtr((nodeValue->Mask() != nullptr) ? new NDMask(nodeValue->Mask()->Shape(), nodeValue->Mask()->Device()) : nullptr, [](_ReferenceCounter* ptr) { delete ptr; });
gradientValuePtr = ValuePtr(new Value(data, mask), [](_ReferenceCounter* ptr) { delete ptr; });
}
gradientValuePtr->CopyFrom(*nodeValue);
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(gradientVarValuePair.first, computationNodePtr->As<ComputationNode<float>>()->Gradient(), computationNodePtr->GetMBLayout());
break;
}
case DataType::Double:
{
auto nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(iter->first, computationNodePtr->As<ComputationNode<double>>()->Gradient(), computationNodePtr->GetMBLayout());
if (gradientValuePtr == nullptr)
{
auto data = NDArrayViewPtr(new NDArrayView(iter->first.GetDataType(), gradientShape, AsDeviceDescriptor(computationNodePtr->ValuePtr()->GetDeviceId())), [](_ReferenceCounter* ptr) { delete ptr; });
auto mask = NDMaskPtr((nodeValue->Mask() != nullptr) ? new NDMask(nodeValue->Mask()->Shape(), nodeValue->Mask()->Device()) : nullptr, [](_ReferenceCounter* ptr) { delete ptr; });
gradientValuePtr = ValuePtr(new Value(data, mask), [](_ReferenceCounter* ptr) { delete ptr; });
}
gradientValuePtr->CopyFrom(*nodeValue);
nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(gradientVarValuePair.first, computationNodePtr->As<ComputationNode<double>>()->Gradient(), computationNodePtr->GetMBLayout());
break;
}
default:
LogicError("Unsupported DataType %s", DataTypeName(iter->first.GetDataType()));
LogicError("Unsupported DataType %s", DataTypeName(gradientVarValuePair.first.GetDataType()));
break;
}
gradients[iter->first] = gradientValuePtr;
if (gradientValuePtr == nullptr)
{
auto data = MakeSharedObject<NDArrayView>(gradientVarValuePair.first.GetDataType(), gradientShape, AsDeviceDescriptor(computationNodePtr->ValuePtr()->GetDeviceId()));
auto mask = (nodeValue->Mask() != nullptr) ? MakeSharedObject<NDMask>(nodeValue->Mask()->Shape(), nodeValue->Mask()->Device()) : nullptr;
gradientValuePtr = MakeSharedObject<Value>(data, mask);
}
gradientValuePtr->CopyFrom(*nodeValue);
gradients[gradientVarValuePair.first] = gradientValuePtr;
}
}
/*virtual*/ BackPropStatePtr CompositeFunction::Forward(const _Internal::_SimpleMap<Variable, const ValuePtr>& arguments,
_Internal::_SimpleMap<Variable, ValuePtr>& outputs,
const _Internal::_SimpleSet<Variable>& outputsToRetainBackwardStateFor,
const DeviceDescriptor& computeDevice)
/*virtual*/ BackPropStatePtr CompositeFunction::Forward(const std::unordered_map<Variable, const ValuePtr>& arguments,
std::unordered_map<Variable, ValuePtr>& outputs,
const DeviceDescriptor& computeDevice,
const std::unordered_set<Variable>& outputsToRetainBackwardStateFor)
{
// TODO: How about zero argument functions?
// TODO: We need a better way to determine the ElementType for the network
auto dataType = arguments.m_map->begin()->second->Data()->GetDataType();
auto dataType = arguments.begin()->second->Data()->GetDataType();
if (dataType == DataType::Float)
GetComputationNetwork<float>(computeDevice, outputsToRetainBackwardStateFor);
else
@ -752,140 +789,119 @@ namespace CNTK
// Feed data into the arguments of the network
PopulateNetworkInputs(arguments);
std::unordered_set<Variable> functionOutputs = _Internal::_SimpleVector<Variable>::CreateSimpleVector(this->Outputs()).GetAsUnorderedSet();
std::unordered_set<Variable> functionOutputs(this->Outputs().begin(), this->Outputs().end());
std::vector<ComputationNodeBasePtr> outputsToEvaluate;
for (auto iter = outputs.m_map->begin(); iter != outputs.m_map->end(); ++iter)
for (auto outputVarValuePair : outputs)
{
// Ensure that only a subset of this function's outputs are being asked to be evaluated
if (functionOutputs.find(iter->first) == functionOutputs.end())
if (functionOutputs.find(outputVarValuePair.first) == functionOutputs.end())
InvalidArgument("Requested output is not an Ouptut of the Function");
auto outputComputationNode = m_variableToNodeMap[iter->first];
auto outputComputationNode = m_variableToNodeMap[outputVarValuePair.first];
outputsToEvaluate.push_back(outputComputationNode);
}
// The 'outputsToRetainBackwardStateFor' nodes also need to be evaluated if not already specified in 'outputs'
for (auto iter = outputsToRetainBackwardStateFor.m_set->begin(); iter != outputsToRetainBackwardStateFor.m_set->end(); ++iter)
for (auto rootVarForBackprop : outputsToRetainBackwardStateFor)
{
if (outputs.m_map->find(*iter) == outputs.m_map->end())
outputsToEvaluate.push_back(m_variableToNodeMap[*iter]);
if (outputs.find(rootVarForBackprop) == outputs.end())
outputsToEvaluate.push_back(m_variableToNodeMap[rootVarForBackprop]);
}
m_computationNetwork->ForwardProp(outputsToEvaluate);
GetNetworkOutputs(*(outputs.m_map));
GetNetworkOutputs(outputs);
// TODO: How to deal with the specified 'computeDevice'
return (outputsToRetainBackwardStateFor.Size() > 0) ? BackPropStatePtr(new CNTKBackPropState(this, { arguments.m_map->begin()->first, m_variableToNodeMap[arguments.m_map->begin()->first]->GetEvalTimeStamp() }), [](_ReferenceCounter* ptr) { delete ptr; }) : nullptr;
return (outputsToRetainBackwardStateFor.size() > 0) ? MakeSharedObject<CNTKBackPropState>(this->shared_from_this(), std::make_pair(arguments.begin()->first, m_variableToNodeMap[arguments.begin()->first]->GetEvalTimeStamp())) : nullptr;
}
/*virtual*/ void CompositeFunction::Backward(const BackPropStatePtr& state,
const _Internal::_SimpleMap<Variable, const ValuePtr>& rootGradientValues,
_Internal::_SimpleMap<Variable, ValuePtr>& backPropagatedGradientValuesForInputs)
const std::unordered_map<Variable, const ValuePtr>& rootGradientValues,
std::unordered_map<Variable, ValuePtr>& backPropagatedGradientValuesForInputs)
{
if ((state == nullptr) || (dynamic_cast<const CNTKBackPropState*>(state.GetPtr()) == nullptr))
auto backpropState = dynamic_cast<const CNTKBackPropState*>(state.get());
if (backpropState == nullptr)
InvalidArgument("Invalid backprop state specified");
// TODO: Support multiple concurrent backprop states
auto backpropState = dynamic_cast<const CNTKBackPropState*>(state.GetPtr());
if (backpropState->EvalTimeStamp().second != m_variableToNodeMap[backpropState->EvalTimeStamp().first]->GetEvalTimeStamp())
LogicError("The specified backprop state specified cannot be used for backpropagation as the Function's internal state was modified by subsequent Forward calls to the function."
"This is not a user error but a shortcoming of the current implementation where multiple independent backprop states are not simultaneously supported");
if (rootGradientValues.Size() > 1)
if (rootGradientValues.size() > 1)
LogicError("Currently gradient backprop from only one of the Function Outputs is supported");
// TODO: Avoid copying the data when possible
// Zero all gradients of nodes below the root nodes
for (auto iter = rootGradientValues.m_map->begin(); iter != rootGradientValues.m_map->end(); ++iter)
m_computationNetwork->ZeroInputGradients(m_variableToNodeMap[iter->first]);
for (auto rootGradientVarValuePair : rootGradientValues)
m_computationNetwork->ZeroInputGradients(m_variableToNodeMap[rootGradientVarValuePair.first]);
// Feed data into the arguments of the network
PopulateNetworkGradients(rootGradientValues);
// Backpropagate through the network
auto rootComputationNodePtr = m_variableToNodeMap[rootGradientValues.m_map->begin()->first];
auto rootComputationNodePtr = m_variableToNodeMap[rootGradientValues.begin()->first];
m_computationNetwork->GetNestedNetwork(rootComputationNodePtr)->Backprop(FrameRange(nullptr), true, true);
GetNetworkGradients(*(backPropagatedGradientValuesForInputs.m_map));
GetNetworkGradients(backPropagatedGradientValuesForInputs);
// TODO: How to deal with the specified 'computeDevice'
}
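// A hypothetical sketch of driving the Forward/Backward pair above from a caller. All names
// ('net', 'input', 'labels', 'loss', 'weights', the Value objects) are illustrative, and
// DeviceDescriptor::CPUDevice() is assumed to exist; this is not taken from this commit.
static void TrainStepSketch(const CNTK::FunctionPtr& net,
                            const CNTK::Variable& input,  const CNTK::ValuePtr& inputValue,
                            const CNTK::Variable& labels, const CNTK::ValuePtr& labelValue,
                            const CNTK::Variable& loss,   const CNTK::ValuePtr& rootGradient,
                            const CNTK::Variable& weights)
{
    std::unordered_map<CNTK::Variable, const CNTK::ValuePtr> args = { { input, inputValue }, { labels, labelValue } };
    std::unordered_map<CNTK::Variable, CNTK::ValuePtr> outputs = { { loss, nullptr } };      // nullptr lets CNTK allocate the output Value
    auto state = net->Forward(args, outputs, CNTK::DeviceDescriptor::CPUDevice(), { loss }); // retain state for backprop

    std::unordered_map<CNTK::Variable, const CNTK::ValuePtr> rootGradients = { { loss, rootGradient } }; // typically a Value of ones
    std::unordered_map<CNTK::Variable, CNTK::ValuePtr> inputGradients = { { weights, nullptr } };
    net->Backward(state, rootGradients, inputGradients);                                     // gradients land in 'inputGradients'
}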
/*virtual*/ void CompositeFunction::_ReplacePlaceholders(const _Internal::_SimpleMap<Placeholder, Variable>& placeholderReplacements, _Internal::_SimpleSet<const Function*>& visitedFunctions, _Internal::_SimpleSet<Placeholder>& replacedPlaceholders)
{
RootFunction()->_ReplacePlaceholders(placeholderReplacements, visitedFunctions, replacedPlaceholders);
// If any of the placeholders were replaced with Output variables, let's add the graph of function underneath each of those to 'm_allPrimitiveFunctions' set
for (auto iter = replacedPlaceholders.m_set->begin(); iter != replacedPlaceholders.m_set->end(); ++iter)
{
auto replacingVariable = placeholderReplacements[*iter];
if (replacingVariable.Kind() == VariableKind::Output)
{
auto ownerFunc = replacingVariable.Owner();
_Internal::_SimpleSet<FunctionPtr> visitedFunctions;
_DetermineInputs(ownerFunc, visitedFunctions);
// Add the newly visited functions to 'm_allPrimitiveFunctions' set
m_allPrimitiveFunctions.m_set->insert(visitedFunctions.m_set->begin(), visitedFunctions.m_set->end());
}
}
}
FunctionPtr Times(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::Times, { leftOperand, rightOperand }, Dictionary(), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Times, std::vector<Variable>({ leftOperand, rightOperand }), Dictionary(), name), name);
}
FunctionPtr Plus(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::Plus, { leftOperand, rightOperand }, Dictionary(), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Plus, std::vector<Variable>({ leftOperand, rightOperand }), Dictionary(), name), name);
}
FunctionPtr Sigmoid(const Variable& operand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::Sigmoid, { operand }, Dictionary(), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Sigmoid, std::vector<Variable>({ operand }), Dictionary(), name), name);
}
FunctionPtr Tanh(const Variable& operand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::Tanh, { operand }, Dictionary(), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Tanh, std::vector<Variable>({ operand }), Dictionary(), name), name);
}
FunctionPtr _Combine(const _Internal::_SimpleVector<FunctionPtr>& operands, const std::wstring& name/* = L""*/)
FunctionPtr Combine(const std::initializer_list<FunctionPtr>& operands, const std::wstring& name/* = L""*/)
{
_Internal::_SimpleSet<FunctionPtr> uniqueOperands;
std::unordered_set<FunctionPtr> uniqueOperands;
std::vector<Variable> inputs;
for (size_t i = 0; i < operands.Size(); ++i)
for (auto operand : operands)
{
if (uniqueOperands.Contains(operands[i]))
if (uniqueOperands.find(operand) != uniqueOperands.end())
LogicError("All function operands specified to Combine must be unique");
uniqueOperands.Insert(operands[i]);
auto currentFunctionOutputs = operands[i]->Outputs();
uniqueOperands.insert(operand);
auto currentFunctionOutputs = operand->Outputs();
std::copy(currentFunctionOutputs.begin(), currentFunctionOutputs.end(), std::back_inserter(inputs));
}
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::Combine, inputs, Dictionary(), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Combine, inputs, Dictionary(), name), name);
}
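// Illustrative usage only (not part of this change): 'lossFunc' and 'errFunc' are assumed existing,
// distinct FunctionPtr objects; Combine yields a single Function exposing the outputs of both.
static FunctionPtr CombineUsageSketch(const FunctionPtr& lossFunc, const FunctionPtr& errFunc)
{
    // Duplicate operands would trigger the LogicError checked above
    return Combine({ lossFunc, errFunc }, L"lossAndError");
}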
FunctionPtr CrossEntropyWithSoftmax(const Variable& output, const Variable& labels, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::CrossEntropyWithSoftmax, { output, labels }, Dictionary(), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::CrossEntropyWithSoftmax, std::vector<Variable>({ output, labels }), Dictionary(), name), name);
}
FunctionPtr PredictionError(const Variable& prediction, const Variable& labels, const std::wstring& name/* = L""*/)
FunctionPtr ClassificationError(const Variable& prediction, const Variable& labels, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::PredictionError, { prediction, labels }, Dictionary(), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::ClassificationError, std::vector<Variable>({ prediction, labels }), Dictionary(), name), name);
}
FunctionPtr Exp(const Variable& operand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::Exp, { operand }, Dictionary(), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Exp, std::vector<Variable>({ operand }), Dictionary(), name), name);
}
FunctionPtr PastValue(const Variable& initialState, const Variable& operand, size_t stepSize, const std::wstring& name/* = L""*/)
@ -895,7 +911,7 @@ namespace CNTK
auto additionalProperties = Dictionary();
additionalProperties[L"stepSize"] = DictionaryValue(stepSize);
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::PastValue, { initialState, operand }, std::move(additionalProperties), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::PastValue, std::vector<Variable>({ initialState, operand }), std::move(additionalProperties), name), name);
}
FunctionPtr FutureValue(const Variable& initialState, const Variable& operand, size_t stepSize, const std::wstring& name/* = L""*/)
@ -905,16 +921,16 @@ namespace CNTK
auto additionalProperties = Dictionary();
additionalProperties[L"stepSize"] = DictionaryValue(stepSize);
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::FutureValue, { initialState, operand }, std::move(additionalProperties), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::FutureValue, std::vector<Variable>({ initialState, operand }), std::move(additionalProperties), name), name);
}
FunctionPtr ElementTimes(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::ElementTimes, { leftOperand, rightOperand }, Dictionary(), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::ElementTimes, std::vector<Variable>({ leftOperand, rightOperand }), Dictionary(), name), name);
}
FunctionPtr ReduceSum(const Variable& operand, const std::wstring& name/* = L""*/)
{
return CompositeFunction::Create(new PrimitiveFunction(PrimitiveOpType::ReduceSum, { operand }, Dictionary(), name), name);
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::ReduceSum, std::vector<Variable>({ operand }), Dictionary(), name), name);
}
}

Просмотреть файл

@ -3,6 +3,8 @@
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "stdafx.h"
#include "CNTKLibrary.h"
#include <iterator>
@ -19,7 +21,7 @@ namespace CNTK
Tanh,
Combine,
CrossEntropyWithSoftmax,
PredictionError,
ClassificationError,
Exp,
PastValue,
FutureValue,
@ -29,6 +31,7 @@ namespace CNTK
inline const char* PrimitiveOpTypeName(PrimitiveOpType opType)
{
// TODO: Put these in table form
if (opType == PrimitiveOpType::Plus)
return "Plus";
else if (opType == PrimitiveOpType::Times)
@ -41,8 +44,8 @@ namespace CNTK
return "Combine";
else if (opType == PrimitiveOpType::CrossEntropyWithSoftmax)
return "CrossEntropyWithSoftmax";
else if (opType == PrimitiveOpType::PredictionError)
return "PredictionError";
else if (opType == PrimitiveOpType::ClassificationError)
return "ClassificationError";
else if (opType == PrimitiveOpType::Exp)
return "Exp";
else if (opType == PrimitiveOpType::PastValue)
@ -65,17 +68,17 @@ namespace CNTK
{
}
virtual BackPropStatePtr Forward(const _Internal::_SimpleMap<Variable, const ValuePtr>& /*arguments*/,
_Internal::_SimpleMap<Variable, ValuePtr>& /*outputs*/,
const _Internal::_SimpleSet<Variable>& /*outputsToRetainBackwardStateFor*/,
const DeviceDescriptor& /*computeDevice*/) override
virtual BackPropStatePtr Forward(const std::unordered_map<Variable, const ValuePtr>& /*arguments*/,
std::unordered_map<Variable, ValuePtr>& /*outputs*/,
const DeviceDescriptor& /*computeDevice*/,
const std::unordered_set<Variable>& /*outputsToRetainBackwardStateFor*/) override
{
NOT_IMPLEMENTED;
}
virtual void Backward(const BackPropStatePtr& /*state*/,
const _Internal::_SimpleMap<Variable, const ValuePtr>& /*rootGradientValues*/,
_Internal::_SimpleMap<Variable, ValuePtr>& /*backPropagatedGradientValuesForInputs*/) override
const std::unordered_map<Variable, const ValuePtr>& /*rootGradientValues*/,
std::unordered_map<Variable, ValuePtr>& /*backPropagatedGradientValuesForInputs*/) override
{
NOT_IMPLEMENTED;
}
@ -91,6 +94,8 @@ namespace CNTK
}
private:
// The following helper functions are used to determine the output shape for different
// types of primitive operations accounting for broadcasting and reductions where applicable.
static NDShape UnaryElementwiseOpOutputShape(const NDShape& operandShape)
{
return operandShape;
@ -98,17 +103,17 @@ namespace CNTK
static NDShape BinaryElementwiseOpOutputShape(PrimitiveOpType op, const NDShape& leftOperandShape, const NDShape& rightOperandShape, bool broadcastAllowed = true)
{
auto& shapeWithSmallerNumAxes = (leftOperandShape.NumAxes() > rightOperandShape.NumAxes()) ? rightOperandShape : leftOperandShape;
auto& shapeWithLargerNumAxes = (leftOperandShape.NumAxes() > rightOperandShape.NumAxes()) ? leftOperandShape : rightOperandShape;
const auto& shapeWithSmallerNumAxes = (leftOperandShape.NumAxes() > rightOperandShape.NumAxes()) ? rightOperandShape : leftOperandShape;
const auto& shapeWithLargerNumAxes = (leftOperandShape.NumAxes() > rightOperandShape.NumAxes()) ? leftOperandShape : rightOperandShape;
size_t numOutputAxes = shapeWithLargerNumAxes.NumAxes();
std::vector<size_t> outputDims(numOutputAxes);
for (size_t i = 0; i < shapeWithSmallerNumAxes.NumAxes(); ++i)
{
if ((leftOperandShape[i] == NDShape::InferredDimension) && (rightOperandShape[i] == NDShape::InferredDimension))
outputDims[i] = NDShape::InferredDimension;
else if ((leftOperandShape[i] == NDShape::InferredDimension) && (rightOperandShape[i] != NDShape::InferredDimension))
else if (leftOperandShape[i] == NDShape::InferredDimension)
outputDims[i] = rightOperandShape[i];
else if ((leftOperandShape[i] != NDShape::InferredDimension) && (rightOperandShape[i] == NDShape::InferredDimension))
else if (rightOperandShape[i] == NDShape::InferredDimension)
outputDims[i] = leftOperandShape[i];
else
{
@ -126,7 +131,7 @@ namespace CNTK
return NDShape(std::move(outputDims));
}
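// Worked example (illustration only): with leftOperandShape = {3, 4} and rightOperandShape = {3},
// the axes of the lower-rank operand align with the leading axes of the higher-rank one, giving an
// output shape of {3, 4}; an InferredDimension on one side is filled in from the other side.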
static NDShape TimesOpOutputShape(const NDShape& leftOperandShape, const NDShape& rightOperandShape, bool broadcastAllowed = true)
static NDShape TimesOpOutputShape(const NDShape& leftOperandShape, const NDShape& rightOperandShape)
{
if (rightOperandShape.NumAxes() > 2)
RuntimeError("The right operand of a times operation can have at most 2 axes");
@ -166,6 +171,7 @@ namespace CNTK
return NDShape(std::move(outputDims));
}
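// Worked example (illustration only): a left operand of shape {outputDim, inputDim} times a right
// operand of shape {inputDim} yields {outputDim}, and a right operand of shape {inputDim, numCols}
// yields {outputDim, numCols}; the right operand may have at most 2 axes, as checked above.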
// TODO: Reconcile this with the ComputationNode::Validate functionality in core CNTK to avoid duplication of inference logic
static std::vector<Variable> GetOutputVariables(PrimitiveOpType op, const std::vector<Variable>& inputs, Function* owner)
{
std::vector<Variable> outputs;
@ -175,9 +181,9 @@ namespace CNTK
// We currently require that the inputs' dynamic axes if any match
std::vector<Axis> outputDynamicAxes = inputs[0].DynamicAxes();
for (size_t i = 1; i < inputs.size(); ++i)
for (auto inputVar : inputs)
{
auto currentInputDynamicAxes = inputs[i].DynamicAxes();
auto currentInputDynamicAxes = inputVar.DynamicAxes();
if (outputDynamicAxes.empty())
outputDynamicAxes = currentInputDynamicAxes;
else
@ -210,7 +216,7 @@ namespace CNTK
outputs.push_back(Variable(TimesOpOutputShape(inputs[0].Shape(), inputs[1].Shape()), outputDataType, owner, outputDynamicAxes));
break;
case PrimitiveOpType::CrossEntropyWithSoftmax:
case PrimitiveOpType::PredictionError:
case PrimitiveOpType::ClassificationError:
{
assert(inputs.size() == 2);
@ -274,65 +280,69 @@ namespace CNTK
private:
std::pair<Variable, int64_t> m_evalTimeStamp;
};
typedef _Internal::_ReferenceCounterSharedPtr<CNTKBackPropState> CNTKBackPropStatePtr;
typedef std::shared_ptr<CNTKBackPropState> CNTKBackPropStatePtr;
class CompositeFunction;
typedef _Internal::_ReferenceCounterSharedPtr<CompositeFunction> CompositeFunctionPtr;
typedef std::shared_ptr<CompositeFunction> CompositeFunctionPtr;
class CompositeFunction final : public Function
{
friend class Function;
template <typename T, typename ...CtorArgTypes>
friend inline std::shared_ptr<T> MakeSharedObject(CtorArgTypes&& ...ctorArgs);
public:
static CompositeFunctionPtr Create(const FunctionPtr& rootFunction, const std::wstring& name = L"")
{
_Internal::_SimpleSet<FunctionPtr> visitedFunctions;
std::unordered_set<FunctionPtr> visitedFunctions;
// Call _DetermineInputs to get the set of all functions in the graph
_DetermineInputs(rootFunction, visitedFunctions);
// Call DetermineInputs to get the set of all functions in the graph
DetermineInputs(rootFunction, visitedFunctions);
auto func = new CompositeFunction(rootFunction, std::move(visitedFunctions), name);
return CompositeFunctionPtr(func, [](_ReferenceCounter* ptr) { delete ptr; });
return MakeSharedObject<CompositeFunction>(rootFunction, std::move(visitedFunctions), name);
}
virtual BackPropStatePtr Forward(const _Internal::_SimpleMap<Variable, const ValuePtr>& arguments,
_Internal::_SimpleMap<Variable, ValuePtr>& outputs,
const _Internal::_SimpleSet<Variable>& outputsToRetainBackwardStateFor,
const DeviceDescriptor& computeDevice) override;
virtual BackPropStatePtr Forward(const std::unordered_map<Variable, const ValuePtr>& arguments,
std::unordered_map<Variable, ValuePtr>& outputs,
const DeviceDescriptor& computeDevice,
const std::unordered_set<Variable>& outputsToRetainBackwardStateFor) override;
virtual void Backward(const BackPropStatePtr& state,
const _Internal::_SimpleMap<Variable, const ValuePtr>& rootGradientValues,
_Internal::_SimpleMap<Variable, ValuePtr>& backPropagatedGradientValuesForInputs) override;
const std::unordered_map<Variable, const ValuePtr>& rootGradientValues,
std::unordered_map<Variable, ValuePtr>& backPropagatedGradientValuesForInputs) override;
private:
virtual void _ReplacePlaceholders(const _Internal::_SimpleMap<Placeholder, Variable>& placeholderReplacements, _Internal::_SimpleSet<const Function*>& visitedFunctions, _Internal::_SimpleSet<Placeholder>& replacedPlaceholders) override;
virtual void ReplacePlaceholders(const std::unordered_map<Placeholder, Variable>& placeholderReplacements,
std::unordered_set<const Function*>& visitedFunctions,
std::unordered_set<Placeholder>& replacedPlaceholders) override;
CompositeFunction(const FunctionPtr& rootFunction, _Internal::_SimpleSet<FunctionPtr>&& allPrimitiveFunctions, const std::wstring& name)
CompositeFunction(const FunctionPtr& rootFunction, std::unordered_set<FunctionPtr>&& allPrimitiveFunctions, const std::wstring& name)
: Function({}, rootFunction->Outputs(), rootFunction, name), m_allPrimitiveFunctions(std::move(allPrimitiveFunctions))
{
}
std::vector<Variable> DetermineInputs() const
{
_Internal::_SimpleSet<FunctionPtr> visitedFunctions;
return _DetermineInputs(RootFunction(), visitedFunctions);
std::unordered_set<FunctionPtr> visitedFunctions;
return DetermineInputs(RootFunction(), visitedFunctions);
}
static std::vector<Variable> _DetermineInputs(const FunctionPtr& rootFunction, _Internal::_SimpleSet<FunctionPtr>& visitedFunctions)
// Recursively traverses the Function graph underlying the 'rootFunction' to determine all the leaves (aka inputs) of the graph
static std::vector<Variable> DetermineInputs(const FunctionPtr& rootFunction, std::unordered_set<FunctionPtr>& visitedFunctions)
{
visitedFunctions.Insert(rootFunction);
visitedFunctions.insert(rootFunction);
std::vector<Variable> inputs;
std::vector<Variable> rootFunctionInputs = rootFunction->Inputs();
for (size_t i = 0; i < rootFunctionInputs.size(); ++i)
for (auto rootInput : rootFunctionInputs)
{
Variable currentInput = rootFunctionInputs[i];
if (currentInput.Kind() != VariableKind::Output)
inputs.push_back(currentInput);
else if (!visitedFunctions.Contains(currentInput.Owner()))
if (!rootInput.IsOutput())
inputs.push_back(rootInput);
else if (visitedFunctions.find(rootInput.Owner()) == visitedFunctions.end())
{
FunctionPtr function = currentInput.Owner();
std::vector<Variable> functionInputs = _DetermineInputs(function, visitedFunctions);
FunctionPtr function = rootInput.Owner();
std::vector<Variable> functionInputs = DetermineInputs(function, visitedFunctions);
std::copy(functionInputs.begin(), functionInputs.end(), std::back_inserter(inputs));
}
}
@ -341,7 +351,7 @@ namespace CNTK
}
template <typename ElementType>
Microsoft::MSR::CNTK::ComputationNetworkPtr GetComputationNetwork(const DeviceDescriptor& device, const _Internal::_SimpleSet<Variable>& backpropRoots);
Microsoft::MSR::CNTK::ComputationNetworkPtr GetComputationNetwork(const DeviceDescriptor& device, const std::unordered_set<Variable>& backpropRoots);
template <typename ElementType>
static Microsoft::MSR::CNTK::ComputationNodeBasePtr GetOutputVariableNode(const Variable& variable, Microsoft::MSR::CNTK::ComputationNetworkPtr& network, Microsoft::MSR::CNTK::ComputationNetworkBuilder<ElementType>& builder, std::unordered_map<Variable, Microsoft::MSR::CNTK::ComputationNodeBasePtr>& variableToNodeMap, std::unordered_map<Variable, bool>& isVariableRootMap);
@ -349,8 +359,13 @@ namespace CNTK
template <typename ElementType>
static Microsoft::MSR::CNTK::ComputationNodeBasePtr GetNode(const Variable& variable, Microsoft::MSR::CNTK::ComputationNetworkPtr& network, Microsoft::MSR::CNTK::ComputationNetworkBuilder<ElementType>& builder, std::unordered_map<Variable, Microsoft::MSR::CNTK::ComputationNodeBasePtr>& variableToNodeMap, std::unordered_map<Variable, bool>& isVariableRootMap);
void PopulateNetworkInputs(const _Internal::_SimpleMap<Variable, const ValuePtr>& arguments);
void PopulateNetworkGradients(const _Internal::_SimpleMap<Variable, const ValuePtr>& gradients);
template <typename ElementType>
static void PopulateComputationNodeValue(const std::pair<Variable, ValuePtr>& variableValue, Microsoft::MSR::CNTK::ComputationNodeBasePtr& computationNode);
void PopulateNetworkInputs(const std::unordered_map<Variable, const ValuePtr>& arguments);
template <typename ElementType>
static void PopulateComputationNodeGradient(const std::pair<Variable, ValuePtr>& variableGradient, Microsoft::MSR::CNTK::ComputationNodeBasePtr& computationNode);
void PopulateNetworkGradients(const std::unordered_map<Variable, const ValuePtr>& gradients);
void GetNetworkOutputs(std::unordered_map<Variable, ValuePtr>& outputs);
void GetNetworkGradients(std::unordered_map<Variable, ValuePtr>& gradients);
@ -362,10 +377,23 @@ namespace CNTK
static ValuePtr GetValueObjectFromCNTKImplMatrixAndMBLayout(Variable var, const Microsoft::MSR::CNTK::Matrix<ElementType>& matrix, const Microsoft::MSR::CNTK::MBLayoutPtr& layout);
private:
_Internal::_SimpleSet<FunctionPtr> m_allPrimitiveFunctions;
// Set of all primitive functions in the graph underlying 'this' Function. Also keeps the primitive Function objects alive
// by holding strong references to them
std::unordered_set<FunctionPtr> m_allPrimitiveFunctions;
// A map from Variable objects to ComputationNode objects in the ComputationNetwork instance that implements 'this' Composite Function
std::unordered_map<Variable, Microsoft::MSR::CNTK::ComputationNodeBasePtr> m_variableToNodeMap;
// A map that tells whether a Variable in the graph underlying 'this' Function is a root of the graph
std::unordered_map<Variable, bool> m_isVariableRootMap;
Microsoft::MSR::CNTK::ComputationNetworkPtr m_computationNetwork;
// The backpropRoots specified in the most recent 'Forward' call on 'this' Function.
// This indicates for which of its roots 'this' Function has retained the required intermediate
// state from the previous 'Forward' call, so that gradients can be backpropagated from them
// in the next 'Backward' call.
std::unordered_set<Variable> m_currentBackpropRoots;
};
}

Просмотреть файл

@ -0,0 +1,464 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "Learner.h"
#include "TensorView.h"
#include "Utils.h"
#define UPDATE_FUNCTION \
switch (smoothedGradientValue->Data()->GetDataType()) \
{ \
case DataType::Float: \
Update<float>(parameter, smoothedGradientValue, gradientValue, parameterValue, trainingSampleCount); \
break; \
case DataType::Double: \
Update<double>(parameter, smoothedGradientValue, gradientValue, parameterValue, trainingSampleCount); \
break; \
default: \
NOT_IMPLEMENTED; \
}
using namespace Microsoft::MSR::CNTK;
using namespace std;
namespace CNTK
{
template <typename ElementType>
/*static*/ shared_ptr<const Matrix<ElementType>> LearnerBase::GetMatrix(const NDArrayViewPtr arrayView)
{
return arrayView->GetMatrix<ElementType>();
}
template <typename ElementType>
/*static*/ shared_ptr<Matrix<ElementType>> LearnerBase::GetWritableMatrix(NDArrayViewPtr arrayView)
{
return arrayView->GetWritableMatrix<ElementType>();
}
template <typename ElementType>
/*static*/ const TensorView<ElementType>* LearnerBase::GetTensorView(const NDArrayViewPtr arrayView)
{
return arrayView->GetTensorView<ElementType>();
}
/*static*/ bool LearnerBase::HasNan(const ValuePtr& value, const char* name)
{
const auto& data = value->Data();
switch (data->GetDataType())
{
case DataType::Float:
return data->GetMatrix<float>()->HasNan(name);
case DataType::Double:
return data->GetMatrix<double>()->HasNan(name);
default:
LogicError("Unsupported DataType %s", DataTypeName(data->GetDataType()));
}
}
/*static*/ void LearnerBase::Print(const ValuePtr& value, const char* msg)
{
const auto& data = value->Data();
switch (data->GetDataType())
{
case DataType::Float:
data->GetMatrix<float>()->Print(msg);
break;
case DataType::Double:
data->GetMatrix<double>()->Print(msg);
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(data->GetDataType()));
}
}
// Clips gradients to prevent outliers.
template <typename ElementType>
void LearnerBase::ClipGradient(Matrix<ElementType>& gradient, size_t actualMBSize) const
{
if (m_additionalOptions.gradientClippingThresholdPerSample != numeric_limits<double>::infinity())
{
double maxGradientPerMB = m_additionalOptions.gradientClippingThresholdPerSample * actualMBSize;
if (m_additionalOptions.gradientClippingWithTruncation)
gradient.InplaceTruncate(ElementType(maxGradientPerMB));
else
{
// norm2 normalized
double gradientNorm = gradient.FrobeniusNorm();
if (gradientNorm > maxGradientPerMB)
{
double normFactor = maxGradientPerMB / gradientNorm;
gradient *= ElementType(normFactor);
}
}
}
}
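// Standalone sketch (illustration only, plain C++): the scale factor applied by the norm-based
// branch above; when the gradient's Frobenius norm exceeds thresholdPerSample * minibatch size,
// the gradient is rescaled so that its norm equals that limit, otherwise it is left unchanged.
static double ClippedGradientScaleSketch(double gradientNorm, double thresholdPerSample, size_t actualMBSize)
{
    double maxGradientPerMB = thresholdPerSample * actualMBSize;
    return (gradientNorm > maxGradientPerMB) ? (maxGradientPerMB / gradientNorm) : 1.0;
}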
// Performs additional preprocessing before calling the update method
// (gradient clipping and L2 regularization depending on the additional learning parameters).
template <typename ElementType>
void LearnerBase::PreProcess(const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t actualMBSize) const
{
const auto& gradientMatrix = gradientValue->Data()->GetWritableMatrix<ElementType>();
// clipping gradients to prevent outliers
ClipGradient<ElementType>(*gradientMatrix, actualMBSize);
// L2 regularizer
if (m_additionalOptions.l2RegularizationWeight > 0)
{
// multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample
auto weight = ElementType(m_additionalOptions.l2RegularizationWeight * actualMBSize);
const auto& parameterMatrix = parameterValue->Data()->GetWritableMatrix<ElementType>();
Matrix<ElementType>::ScaleAndAdd(weight, *parameterMatrix, *gradientMatrix);
}
}
// Performs additional postprocessing after the update method has been executed
// (noise injection and L1 regularization specified by the additional learning parameters).
template <typename ElementType>
void LearnerBase::PostProcess(const Variable& parameter, const ValuePtr& gradientValue,
const ValuePtr& parameterValue, size_t actualMBSize) const
{
const auto& parameterMatrix = parameterValue->Data()->GetWritableMatrix<ElementType>();
if (m_additionalOptions.gaussianNoiseInjectionStdDev > 0)
{
const auto& gradientMatrix = gradientValue->Data()->GetWritableMatrix<ElementType>();
Matrix<ElementType> sgdUpdateNoise((DEVICEID_TYPE)parameterMatrix->GetDeviceId());
// get the gradient structure since gradient is sparse
sgdUpdateNoise.SetValue(*gradientMatrix);
auto noiseStdDev = ElementType(m_additionalOptions.gaussianNoiseInjectionStdDev);
// reset its value to random
sgdUpdateNoise.SetGaussianRandomValue(ElementType(0.0), noiseStdDev);
Matrix<ElementType>::ScaleAndAdd(ElementType(1.0), sgdUpdateNoise, *parameterMatrix);
}
// L1 regularizer with proximal gradient descent method
if (m_additionalOptions.l1RegularizationWeight > 0)
{
auto learningRate = ElementType(ParameterDependentLearningRate(parameter));
// multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample
auto weight = ElementType(learningRate * m_additionalOptions.l1RegularizationWeight * actualMBSize);
parameterValue->Data()->GetWritableMatrix<ElementType>()->InplaceSoftThreshold(weight);
}
}
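// Standalone sketch (illustration only) of the proximal step behind InplaceSoftThreshold above:
// each parameter is shrunk towards zero by 'weight' and set to zero once its magnitude falls below it.
static double SoftThresholdSketch(double parameter, double weight)
{
    if (parameter > weight)
        return parameter - weight;
    if (parameter < -weight)
        return parameter + weight;
    return 0.0;
}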
template <typename ElementType>
/*static*/ TensorView<ElementType>* LearnerBase::GetWritableTensorView(NDArrayViewPtr arrayView)
{
return arrayView->GetWritableTensorView<ElementType>();
}
LearnerBase::LearnerBase(const unordered_set<Variable>& parameters, const DeviceDescriptor& device)
: Learner(parameters),
m_learningRatePerSample(0.0),
m_sampleCount(0)
{
const unordered_set<Variable>& parameterSet = parameters;
for (const auto& parameter : parameterSet)
{
// TODO: using the same device to allocate data for all smoothed gradients. Is this correct?
// Should the device be specified on a per-parameter basis?
NDArrayViewPtr view;
if (parameter.GetDataType() == DataType::Float)
{
view = MakeSharedObject<NDArrayView>(0.0f, parameter.Shape(), device);
}
else
{
view = MakeSharedObject<NDArrayView>(0.0, parameter.Shape(), device);
}
m_smoothedGradientValues.insert(make_pair(parameter, MakeSharedObject<Value>(view)));
m_additionalOptions.learningRateMultipliers.insert(make_pair(parameter, 1.0));
}
}
void LearnerBase::ResetSmoothedGradients()
{
for (const auto& parameter : Parameters())
{
const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
const auto& data = smoothedGradientValue->Data();
switch (data->GetDataType())
{
case DataType::Float:
data->SetValue(0.0f);
break;
case DataType::Double:
data->SetValue(0.0);
break;
default:
LogicError("Unsupported DataType %s", ::CNTK::DataTypeName(data->GetDataType()));
}
}
}
/*virtual*/ bool LearnerBase::Update(const unordered_map<Variable, ValuePtr>& parameterValues,
const unordered_map<Variable, const ValuePtr>& gradientValues,
size_t trainingSampleCount) /*override*/
{
// make sure trainingSampleCount is a valid value
assert(trainingSampleCount > 0);
for (const auto& parameter : Parameters())
{
const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
const auto& gradientValue = gradientValues.at(parameter);
const auto& parameterValue = parameterValues.at(parameter);
// TODO: make this a runtime parameter.
#if DUMPOUTPUT
LOGPRINTF(stderr, "Update_%ls\n", parameter.Name().c_str());
#endif
#ifdef _DEBUG
if (HasNan(smoothedGradientValue, "TrainOneEpoch/UpdateWeights/Learner::Update(): "))
LogicError("%ls has NaNs in smoothedGradient.", parameter.Name().c_str());
#endif
#if DUMPOUTPUT
LOGPRINTF(stderr, "learnRatePerSample=%0.8f, momentum=%0.8f, actualMBSize=%ld\n",
m_learningRatePerSample, m_momentumPerSample, trainingSampleCount);
LOGPRINTF(stderr, "GradUpdateType()=%s, GradientUpdateNoiseStd()=%0.8f\n",
LearnerType().c_str(), m_GaussianNoiseInjectStd);
Print(gradientValue, "Gradient Update");
Print(smoothedGradientValue, "Smoothed Gradient Input");
#endif
UPDATE_FUNCTION;
#if DUMPOUTPUT
Print(parameterValue, "Parameter Update");
#endif
#ifdef _DEBUG
if (HasNan(parameterValue, "TrainOneEpoch/UpdateWeights/Learner::Update(): "))
LogicError("%ls has NaNs in parameter values after parameter update.", parameter.Name().c_str());
#endif
}
m_sampleCount += trainingSampleCount;
return false;
}
template <typename ElementType>
void LearnerBase::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const
{
PreProcess<ElementType>(gradientValue, parameterValue, trainingSampleCount);
Update(parameter, smoothedGradientValue, gradientValue, parameterValue, trainingSampleCount);
PostProcess<ElementType>(parameter, gradientValue, parameterValue, trainingSampleCount);
}
string LearnerBase::LearnerType() const
{
auto name = typeid(*this).name();
if (strncmp(name, "class ", 6) == 0)
{
// On Windows, the type name contains "class" prefix.
// Return the actual name, omitting the prefix.
return &name[6];
}
return name;
}
/*virtual*/ Dictionary LearnerBase::GetCheckpointState() const /*override*/
{
NOT_IMPLEMENTED; // Until the new checkpointing is fully fleshed out, nobody should be calling this.
Dictionary checkpoint;
for (const auto& parameter : Parameters())
{
// TODO: parameter name is not guaranteed to be unique. Instead, all serializable objects
// need to expose "UId" property -- a persistent unique internal name.
// Switch to UId as soon as it's available.
if (checkpoint.Contains(parameter.Name()))
{
LogicError("Parameter names must be unique");
}
const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
// Potentially, could store things like dimensions, element size, format, etc., but
// that seems to be redundant, since all of that is passed in the constructor.
checkpoint[parameter.Name()] = SerializeToVector(smoothedGradientValue->Data());
}
return checkpoint;
}
/*virtual*/ void LearnerBase::RestoreFromCheckpoint(const Dictionary& checkpoint) /*override*/
{
NOT_IMPLEMENTED; // Until the new checkpointing is fully fleshed out, nobody should be calling this.
for (const auto& parameter : Parameters())
{
if (!checkpoint.Contains(parameter.Name()))
{
LogicError("Checkpoint does not contain state for parameter %ls", parameter.Name().c_str());
}
const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
const DictionaryValue& state = checkpoint[parameter.Name()];
const auto& data = smoothedGradientValue->Data();
DeserializeFromVector(data, state.GetValue<vector<DictionaryValue>>());
}
}
/*virtual*/ void LearnerSGD::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const /*override*/
{
UPDATE_FUNCTION;
}
template <typename ElementType>
void LearnerSGD::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const
{
UNUSED(trainingSampleCount);
const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue->Data());
const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue->Data());
const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue->Data());
const auto& learningRate = ElementType(ParameterDependentLearningRate(parameter));
// TODO: break up the NormalGrad into 3 different functions, each with its own set of parameters
// (one for vanilla SGD, the other for momentum SGD, and the third one for NAG).
smoothedGradientMatrix->NormalGrad(*gradientMatrix, *parameterMatrix,
learningRate, ElementType(m_momentumPerSample), m_useNesterovAcceleration);
}
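// Standalone per-element sketch (illustration only) of a common momentum-SGD formulation; the exact
// scaling applied inside NormalGrad (and its Nesterov variant) may differ, so this is indicative only.
static void MomentumSgdStepSketch(double& smoothedGradient, double& parameter,
                                  double gradient, double learningRate, double momentum)
{
    // Accumulate a momentum-weighted history of per-sample steps, then move the parameter along it
    smoothedGradient = momentum * smoothedGradient + (1.0 - momentum) * learningRate * gradient;
    parameter -= smoothedGradient;
}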
LearnerAdaGrad::LearnerAdaGrad(const unordered_set<Variable>& parameters, bool needAveMultiplier, const DeviceDescriptor& device)
: LearnerBase(parameters, device),
m_needAveMultiplier(needAveMultiplier)
{
}
/*virtual*/ void LearnerAdaGrad::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const /*override*/
{
UPDATE_FUNCTION;
}
template <typename ElementType>
void LearnerAdaGrad::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const
{
UNUSED(trainingSampleCount);
const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue->Data());
const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue->Data());
const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue->Data());
auto learningRate = ElementType(ParameterDependentLearningRate(parameter));
auto aveMultiplier = smoothedGradientMatrix->Adagrad(*gradientMatrix, m_needAveMultiplier);
Matrix<ElementType>::ScaleAndAdd(ElementType(-learningRate / aveMultiplier), *gradientMatrix, *parameterMatrix);
}
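// Standalone per-element sketch (illustration only) of the classic AdaGrad rule that the Adagrad()
// call above implements in matrix form; the aveMultiplier handling is CNTK-specific and omitted here.
// Assumes <cmath> is available for std::sqrt.
static void AdaGradStepSketch(double& accumulatedSquaredGradient, double& parameter,
                              double gradient, double learningRate, double epsilon = 1e-8)
{
    accumulatedSquaredGradient += gradient * gradient;
    parameter -= learningRate * gradient / (std::sqrt(accumulatedSquaredGradient) + epsilon);
}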
LearnerFSAdaGrad::LearnerFSAdaGrad(const unordered_set<Variable>& parameters, const DeviceDescriptor& device)
: LearnerMomentumSGD(parameters, device)
{
}
/*virtual*/ void LearnerFSAdaGrad::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const /*override*/
{
UPDATE_FUNCTION;
}
template <typename ElementType>
void LearnerFSAdaGrad::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const
{
UNUSED(trainingSampleCount);
const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue->Data());
const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue->Data());
const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue->Data());
//const double momentum = MomentumPerMB(m_momentumPerSample, trainingSampleCount);
auto learningRate = ElementType(ParameterDependentLearningRate(parameter));
smoothedGradientMatrix->FSAdagrad(trainingSampleCount, *gradientMatrix, *parameterMatrix,
learningRate, ElementType(m_momentumPerSample));
}
LearnerRMSProp::LearnerRMSProp(const unordered_set<Variable>& parameters,
double gamma, double inc, double dec, double max, double min,
bool needAveMultiplier, const DeviceDescriptor& device)
: LearnerBase(parameters, device),
m_gamma(gamma), m_inc(inc), m_dec(dec), m_max(max), m_min(min),
m_needAveMultiplier(needAveMultiplier)
{
}
/*virtual*/ void LearnerRMSProp::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const /*override*/
{
UPDATE_FUNCTION;
}
template <typename ElementType>
void LearnerRMSProp::Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const
{
UNUSED(trainingSampleCount);
const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue->Data());
const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue->Data());
const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue->Data());
auto learningRate = ElementType(ParameterDependentLearningRate(parameter));
auto aveMultiplier = smoothedGradientMatrix->RmsProp(*gradientMatrix,
ElementType(m_gamma), ElementType(m_inc),
ElementType(m_max), ElementType(m_dec),
ElementType(m_min), m_needAveMultiplier);
Matrix<ElementType>::ScaleAndAdd(ElementType(-learningRate / aveMultiplier), *gradientMatrix, *parameterMatrix);
}
// Explicit template instantiations
template shared_ptr<Matrix<float>> LearnerBase::GetWritableMatrix<float>(const NDArrayViewPtr arrayView);
template shared_ptr<Matrix<double>> LearnerBase::GetWritableMatrix<double>(const NDArrayViewPtr arrayView);
LearnerPtr SGDLearner(const unordered_set<Variable>& parameters, const DeviceDescriptor& device)
{
return MakeSharedObject<LearnerSGD>(parameters, device);
}
LearnerPtr MomentumSGDLearner(const unordered_set<Variable>& parameters, const DeviceDescriptor& device)
{
return MakeSharedObject<LearnerMomentumSGD>(parameters, device);
}
LearnerPtr NesterovLearner(const unordered_set<Variable>& parameters, const DeviceDescriptor& device)
{
return MakeSharedObject<LearnerNesterov>(parameters, device);
}
LearnerPtr AdaGradLearner(const unordered_set<Variable>& parameters, bool needAveMultiplier, const DeviceDescriptor& device)
{
return MakeSharedObject<LearnerAdaGrad>(parameters, needAveMultiplier, device);
}
LearnerPtr FSAdaGradLearner(const unordered_set<Variable>& parameters, const DeviceDescriptor& device)
{
return MakeSharedObject<LearnerFSAdaGrad>(parameters, device);
}
LearnerPtr RMSPropLearner(const unordered_set<Variable>& parameters,
double gamma, double inc, double dec, double max, double min, bool needAveMultiplier,
const DeviceDescriptor& device)
{
return MakeSharedObject<LearnerRMSProp>(parameters, gamma, inc, dec, max, min, needAveMultiplier, device);
}
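// Illustrative usage sketch (not part of this change): 'parameters', 'parameterValues' and
// 'gradientValues' are assumed to already exist and to be keyed by the same Variable objects.
static void LearnerUsageSketch(const std::unordered_set<Variable>& parameters,
                               const std::unordered_map<Variable, ValuePtr>& parameterValues,
                               const std::unordered_map<Variable, const ValuePtr>& gradientValues,
                               size_t minibatchSampleCount)
{
    auto learner = SGDLearner(parameters, DeviceDescriptor::DefaultDevice());
    learner->Update(parameterValues, gradientValues, minibatchSampleCount);
}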
}

Просмотреть файл

@ -0,0 +1,224 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#include "CNTKLibrary.h"
namespace CNTK
{
// A collection of additional options that are applicable for all standard learners
// (after these options are set, they retain their value for the entire lifespan of a learner).
struct AdditionalLearningOptions
{
double l1RegularizationWeight = 0.0;
double l2RegularizationWeight = 0.0;
double gaussianNoiseInjectionStdDev = 0.0;
bool gradientClippingWithTruncation = false;
double gradientClippingThresholdPerSample = 0.0;
std::unordered_map<Variable, double> learningRateMultipliers;
};
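// Example (illustration only): a caller might set, e.g., l2RegularizationWeight = 0.0005 and
// gradientClippingThresholdPerSample = 1.0 (hypothetical values) on an AdditionalLearningOptions
// instance and hand it to a learner via SetAdditionalOptions below.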
// An abstract base class at the root of the standard learners hierarchy
// It implements most of the learner functionality, except for the actual update function,
// and adds a few pre-/postprocessing methods (which are invoked before and after the update).
class LearnerBase : public Learner
{
public:
CNTK_API virtual bool Update(const std::unordered_map<Variable, ValuePtr>& parameterValues,
const std::unordered_map<Variable, const ValuePtr>& gradientValues,
size_t trainingSampleCount) override final;
CNTK_API virtual Dictionary GetCheckpointState() const override;
CNTK_API virtual void RestoreFromCheckpoint(const Dictionary& checkpoint) override;
CNTK_API void SetAdditionalOptions(const AdditionalLearningOptions& additionalOptions)
{
m_additionalOptions = additionalOptions;
}
// TODO: should this be called ResetMomentum?
// needed for BlockMomentumSGD to reset SGD momentum after aggregation.
CNTK_API void ResetSmoothedGradients();
// TODO: move learning rate and momentum scheduling and adjustment functionality
// inside the learner and drop these setters.
void SetLearningRate(double value) { m_learningRatePerSample = value; }
protected:
LearnerBase(const std::unordered_set<Variable>& parameters,
const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice());
virtual void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const = 0;
double ParameterDependentLearningRate(const Variable& parameter) const
{
return m_learningRatePerSample * m_additionalOptions.learningRateMultipliers.at(parameter);
}
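// For example (illustration only): with m_learningRatePerSample = 0.01 and a multiplier of 0.1
// registered for a given parameter, that parameter is updated with an effective per-sample rate of
// 0.001, while parameters keeping the default multiplier of 1.0 use the full 0.01.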
std::string LearnerType() const;
double m_learningRatePerSample;
AdditionalLearningOptions m_additionalOptions;
std::unordered_map<Variable, ValuePtr> m_smoothedGradientValues;
// The following four static protected methods expose private methods of the NDArrayView class
// (which declares LearnerBase as a friend class), so that they are available to subclasses.
template <typename ElementType>
static std::shared_ptr<const Microsoft::MSR::CNTK::Matrix<ElementType>> GetMatrix(const NDArrayViewPtr arrayView);
template <typename ElementType>
static std::shared_ptr<Microsoft::MSR::CNTK::Matrix<ElementType>> GetWritableMatrix(NDArrayViewPtr arrayView);
template <typename ElementType>
static const Microsoft::MSR::CNTK::TensorView<ElementType>* GetTensorView(const NDArrayViewPtr arrayView);
template <typename ElementType>
static Microsoft::MSR::CNTK::TensorView<ElementType>* GetWritableTensorView(NDArrayViewPtr arrayView);
template <typename ElementType>
void ClipGradient(Microsoft::MSR::CNTK::Matrix<ElementType>& gradient, size_t actualMBSize) const;
// Performs additional preprocessing before calling the update method
// (gradient clipping and L2 regularization depending on the additional learning parameters).
template <typename ElementType>
void PreProcess(const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t actualMBSize) const;
// Performs additional postprocessing after the update method has been executed
// (noise injection and L1 regularization specified by the additional learning parameters).
template <typename ElementType>
void PostProcess(const Variable& parameter, const ValuePtr& gradientValue,
const ValuePtr& parameterValue, size_t actualMBSize) const;
private:
// Templatized update function; it invokes preprocess and postprocess using the provided
// template parameter and also invokes the virtual Update method implemented in one of the subclasses.
template <typename ElementType>
void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const;
// TODO: make these functions friends of NDArrayView and move to Utils?
static bool HasNan(const ValuePtr& value, const char* name);
static void Print(const ValuePtr& value, const char* msg);
size_t m_sampleCount;
};
// Vanilla gradient descent optimization algorithm.
class LearnerSGD : public LearnerBase
{
public:
LearnerSGD(const std::unordered_set<Variable>& parameters,
const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice())
: LearnerBase(parameters, device),
m_momentumPerSample(0.0),
m_useNesterovAcceleration(false)
{
}
protected:
virtual void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const override;
template <typename ElementType>
void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const;
double m_momentumPerSample;
bool m_useNesterovAcceleration;
};
// SGD optimization with momentum.
class LearnerMomentumSGD : public LearnerSGD
{
public:
LearnerMomentumSGD(const std::unordered_set<Variable>& parameters,
const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice())
: LearnerSGD(parameters, device)
{
}
void SetMomentum(double value) { m_momentumPerSample = value; }
};
// Nesterov's accelerated gradient descent.
class LearnerNesterov : public LearnerSGD
{
public:
LearnerNesterov(const std::unordered_set<Variable>& parameters,
const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice())
: LearnerSGD(parameters, device)
{
m_useNesterovAcceleration = true;
}
};
class LearnerAdaGrad : public LearnerBase
{
public:
LearnerAdaGrad(const std::unordered_set<Variable>& parameters, bool needAveMultiplier,
const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice());
protected:
bool m_needAveMultiplier;
virtual void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const override;
template <typename ElementType>
void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const;
};
class LearnerFSAdaGrad : public LearnerMomentumSGD
{
public:
LearnerFSAdaGrad(const std::unordered_set<Variable>& parameters,
const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice());
protected:
virtual void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const override;
template <typename ElementType>
void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const;
};
class LearnerRMSProp : public LearnerBase
{
public:
LearnerRMSProp(const std::unordered_set<Variable>& parameters,
double gamma, double inc, double dec, double max, double min, bool needAveMultiplier,
const DeviceDescriptor& device = DeviceDescriptor::DefaultDevice());
protected:
double m_gamma;
double m_inc;
double m_dec;
double m_max;
double m_min;
bool m_needAveMultiplier;
virtual void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const override;
template <typename ElementType>
void Update(const Variable& parameter, const ValuePtr& smoothedGradientValue,
const ValuePtr& gradientValue, const ValuePtr& parameterValue, size_t trainingSampleCount) const;
};
}

Просмотреть файл

@ -17,9 +17,9 @@ namespace CNTK
{
template <typename ElementType>
static TensorView<ElementType>* AllocateTensorView(const NDShape& viewShape,
const DeviceDescriptor& device,
void* dataBuffer,
size_t bufferSizeInBytes)
const DeviceDescriptor& device,
void* dataBuffer,
size_t bufferSizeInBytes)
{
if (dataBuffer == nullptr)
InvalidArgument("Cannot create a NDArrayView over a null data buffer");
@ -33,10 +33,10 @@ namespace CNTK
}
static void* AllocateTensorView(CNTK::DataType dataType,
const NDShape& viewShape,
const DeviceDescriptor& device,
void* dataBuffer,
size_t bufferSizeInBytes)
const NDShape& viewShape,
const DeviceDescriptor& device,
void* dataBuffer,
size_t bufferSizeInBytes)
{
switch (dataType)
{
@ -60,7 +60,7 @@ namespace CNTK
matrixDims.second,
AsCNTKImplDeviceId(device),
IsSparseStorageFormat(storageType) ? MatrixType::SPARSE : MatrixType::DENSE,
AsCNTKMatrixFormat(storageType));
AsCNTKImplMatrixFormat(storageType));
return new TensorView<ElementType>(matrix, AsTensorShape(viewShape));
}
@ -99,8 +99,22 @@ namespace CNTK
}
NDArrayView::NDArrayView(CNTK::DataType dataType, const DeviceDescriptor& device, CNTK::StorageFormat storageType, const NDShape& viewShape, bool readOnly, void* tensorView)
: m_dataType(dataType), m_device(device), m_storageFormat(storageType), m_viewShape(viewShape), m_isReadOnly(readOnly), m_tensorView(tensorView)
: m_dataType(dataType), m_device(device), m_storageFormat(storageType), m_viewShape(viewShape), m_isReadOnly(readOnly)
{
m_tensorView = std::shared_ptr<void>(tensorView, [this](void*) {
switch (m_dataType)
{
case DataType::Float:
delete GetTensorView<float>();
break;
case DataType::Double:
delete GetTensorView<double>();
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(m_dataType));
break;
}
});
}
NDArrayView::NDArrayView(CNTK::DataType dataType, CNTK::StorageFormat storageType, const NDShape& viewShape, const DeviceDescriptor& device)
@ -108,6 +122,10 @@ namespace CNTK
{
}
NDArrayView::~NDArrayView()
{
}
void NDArrayView::SetValue(float value)
{
if (IsSparse())
@ -124,22 +142,6 @@ namespace CNTK
GetWritableMatrix<double>()->SetValue(value);
}
NDArrayView::~NDArrayView()
{
switch (m_dataType)
{
case DataType::Float:
delete GetTensorView<float>();
break;
case DataType::Double:
delete GetTensorView<double>();
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(m_dataType));
break;
}
}
template <typename ElementType>
/*static*/ std::shared_ptr<Matrix<ElementType>> NDArrayView::GetMatrixImpl(const TensorView<ElementType>* tensorView, size_t rowColSplitPoint)
{
@ -150,7 +152,8 @@ namespace CNTK
size_t splitPoint = rowColSplitPoint;
if (splitPoint == NDArrayView::AutoSelectRowColSplitPoint)
{
// Determine the split point
// Determine the split point by checking which of the axes can be
// folded and selecting the non-foldable axis as the split point
std::vector<bool> dimsToDrop(tensorShape.GetRank(), false);
for (size_t k = 1; k < tensorShape.GetRank(); ++k)
if (tensorShape.CanFlatten(k))
@ -195,9 +198,9 @@ namespace CNTK
const TensorView<ElementType>* NDArrayView::GetTensorView() const
{
if (AsDataType<ElementType>() != m_dataType)
LogicError("NDArrayView::GetWritableTensorView: The specified ElementType %s does not match the DataType %s", typeid(ElementType).name(), DataTypeName(m_dataType));
LogicError("NDArrayView::GetTensorView: The specified ElementType %s does not match the DataType %s", typeid(ElementType).name(), DataTypeName(m_dataType));
return (const TensorView<ElementType>*)(m_tensorView);
return (const TensorView<ElementType>*)(m_tensorView.get());
}
template <typename ElementType>
@ -211,7 +214,7 @@ namespace CNTK
NDArrayViewPtr NDArrayView::DeepClone(bool readOnly/* = false*/) const
{
NDArrayViewPtr newView(new NDArrayView(this->GetDataType(), this->GetStorageFormat(), this->Shape(), this->Device()), [](_ReferenceCounter* ptr) { delete ptr; });
NDArrayViewPtr newView = MakeSharedObject<NDArrayView>(this->GetDataType(), this->GetStorageFormat(), this->Shape(), this->Device());
switch (m_dataType)
{
case DataType::Float:
@ -234,9 +237,7 @@ namespace CNTK
}
newView->m_isReadOnly = readOnly;
return NDArrayViewPtr(newView, [](_ReferenceCounter* ptr) {
delete ptr;
});
return newView;
}
void NDArrayView::CopyFrom(const NDArrayView& source)
@ -285,8 +286,7 @@ namespace CNTK
break;
}
auto aliasView = new NDArrayView(GetDataType(), Device(), GetStorageFormat(), Shape(), IsReadOnly() || readOnly, tensorView);;
return NDArrayViewPtr(aliasView, [](_ReferenceCounter* ptr) { delete ptr; });
return MakeSharedObject<NDArrayView>(GetDataType(), Device(), GetStorageFormat(), Shape(), IsReadOnly() || readOnly, tensorView);
}
// TODO: This could actually be strided?
@ -316,19 +316,18 @@ namespace CNTK
}
template <typename ElementType>
NDArrayViewPtr NDArrayView::RandomUniform(const NDShape& shape, double rangeStart, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/)
NDArrayViewPtr NDArrayView::RandomUniform(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/)
{
auto matrixDims = GetMatrixDimensions(shape);
auto randomUniformMatrix = std::make_shared<Matrix<ElementType>>(Matrix<ElementType>::RandomUniform(matrixDims.first, matrixDims.second, AsCNTKImplDeviceId(device), (ElementType)rangeStart, (ElementType)rangeEnd, seed));
auto randomUniformMatrix = std::make_shared<Matrix<ElementType>>(Matrix<ElementType>::RandomUniform(matrixDims.first, matrixDims.second, AsCNTKImplDeviceId(device), (ElementType)rangeBegin, (ElementType)rangeEnd, seed));
auto tensorView = new TensorView<ElementType>(randomUniformMatrix, AsTensorShape(shape));
auto view = new NDArrayView(AsDataType<ElementType>(), device, StorageFormat::Dense, shape, false, tensorView);
return NDArrayViewPtr(view, [](_ReferenceCounter* ptr) { delete ptr; });
return MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), device, StorageFormat::Dense, shape, false, tensorView);
}
// Explicit template instantiations
template CNTK_API NDArrayViewPtr NDArrayView::RandomUniform<float>(const NDShape& shape, double rangeStart, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/);
template CNTK_API NDArrayViewPtr NDArrayView::RandomUniform<double>(const NDShape& shape, double rangeStart, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/);
template CNTK_API NDArrayViewPtr NDArrayView::RandomUniform<float>(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/);
template CNTK_API NDArrayViewPtr NDArrayView::RandomUniform<double>(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/);
template CNTK_API const float* NDArrayView::DataBuffer<float>() const;
template CNTK_API const double* NDArrayView::DataBuffer<double>() const;
@ -339,8 +338,10 @@ namespace CNTK
template std::shared_ptr<const Matrix<float>> NDArrayView::GetMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const;
template std::shared_ptr<const Matrix<double>> NDArrayView::GetMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const;
template std::shared_ptr<Matrix<float>> NDArrayView::GetWritableMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
template std::shared_ptr<Matrix<double>> NDArrayView::GetWritableMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
template std::shared_ptr<Matrix<float>> NDArrayView::GetWritableMatrix<float>(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
template std::shared_ptr<Matrix<double>> NDArrayView::GetWritableMatrix<double>(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
template TensorView<float>* NDArrayView::GetWritableTensorView<float>();
template TensorView<double>* NDArrayView::GetWritableTensorView<double>();
template CNTK_API NDArrayView::NDArrayView(const NDShape& viewShape, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const float* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly/* = false*/);
template CNTK_API NDArrayView::NDArrayView(const NDShape& viewShape, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const double* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly/* = false*/);

Просмотреть файл

@ -17,15 +17,13 @@ namespace CNTK
static Matrix<char>* AllocateMatrix(const NDShape& viewShape, const DeviceDescriptor& device)
{
auto matrixDims = GetMatrixDimensions(viewShape);
auto maskMatrix = new Matrix<char>(matrixDims.first, matrixDims.second, AsCNTKImplDeviceId(device));
maskMatrix->SetValue(1);
return maskMatrix;
return new Matrix<char>(matrixDims.first, matrixDims.second, AsCNTKImplDeviceId(device));
}
NDMask::NDMask(const NDShape& shape, Matrix<char>* matrix)
: m_device(AsDeviceDescriptor(matrix->GetDeviceId())), m_maskShape(shape), m_matrixView(matrix)
: m_device(AsDeviceDescriptor(matrix->GetDeviceId())), m_maskShape(shape)
{
m_matrixView = std::shared_ptr<Matrix<char>>(matrix, [](Matrix<char>* ptr) { delete ptr; });
}
NDMask::NDMask(const NDShape& shape, const DeviceDescriptor& device/* = DeviceDescriptor::DefaultDevice()*/)
@ -33,16 +31,17 @@ namespace CNTK
{
if (shape.NumAxes() > 2)
LogicError("NDMask instances with more than 2 axes are currently unsupported");
Clear();
}
NDMask::~NDMask()
{
delete m_matrixView;
}
void NDMask::MaskSection(const std::vector<size_t>& sectionOffset, const NDShape& sectionShape)
{
// TODO: Implement batching of masking operation for masks residing on GPUs to avoid making
// TODO: Implement batching of masking operation for masks residing on GPUs to avoid making
// GPU invocations for each MaskSection call.
if (sectionOffset.size() > m_maskShape.NumAxes())
@ -78,12 +77,13 @@ namespace CNTK
void NDMask::Clear()
{
// Clear the mask by marking all samples as Valid; i.e. a value of 1
GetMatrix()->SetValue(1);
}
Matrix<char>* NDMask::GetMatrix() const
{
return m_matrixView;
return m_matrixView.get();
}
void NDMask::CopyFrom(const NDMask& source)
@ -96,14 +96,14 @@ namespace CNTK
NDMaskPtr NDMask::DeepClone() const
{
NDMaskPtr newMask = new NDMask(this->Shape(), this->Device());
NDMaskPtr newMask = MakeSharedObject<NDMask>(this->Shape(), this->Device());
newMask->CopyFrom(*this);
return NDMaskPtr(newMask, [](_ReferenceCounter* ptr) { delete ptr; });
return newMask;
}
NDMaskPtr NDMask::Alias() const
{
return NDMaskPtr(new NDMask(this->Shape(), new Matrix<char>(GetMatrix()->AsReference())), [](_ReferenceCounter* ptr) { delete ptr; });
return MakeSharedObject<NDMask>(this->Shape(), new Matrix<char>(GetMatrix()->AsReference()));
}
}

Просмотреть файл

@ -6,329 +6,138 @@
#include "stdafx.h"
#include "CNTKLibrary.h"
#include "Utils.h"
#include "File.h"
using namespace std;
namespace CNTK
{
namespace _Internal
template <typename T>
void DictionaryValue::AllocateDataPtr(const T& value)
{
#pragma region _SimpleVector
static_assert(is_same<T, NDShape>::value || is_same<T, vector<DictionaryValue>>::value, "AllocateDataPtr called with invalid type");
m_data.m_ptr = new T(value);
}
template <typename T>
_SimpleVector<T>::_SimpleVector()
: m_vector(new std::vector<T>())
{
}
template <typename T>
void DictionaryValue::FreePtrAsType()
{
T* typedPtr = reinterpret_cast<T*>(m_data.m_ptr);
delete typedPtr;
template <typename T>
_SimpleVector<T>::_SimpleVector(size_t numElements, const T& initVal/* = T()*/)
: m_vector(new std::vector<T>(numElements, initVal))
{
}
m_data.m_ptr = nullptr;
}
template <typename T>
_SimpleVector<T>::~_SimpleVector()
{
delete m_vector;
}
void DictionaryValue::FreeDataPtr()
{
if (m_valueType == Type::NDShape)
FreePtrAsType<NDShape>();
else if (m_valueType == Type::Vector)
FreePtrAsType<vector<DictionaryValue>>();
}
template <typename T>
_SimpleVector<T>::_SimpleVector(const _SimpleVector<T>& other)
: m_vector(new std::vector<T>(*other.m_vector))
{
}
Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, DictionaryValue& us)
{
size_t version;
stream >> version;
template <typename T>
_SimpleVector<T>& _SimpleVector<T>::operator=(const _SimpleVector<T>& other)
stream >> us.m_valueType;
switch (us.ValueType())
{
if (this != &other)
case DictionaryValue::Type::Bool:
stream >> us.m_data.m_boolean;
break;
case DictionaryValue::Type::SizeT:
stream >> us.m_data.m_sizeT;
break;
case DictionaryValue::Type::Float:
stream >> us.m_data.m_float;
break;
case DictionaryValue::Type::Double:
stream >> us.m_data.m_double;
break;
case DictionaryValue::Type::NDShape:
{
size_t size;
stream >> size;
vector<size_t> dims(size);
for (auto i = 0; i < size; i++)
{
delete m_vector;
m_vector = new std::vector<T>(*other.m_vector);
stream >> dims[i];
}
return *this;
us.AllocateDataPtr(NDShape(dims));
break;
}
template <typename T>
_SimpleVector<T>::_SimpleVector(_SimpleVector<T>&& other)
: m_vector(nullptr)
case DictionaryValue::Type::Vector:
{
*this = std::move(other);
}
template <typename T>
_SimpleVector<T>& _SimpleVector<T>::operator=(_SimpleVector<T>&& other)
{
assert(this != &other);
delete m_vector;
m_vector = other.m_vector;
other.m_vector = nullptr;
return *this;
}
template <typename T>
T& _SimpleVector<T>::operator[](size_t idx)
{
assert(idx < Size());
return (*m_vector)[idx];
}
template <typename T>
const T& _SimpleVector<T>::operator[](size_t idx) const
{
assert(idx < Size());
return (*m_vector)[idx];
}
template <typename T>
size_t _SimpleVector<T>::Size() const
{
return m_vector->size();
}
template <typename T>
T* _SimpleVector<T>::Data()
{
return m_vector->data();
}
template <typename T>
const T* _SimpleVector<T>::Data() const
{
return m_vector->data();
}
template <typename T>
void _SimpleVector<T>::PushBack(const T& value)
{
m_vector->push_back(value);
}
template <typename T>
void _SimpleVector<T>::PushBack(T&& value)
{
m_vector->push_back(std::move(value));
}
template <typename ValueType>
bool operator==(const _SimpleVector<ValueType>& first, const _SimpleVector<ValueType>& second)
{
return *first.m_vector == *second.m_vector;
}
// Explicit template instantiations
template class _SimpleVector<Variable>;
template class _SimpleVector<size_t>;
template class _SimpleVector<Axis>;
template class _SimpleVector<FunctionPtr>;
template bool operator==(const _SimpleVector<size_t>& first, const _SimpleVector<size_t>& second);
#pragma endregion _SimpleVector
#pragma region _SimpleSet
template <typename KeyType>
_SimpleSet<KeyType>::_SimpleSet()
: m_set(new std::unordered_set<KeyType>())
{
}
template <typename KeyType>
_SimpleSet<KeyType>::~_SimpleSet()
{
delete m_set;
}
template <typename KeyType>
_SimpleSet<KeyType>::_SimpleSet(const _SimpleSet& other)
: m_set(nullptr)
{
*this = other;
}
template <typename KeyType>
_SimpleSet<KeyType>& _SimpleSet<KeyType>::operator=(const _SimpleSet& other)
{
if (this != &other)
size_t size;
stream >> size;
vector<DictionaryValue> values(size);
for (auto i = 0; i < size; i++)
{
delete m_set;
m_set = new std::unordered_set<KeyType>(*(other.m_set));
stream >> values[i];
}
return *this;
us.AllocateDataPtr(values);
break;
}
template <typename KeyType>
_SimpleSet<KeyType>::_SimpleSet(_SimpleSet&& other)
: m_set(nullptr)
{
*this = std::move(other);
default:
NOT_IMPLEMENTED;
}
return stream;
}
template <typename KeyType>
_SimpleSet<KeyType>& _SimpleSet<KeyType>::operator=(_SimpleSet&& other)
Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const DictionaryValue& us)
{
stream << us.version;
stream << us.ValueType();
switch (us.ValueType())
{
assert(this != &other);
delete m_set;
m_set = other.m_set;
other.m_set = nullptr;
return *this;
}
template <typename KeyType>
bool _SimpleSet<KeyType>::Insert(const KeyType& key)
case DictionaryValue::Type::Bool:
stream << us.m_data.m_boolean;
break;
case DictionaryValue::Type::SizeT:
stream << us.m_data.m_sizeT;
break;
case DictionaryValue::Type::Float:
stream << us.m_data.m_float;
break;
case DictionaryValue::Type::Double:
stream << us.m_data.m_double;
break;
case DictionaryValue::Type::NDShape:
{
return m_set->insert(key).second;
}
template <typename KeyType>
bool _SimpleSet<KeyType>::Contains(const KeyType& key) const
{
return (m_set->find(key) != m_set->end());
}
template <typename KeyType>
size_t _SimpleSet<KeyType>::Size() const
{
return m_set->size();
}
template <typename KeyType>
_SimpleSet<KeyType>::operator _SimpleVector<KeyType>() const
{
_SimpleVector<KeyType> retVector;
for (auto iter = m_set->begin(); iter != m_set->end(); ++iter)
retVector.PushBack(*iter);
return retVector;
}
template <typename KeyType>
bool operator==(const _SimpleSet<KeyType>& first, const _SimpleSet<KeyType>& second)
{
return *first.m_set == *second.m_set;
}
// Explicit template instantiations
template class _SimpleSet<FunctionPtr>;
template class _SimpleSet<Variable>;
template class _SimpleSet<Placeholder>;
template class _SimpleSet<const Function*>;
template bool operator==(const _SimpleSet<Variable>& first, const _SimpleSet<Variable>& second);
template bool operator==(const _SimpleSet<Placeholder>& first, const _SimpleSet<Placeholder>& second);
#pragma endregion _SimpleSet
#pragma region _SimpleMap
template <typename KeyType, typename ValueType>
_SimpleMap<KeyType, ValueType>::_SimpleMap()
: m_map(new std::unordered_map<KeyType, ValueType>())
{
}
template <typename KeyType, typename ValueType>
_SimpleMap<KeyType, ValueType>::~_SimpleMap()
{
delete m_map;
}
template <typename KeyType, typename ValueType>
_SimpleMap<KeyType, ValueType>::_SimpleMap(const _SimpleMap& other)
: m_map(nullptr)
{
*this = other;
}
template <typename KeyType, typename ValueType>
_SimpleMap<KeyType, ValueType>& _SimpleMap<KeyType, ValueType>::operator=(const _SimpleMap& other)
{
if (this != &other)
NDShape* shapePtr = reinterpret_cast<NDShape*>(us.m_data.m_ptr);
auto size = shapePtr->NumAxes();
stream << size;
for (auto i = 0; i < size; i++)
{
delete m_map;
m_map = new std::unordered_map<KeyType, ValueType>(*(other.m_map));
stream << shapePtr->operator[](i);
}
return *this;
break;
}
template <typename KeyType, typename ValueType>
_SimpleMap<KeyType, ValueType>::_SimpleMap(_SimpleMap&& other)
: m_map(nullptr)
case DictionaryValue::Type::Vector:
{
*this = std::move(other);
vector<DictionaryValue>* vectorPtr =
reinterpret_cast<vector<DictionaryValue>*>(us.m_data.m_ptr);
auto size = vectorPtr->size();
stream << size;
for (auto i = 0; i < size; i++)
{
stream << vectorPtr->operator[](i);
}
break;
}
template <typename KeyType, typename ValueType>
_SimpleMap<KeyType, ValueType>& _SimpleMap<KeyType, ValueType>::operator=(_SimpleMap&& other)
{
assert(this != &other);
delete m_map;
m_map = other.m_map;
other.m_map = nullptr;
return *this;
default:
NOT_IMPLEMENTED;
}
template <typename KeyType, typename ValueType>
ValueType& _SimpleMap<KeyType, ValueType>::operator[](const KeyType& key)
{
return (*m_map)[key];
}
template <typename KeyType, typename ValueType>
const ValueType& _SimpleMap<KeyType, ValueType>::operator[](const KeyType& key) const
{
return (*m_map)[key];
}
template <typename KeyType, typename ValueType>
bool _SimpleMap<KeyType, ValueType>::Insert(const KeyType& key, const ValueType& value)
{
return m_map->insert({ key, value }).second;
}
template <typename KeyType, typename ValueType>
bool _SimpleMap<KeyType, ValueType>::Contains(const KeyType& key) const
{
return (m_map->find(key) != m_map->end());
}
template <typename KeyType, typename ValueType>
size_t _SimpleMap<KeyType, ValueType>::Size() const
{
return m_map->size();
}
template <typename KeyType, typename ValueType>
_SimpleSet<KeyType> _SimpleMap<KeyType, ValueType>::Keys() const
{
_SimpleSet<KeyType> keys;
for (auto iter = m_map->begin(); iter != m_map->end(); ++iter)
keys.Insert(iter->first);
return keys;
}
// Explicit template instantiations
template class _SimpleMap<Variable, ValuePtr>;
template class _SimpleMap<Variable, const ValuePtr>;
template class _SimpleMap<Placeholder, Variable>;
#pragma endregion _SimpleMap
return stream;
}
Dictionary::Dictionary()
: m_dictionaryData(new std::unordered_map < std::wstring, DictionaryValue>)
: m_dictionaryData(new unordered_map <wstring, DictionaryValue>)
{
}
@ -340,7 +149,7 @@ namespace CNTK
Dictionary::Dictionary(Dictionary&& other)
: m_dictionaryData(nullptr)
{
*this = std::move(other);
*this = move(other);
}
Dictionary& Dictionary::operator=(Dictionary&& other)
@ -369,4 +178,130 @@ namespace CNTK
{
return (m_dictionaryData->find(key) != m_dictionaryData->end());
}
Microsoft::MSR::CNTK::File& operator<<(Microsoft::MSR::CNTK::File& stream, const Dictionary& us)
{
stream << us.version;
stream << us.m_dictionaryData->size();
for (auto it = us.m_dictionaryData->begin(); it != us.m_dictionaryData->end(); ++it)
{
stream << it->first;
stream << it->second;
}
return stream;
}
Microsoft::MSR::CNTK::File& operator>>(Microsoft::MSR::CNTK::File& stream, Dictionary& us)
{
size_t version;
stream >> version;
size_t size;
stream >> size;
us.m_dictionaryData->reserve(size);
for (auto i = 0; i < size; i++)
{
wstring key;
stream >> key;
DictionaryValue value;
stream >> value;
us.m_dictionaryData->insert(make_pair(key, value));
}
return stream;
}
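A sketch of how the Dictionary stream operators defined above would typically be used; the File constructor and FileOptions flags are assumed from the CNTK File/fileutil headers and are not part of this excerpt:
// Illustrative sketch only; not part of this commit.
void SaveAndReloadDictionary(const std::wstring& path)
{
    using namespace CNTK;
    using Microsoft::MSR::CNTK::File;
    using Microsoft::MSR::CNTK::FileOptions;   // assumed flag enum; see File.h/fileutil.h

    Dictionary config;
    config[L"learningRate"] = DictionaryValue(0.005);
    config[L"epochSize"] = DictionaryValue((size_t)1000);

    {
        File out(path, FileOptions::fileOptionsBinary | FileOptions::fileOptionsWrite);
        out << config;                         // writes version, size, then key/value pairs
    }

    Dictionary reloaded;
    File in(path, FileOptions::fileOptionsBinary | FileOptions::fileOptionsRead);
    in >> reloaded;                            // restores them via the operator>> above
}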
template <typename T>
vector<DictionaryValue> SerializeToVector(const NDArrayViewPtr& viewPtr)
{
if (viewPtr->IsSparse())
{
LogicError("Sparse NDArrayView cannot be serialized into a vector.");
}
auto numElements = viewPtr->Shape().TotalSize();
vector<DictionaryValue> values(numElements);
NDArrayViewPtr cpuDataViewPtr = viewPtr;
if ((viewPtr->Device().Type() != DeviceKind::CPU))
{
cpuDataViewPtr = MakeSharedObject<NDArrayView>(viewPtr->GetDataType(), viewPtr->Shape(), DeviceDescriptor::CPUDevice());
cpuDataViewPtr->CopyFrom(*viewPtr);
}
const T* buffer = cpuDataViewPtr->DataBuffer<T>();
for (auto i = 0; i < numElements; ++i)
{
T v = buffer[i];
values[i] = DictionaryValue(v);
}
return values;
}
template <typename T>
void DeserializeFromVector(const NDArrayViewPtr& viewPtr, const vector<DictionaryValue>& values)
{
if (viewPtr->IsSparse())
{
LogicError("Sparse NDArrayView cannot be deserialized from a vector.");
}
auto numElements = viewPtr->Shape().TotalSize();
if (values.size() != numElements)
{
LogicError("Number of elements (%lu) in the deserialized representation does not match the expected value (%lu)",
values.size(), numElements);
}
NDArrayViewPtr cpuDataViewPtr = viewPtr;
if ((viewPtr->Device().Type() != DeviceKind::CPU))
{
cpuDataViewPtr = MakeSharedObject<NDArrayView>(viewPtr->GetDataType(), viewPtr->Shape(), DeviceDescriptor::CPUDevice());
}
T* buffer = cpuDataViewPtr->WritableDataBuffer<T>();
for (auto i = 0; i < numElements; ++i)
{
buffer[i] = values[i].GetValue<T>();
}
if ((viewPtr->Device().Type() != DeviceKind::CPU))
{
viewPtr->CopyFrom(*cpuDataViewPtr);
}
}
// TODO: we store the type info for every element in the vector, which is extremely redundant.
// Instead, it'd be nice to introduce some sort of DictionaryValueVector.
vector<DictionaryValue> SerializeToVector(const NDArrayViewPtr& viewPtr)
{
switch (viewPtr->GetDataType())
{
case DataType::Float:
return SerializeToVector<float>(viewPtr);
case DataType::Double:
return SerializeToVector<double>(viewPtr);
default:
LogicError("Unsupported DataType %s", DataTypeName(viewPtr->GetDataType()));
}
}
void DeserializeFromVector(const NDArrayViewPtr& viewPtr, const vector<DictionaryValue>& values)
{
switch (viewPtr->GetDataType())
{
case DataType::Float:
DeserializeFromVector<float>(viewPtr, values);
break;
case DataType::Double:
DeserializeFromVector<double>(viewPtr, values);
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(viewPtr->GetDataType()));
}
}
template void DictionaryValue::AllocateDataPtr<NDShape>(const NDShape& value);
template void DictionaryValue::AllocateDataPtr<vector<DictionaryValue>>(const vector<DictionaryValue>& value);
}
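The helpers above are the basis for checkpointing NDArrayView contents as plain DictionaryValues. A sketch of the intended round trip, reusing only names visible in this commit:
// Illustrative sketch only; not part of this commit.
void RoundTripParameter(const CNTK::NDArrayViewPtr& parameterValue)
{
    using namespace CNTK;
    // Flatten the (dense) view into one DictionaryValue per element.
    std::vector<DictionaryValue> serialized = SerializeToVector(parameterValue);

    // Later: restore into a freshly allocated view of the same shape and element type.
    NDArrayViewPtr restored = MakeSharedObject<NDArrayView>(parameterValue->GetDataType(),
                                                            parameterValue->Shape(),
                                                            DeviceDescriptor::CPUDevice());
    DeserializeFromVector(restored, serialized);
}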

Просмотреть файл

@ -15,244 +15,6 @@ namespace CNTK
// Forward declarations
class Dictionary;
class DictionaryValue
{
public:
enum class Type : unsigned int
{
None,
Bool,
SizeT,
Double,
NDShape,
Vector
};
static const char* TypeName(Type type)
{
if (type == Type::None)
return "None";
else if (type == Type::Bool)
return "Bool";
else if (type == Type::SizeT)
return "SizeT";
else if (type == Type::Double)
return "Double";
else if (type == Type::NDShape)
return "NDShape";
else if (type == Type::Vector)
return "Vector";
else
LogicError("Unknown DictionaryValue::Type");
}
public:
DictionaryValue()
: m_valueType(Type::None)
{
}
DictionaryValue(bool value)
: m_valueType(GetValueType<bool>())
{
m_data.m_boolean = value;
}
DictionaryValue(size_t value)
: m_valueType(GetValueType<size_t>())
{
m_data.m_sizeT = value;
}
DictionaryValue(double value)
: m_valueType(GetValueType<double>())
{
m_data.m_double = value;
}
template <typename T>
DictionaryValue(const T& value)
: m_valueType(GetValueType<T>())
{
static_assert(std::is_same<T, NDShape>::value ||
std::is_same<T, _Internal::_SimpleVector<DictionaryValue>>::value,
"Unsupported ValueType");
AllocateDataPtr(value);
}
DictionaryValue(const DictionaryValue& other)
: m_valueType(Type::Bool)
{
*this = other;
}
DictionaryValue& operator=(const DictionaryValue& other)
{
if (this != &other)
{
FreeDataPtr();
m_valueType = other.m_valueType;
m_data = other.m_data;
if (other.m_valueType == Type::NDShape)
AllocateDataPtr(other.GetValue<NDShape>());
else if (other.m_valueType == Type::Vector)
AllocateDataPtr(other.GetValue<_Internal::_SimpleVector<DictionaryValue>>());
}
return *this;
}
~DictionaryValue()
{
FreeDataPtr();
}
template <typename T, typename std::enable_if<std::is_same<T, bool>::value>::type* = nullptr>
const T& GetValue() const
{
VerifyType<T>();
return m_data.m_boolean;
}
template <typename T, typename std::enable_if<std::is_same<T, size_t>::value>::type* = nullptr>
const T& GetValue() const
{
VerifyType<T>();
return m_data.m_sizeT;
}
template <typename T, typename std::enable_if<std::is_same<T, double>::value>::type* = nullptr>
const T& GetValue() const
{
VerifyType<T>();
return m_data.m_double;
}
template <typename T, typename std::enable_if<std::is_same<T, NDShape>::value || std::is_same<T, _Internal::_SimpleVector<DictionaryValue>>::value>::type* = nullptr>
const T& GetValue() const
{
VerifyType<T>();
return *(reinterpret_cast<T*>(m_data.m_ptr));
}
bool HasValue() const
{
return m_valueType != Type::None;
}
Type ValueType() const
{
return m_valueType;
}
private:
template <typename T>
static Type GetValueType()
{
static_assert(std::is_same<T, bool>::value ||
std::is_same<T, size_t>::value ||
std::is_same<T, double>::value ||
std::is_same<T, NDShape>::value ||
std::is_same<T, _Internal::_SimpleVector<DictionaryValue>>::value ||
std::is_same<T, CNTK::Dictionary>::value,
"Unsupported ValueType");
if (std::is_same<T, bool>::value)
return Type::Bool;
else if (std::is_same<T, size_t>::value)
return Type::SizeT;
else if (std::is_same<T, double>::value)
return Type::Double;
else if (std::is_same<T, NDShape>::value)
return Type::NDShape;
else if (std::is_same<T, _Internal::_SimpleVector<DictionaryValue>>::value)
return Type::Vector;
}
template <typename T>
void VerifyType() const
{
if (GetValueType<T>() != m_valueType)
RuntimeError("Reading a DictionaryValue as the wrong type; Reading as type %s when actual type is %s", typeid(T).name(), DictionaryValue::TypeName(m_valueType));
}
template <typename T>
void AllocateDataPtr(const T& value)
{
static_assert(std::is_same<T, NDShape>::value || std::is_same<T, _Internal::_SimpleVector<DictionaryValue>>::value, "AllocateDataPtr called with invalid type");
m_data.m_ptr = new T(value);
}
template <typename T>
void FreePtrAsType()
{
T* typedPtr = reinterpret_cast<T*>(m_data.m_ptr);
delete typedPtr;
m_data.m_ptr = nullptr;
}
void FreeDataPtr()
{
if (m_valueType == Type::NDShape)
FreePtrAsType<NDShape>();
else if (m_valueType == Type::Vector)
FreePtrAsType<_Internal::_SimpleVector<DictionaryValue>>();
}
private:
Type m_valueType;
union ValueData
{
bool m_boolean;
size_t m_sizeT;
double m_double;
void* m_ptr;
} m_data;
};
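The class above stores small values directly in a union and heap-allocates only the non-trivial payloads (NDShape, the value vector) through AllocateDataPtr/FreePtrAsType. A stripped-down, standalone illustration of that pattern (std::string stands in for the heap-allocated payload types; this is not CNTK code):
#include <string>

class TaggedValue
{
public:
    enum class Type { None, Double, String };

    TaggedValue() : m_type(Type::None) {}
    TaggedValue(double d) : m_type(Type::Double) { m_data.m_double = d; }
    TaggedValue(const std::string& s) : m_type(Type::String) { m_data.m_ptr = new std::string(s); }
    ~TaggedValue() { Free(); }

    // Copying is omitted for brevity; DictionaryValue implements it by re-running AllocateDataPtr.
    TaggedValue(const TaggedValue&) = delete;
    TaggedValue& operator=(const TaggedValue&) = delete;

    double AsDouble() const { return m_data.m_double; }
    const std::string& AsString() const { return *static_cast<std::string*>(m_data.m_ptr); }

private:
    void Free()                                   // mirrors FreeDataPtr/FreePtrAsType<T>
    {
        if (m_type == Type::String)
            delete static_cast<std::string*>(m_data.m_ptr);
        m_data.m_ptr = nullptr;
        m_type = Type::None;
    }

    Type m_type;
    union { double m_double; void* m_ptr; } m_data;
};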
class Dictionary
{
public:
Dictionary();
~Dictionary();
// Disallow copy contruction and assignment
Dictionary(const Dictionary&) = delete;
Dictionary& operator=(const Dictionary&) = delete;
Dictionary(Dictionary&& other);
Dictionary& operator=(Dictionary&& other);
DictionaryValue& operator[](const std::wstring& key)
{
return operator[](key.c_str());
}
DictionaryValue& operator[](const wchar_t* key);
DictionaryValue operator[](const std::wstring& key) const
{
return operator[](key.c_str());
}
DictionaryValue operator[](const wchar_t* key) const;
bool Contains(const std::wstring& key) const
{
return Contains(key.c_str());
}
bool Contains(const wchar_t* key) const;
private:
std::unordered_map<std::wstring, DictionaryValue>* m_dictionaryData;
};
// Helper to get the size of an element of the specified DataType
inline size_t ElementSize(DataType dataType)
{
@ -266,15 +28,15 @@ namespace CNTK
inline DEVICEID_TYPE AsCNTKImplDeviceId(const DeviceDescriptor& device)
{
if (device.Type() == DeviceType::CPU)
if (device.Type() == DeviceKind::CPU)
return -1;
else if (device.Type() == DeviceType::GPU)
else if (device.Type() == DeviceKind::GPU)
return device.Id();
else
NOT_IMPLEMENTED;
}
inline Microsoft::MSR::CNTK::MatrixFormat AsCNTKMatrixFormat(StorageFormat storageFormat)
inline Microsoft::MSR::CNTK::MatrixFormat AsCNTKImplMatrixFormat(StorageFormat storageFormat)
{
if (storageFormat == StorageFormat::Dense)
return Microsoft::MSR::CNTK::MatrixFormat::matrixFormatDense;
@ -357,4 +119,13 @@ namespace CNTK
return{ matrixRowSize, matrixColSize };
}
inline bool IsSparseInput(const Variable& var)
{
return var.IsInput() && var.IsSparse();
}
std::vector<DictionaryValue> SerializeToVector(const NDArrayViewPtr& viewPtr);
void DeserializeFromVector(const NDArrayViewPtr& viewPtr, const std::vector<DictionaryValue>& values);
}

Просмотреть файл

@ -15,20 +15,21 @@ namespace CNTK
Value::Value(const NDArrayViewPtr& data, const NDMaskPtr& mask)
: m_data(data), m_mask(mask)
{
if ((mask != nullptr) && (mask->Shape().NumAxes() > data->Shape().NumAxes()))
InvalidArgument("The number of axes of the mask of a Value object cannot exceed the number of axes of the data NDArrayView object");
if (mask != nullptr)
{
auto dataShape = data->Shape();
auto maskShape = mask->Shape();
if (maskShape.NumAxes() > dataShape.NumAxes())
InvalidArgument("The number of axes (%d) of the mask of a Value object cannot exceed the number of axes (%d) of the data NDArrayView object", (int)maskShape.NumAxes(), (int)dataShape.NumAxes());
if (dataShape.SubShape(dataShape.NumAxes() - maskShape.NumAxes()) != maskShape)
InvalidArgument("Invalid Value object; the data and mask are incompatible. The trailing dimensions of the data do not match the dimensions of the mask");
InvalidArgument("Invalid Value object; the data and mask are incompatible. The trailing dimensions of the data (%S) do not match the dimensions of the mask (%S)", dataShape.AsString().c_str(), maskShape.AsString().c_str());
}
}
template <typename T>
static NDMaskPtr CreateMask(size_t sampleSize, const std::vector<std::vector<T>>& sequences, const DeviceDescriptor& device)
static NDMaskPtr CreateMask(size_t numElementsPerSample, const std::vector<std::vector<T>>& sequences, const DeviceDescriptor& device)
{
size_t numSequences = sequences.size();
std::vector<size_t> sequenceLengths(numSequences);
@ -36,7 +37,7 @@ namespace CNTK
bool needsMask = false;
for (size_t i = 0; i < numSequences; ++i)
{
sequenceLengths[i] = sequences[i].size() / sampleSize;
sequenceLengths[i] = sequences[i].size() / numElementsPerSample;
if (maxSequenceLength < sequenceLengths[i])
maxSequenceLength = sequenceLengths[i];
@ -45,11 +46,12 @@ namespace CNTK
needsMask = true;
}
// If needed, create a mask to account for variability in lengths of specified sequences
NDMaskPtr deviceValueMask;
if (needsMask)
{
NDShape valueMaskShape = { maxSequenceLength, numSequences };
deviceValueMask = NDMaskPtr(new NDMask(valueMaskShape, device), [](_Internal::_ReferenceCounter* ptr) {delete ptr; });
deviceValueMask = MakeSharedObject<NDMask>(valueMaskShape, device);
for (size_t i = 0; i < numSequences; ++i)
deviceValueMask->MaskSection({ sequenceLengths[i], i }, { NDShape::InferredDimension, 1 });
}
@ -86,23 +88,23 @@ namespace CNTK
}
colStarts[numSequences * maxSequenceLength] = (SparseIndexType)(nonZeroValues.size());
NDArrayViewPtr deviceValueData(new NDArrayView(valueDataShape, colStarts.data(), rowIndices.data(), nonZeroValues.data(), nonZeroValues.size(), device, readOnly), [](_ReferenceCounter* ptr) { delete ptr; });
return ValuePtr(new Value(deviceValueData, deviceValueMask), [](_ReferenceCounter* ptr) { delete ptr; });
NDArrayViewPtr deviceValueData = MakeSharedObject<NDArrayView>(valueDataShape, colStarts.data(), rowIndices.data(), nonZeroValues.data(), nonZeroValues.size(), device, readOnly);
return MakeSharedObject<Value>(deviceValueData, deviceValueMask);
}
template <typename ElementType>
/*static*/ ValuePtr Value::Create(const NDShape& sampleShape, const std::vector<std::vector<ElementType>>& sequences, const DeviceDescriptor& device, bool readOnly/* = false*/)
{
size_t sampleSize = sampleShape.TotalSize();
NDMaskPtr deviceValueMask = CreateMask(sampleSize, sequences, device);
size_t numElementsPerSample = sampleShape.TotalSize();
NDMaskPtr deviceValueMask = CreateMask(numElementsPerSample, sequences, device);
size_t maxSequenceLength = (deviceValueMask == nullptr) ? sequences[0].size() : deviceValueMask->Shape()[0];
size_t numSequences = sequences.size();
NDShape valueDataShape = sampleShape.AppendShape({ maxSequenceLength, numSequences });
NDArrayViewPtr valueData(new NDArrayView(AsDataType<ElementType>(), valueDataShape, DeviceDescriptor::CPUDevice()), [](_ReferenceCounter* ptr) { delete ptr; });
NDArrayViewPtr valueData = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), valueDataShape, DeviceDescriptor::CPUDevice());
ElementType* dataBuffer = valueData->WritableDataBuffer<ElementType>();
for (size_t i = 0; i < numSequences; ++i)
std::copy(sequences[i].data(), sequences[i].data() + sequences[i].size(), dataBuffer + (maxSequenceLength * i * sampleSize));
std::copy(sequences[i].data(), sequences[i].data() + sequences[i].size(), dataBuffer + (maxSequenceLength * i * numElementsPerSample));
NDArrayViewPtr deviceValueData;
if (device == DeviceDescriptor::CPUDevice())
@ -114,13 +116,13 @@ namespace CNTK
}
else
{
deviceValueData = NDArrayViewPtr(new NDArrayView(AsDataType<ElementType>(), valueDataShape, device), [](_ReferenceCounter* ptr) { delete ptr; });
deviceValueData = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), valueDataShape, device);
deviceValueData->CopyFrom(*valueData);
if (readOnly)
deviceValueData = deviceValueData->Alias(true);
}
return ValuePtr(new Value(deviceValueData, deviceValueMask), [](_ReferenceCounter* ptr) { delete ptr; });
return MakeSharedObject<Value>(deviceValueData, deviceValueMask);
}
/*virtual*/ Value::~Value()
@ -142,13 +144,13 @@ namespace CNTK
/*virtual*/ ValuePtr Value::DeepClone(bool readOnly/* = false*/) const
{
// TODO: Check if this is a derived type and throw an exception in that case
return ValuePtr(new Value(Data()->DeepClone(readOnly), (Mask() != nullptr) ? Mask()->DeepClone() : nullptr), [](_ReferenceCounter* ptr) { delete ptr; });
return MakeSharedObject<Value>(Data()->DeepClone(readOnly), (Mask() != nullptr) ? Mask()->DeepClone() : nullptr);
}
/*virtual*/ ValuePtr Value::Alias(bool readOnly/* = false*/) const
{
// TODO: Check if this is a derived type and throw an exception in that case
return ValuePtr(new Value(Data()->Alias(readOnly), (Mask() != nullptr) ? Mask()->Alias() : nullptr), [](_ReferenceCounter* ptr) { delete ptr; });
return MakeSharedObject<Value>(Data()->Alias(readOnly), (Mask() != nullptr) ? Mask()->Alias() : nullptr);
}
/*virtual*/ void Value::CopyFrom(const Value& source)
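Taken together, the Value.cpp changes above cover the dense Create path: each sequence is copied into a [sampleShape x maxSequenceLength x numSequences] NDArrayView, and shorter sequences are padded and masked. A sketch of a call site, using only the overload shown in this diff:
// Illustrative sketch only; not part of this commit.
CNTK::ValuePtr BuildBatchedValue(const CNTK::DeviceDescriptor& device)
{
    using namespace CNTK;
    NDShape sampleShape = { 2 };                   // two elements per sample
    std::vector<std::vector<float>> sequences = {
        { 1, 2, 3, 4, 5, 6 },                      // 3 samples
        { 7, 8 }                                   // 1 sample; the 2 trailing steps get masked
    };
    // Resulting data shape: [2 x 3 x 2]; an NDMask of shape [3 x 2] marks the padding as invalid.
    return Value::Create(sampleShape, sequences, device, /*readOnly =*/ true);
}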

Просмотреть файл

@ -11,4 +11,9 @@ namespace CNTK
: Variable(function->Output())
{
}
FunctionPtr Variable::Owner() const
{
return m_dataFields->m_ownerFunction->shared_from_this();
}
}

Просмотреть файл

@ -106,22 +106,18 @@ public:
~BestGpu();
void Init();
void SetAllowedDevices(const std::vector<int>& devices); // only allow certain GPUs
bool DeviceAllowed(int device);
void DisallowDevice(int device)
{
assert((device >= -1) && (device <= 31));
if (device < 0)
m_disallowCPUDevice = true;
else
m_allowedDevices &= ~(1 << device);
}
bool DeviceAllowed(int deviceId);
void DisallowUnsupportedDevices();
void DisallowDevice(int deviceId);
void AllowAll(); // reset to allow all GPUs (no allowed list)
bool UseMultiple(); // using multiple GPUs?
int GetDevice(BestGpuFlags flags = bestGpuNormal); // get a single device
static const int AllDevices = -1; // can be used to specify all GPUs in GetDevices() call
static const int RequeryDevices = -2; // Requery refreshing statistics and picking the same number as last query
static const int MininumCCMajorForGpu = 3; // CNTK supports GPUs with Compute Capability >= 3.0
std::vector<int> GetDevices(int number = AllDevices, BestGpuFlags flags = bestGpuNormal); // get multiple devices
std::vector<ProcessorData *> GetProcessorData();
private:
bool LockDevice(int deviceId, bool trial = true);
};
@ -156,6 +152,8 @@ static DEVICEID_TYPE SelectDevice(DEVICEID_TYPE deviceId, bool bLockGPU, const i
{
g_bestGpu->DisallowDevice(excludedDevices[i]);
}
g_bestGpu->DisallowUnsupportedDevices();
}
bestDeviceId = (DEVICEID_TYPE)g_bestGpu->GetDevice(BestGpuFlags(bLockGPU ? (bestGpuAvoidSharing | bestGpuExclusiveLock) : bestGpuAvoidSharing));
@ -345,22 +343,32 @@ int BestGpu::GetDevice(BestGpuFlags bestFlags)
void BestGpu::SetAllowedDevices(const std::vector<int>& devices)
{
m_allowedDevices = 0;
for (int device : devices)
for (int deviceId : devices)
{
m_allowedDevices |= (1 << device);
m_allowedDevices |= (1 << deviceId);
}
}
// DeviceAllowed - is a particular device allowed?
// returns: true if the device is allowed, otherwise false
bool BestGpu::DeviceAllowed(int device)
bool BestGpu::DeviceAllowed(int deviceId)
{
assert((device >= -1) && (device <= 31));
assert((deviceId >= -1) && (deviceId <= 31));
if (device < 0)
if (deviceId < 0)
return !m_disallowCPUDevice;
else
return !!(m_allowedDevices & (1 << device));
return !!(m_allowedDevices & (1 << deviceId));
}
void BestGpu::DisallowDevice(int deviceId)
{
assert((deviceId >= -1) && (deviceId <= 31));
if (deviceId < 0)
m_disallowCPUDevice = true;
else
m_allowedDevices &= ~(1 << deviceId);
}
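DeviceAllowed/DisallowDevice above keep the per-GPU allow list as one bit per device id (0..31) plus a separate flag for the CPU (-1). A standalone illustration of that bookkeeping (not CNTK code):
#include <cassert>
#include <cstdint>

struct DeviceFilter
{
    uint32_t m_allowed = ~0u;      // bit i set => GPU id i allowed; all allowed initially
    bool m_disallowCPU = false;    // the CPU is tracked separately under id -1

    void Disallow(int deviceId)
    {
        assert((deviceId >= -1) && (deviceId <= 31));
        if (deviceId < 0)
            m_disallowCPU = true;
        else
            m_allowed &= ~(1u << deviceId);            // clear that device's bit
    }

    bool Allowed(int deviceId) const
    {
        assert((deviceId >= -1) && (deviceId <= 31));
        return (deviceId < 0) ? !m_disallowCPU
                              : ((m_allowed & (1u << deviceId)) != 0);
    }
};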
// AllowAll - Reset the allowed filter to allow all GPUs
@ -527,6 +535,68 @@ std::vector<int> BestGpu::GetDevices(int number, BestGpuFlags p_bestFlags)
return best; // return the array of the best GPUs
}
// Disallow devices which don't comply with the compute capability restriction when CNTK runs with deviceId = 'auto'
void BestGpu::DisallowUnsupportedDevices()
{
for (auto pd : m_procData)
{
if (pd->deviceProp.major < BestGpu::MininumCCMajorForGpu)
{
DisallowDevice(pd->deviceId);
}
}
}
GpuData GetGpuData(DEVICEID_TYPE deviceId)
{
std::vector<GpuData> gpusData = GetAllGpusData();
auto it = std::find_if(gpusData.begin(), gpusData.end(), [&deviceId](const GpuData& gpu){return gpu.deviceId == deviceId;});
if (it != gpusData.end())
{
return *it;
}
return GpuData(0, 0, deviceId, 0, GpuValidity::UnknownDevice, "", 0);
}
// populate a vector with data (id, major/minor version, cuda cores, name and memory) for each gpu device in the machine
std::vector<GpuData> GetAllGpusData()
{
std::vector<GpuData> data;
auto bestGpu = make_unique<BestGpu>();
std::vector<ProcessorData*> processorData = bestGpu->GetProcessorData();
for (ProcessorData* pd : processorData)
{
GpuValidity validity = GpuValidity::UnknownDevice;
if (pd->deviceProp.major < BestGpu::MininumCCMajorForGpu)
{
validity = GpuValidity::ComputeCapabilityNotSupported;
}
else
{
validity = GpuValidity::Valid;
}
size_t totalMemory = pd->deviceProp.totalGlobalMem/(1024*1024); //From bytes to MBytes
GpuData gpuData = GpuData(pd->deviceProp.major, pd->deviceProp.minor, pd->deviceId, pd->cores, validity, string(pd->deviceProp.name), totalMemory);
data.push_back(gpuData);
}
return data;
}
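A sketch of how the new query functions might be consumed; the GpuData fields come from the header change later in this commit, and <cstdio> is assumed to be available:
// Illustrative sketch only; not part of this commit.
void PrintGpuSummary(DEVICEID_TYPE deviceId)
{
    GpuData gpu = GetGpuData(deviceId);
    if (gpu.validity == GpuValidity::Valid)
        fprintf(stderr, "GPU[%d] %s: compute capability %d.%d, %d CUDA cores, %lu MB\n",
                gpu.deviceId, gpu.name.c_str(), gpu.versionMajor, gpu.versionMinor,
                gpu.cudaCores, (unsigned long)gpu.totalMemory);
    else if (gpu.validity == GpuValidity::ComputeCapabilityNotSupported)
        fprintf(stderr, "GPU[%d] has a compute capability below the supported minimum\n", gpu.deviceId);
    else
        fprintf(stderr, "Device id %d is not a known GPU\n", deviceId);
}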
std::vector<ProcessorData*> BestGpu::GetProcessorData()
{
return m_procData;
}
// QueryNvmlData - Query data from the NVIDIA Management Library and accumulate counters.
// In case of failure, this function simply backs out without filling in the data structure and without setting m_nvmlData.
void BestGpu::QueryNvmlData()

Просмотреть файл

@ -70,14 +70,14 @@ void Eval<ElemType>::GetEvalClass(const std::string& config)
}
// create a variable of each type just to call the proper templated version
ElemType elemType = ElemType();
GetEvalProc getEvalProc = (GetEvalProc) Plugin::Load(module, GetEvalName(elemType));
GetEvalProc getEvalProc = (GetEvalProc) m_plugin->Load(module, GetEvalName(elemType));
getEvalProc(&m_eval);
}
// Eval Constructor
// options - [in] string of options (i.e. "-windowsize:11 -addenergy") data reader specific
template <class ElemType>
Eval<ElemType>::Eval(const std::string& config)
Eval<ElemType>::Eval(const std::string& config) : m_plugin(make_shared<Plugin>())
{
GetEvalClass(config);
m_eval->Init(config);

Просмотреть файл

@ -8,15 +8,46 @@
// #define CPUONLY // #define this to build without GPU support nor needing the SDK installed
#include "CommonMatrix.h"
#include <vector>
// define IConfigRecord and ConfigParameters as incomplete types, in order to avoid having to include "ScriptableObjects.h" and "Config.h", as that confuses some .CU code
namespace Microsoft { namespace MSR { namespace ScriptableObjects { struct IConfigRecord; }}}
namespace Microsoft { namespace MSR { namespace CNTK {
using namespace std;
#ifndef CPUONLY
enum class GpuValidity
{
Valid,
UnknownDevice,
ComputeCapabilityNotSupported
};
struct GpuData
{
int versionMajor;
int versionMinor;
int deviceId;
int cudaCores;
GpuValidity validity;
string name;
size_t totalMemory;
GpuData(int versionMajor, int versionMinor, int deviceId, int cudaCores, GpuValidity validity, const string& name, size_t totalMemory)
:versionMajor(versionMajor), versionMinor(versionMinor), deviceId(deviceId), cudaCores(cudaCores), validity(validity), name(name), totalMemory(totalMemory)
{
}
};
std::vector<GpuData> GetAllGpusData();
GpuData GetGpuData(DEVICEID_TYPE deviceId);
class ConfigParameters;
DEVICEID_TYPE DeviceFromConfig(const ConfigParameters& config);
DEVICEID_TYPE DeviceFromConfig(const ScriptableObjects::IConfigRecord& config);
#else
template <class ConfigRecordType>
static inline DEVICEID_TYPE DeviceFromConfig(const ConfigRecordType& /*config*/)

Просмотреть файл

@ -25,8 +25,7 @@
#include <map>
#include <vector>
#include <string>
#include "Basics.h"
#include <memory>
namespace Microsoft { namespace MSR { namespace CNTK {
@ -110,12 +109,14 @@ void EVAL_API GetEval(IEvaluateModel<ElemType>** peval);
extern "C" EVAL_API void GetEvalF(IEvaluateModel<float>** peval);
extern "C" EVAL_API void GetEvalD(IEvaluateModel<double>** peval);
class Plugin;
template <typename ElemType>
class Eval : public IEvaluateModel<ElemType>, protected Plugin
class Eval : public IEvaluateModel<ElemType>
{
private:
IEvaluateModel<ElemType>* m_eval; // evaluation class pointer
std::shared_ptr<Plugin> m_plugin;
void GetEvalClass(const std::string& config);
@ -225,7 +226,8 @@ struct VectorRef
size_t m_size; // ElemTypes used.
VectorRef() : m_vector(nullptr), m_capacity(0), m_size(0) {}
void InitFrom(std::vector<ElemType>& src) { m_vector = src.data(); m_capacity = src.capacity(); m_size = src.size(); }
void InitFrom(std::vector<ElemType>& src) { InitFrom(src.data(), src.capacity(), src.size()); }
void InitFrom(ElemType* data, size_t capacity, size_t size) { m_vector = data; m_capacity = capacity; m_size = size; }
size_t size() const { return m_size; }
size_t capacity() const { return m_capacity; }
ElemType* data() { return m_vector; }
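The second InitFrom overload added above lets callers wrap an externally owned buffer (for example memory pinned by the managed wrapper) without going through a std::vector. A small usage sketch; the buffer arguments are hypothetical and VectorRef's ElemType template parameter is assumed from the surrounding declarations:
// Illustrative sketch only; not part of this commit.
void WrapBuffers(std::vector<float>& owned, float* external, size_t externalSize)
{
    VectorRef<float> fromVector;
    fromVector.InitFrom(owned);                                     // borrows data/capacity/size from the vector

    VectorRef<float> fromRawBuffer;
    fromRawBuffer.InitFrom(external, externalSize, externalSize);   // new overload: wrap a raw buffer directly
}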
@ -280,7 +282,7 @@ class VariableSchema : public std::vector<VariableLayout>
Values<ElemType> CreateBuffers(const std::vector<size_t>& maxLengths)
{
if (maxLengths.size() != size())
throw std::exception("Expected max lengths for all variables.");
throw std::runtime_error("Expected max lengths for all variables.");
Values<ElemType> buffers(size());
for (size_t i = 0; i < size(); ++i)

Просмотреть файл

@ -128,5 +128,11 @@ public:
{
return currentseed;
}
bool IsRandomizationDisabled() const
{
return randomizationrange == randomizeDisable;
}
};
} } }
}}}

Просмотреть файл

@ -29,7 +29,8 @@ public:
runtime_error(msg)
{
}
virtual void PrintError(const std::wstring& linePrefix) const = 0;
virtual std::wstring GetError(const std::wstring& /*linePrefix*/) const = 0;
virtual void PrintError(const std::wstring& /*linePrefix*/) const = 0;
};
// -----------------------------------------------------------------------

Просмотреть файл

@ -17,6 +17,11 @@ inline bool AreEqualIgnoreCase(
const std::basic_string<TElement, char_traits<TElement>, allocator<TElement>>& s1,
const std::basic_string<TElement, char_traits<TElement>, allocator<TElement> >& s2)
{
if (s1.size() != s2.size())
{
return false;
}
return std::equal(s1.begin(), s1.end(), s2.begin(), [](const TElement& a, const TElement& b)
{
return std::tolower(a) == std::tolower(b);

Просмотреть файл

@ -665,7 +665,8 @@ public:
std::swap(m_strides[i], m_strides[j]);
}
// Flatten the shape in place to a 2D tensor.
// Flatten a tensor shape into a 2D tensor, where splitPoint is the first index to go into the second dimension
// The tensor shape must be flattenable this way, i.e. each of the two index ranges must be dense.
void FlattenTo2DInPlace(size_t splitPoint, const char* errorPrefix/* = nullptr*/)
{
// check & print meaningful error message

Просмотреть файл

@ -411,7 +411,7 @@ static inline void byteswap(V &v) throw()
// execute a block with retry
// Block must be restartable.
// Use this when writing small files to those unreliable Windows servers.
// Use this when writing/reading small files to those unreliable Windows servers.
// TODO: This will fail to compile under VS 2008--we need an #ifdef around this
template <typename FUNCTION>
static void attempt(int retries, const FUNCTION &body)

Просмотреть файл

@ -30,6 +30,7 @@
#include <assert.h>
#include <string.h> // for strerror()
#include <stdexcept> // for exception
#include <fcntl.h>
// ----------------------------------------------------------------------------
// fopenOrDie(): like fopen() but terminate with err msg in case of error.
@ -591,7 +592,8 @@ void fgetfile(const std::wstring& pathname, std::vector<char>& buffer);
void fgetfile(FILE* f, std::vector<char>& buffer);
namespace msra { namespace files {
void fgetfilelines(const std::wstring& pathname, std::vector<char>& readbuffer, std::vector<std::string>& lines);
void fgetfilelines(const std::wstring& pathname, std::vector<char>& readbuffer, std::vector<std::string>& lines, int numberOfTries = 1);
static inline std::vector<std::string> fgetfilelines(const std::wstring& pathname)
{
std::vector<char> buffer;
@ -599,7 +601,7 @@ static inline std::vector<std::string> fgetfilelines(const std::wstring& pathnam
fgetfilelines(pathname, buffer, lines);
return lines;
}
std::vector<char*> fgetfilelines(const std::wstring& pathname, std::vector<char>& readbuffer);
std::vector<char*> fgetfilelines(const std::wstring& pathname, std::vector<char>& readbuffer, int numberOfTries = 1);
}}
@ -698,8 +700,18 @@ class auto_file_ptr
{
if (f && f != stdin && f != stdout && f != stderr)
{
bool readMode = false;
#ifdef _WIN32
if ((f->_flag&_IOREAD) == _IOREAD)
readMode = true;
#else
int mode = fcntl(fileno(f), F_GETFL);
if ((mode & O_ACCMODE) == O_RDONLY)
readMode = true;
#endif
int rc = ::fclose(f);
if ((rc != 0) && !std::uncaught_exception())
if (!readMode && (rc != 0) && !std::uncaught_exception())
RuntimeError("auto_file_ptr: failed to close file: %s", strerror(errno));
f = NULL;
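The motivation for the check above: a failed fclose() on a stream that was only read loses nothing, while on a written stream it may mean buffered data never reached disk. A standalone version of the read-only test, mirroring the Windows CRT flag and POSIX fcntl calls used in the diff:
#include <cstdio>
#ifndef _WIN32
#include <fcntl.h>     // fcntl, F_GETFL, O_ACCMODE
#endif

static bool IsReadOnlyStream(FILE* f)
{
#ifdef _WIN32
    return (f->_flag & _IOREAD) == _IOREAD;        // MSVC CRT flag, as in the diff above
#else
    int mode = fcntl(fileno(f), F_GETFL);
    return (mode & O_ACCMODE) == O_RDONLY;
#endif
}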

Просмотреть файл

@ -1251,7 +1251,7 @@ public:
// BUGBUG: we only really support one archive file at this point
// read the TOC in one swoop
std::vector<char> textbuffer;
auto toclines = msra::files::fgetfilelines(tocpath, textbuffer);
auto toclines = msra::files::fgetfilelines(tocpath, textbuffer, 3);
// parse it one by one
size_t archiveindex = SIZE_MAX; // its index

Просмотреть файл

@ -16,6 +16,7 @@
#endif
#define _CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES 1
#include "Basics.h"
#include "basetypes.h" //for attemp()
#include "fileutil.h"
#include "ProgressTracing.h"
@ -1632,6 +1633,11 @@ static size_t fgetfilechars(const std::wstring& path, vector<char>& buffer)
return len;
}
static void fgetfilechars(const std::wstring& path, vector<char>& buffer, size_t& len)
{
len = fgetfilechars(path, buffer);
}
template <class LINES>
static void strtoklines(char* s, LINES& lines)
{
@ -1639,10 +1645,14 @@ static void strtoklines(char* s, LINES& lines)
lines.push_back(p);
}
void msra::files::fgetfilelines(const std::wstring& path, vector<char>& buffer, std::vector<std::string>& lines)
void msra::files::fgetfilelines(const std::wstring& path, vector<char>& buffer, std::vector<std::string>& lines, int numberOfTries)
{
// load it into RAM in one huge chunk
const size_t len = fgetfilechars(path, buffer);
size_t len = 0;
msra::util::attempt(numberOfTries, [&]() // (can be reading from network)
{
// load it into RAM in one huge chunk
fgetfilechars(path, buffer, len);
});
// parse into lines
lines.resize(0);
@ -1651,11 +1661,15 @@ void msra::files::fgetfilelines(const std::wstring& path, vector<char>& buffer,
}
// same as above but returning const char* (avoiding the memory allocation)
vector<char*> msra::files::fgetfilelines(const wstring& path, vector<char>& buffer)
vector<char*> msra::files::fgetfilelines(const wstring& path, vector<char>& buffer, int numberOfTries)
{
// load it into RAM in one huge chunk
const size_t len = fgetfilechars(path, buffer);
size_t len = 0;
msra::util::attempt(numberOfTries, [&]() // (can be reading from network)
{
// load it into RAM in one huge chunk
fgetfilechars(path, buffer, len);
});
// parse into lines
vector<char*> lines;
lines.reserve(len / 20);
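A sketch of the new retry-aware overload in use, matching the latticearchive.h call above that passes 3 tries; the TOC path is hypothetical:
// Illustrative sketch only; not part of this commit.
static std::vector<std::string> ReadTocWithRetries(const std::wstring& tocPath)
{
    std::vector<char> buffer;
    std::vector<std::string> lines;
    // Each try re-reads the whole file; msra::util::attempt() retries the lambda on failure.
    msra::files::fgetfilelines(tocPath, buffer, lines, /*numberOfTries =*/ 3);
    return lines;
}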

Просмотреть файл

@ -72,6 +72,7 @@ static shared_ptr<ComputationNode<ElemType>> CreateStandardNode(const std::wstri
else if (nodeType == OperationNameOf(InvStdDevNode)) return New<InvStdDevNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(KhatriRaoProductNode)) return New<KhatriRaoProductNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(LogNode)) return New<LogNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(LogPlusNode)) return New<LogPlusNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(LogSoftmaxNode)) return New<LogSoftmaxNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(LookupTableNode)) return New<LookupTableNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(MatrixL1RegNode)) return New<MatrixL1RegNode<ElemType>>(forward<_Types>(_Args)...);
@ -657,6 +658,12 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Plus(
return net.AddNodeToNetAndAttachInputs(New<PlusNode<ElemType>>(net.GetDeviceId(), nodeName), { a, b });
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::LogPlus(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<LogPlusNode<ElemType>>(net.GetDeviceId(), nodeName), { a, b });
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Less(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName)
{

Просмотреть файл

@ -134,6 +134,7 @@ public:
ComputationNodePtr InvStdDev(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr KhatriRaoProduct(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr Log(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr LogPlus(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr LogSoftmax(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr Logistic(const ComputationNodePtr a, const ComputationNodePtr b, const ComputationNodePtr c, const std::wstring nodeName = L"");
ComputationNodePtr Logistic(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");

Просмотреть файл

@ -1423,8 +1423,8 @@ public:
m_gradientInitialized = true;
}
// resize and reset this node's gradient to a given matrix's value
void ResetGradient(const Matrix<ElemType>& val)
// Assign the given matrix's value to this node's gradient. The matrix sizes must match.
void AssignGradient(const Matrix<ElemType>& val)
{
UpdateDataSize(Gradient());

Просмотреть файл

@ -67,6 +67,8 @@ template class PlusNode<double>;
// -----------------------------------------------------------------------
// LogPlusNode (summand1, summand2)
// Computes ln(exp(summand1) + exp(summand2)) in an overflow-safe way.
// Useful e.g. for computing a softmax over a sequence.
// -----------------------------------------------------------------------
template <class ElemType>
@ -105,8 +107,16 @@ public:
if (Input(inputIndex)->ReducesInTimeWrt(Input(1 - inputIndex)))
Input(1 - inputIndex)->MaskMissingValueColumnsToZero(fr);
// TODO: would be nice to state the derivative here in a comment
inputGradient.AddElementwiseProductWithLogSumDerivativeOf(gradient, input0, input1);
if (inputIndex == 0)
{
// d/dx ln(exp(x) + exp(y)) = exp(x) / (exp(x) + exp(y)) = 1 / (1 + exp(y-x)) = sigmoid(x-y)
inputGradient.AddElementwiseProductWithLogSumDerivativeOf(gradient, input1, input0);
}
else
{
// d/dy ln(exp(x) + exp(y)) = exp(y) / (exp(x) + exp(y)) = 1 / (1 + exp(x-y)) = sigmoid(y-x)
inputGradient.AddElementwiseProductWithLogSumDerivativeOf(gradient, input0, input1);
}
}
};
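For reference, a standalone sketch (not CNTK code) of the quantity LogPlusNode computes and the sigmoid form of its partial derivatives noted in the comments above:
#include <algorithm>
#include <cmath>

// logadd(x, y) = ln(exp(x) + exp(y)), evaluated without overflow:
// factor out the larger argument so the remaining exponent is <= 0.
static double LogAdd(double x, double y)
{
    double hi = std::max(x, y), lo = std::min(x, y);
    return hi + std::log1p(std::exp(lo - hi));
}

static double Sigmoid(double z) { return 1.0 / (1.0 + std::exp(-z)); }

// d/dx logadd(x, y) = sigmoid(x - y); a central difference can be used to spot-check it.
static double NumericalGradientX(double x, double y, double eps = 1e-6)
{
    return (LogAdd(x + eps, y) - LogAdd(x - eps, y)) / (2 * eps);
}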

Просмотреть файл

@ -8,7 +8,8 @@
#define __STDC_FORMAT_MACROS
#include <inttypes.h>
#include "stdafx.h"
#include <stdio.h>
#include <math.h>
#define EVAL_EXPORTS // creating the exports here
#include "Eval.h"
#include "Actions.h"
@ -26,6 +27,7 @@
#include "NoRandomizer.h"
#include "HeapMemoryProvider.h"
#include "InputAndParamNodes.h"
#include "latticearchive.h"
// TODO: Temporary mechanism to enable memory sharing for
// node output value matrices. This will go away when the
@ -99,6 +101,8 @@ extern "C" EVAL_API void GetEvalD(IEvaluateModel<double>** peval)
template <typename ElemType>
void CNTKEval<ElemType>::GetNodeDimensions(std::map<std::wstring, size_t>& dimensions, NodeGroup nodeGroup)
{
// On Linux with gcc 4.8.4, "this->" must be added when referencing m_net, a protected member inherited from a templated (dependent) base class,
// so that two-phase name lookup resolves the name correctly.
if (this->m_net == NULL)
{
for (auto iter = dimensions.begin(); iter != dimensions.end(); iter++)
@ -317,15 +321,17 @@ void CNTKEvalExtended<ElemType>::ForwardPassT(const std::vector<ValueBuffer<Elem
RuntimeError("Expected %d outputs, but got %d.", (int)m_outputNodes.size(), (int)outputs.size());
size_t i = 0;
for (auto& input : m_inputMatrices)
for (auto& inputNode : m_inputNodes)
{
// const cast: The matrix class takes this over without copying and could theoretically change the contents,
// though it doesn't in this case.
auto& buffer = const_cast<ValueBuffer<ElemType, ValueContainer>&>(inputs[i]);
shared_ptr<Matrix<ElemType>> matrix = dynamic_pointer_cast<Matrix<ElemType>>(input.second.matrix);
auto matrix = dynamic_pointer_cast<Matrix<ElemType>>(inputNode->ValuePtr());
auto type = matrix->GetMatrixType();
size_t numRows = input.second.sampleLayout.GetNumElements();
size_t numRows = inputNode->GetSampleLayout().GetNumElements();
if (buffer.m_buffer.data() == nullptr)
RuntimeError("Input %ls: Buffer is not allocated.", m_inputNodes[i]->GetName().c_str());
if (type == MatrixType::DENSE)
{
if (buffer.m_buffer.size() % numRows != 0)
@ -336,8 +342,12 @@ void CNTKEvalExtended<ElemType>::ForwardPassT(const std::vector<ValueBuffer<Elem
}
else if (type == MatrixType::SPARSE)
{
if (buffer.m_colIndices.data() == nullptr)
RuntimeError("Input %ls: Due to sparse input format, expected colIndices array, but was nullptr.", m_inputNodes[i]->GetName().c_str());
if (buffer.m_indices.data() == nullptr)
RuntimeError("Input %ls: Due to sparse input format, expected Indices array, but was nullptr.", m_inputNodes[i]->GetName().c_str());
if (buffer.m_colIndices.size() < 2)
RuntimeError("Input %ls: Expected at least one element.", m_inputNodes[i]->GetName().c_str());
RuntimeError("Input %ls: Expected at least one element (2 entries in colIndices array).", m_inputNodes[i]->GetName().c_str());
if (buffer.m_colIndices[0] != 0)
RuntimeError("Input %ls: First element of column indices must be 0", m_inputNodes[i]->GetName().c_str());
if (buffer.m_colIndices[buffer.m_colIndices.size() - 1] != buffer.m_indices.size())
@ -348,8 +358,8 @@ void CNTKEvalExtended<ElemType>::ForwardPassT(const std::vector<ValueBuffer<Elem
int numCols = type == MatrixType::DENSE ? buffer.m_buffer.size() / numRows : buffer.m_colIndices.size() - 1;
assert(numCols >= 1);
input.second.pMBLayout->Init(1, numCols);
input.second.pMBLayout->AddSequence(0, 0, 0, numCols);
inputNode->GetMBLayout()->Init(1, numCols);
inputNode->GetMBLayout()->AddSequence(0, 0, 0, numCols);
if (type == MatrixType::DENSE)
matrix->SetValue(numRows, numCols, matrix->GetDeviceId(), buffer.m_buffer.data(), matrixFlagNormal);
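The sparse checks above assume the CSC layout documented in the Eval headers. A concrete conforming input for one sequence of three sparse columns with 2 / 4 / 2 non-zero entries (the same example reused by the managed wrapper further down); these vectors would populate m_buffer, m_indices and m_colIndices of a ValueBuffer:
// Illustrative data only; not part of this commit.
std::vector<float> buffer     = { 0, 1, 2, 3, 4, 5, 6, 7 };   // all non-zero values, columns concatenated
std::vector<int>   indices    = { 1, 3, 2, 3, 5, 6, 2, 7 };   // row index of each value, ascending per column
std::vector<int>   colIndices = { 0, 2, 6, 8 };               // numCols + 1 entries: starts at 0,
                                                              // ends at indices.size()
// numCols = colIndices.size() - 1 = 3, exactly as ForwardPassT computes above.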

Просмотреть файл

@ -134,8 +134,6 @@
<ClInclude Include="..\Common\Include\TimerUtility.h" />
<ClInclude Include="EvalReader.h" />
<ClInclude Include="EvalWriter.h" />
<ClInclude Include="stdafx.h" />
<ClInclude Include="targetver.h" />
<ClInclude Include="CNTKEval.h" />
</ItemGroup>
<ItemGroup>
@ -146,12 +144,9 @@
<PrecompiledHeader>
</PrecompiledHeader>
</ClCompile>
<ClCompile Include="stdafx.cpp">
<PrecompiledHeader>Create</PrecompiledHeader>
</ClCompile>
<ClCompile Include="CNTKEval.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>
</Project>

Просмотреть файл

@ -5,9 +5,6 @@
<ClCompile Include="dllmain.cpp">
<Filter>Misc</Filter>
</ClCompile>
<ClCompile Include="stdafx.cpp">
<Filter>Misc</Filter>
</ClCompile>
<ClCompile Include="..\CNTK\BrainScript\BrainScriptEvaluator.cpp">
<Filter>BrainScript</Filter>
</ClCompile>
@ -31,12 +28,6 @@
<ClInclude Include="..\Common\Include\Basics.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="stdafx.h">
<Filter>Misc</Filter>
</ClInclude>
<ClInclude Include="targetver.h">
<Filter>Misc</Filter>
</ClInclude>
<ClInclude Include="..\Common\Include\Config.h">
<Filter>Common\Include</Filter>
</ClInclude>

Просмотреть файл

@ -4,7 +4,7 @@
//
// dllmain.cpp : Defines the entry point for the DLL application.
//
#include "stdafx.h"
#ifdef _WIN32
#define WIN32_LEAN_AND_MEAN
#define NOMINMAX

Просмотреть файл

@ -1,8 +0,0 @@
// stdafx.cpp : source file that includes just the standard includes
// ParseNumber.pch will be the pre-compiled header
// stdafx.obj will contain the pre-compiled type information
#include "stdafx.h"
// TODO: reference any additional headers you need in STDAFX.H
// and not in this file

Просмотреть файл

@ -1,17 +0,0 @@
// stdafx.h : include file for standard system include files,
// or project specific include files that are used frequently, but
// are changed infrequently
//
#pragma once
#ifndef _CRT_SECURE_NO_WARNINGS
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
#endif
#ifdef _WIN32
#include "targetver.h"
#endif
#include <stdio.h>
#include <math.h>
// TODO: reference additional headers your program requires here

Просмотреть файл

@ -1,8 +0,0 @@
#pragma once
// Including SDKDDKVer.h defines the highest available Windows platform.
// If you wish to build your application for a previous Windows platform, include WinSDKVer.h and
// set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h.
#include <SDKDDKVer.h>

Просмотреть файл

@ -1,12 +0,0 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// stdafx.cpp : source file that includes just the standard includes
// CPPEvalClient.pch will be the pre-compiled header
// stdafx.obj will contain the pre-compiled type information
#include "stdafx.h"
// TODO: reference any additional headers you need in STDAFX.H
// and not in this file

Просмотреть файл

@ -1,19 +0,0 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// stdafx.h : include file for standard system include files,
// or project specific include files that are used frequently, but
// are changed infrequently
//
#pragma once
#include "targetver.h"
#include <stdio.h>
#include <tchar.h>
#include "targetver.h"
// This is a windows only application
#include "Windows.h"

Просмотреть файл

@ -1,13 +0,0 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
// Including SDKDDKVer.h defines the highest available Windows platform.
// If you wish to build your application for a previous Windows platform, include WinSDKVer.h and
// set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h.
#include <SDKDDKVer.h>

Просмотреть файл

@ -1,80 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="12.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<Platform Condition=" '$(Platform)' == '' ">AnyCPU</Platform>
<ProjectGuid>{41E11A59-62B2-4927-A4F8-F40B1B612C6C}</ProjectGuid>
<OutputType>Exe</OutputType>
<AppDesignerFolder>Properties</AppDesignerFolder>
<RootNamespace>Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient</RootNamespace>
<AssemblyName>CSEvalClient</AssemblyName>
<TargetFrameworkVersion>v4.5</TargetFrameworkVersion>
<FileAlignment>512</FileAlignment>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Debug|x64'">
<DebugSymbols>true</DebugSymbols>
<OutputPath>..\..\..\x64\Debug\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<DebugType>full</DebugType>
<PlatformTarget>x64</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
<Prefer32Bit>true</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Debug_CpuOnly|x64'">
<DebugSymbols>true</DebugSymbols>
<OutputPath>..\..\..\x64\Debug_CpuOnly\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<DebugType>full</DebugType>
<PlatformTarget>x64</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
<Prefer32Bit>true</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Release|x64'">
<OutputPath>..\..\..\x64\Release\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<Optimize>true</Optimize>
<DebugType>pdbonly</DebugType>
<PlatformTarget>x64</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
<Prefer32Bit>true</Prefer32Bit>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Release_CpuOnly|x64'">
<OutputPath>..\..\..\x64\Release_CpuOnly\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<Optimize>true</Optimize>
<DebugType>pdbonly</DebugType>
<PlatformTarget>x64</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
<Prefer32Bit>true</Prefer32Bit>
</PropertyGroup> <ItemGroup>
<Reference Include="System" />
<Reference Include="System.Core" />
<Reference Include="Microsoft.CSharp" />
</ItemGroup>
<ItemGroup>
<Compile Include="Program.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
</ItemGroup>
<ItemGroup>
<None Include="App.config" />
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\EvalWrapper\EvalWrapper.vcxproj">
<Project>{ef766cae-9cb1-494c-9153-0030631a6340}</Project>
<Name>EvalWrapper</Name>
</ProjectReference>
</ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
Other similar extension points exist, see Microsoft.Common.targets.
<Target Name="BeforeBuild">
</Target>
<Target Name="AfterBuild">
</Target>
-->
</Project>

Просмотреть файл

@ -0,0 +1,105 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// CNTKException.h -- Managed CNTK Exception wrappers
//
#include "ExceptionWithCallStack.h"
using namespace std;
using namespace System;
using namespace System::Collections::Generic;
using namespace System::Collections;
using namespace System::Runtime::Serialization;
using namespace Microsoft::MSR::CNTK;
namespace Microsoft { namespace MSR { namespace CNTK { namespace Extensibility { namespace Managed {
[Serializable]
public ref class CNTKException : Exception, ISerializable
{
public:
CNTKException() : Exception()
{}
CNTKException(String^ message) : Exception(message)
{}
CNTKException(String^ message, String^ callstack) : Exception(message), NativeCallStack(callstack)
{}
const String^ NativeCallStack;
protected:
CNTKException(SerializationInfo^ info, StreamingContext context) : Exception(info, context)
{}
};
[Serializable]
public ref class CNTKRuntimeException : CNTKException
{
public:
CNTKRuntimeException() : CNTKException()
{}
CNTKRuntimeException(String^ message, String^ callstack) : CNTKException(message, callstack)
{}
protected:
CNTKRuntimeException(SerializationInfo^ info, StreamingContext context) : CNTKException(info, context)
{}
};
[Serializable]
public ref class CNTKLogicErrorException : CNTKException
{
public:
CNTKLogicErrorException() : CNTKException()
{}
CNTKLogicErrorException(String^ message, String^ callstack) : CNTKException(message, callstack)
{}
protected:
CNTKLogicErrorException(SerializationInfo^ info, StreamingContext context) : CNTKException(info, context)
{}
};
[Serializable]
public ref class CNTKInvalidArgumentException : CNTKException
{
public:
CNTKInvalidArgumentException() : CNTKException()
{}
CNTKInvalidArgumentException(String^ message, String^ callstack) : CNTKException(message, callstack)
{}
protected:
CNTKInvalidArgumentException(SerializationInfo^ info, StreamingContext context) : CNTKException(info, context)
{}
};
[Serializable]
public ref class CNTKBadAllocException : CNTKException
{
public:
CNTKBadAllocException() : CNTKException()
{}
CNTKBadAllocException(String^ message) : CNTKException(message)
{}
protected:
CNTKBadAllocException(SerializationInfo^ info, StreamingContext context) : CNTKException(info, context)
{}
};
}}}}}
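A sketch of how the exception types above are meant to be used at the native/managed boundary; only the generic translation is shown, and the wrapper function name is hypothetical:
// Illustrative sketch only; not part of this commit.
void CallIntoNativeEval(/* native call captured by the real wrapper methods */)
{
    try
    {
        // ... invoke the native IEvaluateModelExtended implementation here ...
    }
    catch (const std::bad_alloc& ex)
    {
        throw gcnew CNTKBadAllocException(gcnew System::String(ex.what()));
    }
    catch (const std::exception& ex)
    {
        // The real wrapper also distinguishes runtime/logic/invalid-argument errors and
        // forwards the native call stack via the two-argument constructors above.
        throw gcnew CNTKException(gcnew System::String(ex.what()));
    }
}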

Просмотреть файл

@ -0,0 +1,31 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// EvalCommon.h -- Common structures used by managed code wrapping the native EvaluateModel interface
//
namespace Microsoft { namespace MSR { namespace CNTK { namespace Extensibility { namespace Managed {
/// Enumeration for the types of nodes
public enum class NodeGroup
{
Input, // an input node
Output, // an output node
Specified
};
public enum class DataType
{
Float32,
Float64
};
public enum class StorageType
{
Unknown,
Dense,
Sparse,
};
}}}}}

Просмотреть файл

@ -0,0 +1,558 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// EvalExtendedWrapper.cpp -- Managed code wrapping the native EvaluateExtendedModel interface
//
#include <windows.h>
#include <vcclr.h>
#include <string>
#include <utility>
#include <vector>
#include <memory>
#include <msclr\marshal_cppstd.h>
#include "CNTKException.h"
#pragma warning(push)
#pragma warning(disable : 4793) // Function compiled as native
#include "Basics.h"
#include "ScriptableObjects.h"
#pragma warning(pop)
#include "EvalCommon.h"
#include "Eval.h"
#using <System.dll>
#using <System.Collections.dll>
#using <System.IO.dll>
#using <System.Reflection.dll>
using namespace std;
using namespace System;
using namespace System::Collections::Generic;
using namespace System::Collections;
namespace Microsoft { namespace MSR { namespace CNTK { namespace Extensibility { namespace Managed {
namespace Native = Microsoft::MSR::CNTK;
// Used for retrieving the appropriate model for the element type (float / double)
template<typename ElemType>
using GetEvalProc = void(*)(IEvaluateModelExtended<ElemType>**);
//
// A buffer to keep data for all samples in a (variable length) sequence
// from a single input or output.
// This is used for both dense and sparse data.
//
generic<class ElemType>
public ref class ValueBuffer
{
public:
ValueBuffer()
{
Size = 0;
}
//
// Init for Dense
//
ValueBuffer(int bufferSize)
{
Buffer = gcnew array<ElemType>(bufferSize);
Size = bufferSize;
}
//
// Init for Sparse
//
ValueBuffer(int bufferSize, int colIndicesSize)
{
Buffer = gcnew array<ElemType>(bufferSize);
Indices = gcnew array<int>(bufferSize);
ColIndices = gcnew array<int>(colIndicesSize);
Size = colIndicesSize - 1;
}
//
    // For dense, this is the length of Buffer (in number of ElemTypes).
// For sparse, this is the length of ColIndices (i.e. the number of columns + 1).
// This allows Buffer / Indices / ColIndices to be larger than Size to avoid
// reallocation.
//
property int Size;
//
// All elements of a sequence, concatenated.
    // For dense inputs, the number of samples is given by the length of
// this vector / product of tensor dimensions. E.g. for a tensor of dimension
// [2,2] and 12 elements in the buffer, the number of samples is 3.
// For sparse inputs, the number of samples is indicated by the ColIndices field.
//
property array<ElemType>^ Buffer;
// In case of sparse data, the following is also used. Otherwise, the
// contents are ignored.
// E.g. a sequence of three sparse vectors with 2 / 4 / 2 non-zero values
// could be represented as the following:
// colIdx: 0 2 6 8
// v v v v
// indices 1 3 2 3 5 6 2 7
// buffer 0 1 2 3 4 5 6 7
//
// For every element in buffer, an entry in this array gives its position.
// For every vector the entries must be ascending.
//
property array<int>^ Indices;
//
    // Contains numberOfSamples + 1 indices into the buffer. The first entry
// is always 0. The last entry points after the last element.
// See http://docs.nvidia.com/cuda/cusparse/#compressed-sparse-column-format-csc
//
property array<int>^ ColIndices;
// TODO: Should it have a read-only StorageType property?
};
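    //
    // Illustrative sketch (not part of the original wrapper): building the sparse
    // three-sample sequence from the comment above (2 / 4 / 2 non-zero values,
    // buffer values 0..7). The helper name is hypothetical.
    //
    ValueBuffer<float>^ MakeExampleSparseSequence()
    {
        // 8 non-zero values across 3 samples -> ColIndices needs 3 + 1 entries.
        ValueBuffer<float>^ vb = gcnew ValueBuffer<float>(8, 4);
        for (int i = 0; i < 8; i++)
        {
            vb->Buffer[i] = (float)i;          // concatenated non-zero values 0..7
        }
        array<int>^ indices    = gcnew array<int> { 1, 3, 2, 3, 5, 6, 2, 7 };  // row position of each value
        array<int>^ colIndices = gcnew array<int> { 0, 2, 6, 8 };              // CSC offsets into Buffer
        indices->CopyTo(vb->Indices, 0);
        colIndices->CopyTo(vb->ColIndices, 0);
        return vb;                             // vb->Size == 3 (ColIndices length - 1), per the sparse constructor
    }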
//
// Meta data
//
public ref struct VariableLayout
{
// Name of the input
property String^ Name;
property DataType DataType;
property StorageType StorageType;
// Dimension of the tensor, flattened to 1 dimension, for one entry on the dynamic axis.
// E.g. for a tensor [2,3,*] this would be 6.
property int NumElements;
};
public ref class VariableSchema : List<VariableLayout^>
{
public:
generic<typename ElemType>
array<ValueBuffer<ElemType>^>^ CreateBuffers(... array<int>^ maxLengths)
{
if (maxLengths->Length == 0)
{
maxLengths = gcnew array<int>(this->Count);
for (int i = 0; i<maxLengths->Length; i++)
{
maxLengths[i] = 1;
}
}
if (maxLengths->Length != this->Count)
{
throw gcnew CNTKRuntimeException("Expected max lengths for all variables.", String::Empty);
}
array<ValueBuffer<ElemType>^>^ buffers = gcnew array<ValueBuffer<ElemType>^>(this->Count);
for (int i = 0; i < this->Count; i++)
{
buffers[i] = gcnew ValueBuffer<ElemType>(this[i]->NumElements * maxLengths[i]);
}
return buffers;
}
};
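    //
    // Example (illustrative): for a schema with two variables of NumElements 3 and 5,
    // CreateBuffers<float>() allocates dense buffers of 3 and 5 elements (one sample each),
    // while CreateBuffers<float>(10, 1) allocates 30 elements for the first variable
    // (room for up to 10 samples) and 5 for the second.
    //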
/// Managed wrapper for the native evaluation model
template<typename ElemType>
public ref class ModelEvaluationExtended : IDisposable
{
typedef std::pair<std::wstring, std::vector<ElemType>*> MapEntry;
typedef std::shared_ptr<Native::ValueBuffer<ElemType, Native::VectorRef>> ValueBufferPtr;
public:
    /// <summary>Initializes a new instance of the <see cref="ModelEvaluationExtended"/> class.</summary>
/// <param name="funcName">Factory function name for retrieving the native model from the dll.</param>
ModelEvaluationExtended(String^ funcName)
{
try
{
pin_ptr <IEvaluateModelExtended<ElemType>*> p_eval = &m_eval;
GetEvalExtended<ElemType>(p_eval);
}
catch (const exception& ex)
{
throw gcnew CNTKException(gcnew System::String(ex.what()));
}
}
/// <summary>Creates a network based on the network description in the configuration</summary>
/// <param name="networkDescription">The configuration file containing the network description</param>
void CreateNetwork(String^ networkDescription)
{
if (m_eval == nullptr)
{
throw gcnew ObjectDisposedException("Object has been disposed.");
}
msclr::interop::marshal_context context;
const std::string stdNetworkDescription = context.marshal_as<std::string>(networkDescription);
try
{
m_eval->CreateNetwork(stdNetworkDescription);
}
catch (const exception& ex)
{
throw GetCustomException(ex);
}
}
//
// GetInputSchema - retrieve information about tensor shapes and memory layout for this model.
//
VariableSchema^ GetInputSchema()
{
if (m_eval == nullptr)
{
throw gcnew ObjectDisposedException("Object has been disposed.");
}
return ConvertNativeSchemaToManaged(m_eval->GetInputSchema());
}
//
// GetOutputSchema - retrieve information about tensor shapes and memory layout for this model.
//
VariableSchema^ GetOutputSchema()
{
if (m_eval == nullptr)
{
throw gcnew ObjectDisposedException("Object has been disposed.");
}
return ConvertNativeSchemaToManaged(m_eval->GetOutputSchema());
}
//
// Allocate internal state for calling ForwardPass(). The call restricts the network (inputs and outputs)
    // to the functions needed to compute the requested output names.
//
void StartForwardEvaluation(List<String^>^ outputs)
{
if (m_eval == nullptr)
{
throw gcnew ObjectDisposedException("Object has been disposed.");
}
std::vector<wstring> outputNodeNames;
msclr::interop::marshal_context context;
for each (String^ output in outputs)
{
outputNodeNames.push_back(context.marshal_as<std::wstring>(output));
}
try
{
m_eval->StartForwardEvaluation(outputNodeNames);
}
catch (const exception& ex)
{
throw GetCustomException(ex);
}
}
//
// Forward Pass - Evaluate (perform a forward pass for) a single unit using the model with the given inputs and
// outputs.
    // The layout and shape of the data in the input buffers must match the schema returned by GetInputSchema().
    // This method is not reentrant, as the forward pass keeps internal state.
    // inputs - array of input buffers, one for every input as given by GetInputSchema()
    // outputs - array of output buffers, one for every output requested in StartForwardEvaluation();
    //           the buffers must be preallocated by the caller (e.g. via VariableSchema::CreateBuffers)
    // Called after StartForwardEvaluation(). (An illustrative usage sketch follows ModelEvaluationExtendedD below.)
//
void ForwardPass(array<ValueBuffer<ElemType>^>^ inputs, array<ValueBuffer<ElemType>^>^ outputs)
{
if (m_eval == nullptr)
{
throw gcnew ObjectDisposedException("Object has been disposed.");
}
try
{
Native::ValueRefs<ElemType> stdInputs;
Native::ValueRefs<ElemType> stdOutputs;
// Hold gc objects in the stack, while performing native actions
vector<gcroot<array<ElemType>^>> pinBuffers;
vector<gcroot<array<int>^>> pinIndices;
// Map the managed space into the native space, results will be written directly into the managed memory space
// https://msdn.microsoft.com/en-us/library/1dz8byfh.aspx
TransferVectorsToValueBuffers(inputs, stdInputs, pinBuffers, pinIndices, StorageType::Sparse);
TransferVectorsToValueBuffers(outputs, stdOutputs, pinBuffers, pinIndices, StorageType::Dense);
try
{
m_eval->ForwardPass(stdInputs, stdOutputs);
// Update actual output size.
for (int i = 0; i < outputs->Length; ++i)
{
outputs[i]->Size = (int)stdOutputs[i].m_buffer.m_size;
}
}
catch (const exception& ex)
{
throw GetCustomException(ex);
}
}
catch (Exception^)
{
throw;
}
}
~ModelEvaluationExtended()
{
if (m_eval == nullptr)
{
return;
}
this->!ModelEvaluationExtended();
}
protected:
!ModelEvaluationExtended()
{
if (m_eval != nullptr)
{
m_eval->Destroy();
m_eval = nullptr;
}
}
private:
// Native model evaluation instance
IEvaluateModelExtended<ElemType> *m_eval;
    /// <summary>Creates a CLR exception corresponding to a native exception</summary>
    /// <param name="ex">The native exception to convert into a CLR exception</param>
/// <returns>A CLR exception</returns>
CNTKException^ GetCustomException(const exception& ex)
{
// Determine the appropriate exception and initialize it with the exception payload
if (typeid(ex) == typeid(ExceptionWithCallStack<runtime_error>))
{
ExceptionWithCallStack<runtime_error>& rich = dynamic_cast<ExceptionWithCallStack<runtime_error>&>((runtime_error&)ex);
return gcnew CNTKRuntimeException(gcnew System::String(rich.what()), gcnew System::String(rich.CallStack()));
}
else if (typeid(ex) == typeid(ExceptionWithCallStack<logic_error>))
{
ExceptionWithCallStack<logic_error>& rich = dynamic_cast<ExceptionWithCallStack<logic_error>&>((logic_error&)ex);
return gcnew CNTKLogicErrorException(gcnew System::String(ex.what()), gcnew System::String(rich.CallStack()));
}
else if (typeid(ex) == typeid(ExceptionWithCallStack<invalid_argument>))
{
ExceptionWithCallStack<invalid_argument>& rich = dynamic_cast<ExceptionWithCallStack<invalid_argument>&>((invalid_argument&)ex);
return gcnew CNTKInvalidArgumentException(gcnew System::String(ex.what()), gcnew System::String(rich.CallStack()));
}
else if (typeid(ex) == typeid(bad_alloc))
{
return gcnew CNTKBadAllocException(gcnew System::String(ex.what()));
}
else if (dynamic_cast<const ScriptableObjects::ScriptingException*>(&ex) != nullptr) // Includes derived classes
{
const auto& err = dynamic_cast<const ScriptableObjects::ScriptingException&>(ex);
return gcnew CNTKLogicErrorException(gcnew System::String(wstrprintf(L"%ls\n%ls", utf16(err.what()).c_str(), err.GetError(L"").c_str()).c_str()), nullptr);
}
else
{
return gcnew CNTKException(gcnew System::String(ex.what()));
}
}
    /// <summary>Converts a managed (CLI) NodeGroup to the corresponding native NodeGroup</summary>
/// <param name="nodeGroup">The managed (CLI) NodeGroup to convert to native</param>
Native::NodeGroup GetNodeGroup(NodeGroup nodeGroup)
{
switch ((int)nodeGroup)
{
case Native::NodeGroup::nodeInput:
return Native::NodeGroup::nodeInput;
case Native::NodeGroup::nodeOutput:
return Native::NodeGroup::nodeOutput;
case Native::NodeGroup::nodeSpecified:
return Native::NodeGroup::nodeSpecified;
default:
            throw gcnew CNTKRuntimeException(String::Format("Cannot convert managed NodeGroup with value: {0} to the corresponding native NodeGroup.", (int)nodeGroup), "");
}
}
DataType GetDataType(Microsoft::MSR::CNTK::VariableLayout::DataType dataType)
{
switch ((int)dataType)
{
case DataType::Float32:
return DataType::Float32;
case DataType::Float64:
return DataType::Float64;
default:
throw gcnew CNTKRuntimeException(String::Format("Cannot convert native DataType with value: {0} to corresponding managed DataType.", (int)dataType), "");
}
}
StorageType GetStorageType(Microsoft::MSR::CNTK::VariableLayout::StorageType storageType)
{
switch ((int)storageType)
{
case StorageType::Dense:
return StorageType::Dense;
case StorageType::Sparse:
return StorageType::Sparse;
case StorageType::Unknown:
return StorageType::Unknown;
default:
throw gcnew CNTKRuntimeException(String::Format("Cannot convert native StorageType with value: {0} to corresponding managed StorageType.", (int)storageType), "");
}
}
void PinBuffer(array<ElemType>^ itemBuffer, vector<gcroot<array<ElemType>^>>& pinBuffers, Native::ValueBuffer<ElemType, Native::VectorRef>* vb, StorageType storageType, int bufferSize)
{
// gcroot object manages the pointer so that it always corresponds to the correct managed location (even after gc relocation)
gcroot<array<ElemType>^> pBuf(itemBuffer);
pin_ptr<ElemType> pp = &(pBuf[0]);
pinBuffers.push_back(pBuf);
vb->m_buffer.InitFrom(pp, bufferSize, storageType == StorageType::Sparse ? bufferSize : 0);
pp = nullptr;
}
void PinIndices(array<int>^ itemBuffer, vector<gcroot<array<int>^>>& pinBuffers, Native::ValueBuffer<ElemType, Native::VectorRef>* vb, StorageType storageType, int bufferSize)
{
// gcroot object manages the pointer so that it always corresponds to the correct managed location (even after gc relocation)
gcroot<array<int>^> pBuf(itemBuffer);
pin_ptr<int> pp = &(pBuf[0]);
pinBuffers.push_back(pBuf);
vb->m_indices.InitFrom(pp, bufferSize, storageType == StorageType::Sparse ? bufferSize : 0);
pp = nullptr;
}
void PinColIndices(array<int>^ itemBuffer, vector<gcroot<array<int>^>>& pinBuffers, Native::ValueBuffer<ElemType, Native::VectorRef>* vb, StorageType storageType, int bufferSize)
{
// gcroot object manages the pointer so that it always corresponds to the correct managed location (even after gc relocation)
gcroot<array<int>^> pBuf(itemBuffer);
pin_ptr<int> pp = &(pBuf[0]);
pinBuffers.push_back(pBuf);
vb->m_colIndices.InitFrom(pp, bufferSize, storageType == StorageType::Sparse ? bufferSize : 0);
pp = nullptr;
}
void TransferVectorsToValueBuffers(array<ValueBuffer<ElemType>^>^ list, Native::ValueRefs<ElemType>& valueRefs, vector<gcroot<array<ElemType>^>>& pinBuffers, vector<gcroot<array<int>^>>& pinIndices, StorageType storageType)
{
for each (auto item in list)
{
Native::ValueBuffer<ElemType, Native::VectorRef> vb;
int numElements = item->Size;
int bufferSize = item->ColIndices != nullptr ? item->ColIndices[item->Size - 1] : item->Size;
// Buffer is required
if (item->Buffer == nullptr)
{
throw gcnew CNTKRuntimeException("Invalid buffer (empty) for argument into ForwardPass", String::Empty);
}
PinBuffer(item->Buffer, pinBuffers, &vb, storageType, bufferSize);
if (item->Indices != nullptr)
{
PinIndices(item->Indices, pinIndices, &vb, storageType, bufferSize);
}
if (item->ColIndices != nullptr)
{
PinColIndices(item->ColIndices, pinIndices, &vb, storageType, numElements);
}
valueRefs.push_back(vb);
}
}
//
    // ConvertNativeSchemaToManaged - Converts a native schema to a managed one
//
VariableSchema^ ConvertNativeSchemaToManaged(Native::VariableSchema layouts)
{
if (m_eval == nullptr)
{
throw gcnew ObjectDisposedException("Object has been disposed.");
}
auto schema = gcnew VariableSchema();
for (auto& lay : layouts)
{
VariableLayout^ varlayout = gcnew VariableLayout();
varlayout->Name = gcnew String(lay.m_name.c_str());
varlayout->DataType = GetDataType(lay.m_dataType);
varlayout->NumElements = static_cast<int>(lay.m_numElements);
varlayout->StorageType = GetStorageType(lay.m_storageType);
schema->Add(varlayout);
}
return schema;
}
};
/// <summary>Managed float-specific model evaluation class</summary>
/// <remarks>This class is necessary due to how generics and templates work in CLR</remarks>
public ref class ModelEvaluationExtendedF : ModelEvaluationExtended<float>
{
public:
ModelEvaluationExtendedF::ModelEvaluationExtendedF()
: ModelEvaluationExtended("GetEvalExtendedF")
{
}
};
/// <summary>Managed double-specific model evaluation class</summary>
/// <remarks>This class is necessary due to how generics and templates work in CLR</remarks>
public ref class ModelEvaluationExtendedD : ModelEvaluationExtended<double>
{
public:
ModelEvaluationExtendedD::ModelEvaluationExtendedD()
: ModelEvaluationExtended("GetEvalExtendedD")
{
}
};
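// Illustrative usage sketch (not part of the original commit): the intended call
// sequence for the extended evaluator. The BrainScript snippet ("model.dnn") and
// the choice to request every output node are placeholders, not part of this file.
void ExampleForwardPass()
{
    ModelEvaluationExtendedF^ model = gcnew ModelEvaluationExtendedF();
    model->CreateNetwork("modelPath=\"model.dnn\"");         // hypothetical network description

    VariableSchema^ outputSchema = model->GetOutputSchema();
    List<String^>^ outputNames = gcnew List<String^>();
    for each (VariableLayout^ layout in outputSchema)
    {
        outputNames->Add(layout->Name);                       // request every output node
    }
    model->StartForwardEvaluation(outputNames);

    VariableSchema^ inputSchema = model->GetInputSchema();
    array<ValueBuffer<float>^>^ inputs  = inputSchema->CreateBuffers<float>();
    array<ValueBuffer<float>^>^ outputs = outputSchema->CreateBuffers<float>();

    // ... fill inputs[i]->Buffer (and Indices/ColIndices for sparse inputs) here ...

    model->ForwardPass(inputs, outputs);                      // outputs[i]->Size now holds the actual output size
}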
// This method tricks the compiler into emitting the methods of the classes
// Refer to https://msdn.microsoft.com/en-us/library/ms177213.aspx for an
// explanation to this behavior
void EmitExtended()
{
ModelEvaluationExtendedF f;
f.CreateNetwork("");
f.GetInputSchema();
f.GetOutputSchema();
f.StartForwardEvaluation(nullptr);
f.ForwardPass(nullptr, nullptr);
ModelEvaluationExtendedD d;
d.CreateNetwork("");
d.GetInputSchema();
d.GetOutputSchema();
d.StartForwardEvaluation(nullptr);
d.ForwardPass(nullptr, nullptr);
VariableSchema sc;
sc.CreateBuffers<float>();
sc.CreateBuffers<double>();
}
}}}}}

Просмотреть файл

@ -13,7 +13,8 @@
#include <memory>
#include <msclr\marshal_cppstd.h>
#include "ExceptionWithCallStack.h"
#include "CNTKException.h"
#include "EvalCommon.h"
#include "Eval.h"
#using <System.dll>
@ -23,25 +24,14 @@ using namespace std;
using namespace System;
using namespace System::Collections::Generic;
using namespace System::Collections;
using namespace System::Runtime::Serialization;
using namespace Microsoft::MSR::CNTK;
namespace Microsoft { namespace MSR { namespace CNTK { namespace Extensibility { namespace Managed {
ref class CNTKException;
// Used for retrieving the model appropriate for the element type (float / double)
template<typename ElemType>
using GetEvalProc = void(*)(IEvaluateModel<ElemType>**);
/// Enumeration for the types of nodes
public enum class NodeGroup
{
nodeInput, // an input node
nodeOutput, // an output node
nodeSpecified
};
/// Managed wrapper for the native evaluation model
template<typename ElemType>
public ref class IEvaluateModelManaged : IDisposable
@ -53,21 +43,10 @@ public:
/// <param name="funcName">Factory function name for retrieving the native model from the dll.</param>
IEvaluateModelManaged(String^ funcName)
{
pin_ptr<const WCHAR> dllname = PtrToStringChars("evaldll.dll");
auto hModule = LoadLibrary(dllname);
if (hModule == nullptr)
{
throw gcnew CNTKException(System::String::Format("Cannot find library: {0}", gcnew String(dllname)));
}
try
{
msclr::interop::marshal_context context;
const std::string func = context.marshal_as<std::string>(funcName);
auto procAddress = GetProcAddress(hModule, func.c_str());
auto getEvalProc = (GetEvalProc<ElemType>)procAddress;
pin_ptr <IEvaluateModel<ElemType>*> p_eval = &m_eval;
getEvalProc(p_eval);
GetEval<ElemType>(p_eval);
}
catch (const exception& ex)
{
@ -248,7 +227,7 @@ public:
try
{
std::vector<shared_ptr<std::vector<ElemType>>> sharedOutputVectors;
int outputSize = GetNodeDimensions(NodeGroup::nodeOutput)[outputKey];
int outputSize = GetNodeDimensions(NodeGroup::Output)[outputKey];
List<ElemType>^ outputs = gcnew List<ElemType>(outputSize);
for (int i = 0; i < outputSize; i++)
@ -394,7 +373,7 @@ public:
/// <returns>Results for requested layer</returns>
List<ElemType>^ Evaluate(Dictionary<String^, List<ElemType>^>^ inputs, String^ outputKey)
{
auto outDims = GetNodeDimensions(NodeGroup::nodeOutput);
auto outDims = GetNodeDimensions(NodeGroup::Output);
int outputSize = outDims[outputKey];
List<ElemType>^ outputs = gcnew List<ElemType>(outputSize);
@ -556,100 +535,6 @@ public:
}
};
[Serializable]
public ref class CNTKException : Exception, ISerializable
{
public:
CNTKException() : Exception()
{}
CNTKException(String^ message) : Exception(message)
{}
CNTKException(String^ message, String^ callstack) : Exception(message), NativeCallStack(callstack)
{}
const String^ NativeCallStack;
[System::Security::Permissions::SecurityPermissionAttribute
(System::Security::Permissions::SecurityAction::LinkDemand,
Flags = System::Security::Permissions::SecurityPermissionFlag::SerializationFormatter)]
virtual void GetObjectData(SerializationInfo^ info, StreamingContext context) override
{
Exception::GetObjectData(info, context);
}
protected:
CNTKException(SerializationInfo^ info, StreamingContext context) : Exception(info, context)
{}
};
[Serializable]
public ref class CNTKRuntimeException : CNTKException
{
public:
CNTKRuntimeException() : CNTKException()
{}
CNTKRuntimeException(String^ message, String^ callstack) : CNTKException(message, callstack)
{}
protected:
CNTKRuntimeException(SerializationInfo^ info, StreamingContext context) : CNTKException(info, context)
{}
};
[Serializable]
public ref class CNTKLogicErrorException : CNTKException
{
public:
CNTKLogicErrorException() : CNTKException()
{}
CNTKLogicErrorException(String^ message, String^ callstack) : CNTKException(message, callstack)
{}
protected:
CNTKLogicErrorException(SerializationInfo^ info, StreamingContext context) : CNTKException(info, context)
{}
};
[Serializable]
public ref class CNTKInvalidArgumentException : CNTKException
{
public:
CNTKInvalidArgumentException() : CNTKException()
{}
CNTKInvalidArgumentException(String^ message, String^ callstack) : CNTKException(message, callstack)
{}
protected:
CNTKInvalidArgumentException(SerializationInfo^ info, StreamingContext context) : CNTKException(info, context)
{}
};
[Serializable]
public ref class CNTKBadAllocException : CNTKException
{
public:
CNTKBadAllocException() : CNTKException()
{}
CNTKBadAllocException(String^ message) : CNTKException(message)
{}
protected:
CNTKBadAllocException(SerializationInfo^ info, StreamingContext context) : CNTKException(info, context)
{}
};
// This method tricks the compiler into emitting the methods of the classes
// Refer to https://msdn.microsoft.com/en-us/library/ms177213.aspx for an
// explanation to this behavior
@ -667,7 +552,7 @@ void emit()
f.CreateNetwork("", 0);
f.CreateNetwork("", nullptr);
f.CreateNetwork("", 0, nullptr);
f.GetNodeDimensions(NodeGroup::nodeSpecified);
f.GetNodeDimensions(NodeGroup::Specified);
IEvaluateModelManagedD d;
d.Init("");
@ -678,7 +563,7 @@ void emit()
d.CreateNetwork("", 0);
d.CreateNetwork("", nullptr);
d.CreateNetwork("", 0,nullptr);
d.GetNodeDimensions(NodeGroup::nodeSpecified);
d.GetNodeDimensions(NodeGroup::Specified);
// Deprecated code, hush warnings locally only
#pragma warning(push)

Просмотреть файл

@ -56,6 +56,8 @@
</ClCompile>
<Link>
<AdditionalLibraryDirectories>$(OutDir)</AdditionalLibraryDirectories>
<AdditionalDependencies>EvalDLL.lib;Common.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<DelayLoadDLLs>EvalDll.dll</DelayLoadDLLs>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(DebugBuild)">
@ -66,8 +68,6 @@
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<DelayLoadDLLs>
</DelayLoadDLLs>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(ReleaseBuild)">
@ -77,15 +77,16 @@
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<DelayLoadDLLs>
</DelayLoadDLLs>
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="CNTKException.h" />
<ClCompile Include="EvalExtendedWrapper.cpp" />
<ClCompile Include="EvalWrapper.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\..\Common\Include\Eval.h" />
<ClInclude Include="EvalCommon.h" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">

Просмотреть файл

@ -16,10 +16,19 @@
<ClInclude Include="..\..\Common\Include\Eval.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="EvalCommon.h">
<Filter>Source Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="EvalWrapper.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="CNTKException.h">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="EvalExtendedWrapper.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
</Project>

Просмотреть файл

@ -0,0 +1,48 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#include <malloc.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include <assert.h>
#include <iostream>
#include <exception>
#include "BlockMultiplierMatrixUtil.h"
#include "BlockHandlerAVX.h"
namespace Microsoft { namespace MSR { namespace CNTK {
int BlockHandlerAVX::RowToColOffsetRewrittenA(int row, int kOffset, int blockSize, int rowsPerBlock, int origCols)
{
int rowIdx = row / rowsPerBlock;
int offsetFromBlockBeginning = row % rowsPerBlock;
int colIdx = kOffset * rowsPerBlock * blockSize + (offsetFromBlockBeginning * blockSize);
return (rowIdx * (origCols / blockSize) * rowsPerBlock * blockSize) + colIdx;
}
//col is the original column of B
//kOffset is the index of the block along the shared dimension k that we are currently multiplying against
int BlockHandlerAVX::RowToColOffsetRewrittenB(int col, int kOffset, int blockSize, int origCols)
{
return (origCols * blockSize * kOffset) + (col * blockSize);
}
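// Worked example (illustrative, not part of the original commit) with blockSize = 8, rowsPerBlock = 4:
//   RowToColOffsetRewrittenA(row = 5, kOffset = 1, origCols = k = 16):
//       rowIdx = 5/4 = 1, offsetFromBlockBeginning = 5%4 = 1,
//       colIdx = 1*4*8 + 1*8 = 40, result = 1*(16/8)*4*8 + 40 = 104.
//   RowToColOffsetRewrittenB(col = 3, kOffset = 1, origCols = n = 16):
//       16*8*1 + 3*8 = 152.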
void BlockHandlerAVX::DumpM256(__m256i dumpMe)
{
union { int32_t i[8]; __m256i y; } u;
u.y = dumpMe;
for (int i = 0; i < 8; ++i)
{
std::cout << u.i[i] << " ";
}
std::cout << std::endl;
}
}}}

Просмотреть файл

@ -0,0 +1,961 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "BlockMultiplierPlatform.h"
#include <immintrin.h>
#include <emmintrin.h>
#include <assert.h>
#include <cstdint>
#define FOR_CNTK
#ifdef FOR_CNTK
#include "CommonMatrix.h"
#endif
namespace Microsoft { namespace MSR { namespace CNTK {
class MATH_API BlockHandlerAVX
{
private:
//USE SSE for the blocks of 8, borrowed from BlockHandlerSSE
FORCEINLINE static void kernelsse8x4(__m128i xmmRow0, __m128i xmmRow1, __m128i xmmRow2, __m128i xmmRow3,
short* B, __m128i* return1, __m128i* return2, __m128i* return3, __m128i* return4);
FORCEINLINE static void kernelavx16x4(__m256i xmmRow0B0a, __m256i xmmRow1B0a, __m256i xmmRow2B0a, __m256i xmmRow3B0a,
short* B, __m256i* return1, __m256i* return2, __m256i * return3, __m256i* return4);
FORCEINLINE static void kernelavx32x4(
__m256i xmmRow0B0a, __m256i xmmRow0B0b,
__m256i xmmRow1B0a, __m256i xmmRow1B0b,
__m256i xmmRow2B0a, __m256i xmmRow2B0b,
__m256i xmmRow3B0a, __m256i xmmRow3B0b,
short* B, __m256i* return1, __m256i* return2, __m256i * return3, __m256i* return4);
FORCEINLINE static void kernelavx64x4(
__m256i xmmRow0B0a, __m256i xmmRow0B0b, __m256i xmmRow0B0c, __m256i xmmRow0B0d,
__m256i xmmRow1B0a, __m256i xmmRow1B0b, __m256i xmmRow1B0c, __m256i xmmRow1B0d,
__m256i xmmRow2B0a, __m256i xmmRow2B0b, __m256i xmmRow2B0c, __m256i xmmRow2B0d,
__m256i xmmRow3B0a, __m256i xmmRow3B0b, __m256i xmmRow3B0c, __m256i xmmRow3B0d,
short* B, __m256i* return1, __m256i* return2, __m256i * return3, __m256i* return4);
FORCEINLINE static void kernelavx128x4(
__m256i xmmRow0B0a, __m256i xmmRow0B0b, __m256i xmmRow0B0c, __m256i xmmRow0B0d,
__m256i xmmRow0B0e, __m256i xmmRow0B0f, __m256i xmmRow0B0g, __m256i xmmRow0B0h,
__m256i xmmRow1B0a, __m256i xmmRow1B0b, __m256i xmmRow1B0c, __m256i xmmRow1B0d,
__m256i xmmRow1B0e, __m256i xmmRow1B0f, __m256i xmmRow1B0g, __m256i xmmRow1B0h,
__m256i xmmRow2B0a, __m256i xmmRow2B0b, __m256i xmmRow2B0c, __m256i xmmRow2B0d,
__m256i xmmRow2B0e, __m256i xmmRow2B0f, __m256i xmmRow2B0g, __m256i xmmRow2B0h,
__m256i xmmRow3B0a, __m256i xmmRow3B0b, __m256i xmmRow3B0c, __m256i xmmRow3B0d,
__m256i xmmRow3B0e, __m256i xmmRow3B0f, __m256i xmmRow3B0g, __m256i xmmRow3B0h,
short* B, __m256i* return1, __m256i* return2, __m256i* return3, __m256i* return4);
FORCEINLINE static void kernelsse8x1(__m128i xmmRow0,
short* B, __m128i* return1);
FORCEINLINE static void kernelavx16x1(__m256i xmmRow0B0a,
short* B, __m256i* return1 );
FORCEINLINE static void kernelavx32x1(
__m256i xmmRow0B0a, __m256i xmmRow0B0b,
short* B, __m256i* return1);
FORCEINLINE static void kernelavx64x1(
__m256i xmmRow0B0a, __m256i xmmRow0B0b, __m256i xmmRow0B0c, __m256i xmmRow0B0d,
short* B, __m256i* return1) ;
FORCEINLINE static void kernelavx128x1(
__m256i xmmRow0B0a, __m256i xmmRow0B0b, __m256i xmmRow0B0c, __m256i xmmRow0B0d,
__m256i xmmRow0B0e, __m256i xmmRow0B0f, __m256i xmmRow0B0g, __m256i xmmRow0B0h,
short* B, __m256i* return1);
//TODO: Should these be refactored somewhere else? Any BlockHandler will need access to these functions.
//Separate class with static functions? Maybe move the Block rewriting functions as well as these to a new
//static class.
static int RowToColOffsetRewrittenB(int col, int kOffset, int blockSize, int origCols);
static int RowToColOffsetRewrittenA(int row, int kOffset, int blockSize, int rowsPerBlock, int origCols);
static void DumpM256(__m256i dumpMe);
public:
typedef __m256i VectorT;
typedef int16_t ScalarAT;
typedef int16_t ScalarBT;
typedef int32_t ScalarCT;
FORCEINLINE static void HandleBlock8x4(int currBlock, int startRow, int k, int n, short* newA, short* B,
int blockCnt, __m128i* resultStorage);
FORCEINLINE static void HandleBlock32x4(int currBlock, int startRow, int k, int n, short* newA, short* B,
int blockCnt, __m256i* resultStorage);
FORCEINLINE static void HandleBlock64x4(int currBlock, int startRow, int k, int n, short* newA, short* B,
int blockCnt, __m256i* resultStorage);
FORCEINLINE static void HandleBlock128x4(int currBlock, int startRow, int k, int n, short* newA, short* B,
int blockCnt, __m256i* resultStorage, VectorT* subtractMe);
FORCEINLINE static void HandleBlock8x1(int currBlock, int startRow, int k, int n, short* newA, short* B,
int blockCnt, __m128i* resultStorage);
FORCEINLINE static void HandleBlock16x1(int currBlock, int startRow, int k, int n, short* newA, short* B,
int blockCnt, __m256i* resultStorage);
FORCEINLINE static void HandleBlock64x1(int currBlock, int startRow, int k, int n, short* newA, short* B,
int blockCnt, __m256i* resultStorage);
FORCEINLINE static void HandleBlock128x1(int currBlock, int startRow, int k, int n, short* newA, short* B,
int blockCnt, __m256i* resultStorage, VectorT* subtractMe);
FORCEINLINE static void HandleBlock16x4(int currBlock, int startRow, int k, int n, short* newA, short* B,
int blockCnt, __m256i* resultStorage);
//FORCEINLINE static void HandleBlock128x4(int currBlock, int startRow, int m, int k, int n, short* newA, short* B,
FORCEINLINE static void HandleBlock32x1(int currBlock, int startRow, int k, int n, short* newA, short* B,
int blockCnt, __m256i* resultStorage);
static VectorT* PrepareExtraB(const ScalarBT* /*prepareMe*/, int /*k*/, int /*n*/)
{
return nullptr;
}
static void FreePreparedB(VectorT* freeMe) { freeMe; assert(nullptr == freeMe); }
};
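// Note (summary inferred from the handlers below, not stated in the original header):
// each HandleBlock<blockSize>x<rows> call processes one block of the shared dimension k
// for a strip of one or four rows of the rewritten A. It loads the A registers once, then
// loops over all n columns of the rewritten B, forming int16 products with _mm256_madd_epi16
// (or _mm_madd_epi16 for 8-wide blocks) and accumulating the int32 partial sums into
// resultStorage[RowColToOffset(row, col, n)]; the per-lane sums are presumably reduced to a
// single value per (row, col) by the calling BlockMultiplier.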
#define LOADAVX2_128x4 \
__m256i r0b0a2 = _mm256_load_si256((__m256i*)currA2); \
__m256i r0b0b2 = _mm256_load_si256((__m256i*)(currA2 + 16)); \
__m256i r0b0c2 = _mm256_load_si256((__m256i*)(currA2 + 32)); \
__m256i r0b0d2 = _mm256_load_si256((__m256i*)(currA2 + 48)); \
__m256i r0b0e2 = _mm256_load_si256((__m256i*)(currA2 + 64)); \
__m256i r0b0f2 = _mm256_load_si256((__m256i*)(currA2 + 80)); \
__m256i r0b0g2 = _mm256_load_si256((__m256i*)(currA2 + 96)); \
__m256i r0b0h2 = _mm256_load_si256((__m256i*)(currA2 + 112));\
\
__m256i r1b0a2 = _mm256_load_si256((__m256i*)(currA2 + 128));\
__m256i r1b0b2 = _mm256_load_si256((__m256i*)(currA2 + 144));\
__m256i r1b0c2 = _mm256_load_si256((__m256i*)(currA2 + 160));\
__m256i r1b0d2 = _mm256_load_si256((__m256i*)(currA2 + 176));\
__m256i r1b0e2 = _mm256_load_si256((__m256i*)(currA2 + 192));\
__m256i r1b0f2 = _mm256_load_si256((__m256i*)(currA2 + 208));\
__m256i r1b0g2 = _mm256_load_si256((__m256i*)(currA2 + 224));\
__m256i r1b0h2 = _mm256_load_si256((__m256i*)(currA2 + 240));\
\
__m256i r2b0a2 = _mm256_load_si256((__m256i*)(currA2 + 256));\
__m256i r2b0b2 = _mm256_load_si256((__m256i*)(currA2 + 272));\
__m256i r2b0c2 = _mm256_load_si256((__m256i*)(currA2 + 288));\
__m256i r2b0d2 = _mm256_load_si256((__m256i*)(currA2 + 304));\
__m256i r2b0e2 = _mm256_load_si256((__m256i*)(currA2 + 320));\
__m256i r2b0f2 = _mm256_load_si256((__m256i*)(currA2 + 336));\
__m256i r2b0g2 = _mm256_load_si256((__m256i*)(currA2 + 352));\
__m256i r2b0h2 = _mm256_load_si256((__m256i*)(currA2 + 368));\
\
__m256i r3b0a2 = _mm256_load_si256((__m256i*)(currA2 + 384));\
__m256i r3b0b2 = _mm256_load_si256((__m256i*)(currA2 + 400));\
__m256i r3b0c2 = _mm256_load_si256((__m256i*)(currA2 + 416));\
__m256i r3b0d2 = _mm256_load_si256((__m256i*)(currA2 + 432));\
__m256i r3b0e2 = _mm256_load_si256((__m256i*)(currA2 + 448));\
__m256i r3b0f2 = _mm256_load_si256((__m256i*)(currA2 + 464));\
__m256i r3b0g2 = _mm256_load_si256((__m256i*)(currA2 + 480));\
    __m256i r3b0h2 = _mm256_load_si256((__m256i*)(currA2 + 496));
#define LOADAVX2_128x1 \
__m256i r0b0a2 = _mm256_load_si256((__m256i*)currA2); \
__m256i r0b0b2 = _mm256_load_si256((__m256i*)(currA2 + 16)); \
__m256i r0b0c2 = _mm256_load_si256((__m256i*)(currA2 + 32)); \
__m256i r0b0d2 = _mm256_load_si256((__m256i*)(currA2 + 48)); \
__m256i r0b0e2 = _mm256_load_si256((__m256i*)(currA2 + 64)); \
__m256i r0b0f2 = _mm256_load_si256((__m256i*)(currA2 + 80)); \
__m256i r0b0g2 = _mm256_load_si256((__m256i*)(currA2 + 96)); \
__m256i r0b0h2 = _mm256_load_si256((__m256i*)(currA2 + 112));
#define LOADAVX_128x1 \
__m256i r0b0a = _mm256_load_si256((__m256i*)currA); \
__m256i r0b0b = _mm256_load_si256((__m256i*)(currA + 16)); \
__m256i r0b0c = _mm256_load_si256((__m256i*)(currA + 32)); \
__m256i r0b0d = _mm256_load_si256((__m256i*)(currA + 48)); \
__m256i r0b0e = _mm256_load_si256((__m256i*)(currA + 64)); \
__m256i r0b0f = _mm256_load_si256((__m256i*)(currA + 80)); \
__m256i r0b0g = _mm256_load_si256((__m256i*)(currA + 96)); \
__m256i r0b0h = _mm256_load_si256((__m256i*)(currA + 112));
#define LOADAVX_128x4 \
__m256i r0b0a = _mm256_load_si256((__m256i*)currA); \
__m256i r0b0b = _mm256_load_si256((__m256i*)(currA + 16)); \
__m256i r0b0c = _mm256_load_si256((__m256i*)(currA + 32)); \
__m256i r0b0d = _mm256_load_si256((__m256i*)(currA + 48)); \
__m256i r0b0e = _mm256_load_si256((__m256i*)(currA + 64)); \
__m256i r0b0f = _mm256_load_si256((__m256i*)(currA + 80)); \
__m256i r0b0g = _mm256_load_si256((__m256i*)(currA + 96)); \
__m256i r0b0h = _mm256_load_si256((__m256i*)(currA + 112));\
\
__m256i r1b0a = _mm256_load_si256((__m256i*)(currA + 128));\
__m256i r1b0b = _mm256_load_si256((__m256i*)(currA + 144));\
__m256i r1b0c = _mm256_load_si256((__m256i*)(currA + 160));\
__m256i r1b0d = _mm256_load_si256((__m256i*)(currA + 176));\
__m256i r1b0e = _mm256_load_si256((__m256i*)(currA + 192));\
__m256i r1b0f = _mm256_load_si256((__m256i*)(currA + 208));\
__m256i r1b0g = _mm256_load_si256((__m256i*)(currA + 224));\
__m256i r1b0h = _mm256_load_si256((__m256i*)(currA + 240));\
\
__m256i r2b0a = _mm256_load_si256((__m256i*)(currA + 256));\
__m256i r2b0b = _mm256_load_si256((__m256i*)(currA + 272));\
__m256i r2b0c = _mm256_load_si256((__m256i*)(currA + 288));\
__m256i r2b0d = _mm256_load_si256((__m256i*)(currA + 304));\
__m256i r2b0e = _mm256_load_si256((__m256i*)(currA + 320));\
__m256i r2b0f = _mm256_load_si256((__m256i*)(currA + 336));\
__m256i r2b0g = _mm256_load_si256((__m256i*)(currA + 352));\
__m256i r2b0h = _mm256_load_si256((__m256i*)(currA + 368));\
\
__m256i r3b0a = _mm256_load_si256((__m256i*)(currA + 384));\
__m256i r3b0b = _mm256_load_si256((__m256i*)(currA + 400));\
__m256i r3b0c = _mm256_load_si256((__m256i*)(currA + 416));\
__m256i r3b0d = _mm256_load_si256((__m256i*)(currA + 432));\
__m256i r3b0e = _mm256_load_si256((__m256i*)(currA + 448));\
__m256i r3b0f = _mm256_load_si256((__m256i*)(currA + 464));\
__m256i r3b0g = _mm256_load_si256((__m256i*)(currA + 480));\
    __m256i r3b0h = _mm256_load_si256((__m256i*)(currA + 496));
#define LOADAVX_64x4 \
__m256i r0b0a = _mm256_load_si256((__m256i*)currA); \
__m256i r0b0b = _mm256_load_si256((__m256i*)currA + 1); \
__m256i r0b0c = _mm256_load_si256((__m256i*)currA + 2); \
__m256i r0b0d = _mm256_load_si256((__m256i*)currA + 3); \
\
__m256i r1b0a = _mm256_load_si256((__m256i*)currA + 4);\
__m256i r1b0b = _mm256_load_si256((__m256i*)currA + 5);\
__m256i r1b0c = _mm256_load_si256((__m256i*)currA + 6);\
__m256i r1b0d = _mm256_load_si256((__m256i*)currA + 7);\
\
__m256i r2b0a = _mm256_load_si256((__m256i*)currA + 8);\
__m256i r2b0b = _mm256_load_si256((__m256i*)currA + 9);\
__m256i r2b0c = _mm256_load_si256((__m256i*)currA + 10);\
__m256i r2b0d = _mm256_load_si256((__m256i*)currA + 11);\
\
__m256i r3b0a = _mm256_load_si256((__m256i*)currA + 12);\
__m256i r3b0b = _mm256_load_si256((__m256i*)currA + 13);\
__m256i r3b0c = _mm256_load_si256((__m256i*)currA + 14);\
__m256i r3b0d = _mm256_load_si256((__m256i*)currA + 15);
#define LOADAVX_64x1 \
__m256i r0b0a = _mm256_load_si256((__m256i*)currA); \
__m256i r0b0b = _mm256_load_si256((__m256i*)currA + 1); \
__m256i r0b0c = _mm256_load_si256((__m256i*)currA + 2); \
__m256i r0b0d = _mm256_load_si256((__m256i*)currA + 3);
#define LOADAVX_32x4 \
__m256i r0b0a = _mm256_load_si256((__m256i*)currA); \
__m256i r0b0b = _mm256_load_si256((__m256i*)currA + 1); \
\
__m256i r1b0a = _mm256_load_si256((__m256i*)currA + 2);\
__m256i r1b0b = _mm256_load_si256((__m256i*)currA + 3);\
\
__m256i r2b0a = _mm256_load_si256((__m256i*)currA + 4);\
__m256i r2b0b = _mm256_load_si256((__m256i*)currA + 5);\
\
__m256i r3b0a = _mm256_load_si256((__m256i*)currA + 6);\
    __m256i r3b0b = _mm256_load_si256((__m256i*)currA + 7);
#define LOADAVX_32x1 \
__m256i r0b0a = _mm256_load_si256((__m256i*)currA); \
__m256i r0b0b = _mm256_load_si256((__m256i*)currA + 1);
#define LOADAVX_16x4 \
__m256i r0b0a = _mm256_load_si256((__m256i*)currA); \
__m256i r1b0a = _mm256_load_si256((__m256i*)currA + 1);\
__m256i r2b0a = _mm256_load_si256((__m256i*)currA + 2);\
    __m256i r3b0a = _mm256_load_si256((__m256i*)currA + 3);
#define LOADAVX_16x1 \
__m256i r0b0a = _mm256_load_si256((__m256i*)currA);
#define LOAD_8x4 \
__m128i r0b0a = _mm_load_si128((__m128i*)currA);\
__m128i r1b0a = _mm_load_si128((__m128i*)currA + 1);\
__m128i r2b0a = _mm_load_si128((__m128i*)currA + 2);\
    __m128i r3b0a = _mm_load_si128((__m128i*)currA + 3);
#define LOAD_8x1 \
__m128i r0b0a = _mm_load_si128((__m128i*)currA);
FORCEINLINE void BlockHandlerAVX::HandleBlock8x4(int currBlock, int startRow, int k, int n, short* newA, short* B,
int blockCnt, __m128i* resultStorage)
{
blockCnt; //warning 4100
int aOffset = RowToColOffsetRewrittenA(startRow, currBlock, 8, 4, k);
short* currA = &newA[aOffset];
LOAD_8x4;
for (int c = 0; c < n; ++c)
{
short* currB = &B[RowToColOffsetRewrittenB(c, currBlock, 8, n)];
__m128i accum1 = _mm_set_epi32(0, 0, 0, 0);
__m128i accum2 = _mm_set_epi32(0, 0, 0, 0);
__m128i accum3 = _mm_set_epi32(0, 0, 0, 0);
__m128i accum4 = _mm_set_epi32(0, 0, 0, 0);
kernelsse8x4(r0b0a, r1b0a, r2b0a, r3b0a,
currB, &accum1, &accum2, &accum3, &accum4);
resultStorage[RowColToOffset(0, c, n)] = _mm_add_epi32(resultStorage[RowColToOffset(0, c, n)], accum1);
resultStorage[RowColToOffset(1, c, n)] = _mm_add_epi32(resultStorage[RowColToOffset(1, c, n)], accum2);
resultStorage[RowColToOffset(2, c, n)] = _mm_add_epi32(resultStorage[RowColToOffset(2, c, n)], accum3);
resultStorage[RowColToOffset(3, c, n)] = _mm_add_epi32(resultStorage[RowColToOffset(3, c, n)], accum4);
}
}
FORCEINLINE void BlockHandlerAVX::HandleBlock8x1(int currBlock, int startRow, int k, int n, short* newA, short* B,
int /*blockCnt*/, __m128i* resultStorage)
{
int aOffset = RowToColOffsetRewrittenA(startRow, currBlock, 8, 4, k);
short* currA = &newA[aOffset];
LOAD_8x1;
for (int c = 0; c < n; ++c)
{
short* currB = &B[RowToColOffsetRewrittenB(c, currBlock, 8, n)];
__m128i accum1 = _mm_set_epi32(0, 0, 0, 0);
kernelsse8x1(r0b0a,
currB, &accum1);
resultStorage[RowColToOffset(0, c, n)] = _mm_add_epi32(resultStorage[RowColToOffset(0, c, n)], accum1);
}
}
FORCEINLINE void BlockHandlerAVX::HandleBlock16x4(int currBlock, int startRow, int k, int n, short* newA, short* B,
int /*blockCnt*/, __m256i* resultStorage)
{
int aOffset = RowToColOffsetRewrittenA(startRow, currBlock, 16, 4, k);
short* currA = &newA[aOffset];
LOADAVX_16x4;
//#pragma omp parallel for
for (int c = 0; c < n; ++c)
{
short* currB = &B[RowToColOffsetRewrittenB(c, currBlock, 16, n)];
//The gain comes when we have all the row values loaded up
//together and we multiply them all times each column, saving m_rowsPerBlock column
//loads.
__m256i accum1 = _mm256_set1_epi16(0);
__m256i accum2 = _mm256_set1_epi16(0);
__m256i accum3 = _mm256_set1_epi16(0);
__m256i accum4 = _mm256_set1_epi16(0);
kernelavx16x4(r0b0a, r1b0a, r2b0a, r3b0a,
currB, &accum1, &accum2, &accum3, &accum4);
resultStorage[RowColToOffset(0, c, n)] = _mm256_add_epi32(resultStorage[RowColToOffset(0, c, n)], accum1);
resultStorage[RowColToOffset(1, c, n)] = _mm256_add_epi32(resultStorage[RowColToOffset(1, c, n)], accum2);
resultStorage[RowColToOffset(2, c, n)] = _mm256_add_epi32(resultStorage[RowColToOffset(2, c, n)], accum3);
resultStorage[RowColToOffset(3, c, n)] = _mm256_add_epi32(resultStorage[RowColToOffset(3, c, n)], accum4);
}
}
FORCEINLINE void BlockHandlerAVX::HandleBlock16x1(int currBlock, int startRow, int k, int n, short* newA, short* B,
int /*blockCnt*/, __m256i* resultStorage)
{
int aOffset = RowToColOffsetRewrittenA(startRow, currBlock, 16, 1, k);
short* currA = &newA[aOffset];
LOADAVX_16x1;
//#pragma omp parallel for
for (int c = 0; c < n; ++c)
{
short* currB = &B[RowToColOffsetRewrittenB(c, currBlock, 16, n)];
//The gain comes when we have all the row values loaded up
//together and we multiply them all times each column, saving m_rowsPerBlock column
//loads.
__m256i accum1 = _mm256_set1_epi16(0);
kernelavx16x1(r0b0a, currB, &accum1);
resultStorage[RowColToOffset(0, c, n)] = _mm256_add_epi32(resultStorage[RowColToOffset(0, c, n)], accum1);
}
}
FORCEINLINE void BlockHandlerAVX::HandleBlock32x4(int currBlock, int startRow, int k, int n, short* newA, short* B,
int /*blockCnt*/, __m256i* resultStorage)
{
int aOffset = RowToColOffsetRewrittenA(startRow, currBlock, 32, 4, k);
short* currA = &newA[aOffset];
LOADAVX_32x4;
//#pragma omp parallel for
for (int c = 0; c < n; ++c)
{
short* currB = &B[RowToColOffsetRewrittenB(c, currBlock, 32, n)];
//The gain comes when we have all the row values loaded up
//together and we multiply them all times each column, saving m_rowsPerBlock column
//loads.
__m256i accum1 = _mm256_set1_epi16(0);
__m256i accum2 = _mm256_set1_epi16(0);
__m256i accum3 = _mm256_set1_epi16(0);
__m256i accum4 = _mm256_set1_epi16(0);
kernelavx32x4(
r0b0a, r0b0b,
r1b0a, r1b0b,
r2b0a, r2b0b,
r3b0a, r3b0b,
currB, &accum1, &accum2, &accum3, &accum4);
resultStorage[RowColToOffset(0, c, n)] = _mm256_add_epi32( resultStorage[RowColToOffset(0, c, n)], accum1);
resultStorage[RowColToOffset(1, c, n)] = _mm256_add_epi32( resultStorage[RowColToOffset(1, c, n)], accum2);
resultStorage[RowColToOffset(2, c, n)] = _mm256_add_epi32( resultStorage[RowColToOffset(2, c, n)], accum3);
resultStorage[RowColToOffset(3, c, n)] = _mm256_add_epi32( resultStorage[RowColToOffset(3, c, n)], accum4);
}
}
FORCEINLINE void BlockHandlerAVX::HandleBlock32x1(int currBlock, int startRow, int k, int n, short* newA, short* B,
int /*blockCnt*/, __m256i* resultStorage)
{
int aOffset = RowToColOffsetRewrittenA(startRow, currBlock, 32, 1, k);
short* currA = &newA[aOffset];
LOADAVX_32x1;
//#pragma omp parallel for
for (int c = 0; c < n; ++c)
{
short* currB = &B[RowToColOffsetRewrittenB(c, currBlock, 32, n)];
__m256i accum1 = _mm256_set1_epi16(0);
kernelavx32x1(
r0b0a, r0b0b, currB, &accum1);
resultStorage[RowColToOffset(0, c, n)] = _mm256_add_epi32( resultStorage[RowColToOffset(0, c, n)], accum1);
}
}
FORCEINLINE void BlockHandlerAVX::HandleBlock64x4(int currBlock, int startRow, int k, int n, short* newA, short* B,
int /*blockCnt*/, __m256i* resultStorage)
{
int aOffset = RowToColOffsetRewrittenA(startRow, currBlock, 64, 4, k);
short* currA = &newA[aOffset];
LOADAVX_64x4;
//#pragma omp parallel for
for (int c = 0; c < n; ++c)
{
short* currB = &B[RowToColOffsetRewrittenB(c, currBlock, 64, n)];
//The gain comes when we have all the row values loaded up
//together and we multiply them all times each column, saving m_rowsPerBlock column
//loads.
__m256i accum1 = _mm256_set1_epi16(0);
__m256i accum2 = _mm256_set1_epi16(0);
__m256i accum3 = _mm256_set1_epi16(0);
__m256i accum4 = _mm256_set1_epi16(0);
kernelavx64x4(
r0b0a, r0b0b, r0b0c, r0b0d,
r1b0a, r1b0b, r1b0c, r1b0d,
r2b0a, r2b0b, r2b0c, r2b0d,
r3b0a, r3b0b, r3b0c, r3b0d,
currB, &accum1, &accum2, &accum3, &accum4);
resultStorage[RowColToOffset(0, c, n)] = _mm256_add_epi32( resultStorage[RowColToOffset(0, c, n)], accum1);
resultStorage[RowColToOffset(1, c, n)] = _mm256_add_epi32( resultStorage[RowColToOffset(1, c, n)], accum2);
resultStorage[RowColToOffset(2, c, n)] = _mm256_add_epi32( resultStorage[RowColToOffset(2, c, n)], accum3);
resultStorage[RowColToOffset(3, c, n)] = _mm256_add_epi32( resultStorage[RowColToOffset(3, c, n)], accum4);
}
}
FORCEINLINE void BlockHandlerAVX::HandleBlock64x1(int currBlock, int startRow, int k, int n, short* newA, short* B,
int /*blockCnt*/, __m256i* resultStorage)
{
int aOffset = RowToColOffsetRewrittenA(startRow, currBlock, 64, 4, k);
short* currA = &newA[aOffset];
LOADAVX_64x1;
//#pragma omp parallel for
for (int c = 0; c < n; ++c)
{
short* currB = &B[RowToColOffsetRewrittenB(c, currBlock, 64, n)];
//The gain comes when we have all the row values loaded up
//together and we multiply them all times each column, saving m_rowsPerBlock column
//loads.
__m256i accum1 = _mm256_set1_epi16(0);
kernelavx64x1(
r0b0a, r0b0b, r0b0c, r0b0d,
currB, &accum1);
resultStorage[RowColToOffset(0, c, n)] = _mm256_add_epi32(resultStorage[RowColToOffset(0, c, n)], accum1);
}
}
FORCEINLINE void BlockHandlerAVX::HandleBlock128x4(int currBlock, int startRow, int k, int n, short* newA, short* B,
int blockCnt, __m256i* resultStorage, VectorT* /*subtractMe*/)
{
int aOffset = RowToColOffsetRewrittenA(startRow, currBlock, 128, 4, k);
int aOffset2 = RowToColOffsetRewrittenA(startRow, currBlock + 1, 128, 4, k);
short* currA = &newA[aOffset];
short* currA2 = &newA[aOffset2];
LOADAVX_128x4;
LOADAVX2_128x4;
//#pragma omp parallel for
for (int c = 0; c < n; ++c)
{
short* currB = &B[RowToColOffsetRewrittenB(c, currBlock, 128, n)];
short* currB2 = &B[RowToColOffsetRewrittenB(c, currBlock + 1, 128, n)];
//The gain comes when we have all the row values loaded up
//together and we multiply them all times each column, saving m_rowsPerBlock column
//loads.
__m256i accum1 = _mm256_set1_epi16(0);
__m256i accum2 = _mm256_set1_epi16(0);
__m256i accum3 = _mm256_set1_epi16(0);
__m256i accum4 = _mm256_set1_epi16(0);
__m256i accum5 = _mm256_set1_epi16(0);
__m256i accum6 = _mm256_set1_epi16(0);
__m256i accum7 = _mm256_set1_epi16(0);
__m256i accum8 = _mm256_set1_epi16(0);
kernelavx128x4(
r0b0a, r0b0b, r0b0c, r0b0d, r0b0e, r0b0f, r0b0g, r0b0h,
r1b0a, r1b0b, r1b0c, r1b0d, r1b0e, r1b0f, r1b0g, r1b0h,
r2b0a, r2b0b, r2b0c, r2b0d, r2b0e, r2b0f, r2b0g, r2b0h,
r3b0a, r3b0b, r3b0c, r3b0d, r3b0e, r3b0f, r3b0g, r3b0h,
currB, &accum1, &accum2, &accum3, &accum4);
if (blockCnt > 1)
{
kernelavx128x4(
r0b0a2, r0b0b2, r0b0c2, r0b0d2, r0b0e2, r0b0f2, r0b0g2, r0b0h2,
r1b0a2, r1b0b2, r1b0c2, r1b0d2, r1b0e2, r1b0f2, r1b0g2, r1b0h2,
r2b0a2, r2b0b2, r2b0c2, r2b0d2, r2b0e2, r2b0f2, r2b0g2, r2b0h2,
r3b0a2, r3b0b2, r3b0c2, r3b0d2, r3b0e2, r3b0f2, r3b0g2, r3b0h2,
currB2, &accum5, &accum6, &accum7, &accum8);
}
resultStorage[RowColToOffset(0, c, n)] = _mm256_add_epi32( resultStorage[RowColToOffset(0, c, n)], _mm256_add_epi32(accum1, accum5));
resultStorage[RowColToOffset(1, c, n)] = _mm256_add_epi32( resultStorage[RowColToOffset(1, c, n)], _mm256_add_epi32(accum2, accum6));
resultStorage[RowColToOffset(2, c, n)] = _mm256_add_epi32( resultStorage[RowColToOffset(2, c, n)], _mm256_add_epi32(accum3, accum7));
resultStorage[RowColToOffset(3, c, n)] = _mm256_add_epi32( resultStorage[RowColToOffset(3, c, n)], _mm256_add_epi32(accum4, accum8));
}
}
FORCEINLINE void BlockHandlerAVX::HandleBlock128x1(int currBlock, int startRow, int k, int n, short* newA, short* B,
int blockCnt, __m256i* resultStorage, VectorT* /*subtractMe*/)
{
int aOffset = RowToColOffsetRewrittenA(startRow, currBlock, 128, 4, k);
int aOffset2 = RowToColOffsetRewrittenA(startRow, currBlock + 1, 128, 4, k);
short* currA = &newA[aOffset];
short* currA2 = &newA[aOffset2];
LOADAVX_128x1;
LOADAVX2_128x1;
//#pragma omp parallel for
for (int c = 0; c < n; ++c)
{
short* currB = &B[RowToColOffsetRewrittenB(c, currBlock, 128, n)];
short* currB2 = &B[RowToColOffsetRewrittenB(c, currBlock + 1, 128, n)];
//The gain comes when we have all the row values loaded up
//together and we multiply them all times each column, saving m_rowsPerBlock column
//loads.
__m256i accum1 = _mm256_set1_epi16(0);
__m256i accum2 = _mm256_set1_epi16(0);
kernelavx128x1(
r0b0a, r0b0b, r0b0c, r0b0d, r0b0e, r0b0f, r0b0g, r0b0h,
currB, &accum1);
if (blockCnt > 1)
{
kernelavx128x1(
r0b0a2, r0b0b2, r0b0c2, r0b0d2, r0b0e2, r0b0f2, r0b0g2, r0b0h2,
                    currB2, &accum2);
}
resultStorage[RowColToOffset(0, c, n)] = _mm256_add_epi32( resultStorage[RowColToOffset(0, c, n)], _mm256_add_epi32(accum1, accum2));
}
}
FORCEINLINE void BlockHandlerAVX::kernelsse8x1(__m128i xmmRow0,
short* B, __m128i* return1)
{
__m128i xmmCol0 = _mm_load_si128((__m128i*)B);
__m128i result1 = _mm_madd_epi16(xmmRow0, xmmCol0);
*return1 = result1;
}
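// Note (illustrative): _mm_madd_epi16 multiplies the eight int16 lanes of the row and
// column registers pairwise and adds adjacent products into four int32 lanes, i.e. for
// row = [a0..a7] and col = [b0..b7] it yields
//   { a0*b0 + a1*b1, a2*b2 + a3*b3, a4*b4 + a5*b5, a6*b6 + a7*b7 },
// so the kernels in this header return per-lane partial dot products that still need a
// final horizontal reduction per (row, column) pair.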
FORCEINLINE void BlockHandlerAVX::kernelsse8x4(__m128i xmmRow0, __m128i xmmRow1, __m128i xmmRow2, __m128i xmmRow3,
short* B, __m128i* return1, __m128i* return2, __m128i* return3, __m128i* return4)
{
__m128i xmmCol0 = _mm_load_si128((__m128i*)B);
__m128i result1 = _mm_madd_epi16(xmmRow0, xmmCol0);
__m128i result2 = _mm_madd_epi16(xmmRow1, xmmCol0);
__m128i result3 = _mm_madd_epi16(xmmRow2, xmmCol0);
__m128i result4 = _mm_madd_epi16(xmmRow3, xmmCol0);
*return1 = result1;
*return2 = result2;
*return3 = result3;
*return4 = result4;
}
FORCEINLINE void BlockHandlerAVX::kernelavx16x1(__m256i xmmRow0B0a,
short* B, __m256i* return1)
{
__m256i xmmCol0B0a = _mm256_load_si256((__m256i*)B);
//Result for row 0
//Nomenclature:
//r0b0axc0b0a means "Row zero block zero part A times column zero block zero part A. (Blocks > 8 take up > 1 __m256i each (xmm registers))
__m256i r0b0axc0b0a = _mm256_madd_epi16(xmmRow0B0a, xmmCol0B0a);
*return1 = r0b0axc0b0a;
}
FORCEINLINE void BlockHandlerAVX::kernelavx16x4(__m256i xmmRow0B0a, __m256i xmmRow1B0a, __m256i xmmRow2B0a, __m256i xmmRow3B0a,
short* B, __m256i* return1, __m256i* return2, __m256i * return3, __m256i* return4)
{
__m256i xmmCol0B0a = _mm256_load_si256((__m256i*)B);
//Result for row 0
//Nomenclature:
//r0b0axc0b0a means "Row zero block zero part A times column zero block zero part A. (Blocks > 8 take up > 1 __m256i each (xmm registers))
__m256i r0b0axc0b0a = _mm256_madd_epi16(xmmRow0B0a, xmmCol0B0a);
//Result for row 1
__m256i r1b0axc0b0a = _mm256_madd_epi16(xmmRow1B0a, xmmCol0B0a);
//Result for row 2
__m256i r2b0axc0b0a = _mm256_madd_epi16(xmmRow2B0a, xmmCol0B0a);
//Result for row 3
__m256i r3b0axc0b0a = _mm256_madd_epi16(xmmRow3B0a, xmmCol0B0a);
*return1 = r0b0axc0b0a;
*return2 = r1b0axc0b0a;
*return3 = r2b0axc0b0a;
*return4 = r3b0axc0b0a;
}
FORCEINLINE void BlockHandlerAVX::kernelavx32x1(
__m256i xmmRow0B0a, __m256i xmmRow0B0b,
short* B, __m256i* return1)
{
__m256i xmmCol0B0a = _mm256_load_si256((__m256i*)B);
__m256i xmmCol0B0b = _mm256_load_si256((__m256i*)B + 1);
//Result for row 0
//Nomenclature:
//r0b0axc0b0a means "Row zero block zero part A times column zero block zero part A. (Blocks > 8 take up > 1 __m256i each (xmm registers))
__m256i r0b0axc0b0a = _mm256_madd_epi16(xmmRow0B0a, xmmCol0B0a);
__m256i r0b0bxc0b0b = _mm256_madd_epi16(xmmRow0B0b, xmmCol0B0b);
__m256i result1a = _mm256_add_epi32(r0b0axc0b0a, r0b0bxc0b0b);
*return1 = result1a;
}
FORCEINLINE void BlockHandlerAVX::kernelavx32x4(
__m256i xmmRow0B0a, __m256i xmmRow0B0b,
__m256i xmmRow1B0a, __m256i xmmRow1B0b,
__m256i xmmRow2B0a, __m256i xmmRow2B0b,
__m256i xmmRow3B0a, __m256i xmmRow3B0b,
short* B, __m256i* return1, __m256i* return2, __m256i * return3, __m256i* return4)
{
__m256i xmmCol0B0a = _mm256_load_si256((__m256i*)B);
__m256i xmmCol0B0b = _mm256_load_si256((__m256i*)B + 1);
//Result for row 0
//Nomenclature:
//r0b0axc0b0a means "Row zero block zero part A times column zero block zero part A. (Blocks > 8 take up > 1 __m256i each (xmm registers))
__m256i r0b0axc0b0a = _mm256_madd_epi16(xmmRow0B0a, xmmCol0B0a);
__m256i r0b0bxc0b0b = _mm256_madd_epi16(xmmRow0B0b, xmmCol0B0b);
__m256i result1a = _mm256_add_epi32(r0b0axc0b0a, r0b0bxc0b0b);
//Result for row 1
__m256i r1b0axc0b0a = _mm256_madd_epi16(xmmRow1B0a, xmmCol0B0a);
__m256i r1b0bxc0b0b = _mm256_madd_epi16(xmmRow1B0b, xmmCol0B0b);
__m256i result2a = _mm256_add_epi32(r1b0axc0b0a, r1b0bxc0b0b);
//Result for row 2
__m256i r2b0axc0b0a = _mm256_madd_epi16(xmmRow2B0a, xmmCol0B0a);
__m256i r2b0bxc0b0b = _mm256_madd_epi16(xmmRow2B0b, xmmCol0B0b);
__m256i result3a = _mm256_add_epi32(r2b0axc0b0a, r2b0bxc0b0b);
//Result for row 3
__m256i r3b0axc0b0a = _mm256_madd_epi16(xmmRow3B0a, xmmCol0B0a);
__m256i r3b0bxc0b0b = _mm256_madd_epi16(xmmRow3B0b, xmmCol0B0b);
__m256i result4a = _mm256_add_epi32(r3b0axc0b0a, r3b0bxc0b0b);
*return1 = result1a;
*return2 = result2a;
*return3 = result3a;
*return4 = result4a;
}
FORCEINLINE void BlockHandlerAVX::kernelavx64x1(
__m256i xmmRow0B0a, __m256i xmmRow0B0b, __m256i xmmRow0B0c, __m256i xmmRow0B0d,
short* B, __m256i* return1)
{
__m256i xmmCol0B0a = _mm256_load_si256((__m256i*)B);
__m256i xmmCol0B0b = _mm256_load_si256((__m256i*)B + 1);
__m256i xmmCol0B0c = _mm256_load_si256((__m256i*)B + 2);
__m256i xmmCol0B0d = _mm256_load_si256((__m256i*)B + 3);
__m256i r0b0axc0b0a = _mm256_madd_epi16(xmmRow0B0a, xmmCol0B0a);
__m256i r0b0bxc0b0b = _mm256_madd_epi16(xmmRow0B0b, xmmCol0B0b);
__m256i r0b0cxc0b0c = _mm256_madd_epi16(xmmRow0B0c, xmmCol0B0c);
__m256i r0b0dxc0b0d = _mm256_madd_epi16(xmmRow0B0d, xmmCol0B0d);
__m256i result1a = _mm256_add_epi32(r0b0axc0b0a, r0b0bxc0b0b);
__m256i result1b = _mm256_add_epi32(r0b0cxc0b0c, r0b0dxc0b0d);
__m256i result1ab = _mm256_add_epi32(result1a, result1b);
*return1 = result1ab;
//std::cout << "Returning " << u.i[0] << " + " << u.i[4] << "(" << u.i[0] + u.i[4] << ") for first row" << std::endl;
}
FORCEINLINE void BlockHandlerAVX::kernelavx64x4(
__m256i xmmRow0B0a, __m256i xmmRow0B0b, __m256i xmmRow0B0c, __m256i xmmRow0B0d,
__m256i xmmRow1B0a, __m256i xmmRow1B0b, __m256i xmmRow1B0c, __m256i xmmRow1B0d,
__m256i xmmRow2B0a, __m256i xmmRow2B0b, __m256i xmmRow2B0c, __m256i xmmRow2B0d,
__m256i xmmRow3B0a, __m256i xmmRow3B0b, __m256i xmmRow3B0c, __m256i xmmRow3B0d,
short* B, __m256i* return1, __m256i* return2, __m256i * return3, __m256i* return4)
{
__m256i xmmCol0B0a = _mm256_load_si256((__m256i*)B);
__m256i xmmCol0B0b = _mm256_load_si256((__m256i*)B + 1);
__m256i xmmCol0B0c = _mm256_load_si256((__m256i*)B + 2);
__m256i xmmCol0B0d = _mm256_load_si256((__m256i*)B + 3);
//Result for row 0
//Nomenclature:
//r0b0axc0b0a means "Row zero block zero part A times column zero block zero part A. (Blocks > 8 take up > 1 __m256i each (xmm registers))
__m256i r0b0axc0b0a = _mm256_madd_epi16(xmmRow0B0a, xmmCol0B0a);
__m256i r0b0bxc0b0b = _mm256_madd_epi16(xmmRow0B0b, xmmCol0B0b);
__m256i r0b0cxc0b0c = _mm256_madd_epi16(xmmRow0B0c, xmmCol0B0c);
__m256i r0b0dxc0b0d = _mm256_madd_epi16(xmmRow0B0d, xmmCol0B0d);
__m256i result1a = _mm256_add_epi32(r0b0axc0b0a, r0b0bxc0b0b);
__m256i result1b = _mm256_add_epi32(r0b0cxc0b0c, r0b0dxc0b0d);
__m256i result1ab = _mm256_add_epi32(result1a, result1b);
//Result for row 1
__m256i r1b0axc0b0a = _mm256_madd_epi16(xmmRow1B0a, xmmCol0B0a);
__m256i r1b0bxc0b0b = _mm256_madd_epi16(xmmRow1B0b, xmmCol0B0b);
__m256i r1b0cxc0b0c = _mm256_madd_epi16(xmmRow1B0c, xmmCol0B0c);
__m256i r1b0dxc0b0d = _mm256_madd_epi16(xmmRow1B0d, xmmCol0B0d);
__m256i result2a = _mm256_add_epi32(r1b0axc0b0a, r1b0bxc0b0b);
__m256i result2b = _mm256_add_epi32(r1b0cxc0b0c, r1b0dxc0b0d);
__m256i result2ab = _mm256_add_epi32(result2a, result2b);
//Result for row 2
__m256i r2b0axc0b0a = _mm256_madd_epi16(xmmRow2B0a, xmmCol0B0a);
__m256i r2b0bxc0b0b = _mm256_madd_epi16(xmmRow2B0b, xmmCol0B0b);
__m256i r2b0cxc0b0c = _mm256_madd_epi16(xmmRow2B0c, xmmCol0B0c);
__m256i r2b0dxc0b0d = _mm256_madd_epi16(xmmRow2B0d, xmmCol0B0d);
__m256i result3a = _mm256_add_epi32(r2b0axc0b0a, r2b0bxc0b0b);
__m256i result3b = _mm256_add_epi32(r2b0cxc0b0c, r2b0dxc0b0d);
__m256i result3ab = _mm256_add_epi32(result3a, result3b);
//Result for row 3
__m256i r3b0axc0b0a = _mm256_madd_epi16(xmmRow3B0a, xmmCol0B0a);
__m256i r3b0bxc0b0b = _mm256_madd_epi16(xmmRow3B0b, xmmCol0B0b);
__m256i r3b0cxc0b0c = _mm256_madd_epi16(xmmRow3B0c, xmmCol0B0c);
__m256i r3b0dxc0b0d = _mm256_madd_epi16(xmmRow3B0d, xmmCol0B0d);
__m256i result4a = _mm256_add_epi32(r3b0axc0b0a, r3b0bxc0b0b);
__m256i result4b = _mm256_add_epi32(r3b0cxc0b0c, r3b0dxc0b0d);
__m256i result4ab = _mm256_add_epi32(result4a, result4b);
*return1 = result1ab;
*return2 = result2ab;
*return3 = result3ab;
*return4 = result4ab;
}
FORCEINLINE void BlockHandlerAVX::kernelavx128x1(
__m256i xmmRow0B0a, __m256i xmmRow0B0b, __m256i xmmRow0B0c, __m256i xmmRow0B0d,
__m256i xmmRow0B0e, __m256i xmmRow0B0f, __m256i xmmRow0B0g, __m256i xmmRow0B0h,
short* B, __m256i* return1)
{
__m256i xmmCol0B0a = _mm256_load_si256((__m256i*)B);
__m256i xmmCol0B0b = _mm256_load_si256((__m256i*)(B + 16));
__m256i xmmCol0B0c = _mm256_load_si256((__m256i*)(B + 32));
__m256i xmmCol0B0d = _mm256_load_si256((__m256i*)(B + 48));
__m256i xmmCol0B0e = _mm256_load_si256((__m256i*)(B + 64));
__m256i xmmCol0B0f = _mm256_load_si256((__m256i*)(B + 80));
__m256i xmmCol0B0g = _mm256_load_si256((__m256i*)(B + 96));
__m256i xmmCol0B0h = _mm256_load_si256((__m256i*)(B + 112));
//Result for row 0
//Nomenclature:
//r0b0axc0b0a means "Row zero block zero part A times column zero block zero part A. (Blocks > 8 take up > 1 __m256i each (xmm registers))
__m256i r0b0axc0b0a = _mm256_madd_epi16(xmmRow0B0a, xmmCol0B0a);
__m256i r0b0bxc0b0b = _mm256_madd_epi16(xmmRow0B0b, xmmCol0B0b);
__m256i r0b0cxc0b0c = _mm256_madd_epi16(xmmRow0B0c, xmmCol0B0c);
__m256i r0b0dxc0b0d = _mm256_madd_epi16(xmmRow0B0d, xmmCol0B0d);
__m256i r0b0exc0b0e = _mm256_madd_epi16(xmmRow0B0e, xmmCol0B0e);
__m256i r0b0fxc0b0f = _mm256_madd_epi16(xmmRow0B0f, xmmCol0B0f);
__m256i r0b0gxc0b0g = _mm256_madd_epi16(xmmRow0B0g, xmmCol0B0g);
__m256i r0b0hxc0b0h = _mm256_madd_epi16(xmmRow0B0h, xmmCol0B0h);
__m256i result1a = _mm256_add_epi32(r0b0axc0b0a, r0b0bxc0b0b);
__m256i result1b = _mm256_add_epi32(r0b0cxc0b0c, r0b0dxc0b0d);
__m256i result1c = _mm256_add_epi32(r0b0exc0b0e, r0b0fxc0b0f);
__m256i result1d = _mm256_add_epi32(r0b0gxc0b0g, r0b0hxc0b0h);
__m256i result1ab = _mm256_add_epi32(result1a, result1b);
__m256i result1cd = _mm256_add_epi32(result1c, result1d);
__m256i result1abcd = _mm256_add_epi32(result1ab, result1cd);
*return1 = result1abcd;
//std::cout << "Returning " << u.i[0] << " + " << u.i[4] << "(" << u.i[0] + u.i[4] << ") for first row" << std::endl;
}
FORCEINLINE void BlockHandlerAVX::kernelavx128x4(
__m256i xmmRow0B0a, __m256i xmmRow0B0b, __m256i xmmRow0B0c, __m256i xmmRow0B0d,
__m256i xmmRow0B0e, __m256i xmmRow0B0f, __m256i xmmRow0B0g, __m256i xmmRow0B0h,
__m256i xmmRow1B0a, __m256i xmmRow1B0b, __m256i xmmRow1B0c, __m256i xmmRow1B0d,
__m256i xmmRow1B0e, __m256i xmmRow1B0f, __m256i xmmRow1B0g, __m256i xmmRow1B0h,
__m256i xmmRow2B0a, __m256i xmmRow2B0b, __m256i xmmRow2B0c, __m256i xmmRow2B0d,
__m256i xmmRow2B0e, __m256i xmmRow2B0f, __m256i xmmRow2B0g, __m256i xmmRow2B0h,
__m256i xmmRow3B0a, __m256i xmmRow3B0b, __m256i xmmRow3B0c, __m256i xmmRow3B0d,
__m256i xmmRow3B0e, __m256i xmmRow3B0f, __m256i xmmRow3B0g, __m256i xmmRow3B0h,
short* B, __m256i* return1, __m256i* return2, __m256i * return3, __m256i* return4)
{
__m256i xmmCol0B0a = _mm256_load_si256((__m256i*)B);
__m256i xmmCol0B0b = _mm256_load_si256((__m256i*)(B + 16));
__m256i xmmCol0B0c = _mm256_load_si256((__m256i*)(B + 32));
__m256i xmmCol0B0d = _mm256_load_si256((__m256i*)(B + 48));
__m256i xmmCol0B0e = _mm256_load_si256((__m256i*)(B + 64));
__m256i xmmCol0B0f = _mm256_load_si256((__m256i*)(B + 80));
__m256i xmmCol0B0g = _mm256_load_si256((__m256i*)(B + 96));
__m256i xmmCol0B0h = _mm256_load_si256((__m256i*)(B + 112));
//Result for row 0
//Nomenclature:
//r0b0axc0b0a means "Row zero block zero part A times column zero block zero part A". (Blocks of more than 16 shorts take up more than one __m256i (ymm register) each.)
__m256i r0b0axc0b0a = _mm256_madd_epi16(xmmRow0B0a, xmmCol0B0a);
__m256i r0b0bxc0b0b = _mm256_madd_epi16(xmmRow0B0b, xmmCol0B0b);
__m256i r0b0cxc0b0c = _mm256_madd_epi16(xmmRow0B0c, xmmCol0B0c);
__m256i r0b0dxc0b0d = _mm256_madd_epi16(xmmRow0B0d, xmmCol0B0d);
__m256i r0b0exc0b0e = _mm256_madd_epi16(xmmRow0B0e, xmmCol0B0e);
__m256i r0b0fxc0b0f = _mm256_madd_epi16(xmmRow0B0f, xmmCol0B0f);
__m256i r0b0gxc0b0g = _mm256_madd_epi16(xmmRow0B0g, xmmCol0B0g);
__m256i r0b0hxc0b0h = _mm256_madd_epi16(xmmRow0B0h, xmmCol0B0h);
__m256i result1a = _mm256_add_epi32(r0b0axc0b0a, r0b0bxc0b0b);
__m256i result1b = _mm256_add_epi32(r0b0cxc0b0c, r0b0dxc0b0d);
__m256i result1c = _mm256_add_epi32(r0b0exc0b0e, r0b0fxc0b0f);
__m256i result1d = _mm256_add_epi32(r0b0gxc0b0g, r0b0hxc0b0h);
__m256i result1ab = _mm256_add_epi32(result1a, result1b);
__m256i result1cd = _mm256_add_epi32(result1c, result1d);
__m256i result1abcd = _mm256_add_epi32(result1ab, result1cd);
//Result for row 1
__m256i r1b0axc0b0a = _mm256_madd_epi16(xmmRow1B0a, xmmCol0B0a);
__m256i r1b0bxc0b0b = _mm256_madd_epi16(xmmRow1B0b, xmmCol0B0b);
__m256i r1b0cxc0b0c = _mm256_madd_epi16(xmmRow1B0c, xmmCol0B0c);
__m256i r1b0dxc0b0d = _mm256_madd_epi16(xmmRow1B0d, xmmCol0B0d);
__m256i r1b0exc0b0e = _mm256_madd_epi16(xmmRow1B0e, xmmCol0B0e);
__m256i r1b0fxc0b0f = _mm256_madd_epi16(xmmRow1B0f, xmmCol0B0f);
__m256i r1b0gxc0b0g = _mm256_madd_epi16(xmmRow1B0g, xmmCol0B0g);
__m256i r1b0hxc0b0h = _mm256_madd_epi16(xmmRow1B0h, xmmCol0B0h);
__m256i result2a = _mm256_add_epi32(r1b0axc0b0a, r1b0bxc0b0b);
__m256i result2b = _mm256_add_epi32(r1b0cxc0b0c, r1b0dxc0b0d);
__m256i result2c = _mm256_add_epi32(r1b0exc0b0e, r1b0fxc0b0f);
__m256i result2d = _mm256_add_epi32(r1b0gxc0b0g, r1b0hxc0b0h);
__m256i result2ab = _mm256_add_epi32(result2a, result2b);
__m256i result2cd = _mm256_add_epi32(result2c, result2d);
__m256i result2abcd = _mm256_add_epi32(result2ab, result2cd);
//Result for row 2
__m256i r2b0axc0b0a = _mm256_madd_epi16(xmmRow2B0a, xmmCol0B0a);
__m256i r2b0bxc0b0b = _mm256_madd_epi16(xmmRow2B0b, xmmCol0B0b);
__m256i r2b0cxc0b0c = _mm256_madd_epi16(xmmRow2B0c, xmmCol0B0c);
__m256i r2b0dxc0b0d = _mm256_madd_epi16(xmmRow2B0d, xmmCol0B0d);
__m256i r2b0exc0b0e = _mm256_madd_epi16(xmmRow2B0e, xmmCol0B0e);
__m256i r2b0fxc0b0f = _mm256_madd_epi16(xmmRow2B0f, xmmCol0B0f);
__m256i r2b0gxc0b0g = _mm256_madd_epi16(xmmRow2B0g, xmmCol0B0g);
__m256i r2b0hxc0b0h = _mm256_madd_epi16(xmmRow2B0h, xmmCol0B0h);
__m256i result3a = _mm256_add_epi32(r2b0axc0b0a, r2b0bxc0b0b);
__m256i result3b = _mm256_add_epi32(r2b0cxc0b0c, r2b0dxc0b0d);
__m256i result3c = _mm256_add_epi32(r2b0exc0b0e, r2b0fxc0b0f);
__m256i result3d = _mm256_add_epi32(r2b0gxc0b0g, r2b0hxc0b0h);
__m256i result3ab = _mm256_add_epi32(result3a, result3b);
__m256i result3cd = _mm256_add_epi32(result3c, result3d);
__m256i result3abcd = _mm256_add_epi32(result3ab, result3cd);
//Result for row 3
__m256i r3b0axc0b0a = _mm256_madd_epi16(xmmRow3B0a, xmmCol0B0a);
__m256i r3b0bxc0b0b = _mm256_madd_epi16(xmmRow3B0b, xmmCol0B0b);
__m256i r3b0cxc0b0c = _mm256_madd_epi16(xmmRow3B0c, xmmCol0B0c);
__m256i r3b0dxc0b0d = _mm256_madd_epi16(xmmRow3B0d, xmmCol0B0d);
__m256i r3b0exc0b0e = _mm256_madd_epi16(xmmRow3B0e, xmmCol0B0e);
__m256i r3b0fxc0b0f = _mm256_madd_epi16(xmmRow3B0f, xmmCol0B0f);
__m256i r3b0gxc0b0g = _mm256_madd_epi16(xmmRow3B0g, xmmCol0B0g);
__m256i r3b0hxc0b0h = _mm256_madd_epi16(xmmRow3B0h, xmmCol0B0h);
__m256i result4a = _mm256_add_epi32(r3b0axc0b0a, r3b0bxc0b0b);
__m256i result4b = _mm256_add_epi32(r3b0cxc0b0c, r3b0dxc0b0d);
__m256i result4c = _mm256_add_epi32(r3b0exc0b0e, r3b0fxc0b0f);
__m256i result4d = _mm256_add_epi32(r3b0gxc0b0g, r3b0hxc0b0h);
__m256i result4ab = _mm256_add_epi32(result4a, result4b);
__m256i result4cd = _mm256_add_epi32(result4c, result4d);
__m256i result4abcd = _mm256_add_epi32(result4ab, result4cd);
//Now we can just add horizontally
*return1 = result1abcd;
*return2 = result2abcd;
*return3 = result3abcd;
*return4 = result4abcd;
}
}}}
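The kernels above leave their results as vectors of eight 32-bit partial sums: each _mm256_madd_epi16 multiplies adjacent pairs of 16-bit values and adds them into one 32-bit lane, and the following _mm256_add_epi32 calls only combine whole vectors. A caller still has to reduce the eight lanes of each returned __m256i to a single scalar dot product. A minimal sketch of such a horizontal reduction (assumes AVX2; this helper is illustrative and not part of the commit):

#include <immintrin.h>

// Sum the eight 32-bit lanes of an AVX2 vector into one scalar (illustrative helper, not from the commit).
static inline int HorizontalSumEpi32(__m256i v)
{
    __m128i lo  = _mm256_castsi256_si128(v);          // lanes 0..3
    __m128i hi  = _mm256_extracti128_si256(v, 1);     // lanes 4..7
    __m128i sum = _mm_add_epi32(lo, hi);              // fold the two halves together
    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_SHUFFLE(1, 0, 3, 2))); // fold upper pair onto lower pair
    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, _MM_SHUFFLE(2, 3, 0, 1))); // fold the remaining pair
    return _mm_cvtsi128_si32(sum);                    // lane 0 now holds the full dot product
}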

View file

@ -0,0 +1,32 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#include <xmmintrin.h>
#include <emmintrin.h>
#include <tmmintrin.h>
#include "BlockHandlerSSE.h"
#include "BlockMultiplierMatrixUtil.h"
namespace Microsoft { namespace MSR { namespace CNTK {
int BlockHandlerSSE::RowToColOffsetRewrittenA(int row, int kOffset, int blockSize, int rowsPerBlock, int origCols)
{
int rowIdx = row / rowsPerBlock;
int offsetFromBlockBeginning = row % rowsPerBlock;
int colIdx = kOffset * rowsPerBlock * blockSize + (offsetFromBlockBeginning * blockSize);
return (rowIdx * (origCols / blockSize) * rowsPerBlock * blockSize) + colIdx;
}
//col is the original column of B
//kOffset is the offset to the current block we are multiplying against (in absolute terms).
int BlockHandlerSSE::RowToColOffsetRewrittenB(int col, int kOffset, int blockSize, int origCols)
{
return (origCols * blockSize * kOffset) + (col * blockSize);
}
}}}
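RowToColOffsetRewrittenB encodes the rewritten layout of B: each column stores blockSize consecutive values per k-block, and all origCols columns of one k-block precede the next k-block. A small worked example with hypothetical dimensions (not taken from the commit):

#include <cassert>

// Same formula as BlockHandlerSSE::RowToColOffsetRewrittenB, restated here for illustration.
static int RowToColOffsetRewrittenB(int col, int kOffset, int blockSize, int origCols)
{
    return (origCols * blockSize * kOffset) + (col * blockSize);
}

int main()
{
    const int blockSize = 8, origCols = 4;
    assert(RowToColOffsetRewrittenB(0, 0, blockSize, origCols) == 0);   // first column, first k-block
    assert(RowToColOffsetRewrittenB(1, 0, blockSize, origCols) == 8);   // next column starts one block later
    assert(RowToColOffsetRewrittenB(0, 1, blockSize, origCols) == 32);  // next k-block starts after origCols * blockSize values
    return 0;
}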

File diff suppressed because it is too large. Load diff

File diff suppressed because it is too large. Load diff

View file

@ -0,0 +1,161 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#define NOMINMAX
#include <algorithm> // for std::min
#include <fstream>
#include <functional>
#include <iostream>
#include <limits>
#include <string.h>//for memset
#include "BlockMultiplierPlatform.h"
namespace Microsoft { namespace MSR { namespace CNTK {
template<typename ScalarT> void DumpMatrix(ScalarT* pDumpMe, int rows, int cols, std::ostream* pStream, int rowMax = std::numeric_limits<int>::max(),
int colMax = std::numeric_limits<int>::max())
{
for (int r = 0; r < std::min(rows, rowMax); ++r)
{
for (int c = 0; c < std::min(cols, colMax); ++c)
{
(*pStream) << pDumpMe[r * cols + c] << " ";
}
(*pStream) << std::endl;
}
}
// Turn a row+col into an absolute offset
FORCEINLINE int RowColToOffset(int idxRow, int idxCol, int numCols)
{
return idxRow * numCols + idxCol;
}
template<typename ScalarT>struct TransposeArgs
{
int r;
ScalarT* transposeMe;
ScalarT* transposed;
int origRows;
int origCols;
};
template<class ScalarT>void TransposeThread(TransposeArgs<ScalarT> ta)
{
for (int c = 0; c < ta.origCols; ++c)
{
//new c,r = old r,c
int oldOffset = RowColToOffset(ta.r, c, ta.origCols);
int newOffset = RowColToOffset(c, ta.r, ta.origRows);
ta.transposed[newOffset] = ta.transposeMe[oldOffset];
}
}
template<typename ScalarT> class TransposeThreadType
{
public:
void operator()(TransposeArgs<ScalarT> ta)
{
TransposeThread<ScalarT>(ta);
}
};
template<class ScalarT> void Transpose(ScalarT* transposeMe, ScalarT* transposed, int origRows, int origCols)
{
#pragma omp parallel for
for (int r = 0; r < origRows; ++r)
{
for (int c = 0; c < origCols; ++c)
{
int oldOffset = RowColToOffset(r, c, origCols);
int newOffset = RowColToOffset(c, r, origRows);
transposed[newOffset] = transposeMe[oldOffset];
}
}
}
template<typename ScalarT> ScalarT* CreateAlignedMatrix(int m, int n, ScalarT initVal, int alignment = 64)
{
ScalarT* ret = (ScalarT*)ALIGNED_ALLOC(sizeof(ScalarT) * (m * n), alignment);
if (initVal != 0)
{
for (int i = 0; i < m * n; ++i)
{
ret[i] = initVal;// +i;
}
}
else
{
memset(ret, 0, sizeof(ScalarT) * m * n);
}
return ret;
}
template<typename ScalarT> void FreeAlignedMatrix(ScalarT* destroyMe)
{
ALIGNED_FREE(destroyMe);
}
template<typename ScalarT> double MeanSquaredError(ScalarT* lhs, ScalarT* rhs, int m, int n)
{
double accumulatedError = 0.0;
for (int r = 0; r < m; ++r)
{
for(int c = 0; c < n; ++c)
{
double err = ((double)lhs[RowColToOffset(r, c, n)] - (double)rhs[RowColToOffset(r, c, n)]);
err = err * err;
accumulatedError += err;
}
}
return accumulatedError / (double)(m * n);
}
template<typename ScalarT> void RandInitIntMatrix(ScalarT* initMe, int m, int n, ScalarT bound)
{
ScalarT* curr = initMe;
for (int i = 0; i < m * n; ++i)
{
*curr++ = rand() % bound;
}
}
//Helper fn for tests
template<typename ScalarT>static void RandInitFloatMatrix(ScalarT* initMe, int m, int n, ScalarT min, ScalarT max)
{
for (int i = 0; i < m * n; ++i)
{
initMe[i] = min + ((max - min) * ((ScalarT)rand() / RAND_MAX));
}
}
//Viewing matrices and troubleshooting is a lot easier in Octave.
//Utility fn for exporting to Octave format
template<typename ScalarT>void DumpMatrixToOctaveFormat(const ScalarT* dumpMe, int rows, int cols, const char* fileName, const char* id)
{
std::ofstream ofs(fileName);
ofs << "# Created by gemmbenchmark" << std::endl <<
"# name: " << id << std::endl <<
"# type: matrix" << std::endl <<
"# rows: " << rows << std::endl <<
"# columns: " << cols << std::endl;
for (int r = 0; r < rows; ++r)
{
for (int c = 0; c < cols; ++c)
{
ofs << ' ' << (ScalarT)(dumpMe[(cols * r) + c]);
}
ofs << std::endl;
}
}
}}} //End namespaces
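The helpers above are self-contained, so a short usage sketch may help; the include path and the matrix dimensions below are assumptions for illustration only:

#include <iostream>
#include "BlockMultiplierMatrixUtil.h" // assumed include path

using namespace Microsoft::MSR::CNTK;

int main()
{
    const int m = 4, n = 8;
    float* a  = CreateAlignedMatrix<float>(m, n, 0.0f);   // zero-initialized, 64-byte aligned
    float* at = CreateAlignedMatrix<float>(n, m, 0.0f);
    RandInitFloatMatrix(a, m, n, -1.0f, 1.0f);
    Transpose(a, at, m, n);                               // at becomes the n x m transpose of a
    DumpMatrix(a, m, n, &std::cout);
    DumpMatrixToOctaveFormat(a, m, n, "a.mat", "a");      // inspect in Octave with: load a.mat
    std::cout << "MSE(a, a) = " << MeanSquaredError(a, a, m, n) << std::endl; // 0 by construction
    FreeAlignedMatrix(at);
    FreeAlignedMatrix(a);
    return 0;
}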

View file

@ -0,0 +1,19 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#ifdef _MSC_VER
#define ALIGNED_ALLOC(bytes,alignment) _aligned_malloc(bytes,alignment)
#define ALIGNED_FREE(ptr) _aligned_free(ptr)
#define FORCEINLINE __forceinline
#else
#ifdef __GNUC__
#include <stdlib.h>
#define ALIGNED_ALLOC(bytes,alignment) aligned_alloc(alignment,bytes)
#define ALIGNED_FREE(ptr) free(ptr)
//#define FORCEINLINE __attribute__((always_inline))
#define FORCEINLINE inline
#endif
#endif
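A minimal sketch of the macros in use (not part of the commit); the 32-byte aligned loads in the block handlers need an alignment of at least 32, which the 64 used throughout these files satisfies:

#include <cstring>
#include "BlockMultiplierPlatform.h" // assumed include path

int main()
{
    // 128 shorts = 256 bytes, a multiple of the 64-byte alignment (aligned_alloc on the GCC path requires this).
    short* block = static_cast<short*>(ALIGNED_ALLOC(128 * sizeof(short), 64));
    std::memset(block, 0, 128 * sizeof(short));
    ALIGNED_FREE(block);
    return 0;
}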

View file

@ -8,6 +8,13 @@
namespace Microsoft { namespace MSR { namespace CNTK {
#ifndef CPUONLY
inline static void CheckCudaReturnCode(cudaError_t rc, const char* msg)
{
if (rc != cudaSuccess)
RuntimeError("%s: %s (cuda error %d)", msg, cudaGetErrorString(rc), (int)rc);
}
CUDAPageLockedMemAllocator::CUDAPageLockedMemAllocator(int deviceID)
: m_deviceID(deviceID)
{
@ -15,19 +22,18 @@ CUDAPageLockedMemAllocator::CUDAPageLockedMemAllocator(int deviceID)
void* CUDAPageLockedMemAllocator::Malloc(size_t size, int deviceId)
{
void* p;
cudaSetDevice(deviceId);
void* p = nullptr;
CheckCudaReturnCode(cudaSetDevice(deviceId), "Cannot set cuda device");
// Note: I ask for cudaHostAllocDefault but cudaHostGetFlags() shows that it is allocated as 'cudaHostAllocMapped'
cudaHostAlloc(&p, size, cudaHostAllocDefault) || "Malloc in CUDAPageLockedMemAllocator failed";
CheckCudaReturnCode(cudaHostAlloc(&p, size, cudaHostAllocDefault), "Malloc in CUDAPageLockedMemAllocator failed");
return p;
}
void CUDAPageLockedMemAllocator::Free(void* p, int deviceId)
{
cudaSetDevice(deviceId);
cudaFreeHost(p) || "Free in CUDAPageLockedMemAllocator failed";
CheckCudaReturnCode(cudaSetDevice(deviceId), "Cannot set cuda device");
CheckCudaReturnCode(cudaFreeHost(p), "Free in CUDAPageLockedMemAllocator failed");
}
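// (Why the change: the old `cudaFreeHost(p) || "..."` and `cudaHostAlloc(...) || "..."` pattern never
// surfaced failures, because the `||` expression simply discards the cudaError_t and the string literal
// is always truthy; CheckCudaReturnCode is presumably introduced so that every failing call raises a RuntimeError.)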
void* CUDAPageLockedMemAllocator::Malloc(size_t size)

View file

@ -4278,11 +4278,16 @@ void GPUMatrix<ElemType>::RCRFTransGrdCompute(const GPUMatrix<ElemType>& lbls,
template <class ElemType>
static shared_ptr<GPUMatrix<ElemType>> GetOnesVector(size_t N, DEVICEID_TYPE deviceId)
{
// using an array of shared_ptrs because those are thread-safe. The objects themselves are immutable.
// And using a plain array so this will never get freed, avoiding free-after-DLL-unload issues.
static shared_ptr<GPUMatrix<ElemType>> onesCache[32]; // cache of objects
if (deviceId >= _countof(onesCache))
LogicError("GetOnesVector: onesCache[] too small (%d entries), increase (you need %d) and recompile.", (int) _countof(onesCache), (int) deviceId + 1);
// using a dynamically allocated array so this will never get freed, avoiding free-after-DLL-unload issues,
// and using shared_ptrs so that we never leak more than CacheSize elements.
// With a plain static array we would have to manage the objects' lifetime ourselves, and the destructor of every element would run at shutdown.
const int CacheSize = 32;
static shared_ptr<GPUMatrix<ElemType>> * onesCache = new shared_ptr<GPUMatrix<ElemType>>[CacheSize]; // cache of objects
if (deviceId >= CacheSize){
LogicError("GetOnesVector: onesCache[] too small (%d entries), increase (you need %d) and recompile.", CacheSize, (int)deviceId + 1);
}
auto p = onesCache[deviceId];
if (!p || p->GetNumRows() < N) // must (re-)allocate
{

View file

@ -5,7 +5,7 @@
// helpful macros
// TODO: the file's name is too general to be included from outside; MathHelpers.h?
//#pragma once
#pragma once
// iterators
#undef foreach_row

View file

@ -161,6 +161,11 @@
<ClInclude Include="..\Common\Include\File.h" />
<ClInclude Include="..\Common\Include\fileutil.h" />
<ClInclude Include="BatchNormalizationEngine.h" />
<ClInclude Include="BlockHandlerAVX.h" />
<ClInclude Include="BlockHandlerSSE.h" />
<ClInclude Include="BlockMultiplier.h" />
<ClInclude Include="BlockMultiplierMatrixUtil.h" />
<ClInclude Include="BlockMultiplierPlatform.h" />
<ClInclude Include="CommonMatrix.h" />
<ClInclude Include="ConvolutionEngine.h" />
<ClInclude Include="ConvolveGeometry.h" />
@ -190,6 +195,8 @@
</ItemGroup>
<ItemGroup>
<ClCompile Include="BatchNormalizationEngine.cpp" />
<ClCompile Include="BlockHandlerAVX.cpp" />
<ClCompile Include="BlockHandlerSSE.cpp" />
<ClCompile Include="ConvolutionEngine.cpp" />
<ClCompile Include="CPURNGHandle.cpp" />
<ClCompile Include="CPUSparseMatrix.cpp" />

View file

@ -42,6 +42,12 @@
<Filter>CPU</Filter>
</ClCompile>
<ClCompile Include="RNGHandle.cpp" />
<ClCompile Include="BlockHandlerAVX.cpp">
<Filter>CPU</Filter>
</ClCompile>
<ClCompile Include="BlockHandlerSSE.cpp">
<Filter>CPU</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="CommonMatrix.h" />
@ -105,6 +111,18 @@
<ClInclude Include="CPURNGHandle.h">
<Filter>CPU</Filter>
</ClInclude>
<ClInclude Include="BlockHandlerAVX.h">
<Filter>CPU</Filter>
</ClInclude>
<ClInclude Include="BlockHandlerSSE.h">
<Filter>CPU</Filter>
</ClInclude>
<ClInclude Include="BlockMultiplier.h">
<Filter>CPU</Filter>
</ClInclude>
<ClInclude Include="BlockMultiplierPlatform.h">
<Filter>CPU</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<None Include="GPUMatrix.h">
@ -155,4 +173,4 @@
<UniqueIdentifier>{8f982dac-298d-4e48-b060-8e6cba5ff554}</UniqueIdentifier>
</Filter>
</ItemGroup>
</Project>
</Project>

View file

@ -14,7 +14,9 @@
#pragma warning(push)
#pragma warning(disable : 4251) // needs to have dll-interface to be used by clients of... caused by TensorView::m_shape which is only private. We use the same compiler everywhere.
template<class ElemType> struct TensorTest;
namespace Microsoft { namespace MSR { namespace CNTK { namespace Test {
template <class ElemType> struct TensorTest;
}}}}
// This class is exported from the Math.dll.
namespace Microsoft { namespace MSR { namespace CNTK {
@ -151,7 +153,7 @@ private:
const Matrix<ElemType>& GetSOB() const { return *m_sob; }
Matrix<ElemType>& GetSOB() { return *m_sob; }
friend struct ::TensorTest<ElemType>;
friend Test::TensorTest<ElemType>;
// -------------------------------------------------------------------
// sob members

View file

@ -40,10 +40,9 @@ class ondevice
public:
ondevice(size_t deviceid)
{
cudaSetDevice((int) deviceid) || "cudaSetDevice failed!";
}
~ondevice()
{
auto rc = cudaSetDevice((int)deviceid);
if (rc != cudaSuccess)
RuntimeError("Cannot set cuda device: %s (cuda error %d)", cudaGetErrorString(rc), (int)rc);
}
};
} }

View file

@ -110,9 +110,6 @@
<PrecompiledHeader>Create</PrecompiledHeader>
</ClCompile>
</ItemGroup>
<ItemGroup>
<None Include="uci_to_cntk_text_format_converter.py" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" />
</Project>

View file

@ -47,13 +47,5 @@
<Filter Include="Common\Include">
<UniqueIdentifier>{C6F55578-121A-4D7C-8F57-4172BC5C463B}</UniqueIdentifier>
</Filter>
<Filter Include="Scripts">
<UniqueIdentifier>{cd70d891-88aa-40a4-8e47-0e31e4cac48e}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<None Include="uci_to_cntk_text_format_converter.py">
<Filter>Scripts</Filter>
</None>
</ItemGroup>
</Project>
</Project>

View file

@ -16,6 +16,11 @@
namespace Microsoft { namespace MSR { namespace CNTK {
inline bool IsDigit(char c)
{
return '0' <= c && c <= '9';
}
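// (A plain ASCII range check like the one above side-steps the locale lookup of std::isdigit and the
// undefined behavior of passing it a plain char with a negative value; presumably the reason isdigit()
// is replaced throughout this file.)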
enum State
{
Init = 0,
@ -38,7 +43,7 @@ public:
void GetSequence(size_t sequenceId, std::vector<SequenceDataPtr>& result) override;
// A map from sequence ids to the sequence data.
std::map<size_t, SequenceBuffer> m_sequenceMap;
std::vector<SequenceBuffer> m_sequenceMap;
// chunk id (copied from the descriptor)
ChunkIdType m_id;
@ -234,40 +239,11 @@ TextParser<ElemType>::TextDataChunk::TextDataChunk(const ChunkDescriptor& descri
template <class ElemType>
void TextParser<ElemType>::TextDataChunk::GetSequence(size_t sequenceId, std::vector<SequenceDataPtr>& result)
{
auto it = m_sequenceMap.find(sequenceId);
assert(it != m_sequenceMap.end());
assert(sequenceId < m_sequenceMap.size());
result.reserve(m_parser->m_streamInfos.size());
const auto& sequenceData = it->second;
for (size_t j = 0; j < m_parser->m_streamInfos.size(); ++j)
{
InputStreamBuffer* input = sequenceData[j].get();
const StreamInfo& stream = m_parser->m_streamInfos[j];
SequenceDataPtr data;
if (stream.m_type == StorageType::dense)
{
auto denseData = make_shared<DenseSequenceData>();
denseData->m_sampleLayout = m_parser->m_streams[j]->m_sampleLayout;
data = denseData;
}
else
{
auto sparseData = make_shared<SparseSequenceData>();
SparseInputStreamBuffer* sparseInput = static_cast<SparseInputStreamBuffer*>(input);
sparseData->m_indices = sparseInput->m_indices.data();
sparseData->m_nnzCounts.reserve(sparseInput->m_nnzCounts.size());
copy(sparseInput->m_nnzCounts.begin(), sparseInput->m_nnzCounts.end(),
back_inserter(sparseData->m_nnzCounts));
sparseData->m_totalNnzCount = sparseInput->m_totalNnzCount;
assert(input->m_numberOfSamples == sparseInput->m_nnzCounts.size());
data = sparseData;
}
data->m_data = input->m_buffer.data();
data->m_numberOfSamples = input->m_numberOfSamples;
data->m_chunk = shared_from_this();
data->m_id = sequenceId;
result.push_back(data);
}
const auto& sequenceData = m_sequenceMap[sequenceId];
result.insert(result.end(), sequenceData.begin(), sequenceData.end());
}
template <class ElemType>
@ -292,11 +268,10 @@ ChunkPtr TextParser<ElemType>::GetChunk(ChunkIdType chunkId)
template <class ElemType>
void TextParser<ElemType>::LoadChunk(TextChunkPtr& chunk, const ChunkDescriptor& descriptor)
{
chunk->m_sequenceMap.resize(descriptor.m_sequences.size());
for (const auto& sequenceDescriptor : descriptor.m_sequences)
{
chunk->m_sequenceMap.insert(make_pair(
sequenceDescriptor.m_id,
LoadSequence(sequenceDescriptor)));
chunk->m_sequenceMap[sequenceDescriptor.m_id] = LoadSequence(sequenceDescriptor);
}
}
@ -480,13 +455,39 @@ typename TextParser<ElemType>::SequenceBuffer TextParser<ElemType>::LoadSequence
GetSequenceKey(sequenceDsc).c_str(), GetFileInfo().c_str(), numRowsRead, expectedRowCount);
}
FillSequenceMetadata(sequence, sequenceDsc.m_id);
return sequence;
}
template<class ElemType>
void TextParser<ElemType>::FillSequenceMetadata(SequenceBuffer& sequenceData, size_t sequenceId)
{
for (size_t j = 0; j < m_streamInfos.size(); ++j)
{
const StreamInfo& stream = m_streamInfos[j];
SequenceDataBase* data = sequenceData[j].get();
if (stream.m_type == StorageType::dense)
{
auto denseData = static_cast<DenseInputStreamBuffer*>(data);
denseData->m_sampleLayout = m_streams[j]->m_sampleLayout;
data->m_data = denseData->m_buffer.data();
}
else
{
auto sparseData = static_cast<SparseInputStreamBuffer*>(data);
sparseData->m_indices = sparseData->m_indicesBuffer.data();
assert(data->m_numberOfSamples == sparseData->m_nnzCounts.size());
data->m_data = sparseData->m_buffer.data();
}
data->m_id = sequenceId;
}
}
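// (Presumably FillSequenceMetadata runs only once the whole sequence has been parsed: m_data and
// m_indices point straight into the input buffers, and those pointers would be invalidated if the
// vectors still had to grow while rows are being read.)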
template <class ElemType>
bool TextParser<ElemType>::TryReadRow(SequenceBuffer& sequence, size_t& bytesToRead)
{
while (bytesToRead && CanRead() && isdigit(*m_pos))
while (bytesToRead && CanRead() && IsDigit(*m_pos))
{
// skip sequence ids
++m_pos;
@ -616,7 +617,7 @@ bool TextParser<ElemType>::TryReadSample(SequenceBuffer& sequence, size_t& bytes
{
SparseInputStreamBuffer* data = reinterpret_cast<SparseInputStreamBuffer*>(sequence[id].get());
vector<ElemType>& values = data->m_buffer;
vector<IndexType>& indices = data->m_indices;
vector<IndexType>& indices = data->m_indicesBuffer;
assert(values.size() == indices.size());
size_t size = values.size();
if (!TryReadSparseSample(values, indices, stream.m_sampleDimension, bytesToRead))
@ -919,7 +920,7 @@ bool TextParser<ElemType>::TryReadUint64(size_t& value, size_t& bytesToRead)
{
char c = *m_pos;
if (!isdigit(c))
if (!IsDigit(c))
{
return found;
}
@ -977,7 +978,7 @@ bool TextParser<ElemType>::TryReadRealNumber(ElemType& value, size_t& bytesToRea
{
case State::Init:
// the number must either start with a number or a sign
if (isdigit(c))
if (IsDigit(c))
{
state = IntegralPart;
number = (c - '0');
@ -1001,7 +1002,7 @@ bool TextParser<ElemType>::TryReadRealNumber(ElemType& value, size_t& bytesToRea
break;
case Sign:
// the sign must be followed by a number
if (isdigit(c))
if (IsDigit(c))
{
state = IntegralPart;
number = (c - '0');
@ -1019,7 +1020,7 @@ bool TextParser<ElemType>::TryReadRealNumber(ElemType& value, size_t& bytesToRea
}
break;
case IntegralPart:
if (isdigit(c))
if (IsDigit(c))
{
number = number * 10 + (c - '0');
}
@ -1040,7 +1041,7 @@ bool TextParser<ElemType>::TryReadRealNumber(ElemType& value, size_t& bytesToRea
}
break;
case Period:
if (isdigit(c))
if (IsDigit(c))
{
state = FractionalPart;
coefficient = number;
@ -1054,7 +1055,7 @@ bool TextParser<ElemType>::TryReadRealNumber(ElemType& value, size_t& bytesToRea
}
break;
case FractionalPart:
if (isdigit(c))
if (IsDigit(c))
{
// TODO: ignore if number of precision digits > FLT_[MANT_]DIG/DBL_[MANT_]DIG
// no state change
@ -1079,7 +1080,7 @@ bool TextParser<ElemType>::TryReadRealNumber(ElemType& value, size_t& bytesToRea
break;
case TheLetterE:
// followed with optional minus or plus sign and nonempty sequence of decimal digits
if (isdigit(c))
if (IsDigit(c))
{
state = Exponent;
negative = false;
@ -1104,7 +1105,7 @@ bool TextParser<ElemType>::TryReadRealNumber(ElemType& value, size_t& bytesToRea
break;
case ExponentSign:
// exponent sign must be followed by a number
if (isdigit(c))
if (IsDigit(c))
{
state = Exponent;
number = (c - '0');
@ -1122,7 +1123,7 @@ bool TextParser<ElemType>::TryReadRealNumber(ElemType& value, size_t& bytesToRea
}
break;
case Exponent:
if (isdigit(c))
if (IsDigit(c))
{
// no state change
number = number * 10 + (c - '0');

View file

@ -42,37 +42,33 @@ private:
// Builds an index of the input data.
void Initialize();
// A buffer to keep data for all samples in a (variable length) sequence
// from a single input stream.
struct InputStreamBuffer
{
virtual ~InputStreamBuffer() { };
uint32_t m_numberOfSamples = 0;
std::vector<ElemType> m_buffer;
};
struct DenseInputStreamBuffer : InputStreamBuffer
struct DenseInputStreamBuffer : DenseSequenceData
{
// capacity = expected number of samples * sample size
DenseInputStreamBuffer(size_t capacity)
{
InputStreamBuffer::m_buffer.reserve(capacity);
m_buffer.reserve(capacity);
}
std::vector<ElemType> m_buffer;
};
// In case of sparse input, we also need a vector of
// indices (one index for each input value) and a vector
// of NNZ counts (one for each sample).
struct SparseInputStreamBuffer : InputStreamBuffer
struct SparseInputStreamBuffer : SparseSequenceData
{
IndexType m_totalNnzCount = 0;
std::vector<IndexType> m_indices;
std::vector<IndexType> m_nnzCounts;
SparseInputStreamBuffer()
{
m_totalNnzCount = 0;
}
std::vector<IndexType> m_indicesBuffer;
std::vector<ElemType> m_buffer;
};
// A sequence buffer is a vector that contains an input buffer for each input stream.
typedef std::vector<std::unique_ptr<InputStreamBuffer>> SequenceBuffer;
// A sequence buffer is a vector that contains sequence data for each input stream.
typedef std::vector<SequenceDataPtr> SequenceBuffer;
// A chunk of input data in the text format.
class TextDataChunk;
@ -176,6 +172,9 @@ private:
TextParser(CorpusDescriptorPtr corpus, const std::wstring& filename, const vector<StreamDescriptor>& streams);
// Fills in metadata members so that the data conforms to the exposed SequenceData interface.
void FillSequenceMetadata(SequenceBuffer& sequenceBuffer, size_t sequenceId);
void SetTraceLevel(unsigned int traceLevel);
void SetMaxAllowedErrors(unsigned int maxErrors);

View file

@ -18,7 +18,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
const char ESCAPE_SYMBOL = '#';
const auto BUFFER_SIZE = 256 * 1024;
const auto BUFFER_SIZE = 2 * 1024 * 1024;
inline bool isPrintable(char c)
{

Some files were not shown because too many files have changed in this diff. Show more