This commit is contained in:
Frank Seide 2017-02-20 18:49:20 -08:00
Parent 6042d0699f da7ce0aa8f
Commit 65bf17f4f4
171 changed files with 26582 additions and 2293 deletions

View file

@@ -119,6 +119,12 @@
<LinkIncremental>$(DebugBuild)</LinkIncremental>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<PreprocessorDefinitions>HAS_MPI=1</PreprocessorDefinitions>
</ClCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(ConfigurationType)' == 'StaticLibrary'">
<ClCompile>
<WarningLevel>Level4</WarningLevel>

View file

@@ -1484,7 +1484,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "BrainScript", "BrainScript"
Examples\Image\Classification\GoogLeNet\BN-Inception\BrainScript\InceptionLayers.bs = Examples\Image\Classification\GoogLeNet\BN-Inception\BrainScript\InceptionLayers.bs
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKLibraryCPPEvalExamplesTest", "Tests\EndToEndTests\EvalClientTests\CNTKLibraryCPPEvalExamplesTest\CNTKLibraryCPPEvalExamplesTest.vcxproj.vcxproj", "{D771A06D-CC25-4582-B5CD-D2A4782BB005}"
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKLibraryCPPEvalExamplesTest", "Tests\EndToEndTests\EvalClientTests\CNTKLibraryCPPEvalExamplesTest\CNTKLibraryCPPEvalExamplesTest.vcxproj", "{D771A06D-CC25-4582-B5CD-D2A4782BB005}"
ProjectSection(ProjectDependencies) = postProject
{91973E60-A7BE-4C86-8FDB-59C88A0B3715} = {91973E60-A7BE-4C86-8FDB-59C88A0B3715}
{E5606ECE-48CA-4464-BB12-09D81D02B9EF} = {E5606ECE-48CA-4464-BB12-09D81D02B9EF}

View file

@@ -0,0 +1,20 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// CNTKLibraryCPPEvalCPUOnlyExamples.cpp : Sample application shows how to evaluate a model using CNTK V2 API.
//
#include <stdio.h>
void MultiThreadsEvaluation(bool);
int main()
{
fprintf(stderr, "\n##### Run CNTKLibraryCPPEvalCPUOnlyExamples on CPU. #####\n");
MultiThreadsEvaluation(false);
fprintf(stderr, "Evaluation complete.\n");
fflush(stderr);
}

View file

@@ -1,20 +1,27 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<ItemGroup>
<ClCompile Include="CNTKLibraryCPPEvalExamples.cpp" />
<ClCompile Include="CNTKLibraryCPPEvalCPUOnlyExamples.cpp" />
<ClCompile Include="EvalMultithreads.cpp" />
</ItemGroup>
<ItemGroup>
<None Include="packages.config" />
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{D771A06D-CC25-4582-B5CD-D2A4782BB005}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>CNTKLibraryCPPEvalExamples</RootNamespace>
<ProjectName>CNTKLibraryCPPEvalExamples</ProjectName>
<RootNamespace>CNTKLibraryCPPEvalCPUOnlyExamples</RootNamespace>
<ProjectName>CNTKLibraryCPPEvalCPUOnlyExamples</ProjectName>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
@@ -24,6 +31,13 @@
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<WholeProgramOptimization>false</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
@@ -31,12 +45,14 @@
<PropertyGroup>
<OutDir>$(SolutionDir)..\..\$(Platform)\$(ProjectName).$(Configuration)\</OutDir>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>false</LinkIncremental>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level4</WarningLevel>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<PreprocessorDefinitions>WIN32;_CONSOLE;UNICODE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(SolutionDir)..\..\Include</AdditionalIncludeDirectories>
<TreatWarningAsError>true</TreatWarningAsError>
<SDLCheck>true</SDLCheck>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
@@ -45,8 +61,6 @@
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<AdditionalLibraryDirectories>$(SolutionDir)\..\..\cntk</AdditionalLibraryDirectories>
<AdditionalDependencies>CNTKLibrary-2.0.lib;%(AdditionalDependencies)</AdditionalDependencies>
<OutputFile>$(OutDir)$(TargetName)$(TargetExt)</OutputFile>
</Link>
</ItemDefinitionGroup>
@@ -55,7 +69,6 @@
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<AdditionalOptions>/d2Zi+ %(AdditionalOptions)</AdditionalOptions>
<RuntimeLibrary Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MultiThreaded</RuntimeLibrary>
@@ -66,7 +79,32 @@
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<Optimization>Disabled</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<AdditionalOptions>/d2Zi+ %(AdditionalOptions)</AdditionalOptions>
<RuntimeLibrary Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">MultiThreadedDebug</RuntimeLibrary>
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
<MinimalRebuild>false</MinimalRebuild>
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="..\packages\CNTK.CPUOnly.2.0-beta11\build\native\CNTK.CPUOnly.targets" Condition="Exists('..\packages\CNTK.CPUOnly.2.0-beta11\build\native\CNTK.CPUOnly.targets')" />
</ImportGroup>
<Target Name="EnsureNuGetPackageBuildImports" BeforeTargets="PrepareForBuild">
<PropertyGroup>
<ErrorText>This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}.</ErrorText>
</PropertyGroup>
<Error Condition="!Exists('..\packages\CNTK.CPUOnly.2.0-beta11\build\native\CNTK.CPUOnly.targets')" Text="$([System.String]::Format('$(ErrorText)', '..\packages\CNTK.CPUOnly.2.0-beta11\build\native\CNTK.CPUOnly.targets'))" />
</Target>
</Project>

View file

@@ -15,11 +15,14 @@
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="CNTKLibraryCPPEvalExamples.cpp">
<ClCompile Include="CNTKLibraryCPPEvalCPUOnlyExamples.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="EvalMultithreads.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<None Include="packages.config" />
</ItemGroup>
</Project>

View file

@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="CNTK.CPUOnly" version="2.0-beta11" targetFramework="native" />
</packages>

View file

@@ -1,30 +0,0 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// CNTKLibraryCPPevalExamples.cpp : Sample application shows how to evaluate a model using CNTK V2 API.
//
#include <stdio.h>
// define GPU_AVAILABLE, if you want to run evaluation on a GPU device. You also need CNTK GPU binaries.
// undefine GPU_AVAILABLE, if you want to run evaluation on a CPU device.
// #define GPU_AVAILABLE
void MultiThreadsEvaluation(bool);
int main()
{
#ifdef GPU_AVAILABLE
fprintf(stderr, "\n##### Run eval on GPU device. #####\n");
MultiThreadsEvaluation(true);
#else
fprintf(stderr, "\n##### Run eval on CPU device. #####\n");
MultiThreadsEvaluation(false);
#endif
fprintf(stderr, "Evaluation complete.\n");
fflush(stderr);
}

View file

@@ -0,0 +1,20 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// CNTKLibraryCPPEvalGPUExamples.cpp : Sample application shows how to evaluate a model using CNTK V2 API.
//
#include <stdio.h>
void MultiThreadsEvaluation(bool);
int main()
{
fprintf(stderr, "\n##### Run CNTKLibraryCPPEvalGPUExamples on CPU and GPU. #####\n");
MultiThreadsEvaluation(true);
fprintf(stderr, "Evaluation complete.\n");
fflush(stderr);
}

View file

@@ -0,0 +1,110 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\CNTKLibraryCPPEvalCPUOnlyExamples\EvalMultithreads.cpp" />
<ClCompile Include="CNTKLibraryCPPEvalGPUExamples.cpp" />
</ItemGroup>
<ItemGroup>
<None Include="packages.config" />
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{13489884-3A6A-4023-8CF1-D8C78DDAF952}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>CNTKLibraryCPPEvalGPUExamples</RootNamespace>
<ProjectName>CNTKLibraryCPPEvalGPUExamples</ProjectName>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<WholeProgramOptimization>false</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<OutDir>$(SolutionDir)..\..\$(Platform)\$(ProjectName).$(Configuration)\</OutDir>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>false</LinkIncremental>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level4</WarningLevel>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<PreprocessorDefinitions>WIN32;_CONSOLE;UNICODE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<TreatWarningAsError>true</TreatWarningAsError>
<SDLCheck>true</SDLCheck>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<FloatingPointModel>Fast</FloatingPointModel>
<OpenMPSupport>true</OpenMPSupport>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<OutputFile>$(OutDir)$(TargetName)$(TargetExt)</OutputFile>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<AdditionalOptions>/d2Zi+ %(AdditionalOptions)</AdditionalOptions>
<RuntimeLibrary Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<Optimization>Disabled</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<AdditionalOptions>/d2Zi+ %(AdditionalOptions)</AdditionalOptions>
<RuntimeLibrary Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">MultiThreadedDebug</RuntimeLibrary>
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
<MinimalRebuild>false</MinimalRebuild>
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="..\packages\CNTK.GPU.2.0-beta11\build\native\CNTK.GPU.targets" Condition="Exists('..\packages\CNTK.GPU.2.0-beta11\build\native\CNTK.GPU.targets')" />
</ImportGroup>
<Target Name="EnsureNuGetPackageBuildImports" BeforeTargets="PrepareForBuild">
<PropertyGroup>
<ErrorText>This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}.</ErrorText>
</PropertyGroup>
<Error Condition="!Exists('..\packages\CNTK.GPU.2.0-beta11\build\native\CNTK.GPU.targets')" Text="$([System.String]::Format('$(ErrorText)', '..\packages\CNTK.GPU.2.0-beta11\build\native\CNTK.GPU.targets'))" />
</Target>
</Project>

View file

@@ -0,0 +1,28 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
<Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
<Extensions>h;hh;hpp;hxx;hm;inl;inc;xsd</Extensions>
</Filter>
<Filter Include="Resource Files">
<UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
<Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\CNTKLibraryCPPEvalCPUOnlyExamples\EvalMultithreads.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="CNTKLibraryCPPEvalGPUExamples.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<None Include="packages.config" />
</ItemGroup>
</Project>

View file

@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="CNTK.GPU" version="2.0-beta11" targetFramework="native" />
</packages>

View file

@@ -39,7 +39,7 @@
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
</PropertyGroup>
<ItemGroup>
<Reference Include="CNTKLibraryManaged-2.0, Version=1.0.0.0, Culture=neutral, processorArchitecture=AMD64">
<Reference Include="CNTKLibraryManaged-2.0, Version=1.0.0.0, Culture=neutral, PublicKeyToken=21fff2ec8197defe, processorArchitecture=AMD64">
<HintPath>..\packages\CNTK.CPUOnly.2.0-beta11\lib\net45\x64\CNTKLibraryManaged-2.0.dll</HintPath>
<Private>True</Private>
</Reference>
@@ -72,4 +72,4 @@
<Target Name="AfterBuild">
</Target>
-->
</Project>
</Project>

View file

@@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="CNTK.CPUOnly" version="2.0-beta11" targetFramework="net45" />
</packages>
</packages>

View file

@@ -39,7 +39,7 @@
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
</PropertyGroup>
<ItemGroup>
<Reference Include="CNTKLibraryManaged-2.0, Version=1.0.0.0, Culture=neutral, processorArchitecture=AMD64">
<Reference Include="CNTKLibraryManaged-2.0, Version=1.0.0.0, Culture=neutral, PublicKeyToken=a82c1f3f67b62253, processorArchitecture=AMD64">
<HintPath>..\packages\CNTK.GPU.2.0-beta11\lib\net45\x64\CNTKLibraryManaged-2.0.dll</HintPath>
<Private>True</Private>
</Reference>
@@ -76,4 +76,4 @@
<Target Name="AfterBuild">
</Target>
-->
</Project>
</Project>

View file

@@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="CNTK.GPU" version="2.0-beta11" targetFramework="net45" />
</packages>
</packages>

View file

@@ -3,19 +3,22 @@ Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 14
VisualStudioVersion = 14.0.25420.1
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKLibraryCPPEvalExamples", "CNTKLibraryCPPEvalExamples\CNTKLibraryCPPEvalExamples.vcxproj", "{D771A06D-CC25-4582-B5CD-D2A4782BB005}"
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKLibraryCPPEvalCPUOnlyExamples", "CNTKLibraryCPPEvalCPUOnlyExamples\CNTKLibraryCPPEvalCPUOnlyExamples.vcxproj", "{D771A06D-CC25-4582-B5CD-D2A4782BB005}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CNTKLibraryCSEvalCPUOnlyExamples", "CNTKLibraryCSEvalCPUOnlyExamples\CNTKLibraryCSEvalCPUOnlyExamples.csproj", "{8AAD7322-10B1-48C3-9BC7-005A7910C5E6}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CNTKLibraryCSEvalGPUExamples", "CNTKLibraryCSEvalGPUExamples\CNTKLibraryCSEvalGPUExamples.csproj", "{307E5BAC-DA03-45D2-ADEC-FE6620090109}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKLibraryCPPEvalGPUExamples", "CNTKLibraryCPPEvalGPUExamples\CNTKLibraryCPPEvalGPUExamples.vcxproj", "{13489884-3A6A-4023-8CF1-D8C78DDAF952}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{D771A06D-CC25-4582-B5CD-D2A4782BB005}.Debug|x64.ActiveCfg = Release|x64
{D771A06D-CC25-4582-B5CD-D2A4782BB005}.Debug|x64.ActiveCfg = Debug|x64
{D771A06D-CC25-4582-B5CD-D2A4782BB005}.Debug|x64.Build.0 = Debug|x64
{D771A06D-CC25-4582-B5CD-D2A4782BB005}.Release|x64.ActiveCfg = Release|x64
{D771A06D-CC25-4582-B5CD-D2A4782BB005}.Release|x64.Build.0 = Release|x64
{8AAD7322-10B1-48C3-9BC7-005A7910C5E6}.Debug|x64.ActiveCfg = Debug|x64
@@ -26,6 +29,10 @@ Global
{307E5BAC-DA03-45D2-ADEC-FE6620090109}.Debug|x64.Build.0 = Debug|x64
{307E5BAC-DA03-45D2-ADEC-FE6620090109}.Release|x64.ActiveCfg = Release|x64
{307E5BAC-DA03-45D2-ADEC-FE6620090109}.Release|x64.Build.0 = Release|x64
{13489884-3A6A-4023-8CF1-D8C78DDAF952}.Debug|x64.ActiveCfg = Debug|x64
{13489884-3A6A-4023-8CF1-D8C78DDAF952}.Debug|x64.Build.0 = Debug|x64
{13489884-3A6A-4023-8CF1-D8C78DDAF952}.Release|x64.ActiveCfg = Release|x64
{13489884-3A6A-4023-8CF1-D8C78DDAF952}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE

View file

@@ -1,6 +1,10 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
@@ -20,17 +24,30 @@
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Label="Configuration" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<OutDir>$(SolutionDir)..\..\$(Platform)\$(ProjectName).$(Configuration)\</OutDir>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<OutDir>$(SolutionDir)..\..\$(Platform)\$(ProjectName).$(Configuration)\</OutDir>
<LinkIncremental>false</LinkIncremental>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level4</WarningLevel>
@@ -38,29 +55,65 @@
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(SolutionDir)..\..\Include</AdditionalIncludeDirectories>
<PreprocessorDefinitions>WIN32;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<TreatWarningAsError>true</TreatWarningAsError>
<SDLCheck>true</SDLCheck>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<FloatingPointModel>Fast</FloatingPointModel>
<OpenMPSupport>true</OpenMPSupport>
<UseDebugLibraries>false</UseDebugLibraries>
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalLibraryDirectories>$(SolutionDir)\..\..\cntk</AdditionalLibraryDirectories>
<AdditionalDependencies>EvalDll.lib;%(AdditionalDependencies)</AdditionalDependencies>
<OutputFile>$(OutDir)$(TargetName)$(TargetExt)</OutputFile>
<Profile>true</Profile>
<Profile>false</Profile>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<WarningLevel>Level4</WarningLevel>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<Optimization>Disabled</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<TreatWarningAsError>true</TreatWarningAsError>
<SDLCheck>true</SDLCheck>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<FloatingPointModel>Fast</FloatingPointModel>
<OpenMPSupport>true</OpenMPSupport>
<UseDebugLibraries>true</UseDebugLibraries>
<RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<OutputFile>$(OutDir)$(TargetName)$(TargetExt)</OutputFile>
<Profile>false</Profile>
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="CPPEvalClient.cpp" />
</ItemGroup>
<ItemGroup>
<None Include="packages.config" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta11\build\native\Microsoft.Research.CNTK.CpuEval-mkl.targets" Condition="Exists('..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta11\build\native\Microsoft.Research.CNTK.CpuEval-mkl.targets')" />
</ImportGroup>
</Project>
<Target Name="EnsureNuGetPackageBuildImports" BeforeTargets="PrepareForBuild">
<PropertyGroup>
<ErrorText>This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}.</ErrorText>
</PropertyGroup>
<Error Condition="!Exists('..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta11\build\native\Microsoft.Research.CNTK.CpuEval-mkl.targets')" Text="$([System.String]::Format('$(ErrorText)', '..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta11\build\native\Microsoft.Research.CNTK.CpuEval-mkl.targets'))" />
</Target>
</Project>

View file

@@ -19,4 +19,7 @@
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
</Project>
<ItemGroup>
<None Include="packages.config" />
</ItemGroup>
</Project>

View file

@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="Microsoft.Research.CNTK.CpuEval-mkl" version="2.0-beta11" targetFramework="native" />
</packages>

View file

@@ -1,6 +1,10 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
@@ -20,17 +24,30 @@
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<OutDir>$(SolutionDir)..\..\$(Platform)\$(ProjectName).$(Configuration)\</OutDir>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<OutDir>$(SolutionDir)..\..\$(Platform)\$(ProjectName).$(Configuration)\</OutDir>
<LinkIncremental>false</LinkIncremental>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level4</WarningLevel>
@@ -38,29 +55,63 @@
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(SolutionDir)..\..\Include</AdditionalIncludeDirectories>
<PreprocessorDefinitions>WIN32;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<TreatWarningAsError>true</TreatWarningAsError>
<SDLCheck>true</SDLCheck>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<FloatingPointModel>Fast</FloatingPointModel>
<OpenMPSupport>true</OpenMPSupport>
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalLibraryDirectories>$(SolutionDir)\..\..\cntk</AdditionalLibraryDirectories>
<AdditionalDependencies>EvalDll.lib;%(AdditionalDependencies)</AdditionalDependencies>
<OutputFile>$(OutDir)$(TargetName)$(TargetExt)</OutputFile>
<Profile>true</Profile>
<Profile>false</Profile>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ClCompile>
<WarningLevel>Level4</WarningLevel>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<Optimization>Disabled</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<TreatWarningAsError>true</TreatWarningAsError>
<SDLCheck>true</SDLCheck>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<FloatingPointModel>Fast</FloatingPointModel>
<OpenMPSupport>true</OpenMPSupport>
<RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
<BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
<DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<OutputFile>$(OutDir)$(TargetName)$(TargetExt)</OutputFile>
<Profile>false</Profile>
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="CPPEvalExtendedClient.cpp" />
</ItemGroup>
<ItemGroup>
<None Include="packages.config" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta11\build\native\Microsoft.Research.CNTK.CpuEval-mkl.targets" Condition="Exists('..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta11\build\native\Microsoft.Research.CNTK.CpuEval-mkl.targets')" />
</ImportGroup>
</Project>
<Target Name="EnsureNuGetPackageBuildImports" BeforeTargets="PrepareForBuild">
<PropertyGroup>
<ErrorText>This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}.</ErrorText>
</PropertyGroup>
<Error Condition="!Exists('..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta11\build\native\Microsoft.Research.CNTK.CpuEval-mkl.targets')" Text="$([System.String]::Format('$(ErrorText)', '..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta11\build\native\Microsoft.Research.CNTK.CpuEval-mkl.targets'))" />
</Target>
</Project>

View file

@@ -19,4 +19,7 @@
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<None Include="packages.config" />
</ItemGroup>
</Project>

View file

@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="Microsoft.Research.CNTK.CpuEval-mkl" version="2.0-beta11" targetFramework="native" />
</packages>

View file

@@ -95,4 +95,4 @@
</PropertyGroup>
<Error Condition="!Exists('..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta11\build\net45\Microsoft.Research.CNTK.CpuEval-mkl.targets')" Text="$([System.String]::Format('$(ErrorText)', '..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta11\build\net45\Microsoft.Research.CNTK.CpuEval-mkl.targets'))" />
</Target>
</Project>
</Project>

View file

@@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="Microsoft.Research.CNTK.CpuEval-mkl" version="2.0-beta11" targetFramework="net45" />
</packages>
</packages>

View file

@@ -15,14 +15,16 @@ Global
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{C81CE839-184C-42C7-BB1C-9D0ABA17078D}.Debug|x64.ActiveCfg = Release|x64
{C81CE839-184C-42C7-BB1C-9D0ABA17078D}.Debug|x64.ActiveCfg = Debug|x64
{C81CE839-184C-42C7-BB1C-9D0ABA17078D}.Debug|x64.Build.0 = Debug|x64
{C81CE839-184C-42C7-BB1C-9D0ABA17078D}.Release|x64.ActiveCfg = Release|x64
{C81CE839-184C-42C7-BB1C-9D0ABA17078D}.Release|x64.Build.0 = Release|x64
{92CCF4B9-BFED-4914-901A-CF1327B1A02D}.Debug|x64.ActiveCfg = Debug|x64
{92CCF4B9-BFED-4914-901A-CF1327B1A02D}.Debug|x64.Build.0 = Debug|x64
{92CCF4B9-BFED-4914-901A-CF1327B1A02D}.Release|x64.ActiveCfg = Release|x64
{92CCF4B9-BFED-4914-901A-CF1327B1A02D}.Release|x64.Build.0 = Release|x64
{93ECB70B-FDDD-44B4-BD6A-D63E094C704B}.Debug|x64.ActiveCfg = Release|x64
{93ECB70B-FDDD-44B4-BD6A-D63E094C704B}.Debug|x64.ActiveCfg = Debug|x64
{93ECB70B-FDDD-44B4-BD6A-D63E094C704B}.Debug|x64.Build.0 = Debug|x64
{93ECB70B-FDDD-44B4-BD6A-D63E094C704B}.Release|x64.ActiveCfg = Release|x64
{93ECB70B-FDDD-44B4-BD6A-D63E094C704B}.Release|x64.Build.0 = Release|x64
EndGlobalSection

View file

@@ -16,6 +16,7 @@ from cntk.utils import *
from cntk.ops import *
from cntk.distributed import data_parallel_distributed_learner, Communicator
from cntk.io import ImageDeserializer, MinibatchSource, StreamDef, StreamDefs, FULL_DATA_SWEEP
import cntk.io.transforms as xforms
from cntk.layers import Placeholder, Block, Convolution2D, Activation, MaxPooling, Dense, Dropout, default_options, Sequential
from cntk.initializer import normal
@@ -41,15 +42,15 @@ def create_image_mb_source(map_file, is_training, total_number_of_samples):
transforms = []
if is_training:
transforms += [
ImageDeserializer.crop(crop_type='randomside', side_ratio=0.88671875, jitter_type='uniratio') # train uses jitter
xforms.crop(crop_type='randomside', side_ratio=0.88671875, jitter_type='uniratio') # train uses jitter
]
else:
else:
transforms += [
ImageDeserializer.crop(crop_type='center', side_ratio=0.88671875) # test has no jitter
xforms.crop(crop_type='center', side_ratio=0.88671875) # test has no jitter
]
transforms += [
ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
xforms.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
]
# deserializer
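
The recurring change in this file (and in the other Python examples below) replaces the static ImageDeserializer.crop/scale/mean helpers with the standalone cntk.io.transforms module, imported as xforms. A minimal sketch of the new-style call follows; the 224x224x3 input size is an illustrative assumption, since the scripts take these values from their image_width/image_height/num_channels constants:

import cntk.io.transforms as xforms

# Sketch of the new transform API (224x224x3 is an assumed input size).
transforms = [
    xforms.crop(crop_type='randomside', side_ratio=0.88671875, jitter_type='uniratio'),  # random crop with jitter, training only
    xforms.scale(width=224, height=224, channels=3, interpolations='linear'),            # resize to the network input size
]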
@@ -57,27 +58,27 @@ def create_image_mb_source(map_file, is_training, total_number_of_samples):
ImageDeserializer(map_file, StreamDefs(
features = StreamDef(field='image', transforms=transforms), # first column in map file is referred to as 'image'
labels = StreamDef(field='label', shape=num_classes))), # and second as 'label'
randomize = is_training,
randomize = is_training,
epoch_size=total_number_of_samples,
multithreaded_deserializer = True)
# Local Response Normalization layer. See Section 3.3 of the paper:
# https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf
# The mathematical equation is:
# Local Response Normalization layer. See Section 3.3 of the paper:
# https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf
# The mathematical equation is:
# b_{x,y}^i=a_{x,y}^i/(k+\alpha\sum_{j=max(0,i-n)}^{min(N-1, i+n)}(a_{x,y}^j)^2)^\beta
# where a_{x,y}^i is the activity of a neuron comoputed by applying kernel i at position (x,y)
# N is the total number of kernals, n is half normalization width.
def LocalResponseNormalization(k, n, alpha, beta, name=''):
x = cntk.blocks.Placeholder(name='lrn_arg')
x2 = cntk.ops.square(x)
# reshape to insert a fake singleton reduction dimension after the 3th axis (channel axis). Note Python axis order and BrainScript are reversed.
# N is the total number of kernals, n is half normalization width.
def LocalResponseNormalization(k, n, alpha, beta, name=''):
x = cntk.blocks.Placeholder(name='lrn_arg')
x2 = cntk.ops.square(x)
# reshape to insert a fake singleton reduction dimension after the 3th axis (channel axis). Note Python axis order and BrainScript are reversed.
x2s = cntk.ops.reshape(x2, (1, cntk.InferredDimension), 0, 1)
W = cntk.ops.constant(alpha/(2*n+1), (1,2*n+1,1,1), name='W')
# 3D convolution with a filter that has a non 1-size only in the 3rd axis, and does not reduce since the reduction dimension is fake and 1
y = cntk.ops.convolution (W, x2s)
# reshape back to remove the fake singleton reduction dimension
b = cntk.ops.reshape(y, cntk.InferredDimension, 0, 2)
den = cntk.ops.exp(beta * cntk.ops.log(k + b))
den = cntk.ops.exp(beta * cntk.ops.log(k + b))
apply_x = cntk.ops.element_divide(x, den)
return cntk.blocks.Block(apply_x, 'LocalResponseNormalization', name, make_block=True)
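
Since W is filled with alpha/(2*n+1), the block computes the denominator as k plus alpha times the mean of the squared activations over a window of 2n+1 channels. A NumPy reference of the same computation, a sketch that assumes zero padding at the channel borders like the auto-padded convolution above:

import numpy as np

def lrn_reference(a, k, n, alpha, beta):
    # a: activations of shape (C, H, W); mirrors the LRN block above.
    C = a.shape[0]
    sq = a ** 2
    out = np.empty_like(a)
    for i in range(C):
        lo, hi = max(0, i - n), min(C, i + n + 1)  # window of up to 2n+1 channels
        den = (k + alpha / (2 * n + 1) * sq[lo:hi].sum(axis=0)) ** beta
        out[i] = a[i] / den
    return out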
@@ -89,35 +90,35 @@ def create_alexnet():
label_var = input_variable((num_classes))
# apply model to input
# remove mean value
# remove mean value
input = minus(feature_var, constant(114), name='mean_removed_input')
with default_options(activation=None, pad=True, bias=True):
z = Sequential([
# we separate Convolution and ReLU to name the output for feature extraction (usually before ReLU)
Convolution2D((11,11), 96, init=normal(0.01), pad=False, strides=(4,4), name='conv1'),
Activation(activation=relu, name='relu1'),
# we separate Convolution and ReLU to name the output for feature extraction (usually before ReLU)
Convolution2D((11,11), 96, init=normal(0.01), pad=False, strides=(4,4), name='conv1'),
Activation(activation=relu, name='relu1'),
LocalResponseNormalization(1.0, 2, 0.0001, 0.75, name='norm1'),
MaxPooling((3,3), (2,2), name='pool1'),
Convolution2D((5,5), 192, init=normal(0.01), init_bias=0.1, name='conv2'),
Activation(activation=relu, name='relu2'),
Convolution2D((5,5), 192, init=normal(0.01), init_bias=0.1, name='conv2'),
Activation(activation=relu, name='relu2'),
LocalResponseNormalization(1.0, 2, 0.0001, 0.75, name='norm2'),
MaxPooling((3,3), (2,2), name='pool2'),
Convolution2D((3,3), 384, init=normal(0.01), name='conv3'),
Activation(activation=relu, name='relu3'),
Convolution2D((3,3), 384, init=normal(0.01), init_bias=0.1, name='conv4'),
Activation(activation=relu, name='relu4'),
Convolution2D((3,3), 256, init=normal(0.01), init_bias=0.1, name='conv5'),
Activation(activation=relu, name='relu5'),
MaxPooling((3,3), (2,2), name='pool5'),
Dense(4096, init=normal(0.005), init_bias=0.1, name='fc6'),
Activation(activation=relu, name='relu6'),
Dropout(0.5, name='drop6'),
Dense(4096, init=normal(0.005), init_bias=0.1, name='fc7'),
Activation(activation=relu, name='relu7'),
Convolution2D((3,3), 384, init=normal(0.01), name='conv3'),
Activation(activation=relu, name='relu3'),
Convolution2D((3,3), 384, init=normal(0.01), init_bias=0.1, name='conv4'),
Activation(activation=relu, name='relu4'),
Convolution2D((3,3), 256, init=normal(0.01), init_bias=0.1, name='conv5'),
Activation(activation=relu, name='relu5'),
MaxPooling((3,3), (2,2), name='pool5'),
Dense(4096, init=normal(0.005), init_bias=0.1, name='fc6'),
Activation(activation=relu, name='relu6'),
Dropout(0.5, name='drop6'),
Dense(4096, init=normal(0.005), init_bias=0.1, name='fc7'),
Activation(activation=relu, name='relu7'),
Dropout(0.5, name='drop7'),
Dense(num_classes, init=normal(0.01), name='fc8')
])(input)
@@ -134,7 +135,7 @@ def create_alexnet():
'label': label_var,
'ce' : ce,
'pe' : pe,
'pe5': pe5,
'pe5': pe5,
'output': z
}
@@ -145,10 +146,10 @@ def create_trainer(network, epoch_size, num_quantization_bits):
lr_schedule = cntk.learning_rate_schedule(lr_per_mb, unit=cntk.learner.UnitType.minibatch, epoch_size=epoch_size)
mm_schedule = cntk.learner.momentum_schedule(0.9)
l2_reg_weight = 0.0005 # CNTK L2 regularization is per sample, thus same as Caffe
# Create learner
local_learner = cntk.learner.momentum_sgd(network['output'].parameters, lr_schedule, mm_schedule, unit_gain=False, l2_regularization_weight=l2_reg_weight)
# Since we reuse parameter settings (learning rate, momentum) from Caffe, we set unit_gain to False to ensure consistency
# Since we reuse parameter settings (learning rate, momentum) from Caffe, we set unit_gain to False to ensure consistency
parameter_learner = data_parallel_distributed_learner(
local_learner,
num_quantization_bits=num_quantization_bits,
@@ -167,25 +168,25 @@ def train_and_test(network, trainer, train_source, test_source, progress_printer
}
training_session = cntk.training_session(
training_minibatch_source = train_source,
training_minibatch_source = train_source,
trainer = trainer,
model_inputs_to_mb_source_mapping = input_map,
mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
progress_printer = progress_printer,
model_inputs_to_mb_source_mapping = input_map,
mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
progress_printer = progress_printer,
# checkpoint_frequency = epoch_size,
checkpoint_filename = os.path.join(model_path, model_name),
checkpoint_filename = os.path.join(model_path, model_name),
# save_all_checkpoints = True,
progress_frequency = epoch_size,
cv_source = test_source,
progress_frequency = epoch_size,
cv_source = test_source,
cv_mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
# cv_frequency = epoch_size,
restore = restore)
# Train all minibatches
# Train all minibatches
training_session.train()
# Train and evaluate the network.
def alexnet_train_and_eval(train_data, test_data, num_quantization_bits=32, minibatch_size=256, epoch_size = 1281167, max_epochs=112,
def alexnet_train_and_eval(train_data, test_data, num_quantization_bits=32, minibatch_size=256, epoch_size = 1281167, max_epochs=112,
restore=True, log_to_file=None, num_mbs_per_log=None, gen_heartbeat=True):
_cntk_py.set_computation_network_trace_level(0)
@@ -202,10 +203,10 @@ def alexnet_train_and_eval(train_data, test_data, num_quantization_bits=32, mini
train_source = create_image_mb_source(train_data, True, total_number_of_samples=max_epochs * epoch_size)
test_source = create_image_mb_source(test_data, False, total_number_of_samples=FULL_DATA_SWEEP)
train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore)
if __name__=='__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-datadir', '--datadir', help='Data directory where the ImageNet dataset is located', required=False, default=data_path)
@@ -233,8 +234,8 @@ if __name__=='__main__':
test_data=os.path.join(data_path, 'val_map.txt')
try:
alexnet_train_and_eval(train_data, test_data,
minibatch_size=args['minibatch_size'],
alexnet_train_and_eval(train_data, test_data,
minibatch_size=args['minibatch_size'],
epoch_size=args['epoch_size'],
num_quantization_bits=args['quantized_bits'],
max_epochs=args['num_epochs'],
@@ -243,4 +244,4 @@ if __name__=='__main__':
num_mbs_per_log=200,
gen_heartbeat=True)
finally:
cntk.distributed.Communicator.finalize()
cntk.distributed.Communicator.finalize()

View file

@@ -32,7 +32,7 @@ TrainConvNet = {
x2s = SplitDimension(x2, 3, 1)
# 3D convolution with a filter that has a non 1-size only in the 3rd axis, and does not reduce since the reduction dimension is fake and 1
W = ParameterTensor{(1:1:2*n+1:1), learningRateMultiplier = 0, initValue = alpha/(2*n+1)}
y = Convolution (W, x2s, (1:1:2*n+1), mapDims = 1, stride = 1, sharing = true, autoPadding = true, lowerPad = 0, upperPad = 0, transpose = false, maxTempMemSizeInSamples = 0)
y = Convolution (W, x2s, (1:1:2*n+1), mapDims = 1, stride = 1, sharing = true, autoPadding = true, lowerPad = 0, upperPad = 0, maxTempMemSizeInSamples = 0)
# reshape back to remove the fake singleton reduction dimension
b = FlattenDimensions(y, 3, 2)
den = Exp (beta .* Log(k + b))

View file

@@ -10,8 +10,9 @@ import math
import numpy as np
import cntk
import _cntk_py
import cntk.io.transforms as xforms
# Paths relative to current python file.
# Paths relative to current python file.
abs_path = os.path.dirname(os.path.abspath(__file__))
data_path = os.path.join(abs_path, "..", "..", "..", "DataSets", "CIFAR-10")
model_path = os.path.join(abs_path, "Models")
@@ -32,11 +33,11 @@ def create_reader(map_file, mean_file, is_training):
transforms = []
if is_training:
transforms += [
cntk.io.ImageDeserializer.crop(crop_type='RandomSide', side_ratio=0.8, jitter_type='uniRatio') # train uses jitter
xforms.crop(crop_type='RandomSide', side_ratio=0.8, jitter_type='uniRatio') # train uses jitter
]
transforms += [
cntk.io.ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
cntk.io.ImageDeserializer.mean(mean_file)
xforms.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
xforms.mean(mean_file)
]
# deserializer
return cntk.io.MinibatchSource(cntk.io.ImageDeserializer(map_file, cntk.io.StreamDefs(
@@ -44,23 +45,23 @@ def create_reader(map_file, mean_file, is_training):
labels = cntk.io.StreamDef(field='label', shape=num_classes))), # and second as 'label'
randomize=is_training)
# Local Response Normalization layer. See Section 3.3 of the paper:
# https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf
# The mathematical equation is:
# Local Response Normalization layer. See Section 3.3 of the paper:
# https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf
# The mathematical equation is:
# b_{x,y}^i=a_{x,y}^i/(k+\alpha\sum_{j=max(0,i-n)}^{min(N-1, i+n)}(a_{x,y}^j)^2)^\beta
# where a_{x,y}^i is the activity of a neuron comoputed by applying kernel i at position (x,y)
# N is the total number of kernals, n is half normalization width.
def LocalResponseNormalization(k, n, alpha, beta, name=''):
x = cntk.blocks.Placeholder(name='lrn_arg')
x2 = cntk.ops.square(x)
# reshape to insert a fake singleton reduction dimension after the 3th axis (channel axis). Note Python axis order and BrainScript are reversed.
# N is the total number of kernals, n is half normalization width.
def LocalResponseNormalization(k, n, alpha, beta, name=''):
x = cntk.blocks.Placeholder(name='lrn_arg')
x2 = cntk.ops.square(x)
# reshape to insert a fake singleton reduction dimension after the 3th axis (channel axis). Note Python axis order and BrainScript are reversed.
x2s = cntk.ops.reshape(x2, (1, cntk.InferredDimension), 0, 1)
W = cntk.ops.constant(alpha/(2*n+1), (1,2*n+1,1,1), name='W')
# 3D convolution with a filter that has a non 1-size only in the 3rd axis, and does not reduce since the reduction dimension is fake and 1
y = cntk.ops.convolution (W, x2s)
# reshape back to remove the fake singleton reduction dimension
b = cntk.ops.reshape(y, cntk.InferredDimension, 0, 2)
den = cntk.ops.exp(beta * cntk.ops.log(k + b))
den = cntk.ops.exp(beta * cntk.ops.log(k + b))
apply_x = cntk.ops.element_divide(x, den)
return cntk.blocks.Block(apply_x, 'LocalResponseNormalization', name, make_block=True)
@@ -75,18 +76,18 @@ def convnetlrn_cifar10_dataaug(reader_train, reader_test, epoch_size=50000, max_
# apply model to input
scaled_input = cntk.ops.element_times(cntk.ops.constant(0.00390625), input_var)
with cntk.layers.default_options (activation=cntk.ops.relu, pad=True):
with cntk.layers.default_options (activation=cntk.ops.relu, pad=True):
z = cntk.models.Sequential([
cntk.models.For(range(2), lambda : [
cntk.layers.Convolution2D((3,3), 64),
cntk.layers.Convolution2D((3,3), 64),
cntk.layers.Convolution2D((3,3), 64),
cntk.layers.Convolution2D((3,3), 64),
LocalResponseNormalization (1.0, 4, 0.001, 0.75),
cntk.layers.MaxPooling((3,3), (2,2))
]),
]),
cntk.models.For(range(2), lambda i: [
cntk.layers.Dense([256,128][i]),
cntk.layers.Dense([256,128][i]),
cntk.layers.Dropout(0.5)
]),
]),
cntk.layers.Dense(num_classes, activation=None)
])(scaled_input)
@@ -103,7 +104,7 @@ def convnetlrn_cifar10_dataaug(reader_train, reader_test, epoch_size=50000, max_
mm_time_constant = [0]*20 + [600]*20 + [1200]
mm_schedule = cntk.learner.momentum_as_time_constant_schedule(mm_time_constant, epoch_size=epoch_size)
l2_reg_weight = 0.002
# trainer object
learner = cntk.learner.momentum_sgd(z.parameters, lr_schedule, mm_schedule,
unit_gain = True,
@@ -117,7 +118,7 @@ def convnetlrn_cifar10_dataaug(reader_train, reader_test, epoch_size=50000, max_
}
cntk.utils.log_number_of_parameters(z) ; print()
progress_printer = cntk.utils.ProgressPrinter(tag='Training')
progress_printer = cntk.utils.ProgressPrinter(tag='Training', num_epochs=max_epochs)
# perform model training
for epoch in range(max_epochs): # loop over epochs
@@ -130,7 +131,7 @@ def convnetlrn_cifar10_dataaug(reader_train, reader_test, epoch_size=50000, max_
progress_printer.epoch_summary(with_metric=True)
z.save(os.path.join(model_path, "ConvNet_CIFAR10_DataAug_{}.dnn".format(epoch)))
### Evaluation action
epoch_size = 10000
minibatch_size = 16

View file

@@ -84,10 +84,10 @@ def convnet_cifar10(debug_output=False):
}
cntk.utils.log_number_of_parameters(z) ; print()
progress_printer = cntk.utils.ProgressPrinter(tag='Training')
max_epochs = 30
progress_printer = cntk.utils.ProgressPrinter(tag='Training', num_epochs=max_epochs)
# Get minibatches of images to train with and perform model training
max_epochs = 30
for epoch in range(max_epochs): # loop over epochs
sample_count = 0
while sample_count < epoch_size: # loop over minibatches in the epoch

View file

@@ -8,6 +8,9 @@ from __future__ import print_function
import os
import math
import numpy as np
import cntk
import _cntk_py
import cntk.io.transforms as xforms
from cntk.layers import Convolution2D, MaxPooling, AveragePooling, Dropout, BatchNormalization, Dense, default_options, Placeholder, identity, Sequential, For
from cntk.layers.typing import *
@@ -47,11 +50,11 @@ def create_reader(map_file, mean_file, is_training):
transforms = []
if is_training:
transforms += [
ImageDeserializer.crop(crop_type='randomside', side_ratio=0.8, jitter_type='uniratio') # train uses jitter
xforms.crop(crop_type='randomside', side_ratio=0.8, jitter_type='uniratio') # train uses jitter
]
transforms += [
ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
ImageDeserializer.mean(mean_file)
xforms.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
xforms.mean(mean_file)
]
# deserializer
return MinibatchSource(ImageDeserializer(map_file, StreamDefs(
@@ -142,6 +145,10 @@ def train_and_evaluate(reader, reader_test, model, epoch_size=50000, max_epochs=
# TODO: we should be done here
#return metric_numer/metric_denom
progress_printer.epoch_summary(with_metric=True)
z.save(os.path.join(model_path, "ConvNet_CIFAR10_DataAug_{}.dnn".format(epoch)))
### Evaluation action
# evaluate with current Trainer instance; just to make sure we save and load the model correctly and BN works now --TODO: delete once confirmed
epoch_size = 10000

View file

@@ -11,6 +11,7 @@ import argparse
import numpy as np
import cntk
import _cntk_py
import cntk.io.transforms as xforms
# default Paths relative to current python file.
abs_path = os.path.dirname(os.path.abspath(__file__))
@@ -32,12 +33,12 @@ def create_image_mb_source(map_file, mean_file, train, total_number_of_samples):
transforms = []
if train:
transforms += [
cntk.io.ImageDeserializer.crop(crop_type='randomside', side_ratio=0.8, jitter_type='uniratio') # train uses jitter
xforms.crop(crop_type='randomside', side_ratio=0.8, jitter_type='uniratio') # train uses jitter
]
transforms += [
cntk.io.ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
cntk.io.ImageDeserializer.mean(mean_file)
xforms.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
xforms.mean(mean_file)
]
# deserializer
@@ -45,7 +46,7 @@ def create_image_mb_source(map_file, mean_file, train, total_number_of_samples):
cntk.io.ImageDeserializer(map_file, cntk.io.StreamDefs(
features = cntk.io.StreamDef(field='image', transforms=transforms), # first column in map file is referred to as 'image'
labels = cntk.io.StreamDef(field='label', shape=num_classes))), # and second as 'label'
randomize=train,
randomize=train,
epoch_size=total_number_of_samples,
multithreaded_deserializer = True)
@@ -58,18 +59,18 @@ def create_conv_network():
# apply model to input
scaled_input = cntk.ops.element_times(cntk.ops.constant(0.00390625), feature_var)
with cntk.layers.default_options(activation=cntk.ops.relu, pad=True):
z = cntk.models.Sequential([
cntk.models.For(range(2), lambda : [
cntk.layers.Convolution2D((3,3), 64),
cntk.layers.Convolution2D((3,3), 64),
cntk.layers.MaxPooling((3,3), (2,2))
]),
]),
cntk.models.For(range(2), lambda i: [
cntk.layers.Dense([256,128][i]),
cntk.layers.Dense([256,128][i]),
cntk.layers.Dropout(0.5)
]),
]),
cntk.layers.Dense(num_classes, activation=None)
])(scaled_input)
@@ -96,13 +97,13 @@ def create_trainer(network, epoch_size, num_quantization_bits, block_size, warm_
mm_time_constant = [0]*20 + [600]*20 + [1200]
mm_schedule = cntk.learner.momentum_as_time_constant_schedule(mm_time_constant, epoch_size=epoch_size)
l2_reg_weight = 0.002
# Create learner
if block_size != None and num_quantization_bits != 32:
raise RuntimeError("Block momentum cannot be used with quantization, please remove quantized_bits option.")
local_learner = cntk.learner.momentum_sgd(network['output'].parameters,
lr_schedule, mm_schedule,
local_learner = cntk.learner.momentum_sgd(network['output'].parameters,
lr_schedule, mm_schedule,
l2_regularization_weight=l2_reg_weight)
if block_size != None:
@@ -125,12 +126,12 @@ def train_and_test(network, trainer, train_source, test_source, progress_printer
training_session = cntk.training_session(
training_minibatch_source = train_source,
trainer = trainer,
model_inputs_to_mb_source_mapping = input_map,
model_inputs_to_mb_source_mapping = input_map,
mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
progress_printer = progress_printer,
checkpoint_frequency = epoch_size,
progress_printer = progress_printer,
checkpoint_frequency = epoch_size,
checkpoint_filename = os.path.join(model_path, "ConvNet_CIFAR10_DataAug"),
# save_all_checkpoints = False,
# save_all_checkpoints = False,
progress_frequency=epoch_size,
cv_source = test_source,
cv_mb_size_schedule=cntk.minibatch_size_schedule(minibatch_size),
@@ -147,8 +148,8 @@ def train_and_test(network, trainer, train_source, test_source, progress_printer
cntk.stop_profiler()
# Train and evaluate the network.
def convnet_cifar10_dataaug(train_data, test_data, mean_data, minibatch_size=64, epoch_size=50000, num_quantization_bits=32,
block_size=3200, warm_up=0, max_epochs=2, restore=False, log_to_file=None,
def convnet_cifar10_dataaug(train_data, test_data, mean_data, minibatch_size=64, epoch_size=50000, num_quantization_bits=32,
block_size=3200, warm_up=0, max_epochs=2, restore=False, log_to_file=None,
num_mbs_per_log=None, gen_heartbeat=False, profiling=False):
_cntk_py.set_computation_network_trace_level(0)
@ -165,10 +166,10 @@ def convnet_cifar10_dataaug(train_data, test_data, mean_data, minibatch_size=64,
train_source = create_image_mb_source(train_data, mean_data, train=True, total_number_of_samples=max_epochs * epoch_size)
test_source = create_image_mb_source(test_data, mean_data, train=False, total_number_of_samples=cntk.io.FULL_DATA_SWEEP)
train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore, profiling)
if __name__=='__main__':
parser = argparse.ArgumentParser()
data_path = os.path.join(abs_path, "..", "..", "..", "DataSets", "CIFAR-10")
@@ -201,8 +202,8 @@ if __name__=='__main__':
test_data=os.path.join(data_path, 'test_map.txt')
try:
convnet_cifar10_dataaug(train_data, test_data, mean_data,
minibatch_size=args['minibatch_size'],
convnet_cifar10_dataaug(train_data, test_data, mean_data,
minibatch_size=args['minibatch_size'],
epoch_size=args['epoch_size'],
num_quantization_bits=args['quantized_bits'],
block_size=args['block_samples'],

View file

@@ -74,10 +74,10 @@ def convnet_mnist(debug_output=False):
}
cntk.utils.log_number_of_parameters(z) ; print()
progress_printer = cntk.utils.ProgressPrinter(tag='Training')
max_epochs = 40
progress_printer = cntk.utils.ProgressPrinter(tag='Training', num_epochs=max_epochs)
# Get minibatches of images to train with and perform model training
max_epochs = 40
for epoch in range(max_epochs): # loop over epochs
sample_count = 0
while sample_count < epoch_size: # loop over minibatches in the epoch


@ -13,7 +13,6 @@ modelPath = "$outputDir$/Models/ResNet_101"
stderr = "$outputDir$/ResNet_101_BS_out"
parallelTrain = true
hyperCompressMemory = true
TrainNetwork = {
action = "train"


@ -13,7 +13,6 @@ modelPath = "$outputDir$/Models/ResNet_152"
stderr = "$outputDir$/ResNet_152_BS_out"
parallelTrain = true
hyperCompressMemory = true
TrainNetwork = {
action = "train"


@ -13,6 +13,7 @@ import numpy as np
from cntk.utils import *
from cntk.ops import input_variable, cross_entropy_with_softmax, classification_error
from cntk.io import MinibatchSource, ImageDeserializer, StreamDef, StreamDefs
import cntk.io.transforms as xforms
from cntk import Trainer, cntk_py
from cntk.learner import momentum_sgd, learning_rate_schedule, momentum_as_time_constant_schedule, UnitType
from _cntk_py import set_computation_network_trace_level
@ -40,11 +41,11 @@ def create_reader(map_file, mean_file, train):
transforms = []
if train:
transforms += [
ImageDeserializer.crop(crop_type='randomside', side_ratio=0.8, jitter_type='uniratio') # train uses jitter
xforms.crop(crop_type='randomside', side_ratio=0.8, jitter_type='uniratio') # train uses jitter
]
transforms += [
ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
ImageDeserializer.mean(mean_file)
xforms.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
xforms.mean(mean_file)
]
# deserializer
return MinibatchSource(ImageDeserializer(map_file, StreamDefs(
@ -61,21 +62,21 @@ def train_and_evaluate(reader_train, reader_test, network_name, epoch_size, max_
input_var = input_variable((num_channels, image_height, image_width))
label_var = input_variable((num_classes))
# create model, and configure learning parameters
if network_name == 'resnet20':
z = create_cifar10_model(input_var, 3, num_classes)
lr_per_mb = [1.0]*80+[0.1]*40+[0.01]
elif network_name == 'resnet110':
z = create_cifar10_model(input_var, 18, num_classes)
lr_per_mb = [0.1]*1+[1.0]*80+[0.1]*40+[0.01]
else:
raise RuntimeError("Unknown model name!")
# loss and metric
ce = cross_entropy_with_softmax(z, label_var)
pe = classification_error(z, label_var)
# shared training parameters
minibatch_size = 128
momentum_time_constant = -minibatch_size/np.log(0.9)
l2_reg_weight = 0.0001
@ -84,7 +85,7 @@ def train_and_evaluate(reader_train, reader_test, network_name, epoch_size, max_
lr_per_sample = [lr/minibatch_size for lr in lr_per_mb]
lr_schedule = learning_rate_schedule(lr_per_sample, epoch_size=epoch_size, unit=UnitType.sample)
mm_schedule = momentum_as_time_constant_schedule(momentum_time_constant)
# trainer object
learner = momentum_sgd(z.parameters, lr_schedule, mm_schedule,
l2_regularization_weight = l2_reg_weight)
@ -97,13 +98,13 @@ def train_and_evaluate(reader_train, reader_test, network_name, epoch_size, max_
}
log_number_of_parameters(z) ; print()
progress_printer = ProgressPrinter(tag='Training')
progress_printer = ProgressPrinter(tag='Training', num_epochs=max_epochs)
# perform model training
if profiler_dir:
start_profiler(profiler_dir, True)
for epoch in range(max_epochs): # loop over epochs
sample_count = 0
while sample_count < epoch_size: # loop over minibatches in the epoch
@ -114,10 +115,10 @@ def train_and_evaluate(reader_train, reader_test, network_name, epoch_size, max_
progress_printer.epoch_summary(with_metric=True)
z.save(os.path.join(model_path, network_name + "_{}.dnn".format(epoch)))
enable_profiler() # begin to collect profiler data after first epoch
if profiler_dir:
stop_profiler()
# Evaluation parameters
test_epoch_size = 10000
minibatch_size = 16
@ -154,7 +155,7 @@ if __name__=='__main__':
args = vars(parser.parse_args())
epochs = int(args['epochs'])
network_name = args['network']
reader_train = create_reader(os.path.join(data_path, 'train_map.txt'), os.path.join(data_path, 'CIFAR-10_mean.xml'), True)
reader_test = create_reader(os.path.join(data_path, 'test_map.txt'), os.path.join(data_path, 'CIFAR-10_mean.xml'), False)


@ -26,7 +26,6 @@ ImageC = 3
NumLabels = 1000
parallelTrain = true
hyperCompressMemory = true
################################
Train = {


@ -26,7 +26,6 @@ ImageC = 3
NumLabels = 1000
parallelTrain = true
hyperCompressMemory = true
################################
Train = {


@ -32,8 +32,6 @@ num_channels = 3 # RGB
num_classes = 1000
model_name = "VGG16.model"
cntk.cntk_py.enable_hyper_memory_compress()
# Create a minibatch source.
def create_image_mb_source(map_file, is_training, total_number_of_samples):
if not os.path.exists(map_file):


@ -32,8 +32,6 @@ num_channels = 3 # RGB
num_classes = 1000
model_name = "VGG19.model"
cntk.cntk_py.enable_hyper_memory_compress()
# Create a minibatch source.
def create_image_mb_source(map_file, is_training, total_number_of_samples):
if not os.path.exists(map_file):


@ -7,6 +7,9 @@
from __future__ import print_function
import zipfile
import os
from sys import platform
import shutil
try:
from urllib.request import urlretrieve
except ImportError:
@ -26,6 +29,15 @@ def download_grocery_data():
print('Extracting ' + filename + '...')
with zipfile.ZipFile(filename) as myzip:
myzip.extractall(dataset_folder)
if platform != "win32":
testfile = os.path.join(dataset_folder, "grocery", "test.txt")
unixfile = os.path.join(dataset_folder, "grocery", "test_unix.txt")
out = open(unixfile, 'w')
with open(testfile) as f:
for line in f:
out.write(line.replace('\\', '/'))
out.close()
shutil.move(unixfile, testfile)
finally:
os.remove(filename)
print('Done.')
@ -34,4 +46,4 @@ def download_grocery_data():
if __name__ == "__main__":
download_grocery_data()


@ -9,18 +9,15 @@ import os
import numpy as np
from cntk import load_model, graph
from cntk.ops import combine
from cntk.io import MinibatchSource, ImageDeserializer, StreamDef, StreamDefs
from cntk import graph
from cntk.graph import get_node_outputs
import cntk.io.transforms as xforms
def create_mb_source(image_height, image_width, num_channels, map_file):
transforms = [ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear')]
image_source = ImageDeserializer(map_file)
image_source.ignore_labels()
image_source.map_features('features', transforms)
return MinibatchSource(image_source, randomize=False)
transforms = [xforms.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear')]
return MinibatchSource(ImageDeserializer(map_file,
StreamDefs(features=StreamDef(field='image', transforms=transforms))), # first column in map file is referred to as 'image'
randomize=False) # second column is labels and is ignored
def eval_and_write(model_file, node_name, output_file, minibatch_source, num_objects):


@ -12,7 +12,8 @@ from cntk.device import set_default_device, gpu
from cntk import load_model, Trainer, UnitType
from cntk.layers import Placeholder, Constant
from cntk.graph import find_by_name, get_node_outputs
from cntk.io import MinibatchSource, ImageDeserializer
from cntk.io import MinibatchSource, ImageDeserializer, StreamDefs, StreamDef
import cntk.io.transforms as xforms
from cntk.layers import Dense
from cntk.learner import momentum_sgd, learning_rate_schedule, momentum_schedule
from cntk.ops import input_variable, cross_entropy_with_softmax, classification_error, combine, softmax
@ -58,11 +59,11 @@ _num_classes = 102
# Creates a minibatch source for training or testing
def create_mb_source(map_file, image_width, image_height, num_channels, num_classes, randomize=True):
transforms = [ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear')]
image_source = ImageDeserializer(map_file)
image_source.map_features(features_stream_name, transforms)
image_source.map_labels(label_stream_name, num_classes)
return MinibatchSource(image_source, randomize=randomize)
transforms = [xforms.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear')]
return MinibatchSource(ImageDeserializer(map_file, StreamDefs(
features =StreamDef(field='image', transforms=transforms),
labels =StreamDef(field='label', shape=num_classes))),
randomize=randomize)
# Creates the network model for transfer learning


@ -18,6 +18,10 @@ sys.path.append(os.path.join(base_folder, "..", "DataSets", "Animals"))
from install_animals import download_animals_data
download_animals_data()
sys.path.append(os.path.join(base_folder, "..", "DataSets", "Grocery"))
from install_grocery import download_grocery_data
download_grocery_data()
sys.path.append(os.path.join(base_folder, "..", "PretrainedModels"))
from models_util import download_model_by_name
download_model_by_name("ResNet_18")


@ -19,8 +19,7 @@ from cntk.ops import cross_entropy_with_softmax, classification_error, splice, r
# variables and stuff #
########################
cntk_dir = os.path.dirname(os.path.abspath(__file__)) + "/../../../.." # data resides in the CNTK folder
data_dir = cntk_dir + "/Examples/LanguageUnderstanding/ATIS/Data" # under Examples/LanguageUnderstanding/ATIS
data_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "Data")
vocab_size = 943 ; num_labels = 129 ; num_intents = 26 # number of words in vocab, slot labels, and intent labels
model_dir = "./Models"


@ -139,7 +139,7 @@ def create_inputs(vocab_dim):
return input_sequence, label_sequence
# Creates and trains a character-level language model
def train_lm(training_file, max_num_minibatches):
def train_lm(training_file, epochs, max_num_minibatches):
# load the data and vocab
data, char_to_ix, ix_to_char, data_size, vocab_dim = load_data_and_vocab(training_file)
@ -168,46 +168,34 @@ def train_lm(training_file, max_num_minibatches):
trainer = Trainer(z, (ce, errs), learner)
sample_freq = 1000
epochs = 50
minibatches_per_epoch = int((data_size / minibatch_size))
minibatches = min(epochs * minibatches_per_epoch, max_num_minibatches)
minibatches_per_epoch = min(data_size // minibatch_size, max_num_minibatches // epochs)
# print out some useful training information
log_number_of_parameters(z) ; print()
progress_printer = ProgressPrinter(freq=100, tag='Training')
log_number_of_parameters(z)
print ("Running %d epochs with %d minibatches per epoch" % (epochs, minibatches_per_epoch))
print()
e = 0
p = 0
for i in range(0, minibatches):
if p + minibatch_size+1 >= data_size:
p = 0
e += 1
model_filename = "models/shakespeare_epoch%d.dnn" % e
z.save(model_filename)
print("Saved model to '%s'" % model_filename)
# get the data
features, labels = get_data(p, minibatch_size, data, char_to_ix, vocab_dim)
progress_printer = ProgressPrinter(freq=100, tag='Training')
for e in range(0, epochs):
# Specify the mapping of input variables in the model to actual minibatch data to be trained with
# If it's the start of the data, we specify that we are looking at a new sequence (True)
mask = [False]
if p == 0:
mask = [True]
arguments = ({input_sequence : features, label_sequence : labels}, mask)
trainer.train_minibatch(arguments)
mask = [True]
for b in range(0, minibatches_per_epoch):
# get the data
features, labels = get_data(b, minibatch_size, data, char_to_ix, vocab_dim)
arguments = ({input_sequence : features, label_sequence : labels}, mask)
mask = [False]
trainer.train_minibatch(arguments)
progress_printer.update_with_trainer(trainer, with_metric=True) # log progress
if i % sample_freq == 0:
print(sample(z, ix_to_char, vocab_dim, char_to_ix))
progress_printer.update_with_trainer(trainer, with_metric=True) # log progress
global_minibatch = e*minibatches_per_epoch + b
if global_minibatch % sample_freq == 0:
print(sample(z, ix_to_char, vocab_dim, char_to_ix))
p += minibatch_size
# Do a final save of the model
model_filename = "models/shakespeare_epoch%d.dnn" % e
z.save(model_filename)
model_filename = "models/shakespeare_epoch%d.dnn" % (e+1)
z.save_model(model_filename)
print("Saved model to '%s'" % model_filename)
def load_and_sample(model_filename, vocab_filename, prime_text='', use_hardmax=False, length=1000, temperature=1.0):
@ -223,13 +211,13 @@ def load_and_sample(model_filename, vocab_filename, prime_text='', use_hardmax=F
return sample(model, ix_to_char, len(chars), char_to_ix, prime_text=prime_text, use_hardmax=use_hardmax, length=length, temperature=temperature)
def train_and_eval_char_rnn(max_num_minibatches=sys.maxsize):
# train the LM
train_lm("data/tinyshakespeare.txt", max_num_minibatches)
def train_and_eval_char_rnn(epochs=50, max_num_minibatches=sys.maxsize):
# train the LM
train_lm("data/tinyshakespeare.txt", epochs, max_num_minibatches)
# load and sample
text = "T"
return load_and_sample("models/shakespeare_epoch0.dnn", "data/tinyshakespeare.txt.vocab", prime_text=text, use_hardmax=False, length=100, temperature=0.95)
return load_and_sample("models/shakespeare_epoch%d.dnn" % (epochs), "data/tinyshakespeare.txt.vocab", prime_text=text, use_hardmax=False, length=100, temperature=0.95)
if __name__=='__main__':
# Specify the target device to be used for computing, if you do not want to


@ -23,7 +23,7 @@ from _cntk_py import set_computation_network_trace_level
# Paths relative to current python file.
abs_path = os.path.dirname(os.path.abspath(__file__))
data_path = os.path.join(abs_path, "..", "..", "Datasets", "UCF11")
data_path = os.path.join(abs_path, "..", "..", "DataSets", "UCF11")
model_path = os.path.join(abs_path, "Models")
# Define the reader for both training and evaluation action.
@ -194,14 +194,14 @@ def conv3d_ucf11(train_reader, test_reader, max_epochs=30):
lr_per_sample = [0.01]*10+[0.001]*10+[0.0001]
lr_schedule = learning_rate_schedule(lr_per_sample, epoch_size=epoch_size, unit=UnitType.sample)
momentum_time_constant = 4096
mm_schedule = momentum_as_time_constant_schedule(momentum_time_constant, epoch_size=epoch_size)
mm_schedule = momentum_as_time_constant_schedule([momentum_time_constant], epoch_size=epoch_size)
# Instantiate the trainer object to drive the model training
learner = momentum_sgd(z.parameters, lr_schedule, mm_schedule, True)
trainer = Trainer(z, (ce, pe), learner)
log_number_of_parameters(z) ; print()
progress_printer = ProgressPrinter(tag='Training')
progress_printer = ProgressPrinter(tag='Training', num_epochs=max_epochs)
# Get minibatches of images to train with and perform model training
for epoch in range(max_epochs): # loop over epochs


@ -77,7 +77,10 @@ endif
# The mpic++ wrapper only adds MPI specific flags to the g++ command line.
# The actual compiler/linker flags added can be viewed by running 'mpic++ --showme:compile' and 'mpic++ --showme:link'
ifneq ($(HAS_MPI),0)
CXX = $(MPI_PATH)/bin/mpic++
endif
SSE_FLAGS = -msse4.1 -mssse3
PROTOC = $(PROTOBUF_PATH)/bin/protoc
@ -90,8 +93,8 @@ SOURCEDIR:= Source
INCLUDEPATH:= $(addprefix $(SOURCEDIR)/, Common/Include CNTKv2LibraryDll CNTKv2LibraryDll/API CNTKv2LibraryDll/proto Math CNTK ActionsLib ComputationNetworkLib SGDLib SequenceTrainingLib CNTK/BrainScript Readers/ReaderLib PerformanceProfilerDll)
INCLUDEPATH+=$(PROTOBUF_PATH)/include
# COMMON_FLAGS include settings that are passed both to NVCC and C++ compilers.
COMMON_FLAGS:= -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K -std=c++11
CPPFLAGS:=
COMMON_FLAGS:= -DHAS_MPI=$(HAS_MPI) -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K -std=c++11
CPPFLAGS:=
CXXFLAGS:= $(SSE_FLAGS) -std=c++0x -fopenmp -fpermissive -fPIC -Werror -fcheck-new
LIBPATH:=
LIBS_LIST:=
@ -270,7 +273,7 @@ RPATH=-Wl,-rpath,
# Build info
########################################
BUILDINFO:= $(SOURCEDIR)/CNTK/buildinfo.h
BUILDINFO:= $(SOURCEDIR)/CNTKv2LibraryDll/buildinfo.h
GENBUILD:=Tools/generate_build_info
BUILDINFO_OUTPUT := $(shell $(GENBUILD) $(BUILD_TOP)/Config.make && echo Success)
@ -579,9 +582,16 @@ $(EVAL_EXTENDED_CLIENT): $(EVAL_EXTENDED_CLIENT_OBJ) | $(EVAL_LIB) $(READER_LIBS
########################################
CNTKLIBRARY_CPP_EVAL_EXAMPLES:=$(BINDIR)/CNTKLibraryCPPEvalExamples
#ifdef CUDA_PATH
CNTKLIBRARY_CPP_EVAL_EXAMPLES_SRC=\
$(SOURCEDIR)/../Examples/Evaluation/CNTKLibraryCPPEvalExamples/CNTKLibraryCPPEvalExamples.cpp \
$(SOURCEDIR)/../Examples/Evaluation/CNTKLibraryCPPEvalExamples/EvalMultithreads.cpp
$(SOURCEDIR)/../Examples/Evaluation/CNTKLibraryCPPEvalGPUExamples/CNTKLibraryCPPEvalGPUExamples.cpp\
$(SOURCEDIR)/../Examples/Evaluation/CNTKLibraryCPPEvalCPUOnlyExamples/EvalMultithreads.cpp
#else
CNTKLIBRARY_CPP_EVAL_EXAMPLES_SRC=\
$(SOURCEDIR)/../Examples/Evaluation/CNTKLibraryCPPEvalCPUOnlyExamples/CNTKLibraryCPPEvalCPUOnlyExamples.cpp\
$(SOURCEDIR)/../Examples/Evaluation/CNTKLibraryCPPEvalCPUOnlyExamples/EvalMultithreads.cpp
#endif
CNTKLIBRARY_CPP_EVAL_EXAMPLES_OBJ:=$(patsubst %.cpp, $(OBJDIR)/%.o, $(CNTKLIBRARY_CPP_EVAL_EXAMPLES_SRC))
@ -594,6 +604,26 @@ $(CNTKLIBRARY_CPP_EVAL_EXAMPLES): $(CNTKLIBRARY_CPP_EVAL_EXAMPLES_OBJ) | $(CNTKL
@echo building $(CNTKLIBRARY_CPP_EVAL_EXAMPLES) for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKLIBRARY) $(L_READER_LIBS)
########################################
# Eval V2 Sample test
########################################
CNTKLIBRARY_CPP_EVAL_TEST:=$(BINDIR)/CNTKLibraryCPPEvalExamplesTest
CNTKLIBRARY_CPP_EVAL_TEST_SRC=\
$(SOURCEDIR)/../Tests/EndToEndTests/EvalClientTests/CNTKLibraryCPPEvalExamplesTest/CNTKLibraryCPPEvalExamplesTest.cpp\
$(SOURCEDIR)/../Examples/Evaluation/CNTKLibraryCPPEvalCPUOnlyExamples/EvalMultithreads.cpp\
$(SOURCEDIR)/../Tests/EndToEndTests/CNTKv2Library/Common/Common.cpp
CNTKLIBRARY_CPP_EVAL_TEST_OBJ:=$(patsubst %.cpp, $(OBJDIR)/%.o, $(CNTKLIBRARY_CPP_EVAL_TEST_SRC))
ALL+=$(CNTKLIBRARY_CPP_EVAL_TEST)
SRC+=$(CNTKLIBRARY_CPP_EVAL_TEST_SRC)
$(CNTKLIBRARY_CPP_EVAL_TEST): $(CNTKLIBRARY_CPP_EVAL_TEST_OBJ) | $(CNTKLIBRARY_LIB) $(READER_LIBS)
@mkdir -p $(dir $@)
@echo building $(CNTKLIBRARY_CPP_EVAL_TEST) for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKLIBRARY) $(L_READER_LIBS)
########################################
# HTKMLFReader plugin
########################################


@ -256,9 +256,10 @@ void DoWriteOutput(const ConfigParameters& config)
else if (config.Exists("outputPath"))
{
wstring outputPath = config(L"outputPath");
bool writeSequenceKey = config(L"writeSequenceKey", false);
WriteFormattingOptions formattingOptions(config);
bool nodeUnitTest = config(L"nodeUnitTest", "false");
writer.WriteOutput(testDataReader, mbSize[0], outputPath, outputNodeNamesVector, formattingOptions, epochSize, nodeUnitTest);
writer.WriteOutput(testDataReader, mbSize[0], outputPath, outputNodeNamesVector, formattingOptions, epochSize, nodeUnitTest, writeSequenceKey);
}
else
InvalidArgument("write command: You must specify either 'writer' or 'outputPath'");


@ -164,12 +164,15 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable)
else if (EqualInsensitive(nodeType, OperationNameOf(EqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(GreaterEqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(GreaterNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(ForwardBackwardNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LabelsToGraphNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LessEqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(LessNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(NotEqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(ClipNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(ConvolutionNode), L"Convolve")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(CropNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(PassNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(PoolingNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(CosDistanceNode), L"CosDist")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(CosDistanceWithNegativeSamplesNode), L"CosWithNegSamples")) ret = true;


@ -59,12 +59,19 @@ shared_ptr<C> CreateObject(const ScriptableObjects::IConfigRecord& config, const
template <class C>
shared_ptr<C> CreateObject(const ConfigParameters& config, const wchar_t* id)
{
ConfigParameters readerConfig(config(id));
if (!readerConfig.ExistsCurrent("traceLevel")) // do not overwrite "traceLevel" if it's already present
ConfigParameters objConfig(config(id));
const auto& readerType = string(objConfig("readerType", ""));
if (objConfig.ExistsCurrent("traceLevel") || // do not overwrite "traceLevel" if it's already present
AreEqualIgnoreCase(readerType, "CNTKTextFormatReader") || // do not overwrite "traceLevel" when creating a CTF reader
AreEqualIgnoreCase(readerType, "CNTKBinaryReader")) // do not overwrite "traceLevel" when creating a binary reader
{
readerConfig.Insert("traceLevel", config(L"traceLevel", "0")); // TODO: fix this by adding it to all config blocks. Easy to fix in BS as 'config with [ traceLevel = 0 ]'.
return make_shared<C>(objConfig);
}
return make_shared<C>(readerConfig); // old CNTK config specifies a dictionary which then must be explicitly instantiated
// If the config does not specify a 'traceLevel', the following line
// will insert it with the value of 0.
objConfig.Insert("traceLevel", config(L"traceLevel", "0")); // TODO: fix this by adding it to all config blocks. Easy to fix in BS as 'config with [ traceLevel = 0 ]'.
return make_shared<C>(objConfig); // old CNTK config specifies a dictionary which then must be explicitly instantiated
}
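
The defaulting logic above can be read as "insert 'traceLevel' only if the reader will not handle it itself". A generic sketch of the insert-if-absent part over a plain map (std::map stands in for the CNTK-internal ConfigParameters, which is not shown here):

    #include <map>
    #include <string>

    // Mirrors objConfig.Insert(...) above: add a default only when the key is absent.
    void DefaultTraceLevel(std::map<std::wstring, std::wstring>& config)
    {
        if (config.find(L"traceLevel") == config.end())
            config[L"traceLevel"] = L"0";
    }
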
template <class ConfigRecordType, typename ElemType>


@ -577,6 +577,9 @@ Shift(input, fromOffset, boundaryValue, boundaryMode=-1/*context*/, dim=-1, tag=
RowSlice(beginIndex, numRows, input, tag='') = Slice(beginIndex, beginIndex + numRows, input, axis = 1)
RowRepeat(input, numRepeats, tag='') = new ComputationNode [ operation = 'RowRepeat' ; inputs = _AsNodes (input) /*plus the function args*/ ]
RowStack(inputs, axis=1, tag='') = new ComputationNode [ operation = 'RowStack' /*plus the function args*/ ]
EditDistanceError(leftInput, rightInput, subPen=0.0, delPen=0.0, insPen=0.0, squashInputs=false, tokensToIgnore=[||], tag='') = new ComputationNode [ operation = 'EditDistanceError' ; inputs = _AsNodes (leftInput : rightInput) /*plus the function args*/ ]
ForwardBackward(graph, features, blankTokenId, delayConstraint=-1, tag='') = new ComputationNode [ operation = 'ForwardBackward' ; inputs = _AsNodes (graph : features) /*plus the function args*/ ]
LabelsToGraph(labels, tag='') = new ComputationNode [ operation = 'LabelsToGraph' ; inputs = _AsNodes (labels) /*plus the function args*/ ]
Slice(beginIndex, endIndex, input, axis=1, tag='') =
if axis < 0 then [ # time axis: specify -1
beginFlags = if beginIndex > 0 then BS.Boolean.Not (BS.Loop.IsFirstN (beginIndex, input)) else BS.Loop.IsLastN (-beginIndex, input)


@ -36,6 +36,7 @@
#include "BrainScriptEvaluator.h"
#include "BrainScriptParser.h"
#include "PerformanceProfiler.h"
#include "CNTKLibrary.h"
#include <string>
#include <chrono>
@ -252,9 +253,6 @@ void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mp
ProgressTracing::SetStepOffset(fullEpochsOffset); // this is the epoch number that SGD will log relative to
}
if (Globals::ShouldEnableHyperCompressMemory())
Matrix<ElemType>::UseCachedResizeOrNot(true);
// determine the action to perform, and do it
for (int j = 0; j < action.size(); j++)
{
@ -372,55 +370,6 @@ std::string TimeDateStamp()
return buf;
}
void PrintBuiltInfo()
{
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
LOGPRINTF(stderr, "Build info: \n\n");
LOGPRINTF(stderr, "\t\tBuilt time: %s %s\n", __DATE__, __TIME__);
LOGPRINTF(stderr, "\t\tLast modified date: %s\n", __TIMESTAMP__);
#ifdef _BUILDTYPE_
LOGPRINTF(stderr, "\t\tBuild type: %s\n", _BUILDTYPE_);
#endif
#ifdef _BUILDTARGET_
LOGPRINTF(stderr, "\t\tBuild target: %s\n", _BUILDTARGET_);
#endif
#ifdef _WITH_1BITSGD_
LOGPRINTF(stderr, "\t\tWith 1bit-SGD: %s\n", _WITH_1BITSGD_);
#endif
#ifdef _WITH_ASGD_
LOGPRINTF(stderr, "\t\tWith ASGD: %s\n", _WITH_ASGD_);
#endif
#ifdef _MATHLIB_
LOGPRINTF(stderr, "\t\tMath lib: %s\n", _MATHLIB_);
#endif
#ifdef _CUDA_PATH_
LOGPRINTF(stderr, "\t\tCUDA_PATH: %s\n", _CUDA_PATH_);
#endif
#ifdef _CUB_PATH_
LOGPRINTF(stderr, "\t\tCUB_PATH: %s\n", _CUB_PATH_);
#endif
#ifdef _CUDNN_PATH_
LOGPRINTF(stderr, "\t\tCUDNN_PATH: %s\n", _CUDNN_PATH_);
#endif
#ifdef _GIT_EXIST
LOGPRINTF(stderr, "\t\tBuild Branch: %s\n", _BUILDBRANCH_);
LOGPRINTF(stderr, "\t\tBuild SHA1: %s\n", _BUILDSHA1_);
#endif
#ifdef _BUILDER_
LOGPRINTF(stderr, "\t\tBuilt by %s on %s\n", _BUILDER_, _BUILDMACHINE_);
#endif
#ifdef _BUILDPATH_
LOGPRINTF(stderr, "\t\tBuild Path: %s\n", _BUILDPATH_);
#endif
#ifdef _MPI_NAME_
LOGPRINTF(stderr, "\t\tMPI distribution: %s\n", _MPI_NAME_);
#endif
#ifdef _MPI_VERSION_
LOGPRINTF(stderr, "\t\tMPI version: %s\n", _MPI_VERSION_);
#endif
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
}
void PrintUsageInfo()
{
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
@ -585,7 +534,6 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
Globals::SetShareNodeValueMatrices(config(L"shareNodeValueMatrices", true));
Globals::SetGradientAccumulationOptimization(config(L"optimizeGradientAccumulation", true));
Globals::SetHyperCompressMemory(config(L"hyperCompressMemory", false));
TracingGPUMemoryAllocator::SetTraceLevel(config(L"traceGPUMemoryAllocations", 0));
@ -598,7 +546,7 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
RedirectStdErr(logpath);
LOGPRINTF(stderr, "%ls\n", startupMessage.c_str());
PrintBuiltInfo();
::CNTK::PrintBuiltInfo();
}
// echo gpu info to log
@ -666,7 +614,7 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
static void PrintBanner(int argc, wchar_t* argv[], const string& timestamp)
{
fprintf(stderr, "CNTK 2.0.beta11.0+ (");
fprintf(stderr, "CNTK 2.0.beta11.0 (");
#ifdef _GIT_EXIST
fprintf(stderr, "%s %.6s, ", _BUILDBRANCH_, _BUILDSHA1_);
#endif
@ -729,7 +677,6 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])
Globals::SetShareNodeValueMatrices(config(L"shareNodeValueMatrices", true));
Globals::SetGradientAccumulationOptimization(config(L"optimizeGradientAccumulation", true));
Globals::SetHyperCompressMemory(config(L"hyperCompressMemory", false));
TracingGPUMemoryAllocator::SetTraceLevel(config(L"traceGPUMemoryAllocations", 0));
@ -764,7 +711,7 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])
}
// full config info
PrintBuiltInfo();
::CNTK::PrintBuiltInfo();
PrintGpuInfo();
#ifdef _DEBUG
@ -857,7 +804,7 @@ int wmain1(int argc, wchar_t* argv[]) // called from wmain which is a wrapper th
{
if (argc <= 1)
{
PrintBuiltInfo(); // print build info directly in case that user provides zero argument (convenient for checking build type)
::CNTK::PrintBuiltInfo(); // print build info directly in case that user provides zero argument (convenient for checking build type)
LOGPRINTF(stderr, "No command-line argument given.\n");
PrintUsageInfo();
fflush(stderr);


@ -85,7 +85,8 @@
<StackReserveSize>100000000</StackReserveSize>
</Link>
<PreBuildEvent>
<Command>prebuild.bat "$(Configuration)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)" "$(CNTK_ENABLE_ASGD)"</Command>
<Command>
</Command>
</PreBuildEvent>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(ReleaseBuild)">
@ -113,7 +114,8 @@
<StackReserveSize>100000000</StackReserveSize>
</Link>
<PreBuildEvent>
<Command>prebuild.bat "$(Configuration)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)" "$(CNTK_ENABLE_ASGD)"</Command>
<Command>
</Command>
</PreBuildEvent>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(CpuOnlyBuild)">


@ -396,8 +396,16 @@ void MELScript<ElemType>::CallFunction(const std::string& p_name, const ConfigPa
MELProperty prop = melPropNull;
#if 1 // legacy
// legacy names for some properties
if (EqualInsensitive(propName, "finalCriterion", "Criteria")) propName = "criterion";
else if (EqualInsensitive(propName, "eval")) propName = "evaluation";
if (EqualInsensitive(propName, "finalCriterion", "Criteria"))
{
propName = "criterion";
prop = melPropFinalCriterion;
}
else if (EqualInsensitive(propName, "eval"))
{
propName = "evaluation";
prop = melPropEvaluation;
}
// legacy property that now works differently
else if (EqualInsensitive(propName, "needGradient", "needsGradient") || EqualInsensitive(propName, "computeGradient"))
prop = melPropParameterUpdateRequired; // for backward compatibility


@ -1395,6 +1395,18 @@ namespace CNTK
CNTK_API void Add(const Dictionary& other);
void Add(const std::wstring& key, const DictionaryValue& value)
{
operator[](key.c_str()) = value;
}
template<typename... Args>
void Add(const std::wstring& key, const DictionaryValue& value, Args... args)
{
Add(key, value); //insert one
Add(args...); //recurse
}
CNTK_API bool operator==(const Dictionary& other) const;
CNTK_API bool operator!=(const Dictionary& other) const;
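
The variadic Add above peels off one key/value pair per call and recurses on the remainder, so it accepts any alternating key/value argument list of even length. A minimal usage sketch (the keys shown are illustrative, not part of the API):

    CNTK::Dictionary config;
    // Equivalent to three separate Add(key, value) calls; the recursion
    // stops once a single key/value pair is left.
    config.Add(L"type", L"Crop",
               L"cropType", L"center",
               L"cropSize", 224);
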
@ -1634,6 +1646,8 @@ private:
Variable CompositePreservingCopy(const std::shared_ptr<const Function>& composite) const;
Variable NonCompositePreservingCopy() const;
private:
#ifdef SWIGCSHARP
public:
@ -2735,7 +2749,7 @@ namespace CNTK
///
/// Returns the root of the Function graph underlying this block Function.
/// Throws an exception ff this is not a block Function
/// Throws an exception if this is not a block Function
///
CNTK_API FunctionPtr BlockRoot() const;
@ -4430,6 +4444,20 @@ namespace CNTK
std::wstring m_streamAlias;
};
struct HTKFeatureConfiguration
{
HTKFeatureConfiguration(const std::wstring& streamName, const std::wstring& scp, size_t dim, size_t left, size_t right, bool broadcast)
: m_streamName(streamName), m_dim(dim), m_scp(scp), m_left(left), m_right(right), m_broadcast(broadcast)
{}
std::wstring m_streamName;
std::wstring m_scp;
size_t m_dim;
size_t m_left;
size_t m_right;
bool m_broadcast;
};
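
Each field of this struct maps onto one entry of the per-stream dictionary that the HTK feature deserializer consumes (see the HTKFeatureDeserializer implementation further down, where m_left/m_right become the context window and m_broadcast becomes 'expandToUtterance'). A construction sketch with placeholder file names:

    // 39-dimensional features read from an SCP list, +/-5 frames of context.
    CNTK::HTKFeatureConfiguration features(L"features", L"glob_0000.scp",
                                           /*dim*/ 39, /*left*/ 5, /*right*/ 5,
                                           /*broadcast*/ false);
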
///
/// Instantiate the CNTK built-in text format minibatch source
///
@ -4475,6 +4503,56 @@ namespace CNTK
return CreateCompositeMinibatchSource(minibatchSourceConfiguration);
}
typedef Dictionary ImageTransform;
///
/// Create a crop transform with the specified options to be used with a reader
///
CNTK_API ImageTransform ReaderCrop(const wchar_t* cropType = L"center",
int cropSize = 0, float sideRatio = 0.0f, float areaRatio = 0.0f,
float aspectRatio = 1.0f, const wchar_t* jitterType = L"none");
///
/// Create a scale transform with the specified options to be used with a reader
///
CNTK_API ImageTransform ReaderScale(int width,
int height, int channels, const wchar_t* interpolations = L"linear",
const wchar_t* scaleMode = L"fill", int padValue = -1);
///
/// Create a mean subtraction transform with the specified options to be used with a reader
///
CNTK_API ImageTransform ReaderMean(const wchar_t* meanFile);
///
/// Create a color transform with the specified options to be used with a reader
///
CNTK_API ImageTransform ReaderColor(float brightnessRadius = 0.0f,
float contrastRadius = 0.0f, float saturationRadius = 0.0f);
typedef Dictionary Deserializer;
///
/// Create an ImageDeserializer with the specified options
///
CNTK_API Deserializer ImageDeserializer(const std::wstring& fileName, const std::wstring& labelStreamName, size_t numLabels, const std::wstring& imageStreamName, const std::vector<ImageTransform>& transforms = {});
///
/// Create an CTFDeserializer with the specified options
///
CNTK_API Deserializer CTFDeserializer(const std::wstring& fileName, const std::vector<StreamConfiguration>& streams);
///
/// Create an HTKFeatureDeserializer with the specified options
///
CNTK_API Deserializer HTKFeatureDeserializer(const std::vector<HTKFeatureConfiguration>& streams);
///
/// Create an HTKMLFDeserializer with the specified options
///
CNTK_API Deserializer HTKMLFDeserializer(const std::wstring& streamName, const std::wstring& labelMappingFile, size_t dimension, const std::vector<std::wstring>& mlfFiles);
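
Together these factories reproduce, on the C++ side, the reader configurations the Python examples above build with cntk.io.transforms. A sketch that mirrors the CIFAR-10 setup (file names and stream names are placeholders; wiring the deserializer into a minibatch source configuration dictionary is assumed to follow the composite-source convention shown earlier):

    std::vector<CNTK::ImageTransform> transforms{
        CNTK::ReaderCrop(L"randomside", 0, 0.8f, 0.0f, 1.0f, L"uniratio"), // random-side crop with jitter
        CNTK::ReaderScale(32, 32, 3),                                      // 32x32 RGB, linear interpolation
        CNTK::ReaderMean(L"CIFAR-10_mean.xml")                             // per-pixel mean subtraction
    };
    CNTK::Deserializer imageSource =
        CNTK::ImageDeserializer(L"train_map.txt", L"labels", 10, L"features", transforms);
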
///
/// Compute the per dimension means and variances for each of the specified streams using data from the specified minibatchSource.
///
@ -4769,6 +4847,9 @@ namespace CNTK
bool keepExistingCheckpoints = false,
size_t maxNumberOfTrainingSamples = std::numeric_limits<size_t>::max(),
size_t progressFrequency = std::numeric_limits<size_t>::max());
CNTK_API void PrintBuiltInfo();
}


@ -250,9 +250,6 @@ namespace CNTK
CNTK_API void EnableForwardValuesSharing();
CNTK_API void DisableForwardValuesSharing();
CNTK_API void EnableHyperMemoryCompress();
CNTK_API void DisableHyperMemoryCompress();
CNTK_API void EnableGradientAccumulationOptimization();
CNTK_API void DisableGradientAccumulationOptimization();


@ -144,6 +144,8 @@ namespace CNTK
opType = PrimitiveOpType::Sin;
else if (node->OperationName() == OperationNameOf(PassNode))
opType = PrimitiveOpType::Pass;
else if (node->OperationName() == OperationNameOf(LabelsToGraphNode))
opType = PrimitiveOpType::LabelsToGraph;
else if (node->OperationName() == OperationNameOf(RectifiedLinearNode))
opType = PrimitiveOpType::ReLU;
else if (node->OperationName() == OperationNameOf(ExpNode))
@ -450,7 +452,7 @@ namespace CNTK
primitiveFunctionConfigParameters[PrimitiveFunction::AttributeNameDeletionPenalty] = edNode->DeletionPenalty();
primitiveFunctionConfigParameters[PrimitiveFunction::AttributeNameSubstitutionPenalty] = edNode->SubstitutionPenalty();
primitiveFunctionConfigParameters[PrimitiveFunction::AttributeNameSquashInputs] = edNode->SquashInputs();
primitiveFunctionConfigParameters[PrimitiveFunction::AttributeNameSamplesToIgnore] = AsDictionaryValueVector(edNode->SamplesToIgnore());
primitiveFunctionConfigParameters[PrimitiveFunction::AttributeNameTokensToIgnore] = AsDictionaryValueVector(edNode->TokensToIgnore());
opType = PrimitiveOpType::EditDistanceError;
}


@ -106,6 +106,12 @@
<DelayLoadDLLs>Math.dll; msmpi.dll; PerformanceProfilerDll.dll </DelayLoadDLLs>
<OptimizeReferences Condition="'$(Configuration)|$(Platform)'=='Debug_CpuOnly|x64'">false</OptimizeReferences>
</Link>
<PreBuildEvent>
<Command Condition="'$(Configuration)|$(Platform)'=='Release_CpuOnly|x64'">prebuild.bat "$(Configuration)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)" "$(CNTK_ENABLE_ASGD)"</Command>
</PreBuildEvent>
<PreBuildEvent>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug_CpuOnly|x64'">prebuild.bat "$(Configuration)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)" "$(CNTK_ENABLE_ASGD)"</Command>
</PreBuildEvent>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(GpuBuild)">
<ClCompile>
@ -118,6 +124,15 @@
<Command>if exist "%ProgramW6432%\NVIDIA Corporation\NVSMI" xcopy /I /D /Y "%ProgramW6432%\NVIDIA Corporation\NVSMI\nvml*.dll" "$(TargetDir)"</Command>
<Message>Copying NVidia GDK extension DLL to target folder</Message>
</PostBuildEvent>
<PreBuildEvent>
<Command Condition="'$(Configuration)|$(Platform)'=='Release|x64'">prebuild.bat "$(Configuration)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)" "$(CNTK_ENABLE_ASGD)"</Command>
</PreBuildEvent>
<PreBuildEvent>
<Command Condition="'$(Configuration)|$(Platform)'=='Release_NoOpt|x64'">prebuild.bat "$(Configuration)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)" "$(CNTK_ENABLE_ASGD)"</Command>
</PreBuildEvent>
<PreBuildEvent>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">prebuild.bat "$(Configuration)" "$(CNTK_MKL_SEQUENTIAL)" "$(CNTK_ENABLE_1BitSGD)" "$(CudaPath)" "$(CUDNN_PATH)" "$(CUB_PATH)" "$(CNTK_ENABLE_ASGD)"</Command>
</PreBuildEvent>
</ItemDefinitionGroup>
<ItemGroup>
<ClInclude Include="API\CNTKLibrary.h" />


@ -17,8 +17,11 @@
#include "PerformanceProfiler.h"
#include "MPIWrapper.h"
#include "Basics.h"
#include "ProgressTracing.h"
#include "buildinfo.h"
extern bool g_shareNodeValueMatrices;
using namespace Microsoft::MSR::CNTK;
namespace CNTK
{
@ -84,16 +87,6 @@ namespace CNTK
Microsoft::MSR::CNTK::Globals::SetShareNodeValueMatrices(/* enable = */ false);
}
void EnableHyperMemoryCompress()
{
Microsoft::MSR::CNTK::Globals::SetHyperCompressMemory(/* enable = */ true);
}
void DisableHyperMemoryCompress()
{
Microsoft::MSR::CNTK::Globals::SetHyperCompressMemory(/* enable = */ false);
}
void EnableGradientAccumulationOptimization()
{
Microsoft::MSR::CNTK::Globals::SetGradientAccumulationOptimization(/* enable = */ true);
@ -617,6 +610,56 @@ namespace CNTK
va_end(args);
}
void PrintBuiltInfo()
{
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
LOGPRINTF(stderr, "Build info: \n\n");
LOGPRINTF(stderr, "\t\tBuilt time: %s %s\n", __DATE__, __TIME__);
LOGPRINTF(stderr, "\t\tLast modified date: %s\n", __TIMESTAMP__);
#ifdef _BUILDTYPE_
LOGPRINTF(stderr, "\t\tBuild type: %s\n", _BUILDTYPE_);
#endif
#ifdef _BUILDTARGET_
LOGPRINTF(stderr, "\t\tBuild target: %s\n", _BUILDTARGET_);
#endif
#ifdef _WITH_1BITSGD_
LOGPRINTF(stderr, "\t\tWith 1bit-SGD: %s\n", _WITH_1BITSGD_);
#endif
#ifdef _WITH_ASGD_
LOGPRINTF(stderr, "\t\tWith ASGD: %s\n", _WITH_ASGD_);
#endif
#ifdef _MATHLIB_
LOGPRINTF(stderr, "\t\tMath lib: %s\n", _MATHLIB_);
#endif
#ifdef _CUDA_PATH_
LOGPRINTF(stderr, "\t\tCUDA_PATH: %s\n", _CUDA_PATH_);
#endif
#ifdef _CUB_PATH_
LOGPRINTF(stderr, "\t\tCUB_PATH: %s\n", _CUB_PATH_);
#endif
#ifdef _CUDNN_PATH_
LOGPRINTF(stderr, "\t\tCUDNN_PATH: %s\n", _CUDNN_PATH_);
#endif
#ifdef _GIT_EXIST
LOGPRINTF(stderr, "\t\tBuild Branch: %s\n", _BUILDBRANCH_);
LOGPRINTF(stderr, "\t\tBuild SHA1: %s\n", _BUILDSHA1_);
#endif
#ifdef _BUILDER_
LOGPRINTF(stderr, "\t\tBuilt by %s on %s\n", _BUILDER_, _BUILDMACHINE_);
#endif
#ifdef _BUILDPATH_
LOGPRINTF(stderr, "\t\tBuild Path: %s\n", _BUILDPATH_);
#endif
#ifdef _MPI_NAME_
LOGPRINTF(stderr, "\t\tMPI distribution: %s\n", _MPI_NAME_);
#endif
#ifdef _MPI_VERSION_
LOGPRINTF(stderr, "\t\tMPI version: %s\n", _MPI_VERSION_);
#endif
LOGPRINTF(stderr, "-------------------------------------------------------------------\n");
}
template CNTK_API __declspec_noreturn void ThrowFormatted<std::runtime_error>(const char* format, ...);
template CNTK_API __declspec_noreturn void ThrowFormatted<std::logic_error>(const char* format, ...);
template CNTK_API __declspec_noreturn void ThrowFormatted<std::invalid_argument>(const char* format, ...);


@ -721,8 +721,8 @@ namespace CNTK
auto delPen = functionConfig[PrimitiveFunction::AttributeNameDeletionPenalty].Value<float>();
auto insPen = functionConfig[PrimitiveFunction::AttributeNameInsertionPenalty].Value<float>();
auto squashInputs = functionConfig[PrimitiveFunction::AttributeNameSquashInputs].Value<bool>();
auto samplesToIgnore = AsVector<size_t>(functionConfig[PrimitiveFunction::AttributeNameSamplesToIgnore].Value<std::vector<DictionaryValue>>());
computationNodePtr = New<EditDistanceErrorNode<ElementType>>(network->GetDeviceId(), subPen, delPen, insPen, squashInputs, samplesToIgnore, internalNodeName);
auto tokensToIgnore = AsVector<size_t>(functionConfig[PrimitiveFunction::AttributeNameTokensToIgnore].Value<std::vector<DictionaryValue>>());
computationNodePtr = New<EditDistanceErrorNode<ElementType>>(network->GetDeviceId(), internalNodeName, subPen, delPen, insPen, squashInputs, tokensToIgnore);
break;
}
case PrimitiveOpType::LambdaRank:
@ -813,6 +813,9 @@ namespace CNTK
case PrimitiveOpType::Pass:
computationNodePtr = New<PassNode<ElementType>>(network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::LabelsToGraph:
computationNodePtr = New<LabelsToGraphNode<ElementType>>(network->GetDeviceId(), internalNodeName);
break;
default:
CNTK::LogicError("Specified op %S not yet supported", PrimitiveOpTypeName(op).c_str());
break;
@ -932,6 +935,18 @@ namespace CNTK
return computationNodePtr;
}
std::unordered_set<Variable> CompositeFunction::NonOwnerPreservingCopy(const std::unordered_set<Variable>& outputs)
{
std::unordered_set<Variable> result;
for (auto& o : outputs)
{
Variable sanitized = o.NonCompositePreservingCopy();
result.insert(sanitized);
}
return result;
}
template <typename ElementType>
ComputationNetworkPtr CompositeFunction::GetComputationNetwork(const DeviceDescriptor& device,
const std::unordered_set<Variable>& backpropRoots,
@ -941,7 +956,7 @@ namespace CNTK
{
if (m_computationNetwork != nullptr)
{
// TODO: We should either invalidate and readapt the network if he backpropRoots change compared to what was specified when the network
// TODO: We should either invalidate and readapt the network if the backpropRoots change compared to what was specified when the network
// was last constructed, or just recreate a new network.
// For now just disallow changing the backpropRoots after the network is created
if (!backpropRoots.empty() && (m_currentBackpropRoots != backpropRoots))
@ -966,7 +981,7 @@ namespace CNTK
InvalidArgument("Function::Forward: Only inputs of a Function can be excluded from gradient computation");
}
m_inputsExcludedFromGradientComputation = inputsToExcludeGradientsFor;
m_inputsExcludedFromGradientComputation = NonOwnerPreservingCopy(inputsToExcludeGradientsFor);
ComputationNetworkBuilder<ElementType> builder(*m_computationNetwork);
@ -1023,7 +1038,7 @@ namespace CNTK
}
}
m_currentBackpropRoots = backpropRoots;
m_currentBackpropRoots = NonOwnerPreservingCopy(backpropRoots);
// In case of recurrence, the inputs of some of the ComputationNodes are not attached due to cycles.
// Now attach those after we have created all ComputationNodes in the network
@ -1317,10 +1332,12 @@ namespace CNTK
{
if (m_perOutputVarArgumentDependencies.find(output) == m_perOutputVarArgumentDependencies.end())
{
if (output.IsOutput())
m_perOutputVarArgumentDependencies[output] = AsComposite(output.Owner())->Arguments();
auto sanitizedOutput = output.NonCompositePreservingCopy();
if (sanitizedOutput.IsOutput())
m_perOutputVarArgumentDependencies[sanitizedOutput] = AsComposite(sanitizedOutput.Owner())->Arguments();
else
m_perOutputVarArgumentDependencies[output] = { output };
m_perOutputVarArgumentDependencies[sanitizedOutput] = { sanitizedOutput };
}
return m_perOutputVarArgumentDependencies[output];
@ -1381,12 +1398,13 @@ namespace CNTK
std::unordered_set<Variable> functionOutputs(m_outputs.begin(), m_outputs.end());
std::vector<ComputationNodeBasePtr> outputsToEvaluate;
std::unordered_set<Variable> requiredArguments;
for (auto outputVarValuePair : outputs)
for (auto outputVariable : requestedOutputVariables)
{
auto& requiredArgumentsForCurrentOutput = GetArgumentDependencies(outputVarValuePair.first);
auto& requiredArgumentsForCurrentOutput = GetArgumentDependencies(outputVariable);
requiredArguments.insert(requiredArgumentsForCurrentOutput.begin(), requiredArgumentsForCurrentOutput.end());
auto outputComputationNode = m_variableToNodeMap.at(outputVarValuePair.first);
auto outputComputationNode = m_variableToNodeMap.at(outputVariable);
outputsToEvaluate.push_back(outputComputationNode);
}


@ -33,6 +33,13 @@ namespace CNTK
class CompositeFunction;
typedef std::shared_ptr<CompositeFunction> CompositeFunctionPtr;
///
/// Represents a symbolic computation with zero or more input arguments and one or more outputs.
/// As opposed to primitive functions, a composite function is composed of other Function instances whose inputs and outputs are wired together.
/// CompositeFunction is also responsible for breaking cycles in cyclic graphs: it stores the pointers to the child primitive
/// functions and controls their lifetime.
/// The CompositeFunction class thus inherits from Function.
///
class CompositeFunction final : public Function
{
friend class Function;
@ -258,6 +265,9 @@ namespace CNTK
void GetNetworkOutputs(std::unordered_map<Variable, ValuePtr>& outputs);
void GetNetworkGradients(std::unordered_map<Variable, ValuePtr>& gradients);
// Remove cyclic references for composite nodes
static std::unordered_set<Variable> NonOwnerPreservingCopy(const std::unordered_set<Variable>& outputs);
const std::vector<Variable>& GetArgumentDependencies(const Variable& output);
std::unordered_map<Variable, uint64_t> GetCurrentBackpropRootsTimeStamps() const;


@ -340,16 +340,16 @@ namespace CNTK
if (dataType == DataType::Float)
{
if (inputData == outputData)
m_mpi->AllReduceAsync<float>(static_cast<float*>(outputData), numElements, &allReduceRequests[i]);
m_mpi->AllReduceAsync(static_cast<float*>(outputData), numElements, &allReduceRequests[i]);
else
m_mpi->AllReduceAsync<float>(static_cast<float*>(inputData), static_cast<float*>(outputData), numElements, &allReduceRequests[i]);
m_mpi->AllReduceAsync(static_cast<float*>(inputData), static_cast<float*>(outputData), numElements, &allReduceRequests[i]);
}
else if (dataType == DataType::Double)
{
if (inputData == outputData)
m_mpi->AllReduceAsync<double>(static_cast<double*>(outputData), numElements, &allReduceRequests[i]);
m_mpi->AllReduceAsync(static_cast<double*>(outputData), numElements, &allReduceRequests[i]);
else
m_mpi->AllReduceAsync<double>(static_cast<double*>(inputData), static_cast<double*>(outputData), numElements, &allReduceRequests[i]);
m_mpi->AllReduceAsync(static_cast<double*>(inputData), static_cast<double*>(outputData), numElements, &allReduceRequests[i]);
}
else
LogicError("Unknown DataType");
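
The explicit <float>/<double> template arguments removed here were redundant: the compiler deduces the element type from the pointer arguments. A self-contained sketch of the same deduction (the wrapper below is illustrative, not the MPIWrapper API):

    #include <cstddef>

    template <typename T>
    void AllReduceAsyncSketch(T* buffer, std::size_t count) { /* ... */ }

    void Example(float* f, double* d, std::size_t n)
    {
        AllReduceAsyncSketch(f, n); // T deduced as float
        AllReduceAsyncSketch(d, n); // T deduced as double
    }
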


@ -1078,14 +1078,14 @@ namespace CNTK
}
}
FunctionPtr EditDistanceError(const Variable& prediction, const Variable& labels, float subPen, float delPen, float insPen, bool squashInputs, const vector<size_t>& samplesToIgnore, const std::wstring& name)
FunctionPtr EditDistanceError(const Variable& prediction, const Variable& labels, float subPen, float delPen, float insPen, bool squashInputs, const vector<size_t>& tokensToIgnore, const std::wstring& name)
{
auto additionalProperties = Dictionary();
additionalProperties[PrimitiveFunction::AttributeNameSubstitutionPenalty] = subPen;
additionalProperties[PrimitiveFunction::AttributeNameDeletionPenalty] = delPen;
additionalProperties[PrimitiveFunction::AttributeNameInsertionPenalty] = insPen;
additionalProperties[PrimitiveFunction::AttributeNameSquashInputs] = squashInputs;
additionalProperties[PrimitiveFunction::AttributeNameSamplesToIgnore] = AsDictionaryValueVector(samplesToIgnore);
additionalProperties[PrimitiveFunction::AttributeNameTokensToIgnore] = AsDictionaryValueVector(tokensToIgnore);
return BinaryOp(PrimitiveOpType::EditDistanceError, prediction, labels, std::move(additionalProperties), name);
}
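
For reference, a call through the renamed parameter might look like the following sketch ('prediction' and 'labels' stand for existing Variables; ignoring token id 0 is an arbitrary example):

    std::vector<size_t> tokensToIgnore{ 0 }; // e.g., a blank/padding token
    CNTK::FunctionPtr ed = CNTK::EditDistanceError(
        prediction, labels,
        /*subPen*/ 1.0f, /*delPen*/ 1.0f, /*insPen*/ 1.0f,
        /*squashInputs*/ false, tokensToIgnore, L"editDistance");
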


@ -349,4 +349,117 @@ namespace CNTK
m_epochEndReached = false;
m_prevMinibatchSize = 0;
}
/* static */ ImageTransform ReaderCrop(const wchar_t* cropType,
int cropSize, float sideRatio, float areaRatio,
float aspectRatio, const wchar_t* jitterType)
{
ImageTransform crop;
crop.Add(L"type", L"Crop",
L"cropType", cropType,
L"cropSize", cropSize,
L"sideRatio", sideRatio,
L"areaRatio", areaRatio,
L"aspectRatio", aspectRatio,
L"jitterType", jitterType);
return crop;
}
/* static */ ImageTransform ReaderScale(int width,
int height, int channels, const wchar_t* interpolations,
const wchar_t* scaleMode, int padValue)
{
ImageTransform scale;
scale.Add(L"type", L"Scale",
L"width", width,
L"height", height,
L"channels", channels,
L"interpolations", interpolations,
L"scaleMode", scaleMode,
L"padValue", padValue);
return scale;
}
/* static */ ImageTransform ReaderMean(const wchar_t* meanFile)
{
ImageTransform mean;
mean.Add(L"type", L"Mean", L"meanFile", meanFile);
return mean;
}
/* static */ ImageTransform ReaderColor(float brightnessRadius,
float contrastRadius, float saturationRadius)
{
ImageTransform color;
color.Add(L"type", L"Color",
L"brightnessRadius", brightnessRadius,
L"contrastRadius", contrastRadius,
L"saturationRadius", saturationRadius);
return color;
}
Deserializer ImageDeserializer(const std::wstring& fileName, const std::wstring& labelStreamName, size_t numLabels, const std::wstring& imageStreamName, const std::vector<ImageTransform>& transforms)
{
Deserializer img;
std::vector<DictionaryValue> actualTransforms;
std::transform(transforms.begin(), transforms.end(), std::back_inserter(actualTransforms), [](ImageTransform t) { return static_cast<DictionaryValue>(t); });
Dictionary labeldim;
labeldim[L"labelDim"] = numLabels;
Dictionary xforms;
xforms[L"transforms"] = actualTransforms;
Dictionary input;
input.Add(imageStreamName.c_str(), xforms, labelStreamName.c_str(), labeldim);
img.Add(L"type", L"ImageDeserializer", L"file", fileName, L"input", input);
return img;
}
Deserializer CTFDeserializer(const std::wstring& fileName, const std::vector<StreamConfiguration>& streams)
{
Deserializer ctf;
Dictionary input;
for (const auto& s : streams)
{
const auto& key = s.m_streamName;
Dictionary stream;
stream.Add(L"alias", s.m_streamAlias, L"dim", s.m_dim, L"format", s.m_isSparse ? L"sparse" : L"dense");
input[key] = stream;
}
ctf.Add(L"type", L"CNTKTextFormatDeserializer", L"file", fileName, L"input", input);
return ctf;
}
Deserializer HTKFeatureDeserializer(const std::vector<HTKFeatureConfiguration>& streams)
{
Deserializer htk;
Dictionary input;
for (const auto& s : streams)
{
const auto& key = s.m_streamName;
Dictionary stream;
std::vector<DictionaryValue> ctxWindow = { DictionaryValue(s.m_left), DictionaryValue(s.m_right) };
stream.Add(L"scpFile", s.m_scp, L"dim", s.m_dim, L"contextWindow", ctxWindow, L"expandToUtterance", s.m_broadcast);
input[key] = stream;
}
htk.Add(L"type", L"HTKFeatureDeserializer", L"input", input);
return htk;
}
Deserializer HTKMLFDeserializer(const std::wstring& streamName, const std::wstring& labelMappingFile, size_t dimension, const std::vector<std::wstring>& mlfFiles)
{
Deserializer htk;
Dictionary stream;
Dictionary labels;
labels.Add(L"labelMappingFile", labelMappingFile, L"dim", dimension);
std::vector<DictionaryValue> actualFiles;
std::transform(mlfFiles.begin(), mlfFiles.end(), std::back_inserter(actualFiles), [](const std::wstring& s) {return static_cast<DictionaryValue>(s); });
if (actualFiles.size() > 1)
labels[L"mlfFileList"] = actualFiles;
else if (actualFiles.size() == 1)
labels[L"mlfFile"] = actualFiles[0];
else
LogicError("HTKMLFDeserializer: No mlf files were specified");
stream[streamName] = labels;
htk.Add(L"type", L"HTKMLFDeserializer", L"input", stream);
return htk;
}
}
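
A sketch combining the two HTK factories for an acoustic-model reader (all file names and dimensions are placeholders): note how a single MLF file lands under 'mlfFile' while several would land under 'mlfFileList', matching the branching above.

    std::vector<CNTK::HTKFeatureConfiguration> featureStreams{
        CNTK::HTKFeatureConfiguration(L"features", L"glob_0000.scp", 39, 5, 5, false)
    };
    CNTK::Deserializer features = CNTK::HTKFeatureDeserializer(featureStreams);
    CNTK::Deserializer labels   = CNTK::HTKMLFDeserializer(
        L"labels", L"state.list", /*dimension*/ 132, { L"glob_0000.mlf" });
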


@ -79,7 +79,7 @@ namespace CNTK
/*static*/ const std::wstring PrimitiveFunction::AttributeNameDeletionPenalty = L"DeletionPenalty";
/*static*/ const std::wstring PrimitiveFunction::AttributeNameInsertionPenalty = L"InsertionPenalty";
/*static*/ const std::wstring PrimitiveFunction::AttributeNameSquashInputs = L"SquashInputs";
/*static*/ const std::wstring PrimitiveFunction::AttributeNameSamplesToIgnore = L"SamplesToIgnore";
/*static*/ const std::wstring PrimitiveFunction::AttributeNameTokensToIgnore = L"TokensToIgnore";
/*static*/ DataType PrimitiveFunction::GetOutputDataType(PrimitiveOpType op, std::vector<Variable>& inputs, bool inferDimensions)
{


@ -235,7 +235,7 @@ namespace CNTK
static const std::wstring AttributeNameDeletionPenalty;
static const std::wstring AttributeNameInsertionPenalty;
static const std::wstring AttributeNameSquashInputs;
static const std::wstring AttributeNameSamplesToIgnore;
static const std::wstring AttributeNameTokensToIgnore;
protected:
PrimitiveFunction(PrimitiveOpType op, const std::vector<Variable>& inputs, Dictionary&& functionConfig, const std::wstring& functionName, const std::wstring& uid)


@ -72,6 +72,7 @@ namespace CNTK
NDCG = 60,
EditDistanceError = 61,
NoOp = 62,
LabelsToGraph = 63
// New op types should only be appended to the end of this list.
// If you append here, also add checks in SerializationTests (CheckEnumValuesNotModified)
// and bump up PrimitiveFunction::s_serializationVersion and update PrimitiveFunction::Deserialize


@ -87,6 +87,13 @@ namespace CNTK
return result;
}
Variable Variable::NonCompositePreservingCopy() const
{
Variable copy = *this;
copy.m_outputComposite = nullptr;
return copy;
}
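
The copy here is shallow except for one member: clearing m_outputComposite drops the back-reference that would otherwise keep the owning composite alive and create a shared_ptr cycle. A stripped-down sketch of the pattern (names are illustrative):

    #include <memory>

    struct CompositeRef; // stands in for the owning composite Function

    struct VariableSketch
    {
        std::shared_ptr<CompositeRef> m_outputComposite; // owning back-reference

        VariableSketch NonCompositePreservingCopySketch() const
        {
            VariableSketch copy = *this;      // shallow copy of all members
            copy.m_outputComposite = nullptr; // sever the cycle-inducing reference
            return copy;
        }
    };
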
void Variable::SetOwner(Function* ownerFunction)
{
if (Kind() != VariableKind::Output)


@ -14,7 +14,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
std::atomic<bool> Globals::m_forceConstantRandomSeed(false);
std::atomic<bool> Globals::m_enableShareNodeValueMatrices(true);
std::atomic<bool> Globals::m_enableHyperCompressMemory(false);
std::atomic<bool> Globals::m_optimizeGradientAccumulation(true);
}}}


@ -151,6 +151,8 @@ public:
}
};
std::function<std::string(size_t)> m_getKeyById;
private:
typedef map<std::wstring, Input> MapType;
MapType inputs;


@ -28,15 +28,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
static void SetShareNodeValueMatrices(bool enable) { m_enableShareNodeValueMatrices = enable; }
static bool ShouldEnableShareNodeValueMatrices() { return m_enableShareNodeValueMatrices; }
static void SetHyperCompressMemory(bool enable) { m_enableHyperCompressMemory = enable; }
static bool ShouldEnableHyperCompressMemory() { return m_enableHyperCompressMemory; }
private:
static std::atomic<bool> m_forceDeterministicAlgorithms;
// The global flag to enable matrices values in forward and backward prop
static std::atomic<bool> m_enableShareNodeValueMatrices;
// The global flag to enable hyper memory compression
static std::atomic<bool> m_enableHyperCompressMemory;
static std::atomic<bool> m_forceConstantRandomSeed;
static std::atomic<bool> m_optimizeGradientAccumulation;
};


@ -1,14 +1,14 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#if HAS_MPI
// Please see https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#ms-mpi or
// https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Linux#open-mpi for setup instructions
// of an MPI implementation on your platform.
#ifdef _MSC_VER
// Suppress warning for non-ASCII characters in MS-MPI headers
#pragma warning(push)
@ -18,7 +18,25 @@
#else
#include "mpi.h"
#endif
#pragma comment(lib, "msmpi.lib")
#else
// Note: the following macros/typedefs define some of the MPI-related functions and constants such that code
// using this functionality compiles cleanly, but does not actually perform any MPI operation.
// The cleaner way would be to move all MPI-related code into the MPIWrapper class implementation and decide
// there whether to use mpi.h or not.
typedef void *MPI_Comm;
typedef enum _MPI_Datatype { MPI_CHAR, MPI_INT, MPI_FLOAT, MPI_DOUBLE, MPI_UNSIGNED, MPI_LONG_LONG_INT } MPI_Datatype;
#define MPI_IN_PLACE ((void*)(int)-1)
#define MPI_SUM ((MPI_Op)0x58000003)
#define MPI_STATUSES_IGNORE (MPI_Status*)1
#define MPI_STATUS_IGNORE (MPI_Status*)1
#define MPI_UNDEFINED (-32766)
typedef int MPI_Op;
typedef int MPI_Request;
typedef void *MPI_Status;
#endif
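
With these stubs, code compiled with HAS_MPI=0 still type-checks, and only the wrapper implementation needs to branch on the flag. A minimal sketch of the pattern (the function is illustrative, not part of the wrapper):

    // Reports whether this binary was built against a real MPI implementation.
    inline bool HasMpiSupport()
    {
    #if HAS_MPI
        return true;   // mpi.h / MS-MPI headers were available at build time
    #else
        return false;  // the stub typedefs above are in effect
    #endif
    }
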
#include <errno.h>
#include <string>
@ -28,8 +46,6 @@
#include "CommonMatrix.h"
#define FFLUSH_SUCCESS 0
namespace Microsoft { namespace MSR { namespace CNTK {
struct MpiFail : public std::string
@ -40,481 +56,128 @@ struct MpiFail : public std::string
}
};
static int operator||(int rc, const MpiFail &what)
{
if (rc == MPI_SUCCESS)
{
return rc;
}
fprintf(stderr, "%s, MPI error %d\n", what.c_str(), rc);
fflush(stderr);
// (special case: we use that code to indicate a missing msmpi.dll...)
if (rc != MPI_ERR_INTERN)
{
char errbuf[MPI_MAX_ERROR_STRING + 1] = {0};
int len;
MPI_Error_string(rc, &errbuf[0], &len);
fprintf(stderr, "%s, MPI error %d: %s\n", what.c_str(), rc, errbuf);
fflush(stderr);
// we abort through this, so that the MPI system gets the memo
MPI_Abort(MPI_COMM_WORLD, rc);
// TODO: or does that only signal an issue, and we should still terminate ourselves?
// BUGBUG: We'd also need to Abort through the other sub-set communicator
}
RuntimeError("%s", what.c_str());
}
extern int operator||(int rc, const MpiFail &what);
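// A hedged usage sketch of the operator|| idiom declared above: an MPI return
// code is "piped" into MpiFail, which prints and aborts unless the call
// returned MPI_SUCCESS. For example:
//
//   int rank;
//   MPI_Comm_rank(MPI_COMM_WORLD, &rank) || MpiFail("example: MPI_Comm_rank");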
class MPIWrapper;
typedef std::shared_ptr<MPIWrapper> MPIWrapperPtr;
extern "C" void GetMpiWrapper(MPIWrapper **mpi);
// Note: This is now a pure interface, so please don't add
// any functionality to this class.
// Instead, make your own implementation class, add/change
// functions there as needed and use a private interface to
// these functions.
// In case you need to add functions that affect all
// implementations, add a pure virtual function here and
// update any affected implementation.
class MPIWrapper : public std::enable_shared_from_this<MPIWrapper>
{
int m_myRank;
std::wstring m_myName;
int m_numMPINodes;
size_t m_numNodesInUse;
bool m_multiHost;
// MPI communicator that reflects the current subset selection
MPI_Comm m_currentComm;
static MPIWrapperPtr s_mpi;
// MPI_Init() with delay-loading the msmpi.dll (possibly causing a failure if missing; we want to catch that)
int MPI_Init_DL()
{
#ifdef WIN32
__try
#endif
{
// don't initialize if that has been done already
int flag = 0;
MPI_Initialized(&flag);
if (flag)
return MPI_SUCCESS;
int argc = 0;
char **argv = NULL;
// TODO(qiwye): Multiverso (parameter server) will benefit from MPI_THREAD_MULTIPLE.
int requiredThreadLevelSupport = MPI_THREAD_SERIALIZED;
int provided;
int ret = MPI_Init_thread(&argc, &argv, requiredThreadLevelSupport, &provided);
if (provided != requiredThreadLevelSupport)
LogicError("Failed to initialize MPI with the desired level of thread support");
return ret;
}
#ifdef WIN32
__except (EXCEPTION_EXECUTE_HANDLER)
{
fprintf(stderr, "mpihelper: msmpi.dll missing\n");
return MPI_ERR_INTERN;
}
#endif
}
// Workaround for the issue of MPI hanging when we have non-0 exit codes from CNTK processes.
// OpenMPI has a confirmed race condition between killing child processes and handling their non-zero exit statuses, resulting
// in a deadlock where all processes are killed but MPI is still waiting.
// This happens when several perfectly synchronized processes (for example, on an MPI barrier)
// simultaneously exit with a non-0 exit code.
// As a workaround, we simply sleep 50*rank milliseconds, effectively "de-synchronizing" the processes at exit,
// allowing MPI to handle the terminations sequentially.
static int s_myRank;
static void MPIWorkaroundAtExit()
{
Sleep(s_myRank * 50);
}
public:
MPIWrapper()
: m_currentComm(MPI_COMM_WORLD)
{
static bool initialized = false;
if (initialized)
{
LogicError("MPIWrapper: this is a singleton class that can only be instantiated once per process");
}
MPIWrapper() {}
virtual ~MPIWrapper() {}
initialized = true;
if (GetMathLibTraceLevel() > 0)
{
fprintf(stderr, "MPIWrapper: initializing MPI\n");
fflush(stderr);
}
MPI_Init_DL() || MpiFail("mpiaggregator: MPI_Init");
MPI_Comm_rank(MPI_COMM_WORLD, &m_myRank);
MPI_Comm_size(MPI_COMM_WORLD, &m_numMPINodes);
m_numNodesInUse = m_numMPINodes;
m_multiHost = true;
// Verify that the environment variable used by GetTotalNumberOfMPINodes()
// matches what the MPI API says. There are actually two possible cases:
// 1) when we're running with mpiexec, both values have to match;
// 2) when we're running without mpiexec, the former will return 0, and
// the latter will be set to 1.
assert((GetTotalNumberOfMPINodes() == 0 && m_numNodesInUse == 1) ||
(GetTotalNumberOfMPINodes() == m_numNodesInUse));
char name[BUFSIZ];
int length;
MPI_Get_processor_name(name, &length);
m_myName = std::wstring(name, name+length);
// Applying MPI workaround
s_myRank = m_myRank;
atexit(&MPIWrapper::MPIWorkaroundAtExit);
// by default we use all of them
RequestNodes("MPIWrapper");
if (GetMathLibTraceLevel() > 0)
{
if (m_numMPINodes > 1)
fprintf(stderr, "mpihelper: we are cog %d in a gearbox of %d\n", (int) m_myRank, (int) m_numMPINodes);
else
fprintf(stderr, "mpihelper: only one MPI process: MPI operation will be boring\n");
fflush(stderr);
}
// do an initial handshake
Ping("mpihelper");
// stagger the jobs just a little to get a sort-of deterministic order, e.g. in GPU allocation when running on one machine;
// continue 0.5 seconds apart
::Sleep((DWORD)(500 * CurrentNodeRank()));
}
static MPIWrapperPtr GetInstance(bool create = false);
static void DeleteInstance();
static MPIWrapperPtr s_mpi;
// Note that, specifically, this function does not require
// MPI initialization. Moreover, it can be used without actually loading any
// MPI libs.
// TODO: Once we move to dynamic loading for MPI libs on Linux, move it to utilities.
static int GetTotalNumberOfMPINodes()
{
#ifdef WIN32
const char* p = std::getenv("PMI_SIZE");
#else
const char* p = std::getenv("OMPI_COMM_WORLD_SIZE");
#endif
if (!p)
{
return 0;
}
else
{
return std::stoi(string(p));
}
}
static int GetTotalNumberOfMPINodes();
// Note: we don't free the sub-communicator here although we should, because in case of a crash this prevents the EXE from terminating.
// It's OK since this class is a singleton anyway that gets instantiated exactly once at program startup.
~MPIWrapper()
{
if (GetMathLibTraceLevel() > 0)
{
fprintf(stderr, "~MPIWrapper\n");
}
// Do not finalize in event of an exception since calling MPI_Finalize without
// all pending communications being finished results in a hang
int rc = fflush(stderr);
if (!std::uncaught_exception())
{
if (rc != FFLUSH_SUCCESS)
{
#ifdef _WIN32
RuntimeError("MPIWrapper: Failed to flush stderr, %d", ::GetLastError());
#else
RuntimeError("MPIWrapper: Failed to flush stderr, %d", errno);
#endif
}
MPI_Finalize();
}
}
private:
void Ping(const char *msg) const
{
#undef USE2NDCOMM
#ifndef USE2NDCOMM
if (NumNodesInUse() != m_numMPINodes)
{
fprintf(stderr, "ping [%s]: cannot be applied to subset (%d) of nodes, skipping\n", msg, (int) NumNodesInUse());
fflush(stderr);
return;
}
#endif
std::array<int, 1> handshake;
handshake[0] = 1;
if (GetMathLibTraceLevel() > 0)
{
fprintf(stderr, "ping [%s]: %d nodes pinging each other\n", msg, (int) NumNodesInUse());
fflush(stderr);
}
AllReduce(handshake);
if (GetMathLibTraceLevel() > 0)
{
fprintf(stderr, "ping [%s]: all %d nodes responded\n", msg, handshake[0]);
fflush(stderr);
}
}
void RequestNodes(const char *msg, size_t requestednodes = SIZE_MAX /*default: all*/)
{
Ping("requestnodes (before change)");
// undo current split
#ifdef USE2NDCOMM
if (m_currentComm != MPI_COMM_WORLD /*no subset*/ && m_currentComm != MPI_COMM_NULL /*idle nodes*/)
{
fprintf(stderr, "requestnodes: MPI_Comm_free %x\n", (int) m_currentComm);
fflush(stderr);
MPI_Comm_free(&m_currentComm) || MpiFail("requestnodes: MPI_Comm_free"); // will leave MPI_COMM_NULL here
}
#endif
// reset to MPI_COMM_WORLD
m_currentComm = MPI_COMM_WORLD;
// create a new split (unless all nodes were requested)
if (requestednodes < (size_t) m_numMPINodes)
{
#ifdef USE2NDCOMM
fprintf(stderr, "requestnodes: MPI_Comm_split %d\n", (node() < requestednodes) ? 1 : MPI_UNDEFINED);
fflush(stderr);
MPI_Comm_split(communicator(), (node() < requestednodes) ? 1 : MPI_UNDEFINED, 0, &m_currentComm) || MpiFail("requestnodes: MPI_Comm_split");
fprintf(stderr, "requestnodes: MPI_Comm_split -> %x\n", (int) m_currentComm);
fflush(stderr);
#endif
}
else
{
// leave m_currentComm as MPI_COMM_WORLD
// and clip to #nodes
requestednodes = m_numMPINodes;
}
m_numNodesInUse = requestednodes;
if (GetMathLibTraceLevel() > 0)
{
fprintf(stderr, "requestnodes [%s]: using %d out of %d MPI nodes (%d requested); we (%d) are %s\n",
msg, (int) m_numNodesInUse, (int) m_numMPINodes, (int) requestednodes,
(int) CurrentNodeRank(), IsIdle() ? "out (idle)" : "in (participating)");
fflush(stderr);
}
Ping("requestnodes (after change)");
// If all ranks run on a single host, we can enable optimized communication
// paths (e.g. NCCL). To determine if a single machine is being used, we
// check that MPI_Get_processor_name matches for all ranks.
const int nameMax = MPI_MAX_PROCESSOR_NAME + 1;
char myName[nameMax] = {0};
int myNameLen = 0;
MPI_Get_processor_name(myName, &myNameLen) || MpiFail("requestnodes: MPI_Get_processor_name");
myName[myNameLen] = '\0';
std::vector<char> nameBuffer(m_numNodesInUse * nameMax);
char* allNames = nameBuffer.data();
MPI_Allgather(myName, nameMax, MPI_CHAR, allNames, nameMax, MPI_CHAR, m_currentComm)
|| MpiFail("requestnodes: MPI_Allgather");
m_multiHost = false;
for(size_t i=1; i<m_numNodesInUse; i++)
{
if (strcmp(allNames, allNames+i*nameMax) != 0)
{
m_multiHost = true;
break;
}
}
fprintf(stderr, "requestnodes [%s]: using %d out of %d MPI nodes on %s (%d requested); we (%d) are %s\n",
msg, (int) m_numNodesInUse, (int) m_numMPINodes, m_multiHost ? "multiple hosts" : "a single host",
(int) requestednodes, (int) CurrentNodeRank(), IsIdle() ? "out (idle)" : "in (participating)");
fflush(stderr);
}
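// Worked example of the host check above (illustrative): with three ranks
// whose MPI_Get_processor_name results are {"hostA", "hostA", "hostB"}, the
// strcmp against allNames + 2*nameMax differs, so m_multiHost becomes true
// and single-host-only paths such as NCCL stay disabled.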
public:
static MPIWrapperPtr GetInstance(bool create = false)
{
if (create)
{
if (s_mpi != nullptr)
LogicError("Creating MPIWrapper instance after a GetInstance call has been already made!");
else
s_mpi = std::make_shared<MPIWrapper>();
}
return s_mpi;
}
static void DeleteInstance()
{
s_mpi = nullptr;
}
MPI_Comm Communicator() const
{
return m_currentComm;
}
size_t NumNodesInUse() const
{
return m_numNodesInUse;
}
size_t CurrentNodeRank() const
{
return m_myRank;
}
std::wstring CurrentNodeName() const
{
return m_myName;
}
bool IsMainNode() const
{
return m_myRank == 0;
} // we are the chosen one--do extra stuff like saving the model to disk
bool IsIdle() const
{
return CurrentNodeRank() >= NumNodesInUse();
} // user had requested to not use this many nodes
bool UsingAllNodes() const
{
return NumNodesInUse() == m_numMPINodes;
} // all nodes participate (used to check whether we can use MPI_Allreduce directly)
size_t MainNodeRank() const
{
return 0;
}
bool IsMultiHost()
{
return m_multiHost;
}
virtual size_t NumNodesInUse() const = 0;
virtual size_t CurrentNodeRank() const = 0;
virtual bool IsMainNode() const = 0;
virtual std::wstring CurrentNodeName() const = 0;
virtual bool IsIdle() const = 0;
virtual bool UsingAllNodes() const = 0;
virtual size_t MainNodeRank() const = 0;
virtual bool IsMultiHost() const = 0;
// -----------------------------------------------------------------------
// data-exchange functions (wrappers around MPI functions)
// -----------------------------------------------------------------------
virtual int Finalize(void) = 0;
virtual int Wait(MPI_Request* request, MPI_Status* status) = 0;
virtual int Waitany(int count, MPI_Request array_of_requests[], int* index, MPI_Status* status) = 0;
virtual int Waitall(int count, MPI_Request array_of_requests[], MPI_Status array_of_statuses[]) = 0;
virtual int Isend(const void* buf, int count, MPI_Datatype datatype, int dest, int tag, /*MPI_Comm comm,*/ MPI_Request* request) = 0;
virtual int Recv(void* buf, int count, MPI_Datatype datatype, int source, int tag, /*MPI_Comm comm,*/ MPI_Status* status) = 0;
virtual int Irecv(void* buf, int count, MPI_Datatype datatype, int source, int tag, /*MPI_Comm comm,*/ MPI_Request* request) = 0;
virtual int Iallreduce(const void* sendbuf, void* recvbuf, int count, MPI_Datatype datatype, MPI_Op op, /*MPI_Comm comm,*/ MPI_Request* request) = 0;
virtual int Abort(int errorcode) = 0;
virtual int Error_string(int errorcode, char* string, int* resultlen) = 0;
// helpers to determine the MPI_Datatype of a pointer
static MPI_Datatype GetDataType(char *)
{
return MPI_CHAR;
}
static MPI_Datatype GetDataType(int *)
{
return MPI_INT;
}
static MPI_Datatype GetDataType(float *)
{
return MPI_FLOAT;
}
static MPI_Datatype GetDataType(double *)
{
return MPI_DOUBLE;
}
static MPI_Datatype GetDataType(size_t *)
{
return sizeof(size_t) == 4 ? MPI_UNSIGNED : MPI_LONG_LONG_INT;
}
static MPI_Datatype GetDataType(char *);
static MPI_Datatype GetDataType(int *);
static MPI_Datatype GetDataType(float *);
static MPI_Datatype GetDataType(double *);
static MPI_Datatype GetDataType(size_t *);
// allreduce of a vector
template <typename VECTORLIKEOBJECT>
void AllReduce(VECTORLIKEOBJECT &accumulator) const
{
auto *dataptr = accumulator.data();
size_t totalnumelements = accumulator.size();
// use MPI to compute the sum over all elements in (dataptr, totalnumelements) and redistribute to all nodes
AllReduce<typename VECTORLIKEOBJECT::value_type>(dataptr, totalnumelements);
}
virtual void AllReduce(std::vector<size_t>& accumulator) const = 0;
virtual void AllReduce(std::vector<int>& accumulator) const = 0;
virtual void AllReduce(std::vector<double>& accumulator) const = 0;
virtual void AllReduce(std::vector<float>& accumulator) const = 0;
// for raw pointer
template <class ElemType>
void AllReduce(ElemType* sendData, size_t numElements, MPI_Op op = MPI_SUM) const
{
AllReduce<ElemType>(static_cast<ElemType*>(MPI_IN_PLACE), sendData, numElements, op);
}
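// A usage sketch for the reduction helpers above, assuming an initialized
// wrapper instance 'mpi' (an MPIWrapperPtr); GetDataType maps the element type
// to the matching MPI_Datatype at compile time:
//
//   std::vector<double> partialSums(16, 1.0);
//   mpi->AllReduce(partialSums);            // vector overload, in place
//   float buf[8] = {0};
//   mpi->AllReduce(buf, 8);                 // raw-pointer overload, MPI_SUM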
virtual void AllReduce(size_t* sendData, size_t numElements, MPI_Op op = MPI_SUM) const = 0;
virtual void AllReduce(int* sendData, size_t numElements, MPI_Op op = MPI_SUM) const = 0;
virtual void AllReduce(double* sendData, size_t numElements, MPI_Op op = MPI_SUM) const = 0;
virtual void AllReduce(float* sendData, size_t numElements, MPI_Op op = MPI_SUM) const = 0;
template <class ElemType>
void AllReduceAsync(ElemType* sendData, size_t numElements, MPI_Request* request, MPI_Op op = MPI_SUM) const
{
AllReduceAsync<ElemType>(static_cast<ElemType*>(MPI_IN_PLACE), sendData, numElements, request, op);
}
virtual void AllReduce(size_t* sendData, size_t* receiveData, size_t numElements, MPI_Op op = MPI_SUM) const = 0;
virtual void AllReduce(int* sendData, int* receiveData, size_t numElements, MPI_Op op = MPI_SUM) const = 0;
virtual void AllReduce(double* sendData, double* receiveData, size_t numElements, MPI_Op op = MPI_SUM) const = 0;
virtual void AllReduce(float* sendData, float* receiveData, size_t numElements, MPI_Op op = MPI_SUM) const = 0;
template <class ElemType>
void AllGatherAsync(const ElemType *sendData, size_t numSendElements, ElemType *receiveData, size_t numRecvElements, MPI_Request* request) const
{
MPI_Iallgather(sendData, (int)numSendElements, GetDataType(receiveData), receiveData, (int)numRecvElements, GetDataType(receiveData), Communicator(), request) || MpiFail("AllGatherAsync: MPI_Iallgather");
}
virtual void AllReduceAsync(size_t* sendData, size_t numElements, MPI_Request* request, MPI_Op op = MPI_SUM) const = 0;
virtual void AllReduceAsync(int* sendData, size_t numElements, MPI_Request* request, MPI_Op op = MPI_SUM) const = 0;
virtual void AllReduceAsync(double* sendData, size_t numElements, MPI_Request* request, MPI_Op op = MPI_SUM) const = 0;
virtual void AllReduceAsync(float* sendData, size_t numElements, MPI_Request* request, MPI_Op op = MPI_SUM) const = 0;
template <class ElemType>
void AllGather(const ElemType *sendData, size_t numSendElements, ElemType *receiveData, size_t numRecvElements) const
{
MPI_Allgather(sendData, (int)numSendElements, GetDataType(receiveData), receiveData, (int)numRecvElements, GetDataType(receiveData), Communicator()) || MpiFail("AllGather: MPI_Allgather");
}
virtual void AllReduceAsync(size_t* sendData, size_t* receiveData, size_t numElements, MPI_Request* request, MPI_Op op = MPI_SUM) const = 0;
virtual void AllReduceAsync(int* sendData, int* receiveData, size_t numElements, MPI_Request* request, MPI_Op op = MPI_SUM) const = 0;
virtual void AllReduceAsync(double* sendData, double* receiveData, size_t numElements, MPI_Request* request, MPI_Op op = MPI_SUM) const = 0;
virtual void AllReduceAsync(float* sendData, float* receiveData, size_t numElements, MPI_Request* request, MPI_Op op = MPI_SUM) const = 0;
template <class ElemType>
void AllReduceAsync(ElemType *sendData, ElemType *receiveData, size_t numElements, MPI_Request* request, MPI_Op op = MPI_SUM) const
{
MPI_Iallreduce(sendData, receiveData, (int)numElements, GetDataType(sendData), op, Communicator(), request) || MpiFail("AllReduceAsync: MPI_Iallreduce");
}
virtual void Bcast(size_t* sendData, size_t numElements, size_t srcRank) = 0;
virtual void Bcast(double* sendData, size_t numElements, size_t srcRank) = 0;
virtual void Bcast(float* sendData, size_t numElements, size_t srcRank) = 0;
virtual void Bcast(void* buffer, int count, MPI_Datatype datatype, int root) = 0;
template <class ElemType>
void AllReduce(ElemType *sendData, ElemType *receiveData, size_t numElements, MPI_Op op = MPI_SUM) const
{
MPI_Allreduce(sendData, receiveData, (int)numElements, GetDataType(sendData), op, Communicator()) || MpiFail("AllReduce: MPI_Allreduce");
}
virtual void AllGatherAsync(const size_t *sendData, size_t numSendElements, size_t *receiveData, size_t numRecvElements, MPI_Request* request) const = 0;
virtual void AllGatherAsync(const int *sendData, size_t numSendElements, int *receiveData, size_t numRecvElements, MPI_Request* request) const = 0;
virtual void AllGatherAsync(const float *sendData, size_t numSendElements, float *receiveData, size_t numRecvElements, MPI_Request* request) const = 0;
virtual void AllGatherAsync(const double *sendData, size_t numSendElements, double *receiveData, size_t numRecvElements, MPI_Request* request) const = 0;
template <class ElemType>
void Gather(const ElemType *sendData, size_t numSendElements, ElemType *receiveData, size_t numRecvElements, size_t rootRank) const
{
MPI_Gather(sendData, (int)numSendElements, GetDataType(receiveData), receiveData, (int)numRecvElements, GetDataType(receiveData), (int)rootRank, Communicator()) || MpiFail("Gather: MPI_Gather");
}
virtual void AllGather(const size_t *sendData, size_t numSendElements, size_t *receiveData, size_t numRecvElements) const = 0;
virtual void AllGather(const int *sendData, size_t numSendElements, int *receiveData, size_t numRecvElements) const = 0;
virtual void AllGather(const float *sendData, size_t numSendElements, float *receiveData, size_t numRecvElements) const = 0;
virtual void AllGather(const double *sendData, size_t numSendElements, double *receiveData, size_t numRecvElements) const = 0;
virtual void Allgather(const void* sendbuf, int sendcount, MPI_Datatype sendtype, void* recvbuf, int recvcount, MPI_Datatype recvtype) const = 0;
template <class ElemType>
void Gatherv(const ElemType *sendData, size_t numSendElements, ElemType *receiveData, int recvCounts[], int offsets[], size_t rootRank) const
{
MPI_Gatherv(sendData, (int)numSendElements, GetDataType(receiveData), receiveData, recvCounts, offsets, GetDataType(receiveData), (int)rootRank, Communicator()) || MpiFail("Gatherv: MPI_Gatherv");
}
virtual void Gather(const size_t *sendData, size_t numSendElements, size_t *receiveData, size_t numRecvElements, size_t rootRank) const = 0;
virtual void Gather(const int *sendData, size_t numSendElements, int *receiveData, size_t numRecvElements, size_t rootRank) const = 0;
virtual void Gather(const float *sendData, size_t numSendElements, float *receiveData, size_t numRecvElements, size_t rootRank) const = 0;
virtual void Gather(const double *sendData, size_t numSendElements, double *receiveData, size_t numRecvElements, size_t rootRank) const = 0;
template <class ElemType>
void Bcast(ElemType *pData, size_t nData, size_t srcRank)
{
MPI_Bcast(pData, (int) nData, GetDataType(pData), (int) srcRank, Communicator()) || MpiFail("Bcast: MPI_Bcast");
}
// wait for an async request to finish
void Wait(MPI_Request* request)
{
MPI_Wait(request, MPI_STATUSES_IGNORE) || MpiFail("Wait: MPI_Wait");
}
void WaitAny(MPI_Request* requests, int numRequests, int* index)
{
MPI_Waitany(numRequests, requests, index, MPI_STATUSES_IGNORE) || MpiFail("WaitAny: MPI_Waitany");
}
virtual void Gatherv(const size_t *sendData, size_t numSendElements, size_t *receiveData, int recvCounts[], int offsets[], size_t rootRank) const = 0;
virtual void Gatherv(const char *sendData, size_t numSendElements, char *receiveData, int recvCounts[], int offsets[], size_t rootRank) const = 0;
virtual void Gatherv(const int *sendData, size_t numSendElements, int *receiveData, int recvCounts[], int offsets[], size_t rootRank) const = 0;
virtual void Gatherv(const float *sendData, size_t numSendElements, float *receiveData, int recvCounts[], int offsets[], size_t rootRank) const = 0;
virtual void Gatherv(const double *sendData, size_t numSendElements, double *receiveData, int recvCounts[], int offsets[], size_t rootRank) const = 0;
// wait for all ranks to reach here
void WaitAll()
{
MPI_Barrier(m_currentComm) || MpiFail("waitall: MPI_Barrier");
}
void WaitAll(std::vector<MPI_Request>& requests)
{
MPI_Waitall((int)requests.size(), &requests[0], MPI_STATUSES_IGNORE) || MpiFail("waitall: MPI_Waitall");
}
virtual int WaitAll() = 0;
virtual void WaitAny(MPI_Request* requests, int numRequests, int* index) = 0;
virtual void Wait(MPI_Request* request) = 0;
virtual int WaitAll(std::vector<MPI_Request>& requests) = 0;
};
}}}
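// A minimal end-to-end sketch of the wrapper lifecycle (hedged; the real setup
// code lives elsewhere in CNTK):
//
//   auto mpi = Microsoft::MSR::CNTK::MPIWrapper::GetInstance(/*create=*/true);
//   if (!mpi->IsIdle())
//   {
//       std::vector<float> grad(1024, 0.0f);
//       mpi->AllReduce(grad); // sum gradients across ranks
//       mpi->WaitAll();       // barrier on the active communicator
//   }
//   Microsoft::MSR::CNTK::MPIWrapper::DeleteInstance();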

File diff not shown because of its large size.

View File

@ -36,15 +36,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
template <>
vector<shared_ptr<Matrix<float>>>& MatrixPool::GetReleasedMatrices<float>()
vector<MemRequestInfo<float>>& MatrixPool::GetMemRequestInfoVec<float>()
{
return m_releasedFloatMatrices;
return m_memRequestInfoFloatVec;
}
template <>
vector<shared_ptr<Matrix<double>>>& MatrixPool::GetReleasedMatrices<double>()
vector<MemRequestInfo<double>>& MatrixPool::GetMemRequestInfoVec<double>()
{
return m_releasedDoubleMatrices;
return m_memRequestInfoDoubleVec;
}
// -----------------------------------------------------------------------
@ -463,7 +463,7 @@ bool ComputationNetwork::IsTypicalCriterionNode(ComputationNodeBasePtr nodePtr)
nodePtr->OperationName() == OperationNameOf(CrossEntropyNode) ||
nodePtr->OperationName() == OperationNameOf(ClassBasedCrossEntropyWithSoftmaxNode) ||
nodePtr->OperationName() == OperationNameOf(ClassificationErrorNode) ||
nodePtr->OperationName() == OperationNameOf(EditDistanceErrorNode) ||
nodePtr->OperationName() == OperationNameOf(ForwardBackwardNode) ||
#ifdef COMING_SOON
nodePtr->OperationName() == OperationNameOf(CRFNode) ||
#endif

View File

@ -49,6 +49,7 @@ static shared_ptr<ComputationNode<ElemType>> CreateStandardNode(const std::wstri
else if (nodeType == OperationNameOf(CropNode)) return New<CropNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(CrossEntropyNode)) return New<CrossEntropyNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(CrossEntropyWithSoftmaxNode)) return New<CrossEntropyWithSoftmaxNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ForwardBackwardNode)) return New<ForwardBackwardNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(DiagonalNode)) return New<DiagonalNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(DiagTimesNode)) return New<DiagTimesNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(DropoutNode)) return New<DropoutNode<ElemType>>(forward<_Types>(_Args)...);
@ -93,6 +94,7 @@ static shared_ptr<ComputationNode<ElemType>> CreateStandardNode(const std::wstri
else if (nodeType == OperationNameOf(PerDimMeanVarNormalizationNode)) return New<PerDimMeanVarNormalizationNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(PerDimMeanVarDeNormalizationNode)) return New<PerDimMeanVarDeNormalizationNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(PassNode)) return New<PassNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(LabelsToGraphNode)) return New<LabelsToGraphNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(PlusNode)) return New<PlusNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(RandomSampleNode)) return New<RandomSampleNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(RandomSampleInclusionFrequencyNode)) return New<RandomSampleInclusionFrequencyNode<ElemType>>(forward<_Types>(_Args)...);
@ -430,9 +432,9 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Class
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::EditDistanceError(const ComputationNodePtr a, const ComputationNodePtr b, float subPen, float delPen, float insPen, bool squashInputs, vector<size_t> samplesToIgnore, const std::wstring nodeName)
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::EditDistanceError(const ComputationNodePtr a, const ComputationNodePtr b, float subPen, float delPen, float insPen, bool squashInputs, vector<size_t> tokensToIgnore, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<EditDistanceErrorNode<ElemType>>(net.GetDeviceId(), subPen, delPen, insPen, squashInputs, samplesToIgnore, nodeName), { a, b });
return net.AddNodeToNetAndAttachInputs(New<EditDistanceErrorNode<ElemType>>(net.GetDeviceId(), nodeName, subPen, delPen, insPen, squashInputs, tokensToIgnore), { a, b });
}
template <class ElemType>
@ -499,6 +501,12 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Seque
return net.AddNodeToNetAndAttachInputs(New<SequenceWithSoftmaxNode<ElemType>>(net.GetDeviceId(), nodeName), { label, prediction, loglikelihood });
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::ForwardBackward(const ComputationNodePtr label, const ComputationNodePtr prediction, int blankTokenId, int delayConstraint, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ForwardBackwardNode<ElemType>>(net.GetDeviceId(), nodeName, blankTokenId, delayConstraint), { label, prediction });
}
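// A hedged usage sketch of the new criterion (the surrounding variables are
// illustrative): the label input is first converted into a graph, then the
// forward-backward (CTC-style) criterion is attached to it and the predictions.
//
//   auto graph = builder.LabelsToGraph(labels);
//   auto ctc = builder.ForwardBackward(graph, prediction,
//                                      /*blankTokenId=*/132,
//                                      /*delayConstraint=*/-1);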
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::NoiseContrastiveEstimation(const ComputationNodePtr label, const ComputationNodePtr prediction,
const ComputationNodePtr input_weight,
@ -570,6 +578,12 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Pass(
return net.AddNodeToNetAndAttachInputs(New<PassNode<ElemType>>(net.GetDeviceId(), nodeName), { a });
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::LabelsToGraph(const ComputationNodePtr a, const std::wstring& nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<LabelsToGraphNode<ElemType>>(net.GetDeviceId(), nodeName), { a });
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::DynamicAxis(const ComputationNodePtr a, const std::wstring& nodeName)
{

View File

@ -126,11 +126,12 @@ public:
ComputationNodePtr CosDistance(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr CrossEntropy(const ComputationNodePtr label, const ComputationNodePtr prediction, const std::wstring nodeName = L"");
ComputationNodePtr CrossEntropyWithSoftmax(const ComputationNodePtr label, const ComputationNodePtr prediction, const std::wstring nodeName = L"");
ComputationNodePtr ForwardBackward(const ComputationNodePtr label, const ComputationNodePtr prediction, int blankTokenId, int delayConstraint, const std::wstring nodeName = L"");
ComputationNodePtr DiagTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr Diagonal(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr Dropout(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr DummyCriterion(const ComputationNodePtr objectives, const ComputationNodePtr derivatives, const ComputationNodePtr prediction, const std::wstring nodeName = L"");
ComputationNodePtr EditDistanceError(const ComputationNodePtr a, const ComputationNodePtr b, float subPen, float delPen, float insPen, bool squashInputs, vector<size_t> samplesToIgnore, const std::wstring nodeName = L"");
ComputationNodePtr EditDistanceError(const ComputationNodePtr a, const ComputationNodePtr b, float subPen, float delPen, float insPen, bool squashInputs, vector<size_t> tokensToIgnore, const std::wstring nodeName = L"");
ComputationNodePtr ElementTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr DynamicAxis(const ComputationNodePtr a, const std::wstring& nodeName = L"");
ComputationNodePtr ClassificationError(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
@ -159,6 +160,7 @@ public:
ComputationNodePtr Negate(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr NoiseContrastiveEstimation(const ComputationNodePtr label, const ComputationNodePtr prediction, const ComputationNodePtr input_weight, const ComputationNodePtr input_bias, const std::wstring nodeName = L"", NCEEvalMode mode = NCEEvalMode::None);
ComputationNodePtr Pass(const ComputationNodePtr a, const std::wstring& nodeName = L"");
ComputationNodePtr LabelsToGraph(const ComputationNodePtr a, const std::wstring& nodeName = L"");
ComputationNodePtr PastValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, size_t timeStep, const std::wstring nodeName = L"");
ComputationNodePtr PerDimMeanVarDeNormalization(const ComputationNodePtr feature, const ComputationNodePtr mean, const ComputationNodePtr InvStdDev, const std::wstring nodeName = L"");
ComputationNodePtr PerDimMeanVarNormalization(const ComputationNodePtr feature, const ComputationNodePtr mean, const ComputationNodePtr InvStdDev, const std::wstring nodeName = L"");

View File

@ -943,31 +943,41 @@ void ComputationNetwork::PrintMemorySharingStructure(const vector<ComputationNod
size_t numUnshared = 0;
for (const auto& item : memSharingStructure)
{
if (item.second.size() < 2) // only print actually shared matrices
if (item.second.size() < 2) // unshared matrices
numUnshared++;
else
else // shared matrices
numShared++;
}
fprintf(stderr, "\nMemory Sharing: Out of %d matrices, %d are shared as %d, and %d are not shared.\n\n", (int)numMatrices, (int)(numMatrices - numUnshared), (int)numShared, (int)numUnshared);
fprintf(stderr, "\nMemory Sharing: Out of %d matrices, %d are shared as %d, and %d are not shared.\n", (int)numMatrices, (int)(numMatrices - numUnshared), (int)numShared, (int)numUnshared);
fprintf(stderr, "\nHere are the ones that share memory:\n");
for (const auto& item : memSharingStructure)
{
if (item.second.size() < 2) // only print actually shared matrices
continue;
// Format:
// { node1
// node2 }
// { node3
// node4
// node5 }
// where unshared nodes are not printed.
const char* delim = "\t{ ";
for (const auto& memShareInfo : item.second)
if (item.second.size() >= 2)
{
fprintf(stderr, "%s%ls", delim, memShareInfo.c_str());
delim = "\n\t ";
// Format:
// { node1
// node2 }
// { node3
// node4
// node5 }
const char* delim = "\t{ ";
for (const auto& memShareInfo : item.second)
{
fprintf(stderr, "%s%ls", delim, memShareInfo.c_str());
delim = "\n\t ";
}
fprintf(stderr, " }\n");
}
}
fprintf(stderr, "\nHere are the ones that don't share memory:\n");
for (const auto& item : memSharingStructure)
{
if (item.second.size() < 2)
{
fprintf(stderr, "\t{%ls}\n", item.second.begin()->c_str());
}
fprintf(stderr, " }\n");
}
fprintf(stderr, "\n");
}
@ -1003,7 +1013,7 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
// Due to special topology, if a node is solely induced by parameters, its function value should not be shared
MarkValueNonSharableNodes();
bool performingBackPropagation = (trainRootNode != nullptr) || (Globals::ShouldEnableHyperCompressMemory());
bool performingBackPropagation = (trainRootNode != nullptr);
// Construct the composite forward prop eval order by enumerating the
// nodes corresponding to each of our roots in global eval oder
@ -1062,6 +1072,7 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
}
}
m_matrixPool.ResetStepCounter();
set<ComputationNodeBasePtr> completedEvaluate;
for (auto& nodeIter : compositeForwardPropEvalOrder)
{
@ -1127,8 +1138,16 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
}
}
m_matrixPool.OptimizedMemoryAllocation();
m_areMatricesAllocated = true;
// TODO: At the time of AllocateAllMatrices we don't know the minibatch size. In theory one may allocate memory again once we start to receive
// data from the reader (and the minibatch size is known). For some problems, the minibatch size can change constantly, and there needs to be a
// tradeoff in deciding how frequently to run the optimized memory allocation. For now, we do it only once at the very beginning, for speed.
// TODO: when some matrices are sparse, the memory size request may be wrong. One may need to call OptimizedMemoryAllocation again later,
// once the sparse allocation and release requests are re-processed correctly. Future work.
// print the memory sharing structure
if (TraceLevel() > 0)
PrintMemorySharingStructure(GetAllNodes());
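// Sketch of the new two-phase allocation protocol implied above (flow only,
// not literal code): during the composite forward/backward walk each node
// records its requests; real buffers are assigned only afterwards.
//
//   m_matrixPool.ResetStepCounter();
//   // per node, in execution order:
//   //   RequestMatricesBeforeForwardProp() -> matrixPool.RequestAllocate(...)
//   //   ReleaseMatricesAfterForwardProp()  -> matrixPool.RequestRelease(...)
//   m_matrixPool.OptimizedMemoryAllocation(); // solve, then patch the pointers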

View File

@ -626,14 +626,16 @@ template <class ElemType>
// 'transpose' means print one row per sample (non-transposed is one column per sample).
// 'isSparse' will print all non-zero values as one row (non-transposed, which makes sense for one-hot) or column (transposed).
template <class ElemType>
void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, const FrameRange& fr,
void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f,
const FrameRange& fr,
size_t onlyUpToRow, size_t onlyUpToT, bool transpose, bool isCategoryLabel, bool isSparse,
const vector<string>& labelMapping, const string& sequenceSeparator,
const string& sequencePrologue, const string& sequenceEpilogue,
const string& elementSeparator, const string& sampleSeparator,
string valueFormatString,
bool outputGradient,
bool onlyShowAbsSumForDense) const
bool onlyShowAbsSumForDense,
std::function<std::string(size_t)> getKeyById) const
{
// get minibatch matrix -> matData, matRows, matStride
const Matrix<ElemType>& outputValues = outputGradient ? Gradient() : Value();
@ -716,6 +718,8 @@ void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f, const Fram
if (s > 0)
fprintfOrDie(f, "%s", sequenceSeparator.c_str());
if (getKeyById)
fprintfOrDie(f, "%s ", getKeyById(seqInfo.seqId).c_str());
fprintfOrDie(f, "%s", seqProl.c_str());
// output it according to our format specification
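// A hedged example of the new getKeyById hook (the lambda below is
// illustrative): the writer prefixes every sequence with a reader-side key.
//
//   node->WriteMinibatchWithFormatting(stderr, FrameRange(), SIZE_MAX, SIZE_MAX,
//       false /*transpose*/, false /*isCategoryLabel*/, false /*isSparse*/,
//       {}, "\n", "", "", " ", "\n", "%.8f",
//       false /*outputGradient*/, false /*onlyShowAbsSumForDense*/,
//       [](size_t seqId) { return "seq-" + std::to_string(seqId); });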

View File

@ -791,8 +791,7 @@ public:
void SetOutputNeededDuringBackprop(bool f) { m_outputNeededDuringBackprop = f; }
bool IsOutputNeededDuringBackprop() const
{
return (!Globals::ShouldEnableShareNodeValueMatrices() && !Globals::ShouldEnableHyperCompressMemory())
|| m_outputNeededDuringBackprop;
return !Globals::ShouldEnableShareNodeValueMatrices() || m_outputNeededDuringBackprop;
}
// -----------------------------------------------------------------------
@ -1680,20 +1679,6 @@ public:
#endif
// tracing
Trace();
// Any memory not needed can be resized to zero immediately when HyperCompressMemory is active. Since the memory won't really be released,
// all these memory blocks are gathered into a memory pool. When the next request comes, the best-fitting block will be chosen.
if (Globals::ShouldEnableHyperCompressMemory())
{
for (auto& input : GetInputs())
{
if (!input->IsOutputNeededDuringBackprop() && input->IsValueSharable())
{
auto inputNodePtr = DownCast(input);
inputNodePtr->Value().Resize(0, 0);
}
}
}
}
virtual void /*IComputationNode::*/BeginBackprop() override
@ -1728,9 +1713,9 @@ public:
}
}
#ifdef _DEBUG
virtual void /*IComputationNode::*/ EndBackprop() override
{
#ifdef _DEBUG
Base::EndBackprop();
#ifdef TRACK_GAP_NANS
for (size_t i = 0; i < m_inputs.size(); i++)
@ -1744,18 +1729,8 @@ public:
}
}
#endif
#endif
// We could release the gradients of value-sharable nodes and all no-longer-used memory generated in the forward pass.
if (IsValueSharable() && Globals::ShouldEnableHyperCompressMemory())
{
if (GradientPtr())
Gradient().Resize(0, 0);
// canceling the graph dependency
if (IsOutputNeededDuringBackprop())
Value().Resize(0, 0);
}
}
#endif
// this is the entry point from Network; while it will call virtual BackpropTo() into the actual node implementation
// TODO: move to -Base (or -Network?)
@ -1816,10 +1791,12 @@ public:
}
// request matrices needed to do node function value evaluation
// for memory pool utilization optimization, the requested pointer is not immediately usable until the entire network has gone through all requests
virtual void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override
{
size_t matrixSize = m_sampleLayout.GetNumElements();
if (IsValueSharable())
RequestMatrixFromPool(m_value, matrixPool);
RequestMatrixFromPool(m_value, matrixPool, matrixSize, HasMBLayout());
else
CreateMatrixIfNull(m_value);
}
@ -1844,7 +1821,8 @@ public:
// request matrices that are needed for gradient computation
virtual void RequestMatricesBeforeBackprop(MatrixPool& matrixPool) override
{
RequestMatrixFromPool(m_gradient, matrixPool);
size_t matrixSize = m_sampleLayout.GetNumElements();
RequestMatrixFromPool(m_gradient, matrixPool, matrixSize, HasMBLayout());
}
// release gradient and temp matrices that no longer needed after all the children's gradients are computed.
@ -1889,18 +1867,20 @@ protected:
matrixPtr = make_shared<Matrix<ElemType>>(m_deviceId);
}
void RequestMatrixFromPool(shared_ptr<Matrix<ElemType>>& matrixPtr, MatrixPool& matrixPool)
// matrixSize is the per-sample size; if unknown or hard to estimate, set matrixSize = 0
// if the matrix's size will scale with minibatch size, set mbScale = true
void RequestMatrixFromPool(shared_ptr<Matrix<ElemType>>& matrixPtr, MatrixPool& matrixPool, size_t matrixSize=0, bool mbScale=false)
{
if (matrixPtr == nullptr)
{
matrixPtr = matrixPool.Request<ElemType>(m_deviceId);
matrixPool.RequestAllocate<ElemType>(m_deviceId, &matrixPtr, matrixSize, mbScale);
}
}
void ReleaseMatrixToPool(shared_ptr<Matrix<ElemType>>& matrixPtr, MatrixPool& matrixPool)
{
assert(matrixPtr != nullptr);
matrixPool.Release<ElemType>(matrixPtr);
matrixPool.RequestRelease<ElemType>(&matrixPtr);
}
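// Note the inversion relative to the old pool API (a sketch, assuming the new
// MatrixPool request/release protocol): the pointer handed to RequestAllocate
// is only a placeholder, and becomes the real (possibly shared) buffer once
// MatrixPool::OptimizedMemoryAllocation() has run.
//
//   shared_ptr<Matrix<ElemType>> m_temp;
//   RequestMatrixFromPool(m_temp, matrixPool, /*matrixSize=*/0, /*mbScale=*/false);
//   // ... further requests/releases are recorded ...
//   // after OptimizedMemoryAllocation(), m_temp aliases its assigned buffer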
public:
@ -1915,7 +1895,8 @@ public:
const std::vector<std::string>& labelMapping, const std::string& sequenceSeparator,
const std::string& sequencePrologue, const std::string& sequenceEpilogue, const std::string& elementSeparator,
const std::string& sampleSeparator, std::string valueFormatString,
bool outputGradient = false, bool onlyShowAbsSumForDense = false) const;
bool outputGradient = false, bool onlyShowAbsSumForDense = false,
std::function<std::string(size_t)> getKeyById = std::function<std::string(size_t)>()) const;
// simple helper to log the content of a minibatch
void DebugLogMinibatch(bool outputGradient = false) const

View File

@ -220,7 +220,8 @@ protected:
ImageLayoutKind m_imageLayout;
size_t m_maxTempMemSizeInSamples;
shared_ptr<Matrix<ElemType>> m_tempMatrix;
shared_ptr<Matrix<ElemType>> m_tempMatrixForward;
shared_ptr<Matrix<ElemType>> m_tempMatrixBackward;
std::unique_ptr<ConvolutionEngine<ElemType>> m_convEng;
};
@ -239,7 +240,8 @@ protected: \
using Base::m_transpose; \
using Base::m_imageLayout; \
using Base::m_maxTempMemSizeInSamples; \
using Base::m_tempMatrix; \
using Base::m_tempMatrixForward; \
using Base::m_tempMatrixBackward; \
using Base::m_convEng; \
using Base::InferReductionDims; \
public:
@ -351,13 +353,13 @@ public:
const Matrix<ElemType>& input0 = InputRef(0).ValueAsMatrix();
Matrix<ElemType> sliceInput1Value = InputRef(1).ValueFor(fr);
if (!m_transpose)
m_convEng->Forward(sliceInput1Value, input0, sliceOutputValue, *m_tempMatrix);
m_convEng->Forward(sliceInput1Value, input0, sliceOutputValue, *m_tempMatrixForward);
else
{
// BackwardData adds results to the output so need to zero them out first.
// REVIEW alexeyk: should be rolled into BackwardData itself.
sliceOutputValue.SetValue(0);
m_convEng->BackwardData(sliceInput1Value, input0, sliceOutputValue, /*accumulateGradient =*/ true, *m_tempMatrix);
m_convEng->BackwardData(sliceInput1Value, input0, sliceOutputValue, /*accumulateGradient =*/ true, *m_tempMatrixForward);
}
}
@ -369,20 +371,20 @@ public:
auto& grad = InputRef(0).GradientAsMatrix();
auto sliceInput1Value = InputRef(1).ValueFor(fr);
if (!m_transpose)
m_convEng->BackwardKernel(sliceOutputGrad, sliceInput1Value, grad, !Input(inputIndex)->ParentOverwritesGradient(), fr.IsAllFrames(), *m_tempMatrix);
m_convEng->BackwardKernel(sliceOutputGrad, sliceInput1Value, grad, !Input(inputIndex)->ParentOverwritesGradient(), fr.IsAllFrames(), *m_tempMatrixBackward);
else
m_convEng->BackwardKernel(sliceInput1Value, sliceOutputGrad, grad, !Input(inputIndex)->ParentOverwritesGradient(), fr.IsAllFrames(), *m_tempMatrix);
m_convEng->BackwardKernel(sliceInput1Value, sliceOutputGrad, grad, !Input(inputIndex)->ParentOverwritesGradient(), fr.IsAllFrames(), *m_tempMatrixBackward);
}
else if (inputIndex == 1) // derivative with respect to the input feature
{
auto& input0 = InputRef(0).ValueAsMatrix();
auto sliceInput1Grad = InputRef(1).GradientFor(fr);
if (!m_transpose)
m_convEng->BackwardData(sliceOutputGrad, input0, sliceInput1Grad, !Input(inputIndex)->ParentOverwritesGradient(), *m_tempMatrix);
m_convEng->BackwardData(sliceOutputGrad, input0, sliceInput1Grad, !Input(inputIndex)->ParentOverwritesGradient(), *m_tempMatrixBackward);
else
{
// REVIEW alexeyk: Forward overwrites values in sliceInput1Grad. Should handle correctly instead.
m_convEng->Forward(sliceOutputGrad, input0, sliceInput1Grad, *m_tempMatrix);
m_convEng->Forward(sliceOutputGrad, input0, sliceInput1Grad, *m_tempMatrixBackward);
}
}
}
@ -500,25 +502,26 @@ public:
void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override
{
Base::RequestMatricesBeforeForwardProp(matrixPool);
RequestMatrixFromPool(m_tempMatrix, matrixPool);
RequestMatrixFromPool(m_tempMatrixForward, matrixPool);
}
//void ReleaseMatricesAfterForwardProp(MatrixPool& matrixPool) override
//{
// Base::ReleaseMatricesAfterForwardProp(matrixPool);
// ReleaseMatrixToPool(m_tempMatrix, matrixPool);
//}
// m_tempMatrixForward is only used as a workspace for convolution; we can release it immediately afterwards
void ReleaseMatricesAfterForwardProp(MatrixPool& matrixPool) override
{
Base::ReleaseMatricesAfterForwardProp(matrixPool);
ReleaseMatrixToPool(m_tempMatrixForward, matrixPool);
}
//void RequestMatricesBeforeBackprop(MatrixPool& matrixPool) override
//{
// Base::RequestMatricesBeforeBackprop(matrixPool);
// RequestMatrixFromPool(m_tempMatrix, matrixPool);
//}
void RequestMatricesBeforeBackprop(MatrixPool& matrixPool) override
{
Base::RequestMatricesBeforeBackprop(matrixPool);
RequestMatrixFromPool(m_tempMatrixBackward, matrixPool);
}
void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) override
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
ReleaseMatrixToPool(m_tempMatrix, matrixPool);
ReleaseMatrixToPool(m_tempMatrixBackward, matrixPool);
}
void SetmMaxTempMemSizeInSamples(const size_t maxTempMemSizeInSamples)
@ -530,6 +533,8 @@ public:
bool IsConvolution2D() const { return m_convolution2D; }
bool OutputUsedInComputingInputNodesGradients() const override { return false; }
private:
using TransformerNode::m_transforms;
using ConvolutionNodeBase<ElemType>::ComputeFilterTransform;
@ -600,9 +605,12 @@ public:
void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override
{
Base::RequestMatricesBeforeForwardProp(matrixPool);
RequestMatrixFromPool(m_tempMatrix, matrixPool);
size_t matrixSize = m_sampleLayout.GetNumElements();
RequestMatrixFromPool(m_tempMatrix, matrixPool, matrixSize, true);
}
// m_tempMatrix cannot be released after Forward Prop because its content (argmax) is used for back prop.
void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) override
{
Base::ReleaseMatricesAfterBackprop(matrixPool);

View File

@ -461,7 +461,7 @@ template class NDCG1EvalNode<double>;
// Edit distance error evaluation node with the option of specifying penalty of substitution, deletion and insertion, as well as squashing the input sequences and ignoring certain samples.
// Using the classic DP algorithm as described in https://en.wikipedia.org/wiki/Edit_distance, adjusted to take into account the penalties.
//
// The node allows to squash sequences of repeating labels and ignore certain labels. For example, if squashInputs is true and samplesToIgnore contains label '-' then
// The node allows squashing sequences of repeating labels and ignoring certain labels. For example, if squashInputs is true and tokensToIgnore contains the label '-', then
// given the first input sequence s1="a-ab-" and the second s2="-aa--abb", the edit distance will be computed against s1' = "aab" and s2' = "aab".
//
// The returned error is computed as: EditDistance(s1,s2) * length(s1') / length(s1)
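// A self-contained sketch (not the node's actual member) of the weighted edit
// distance DP referenced above; the penalties correspond to subPen/delPen/insPen
// (requires <vector> and <algorithm>):
//
//   static float WeightedEditDistance(const std::vector<int>& s1,
//                                     const std::vector<int>& s2,
//                                     float subPen, float delPen, float insPen)
//   {
//       std::vector<std::vector<float>> d(s1.size() + 1,
//                                         std::vector<float>(s2.size() + 1, 0.0f));
//       for (size_t i = 1; i <= s1.size(); i++) d[i][0] = d[i - 1][0] + delPen;
//       for (size_t j = 1; j <= s2.size(); j++) d[0][j] = d[0][j - 1] + insPen;
//       for (size_t i = 1; i <= s1.size(); i++)
//           for (size_t j = 1; j <= s2.size(); j++)
//               d[i][j] = std::min({ d[i - 1][j - 1] + (s1[i - 1] == s2[j - 1] ? 0.0f : subPen),
//                                    d[i - 1][j] + delPen,     // delete from s1
//                                    d[i][j - 1] + insPen });  // insert into s1
//       return d[s1.size()][s2.size()];
//   }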
@ -480,21 +480,17 @@ public:
// delPen - deletion penalty
// insPen - insertion penalty
// squashInputs - whether to merge sequences of identical samples.
// samplesToIgnore - list of samples to ignore during edit distance evaluation
EditDistanceErrorNode(DEVICEID_TYPE deviceId, float subPen, float delPen, float insPen, bool squashInputs, std::vector<size_t> samplesToIgnore, const wstring & name)
: Base(deviceId, name), m_subPen(subPen), m_delPen(delPen), m_insPen(insPen), m_squashInputs(squashInputs), m_SamplesToIgnore(samplesToIgnore)
// tokensToIgnore - list of tokens to ignore during edit distance evaluation
EditDistanceErrorNode(DEVICEID_TYPE deviceId, const wstring & name, float subPen = 0.0f, float delPen = 0.0f, float insPen = 0.0f, bool squashInputs = false, vector<size_t> tokensToIgnore = {})
: Base(deviceId, name), m_SubPen(subPen), m_DelPen(delPen), m_InsPen(insPen), m_SquashInputs(squashInputs), m_tokensToIgnore(tokensToIgnore)
{
}
EditDistanceErrorNode(const ScriptableObjects::IConfigRecordPtr configp)
: EditDistanceErrorNode(configp->Get(L"deviceId"), configp->Get(L"subPen"), configp->Get(L"delPen"), configp->Get(L"insPen"), configp->Get(L"squashInputs"), configp->Get(L"samplesToIgnore"), L"<placeholder>")
: EditDistanceErrorNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"subPen"), configp->Get(L"delPen"), configp->Get(L"insPen"), configp->Get(L"squashInputs"), {})
{
AttachInputsFromConfig(configp, this->GetExpectedNumInputs());
}
EditDistanceErrorNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name)
{
m_tokensToIgnore = ScriptableObjects::ConfigArray::FlattenedVectorFrom<size_t>(configp->Get(L"tokensToIgnore"));
}
virtual void BackpropToNonLooping(size_t /*inputIndex*/) override
@ -515,7 +511,7 @@ public:
MaskMissingColumnsToZero(*m_maxIndexes0, Input(0)->GetMBLayout(), frameRange);
MaskMissingColumnsToZero(*m_maxIndexes1, Input(1)->GetMBLayout(), frameRange);
Value()(0, 0) = ComputeEditDistanceError(*m_maxIndexes0, *m_maxIndexes1, Input(0)->GetMBLayout(), m_subPen, m_delPen, m_insPen, m_squashInputs, m_SamplesToIgnore);
Value()(0, 0) = ComputeEditDistanceError(*m_maxIndexes0, *m_maxIndexes1, Input(0)->GetMBLayout(), m_SubPen, m_DelPen, m_InsPen, m_SquashInputs, m_tokensToIgnore);
}
virtual void Validate(bool isFinalValidationPass) override
@ -544,11 +540,11 @@ public:
node->m_maxIndexes0 = m_maxIndexes0;
node->m_maxIndexes1 = m_maxIndexes1;
node->m_maxValues = m_maxValues;
node->m_squashInputs = m_squashInputs;
node->m_subPen = m_subPen;
node->m_delPen = m_delPen;
node->m_insPen = m_insPen;
node->m_SamplesToIgnore = m_SamplesToIgnore;
node->m_SquashInputs = m_SquashInputs;
node->m_SubPen = m_SubPen;
node->m_DelPen = m_DelPen;
node->m_InsPen = m_InsPen;
node->m_tokensToIgnore = m_tokensToIgnore;
}
}
@ -578,9 +574,9 @@ public:
// delPen - deletion penalty
// insPen - insertion penalty
// squashInputs - whether to merge sequences of identical samples.
// samplesToIgnore - list of samples to ignore during edit distance evaluation
// tokensToIgnore - list of tokens to ignore during edit distance evaluation
static ElemType ComputeEditDistanceError(Matrix<ElemType>& firstSeq, const Matrix<ElemType> & secondSeq, MBLayoutPtr pMBLayout,
float subPen, float delPen, float insPen, bool squashInputs, const vector<size_t>& samplesToIgnore)
float subPen, float delPen, float insPen, bool squashInputs, const vector<size_t>& tokensToIgnore)
{
std::vector<int> firstSeqVec, secondSeqVec;
@ -614,8 +610,8 @@ public:
auto columnIndices = pMBLayout->GetColumnIndices(sequence);
ExtractSampleSequence(firstSeq, columnIndices, squashInputs, samplesToIgnore, firstSeqVec);
ExtractSampleSequence(secondSeq, columnIndices, squashInputs, samplesToIgnore, secondSeqVec);
ExtractSampleSequence(firstSeq, columnIndices, squashInputs, tokensToIgnore, firstSeqVec);
ExtractSampleSequence(secondSeq, columnIndices, squashInputs, tokensToIgnore, secondSeqVec);
//calculate edit distance
size_t firstSize = firstSeqVec.size();
@ -690,29 +686,29 @@ public:
return (ElemType)(wrongSampleNum * totalframeNum / totalSampleNum);
}
float SubstitutionPenalty() const { return m_subPen; }
float DeletionPenalty() const { return m_delPen; }
float InsertionPenalty() const { return m_insPen; }
bool SquashInputs() const { return m_squashInputs; }
std::vector<size_t> SamplesToIgnore() const { return m_SamplesToIgnore; }
float SubstitutionPenalty() const { return m_SubPen; }
float DeletionPenalty() const { return m_DelPen; }
float InsertionPenalty() const { return m_InsPen; }
bool SquashInputs() const { return m_SquashInputs; }
std::vector<size_t> TokensToIgnore() const { return m_tokensToIgnore; }
private:
shared_ptr<Matrix<ElemType>> m_maxIndexes0, m_maxIndexes1;
shared_ptr<Matrix<ElemType>> m_maxValues;
bool m_squashInputs;
float m_subPen;
float m_delPen;
float m_insPen;
std::vector<size_t> m_SamplesToIgnore;
bool m_SquashInputs;
float m_SubPen;
float m_DelPen;
float m_InsPen;
std::vector<size_t> m_tokensToIgnore;
// Clear out_SampleSeqVec and extract a vector of samples from the matrix into out_SampleSeqVec.
static void ExtractSampleSequence(const Matrix<ElemType>& firstSeq, vector<size_t>& columnIndices, bool squashInputs, const vector<size_t>& samplesToIgnore, std::vector<int>& out_SampleSeqVec)
static void ExtractSampleSequence(const Matrix<ElemType>& firstSeq, vector<size_t>& columnIndices, bool squashInputs, const vector<size_t>& tokensToIgnore, std::vector<int>& out_SampleSeqVec)
{
out_SampleSeqVec.clear();
// Get the first element in the sequence
size_t lastId = (int)firstSeq(0, columnIndices[0]);
if (std::find(samplesToIgnore.begin(), samplesToIgnore.end(), lastId) == samplesToIgnore.end())
if (std::find(tokensToIgnore.begin(), tokensToIgnore.end(), lastId) == tokensToIgnore.end())
out_SampleSeqVec.push_back(lastId);
// Remaining elements
@ -725,7 +721,7 @@ private:
if (lastId != refId)
{
lastId = refId;
if (std::find(samplesToIgnore.begin(), samplesToIgnore.end(), refId) == samplesToIgnore.end())
if (std::find(tokensToIgnore.begin(), tokensToIgnore.end(), refId) == tokensToIgnore.end())
out_SampleSeqVec.push_back(refId);
}
}
@ -735,7 +731,7 @@ private:
for (size_t i = 1; i < columnIndices.size(); i++)
{
auto refId = (int)firstSeq(0, columnIndices[i]);
if (std::find(samplesToIgnore.begin(), samplesToIgnore.end(), refId) == samplesToIgnore.end())
if (std::find(tokensToIgnore.begin(), tokensToIgnore.end(), refId) == tokensToIgnore.end())
out_SampleSeqVec.push_back(refId);
}
}

View File

@ -8,6 +8,8 @@
#include <string>
#include <stdexcept>
#include <vector>
#include <set>
#include <utility>
#include <algorithm>
#include <stdlib.h>
@ -17,59 +19,238 @@
namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
struct MemRequestInfo
{
DEVICEID_TYPE deviceId; // which device to allocate data
shared_ptr<Matrix<ElemType>>*pMatrixPtr; // memory pointer
size_t matrixSize; // memory size
bool mbScale; // whether the memory shall be scaled by minibatch size
int allocStep; // at what step counter memory allocation is requested
int releaseStep; // at what step counter memory release is requested
int memoryId; // integer indexing the memory buffer ID
MemRequestInfo(DEVICEID_TYPE deviceId, shared_ptr<Matrix<ElemType>>*pMatrixPtr, size_t matrixSize, bool mbScale, int allocStep)
:deviceId(deviceId), pMatrixPtr(pMatrixPtr), matrixSize(matrixSize), mbScale(mbScale), allocStep(allocStep), releaseStep(INT_MAX), memoryId(-1)
{
}
void SetReleaseStep(int step) { releaseStep = step; }
void SetMemoryId(int id) { memoryId = id; }
};
template <class ElemType>
struct greater_than_mem_req_size
{
inline bool operator() (const MemRequestInfo<ElemType>& info1, const MemRequestInfo<ElemType>& info2)
{
return (info1.matrixSize > info2.matrixSize);
}
};
struct MemAllocInfo
{
int memoryId;
size_t memorySize;
vector<pair<int, int>> occupancy;
MemAllocInfo(int memoryId, size_t memorySize, vector<pair<int, int>> occ)
:memoryId(memoryId), memorySize(memorySize), occupancy(occ)
{
}
};
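// Worked example of the bookkeeping above (illustrative): a request allocated
// at step 3 and released at step 7 occupies the closed interval [3, 7]. Two
// requests may share one buffer (the same memoryId) only if their occupancy
// intervals do not overlap: [3, 7] and [8, 12] can share, while [3, 7] and
// [5, 9] cannot.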
// MatrixPool -- class to support memory sharing
// Despite the rather general name of this class, it is specifically designed to support the memory sharing of ComputationNodes.
// Note: see #define SUPRESS_MEMSHARING below for how to temporarily disable memory sharing altogether, for debugging
class MatrixPool
{
vector<shared_ptr<Matrix<float>>> m_releasedFloatMatrices;
vector<shared_ptr<Matrix<double>>> m_releasedDoubleMatrices;
vector<MemRequestInfo<float>> m_memRequestInfoFloatVec;
vector<MemRequestInfo<double>> m_memRequestInfoDoubleVec;
set<DEVICEID_TYPE> m_deviceIDSet;
int m_stepCounter;
template <class ElemType>
vector<shared_ptr<Matrix<ElemType>>>& GetReleasedMatrices();
vector<MemRequestInfo<ElemType>>& GetMemRequestInfoVec();
public:
// release here means the matrix can be put back and shared by others
template <class ElemType>
void Release(shared_ptr<Matrix<ElemType>> freeMatrix)
{
if (freeMatrix == nullptr || freeMatrix->GetMatrixType() == SPARSE)
LogicError("MatrixPool::Release: freeMatrix should not be null or sparse.");
//#define SUPRESS_MEMSHARING // #define this to disable memory sharing through this structure
// TODO: Make this a runtime option.
#ifndef SUPRESS_MEMSHARING
vector<shared_ptr<Matrix<ElemType>>>& releasedMatrices = GetReleasedMatrices<ElemType>();
#ifdef _DEBUG
for (int i = 0; i < releasedMatrices.size(); i++)
{
if (releasedMatrices[i] == freeMatrix)
RuntimeError("MatrixPool::Release: freeMatrix is already in the released pool.");
}
void ResetStepCounter() { m_stepCounter = 0; };
#endif
releasedMatrices.push_back(freeMatrix);
#endif
template <class ElemType>
void RequestRelease(shared_ptr<Matrix<ElemType>> *pMatrixPtr)
{
vector<MemRequestInfo<ElemType>>& memInfoVec = GetMemRequestInfoVec<ElemType>();
// iterate through the vector and find the request entry for this pointer
for (auto& memInfo : memInfoVec)
{
if (memInfo.pMatrixPtr == pMatrixPtr)
{
memInfo.SetReleaseStep(m_stepCounter);
break;
}
}
m_stepCounter++;
}
template <class ElemType>
shared_ptr<Matrix<ElemType>> Request(DEVICEID_TYPE deviceId)
void RequestAllocate(DEVICEID_TYPE deviceId, shared_ptr<Matrix<ElemType>>*pMatrixPtr, size_t matrixSize, bool mbScale)
{
vector<shared_ptr<Matrix<ElemType>>>& releasedMatrices = GetReleasedMatrices<ElemType>();
shared_ptr<Matrix<ElemType>> matrixPtr;
if (releasedMatrices.empty())
vector<MemRequestInfo<ElemType>>& memInfoVec = GetMemRequestInfoVec<ElemType>();
MemRequestInfo<ElemType> memInfo(deviceId, pMatrixPtr, matrixSize, mbScale, m_stepCounter);
memInfoVec.push_back(memInfo);
m_deviceIDSet.insert(deviceId);
m_stepCounter++;
// assign a temporary pointer; it will be replaced later unless the matrix is sparse
*pMatrixPtr = make_shared<Matrix<ElemType>>(deviceId);
}
void OptimizedMemoryAllocation()
{
// MatrixPool is not templated, so we call both float and double versions here
OptimizedMemoryAllocationFunc<float>();
OptimizedMemoryAllocationFunc<double>();
return;
}
private:
bool CheckOverlap(pair<int, int> occ, vector<pair<int, int>>& occVec)
{
bool bRet = false;
for (auto& o : occVec)
{
matrixPtr = make_shared<Matrix<ElemType>>(deviceId);
if (occ.first <= o.second && occ.second >= o.first)
{
bRet = true;
break;
}
}
else
//#define SUPRESS_MEMSHARING // #define this to disable memory sharing by always returning true
// TODO: Make this a runtime option.
#ifdef SUPRESS_MEMSHARING
bRet = true;
#endif
return bRet;
}
template <class ElemType>
void OptimizedMemoryAllocationFunc()
{
vector<MemRequestInfo<ElemType>>& memInfoVec = GetMemRequestInfoVec<ElemType>();
if (memInfoVec.empty())
return;
// remove all requests that have been marked as sparse matrices; those will not participate in memory sharing
for (auto iter = memInfoVec.begin(); iter != memInfoVec.end(); )
{
matrixPtr = releasedMatrices.back();
releasedMatrices.pop_back();
if ((*(iter->pMatrixPtr))->GetMatrixType() == SPARSE)
memInfoVec.erase(iter);
else
iter++;
}
if (!matrixPtr) // this can't really happen
LogicError("MatrixPool::Request: failed to get a valid matrix.");
// sort the memory request from largest size to smallest
std::sort(memInfoVec.begin(), memInfoVec.end(), greater_than_mem_req_size<ElemType>());
return matrixPtr;
for (auto& devId : m_deviceIDSet)
{
// memAllocInfoVec is a sorted list of memory allocations from smallest to largest in memory size
vector<MemAllocInfo> memAllocInfoVec;
int memoryCounter = 0;
            // we start with memory requests that scale with the minibatch size (usually those require larger memory)
for (auto& memInfo : memInfoVec)
{
// check if it's the proper device
if (memInfo.deviceId != devId || !memInfo.mbScale)
continue;
if (!memAllocInfoVec.empty())
{
// since we assign from highest memory to lowest, every memory that has been allocated can accommodate the
// current memory request, unless there is a conflict (overlap)
auto iter = memAllocInfoVec.begin();
while (iter != memAllocInfoVec.end() && CheckOverlap(make_pair(memInfo.allocStep, memInfo.releaseStep), iter->occupancy))
iter++;
if (iter == memAllocInfoVec.end())
{
// no current memory can be assigned, need to create a new one
vector<pair<int, int>> occ;
occ.push_back(make_pair(memInfo.allocStep, memInfo.releaseStep));
MemAllocInfo ma(memoryCounter, memInfo.matrixSize, occ);
// insert in the front of the vector to maintain sorted order
memAllocInfoVec.insert(memAllocInfoVec.begin(), ma);
memInfo.SetMemoryId(memoryCounter);
memoryCounter++;
}
else
{
iter->occupancy.push_back(make_pair(memInfo.allocStep, memInfo.releaseStep));
memInfo.SetMemoryId(iter->memoryId);
}
}
else
{
vector<pair<int, int>> occ;
occ.push_back(make_pair(memInfo.allocStep, memInfo.releaseStep));
MemAllocInfo ma(memoryCounter, memInfo.matrixSize, occ);
memAllocInfoVec.push_back(ma);
memInfo.SetMemoryId(memoryCounter);
memoryCounter++;
}
}
            // rescan the request list, and this time allocate for those that don't depend on the minibatch size
for (auto& memInfo : memInfoVec)
{
// check if it's the proper device
if (memInfo.deviceId != devId || memInfo.mbScale)
continue;
if (!memAllocInfoVec.empty())
{
// the memory allocation vector is sorted by size. We find the largest available buffer that doesn't have time overlap
auto workingAlloc = memAllocInfoVec.end();
for (auto iter = memAllocInfoVec.begin(); iter != memAllocInfoVec.end(); iter++)
{
if (!CheckOverlap(make_pair(memInfo.allocStep, memInfo.releaseStep), iter->occupancy))
workingAlloc = iter;
}
if (workingAlloc == memAllocInfoVec.end()) // nothing works
{
vector<pair<int, int>> occ;
occ.push_back(make_pair(memInfo.allocStep, memInfo.releaseStep));
MemAllocInfo ma(memoryCounter, memInfo.matrixSize, occ);
memAllocInfoVec.push_back(ma); // add as the last one
memInfo.SetMemoryId(memoryCounter);
memoryCounter++;
}
else
{
workingAlloc->occupancy.push_back(make_pair(memInfo.allocStep, memInfo.releaseStep));
memInfo.SetMemoryId(workingAlloc->memoryId);
}
}
else
{
vector<pair<int, int>> occ;
occ.push_back(make_pair(memInfo.allocStep, memInfo.releaseStep));
MemAllocInfo ma(memoryCounter, memInfo.matrixSize, occ);
memAllocInfoVec.push_back(ma);
memInfo.SetMemoryId(memoryCounter);
memoryCounter++;
}
}
// now assign the actual pointers
for (int i = 0; i < memoryCounter; i++)
{
auto matrixPtr = make_shared<Matrix<ElemType>>(devId);
if (!matrixPtr) // this can't really happen, because we haven't started allocating memory yet
LogicError("MatrixPool: failed to get a valid matrix.");
for (auto& memInfo : memInfoVec)
{
if (memInfo.deviceId == devId && memInfo.memoryId == i)
*memInfo.pMatrixPtr = matrixPtr;
}
}
}
}
};
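
The sharing decision above reduces to interval overlap on [allocStep, releaseStep] lifetimes: two requests may share one physical buffer iff their lifetimes do not overlap. A minimal self-contained sketch of the same greedy assignment (illustrative only, not part of this commit):

#include <cstdio>
#include <utility>
#include <vector>

using Interval = std::pair<int, int>; // [allocStep, releaseStep]

static bool Overlaps(Interval a, const std::vector<Interval>& occupied)
{
    for (const auto& o : occupied)
        if (a.first <= o.second && a.second >= o.first) // same test as MatrixPool::CheckOverlap
            return true;
    return false;
}

int main()
{
    // lifetimes of four matrix requests, assumed already sorted by decreasing size
    std::vector<Interval> requests = { { 0, 5 }, { 1, 3 }, { 6, 9 }, { 4, 8 } };
    std::vector<std::vector<Interval>> buffers; // occupancy list per physical buffer

    for (const auto& req : requests)
    {
        size_t b = 0;
        while (b < buffers.size() && Overlaps(req, buffers[b]))
            b++;
        if (b == buffers.size())
            buffers.emplace_back(); // no compatible buffer, create a new one
        buffers[b].push_back(req);
        std::printf("request [%d, %d] -> buffer %zu\n", req.first, req.second, b);
    }
    std::printf("%zu buffers serve %zu requests\n", buffers.size(), requests.size());
    return 0;
}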

View file

@ -149,6 +149,7 @@ DeclareUnaryElementWiseWithOpCodeNode(Floor, Floor, None,
DeclareUnaryElementWiseWithOpCodeNode(Log, Log, ElementwiseProductWithLogDerivativeFromOutput, binaryWithOutputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Negate, Negate, Negate, unaryGradient);
DeclareUnaryElementWiseWithOpCodeNode(Pass, Copy, Copy, unaryGradient);
DeclareUnaryElementWiseWithOpCodeNode(LabelsToGraph, Copy, Copy, unaryGradient);
DeclareUnaryElementWiseWithOpCodeNode(Reciprocal, Reciprocal, ElementwiseProductWithReciprocalDerivative, binaryWithOutputGradient);
DeclareUnaryElementWiseWithOpCodeNode(RectifiedLinear, LinearRectifier, ElementwiseProductWithLinearRectifierDerivativeFromOutput, binaryWithOutputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Sigmoid, Sigmoid, ElementwiseProductWithSigmoidDerivativeFromOutput, binaryWithOutputGradient);

View file

@ -75,11 +75,9 @@ public:
ReleaseMatrixToPool(m_transposedOutput, matrixPool);
ReleaseMatrixToPool(m_transposedDInput, matrixPool);
ReleaseMatrixToPool(m_transposedDOutput, matrixPool);
#if 0
ReleaseMatrixToPool(m_reserve, matrixPool);
ReleaseMatrixToPool(m_workspace, matrixPool);
ReleaseMatrixToPool(m_packingIndex, matrixPool);
#endif
}
virtual bool OutputUsedInComputingInputNodesGradients() const { return false; }

View file

@ -7,6 +7,7 @@
#include "Basics.h"
#include "ComputationNode.h"
#include "gammacalculation.h"
#include "NonlinearityNodes.h"
#include <map>
#include <string>
@ -611,7 +612,7 @@ public:
RequestMatrixFromPool(m_gammaFromLattice, matrixPool);
}
// request matrices needed to do node function value evaluation
// release gradient and temp matrices that are no longer needed after all the children's gradients are computed.
virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
@ -722,10 +723,7 @@ public:
}
}
virtual bool OutputUsedInComputingInputNodesGradients() const override
{
return false;
}
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override
{
@ -765,4 +763,192 @@ public:
template class DummyCriterionNode<float>;
template class DummyCriterionNode<double>;
// -----------------------------------------------------------------------
// ForwardBackwardNode (graph, prediction, delayConstraint)
// CTC training criterion, primarily based on the paper "Connectionist Temporal Classification: Labelling Unsegmented
// Sequence Data with Recurrent Neural Networks", ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf
//
// delayConstraint -- label output delay constraint introduced during training that allows shorter delay during inference. It uses the original time information to enforce that CTC tokens only get aligned within a time margin.
//                    Setting this parameter smaller will result in shorter delay between label outputs during decoding, but may hurt accuracy.
//                    delayConstraint=-1 means no constraint
// -----------------------------------------------------------------------
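// For reference, a restatement (assuming the standard formulation of the paper cited above) of the forward
// recursion this node evaluates in the log domain, over the blank-augmented label sequence l' of length 2|l|+1,
// with network output y^t_k at frame t:
//
//   \alpha_t(s) = \begin{cases}
//       (\alpha_{t-1}(s) + \alpha_{t-1}(s-1)) \, y^t_{l'_s}                      & \text{if } l'_s = \text{blank or } l'_s = l'_{s-2} \\
//       (\alpha_{t-1}(s) + \alpha_{t-1}(s-1) + \alpha_{t-1}(s-2)) \, y^t_{l'_s}  & \text{otherwise}
//   \end{cases}
//
// The beta recursion is the time-reversed analogue, and the criterion value is -log p(l|x) with
// p(l|x) = \alpha_T(2|l|+1) + \alpha_T(2|l|).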
template<class ElemType>
class ForwardBackwardNode : public ComputationNodeNonLooping<ElemType>, public NumInputs<2>
{
typedef ComputationNodeNonLooping<ElemType> Base;
UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName()
{
return L"ForwardBackward";
}
public:
DeclareConstructorFromConfigWithNumInputs(ForwardBackwardNode);
ForwardBackwardNode(DEVICEID_TYPE deviceId, const wstring & name, int blankTokenId=INT_MIN, int delayConstraint=-1) :
Base(deviceId, name), m_blankTokenId(blankTokenId), m_delayConstraint(delayConstraint)
{
}
// Compute gradients to input observations, the weights to the observations, and the class log posterior probabilities
virtual void BackpropToNonLooping(size_t inputIndex) override
{
// Left node must be a scalar
if (inputIndex == 0) //left derivative
{
BackpropToLeft(*m_logSoftmaxOfRight, InputRef(inputIndex).Gradient(), Gradient());
}
else if (inputIndex == 1)
{
FrameRange frameRange(InputRef(0).GetMBLayout());
BackpropToRight(*m_softmaxOfRight, InputRef(inputIndex).Gradient(), Gradient(), *m_CTCposterior);
InputRef(inputIndex).MaskMissingGradientColumnsToZero(frameRange);
}
else
RuntimeError("ForwardBackwardNode criterion expects only two inputs: labels and network output.");
}
void BackpropToLeft(const Matrix<ElemType>& logSoftmaxOfRight, Matrix<ElemType>& inputGradientValues,
const Matrix<ElemType>& gradientValues)
{
#if DUMPOUTPUT
logSoftmaxOfRight.Print("ForwardBackwardNode Partial-logSoftmaxOfRight");
gradientValues.Print("ForwardBackwardNode Partial-gradientValues");
inputGradientValues.Print("ForwardBackwardNode Partial-Left-in");
#endif
Matrix<ElemType>::ScaleAndAdd(-gradientValues.Get00Element(), logSoftmaxOfRight, inputGradientValues);
#if DUMPOUTPUT
inputGradientValues.Print("ForwardBackwardNode Partial-Left-out");
#endif
}
void BackpropToRight(const Matrix<ElemType>& softmaxOfRight, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues,
const Matrix<ElemType> &CTCposterior)
{
#if DUMPOUTPUT
softmaxOfRight.Print("ForwardBackwardNode Partial-softmaxOfRight");
inputFunctionValues.Print("ForwardBackwardNode Partial-inputFunctionValues");
gradientValues.Print("ForwardBackwardNode Partial-gradientValues");
inputGradientValues.Print("ForwardBackwardNode Partial-Right-in");
#endif
// inputGradientValues+= gradientValues*(softmaxOfRight - CTCposterior)
Matrix<ElemType>::AddScaledDifference(gradientValues, softmaxOfRight, CTCposterior, inputGradientValues);
#if DUMPOUTPUT
inputGradientValues.Print("ForwardBackwardNode Partial-Right");
#endif
}
virtual bool OutputUsedInComputingInputNodesGradients() const override
{
return false;
}
virtual void ForwardPropNonLooping() override
{
m_logSoftmaxOfRight->AssignLogSoftmaxOf(InputRef(1).Value(), true);
m_softmaxOfRight->SetValue(*m_logSoftmaxOfRight);
m_softmaxOfRight->InplaceExp();
m_CTCposterior->SwitchToMatrixType(m_softmaxOfRight->GetMatrixType(), m_softmaxOfRight->GetFormat(), false);
m_CTCposterior->Resize(m_softmaxOfRight->GetNumRows(), m_softmaxOfRight->GetNumCols());
FrameRange fr(InputRef(0).GetMBLayout());
InputRef(0).ValueFor(fr).VectorMax(*m_maxIndexes, *m_maxValues, true);
// compute CTC score
m_GammaCal.doCTC(Value(), *m_logSoftmaxOfRight, *m_maxIndexes, *m_maxValues, *m_CTCposterior, InputRef(0).GetMBLayout(), m_blankTokenId, m_delayConstraint);
#if NANCHECK
functionValues.HasNan("ForwardBackwardNode");
#endif
#if DUMPOUTPUT
functionValues.Print("ForwardBackwardNode");
#endif
}
virtual void /*ComputationNodeBase::*/Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
m_pMBLayout = nullptr; // no layout
if (isFinalValidationPass)
{
if (!(Input(0)->GetSampleMatrixNumRows() == Input(1)->GetSampleMatrixNumRows() && // match vector dimension
Input(0)->HasMBLayout() &&
Input(0)->GetMBLayout() == Input(1)->GetMBLayout()))
{
LogicError("The Matrix dimension in the ForwardBackwardNode operation does not match.");
}
auto leftNode = dynamic_pointer_cast<LabelsToGraphNode<ElemType>>(Input(0));
if (!leftNode)
LogicError("ForwardBackwardNode: Please pass LabelsToGraph(labels) as the first argument.");
}
SetDims(TensorShape(1), false);
}
virtual void CopyTo(const ComputationNodePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const
{
Base::CopyTo(nodeP, newName, flags);
if (flags & CopyNodeFlags::copyNodeValue)
{
auto node = dynamic_pointer_cast<ForwardBackwardNode<ElemType>>(nodeP);
node->m_logSoftmaxOfRight->SetValue(*m_logSoftmaxOfRight);
node->m_softmaxOfRight->SetValue(*m_softmaxOfRight);
node->m_CTCposterior->SetValue(*m_CTCposterior);
node->m_maxIndexes->SetValue(*m_maxIndexes);
node->m_maxValues->SetValue(*m_maxValues);
node->m_delayConstraint = m_delayConstraint;
}
}
// request matrices needed to do node function value evaluation
virtual void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool)
{
Base::RequestMatricesBeforeForwardProp(matrixPool);
RequestMatrixFromPool(m_logSoftmaxOfRight, matrixPool);
RequestMatrixFromPool(m_softmaxOfRight, matrixPool);
RequestMatrixFromPool(m_CTCposterior, matrixPool);
RequestMatrixFromPool(m_maxIndexes, matrixPool);
RequestMatrixFromPool(m_maxValues, matrixPool);
}
virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
ReleaseMatrixToPool(m_logSoftmaxOfRight, matrixPool);
ReleaseMatrixToPool(m_softmaxOfRight, matrixPool);
ReleaseMatrixToPool(m_CTCposterior, matrixPool);
ReleaseMatrixToPool(m_maxIndexes, matrixPool);
ReleaseMatrixToPool(m_maxValues, matrixPool);
}
virtual void UpdateFunctionMBSize() override
{
Base::UpdateFunctionMBSize();
size_t cols = Input(0)->Value().GetNumCols();
m_maxIndexes->Resize(1, cols);
m_maxValues->Resize(1, cols);
}
protected:
virtual bool NodeDoesItsOwnCustomizedMissingColumnsMasking() { return true; }
shared_ptr<Matrix<ElemType>> m_logSoftmaxOfRight;
shared_ptr<Matrix<ElemType>> m_softmaxOfRight;
shared_ptr<Matrix<ElemType>> m_CTCposterior;
shared_ptr<Matrix<ElemType>> m_maxIndexes;
shared_ptr<Matrix<ElemType>> m_maxValues;
msra::lattices::GammaCalculation<ElemType> m_GammaCal;
int m_blankTokenId;
int m_delayConstraint;
};
template class ForwardBackwardNode<float>;
template class ForwardBackwardNode<double>;
} } }

View file

@ -219,6 +219,14 @@ public:
RequestMatrixFromPool(m_softmaxOfRight, matrixPool);
}
// release gradient and temp matrices that are no longer needed after all the children's gradients are computed.
virtual void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool)
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
ReleaseMatrixToPool(m_logSoftmaxOfRight, matrixPool);
ReleaseMatrixToPool(m_softmaxOfRight, matrixPool);
}
protected:
shared_ptr<Matrix<ElemType>> m_logSoftmaxOfRight;
shared_ptr<Matrix<ElemType>> m_softmaxOfRight;

View file

@ -41,7 +41,6 @@ void CNTKEvalBase<ElemType>::Init(const std::string& config)
CPUMatrix<ElemType>::SetNumThreads(nThreads);
Globals::SetShareNodeValueMatrices(m_config(L"shareNodeValueMatrices", true));
Globals::SetHyperCompressMemory(m_config(L"hyperCompressMemory", false));
}

View file

@ -60,8 +60,8 @@
</ClCompile>
<Link>
<AdditionalLibraryDirectories>$(OutDir)</AdditionalLibraryDirectories>
<AdditionalDependencies>EvalDLL.lib;Common.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<DelayLoadDLLs>EvalDll.dll</DelayLoadDLLs>
<AdditionalDependencies>EvalDLL.lib;Math.lib;Common.lib;$(MSMPI_LIB64)msmpi.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<DelayLoadDLLs>EvalDll.dll;Math.dll</DelayLoadDLLs>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(DebugBuild)">

View file

@ -5873,6 +5873,166 @@ void CPUMatrix<ElemType>::RCRFBackwardCompute(const CPUMatrix<ElemType>& alpha,
}
};
template<class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignCTCScore(
const CPUMatrix<ElemType>& prob, CPUMatrix<ElemType>& alpha, CPUMatrix<ElemType>& beta,
const CPUMatrix<ElemType>& phoneSeq, const CPUMatrix<ElemType>& phoneBoundary, ElemType &totalScore, const std::vector<size_t>& uttMap, const std::vector<size_t> & uttBeginFrame, const std::vector<size_t> & uttFrameNum,
const std::vector<size_t> & uttPhoneNum, const size_t samplesInRecurrentStep, const size_t maxFrameNum, const int delayConstraint, const bool isColWise)
{
// Column wise representation of sequences in input matrices (each column is one sequence/utterance)
if (isColWise)
{
vector<size_t> curPhoneSeq;
auto &us = *this;
size_t s, s2;
size_t senoneid, t;
ElemType ascore;
double x, y;
size_t senonenum, frameNum;
for (size_t uttId = 0; uttId < uttFrameNum.size(); uttId++) {
senonenum = uttPhoneNum[uttId];
frameNum = uttFrameNum[uttId];
// Populate the utterance's phone sequence
// Using a loop instead of memcpy for clarity
curPhoneSeq.clear(); // reset between utterances, otherwise phone IDs accumulate across iterations
curPhoneSeq.reserve(senonenum);
for (size_t i = 0; i < senonenum; i++)
curPhoneSeq.push_back((size_t)phoneSeq(i, uttId));
if (frameNum > 1)
{
//initialize alpha
for (s = 1; s < 3; s++)
{
senoneid = curPhoneSeq[s];
alpha(s, 0) = prob(senoneid, 0);
}
alpha(senonenum - 1, 0) = LZERO;
//initialize beta
for (s = senonenum - 3; s < senonenum - 1; s++)
{
senoneid = curPhoneSeq[s];
beta(s, frameNum - 1) = prob(senoneid, frameNum - 1);
}
beta(senonenum - 1, frameNum - 1) = LZERO;
//cal alpha
for (t = 1; t < frameNum; t++)
{
for (s = 1; s < senonenum - 1; s++)
{
senoneid = curPhoneSeq[s];
x = LZERO;
for (s2 = s - 1; s2 <= s; s2++)
{
if (s2 > 0)
{
y = alpha(s2, t - 1);
x = LogAddD(x, y);
}
}
if (senoneid != prob.GetNumRows() - 1 && s - 2 > 0 && senoneid != curPhoneSeq[s - 2])
{
y = alpha(s - 2, t - 1);
x = LogAddD(x, y);
}
if (senoneid != SIZE_MAX)
ascore = prob(senoneid, t);
else
ascore = 0;
alpha(s, t) = (float)x + ascore;
}
}
//exit senone
x = LZERO;
for (s2 = senonenum - 3; s2 < senonenum - 1; s2++)
{
y = alpha(s2, frameNum - 1);
x = LogAddD(x, y);
}
alpha(senonenum - 1, frameNum - 1) = (float)x;
totalScore = -alpha(senonenum - 1, frameNum - 1);
//cal beta
for (t = frameNum - 2; t >= 0; t--)
{
for (s = 1; s < senonenum - 1; s++)
{
senoneid = curPhoneSeq[s];
x = LZERO;
for (s2 = s; s2 <= s + 1; s2++)
{
if (s2 < senonenum - 1)
{
y = beta(s2, t + 1);
x = LogAddD(x, y);
}
}
if (senoneid != prob.GetNumRows() - 1 && s + 2 < senonenum - 1 && senoneid != curPhoneSeq[s + 2])
{
y = beta(s + 2, t + 1);
x = LogAddD(x, y);
}
if (senoneid != SIZE_MAX)
ascore = prob(senoneid, t);
else
ascore = 0;
beta(s, t) = (float)x + ascore;
}
if (t == 0)
break;
}
//entry senone
x = LZERO;
for (s2 = 1; s2 < 3; s2++)
{
y = beta(s2, 0);
x = LogAddD(x, y);
}
beta(0, 0) = (float)x;
for (t = 0; t < frameNum; t++)
{
//cal zt
double Zt = LZERO;
for (s = 1; s < senonenum - 1; s++)
{
senoneid = curPhoneSeq[s];
Zt = LogAddD(Zt, (alpha(s, t) + beta(s, t) - prob(senoneid, t)));
}
for (s = 1; s < senonenum - 1; s++)
{
senoneid = curPhoneSeq[s];
if (senoneid != SIZE_MAX)
{
ElemType logoccu = alpha(s, t) + beta(s, t) - prob(senoneid, t) - (float)Zt;
if (logoccu < LOG_OF_EPS_IN_LOG)
us(senoneid, t) += 0.0f;
else
us(senoneid, t) += exp(logoccu);
}
}
}
}
}
return *this;
}
else {
LogicError("Only ColWise minibatch layout is supported.");
}
return *this;
}
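
Both the recursion above and the CUDA kernels later in this commit accumulate probabilities with a numerically stable log-add (LogAddD / logaddk). A self-contained sketch of the identity they compute (the helper name here is illustrative, not CNTK's):

#include <algorithm>
#include <cmath>

// log(exp(x) + exp(y)) computed without overflow/underflow by factoring out the larger term:
// log(exp(x) + exp(y)) = max(x, y) + log(1 + exp(-|x - y|))
double LogAddSketch(double x, double y)
{
    if (x < y)
        std::swap(x, y); // ensure x >= y, so the exponent below is <= 0
    return x + std::log1p(std::exp(y - x));
}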
/// the kernel function for RCRF backward computation
template <class ElemType>
void CPUMatrix<ElemType>::_rcrfBackwardCompute(size_t t, size_t k, const CPUMatrix<ElemType>& alpha,

View file

@ -231,6 +231,7 @@ public:
// sequence training
CPUMatrix<ElemType>& DropFrame(const CPUMatrix<ElemType>& label, const CPUMatrix<ElemType>& gamma, const ElemType& threshhold);
CPUMatrix<ElemType>& AssignSequenceError(const ElemType hsmoothingWeight, const CPUMatrix<ElemType>& label, const CPUMatrix<ElemType>& dnnoutput, const CPUMatrix<ElemType>& gamma, ElemType alpha);
CPUMatrix<ElemType>& AssignCTCScore(const CPUMatrix<ElemType>& prob, CPUMatrix<ElemType>& alpha, CPUMatrix<ElemType>& beta, const CPUMatrix<ElemType>& phoneSeq, const CPUMatrix<ElemType>& phoneBoundary, ElemType &totalScore, const vector<size_t>& uttMap, const vector<size_t> & uttBeginFrame, const vector<size_t> & uttFrameNum, const vector<size_t> & uttPhoneNum, const size_t samplesInRecurrentStep, const size_t maxFrameNum, const int delayConstraint, const bool isColWise);
CPUMatrix<ElemType>& InplaceSqrt();
CPUMatrix<ElemType>& AssignSqrtOf(const CPUMatrix<ElemType>& a);

View file

@ -41,8 +41,6 @@ typedef unsigned char byte;
#define GPUSPARSE_INDEX_TYPE int // cuSparse only supports int array indexes
#define CPUSPARSE_INDEX_TYPE int // to be consistent with cuSparse but limited the possible size of the matrix.
#define MEM_MAX_LIMIT_TIMES 2 // The maximum multiple of a requested size that a cached memory block may have and still be reused for that request
namespace Microsoft { namespace MSR { namespace CNTK {
MATH_API void SetMathLibTraceLevel(int traceLevel);
@ -214,158 +212,6 @@ enum MatrixFlags
matrixFlagSetValueOnDevice = 1 << bitPosSetValueOnDevice, // SetValue() call has a buffer that is already on the device
};
// -----------------------------------------------------------------------
// BufferManagement -- to control the allocation and release of memory
//
// 1. The goal of buffer management
// The best way to save memory is to release it as soon as it is no longer used in the rest of the mini-batch, but
// that adds extra memory-operation cost and slows things down. One option is to build static links between all nodes
// in a pre-computation pass and reuse memory at runtime, known in CNTK as shared node value matrices. The other
// option is a buffer pool that takes over allocation and release requests: unlike physical memory operations, the
// logical ones cost nearly nothing. Since the second option, implemented as BufferManagement below, controls all
// memory operations, including trivial ones such as the workspace in convolutions, and is more flexible (it
// allocates by size and makes new algorithms easy to implement), it is usually more powerful than the first method.
// 2. How it works
// BufferManagement is called from the Resize function, where RequestBuffer and LogicalReleaseBuffer replace the
// original request and release calls. BufferManagement is a singleton per deviceId; obtain it via
// GetManagerInstance. Resize also has a flag named growOnly, which only reallocates when the size increases, to
// save allocation cost. Since allocation from the buffer pool is nearly free, growth-only is disabled in
// BufferManagement mode.
// -----------------------------------------------------------------------
class BufferManagement
{
private:
BufferManagement() = default;
// Disable all copy & move functions to keep the singleton instances safe
DISABLE_COPY_AND_MOVE(BufferManagement);
public:
static BufferManagement& GetManagerInstance(DEVICEID_TYPE deviceId)
{
    static std::mutex instanceLock;
    auto instance = m_instances.find(deviceId);
    if (instance == m_instances.end())
    {
        std::lock_guard<std::mutex> lock(instanceLock);
        instance = m_instances.find(deviceId); // re-check under the lock; another thread may have inserted it
        if (instance == m_instances.end())
        {
            instance = m_instances.insert(std::make_pair(deviceId, std::unique_ptr<BufferManagement>(
                new BufferManagement()))).first;
            instance->second->m_deviceId = deviceId;
            instance->second->m_totalManageSize = 0;
            instance->second->m_totalAllocSize = 0;
        }
    }
    return *(instance->second);
}
// for a request, look in the buffer container first; if that fails, allocate a new buffer
// when served from the buffer, size is updated to the real buffer size
template<class ElemType>
ElemType* RequestBuffer(size_t& size)
{
ElemType* bufferPtr = nullptr;
auto& bufferContainer = BufferContainer<ElemType>();
// simple size-based lookup; a more efficient and complex algorithm could be implemented here
auto bufferHint = bufferContainer.lower_bound(size);
if (bufferHint != bufferContainer.end() && bufferHint->first < size * MEM_MAX_LIMIT_TIMES)
{
bufferPtr = bufferHint->second;
size = bufferHint->first;
m_totalManageSize -= size;
bufferContainer.erase(bufferHint);
return bufferPtr;
}
if (m_deviceId >= 0) {
#ifndef CPUONLY
auto deviceSize = TracingGPUMemoryAllocator::GetFreeAndTotalMemoryInMBs(m_deviceId);
float freeMemoryRatio = (float)deviceSize.first / deviceSize.second;
if (freeMemoryRatio < 0.05f || (deviceSize.first << 20) / sizeof(ElemType) < size)
{
PhysicalReleaseAllBuffer<ElemType>();
}
bufferPtr = TracingGPUMemoryAllocator::Allocate<ElemType>(m_deviceId, size);
m_totalAllocSize += size;
#endif
}
else
{
// first, try no-throw allocation.
// if failed, empty the buffer and re-try a throwing allocation
// if failed again, let system throw the bad_alloc exception
bufferPtr = new (std::nothrow) ElemType[size];
if (!bufferPtr)
{
PhysicalReleaseAllBuffer<ElemType>();
bufferPtr = new ElemType[size];
}
m_totalAllocSize += size;
}
return bufferPtr;
}
// insert the released buffer into the buffer container
template<class ElemType>
void LogicalReleaseBuffer(ElemType* buffer, size_t size)
{
auto& bufferContainer = BufferContainer<ElemType>();
bufferContainer.insert(std::make_pair(size, buffer));
m_totalManageSize += size;
}
// physically release the buffer
template<class ElemType>
void PhysicalReleaseBuffer(ElemType* buffer)
{
if (m_deviceId >= 0)
{
#ifndef CPUONLY
TracingGPUMemoryAllocator::Free<ElemType>(m_deviceId, buffer, false);
#endif
}
else {
delete[] buffer;
}
}
// empty all the cached buffers
template<class ElemType>
void PhysicalReleaseAllBuffer()
{
auto& bufferContainer = BufferContainer<ElemType>();
for (auto& iter : bufferContainer)
{
PhysicalReleaseBuffer<ElemType>(iter.second);
}
bufferContainer.clear();
m_totalManageSize = 0;
}
private:
static std::unordered_map<DEVICEID_TYPE, std::unique_ptr<BufferManagement>> m_instances;
template <class ElemType>
std::multimap<size_t, ElemType*>& BufferContainer();
DEVICEID_TYPE m_deviceId;
size_t m_totalManageSize;
size_t m_totalAllocSize;
// maps to store all the temp buffer handles
std::multimap<size_t, float*> m_bufferFloatContainer;
std::multimap<size_t, double*> m_bufferDoubleContainer;
std::multimap<size_t, char*> m_bufferCharContainer;
std::multimap<size_t, short*> m_bufferShortContainer;
std::multimap<size_t, int*> m_bufferIntContainer;
};
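
A minimal usage sketch of this (now removed) pool; hypothetical driver code, with deviceId -1 selecting the CPU path:

void BufferPoolExample()
{
    auto& pool = BufferManagement::GetManagerInstance(/*deviceId=*/-1);

    size_t size = 1024; // element count; RequestBuffer may round this up to a cached block's size
    float* p = pool.RequestBuffer<float>(size);
    pool.LogicalReleaseBuffer(p, size); // returns the block to the cache, no physical free

    size_t size2 = 1000; // within MEM_MAX_LIMIT_TIMES of the cached block, so the cached block is reused
    float* q = pool.RequestBuffer<float>(size2); // size2 is updated to the real buffer size (1024)
    pool.LogicalReleaseBuffer(q, size2);

    pool.PhysicalReleaseAllBuffer<float>(); // actually free everything still cached
}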
// -----------------------------------------------------------------------
// BaseMatrixStorage -- base class for all matrix types (CPU, GPU) x (dense, sparse)
// -----------------------------------------------------------------------

View file

@ -260,8 +260,6 @@ protected:
}
// Only supported when MatrixPool is enabled
// NOTE: it's unnecessary to keep the workspace.
workspace.Resize(0, 0);
CUDNN_CALL(err);
}
@ -304,7 +302,6 @@ protected:
if (CUDNN_STATUS_SUCCESS == err2)
err = CUDNN_STATUS_SUCCESS;
}
workspace.Resize(0, 0);
CUDNN_CALL(err);
}
@ -347,7 +344,6 @@ protected:
if (CUDNN_STATUS_SUCCESS == err2)
err = CUDNN_STATUS_SUCCESS;
}
workspace.Resize(0, 0);
CUDNN_CALL(err);
}

View file

@ -1531,42 +1531,35 @@ void GPUMatrix<ElemType>::Reshape(const size_t numRows, const size_t numCols)
}
template <class ElemType>
void GPUMatrix<ElemType>::RequireSize(const size_t numRows, const size_t numCols, bool growOnly, bool cachedResize)
void GPUMatrix<ElemType>::RequireSize(const size_t numRows, const size_t numCols, bool growOnly)
{
if (GetNumRows() != numRows || GetNumCols() != numCols)
Resize(numRows, numCols, growOnly, cachedResize);
Resize(numRows, numCols, growOnly);
}
template <class ElemType>
void GPUMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, bool growOnly, bool cachedResize)
void GPUMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, bool growOnly)
{
if (GetNumRows() == numRows && GetNumCols() == numCols)
return;
VerifyResizable(__FUNCTION__);
bool isForceResize = (!growOnly) || cachedResize;
size_t numElements = numRows * numCols;
if (numElements > GetSizeAllocated() || // grow allocation
(isForceResize && numElements != GetSizeAllocated())) // shrink allocation if not growOnly
(!growOnly && numElements != GetSizeAllocated())) // shrink allocation if not growOnly
{
// If the buffer exists, free it before allocate
if (Buffer())
{
if (cachedResize)
BufferManagement::GetManagerInstance(GetComputeDeviceId()).LogicalReleaseBuffer<ElemType>(Buffer(), GetSizeAllocated());
else
TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), Buffer());
TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), Buffer());
}
// reallocate buffer if numElements > 0
ElemType* pArray = nullptr;
if (numElements > 0)
{
if (cachedResize)
pArray = BufferManagement::GetManagerInstance(GetComputeDeviceId()).RequestBuffer<ElemType>(numElements);
else
pArray = TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), numRows, numCols);
pArray = TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), numRows, numCols);
}
SetBuffer(pArray, numElements * sizeof(ElemType));
@ -2374,7 +2367,9 @@ ElemType GPUMatrix<ElemType>::AbsoluteMax() const
int resInd = 0;
cublasIdamax(cuHandle, (CUDA_LONG)GetNumElements(), reinterpret_cast<double*>(Data()), 1, &resInd);
resInd--;
CUDA_CALL(cudaMemcpy(reinterpret_cast<double*>(&res), Data() + resInd, sizeof(double), cudaMemcpyDeviceToHost));
return res;
}
}
@ -2951,7 +2946,30 @@ void GPUMatrix<ElemType>::Print(const char* /*matrixName*/, size_t /*rowStart*/,
template <class ElemType>
void GPUMatrix<ElemType>::Print(const char* matrixName /*=nullptr*/) const
{
Print(matrixName, 0, GetNumRows() - 1, 0, GetNumCols() - 1);
size_t elemCount = GetNumRows() * GetNumCols();
vector<ElemType> localCopy(elemCount);
cudaMemcpy(localCopy.data(), Data(), elemCount * sizeof(ElemType), cudaMemcpyDeviceToHost);
fprintf(stderr, "\n###### ");
if (matrixName != nullptr)
fprintf(stderr, "%s ", matrixName);
fprintf(stderr, "(%lu, %lu) ######\n\n", (unsigned long)GetNumRows(), (unsigned long)GetNumCols());
if (IsEmpty())
{
fprintf(stderr, "(empty)\n");
return;
}
// CNTK is using column-major storage
for (size_t i = 0; i < GetNumRows(); i++)
{
for (size_t j = 0; j < GetNumCols(); j++)
{
fprintf(stderr, "%.10f\t", localCopy[i + j * GetNumRows()]);
}
fprintf(stderr, "\n");
}
}
//helpfer function used for convolution neural network
@ -4253,6 +4271,117 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::GetARowByIndex(const GPUMatrix<ElemTyp
return *this;
}
// Calculate CTC score
// prob (input): the posterior output from the network
// alpha, beta (output): alpha and beta for forward-backward calculation.
// phoneSeq (input): phone ID sequence for each utterance in this minibatch, each col is one utterance
// phoneBoundary (input): phone boundary (frame index) of each phone for each utterance in this minibatch, each col is one utterance
// totalScore (output): total CTC score
// uttToChanInd (input): map from utterance ID to minibatch channel ID. We need this because each channel may contain more than one utterance.
// uttBeginFrame (input): the position of the first frame of each utterance in the minibatch channel. We need this because each channel may contain more than one utterance.
// uttFrameNum (input): the frame number of each utterance. The size of this vector = the number of all utterances in this minibatch
// uttPhoneNum (input): the phone number of each utterance. The size of this vector = the number of all utterances in this minibatch
// numParallelSequences (input): channel number in this minibatch
// maxFrameNum (input): the maximum channel frame number
// delayConstraint -- label output delay constraint introduced during training that allows shorter delay during inference.
//                    Alpha and Beta scores outside of the delay boundary are set to zero.
//                    Setting this parameter smaller will result in shorter delay between label outputs during decoding, but may hurt accuracy.
// delayConstraint=-1 means no constraint
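// For illustration (hypothetical numbers): with numParallelSequences = 2 channels packed as channel 0 = [A | C]
// and channel 1 = [B], where A, B, C have 50, 30 and 40 frames respectively, the metadata would be
// uttToChanInd = {0, 1, 0}, uttBeginFrame = {0, 0, 50}, uttFrameNum = {50, 30, 40}, and maxFrameNum = 90
// (the frame count of the longest channel).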
template<class ElemType>
GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignCTCScore(const GPUMatrix<ElemType>& prob,
GPUMatrix<ElemType>& alpha,
GPUMatrix<ElemType>& beta,
const GPUMatrix<ElemType> phoneSeq,
const GPUMatrix<ElemType> phoneBoundary,
ElemType &totalScore,
const std::vector<size_t>& uttToChanInd,
const std::vector<size_t> & uttBeginFrame,
const std::vector<size_t> & uttFrameNum,
const std::vector<size_t> & uttPhoneNum,
const size_t numParallelSequences,
const size_t maxFrameNum, const int delayConstraint, const bool isColWise)
{
if (isColWise)
{
PrepareDevice();
// Total number of phones
long totalPhoneNum = prob.GetNumRows();
size_t uttNum = uttFrameNum.size();
// Max number of phones in utterances in this minibatch
size_t maxPhoneNum = phoneSeq.GetNumRows();
size_t *gpuFrameNum;
CUDA_CALL(cudaMalloc((void **)&gpuFrameNum, uttNum * sizeof(size_t)));
CUDA_CALL(cudaMemcpy(gpuFrameNum, uttFrameNum.data(), uttNum * sizeof(size_t), cudaMemcpyHostToDevice));
size_t *gpuPhoneNum;
CUDA_CALL(cudaMalloc((void **)&gpuPhoneNum, uttNum * sizeof(size_t)));
CUDA_CALL(cudaMemcpy(gpuPhoneNum, uttPhoneNum.data(), uttNum * sizeof(size_t), cudaMemcpyHostToDevice));
size_t *gpuBeginFrame;
CUDA_CALL(cudaMalloc((void **)&gpuBeginFrame, uttNum * sizeof(size_t)));
CUDA_CALL(cudaMemcpy(gpuBeginFrame, uttBeginFrame.data(), uttNum * sizeof(size_t), cudaMemcpyHostToDevice));
size_t *gpuUttToChanInd;
CUDA_CALL(cudaMalloc((void **)&gpuUttToChanInd, uttNum * sizeof(size_t)));
CUDA_CALL(cudaMemcpy(gpuUttToChanInd, uttToChanInd.data(), uttNum * sizeof(size_t), cudaMemcpyHostToDevice));
ElemType *gpuScores;
CUDA_CALL(cudaMalloc((void **)&gpuScores, uttNum * sizeof(ElemType)));
cudaEvent_t done = nullptr;
CUDA_CALL(cudaEventCreate(&done));
dim3 thread_tail(DEFAULT_THREAD_PER_DIM, DEFAULT_THREAD_PER_DIM);
// x dimension is for utterances
// y dimension is for the phone sequence in each utterance
// Ensure that we allocate correct number of blocks for given number of utterances and max number of phones in those utterances
dim3 block_tail((uttNum + DEFAULT_THREAD_PER_DIM - 1) / DEFAULT_THREAD_PER_DIM, (maxPhoneNum + DEFAULT_THREAD_PER_DIM - 1) / DEFAULT_THREAD_PER_DIM);
for (long t = 0; t < maxFrameNum; t++)
{
    _assignAlphaScore<<<block_tail, thread_tail, 0, t_stream>>>(prob.Data(), alpha.Data(), phoneSeq.Data(), phoneBoundary.Data(), gpuUttToChanInd,
        gpuFrameNum, gpuBeginFrame, gpuPhoneNum, numParallelSequences, uttNum, t, maxPhoneNum, totalPhoneNum, delayConstraint);
}
for (long t = maxFrameNum - 1; t >= 0; t--)
{
    _assignBetaScore<<<block_tail, thread_tail, 0, t_stream>>>(prob.Data(), beta.Data(), phoneSeq.Data(), phoneBoundary.Data(), gpuUttToChanInd,
        gpuFrameNum, gpuBeginFrame, gpuPhoneNum, numParallelSequences, uttNum, t, maxPhoneNum, totalPhoneNum, delayConstraint);
}
_assignTotalScore<<<uttNum, 1, 0, t_stream>>>(beta.Data(), gpuScores, uttNum, gpuUttToChanInd, gpuBeginFrame, numParallelSequences, maxPhoneNum);
dim3 block_tail_2((uttNum + DEFAULT_THREAD_PER_DIM - 1) / DEFAULT_THREAD_PER_DIM, (maxFrameNum + DEFAULT_THREAD_PER_DIM - 1) / DEFAULT_THREAD_PER_DIM);
_assignCTCScore<<<block_tail_2, thread_tail, 0, t_stream>>>(Data(), prob.Data(), alpha.Data(), beta.Data(), phoneSeq.Data(), uttNum, gpuUttToChanInd,
    gpuBeginFrame, gpuPhoneNum, gpuFrameNum, numParallelSequences, maxPhoneNum, totalPhoneNum);
vector<ElemType> scores(uttNum);
CUDA_CALL(cudaMemcpyAsync(scores.data(), gpuScores, sizeof(ElemType) * uttNum, cudaMemcpyDeviceToHost, t_stream));
for (size_t utt = 0; utt < uttFrameNum.size(); utt++)
{
totalScore += scores[utt];
}
CUDA_CALL(cudaFree(gpuFrameNum));
CUDA_CALL(cudaFree(gpuPhoneNum));
CUDA_CALL(cudaFree(gpuBeginFrame));
CUDA_CALL(cudaFree(gpuUttToChanInd));
CUDA_CALL(cudaFree(gpuScores));
CUDA_CALL(cudaEventRecord(done));
CUDA_CALL(cudaEventSynchronize(done));
CUDA_CALL(cudaEventDestroy(done));
}
else
{
NOT_IMPLEMENTED;
}
return *this;
}
template <class ElemType>
void GPUMatrix<ElemType>::ConductRowElementMultiplyWithShift(const GPUMatrix<ElemType>& a, const GPUMatrix<ElemType>& b, GPUMatrix<ElemType>& c, const size_t shift, const bool isafixed)
{
@ -4613,8 +4742,8 @@ template GPUMatrix<char>::GPUMatrix(const GPUMatrix<char>&);
template GPUMatrix<char>::GPUMatrix(GPUMatrix<char>&&);
template char* GPUMatrix<char>::CopyToArray() const;
template void GPUMatrix<char>::ChangeDeviceTo(int);
template void GPUMatrix<char>::Resize(size_t, size_t, bool, bool);
template void GPUMatrix<char>::RequireSize(size_t, size_t, bool, bool);
template void GPUMatrix<char>::Resize(size_t, size_t, bool);
template void GPUMatrix<char>::RequireSize(size_t, size_t, bool);
template GPUMatrix<char>::~GPUMatrix();
template GPUMatrix<char> GPUMatrix<char>::ColumnSlice(size_t startColumn, size_t numCols) const;
@ -4638,8 +4767,8 @@ template GPUMatrix<short>::GPUMatrix(const GPUMatrix<short>&);
template GPUMatrix<short>::GPUMatrix(GPUMatrix<short>&&);
template short* GPUMatrix<short>::CopyToArray() const;
template void GPUMatrix<short>::ChangeDeviceTo(int);
template void GPUMatrix<short>::Resize(size_t, size_t, bool, bool);
template void GPUMatrix<short>::RequireSize(size_t, size_t, bool, bool);
template void GPUMatrix<short>::Resize(size_t, size_t, bool);
template void GPUMatrix<short>::RequireSize(size_t, size_t, bool);
template GPUMatrix<short>::~GPUMatrix();
template GPUMatrix<short> GPUMatrix<short>::ColumnSlice(size_t startColumn, size_t numCols) const;

View file

@ -244,12 +244,12 @@ public:
// RequireSize is now the new preferred method of ensuring the correct size inside of the Matrix class. Since Resize will fail if the storage object has
// multiple views, RequireSize will first check to see if Resize is required. If it is not, then it short-circuits and is a noop. Otherwise, RequireSize
// will call Resize, which may fail if the matrix has multiple views.
void RequireSize(const size_t numRows, const size_t numCols, bool growOnly = true, bool cachedResize = false); // by default we only reallocate if need to grow
void RequireSize(const GPUMatrix<ElemType>& like, bool growOnly = true, bool cachedResize = false) { RequireSize(like.GetNumRows(), like.GetNumCols(), growOnly, cachedResize); }
void RequireSize(const size_t numRows, const size_t numCols, bool growOnly = true); // by default we only reallocate if need to grow
void RequireSize(const GPUMatrix<ElemType>& like, bool growOnly = true) { RequireSize(like.GetNumRows(), like.GetNumCols(), growOnly); }
// Resize first checks to ensure that the caller has the authority to call Resize (i.e., it checks to ensure the underlying data is owned by only this matrix), and then
// actually resizes the underlying matrix, doing any allocation as required.
void Resize(const size_t numRows, const size_t numCols, bool growOnly = true, bool cachedResize = false); // by default we only reallocate if need to grow
void Resize(const size_t numRows, const size_t numCols, bool growOnly = true); // by default we only reallocate if need to grow
ElemType& operator()(const size_t /*row*/, const size_t /*col*/) { LogicError("GPUMatrix doesn't support operator(,) on the CPU."); }
const ElemType& operator()(const size_t /*row*/, const size_t /*col*/) const { LogicError("GPUMatrix doesn't support operator(,) on the CPU."); }
@ -349,6 +349,10 @@ public:
GPUMatrix<ElemType>& DropFrame(const GPUMatrix<ElemType>& label, const GPUMatrix<ElemType>& gamma, const ElemType& threshhold);
GPUMatrix<ElemType>& AssignSequenceError(const ElemType hsmoothingWeight, const GPUMatrix<ElemType>& label, const GPUMatrix<ElemType>& dnnoutput, const GPUMatrix<ElemType>& gamma, ElemType alpha);
GPUMatrix<ElemType>& AssignCTCScore(const GPUMatrix<ElemType>& prob, GPUMatrix<ElemType>& alpha, GPUMatrix<ElemType>& beta,
const GPUMatrix<ElemType> phoneSeq, const GPUMatrix<ElemType> phoneBoundary, ElemType &totalScore, const vector<size_t>& uttMap, const vector<size_t> & uttBeginFrame, const vector<size_t> & uttFrameNum,
const vector<size_t> & uttPhoneNum, const size_t samplesInRecurrentStep, const size_t maxFrameNum, const int delayConstraint, const bool isColWise);
GPUMatrix<ElemType>& InplaceSqrt();
GPUMatrix<ElemType>& AssignSqrtOf(const GPUMatrix<ElemType>& a);

View file

@ -5192,6 +5192,292 @@ __global__ void _adam4BlockSparseCol(CUDA_LONG size,
val[idx] -= g;
}
}
// Calculate alpha in forward-backward calculation. equation (6), (7) in http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf
// GPU x dimension corresponds to utterances, y dimension corresponds to phone sequence in each utterance
// prob (input): the posterior output from the network
// alpha (output): alpha for forward-backward calculation.
// phoneSeq (input): phone ID sequence for each utterance in this minibatch, each col is one utterance
// phoneBound (input): phone boundary (frame index) of each phone for each utterance in this minibatch, each col is one utterance
// uttToChanInd (input): map from utterance ID to minibatch channel ID. We need this because each channel may contain more than one utterance.
// uttFrameNum (input): the frame number of each utterance. The size of this vector = the number of all utterances in this minibatch
// uttBeginFrame (input): the position of the first frame of each utterance in the minibatch channel. We need this because each channel may contain more than one utterance.
// uttPhoneNum (input): the phone number of each utterance. The size of this vector = the number of all utterances in this minibatch
// numChannels (input): channel number in this minibatch
// uttNum (input): number of utterances
// t (input): time stamp to process
// maxPhoneNum (input): the max number of phones between utterances
// totalPhoneNum (input): the total number of phones of all utterances
// delayConstraint -- label output delay constraint introduced during training that allows shorter delay during inference.
//                    Alpha and Beta scores outside of the delay boundary are set to zero.
//                    Setting this parameter smaller will result in shorter delay between label outputs during decoding.
// delayConstraint=-1 means no constraint
template<class ElemType>
__global__ void _assignAlphaScore(
const ElemType *prob,
ElemType *alphaScore,
ElemType *phoneSeq,
ElemType *phoneBound,
const size_t *uttToChanInd,
const size_t *uttFrameNum,
const size_t *uttBeginFrame,
const size_t *uttPhoneNum,
size_t numChannels,
const size_t uttNum,
const size_t t,
const size_t maxPhoneNum, // Maximum length of utterance in this MB
const size_t totalPhoneNum, // Total number of phones
const int delayConstraint)
{
LONG64 uttId = blockDim.x * blockIdx.x + threadIdx.x;
// Index of the label in the sequence
LONG64 phoneSeqId = blockDim.y * blockIdx.y + threadIdx.y;
// Number of phones and frames in this utterance
LONG64 phoneNum = uttPhoneNum[uttId];
LONG64 frameNum = uttFrameNum[uttId];
if (uttId >= uttNum || phoneSeqId >= phoneNum - 1 || t >= frameNum || phoneSeqId == 0) return;
// Current and previous phone indices in phoneSeq matrix
LONG64 labelid = uttId*maxPhoneNum + phoneSeqId;
LONG64 labelid_2 = labelid - 2;
// Actual current phone label
LONG64 phoneId = (LONG64)(phoneSeq[labelid]);
// Index of the current frame in minibatch
LONG64 timeId = (t + uttBeginFrame[uttId])*numChannels + uttToChanInd[uttId];
// Index of probability of observing phoneId at frame timeId
LONG64 probId = timeId*totalPhoneNum + phoneId;
LONG64 alphaId = maxPhoneNum* timeId + phoneSeqId; // alpha_t(s)
// Previous time frame
LONG64 timeId_1 = timeId - numChannels; // Index corresponding to (t-1)
LONG64 alphaId_0 = maxPhoneNum* timeId_1 + phoneSeqId; // alpha_{t-1}(s)
LONG64 alphaId_1 = alphaId_0 - 1; // alpha_{t-1}(s-1)
LONG64 alphaId_2 = alphaId_0 - 2; // alpha_{t-1}(s-2)
if (t == 0)
{
// Initialize recursion
if (phoneSeqId == 1 || phoneSeqId == 2)
{
alphaScore[alphaId] = prob[probId];
}
}
else
{
if (phoneSeqId >= 1)
{
ElemType x = LZERO;
ElemType ascore;
if (phoneSeqId > 2)
{
// if current label is not blank and not equal prev non-blank label
if ((LONG64)(phoneSeq[labelid]) != totalPhoneNum - 1 && phoneId != (LONG64)(phoneSeq[labelid_2]))
{
x = logaddk(x, alphaScore[alphaId_2]);
}
}
if (phoneSeqId > 1)
{
x = logaddk(x, alphaScore[alphaId_1]);
}
x = logaddk(x, alphaScore[alphaId_0]);
if (phoneId != SIZE_MAX)
ascore = prob[probId]; // Probability of observing given label at given time
else
ascore = 0;
alphaScore[alphaId] = (ElemType)x + ascore;
if (delayConstraint != -1)
{
LONG64 labelid_r = labelid + 2;
LONG64 phoneBoundId_r = (LONG64)(phoneBound[labelid_r]);
if (phoneId == totalPhoneNum - 1)
{
// only constraint right side
if (t > phoneBoundId_r + delayConstraint - 1)
alphaScore[alphaId] = LZERO;
}
else if (phoneId != totalPhoneNum - 1)
{
if (t > phoneBoundId_r + delayConstraint)
alphaScore[alphaId] = LZERO;
}
}
}
}
}
// Calculate beta in forward-backward calculation, equation (10), (11) in http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf
// See _assignAlphaScore for the explanation of parameters
template<class ElemType>
__global__ void _assignBetaScore(
const ElemType *prob,
ElemType *betaScore,
ElemType *phoneSeq,
ElemType *phoneBound,
const size_t *uttToChanInd,
const size_t *uttFrameNum,
const size_t *uttBeginFrame,
const size_t *uttPhoneNum,
const size_t numChannels,
const size_t uttNum,
const size_t t,
const size_t maxPhoneNum,
const size_t totalPhoneNum,
const int delayConstraint)
{
LONG64 uttId = blockDim.x * blockIdx.x + threadIdx.x;
// Index of the label in the sequence
LONG64 phoneSeqId = blockDim.y * blockIdx.y + threadIdx.y;
LONG64 phoneNum = uttPhoneNum[uttId];
LONG64 frameNum = uttFrameNum[uttId];
if (uttId >= uttNum || phoneSeqId >= phoneNum - 1 || t >= frameNum || phoneSeqId == 0) return;
LONG64 labelid = uttId*maxPhoneNum + phoneSeqId;
LONG64 labelid_2 = labelid + 2;
LONG64 phoneId = (LONG64)(phoneSeq[labelid]);
LONG64 timeId = (t + uttBeginFrame[uttId])*numChannels + uttToChanInd[uttId];
LONG64 probId = timeId*totalPhoneNum + phoneId;
LONG64 betaid = maxPhoneNum* timeId + phoneSeqId;
LONG64 timeId_1 = timeId + numChannels;
LONG64 betaid_0 = maxPhoneNum* timeId_1 + phoneSeqId;
LONG64 betaid_1 = betaid_0 + 1;
LONG64 betaid_2 = betaid_0 + 2;
if (t == frameNum - 1)
{
if (phoneSeqId == phoneNum - 3 || phoneSeqId == phoneNum - 2)
{
betaScore[betaid] = prob[probId];
}
}
else
{
if (phoneSeqId >= 1)
{
ElemType x = LZERO;
ElemType ascore;
if (phoneSeqId < phoneNum - 3)
{
if (phoneSeq[labelid] != totalPhoneNum - 1 && phoneId != phoneSeq[labelid_2])
{
x = logaddk(x, betaScore[betaid_2]);
}
}
if (phoneSeqId < phoneNum - 2)
{
x = logaddk(x, betaScore[betaid_1]);
}
x = logaddk(x, betaScore[betaid_0]);
if (phoneId != SIZE_MAX)
ascore = prob[probId];
else
ascore = 0;
betaScore[betaid] = (ElemType)x + ascore;
if (delayConstraint != -1)
{
LONG64 phoneBoundId_r = (LONG64)(phoneBound[labelid_2]);
if (phoneId == totalPhoneNum - 1)
{
if (t > phoneBoundId_r + delayConstraint - 1)
betaScore[betaid] = LZERO;
}
else if (phoneId != totalPhoneNum - 1)
{
if (t > phoneBoundId_r + delayConstraint)
betaScore[betaid] = LZERO;
}
}
}
}
}
// Calculate derivative, equation (15) in http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf
// See _assignAlphaScore for the explanation of parameters
template<class ElemType>
__global__ void _assignCTCScore(
ElemType *CTCscore,
ElemType *prob,
ElemType *alphaScore,
ElemType *betaScore,
ElemType *phoneSeq,
const size_t uttNum,
const size_t *uttToChanInd,
const size_t *uttBeginFrame,
const size_t *uttPhoneNum,
const size_t *uttFrameNum,
const long numChannels,
const long maxPhoneNum,
const long totalPhoneNum)
{
LONG64 uttId = blockDim.x * blockIdx.x + threadIdx.x;
LONG64 t = blockDim.y * blockIdx.y + threadIdx.y;
if (uttId < uttNum && t < uttFrameNum[uttId])
{
LONG64 phoneNum = uttPhoneNum[uttId];
LONG64 alphaId_0 = (uttBeginFrame[uttId] * numChannels + uttToChanInd[uttId]) * maxPhoneNum;
LONG64 timeId = (t + uttBeginFrame[uttId])*numChannels + uttToChanInd[uttId];
ElemType P_lx = betaScore[alphaId_0];
for (int s = 1; s < phoneNum - 1; s++)
{
long phoneId = phoneSeq[uttId*maxPhoneNum + s];
LONG64 alphaId = maxPhoneNum* timeId + s;
LONG64 probId = timeId*totalPhoneNum + phoneId;
if (phoneId != SIZE_MAX)
{
ElemType logoccu = alphaScore[alphaId] + betaScore[alphaId] - prob[probId] - (ElemType)P_lx;
CTCscore[probId] = logaddk(CTCscore[probId], logoccu);
}
}
for (int s = 0; s < totalPhoneNum; s++)
{
LONG64 probId = timeId*totalPhoneNum + s;
ElemType logoccu = CTCscore[probId];
if (logoccu < LZERO)
CTCscore[probId] = 0.0f;
else
CTCscore[probId] = exp(logoccu);
}
}
}
// Calculate CTC score. equation (8) in http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf
template<class ElemType>
__global__ void _assignTotalScore(ElemType *betaScore,
ElemType *totalScore,
const size_t uttNum,
const size_t *uttToChanInd,
const size_t *uttBeginFrame,
const size_t numChannels,
const size_t maxPhoneNum)
{
LONG64 uttId = blockIdx.x;
if (uttId < uttNum)
{
LONG64 alphaId_0 = (uttBeginFrame[uttId] * numChannels + uttToChanInd[uttId]) * maxPhoneNum;
betaScore[alphaId_0] = logaddk(betaScore[alphaId_0 + 1], betaScore[alphaId_0 + 2]);
totalScore[uttId] = betaScore[alphaId_0];
}
}
}
}
}
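
To make the indexing in these kernels concrete: alpha and beta are laid out as (maxPhoneNum x T*numChannels) column-major buffers, so alphaId = maxPhoneNum * timeId + phoneSeqId addresses alpha_t(s) for minibatch column timeId = (t + uttBeginFrame[uttId]) * numChannels + uttToChanInd[uttId]. For example, with numChannels = 2, frame t of an utterance in channel 1 starting at frame 0 lives in column 2*t + 1.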

View file

@ -158,23 +158,6 @@ int GetMathLibTraceLevel()
MatrixBase::~MatrixBase() { }
#pragma region BufferManagement
std::unordered_map<DEVICEID_TYPE, std::unique_ptr<BufferManagement>> BufferManagement::m_instances;
template <>
std::multimap<size_t, float*>& BufferManagement::BufferContainer<float>() { return m_bufferFloatContainer; }
template <>
std::multimap<size_t, double*>& BufferManagement::BufferContainer<double>() { return m_bufferDoubleContainer; }
template <>
std::multimap<size_t, char*>& BufferManagement::BufferContainer<char>() { return m_bufferCharContainer; }
template <>
std::multimap<size_t, short*>& BufferManagement::BufferContainer<short>() { return m_bufferShortContainer; }
template <>
std::multimap<size_t, int*>& BufferManagement::BufferContainer<int>() { return m_bufferIntContainer; }
#pragma endregion
#pragma region Constructors, destructors and other static matrix builders
@ -184,10 +167,6 @@ std::multimap<size_t, int*>& BufferManagement::BufferContainer<int>() { return m
// { GPU code },
// ...
// By default, the CachedMatrixBuffer is disabled
template <class ElemType>
bool Matrix<ElemType>::m_useCachedResize = false;
// Initialize members
template <class ElemType>
void Matrix<ElemType>::Init(DEVICEID_TYPE deviceId)
@ -301,9 +280,6 @@ void Matrix<ElemType>::SetDataLocation(CurrentDataLocation location, MatrixType
LogicError("SetDataLocation: New m_baseMatrix must not be NULL.");
}
template <class ElemType>
void Matrix<ElemType>::UseCachedResizeOrNot(bool useCachedResize) { m_useCachedResize = useCachedResize; }
//this is a private constructor only used internally to initialize a blank matrix
template <class ElemType>
Matrix<ElemType>::Matrix(const MatrixFlags matrixFlags, const MatrixType matrixType, const MatrixFormat matrixFormat, DEVICEID_TYPE deviceID)
@ -1829,7 +1805,7 @@ void Matrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const
// TODO: should this function test whether the size is changing, and skip if it isn't? We have at least one explicit test for this code calling this (recurrent node)
DISPATCH_MATRIX_ON_FLAG_USEBOTH_4BOTH(this,
{ m_CPUMatrix->Resize(numRows, numCols, growOnly); },
{ m_GPUMatrix->Resize(numRows, numCols, growOnly, m_useCachedResize); },
{ m_GPUMatrix->Resize(numRows, numCols, growOnly); },
{ m_CPUSparseMatrix->RequireSizeAndAllocate(numRows, numCols, numNZElemToReserve, growOnly, false); },
{ m_GPUSparseMatrix->RequireSizeAndAllocate(numRows, numCols, numNZElemToReserve, growOnly, false); });
#ifdef _DEBUG
@ -5736,6 +5712,51 @@ Matrix<ElemType>& Matrix<ElemType>::AssignSequenceError(const ElemType hsmoothin
NOT_IMPLEMENTED);
return *this;
}
// Calculate CTC score
// prob (input): the posterior output from the network
// alpha, beta (output): alpha and beta for forward-backward calculation.
// phoneSeq (input): phone ID sequence for each utterance in this minibatch, each col is one utterance
// phoneBound (input): phone boundary (frame index) of each phone for each utterance in this minibatch, each col is one utterance
// totalScore (output): total CTC score
// uttToChanInd (input): map from utterance ID to minibatch channel ID. We need this because each channel may contain more than one utterance.
// uttBeginFrame (input): the position of the first frame of each utterance in the minibatch channel. We need this because each channel may contain more than one utterance.
// uttFrameNum (input): the frame number of each utterance. The size of this vector = the number of all utterances in this minibatch
// uttPhoneNum (input): the phone number of each utterance. The size of this vector = the number of all utterances in this minibatch
// numParallelSequences (input): num of parallel sequences
// mbsize (input): the maximum channel frame number
// delayConstraint -- label output delay constraint introduced during training that allows shorter delay during inference. It uses the original time information to enforce that CTC tokens only get aligned within a time margin.
//                    Setting this parameter smaller will result in shorter delay between label outputs during decoding, but may hurt accuracy.
// delayConstraint=-1 means no constraint
template<class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignCTCScore(const Matrix<ElemType>& prob, Matrix<ElemType>& alpha, Matrix<ElemType>& beta,
const Matrix<ElemType>& phoneSeq, const Matrix<ElemType>& phoneBound, ElemType &totalScore, const std::vector<size_t> & uttToChanInd,
const std::vector<size_t> & uttBeginFrame, const std::vector<size_t> & uttFrameNum, const std::vector<size_t> & uttPhoneNum,
const size_t numParallelSequences, const size_t mbsize, const int delayConstraint, const bool isColWise)
{
DecideAndMoveToRightDevice(prob, *this);
alpha.Resize(phoneSeq.GetNumRows(), prob.GetNumCols());
beta.Resize(phoneSeq.GetNumRows(), prob.GetNumCols());
Resize(prob.GetNumRows(), prob.GetNumCols());
alpha.SetValue(LZERO);
beta.SetValue(LZERO);
SetValue(LZERO);
SwitchToMatrixType(prob.GetMatrixType(), prob.GetFormat(), false);
DISPATCH_MATRIX_ON_FLAG(&prob,
this,
this->m_CPUMatrix->AssignCTCScore(*prob.m_CPUMatrix, *alpha.m_CPUMatrix, *beta.m_CPUMatrix, *phoneSeq.m_CPUMatrix, *phoneBound.m_CPUMatrix, totalScore,
uttToChanInd, uttBeginFrame, uttFrameNum, uttPhoneNum, numParallelSequences, mbsize, delayConstraint, isColWise),
this->m_GPUMatrix->AssignCTCScore(*prob.m_GPUMatrix, *alpha.m_GPUMatrix, *beta.m_GPUMatrix, *phoneSeq.m_GPUMatrix, *phoneBound.m_GPUMatrix, totalScore,
uttToChanInd, uttBeginFrame, uttFrameNum, uttPhoneNum, numParallelSequences, mbsize, delayConstraint, isColWise),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED
);
return *this;
}
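
A hypothetical call site (illustrative only, not from this commit), showing the shape and metadata contract the dispatch above expects; alpha and beta are resized internally to (maxPhoneNum x T*channels):

template <class ElemType>
ElemType ComputeCTC(Matrix<ElemType>& ctcPosterior,        // output: per-frame CTC posteriors
                    const Matrix<ElemType>& logProb,       // log posteriors from the network
                    const Matrix<ElemType>& phoneSeq,      // blank-augmented labels, one column per utterance
                    const Matrix<ElemType>& phoneBoundary, // per-phone start frames, one column per utterance
                    const std::vector<size_t>& uttToChanInd,
                    const std::vector<size_t>& uttBeginFrame,
                    const std::vector<size_t>& uttFrameNum,
                    const std::vector<size_t>& uttPhoneNum,
                    size_t numParallelSequences, size_t mbSize)
{
    Matrix<ElemType> alpha(logProb.GetDeviceId()), beta(logProb.GetDeviceId());
    ElemType totalScore = 0;
    ctcPosterior.AssignCTCScore(logProb, alpha, beta, phoneSeq, phoneBoundary, totalScore,
                                uttToChanInd, uttBeginFrame, uttFrameNum, uttPhoneNum,
                                numParallelSequences, mbSize, /*delayConstraint=*/-1, /*isColWise=*/true);
    return totalScore;
}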
#pragma endregion Static BLAS Functions
// TensorView currently does not interface with sparse matrices. For now, we just catch this and throw.

View file

@ -87,9 +87,6 @@ private:
mutable size_t m_numTimesMatrixTypeChanged;
mutable int m_devicesTransferedTo[2]; // TODO: what is this for? Seems only diagnostics
// whether to use cached memory Resize() or not
static bool m_useCachedResize;
// Moves matrix from device id_from to device with id_to. This method doesn't change preferred device Id
void _transferFromDeviceToDevice(int id_from, int id_to, bool isBeingMoved = true, bool emptyTransfer = false) const;
// Moves matrix from current device to device with id_to. This method doesn't change preferred device Id
@ -143,8 +140,6 @@ public:
SetDataLocation(GetDeviceId() < 0 ? CurrentDataLocation::CPU : CurrentDataLocation::GPU, GetMatrixType());
}
static void UseCachedResizeOrNot(bool useCachedResize);
private:
Matrix(const MatrixFlags matrixFlags, const MatrixType matrixType, const MatrixFormat matrixFormat, DEVICEID_TYPE deviceID); // only used internally to initialize a blank matrix
Matrix(const MatrixFlags matrixFlags, const MatrixType matrixType, DEVICEID_TYPE deviceID); // only used internally to initialize a blank matrix
@ -382,6 +377,11 @@ public:
// sequence training
Matrix<ElemType>& DropFrame(const Matrix<ElemType>& label, const Matrix<ElemType>& gamma, const ElemType& threshhold);
Matrix<ElemType>& AssignSequenceError(const ElemType hsmoothingWeight, const Matrix<ElemType>& label, const Matrix<ElemType>& dnnoutput, const Matrix<ElemType>& gamma, ElemType alpha);
Matrix<ElemType>& AssignCTCScore(const Matrix<ElemType>& prob, Matrix<ElemType>& alpha, Matrix<ElemType>& beta, const Matrix<ElemType>& phoneSeq, const Matrix<ElemType>& phoneBound, ElemType &totalScore,
const vector<size_t> & extraUttMap, const vector<size_t> & uttBeginFrame, const vector<size_t> & uttFrameNum, const vector<size_t> & uttPhoneNum, const size_t samplesInRecurrentStep,
const size_t mbSize, const int delayConstraint, const bool isColWise);
Matrix<ElemType>& InplaceSqrt();
Matrix<ElemType>& AssignSqrtOf(const Matrix<ElemType>& a);

View file

@ -26,10 +26,8 @@ NcclComm::NcclComm(int deviceId, const MPIWrapperPtr& mpi)
return;
size_t numRanks = mpi->NumNodesInUse();
MPI_Comm mpiComm = mpi->Communicator();
std::vector<int> allDevs(numRanks);
MPI_Allgather(&deviceId, 1, MPI_INT, allDevs.data(), 1, MPI_INT, mpiComm)
|| MpiFail("NcclComm: MPI_Allgather");
mpi->Allgather(&deviceId, 1, MPI_INT, allDevs.data(), 1, MPI_INT);
for (size_t r = 0; r<numRanks; r++)
{
@ -53,8 +51,7 @@ NcclComm::NcclComm(int deviceId, const MPIWrapperPtr& mpi)
if (res != ncclSuccess)
RuntimeError("NcclComm failed to obtain ncclUniqueId: %s", ncclGetErrorString(res));
MPI_Bcast(&ncclId, NCCL_UNIQUE_ID_BYTES, MPI_CHAR, 0, mpiComm)
|| MpiFail("NcclComm: MPI_Bcase");
mpi->Bcast(&ncclId, NCCL_UNIQUE_ID_BYTES, MPI_CHAR, 0);
PrepareDevice(deviceId);
res = ncclCommInitRank(&m_ncclComm, numRanks, ncclId, mpi->CurrentNodeRank());

Some files were not shown because too many files changed in this diff.