Merge branch 'master' into qiwye/asgd-dev

Conflicts:
	Source/ActionsLib/ActionsLib.vcxproj
	Source/Readers/HTKMLFReader/utterancesourcemulti.h
	Source/SGDLib/SGDLib.vcxproj
This commit is contained in:
Qiwei Ye 2016-04-28 13:56:19 +08:00
Parent 7ce849065a c34e358c2a
Commit 679b55df50
292 changed files: 63464 additions and 38091 deletions

4
.gitattributes vendored

@ -9,6 +9,7 @@ Dockerfile-GPU text
*.post text
*.cpu text
*.gpu text
*.rst text
.gitattributes text
.gitignore text
@ -75,6 +76,9 @@ mean.363 text
var.363 text
prior.132 text
# dot (graph description language) file
*.dot text
# AMI-specific
Results text
40fbank.conf text


@ -47,4 +47,52 @@
<CudaLibPath>$(CudaPath)\lib\$(Platform)</CudaLibPath>
</PropertyGroup>
<!-- TODO warn if ConfigurationType not (yet) defined -->
<PropertyGroup Condition="'$(ConfigurationType)' == 'StaticLibrary'">
<UseDebugLibraries>$(DebugBuild)</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
<WholeProgramOptimization>$(ReleaseBuild)</WholeProgramOptimization>
<LinkIncremental>$(DebugBuild)</LinkIncremental>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(ConfigurationType)' == 'StaticLibrary'">
<ClCompile>
<WarningLevel>Level4</WarningLevel>
<TreatWarningAsError>true</TreatWarningAsError>
<SDLCheck>true</SDLCheck>
<OpenMPSupport>true</OpenMPSupport>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(ConfigurationType)' == 'StaticLibrary' And $(DebugBuild)">
<ClCompile>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</ClCompile>
<Link>
<StackReserveSize>100000000</StackReserveSize>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(ConfigurationType)' == 'StaticLibrary' And $(ReleaseBuild)">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</ClCompile>
<Link>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<Profile>true</Profile>
</Link>
</ItemDefinitionGroup>
</Project>

118
CNTK.sln

@ -7,8 +7,10 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTK", "Source\CNTK\CNTK.vc
ProjectSection(ProjectDependencies) = postProject
{928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {928ABD1B-4D3B-4017-AEF1-0FA1B4467513}
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
{EB2BE26F-6BD4-4274-971F-86D080779DD1} = {EB2BE26F-6BD4-4274-971F-86D080779DD1}
{EAD17188-072C-4726-B840-A769C36DAD1B} = {EAD17188-072C-4726-B840-A769C36DAD1B}
{DE3C54E5-D7D0-47AF-A783-DFDCE59E7937} = {DE3C54E5-D7D0-47AF-A783-DFDCE59E7937}
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Tests", "Tests", "{D45DF403-6781-444E-B654-A96868C5BE68}"
@ -129,15 +131,8 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "LSTM", "LSTM", "{19EE975B-2
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ComputationNetworkLib", "Source\ComputationNetworkLib\ComputationNetworkLib.vcxproj", "{928ABD1B-4D3B-4017-AEF1-0FA1B4467513}"
ProjectSection(ProjectDependencies) = postProject
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
{EAD17188-072C-4726-B840-A769C36DAD1B} = {EAD17188-072C-4726-B840-A769C36DAD1B}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SGDLib", "Source\SGDLib\SGDLib.vcxproj", "{DE3C54E5-D7D0-47AF-A783-DFDCE59E7937}"
ProjectSection(ProjectDependencies) = postProject
{928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {928ABD1B-4D3B-4017-AEF1-0FA1B4467513}
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "ParallelTraining", "ParallelTraining", "{5E666C53-2D82-49C9-9127-3FDDC321C741}"
ProjectSection(SolutionItems) = preProject
@ -278,14 +273,10 @@ EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MathTests", "Tests\UnitTests\MathTests\MathTests.vcxproj", "{4701E678-5E6F-470D-B348-9CD1A2C095D1}"
ProjectSection(ProjectDependencies) = postProject
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ActionsLib", "Source\ActionsLib\ActionsLib.vcxproj", "{EB2BE26F-6BD4-4274-971F-86D080779DD1}"
ProjectSection(ProjectDependencies) = postProject
{928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {928ABD1B-4D3B-4017-AEF1-0FA1B4467513}
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
{DE3C54E5-D7D0-47AF-A783-DFDCE59E7937} = {DE3C54E5-D7D0-47AF-A783-DFDCE59E7937}
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "SequenceTraining", "SequenceTraining", "{BB8B9FC5-C4B3-477F-80E2-665DC8E431BD}"
ProjectSection(SolutionItems) = preProject
@ -363,6 +354,7 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ReaderTests", "Tests\UnitTe
{9BD0A711-0BBD-45B6-B81C-053F03C26CFB} = {9BD0A711-0BBD-45B6-B81C-053F03C26CFB}
{33D2FD22-DEF2-4507-A58A-368F641AEBE5} = {33D2FD22-DEF2-4507-A58A-368F641AEBE5}
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
{91973E60-A7BE-4C86-8FDB-59C88A0B3715} = {91973E60-A7BE-4C86-8FDB-59C88A0B3715}
{7B7A51ED-AA8E-4660-A805-D50235A02120} = {7B7A51ED-AA8E-4660-A805-D50235A02120}
{E6646FFE-3588-4276-8A15-8D65C22711C1} = {E6646FFE-3588-4276-8A15-8D65C22711C1}
@ -373,10 +365,12 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "EvalDll", "Source\EvalDll\E
{928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {928ABD1B-4D3B-4017-AEF1-0FA1B4467513}
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
{EB2BE26F-6BD4-4274-971F-86D080779DD1} = {EB2BE26F-6BD4-4274-971F-86D080779DD1}
{EAD17188-072C-4726-B840-A769C36DAD1B} = {EAD17188-072C-4726-B840-A769C36DAD1B}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Math", "Source\Math\Math.vcxproj", "{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}"
ProjectSection(ProjectDependencies) = postProject
{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
{B3DD765E-694E-4494-BAD7-37BBF2942517} = {B3DD765E-694E-4494-BAD7-37BBF2942517}
EndProjectSection
EndProject
@ -385,46 +379,55 @@ EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LibSVMBinaryReader", "Source\Readers\LibSVMBinaryReader\LibSVMBinaryReader.vcxproj", "{D667AF32-028A-4A5D-BE19-F46776F0F6B2}"
ProjectSection(ProjectDependencies) = postProject
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "BinaryReader", "Source\Readers\BinaryReader\BinaryReader.vcxproj", "{1D5787D4-52E4-45DB-951B-82F220EE0C6A}"
ProjectSection(ProjectDependencies) = postProject
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "DSSMReader", "Source\Readers\DSSMReader\DSSMReader.vcxproj", "{014DA766-B37B-4581-BC26-963EA5507931}"
ProjectSection(ProjectDependencies) = postProject
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "HTKMLFReader", "Source\Readers\HTKMLFReader\HTKMLFReader.vcxproj", "{33D2FD22-DEF2-4507-A58A-368F641AEBE5}"
ProjectSection(ProjectDependencies) = postProject
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LMSequenceReader", "Source\Readers\LMSequenceReader\LMSequenceReader.vcxproj", "{9A2F2441-5972-4EA8-9215-4119FCE0FB68}"
ProjectSection(ProjectDependencies) = postProject
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LUSequenceReader", "Source\Readers\LUSequenceReader\LUSequenceReader.vcxproj", "{62836DC1-DF77-4B98-BF2D-45C943B7DDC6}"
ProjectSection(ProjectDependencies) = postProject
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "SparsePCReader", "Source\Readers\SparsePCReader\SparsePCReader.vcxproj", "{CE429AA2-3778-4619-8FD1-49BA3B81197B}"
ProjectSection(ProjectDependencies) = postProject
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "UCIFastReader", "Source\Readers\UCIFastReader\UCIFastReader.vcxproj", "{E6646FFE-3588-4276-8A15-8D65C22711C1}"
ProjectSection(ProjectDependencies) = postProject
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "EvalTest", "Tests\UnitTests\EvalTest\EvalTest.vcxproj", "{731312A8-6DA3-4841-AFCD-57520BA1BF8E}"
ProjectSection(ProjectDependencies) = postProject
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MathPerformanceTests", "Tests\UnitTests\MathPerformanceTests\MathPerformanceTests.vcxproj", "{668BEED5-AC07-4F35-B3AE-EE65A7F9C976}"
@ -453,28 +456,6 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "GPU", "GPU", "{2A1F0FB0-230
Tests\EndToEndTests\LM\RNNLM\GPU\rnnlm.cntk = Tests\EndToEndTests\LM\RNNLM\GPU\rnnlm.cntk
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "SLU", "SLU", "{BFBC6BE1-C33E-4A80-B8F3-A33410EC00FC}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\SLU\atis.dev.IOB.simple = Tests\EndToEndTests\SLU\atis.dev.IOB.simple
Tests\EndToEndTests\SLU\atis.test.apos.pred.pos.head.IOB.simple = Tests\EndToEndTests\SLU\atis.test.apos.pred.pos.head.IOB.simple
Tests\EndToEndTests\SLU\atis.train.apos.pred.pos.head.IOB.simple = Tests\EndToEndTests\SLU\atis.train.apos.pred.pos.head.IOB.simple
Tests\EndToEndTests\SLU\baseline.linux.cpu.txt = Tests\EndToEndTests\SLU\baseline.linux.cpu.txt
Tests\EndToEndTests\SLU\baseline.linux.gpu.txt = Tests\EndToEndTests\SLU\baseline.linux.gpu.txt
Tests\EndToEndTests\SLU\baseline.windows.cpu.txt = Tests\EndToEndTests\SLU\baseline.windows.cpu.txt
Tests\EndToEndTests\SLU\baseline.windows.gpu.txt = Tests\EndToEndTests\SLU\baseline.windows.gpu.txt
Tests\EndToEndTests\SLU\globals.cntk = Tests\EndToEndTests\SLU\globals.cntk
Tests\EndToEndTests\SLU\input.txt = Tests\EndToEndTests\SLU\input.txt
Tests\EndToEndTests\SLU\inputmap.txt = Tests\EndToEndTests\SLU\inputmap.txt
Tests\EndToEndTests\SLU\lstm.ndl = Tests\EndToEndTests\SLU\lstm.ndl
Tests\EndToEndTests\SLU\lstmNDL.txt = Tests\EndToEndTests\SLU\lstmNDL.txt
Tests\EndToEndTests\SLU\output.txt = Tests\EndToEndTests\SLU\output.txt
Tests\EndToEndTests\SLU\README.txt = Tests\EndToEndTests\SLU\README.txt
Tests\EndToEndTests\SLU\rnnlu.cntk = Tests\EndToEndTests\SLU\rnnlu.cntk
Tests\EndToEndTests\SLU\rnnlu.ndl.cntk = Tests\EndToEndTests\SLU\rnnlu.ndl.cntk
Tests\EndToEndTests\SLU\run-test = Tests\EndToEndTests\SLU\run-test
Tests\EndToEndTests\SLU\testcases.yml = Tests\EndToEndTests\SLU\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "MNIST", "MNIST", "{FA33A61E-95C7-4049-8111-22058CE361A3}"
ProjectSection(SolutionItems) = preProject
Examples\Image\MNIST\README.md = Examples\Image\MNIST\README.md
@ -773,9 +754,6 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "03_ConvBatchNorm", "03_Conv
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ReaderLib", "Source\Readers\ReaderLib\ReaderLib.vcxproj", "{F0A9637C-20DA-42F0-83D4-23B4704DE602}"
ProjectSection(ProjectDependencies) = postProject
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Simple2d", "Simple2d", "{D456FA9C-A51C-48B9-87DE-0F7D8A910265}"
EndProject
@ -841,32 +819,31 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "PennTreebank", "PennTreeban
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "RNN", "RNN", "{B72C5B0E-38E8-41BF-91FE-0C1012C7C078}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\Examples\Text\PennTreebank\RNN\baseline.linux.debug.cpu.txt = Tests\EndToEndTests\Examples\Text\PennTreebank\RNN\baseline.linux.debug.cpu.txt
Tests\EndToEndTests\Examples\Text\PennTreebank\RNN\baseline.linux.debug.gpu.txt = Tests\EndToEndTests\Examples\Text\PennTreebank\RNN\baseline.linux.debug.gpu.txt
Tests\EndToEndTests\Examples\Text\PennTreebank\RNN\baseline.linux.release.cpu.txt = Tests\EndToEndTests\Examples\Text\PennTreebank\RNN\baseline.linux.release.cpu.txt
Tests\EndToEndTests\Examples\Text\PennTreebank\RNN\baseline.linux.release.gpu.txt = Tests\EndToEndTests\Examples\Text\PennTreebank\RNN\baseline.linux.release.gpu.txt
Tests\EndToEndTests\Examples\Text\PennTreebank\RNN\baseline.windows.debug.cpu.txt = Tests\EndToEndTests\Examples\Text\PennTreebank\RNN\baseline.windows.debug.cpu.txt
Tests\EndToEndTests\Examples\Text\PennTreebank\RNN\baseline.windows.debug.gpu.txt = Tests\EndToEndTests\Examples\Text\PennTreebank\RNN\baseline.windows.debug.gpu.txt
Tests\EndToEndTests\Examples\Text\PennTreebank\RNN\baseline.windows.release.cpu.txt = Tests\EndToEndTests\Examples\Text\PennTreebank\RNN\baseline.windows.release.cpu.txt
Tests\EndToEndTests\Examples\Text\PennTreebank\RNN\baseline.windows.release.gpu.txt = Tests\EndToEndTests\Examples\Text\PennTreebank\RNN\baseline.windows.release.gpu.txt
Tests\EndToEndTests\Examples\Text\PennTreebank\RNN\baseline.linux.cpu.txt = Tests\EndToEndTests\Examples\Text\PennTreebank\RNN\baseline.linux.cpu.txt
Tests\EndToEndTests\Examples\Text\PennTreebank\RNN\baseline.linux.gpu.txt = Tests\EndToEndTests\Examples\Text\PennTreebank\RNN\baseline.linux.gpu.txt
Tests\EndToEndTests\Examples\Text\PennTreebank\RNN\baseline.windows.cpu.txt = Tests\EndToEndTests\Examples\Text\PennTreebank\RNN\baseline.windows.cpu.txt
Tests\EndToEndTests\Examples\Text\PennTreebank\RNN\baseline.windows.gpu.txt = Tests\EndToEndTests\Examples\Text\PennTreebank\RNN\baseline.windows.gpu.txt
Tests\EndToEndTests\Examples\Text\PennTreebank\RNN\run-test = Tests\EndToEndTests\Examples\Text\PennTreebank\RNN\run-test
Tests\EndToEndTests\Examples\Text\PennTreebank\RNN\testcases.yml = Tests\EndToEndTests\Examples\Text\PennTreebank\RNN\testcases.yml
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKTextFormatReader", "Source\Readers\CNTKTextFormatReader\CNTKTextFormatReader.vcxproj", "{91973E60-A7BE-4C86-8FDB-59C88A0B3715}"
ProjectSection(ProjectDependencies) = postProject
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
{F0A9637C-20DA-42F0-83D4-23B4704DE602} = {F0A9637C-20DA-42F0-83D4-23B4704DE602}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ExperimentalHTKMLFReader", "Source\Readers\ExperimentalHTKMLFReader\ExperimentalHTKMLFReader.vcxproj", "{7B7A51ED-AA8E-4660-A805-D50235A02120}"
ProjectSection(ProjectDependencies) = postProject
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
{F0A9637C-20DA-42F0-83D4-23B4704DE602} = {F0A9637C-20DA-42F0-83D4-23B4704DE602}
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ImageReader", "Source\Readers\ImageReader\ImageReader.vcxproj", "{9BD0A711-0BBD-45B6-B81C-053F03C26CFB}"
ProjectSection(ProjectDependencies) = postProject
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
{F0A9637C-20DA-42F0-83D4-23B4704DE602} = {F0A9637C-20DA-42F0-83D4-23B4704DE602}
EndProjectSection
EndProject
@ -909,6 +886,7 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "NetworkTests", "Tests\UnitT
ProjectSection(ProjectDependencies) = postProject
{928ABD1B-4D3B-4017-AEF1-0FA1B4467513} = {928ABD1B-4D3B-4017-AEF1-0FA1B4467513}
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
{EB2BE26F-6BD4-4274-971F-86D080779DD1} = {EB2BE26F-6BD4-4274-971F-86D080779DD1}
{EAD17188-072C-4726-B840-A769C36DAD1B} = {EAD17188-072C-4726-B840-A769C36DAD1B}
EndProjectSection
@ -923,8 +901,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "CIFAR-10", "CIFAR-10", "{01
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "01_Convolution", "01_Convolution", "{58286327-6742-44C4-A34E-D2583419E55E}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\01_Convolution\baseline.linux.cpu.txt = Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\01_Convolution\baseline.linux.cpu.txt
Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\01_Convolution\baseline.linux.gpu.txt = Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\01_Convolution\baseline.linux.gpu.txt
Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\01_Convolution\baseline.linux.txt = Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\01_Convolution\baseline.linux.txt
Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\01_Convolution\baseline.windows.txt = Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\01_Convolution\baseline.windows.txt
Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\01_Convolution\run-test = Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\01_Convolution\run-test
Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\01_Convolution\testcases.yml = Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\01_Convolution\testcases.yml
@ -954,6 +931,18 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "04_ResNet", "04_ResNet", "{
Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\04_ResNet_56\testcases.yml = Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\04_ResNet_56\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Miscellaneous", "Miscellaneous", "{CCB0CD89-DE53-4104-94D3-041D46FC8885}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "SLU", "SLU", "{B900D033-DC37-45F1-AE52-F35584FD3024}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\Examples\Text\Miscellaneous\SLU\baseline.linux.cpu.txt = Tests\EndToEndTests\Examples\Text\Miscellaneous\SLU\baseline.linux.cpu.txt
Tests\EndToEndTests\Examples\Text\Miscellaneous\SLU\baseline.linux.gpu.txt = Tests\EndToEndTests\Examples\Text\Miscellaneous\SLU\baseline.linux.gpu.txt
Tests\EndToEndTests\Examples\Text\Miscellaneous\SLU\baseline.windows.cpu.txt = Tests\EndToEndTests\Examples\Text\Miscellaneous\SLU\baseline.windows.cpu.txt
Tests\EndToEndTests\Examples\Text\Miscellaneous\SLU\baseline.windows.gpu.txt = Tests\EndToEndTests\Examples\Text\Miscellaneous\SLU\baseline.windows.gpu.txt
Tests\EndToEndTests\Examples\Text\Miscellaneous\SLU\run-test = Tests\EndToEndTests\Examples\Text\Miscellaneous\SLU\run-test
Tests\EndToEndTests\Examples\Text\Miscellaneous\SLU\testcases.yml = Tests\EndToEndTests\Examples\Text\Miscellaneous\SLU\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Config", "Config", "{EC780385-7580-4D15-914B-1D878A295CBC}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\Text\SequenceClassification\Config\seqcla.cntk = Tests\EndToEndTests\Text\SequenceClassification\Config\seqcla.cntk
@ -965,6 +954,28 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Data", "Data", "{D11F76CC-D
Tests\EndToEndTests\Text\SequenceClassification\Data\Train.txt = Tests\EndToEndTests\Text\SequenceClassification\Data\Train.txt
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "SLU", "SLU", "{181664AC-4C95-4798-A923-09B879215B33}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\Text\SLU\atis.dev.IOB.simple = Tests\EndToEndTests\Text\SLU\atis.dev.IOB.simple
Tests\EndToEndTests\Text\SLU\atis.test.apos.pred.pos.head.IOB.simple = Tests\EndToEndTests\Text\SLU\atis.test.apos.pred.pos.head.IOB.simple
Tests\EndToEndTests\Text\SLU\atis.train.apos.pred.pos.head.IOB.simple = Tests\EndToEndTests\Text\SLU\atis.train.apos.pred.pos.head.IOB.simple
Tests\EndToEndTests\Text\SLU\baseline.linux.cpu.txt = Tests\EndToEndTests\Text\SLU\baseline.linux.cpu.txt
Tests\EndToEndTests\Text\SLU\baseline.linux.gpu.txt = Tests\EndToEndTests\Text\SLU\baseline.linux.gpu.txt
Tests\EndToEndTests\Text\SLU\baseline.windows.cpu.txt = Tests\EndToEndTests\Text\SLU\baseline.windows.cpu.txt
Tests\EndToEndTests\Text\SLU\baseline.windows.gpu.txt = Tests\EndToEndTests\Text\SLU\baseline.windows.gpu.txt
Tests\EndToEndTests\Text\SLU\globals.cntk = Tests\EndToEndTests\Text\SLU\globals.cntk
Tests\EndToEndTests\Text\SLU\input.txt = Tests\EndToEndTests\Text\SLU\input.txt
Tests\EndToEndTests\Text\SLU\inputmap.txt = Tests\EndToEndTests\Text\SLU\inputmap.txt
Tests\EndToEndTests\Text\SLU\lstm.ndl = Tests\EndToEndTests\Text\SLU\lstm.ndl
Tests\EndToEndTests\Text\SLU\output.txt = Tests\EndToEndTests\Text\SLU\output.txt
Tests\EndToEndTests\Text\SLU\rnnlu.cntk = Tests\EndToEndTests\Text\SLU\rnnlu.cntk
Tests\EndToEndTests\Text\SLU\rnnlu.ndl.cntk = Tests\EndToEndTests\Text\SLU\rnnlu.ndl.cntk
Tests\EndToEndTests\Text\SLU\run-test = Tests\EndToEndTests\Text\SLU\run-test
Tests\EndToEndTests\Text\SLU\testcases.yml = Tests\EndToEndTests\Text\SLU\testcases.yml
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Common", "Source\Common\Common.vcxproj", "{86883653-8A61-4038-81A0-2379FAE4200A}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug_CpuOnly|x64 = Debug_CpuOnly|x64
@ -1187,6 +1198,14 @@ Global
{CDA96AA3-3252-4978-A0BF-2ACD670823CB}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
{CDA96AA3-3252-4978-A0BF-2ACD670823CB}.Release|x64.ActiveCfg = Release|x64
{CDA96AA3-3252-4978-A0BF-2ACD670823CB}.Release|x64.Build.0 = Release|x64
{86883653-8A61-4038-81A0-2379FAE4200A}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64
{86883653-8A61-4038-81A0-2379FAE4200A}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64
{86883653-8A61-4038-81A0-2379FAE4200A}.Debug|x64.ActiveCfg = Debug|x64
{86883653-8A61-4038-81A0-2379FAE4200A}.Debug|x64.Build.0 = Debug|x64
{86883653-8A61-4038-81A0-2379FAE4200A}.Release_CpuOnly|x64.ActiveCfg = Release_CpuOnly|x64
{86883653-8A61-4038-81A0-2379FAE4200A}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
{86883653-8A61-4038-81A0-2379FAE4200A}.Release|x64.ActiveCfg = Release|x64
{86883653-8A61-4038-81A0-2379FAE4200A}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
@ -1250,7 +1269,6 @@ Global
{811924DE-2F12-4EA0-BE58-E57BEF3B74D1} = {3BF59CCE-D245-420A-9F17-73CE61E284C2}
{96012801-5187-4FAF-A54E-BF4B73C855F8} = {811924DE-2F12-4EA0-BE58-E57BEF3B74D1}
{2A1F0FB0-2304-4F35-87B3-66230C6E58F0} = {811924DE-2F12-4EA0-BE58-E57BEF3B74D1}
{BFBC6BE1-C33E-4A80-B8F3-A33410EC00FC} = {6E565B48-1923-49CE-9787-9BBB9D96F4C5}
{FA33A61E-95C7-4049-8111-22058CE361A3} = {9BDFA4BE-790E-408F-915B-5979BB5078C6}
{F99E1E80-50D8-421C-AD94-8ED0DF08C355} = {9BDFA4BE-790E-408F-915B-5979BB5078C6}
{ED57E827-B28F-4BEE-BFB7-398EF8D83357} = {FA33A61E-95C7-4049-8111-22058CE361A3}
@ -1314,7 +1332,11 @@ Global
{AB9207B9-B134-4C57-B7ED-F3DCF7B0DC5F} = {0141526B-F257-4574-8CBE-99634726FFCE}
{12FB912C-43F8-40FE-BD7F-B52F589A1EBC} = {0141526B-F257-4574-8CBE-99634726FFCE}
{2BFE4D88-6F32-4701-887A-1DE3D7626DBB} = {0141526B-F257-4574-8CBE-99634726FFCE}
{CCB0CD89-DE53-4104-94D3-041D46FC8885} = {439BE0E0-FABE-403D-BF2C-A41FB8A60616}
{B900D033-DC37-45F1-AE52-F35584FD3024} = {CCB0CD89-DE53-4104-94D3-041D46FC8885}
{EC780385-7580-4D15-914B-1D878A295CBC} = {E53E63A0-FAA9-4416-9AD1-08A8FB87FEE1}
{D11F76CC-DB6D-4CB4-B3B7-AB139DE2F5FA} = {E53E63A0-FAA9-4416-9AD1-08A8FB87FEE1}
{181664AC-4C95-4798-A923-09B879215B33} = {8656B71D-E24C-4AC2-8BE4-C07B415A3E15}
{86883653-8A61-4038-81A0-2379FAE4200A} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
EndGlobalSection
EndGlobal


@ -49,7 +49,12 @@ files=(;c:\data.txt;c:\labels.txt)
<td>Parameter Set</td>
<td>
<pre><code>section1=[id=1;size=256] section2=[ subsection=[string="hi";num=5] value=1e-10 array=10:"this is a test":1.25 ]
<pre><code>section1=[id=1;size=256]
section2=[
subsection=[string="hi";num=5]
value=1e-10
array=10:"this is a test":1.25
]
</code></pre>
</td>
@ -66,7 +71,7 @@ files=(;c:\data.txt;c:\labels.txt)
### Organization
In CNTK configuration files Parameter Sets are organized in a hierarchal fashion. The actual data values are not evaluated until a CNTK components requests the value. When a value is requested, by a component, it will search that components section of the configuration file, if the value is not found, it will continue looking in the parent parameter set and continue looking in parent parameter sets until the parameter is found, or the top level of the configuration hierarchy is reached without a match.
In CNTK configuration files, Parameter Sets are organized in a hierarchical fashion. The actual data values are not evaluated until a CNTK component requests the value. When a component requests a value, CNTK searches that component's section of the configuration file; if the value is not found there, it continues looking in the parent parameter set, and then in successive parent parameter sets, until the parameter is found or the top level of the configuration hierarchy is reached without a match.
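
To make the lookup order concrete, here is a minimal illustrative sketch of the hierarchical search just described; the struct and member names are hypothetical and are not the actual CNTK configuration classes.

```
#include <map>
#include <string>

// Hypothetical illustration of the hierarchical lookup described above:
// each parameter set searches itself first, then falls back to its parent.
struct ParameterSet
{
    std::map<std::string, std::string> values;
    const ParameterSet* parent = nullptr;

    // Returns the value if found in this set or any ancestor, else an empty string.
    std::string Find(const std::string& name) const
    {
        auto it = values.find(name);
        if (it != values.end())
            return it->second;
        return parent ? parent->Find(name) : std::string();
    }
};
```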
### Default Values
@ -107,7 +112,69 @@ This section will go through a sample configuration file that creates a simple D
Here is a simple example of a configuration file:
```
# sample configuration file for CNTK command=mnistTrain:mnistTest #global parameters, all commands use these values unless overridden at a higher level precision=float deviceId=auto #commands used will be appended the stderr name to create a path stderr=c:\cntk\log\cntk # “_mnistTrain_mnistTest.log” would be appended traceLevel=0 # larger values mean more output ndlMacros=C:\cntk\config\DefaultMacros.ndl modelPath=c:\cntk\model\sample.dnn labelMappingFile=c:\cntk\data\mnist\labels.map mnistTrain=[ action=train minibatchSize=32 epochSize=60000 NDLNetworkBuilder=[ networkDescription=c:\cntk\config\sample.ndl run=ndlMacroUse ] SGD=[ #modelPath - moved to root level to share with mnistTest learningRatesPerMB=0.001 maxEpochs=50 ] reader=[ readerType=UCIFastReader file=c:\cntk\data\mnist\mnist_train.txt features=[ dim=784 start=1 ] labels=[ dim=1 start=0 labelDim=10 ] ] ] mnistTest=[ action=eval maxEpochs=1 epochSize=10000 minibatchSize=1000 reader=[ readerType=UCIFastReader randomize=None file=c:\data\mnist\mnist_test.txt features=[ dim=784 start=1 ] labels=[ dim=1 start=0 labelDim=10 ] ] ]
# sample configuration file for CNTK
command=mnistTrain:mnistTest
#global parameters, all commands use these values unless overridden at a higher level
precision=float
deviceId=auto
#commands used will be appended the stderr name to create a path
stderr=c:\cntk\log\cntk # “_mnistTrain_mnistTest.log” would be appended
traceLevel=0 # larger values mean more output
ndlMacros=C:\cntk\config\DefaultMacros.ndl
modelPath=c:\cntk\model\sample.dnn
labelMappingFile=c:\cntk\data\mnist\labels.map
mnistTrain=[
action=train
minibatchSize=32
epochSize=60000
NDLNetworkBuilder=[
networkDescription=c:\cntk\config\sample.ndl
run=ndlMacroUse
]
SGD=[
#modelPath - moved to root level to share with mnistTest
learningRatesPerMB=0.001
maxEpochs=50
]
reader=[
readerType=UCIFastReader
file=c:\cntk\data\mnist\mnist_train.txt
features=[
dim=784
start=1
]
labels=[
dim=1
start=0
labelDim=10
]
]
]
mnistTest=[
action=eval
maxEpochs=1
epochSize=10000
minibatchSize=1000
reader=[
readerType=UCIFastReader
randomize=None
file=c:\data\mnist\mnist_test.txt
features=[
dim=784
start=1
]
labels=[
dim=1
start=0
labelDim=10
]
]
]
```
### Commands and actions
@ -121,7 +188,9 @@ command=mnistTrain:mnistTest
This command instructs CNTK to execute the **mnistTrain** section of the config file, followed by mnistTest. Each of these Config sections has an action associated with it:
```
mnistTrain=[ action=train …
mnistTrain=[
action=train
```
The **mnistTrain** section will execute the **train** action, and the **mnistTest** section will execute **eval**. The names of the sections are arbitrary, but the configuration parameter names must be command and action.
@ -158,7 +227,9 @@ all | Use all the available GPU devices (will use PTask engine if more than one
Log files are a redirection of the normal standard error output. All log information is sent to standard error and will appear on the console screen unless the stderr parameter is defined or some other form of user redirection is active. The stderr parameter defines the directory and the prefix for the log file. The suffix is defined by which commands are being run. As an example, if “abc” is the setting, “abc\_mnistTrain.log” would be the log file name. It is important to note that this file is overwritten on subsequent executions if the stderr parameter and the command being run are identical.
```
#commands used will be appended the stderr name to create a path stderr=c:\cntk\log\cntk # “_mnistTrain_mnistTest.log” would be appended traceLevel=0 # larger values mean more output
#commands used will be appended the stderr name to create a path
stderr=c:\cntk\log\cntk # “_mnistTrain_mnistTest.log” would be appended
traceLevel=0 # larger values mean more output
```
The **traceLevel** parameter is used uniformly by the code in CNTK to specify how much extra output (verbosity) is desired. The default value is 0 (zero) and specifies minimal output; the higher the number, the more output can be expected. Currently 0 (limited output), 1 (medium output), and 2 (verbose output) are the only values supported.
@ -168,7 +239,9 @@ The **traceLevel** parameter is uniformly used by the code in CNTK to specify ho
It is often advantageous to set some values at the top level of the config file, because config searches start with the target section and continue the search in higher-level sections. If the same parameter is used in multiple sections, it can be a good idea to put the parameter at a higher level where both sections can share it. In our example the following parameters are used by both the train and the test step:
```
ndlMacros=C:\cntk\config\DefaultMacros.ndl modelPath=c:\cntk\model\sample.dnn labelMappingFile=c:\cntk\data\mnist\labels.map
ndlMacros=C:\cntk\config\DefaultMacros.ndl
modelPath=c:\cntk\model\sample.dnn
labelMappingFile=c:\cntk\data\mnist\labels.map
```
It can also be advantageous to specify parameters that often change all in one area, rather than separated into the sections to which the parameters belong. These commonly modified parameters can even be placed in a separate file if desired. See the layered config files in the reference section for more information.
@ -193,7 +266,34 @@ sub-section | Options | Description
For the Network Builder and the Trainer, the existence of the sub-section name tells the train action which component to use. For example, **NDLNetworkBuilder** is specified in our example, so CNTK will use the NDL Network Builder to define the network. Similarly, **SGD** is specified, so that trainer will be used. The reader sub-section is a little different: it is always called **reader**, and the **readerType** parameter in the sub-section defines which reader will actually be used. Readers are implemented as separate DLLs, and the name of the reader is also the name of the DLL file that will be loaded.
```
mnistTrain=[ action=train minibatchSize=32 epochSize=60000 NDLNetworkBuilder=[ networkDescription=c:\cntk\config\sample.ndl run=ndlMacroUse ] SGD=[ #modelPath - moved to root level to share with mnistTest learningRatesPerMB=0.001 maxEpochs=50 ] reader=[ readerType=UCIFastReader file=c:\cntk\data\mnist\mnist_train.txt features=[ dim=784 start=1 ] labels=[ dim=1 start=0 labelDim=10 ] ] ]
mnistTrain=[
action=train
minibatchSize=32
epochSize=60000
NDLNetworkBuilder=[
networkDescription=c:\cntk\config\sample.ndl
run=ndlMacroUse
]
SGD=[
#modelPath - moved to root level to share with mnistTest
learningRatesPerMB=0.001
maxEpochs=50
]
reader=[
readerType=UCIFastReader
file=c:\cntk\data\mnist\mnist_train.txt
features=[
dim=784
start=1
]
labels=[
dim=1
start=0
labelDim=10
]
]
]
```
The rest of the parameters in the mnistTrain Command Section are briefly explained here; more details about the parameters available for each component are given in the Configuration Reference section of this document.
@ -212,7 +312,11 @@ epochSize=60000
**epochSize** is the number of dataset records that will be processed in a training pass. It is most often set to be the same as the dataset size, but can be smaller or larger than the dataset. It defaults to the size of the dataset if not present in the configuration file. It can also be set to zero for SGD, which has the same meaning.
```
SGD=[ #modelPath - moved to root level to share with mnistTest learningRatesPerMB=0.001 maxEpochs=50 ]
SGD=[
#modelPath - moved to root level to share with mnistTest
learningRatesPerMB=0.001
maxEpochs=50
]
```
**modelPath** is the path to the model file, and will be the name used when a model is completely trained. For epochs prior to the final model, a number will be appended to the end signifying the epoch that was saved (e.g. myModel.dnn.5). These intermediate files are important to allow the training process to restart after an interruption. Training will automatically resume at the first non-existent epoch when training is restarted.
@ -246,7 +350,19 @@ readerType=UCIFastReader
Each of the readers uses the same interface into CNTK, and each reader is implemented in a separate DLL. There are many parameters in the reader section that are used by all the different types of readers, and some are specific to a particular reader. Our example reader section is as follows:
```
reader=[ readerType=UCIFastReader file=c:\cntk\data\mnist\mnist_train.txt features=[ dim=784 start=1 ] labels=[ dim=1 start=0 labelDim=10 ] ]
reader=[
readerType=UCIFastReader
file=c:\cntk\data\mnist\mnist_train.txt
features=[
dim=784
start=1
]
labels=[
dim=1
start=0
labelDim=10
]
]
```
The two sub-sections in the reader section identify two different data sets. In our example they are named **features** and **labels**, though any names could be used. These names need to match the names used for the Inputs in the NDL network definition, so the correct definition is used for each input dataset. Each of these sections for the UCIFastReader has the following parameters:
@ -327,7 +443,17 @@ In addition being able to specify multiple configuration files at the command li
While layered configuration files allow users to reuse configuration files across experiments, this can still be a cumbersome process. For each experiment, a user might have to override several parameters, some of which might be long file paths (e.g. stderr, modelPath, file). The “stringize” functionality can make this process much easier. It allows a user to specify a configuration like the following:
```
command=SpeechTrain stderr=$Root$\$RunName$.log speechTrain=[ modelPath=$Root$\$RunName$.model SGD=[ reader=[ features=[ type=Real dim=$DataSet1_Dim$ file=$DataSet1_Features$ ]]]]
command=SpeechTrain
stderr=$Root$\$RunName$.log
speechTrain=[
modelPath=$Root$\$RunName$.model
SGD=[
reader=[
features=[
type=Real
dim=$DataSet1_Dim$
file=$DataSet1_Features$
]]]]
```
Here, “Root”, “RunName”, “DataSet1\_Dim”, and “DataSet1\_Features” are variables specified elsewhere in the configuration (at a scope visible from the point at which they are used). When interpreting this configuration file, the parser replaces every string of the form “$VarName$” with the string “VarValue”, where “VarValue” represents the value of the variable called “VarName”. The variable resolution process is recursive; for example, if A=$B$, B=$C$, and C=HelloWorld.txt, then A would be resolved as “HelloWorld.txt”.
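
The substitution itself can be pictured with a small sketch. The function below is purely illustrative of the recursive $VarName$ resolution described above and is not the actual CNTK parser:

```
#include <map>
#include <string>

// Illustrative recursive $VarName$ substitution: every $Name$ is replaced by
// the (recursively resolved) value of Name, e.g. A=$B$, B=$C$, C=HelloWorld.txt.
std::string Resolve(const std::string& value,
                    const std::map<std::string, std::string>& vars)
{
    std::string result = value;
    size_t start;
    while ((start = result.find('$')) != std::string::npos)
    {
        size_t end = result.find('$', start + 1);
        if (end == std::string::npos)
            break;                                    // unmatched '$': leave text as-is
        std::string name = result.substr(start + 1, end - start - 1);
        auto it = vars.find(name);
        if (it == vars.end())
            break;                                    // unknown variable: leave text as-is
        result = result.substr(0, start) + Resolve(it->second, vars)
               + result.substr(end + 1);
    }
    return result;
}
```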
@ -350,7 +476,16 @@ If a parameter occurs more than once in a given parameter set, the last occurren
There must be a top-level command parameter, which defines the commands that will be executed in the configuration file. Each command references a Command section of the file, which must contain an action parameter defining the operation that section will perform:
```
command=mnistTrain:mnistTest mnistTrain=[ action=train … ] mnistTest=[ action=eval … ]
command=mnistTrain:mnistTest
mnistTrain=[
action=train
]
mnistTest=[
action=eval
]
```
This snippet will execute the **mnistTrain** section which executes the **train** action, followed by the **mnistTest** section.
@ -525,7 +660,18 @@ Each of the readers uses the same interface into CNTK, and each reader is implem
There are many parameters in the reader section that are used by all the different types of readers, and others are specific to a particular reader. There are sub-sections under the reader section which are used to define the data records to be read. For UCIFastReader these look like:
```
reader=[ readerType=UCIFastReader file=c:\cntk\data\mnist\mnist_train.txt features=[ dim=784 start=1 ] labels=[ dim=1 start=0 labelDim=10 ]
reader=[
readerType=UCIFastReader
file=c:\cntk\data\mnist\mnist_train.txt
features=[
dim=784
start=1
]
labels=[
dim=1
start=0
labelDim=10
]
]
```
@ -654,7 +800,8 @@ For dataset processing the following parameters are used:
SequenceReader is a reader that reads text strings. It is most often used for language modeling tasks. An example of the text string is as follows:
```
</s> pierre <unk> N years old will join the board as a nonexecutive director nov. N </s> </s> mr. <unk> is chairman of <unk> n.v. the dutch publishing group </s>
</s> pierre <unk> N years old will join the board as a nonexecutive director nov. N </s>
</s> mr. <unk> is chairman of <unk> n.v. the dutch publishing group </s>
```
The symbol &lt;/s&gt; is used to denote both the beginning and the end of a sentence. However, this symbol can be changed by specifying beginSequence and endSequence.
@ -686,7 +833,19 @@ A subsection is for input label information.
LUSequenceReader is similar to SequenceReader. It is, however, used for language understanding tasks, which have input and output strings that are different. The content of an example file is listed below:
```
BOS O i O want O to O fly O from O boston B-fromloc.city_name at O 1110 B-arrive_time.time in O the O morning B-arrive_time.period_of_day EOS O
BOS O
i O
want O
to O
fly O
from O
boston B-fromloc.city_name
at O
1110 B-arrive_time.time
in O
the O
morning B-arrive_time.period_of_day
EOS O
```
consists of some unique setups as follows:
@ -704,7 +863,8 @@ The LUSequenceReader has some unique setups as follows:
- Wordmap – this specifies a file that maps inputs to other inputs. This is useful if the user wants to map some inputs to unknown symbols. For example:
```
buy buy trans <unk>
buy buy
trans <unk>
```
- File – the corpus file
@ -752,7 +912,67 @@ BinaryWriter is an implementation of a hierarchal file format the mirrors the co
The following is an example of a BinaryWriter definition. Since it is most commonly used as a cache for UCIFastReader, this definition is shown as a UCIFastReader cache. The parameters needed for BinaryWriter are in bold type below:
```
# Parameter values for the reader with cache reader=[ # reader to use readerType=UCIFastReader # if writerType is set, we will cache to a binary file # if the binary file exists, we will use it instead of parsing this file writerType=BinaryReader miniBatchMode=Partial randomize=Auto windowSize=10000 #### write definition wfile=c:\data\mnist\mnist_train.bin #wsize - inital size of the file in MB # if calculated size would be bigger, that is used instead wsize=256 #wrecords - number of records we should allocate space for in the file # files cannot be expanded, so this should be large enough. wrecords=60000 features=[ dim=784 start=1 file=c:\data\mnist\mnist_train.txt ### write definition #wsize=200 #wfile=c:\data\mnist\mnist_train_features.bin sectionType=data ] labels=[ dim=1 start=0 file=c:\data\mnist\mnist_train.txt labelMappingFile=c:\temp\labels.txt labelDim=10 labelType=Category #### Write definition #### # sizeof(unsigned) which is the label index type #wsize=10 #wfile=c:\data\mnist\mnist_train_labels.bin elementSize=4 wref=features sectionType=labels mapping=[ #redefine number of records for this section, #since we don't need to save it for each data record wrecords=10 #variable size so use an average string size elementSize=10 sectionType=labelMapping ] category=[ dim=10 #elementSize=sizeof(ElemType) is default sectionType=categoryLabels ] ] ]
# Parameter values for the reader with cache
reader=[
# reader to use
readerType=UCIFastReader
# if writerType is set, we will cache to a binary file
# if the binary file exists, we will use it instead of parsing this file
writerType=BinaryReader
miniBatchMode=Partial
randomize=Auto
windowSize=10000
#### write definition
wfile=c:\data\mnist\mnist_train.bin
#wsize - initial size of the file in MB
# if calculated size would be bigger, that is used instead
wsize=256
#wrecords - number of records we should allocate space for in the file
# files cannot be expanded, so this should be large enough.
wrecords=60000
features=[
dim=784
start=1
file=c:\data\mnist\mnist_train.txt
### write definition
#wsize=200
#wfile=c:\data\mnist\mnist_train_features.bin
sectionType=data
]
labels=[
dim=1
start=0
file=c:\data\mnist\mnist_train.txt
labelMappingFile=c:\temp\labels.txt
labelDim=10
labelType=Category
#### Write definition ####
# sizeof(unsigned) which is the label index type
#wsize=10
#wfile=c:\data\mnist\mnist_train_labels.bin
elementSize=4
wref=features
sectionType=labels
mapping=[
#redefine number of records for this section,
#since we don't need to save it for each data record
wrecords=10
#variable size so use an average string size
elementSize=10
sectionType=labelMapping
]
category=[
dim=10
#elementSize=sizeof(ElemType) is default
sectionType=categoryLabels
]
]
]
]
```
@ -865,13 +1085,18 @@ array | ConfigArray |
<li>
<pre><code>{ value value value*# }</code></pre>
<pre><code>{
value
value
value*#
}</code></pre>
</li>
</ul>
</td>
<td>Multiple values in an array are separated by colons :. A value may be repeated multiple times with the * character followed by an integer (the # in the examples). Values in an array may be of any supported type and need not be uniform. The values in a vector can also be surrounded by curly braces {}, braces are required if new lines are used as separators. An alternate separation character can be specified immediately following the opening brace if desired. </td>
<td>Multiple values in an array are separated by colons :. A value may be repeated multiple times with the * character followed by an integer (the # in the examples). Values in an array may be of any supported type and need not be uniform. The values in a vector can also be surrounded by curly braces {}, braces are required if new lines are used as separators. An alternate separation character can be specified immediately following the opening brace if desired.
</td>
</tr>
<!-- DICTIONARY ROW -->
@ -894,12 +1119,18 @@ boolparam</code></pre>
</li>
<li>
<pre><code>[ parameter1=value1 parameter2=value2 boolparam ] </code></pre>
<pre><code>[
parameter1=value1
parameter2=value2
boolparam
]
</code></pre>
</li>
</ul>
</td>
<td>Multiple parameters grouped together in a dictionary. The contents of the dictionary are each named values and can be of different types. Dictionaries can be used to create a configuration hierarchy. When specified on the same line a ; semicolon is used as the default separator. The values can optionally be surrounded by square braces []. Braces are required when using newlines as separators in a config file. An unnamed dictionary is also allowed in the case of an array of dictionaries. An alternate separation character can be specified immediately following the opening brace if desired. </td>
<td>Multiple parameters grouped together in a dictionary. The contents of the dictionary are each named values and can be of different types. Dictionaries can be used to create a configuration hierarchy. When specified on the same line a ; semicolon is used as the default separator. The values can optionally be surrounded by square braces []. Braces are required when using newlines as separators in a config file. An unnamed dictionary is also allowed in the case of an array of dictionaries. An alternate separation character can be specified immediately following the opening brace if desired.
</td>
</tr>
</table>
@ -910,7 +1141,9 @@ boolparam</code></pre>
There are three main classes that are used to access configuration files. *ConfigParameters* and *ConfigArray* contain instances of *ConfigValue*. The main definitions are as follows:
```
class ConfigValue : public std::string class ConfigParameters : public ConfigParser, public ConfigDictionary class ConfigArray:public ConfigParser, public std::vector<ConfigValue>
class ConfigValue : public std::string
class ConfigParameters : public ConfigParser, public ConfigDictionary
class ConfigArray:public ConfigParser, public std::vector<ConfigValue>
```
##### ConfigValue
@ -968,7 +1201,8 @@ To use this method with a ConfigArray, the file can simply contain a list of val
ConfigArray instances can also be converted to argvector&lt;T&gt; instances simply by assigning them. Care should be taken to assign to a local variable, rather than just passing the result as a parameter, due to lifetime issues, as follows:
```
ConfigArray configLearnRatesPerMB = config("learningRatesPerMB"); argvector<float> learnRatesPerMB = configLearnRatesPerMB;
ConfigArray configLearnRatesPerMB = config("learningRatesPerMB");
argvector<float> learnRatesPerMB = configLearnRatesPerMB;
```
ConfigParameters and ConfigArray instances are very flexible, but require parsing every time a value is accessed. argvector&lt;T&gt;, on the other hand, parses once and then accesses values as a standard vector.
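
As a hedged fragment (assuming a ConfigParameters instance named config is in scope, as in the sample code below, and that argvector exposes the usual vector accessors), the parse-once behavior makes per-epoch indexing cheap:

```
ConfigArray configLearnRatesPerMB = config("learningRatesPerMB");
argvector<float> learnRatesPerMB = configLearnRatesPerMB;   // parsed once here
for (size_t epoch = 0; epoch < learnRatesPerMB.size(); epoch++)
{
    float lr = learnRatesPerMB[epoch];   // accessed like a standard vector
    // ... run epoch 'epoch' with learning rate lr ...
}
```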
@ -978,7 +1212,60 @@ ConfigParameters and ConfigArray instances are very flexible, but require parsin
Some sample code that would parse the example configuration file given at the beginning of this document follows. This is a revised version of actual code in CNTK:
```
#include "commandArgUtil.h" // process the command void DoCommand(const ConfigParameters& config) { ConfigArray command = config("command"); for (int i=0; i < command.size(); i++) { //get the configuration parameters that match the command ConfigParameters commandParams=config(command[i]); ConfigArray action = commandParams("action","train"); // determine the action to perform, and do it for (int j=0; j < action.size(); j++) { if (action[j] == "train") DoTrain(commandParams); else if (action[j] == "test" || action[j] == "eval") DoEval(commandParams); else throw runtime_error("unknown action: " + action[j] + " in command set: " + command[i]); } } } void DoTrain(const ConfigParameters& config) { ConfigParameters configSGD=config("SGD"); ConfigParameters readerConfig = config("reader"); IComputationNetBuilder* netBuilder = NULL; ConfigParameters configNDL = config("NDLNetworkBuilder"); netBuilder = (IComputationNetBuilder*)new NDLBuilder(configNDL); DataReader* dataReader = new DataReader(readerConfig); ConfigArray learningRatesPerMBStr = configSGD("learningRatesPerMB", ""); floatargvector learningRatesPerMB = learningRatesPerMBStr; ConfigArray minibatchSize = configSGD("minibatchSize", "256"); size_t epochSize = configSGD("epochSize", "0"); if (epochSize == 0) { epochSize = requestDataSize; } size_t maxEpochs = configSGD("maxEpochs"); wstring modelPath = configSGD("modelPath"); int traceLevel = configSGD("traceLevel", "0"); SGD = sgd(learningRatesPerMB, minibatchSize, epochSize, maxEpochs, modelPath, traceLevel); sgd.Train(netBuilder, dataReader); delete netBuilder; delete dataReader; }
#include "commandArgUtil.h"
// process the command
void DoCommand(const ConfigParameters& config)
{
ConfigArray command = config("command");
for (int i=0; i < command.size(); i++)
{
//get the configuration parameters that match the command
ConfigParameters commandParams=config(command[i]);
ConfigArray action = commandParams("action","train");
// determine the action to perform, and do it
for (int j=0; j < action.size(); j++)
{
if (action[j] == "train")
DoTrain(commandParams);
else if (action[j] == "test" || action[j] == "eval")
DoEval(commandParams);
else
throw runtime_error("unknown action: " + action[j] + " in command set: " + command[i]);
}
}
}
void DoTrain(const ConfigParameters& config)
{
ConfigParameters configSGD=config("SGD");
ConfigParameters readerConfig = config("reader");
IComputationNetBuilder* netBuilder = NULL;
ConfigParameters configNDL = config("NDLNetworkBuilder");
netBuilder = (IComputationNetBuilder*)new NDLBuilder(configNDL);
DataReader* dataReader = new DataReader(readerConfig);
ConfigArray learningRatesPerMBStr = configSGD("learningRatesPerMB", "");
floatargvector learningRatesPerMB = learningRatesPerMBStr;
ConfigArray minibatchSize = configSGD("minibatchSize", "256");
size_t epochSize = configSGD("epochSize", "0");
if (epochSize == 0)
{
epochSize = requestDataSize;
}
size_t maxEpochs = configSGD("maxEpochs");
wstring modelPath = configSGD("modelPath");
int traceLevel = configSGD("traceLevel", "0");
SGD = sgd(learningRatesPerMB, minibatchSize, epochSize, maxEpochs, modelPath, traceLevel);
sgd.Train(netBuilder, dataReader);
delete netBuilder;
delete dataReader;
}
```
The code above is straightforward to write: you simply declare a config or basic-type variable on the stack and assign something from a ConfigParameters class to that variable (e.g. int i = config("setting", "default")). Both parameters with defaults and those without are used in the sample code above. The ConfigValue class, which is returned by the config() references above, takes care of parsing the value into the correct type.
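
As a short, hedged fragment of this pattern (assuming a ConfigParameters instance named config is in scope, as in DoTrain above), typed values with and without defaults are read directly:

```
int traceLevel = config("traceLevel", "0");        // int, default 0
size_t epochSize = config("epochSize", "0");       // size_t, default 0
wstring modelPath = config("modelPath");           // required parameter, no default
ConfigParameters readerConfig = config("reader");  // nested parameter set
```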
@ -994,7 +1281,10 @@ Other possible scenarios are also enabled by using a common interface, for examp
The five readers and one writer provided with CNTK all use these same interfaces and each is housed in its own DLL. CNTK loads the DLL and looks for exported functions that will return the interface of interest. The functions are defined as follows:
```
extern "C" DATAREADER_API void GetReaderF(IDataReader<float>** preader); extern "C" DATAREADER_API void GetReaderD(IDataReader<double>** preader); extern "C" DATAWRITER_API void GetWriterF(IDataWriter<float>** pwriter); extern "C" DATAWRITER_API void GetWriterD(IDataWriter<double>** pwriter);
extern "C" DATAREADER_API void GetReaderF(IDataReader<float>** preader);
extern "C" DATAREADER_API void GetReaderD(IDataReader<double>** preader);
extern "C" DATAWRITER_API void GetWriterF(IDataWriter<float>** pwriter);
extern "C" DATAWRITER_API void GetWriterD(IDataWriter<double>** pwriter);
```
Each reader or writer DLL exports the appropriate functions and returns the interface when called. The following sections define the interfaces:
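
As a hedged sketch of how a host program might obtain one of these interfaces at run time (assuming the standard Windows LoadLibrary/GetProcAddress mechanism and the IDataReader declaration shown below; the actual CNTK loader code is not reproduced in this document):

```
#include <windows.h>

typedef void (*GetReaderFProc)(IDataReader<float>** preader);

// Load a reader DLL (e.g. L"UCIFastReader.dll") and ask it for the float reader.
IDataReader<float>* LoadFloatReader(const wchar_t* dllName)
{
    HMODULE hModule = LoadLibraryW(dllName);
    if (hModule == nullptr)
        return nullptr;
    auto getReader = (GetReaderFProc)GetProcAddress(hModule, "GetReaderF");
    if (getReader == nullptr)
        return nullptr;
    IDataReader<float>* reader = nullptr;
    getReader(&reader);   // the DLL fills in the interface pointer
    return reader;
}
```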
@ -1002,7 +1292,31 @@ each reader or writer DLL exports the appropriate functions, and will return the
#### Reader Interface
```
/ Data Reader interface // implemented by DataReader and underlying classes template<class ElemType> class DATAREADER_API IDataReader { public: typedef std::string LabelType; typedef unsigned LabelIdType; virtual void Init(const ConfigParameters& config) = 0; virtual void Destroy() = 0; virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize) = 0; virtual bool GetMinibatch(std::map<std::wstring, Matrix<ElemType>*>& matrices) = 0; virtual const std::map<typename LabelIdType, typename LabelType>& GetLabelMapping(const std::wstring& sectionName) = 0; virtual void SetLabelMapping(const std::wstring& sectionName, const std::map<typename LabelIdType, typename LabelType>& labelMapping) = 0; virtual bool GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart) = 0; virtual bool DataEnd(EndDataType endDataType) = 0; // Recursive network specific methods virtual size_t NumberSlicesInEachRecurrentIter() = 0; virtual void SetNbrSlicesEachRecurrentIter(const size_t) = 0; virtual void ReloadLabels() = 0; virtual void SaveLabels() = 0; virtual void SetSentenceEndInBatch(vector<size_t> &sentenceEnd)=0; };
// Data Reader interface
// implemented by DataReader and underlying classes
template<class ElemType>
class DATAREADER_API IDataReader
{
public:
typedef std::string LabelType;
typedef unsigned LabelIdType;
virtual void Init(const ConfigParameters& config) = 0;
virtual void Destroy() = 0;
virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize) = 0;
virtual bool GetMinibatch(std::map<std::wstring, Matrix<ElemType>*>& matrices) = 0;
virtual const std::map<typename LabelIdType, typename LabelType>& GetLabelMapping(const std::wstring& sectionName) = 0;
virtual void SetLabelMapping(const std::wstring& sectionName, const std::map<typename LabelIdType, typename LabelType>& labelMapping) = 0;
virtual bool GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart) = 0;
virtual bool DataEnd(EndDataType endDataType) = 0;
// Recursive network specific methods
virtual size_t NumberSlicesInEachRecurrentIter() = 0;
virtual void SetNbrSlicesEachRecurrentIter(const size_t) = 0;
virtual void ReloadLabels() = 0;
virtual void SaveLabels() = 0;
virtual void SetSentenceEndInBatch(vector<size_t> &sentenceEnd)=0;
};
```
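
A minimal, hedged sketch of how a trainer might drive this interface (assuming GetMinibatch returns false when the epoch's data is exhausted, and that the caller has already allocated the Matrix objects for each named input):

```
template <class ElemType>
void ReadAllMinibatches(IDataReader<ElemType>& reader,
                        std::map<std::wstring, Matrix<ElemType>*>& matrices,
                        size_t mbSize, size_t epoch)
{
    reader.StartMinibatchLoop(mbSize, epoch);   // requestedEpochSamples left at its default
    while (reader.GetMinibatch(matrices))       // fills the named matrices for one minibatch
    {
        // ... evaluate / train the network on this minibatch ...
    }
}
```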
The methods are as follows:
@ -1068,7 +1382,21 @@ The methods are as follows:
#### Writer Interface
```
// Data Writer interface // implemented by some DataWriters template<class ElemType> class DATAWRITER_API IDataWriter { public: typedef std::string LabelType; typedef unsigned LabelIdType; virtual void Init(const ConfigParameters& config) = 0; virtual void Destroy() = 0; virtual void GetSections(std::map<std::wstring, SectionType, nocase_compare>& sections) = 0; virtual bool SaveData(size_t recordStart, const std::map<std::wstring, void*, nocase_compare>& matrices, size_t numRecords, size_t datasetSize, size_t byteVariableSized) = 0; virtual void SaveMapping(std::wstring saveId, const std::map<typename LabelIdType, typename LabelType>& labelMapping) = 0; };
// Data Writer interface
// implemented by some DataWriters
template<class ElemType>
class DATAWRITER_API IDataWriter
{
public:
typedef std::string LabelType;
typedef unsigned LabelIdType;
virtual void Init(const ConfigParameters& config) = 0;
virtual void Destroy() = 0;
virtual void GetSections(std::map<std::wstring, SectionType, nocase_compare>& sections) = 0;
virtual bool SaveData(size_t recordStart, const std::map<std::wstring, void*, nocase_compare>& matrices, size_t numRecords, size_t datasetSize, size_t byteVariableSized) = 0;
virtual void SaveMapping(std::wstring saveId, const std::map<typename LabelIdType, typename LabelType>& labelMapping) = 0;
};
```
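
A minimal, hedged sketch of a typical call sequence through this interface (what each section expects in its void* buffer is writer-specific and assumed to be prepared by the caller):

```
template <class ElemType>
void WriteRecords(IDataWriter<ElemType>& writer, const ConfigParameters& config,
                  const std::map<std::wstring, void*, nocase_compare>& matrices,
                  size_t numRecords, size_t datasetSize)
{
    writer.Init(config);                         // configure from the writer section
    std::map<std::wstring, SectionType, nocase_compare> sections;
    writer.GetSections(sections);                // discover the sections to be written
    writer.SaveData(0, matrices, numRecords, datasetSize, 0 /*byteVariableSized*/);
    writer.Destroy();
}
```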
The methods are as follows:
@ -1111,22 +1439,63 @@ The library uses BLAS libraries from NVidia for the GPU (CuBLAS) and AMD for the
### PTask support
PTask is a library used in CTNK to enable multiple GPU computation on a single machine. PTask uses the concept of a “Tasks organized in a filter graph. It allows fully asynchronous operation of the tasks, each only depending on inputs being available to execute. PTask distributes the tasks across the available hardware and handles data transfers.
PTask is a library used in CNTK to enable multiple GPU computation on a single machine. PTask uses the concept of “Tasks” organized in a filter graph. It allows fully asynchronous operation: each task executes as soon as its inputs become available. PTask distributes the tasks across the available hardware and handles data transfers.
CNTK is organized in a different fashion, around Computation Nodes. However, each node has two methods that do all the computation work: EvaluateThisNode() and ComputeInputPartial(), which can be used as the “Tasks”. Since Tasks can be executed asynchronously, they need to be stateless. To enable these methods as Tasks, a static version of each method is created that takes all inputs and outputs as parameters. The class methods simply call these “Task” functions with the class variables for their implementation.
The PTaskGraphBuilder component takes a computation network and transforms it into a filter graph. To do this, it requires a parameter description for each of the tasks. Since C++ does not have a reflection mechanism like the one available in C# and some other languages, a class method has been introduced on ComputationNode to provide this information. The method GetPTaskDescriptor() supplies it to PTaskGraphBuilder so it can build the graph.
The following is an example of a GetPTaskDescriptor() implementation. This function returns a TaskDescriptor object containing the parameter and other information necessary to build the filter graph for a particular node. The node used here is the “TimesNode”, which performs a matrix multiply. The two important member functions are implemented as follows:
```
virtual void EvaluateThisNode()
{
EvaluateThisNodeS(FunctionValues(), Inputs(0)->FunctionValues(), Inputs(1)->FunctionValues());
}
virtual void ComputeInputPartial(const size_t inputIndex)
{
if (inputIndex > 1)
throw std::invalid_argument("Times operation only takes two inputs.");
if (inputIndex == 0) //left derivative
{
ComputeInputPartialLeft(Inputs(1)->FunctionValues(), Inputs(0)->GradientValues(), GradientValues());
}
else //right derivative
{
ComputeInputPartialRight(Inputs(0)->FunctionValues(), Inputs(1)->GradientValues(), GradientValues());
}
}
```
The GetPTaskDescriptor() method describes the necessary parameter information for each task function. Each node has a FunctionValue matrix and a GradientValue matrix associated with it, and the descriptor methods identify which values are needed and whether they come from the current node or one of its inputs, as follows:
```
// GetTaskDescriptor - Get a task descriptor for this node
// taskType - task type we are generating a task for
virtual TaskDescriptor<ElemType>* GetPTaskDescriptor(TaskType taskType, size_t inputIndex=0) const
{
TaskDescriptor<ElemType>* descriptor = new TaskDescriptor<ElemType>(this, taskType, inputIndex);
switch(taskType)
{
case taskComputeInputPartial:
descriptor->FunctionParam(1-inputIndex, paramOptionsInput);
descriptor->GradientParam(inputIndex, paramOptionsInput | paramOptionsOutput | paramOptionsInitialize);
descriptor->GradientParam();
descriptor->SetFunction( (inputIndex?(FARPROC)ComputeInputPartialRight:(FARPROC)ComputeInputPartialLeft));
break;
case taskEvaluate:
descriptor->FunctionParam();
descriptor->FunctionParam(0, paramOptionsInput);
descriptor->FunctionParam(1, paramOptionsInput);
descriptor->SetFunction((FARPROC)EvaluateThisNodeS);
break;
default:
assert(false);
throw std::logic_error("Unsupported task requested");
}
return descriptor;
}
```
For the Evaluate task function, the first parameter is an output: the FunctionValue matrix of the current node.
@ -1138,7 +1507,8 @@ descriptor->FunctionParam();
The default for this call is “current node, output”, so no arguments are needed. The next two parameters are inputs, the function values of the node's two inputs:
```
descriptor->FunctionParam(0, paramOptionsInput);
descriptor->FunctionParam(1, paramOptionsInput);
```
The last call passes a pointer to the task function:
@ -1150,7 +1520,8 @@ descriptor->SetFunction((FARPROC)EvaluateThisNodeS);
and the descriptor is complete. The parameters for the two ComputeInputPartial task functions are very similar; depending on inputIndex, the values are switched. The first parameter is an input, the function value of one of the node's inputs, and the second is an output, the gradient matrix of the other input:
```
descriptor->FunctionParam(1-inputIndex, paramOptionsInput);
descriptor->GradientParam(inputIndex, paramOptionsInput | paramOptionsOutput | paramOptionsInitialize);
```
The second parameter is interesting because it is required to retain its value from one call to the next. In a filter graph this is done by making a parameter both an input and an output at the same time, meaning it updates itself. There is a clear distinction in a filter graph between values that need to be maintained and those that are transient, and this idiom is how we instruct PTaskGraphBuilder to retain the value. The Initialize option is also necessary so that on the first iteration the matrix is cleared out (zeros).
@ -1170,7 +1541,12 @@ descriptor->SetFunction((inputIndex ? (FARPROC)ComputeInputPartialRight : (FARPR
For reference, the three task functions are declared as follows:
```
static void WINAPI ComputeInputPartialLeft(Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues)
static void WINAPI ComputeInputPartialRight(Matrix<ElemType>& inputFunctionValues, Matrix<ElemType>& inputGradientValues, const Matrix<ElemType>& gradientValues)
static void WINAPI EvaluateThisNodeS(Matrix<ElemType>& functionValues, const Matrix<ElemType>& input0, const Matrix<ElemType>& input1)
```
### NDL classes and processing
Просмотреть файл
@ -5,7 +5,6 @@ command = Add_Operator_Constant
precision = "float"
traceLevel = 1
outputNodeNames = AddResult
#######################################
# NETWORK CONFIG #
Просмотреть файл
@ -5,7 +5,6 @@ command = Add_Operator_Constant
precision = "float"
traceLevel = 1
outputNodeNames = AddResult
#######################################
# NETWORK CONFIG #
Просмотреть файл
@ -4,4 +4,4 @@ ConfigDir=$WorkDir$\config
NdlDir=$ConfigDir$
ExpDir=c:\temp\exp\atis
OutDir=$ExpDir$\output
DeviceNumber=0
DeviceId=0
Просмотреть файл
@ -1,21 +1,16 @@
# configuration file for CNTK ATIS for language understanding tasks
stderr=$ExpDir$\ATIS\logd
precision="float"
deviceId = $DeviceId$
command=LSTM:LSTMTest
type=float
deviceId=0 #"auto" # use -1 for CPU. Note: due to a bug, testing only works on CPU
traceLevel=1
LSTM=[
action=train
traceLevel=1
makeMode=true
# output model path
modelPath=$ExpDir$\cntkdebug.dnn
modelPath=$ExpDir$/cntkdebug.dnn
# uncomment NDLNetworkBuilder to use NDL
# need to comment out SimpleNetworkBuilder section
@ -100,7 +95,7 @@ LSTM=[
# writerType=BinaryReader
#### write definition
wfile=$ExpDir$\sequenceSentence.bin
wfile=$ExpDir$/sequenceSentence.bin
#wsize - inital size of the file in MB
# if calculated size would be bigger, that is used instead
wsize=256
@ -112,8 +107,8 @@ LSTM=[
windowSize=10000
unk="<unk>"
wordmap=$DataDir$\inputmap.txt
file=$DataDir$\atis.train.apos.pred.pos.head.IOB.simple
wordmap=$DataDir$/inputmap.txt
file=$DataDir$/atis.train.apos.pred.pos.head.IOB.simple
#additional features sections
#for now store as expanded category data (including label in)
@ -137,14 +132,14 @@ LSTM=[
# vocabulary size
labelDim=10000
labelMappingFile=$ExpDir$\sentenceLabels.txt
labelMappingFile=$ExpDir$/sentenceLabels.txt
labelType=Category
beginSequence="BOS"
endSequence="EOS"
usewordmap=true
# input word list
token=$DataDir$\input.txt
token=$DataDir$/input.txt
#### Write definition ####
# sizeof(unsigned) which is the label index type
@ -169,9 +164,9 @@ LSTM=[
labelType=Category
# output token list
token=$DataDir$\output.txt
token=$DataDir$/output.txt
labelMappingFile=$ExpDir$\sentenceLabels.out.txt
labelMappingFile=$ExpDir$/sentenceLabels.out.txt
#### Write definition ####
# sizeof(unsigned) which is the label index type
sectionType=labels
@ -200,7 +195,7 @@ LSTM=[
equalLength = false
#### write definition
wfile=$ExpDir$\sequenceSentence.valid.bin
wfile=$ExpDir$/sequenceSentence.valid.bin
#wsize - inital size of the file in MB
# if calculated size would be bigger, that is used instead
wsize=256
@ -212,8 +207,8 @@ LSTM=[
windowSize=10000
unk="<unk>"
wordmap=$DataDir$\inputmap.txt
file=$DataDir$\atis.dev.IOB.simple
wordmap=$DataDir$/inputmap.txt
file=$DataDir$/atis.dev.IOB.simple
#additional features sections
#for now store as expanded category data (including label in)
@ -237,13 +232,13 @@ LSTM=[
# vocabulary size
labelDim=10000
labelMappingFile=$ExpDir$\sentenceLabels.in.txt
labelMappingFile=$ExpDir$/sentenceLabels.in.txt
labelType=Category
beginSequence="BOS"
endSequence="EOS"
usewordmap=true
token=$DataDir$\input.txt
token=$DataDir$/input.txt
#### Write definition ####
# sizeof(unsigned) which is the label index type
@ -267,10 +262,10 @@ LSTM=[
dim=1
labelType=Category
token=$DataDir$\output.txt
token=$DataDir$/output.txt
labelDim=10000
labelMappingFile=$ExpDir$\sentenceLabels.out.txt
labelMappingFile=$ExpDir$/sentenceLabels.out.txt
#### Write definition ####
# sizeof(unsigned) which is the label index type
elementSize=4
@ -299,13 +294,12 @@ LSTM=[
LSTMTest=[
action=write
traceLevel=1
epochSize=4430000
# which is 886 * 5000
#recurrentLayer=1
defaultHiddenActivity=0.1
modelPath=$ExpDir$\cntkdebug.dnn
modelPath=$ExpDir$/cntkdebug.dnn
outputNodeNames=outputs:labels
@ -315,15 +309,15 @@ LSTMTest=[
randomize=None
wordContext=0:1:2
unk="<unk>"
wordmap=$DataDir$\inputmap.txt
file=$DataDir$\atis.test.apos.pred.pos.head.IOB.simple
wordmap=$DataDir$/inputmap.txt
file=$DataDir$/atis.test.apos.pred.pos.head.IOB.simple
# if writerType is set, we will cache to a binary file
# if the binary file exists, we will use it instead of parsing this file
# writerType=BinaryReader
#### write definition
wfile=$ExpDir$\sequenceSentence.bin
wfile=$ExpDir$/sequenceSentence.bin
#wsize - inital size of the file in MB
# if calculated size would be bigger, that is used instead
wsize=256
@ -355,13 +349,13 @@ LSTMTest=[
# vocabulary size
labelDim=10000
labelMappingFile=$ExpDir$\sentenceLabels.txt
labelMappingFile=$ExpDir$/sentenceLabels.txt
labelType=Category
beginSequence="BOS"
endSequence="EOS"
usewordmap=true
token=$DataDir$\input.txt
token=$DataDir$/input.txt
#### Write definition ####
# sizeof(unsigned) which is the label index type
@ -387,12 +381,12 @@ LSTMTest=[
beginSequence="BOS"
endSequence="EOS"
token=$DataDir$\output.txt
token=$DataDir$/output.txt
# vocabulary size
labelDim=127
labelMappingFile=$ExpDir$\sentenceLabels.out.txt
labelMappingFile=$ExpDir$/sentenceLabels.out.txt
#### Write definition ####
# sizeof(unsigned) which is the label index type
elementSize=4
@ -416,13 +410,13 @@ LSTMTest=[
writerType=LUSequenceWriter
outputs=[
file=$OutDir$\output.rec.txt
token=$DataDir$\output.txt
file=$OutDir$/output.rec.txt
token=$DataDir$/output.txt
]
labels=[
file=$OutDir$\output.lbl.txt
token=$DataDir$\output.txt
file=$OutDir$/output.lbl.txt
token=$DataDir$/output.txt
]
]
]
Просмотреть файл
@ -1,11 +1,14 @@
# configFile=$(SolutionDir)Examples/Text/PennTreebank/Config/S2SAutoEncoder.cntk RunRootDir=$(SolutionDir)Examples/Text/PennTreebank DeviceId=-1 makeMode=false
# configFile=$(SolutionDir)Examples/Text/PennTreebank/Config/S2SAutoEncoder.cntk RunRootDir=$(SolutionDir)g2p makeMode=false
# TASK
# configFile=$(SolutionDir)Examples/Text/PennTreebank/Config/S2SAutoEncoder.cntk RunRootDir=$(SolutionDir)Examples/Text/PennTreebank
# configFile=$(SolutionDir)Examples/Text/PennTreebank/Config/S2SAutoEncoder.cntk RunRootDir=$(SolutionDir)g2p
####################
# WORK IN PROGRESS #
# WORK IN PROGRESS #
# WORK IN PROGRESS #
####################
makeMode = false
# Command line to run in debugger:
# configFile=$(SolutionDir)Examples/Text/PennTreebank/Config/S2SAutoEncoder.cntk RunRootDir=$(SolutionDir)Examples/Text/PennTreebank train=[SGD=[maxEpochs=1]] confVocabSize=1000 DeviceId=-1 makeMode=false
@ -19,18 +22,15 @@ ExpRootDir = "$RunRootDir$"
#ExpId = _run
deviceId = 1
#ExpId = 68-$deviceId$-s2sae-bigmodel
ExpId = 06-$deviceId$-g2p
#ExpId = 05-3-g2p # for decoding a different model
#ExpId = 41-$deviceId$-s2sae # TASK
ExpId = 14-$deviceId$-g2p
#ExpId = 13-5-g2p # for decoding a different model
# directories
ExpDir = "$ExpRootDir$/$ExpId$"
ModelDir = "$ExpDir$/Models"
stderr = $ExpDir$/S2SAutoEncoder.log7
# Append this for small set:
# train=[epochSize=2048]] trainFile=ptb.small.train.txt validFile=ptb.small.valid.txt testFile=ptb.small.test.txt
stderr = $ExpDir$/S2SAutoEncoder.log3
# It implements a sequence-to-sequence based auto-encoder.
# It encodes an entire sentence into a flat vector, and tries to regenerate it.
@ -38,18 +38,23 @@ stderr = $ExpDir$/S2SAutoEncoder.log7
command = writeWordAndClassInfo:train:test:write
#command = write
#command = dump
precision = "float"
traceLevel = 1
modelPath = "$ModelDir$/S2SAutoEncoder.dnn"
decodeModelPath = "$modelPath$.13" # epoch to decode can be appended here
beamDepth = 1 # 0=predict; 1=greedy; >1=beam
decodeModelPath = "$modelPath$.35" # epoch to decode can be appended here TASK
beamDepth = 3 # 0=predict; 1=greedy; >1=beam
decodeOutputPath = "$decodeModelPath$.b$beamDepth$"
dumpModelPath = "$modelPath$.2" # model to dump if needed
#confVocabSize = 10000
#confClassSize = 50
#maxLength = 84
#isAutoEncoder=true
#
#trainFile = "ptb.train.txt"
##trainFile = "ptb.small.train.txt"
#validFile = "ptb.valid.txt"
@ -59,13 +64,16 @@ decodeOutputPath = "$decodeModelPath$.b$beamDepth$"
##testFile = "ptb.small.train.txt" # test on train, to see whether model makes sense at all
#startSymbol = "</s>"
confVocabSize = 69 #10000
confClassSize = 0 #50
confVocabSize = 69
confClassSize = 0
maxLength = 20
isAutoEncoder=false
trainFile = "g014b2b.train-dev-20-21.bsf.joint"
#trainFile = "g014b2b.train-dev-1-21.bsf.joint" # small one for debugging
validFile = "g014b2b.train-dev-1-21.bsf.joint"
testFile = "g014b2b.test.bsf.joint"
#testFile = "g014b2b.test.bsf.joint.masked"
startSymbol = "<s>"
#######################################
@ -74,34 +82,36 @@ startSymbol = "<s>"
BrainScriptNetworkBuilder = (new ComputationNetwork [
# TODO: move this somewhere shared
enableTracing = true
traceFrequency = 1000
traceFrequency = 100
tracingLabelMappingFile = "$ModelDir$/vocab.wl"
beamDepth=3 // for above Trace macros only, need to clean that up
include "S2SLib.bs"
beamDepth=3 // for above Trace macros only
# import general config options from outside config values
vocabDim = $confVocabSize$
nbrClass = $confClassSize$
isAutoencoder = false # input is only one sequence, meant to reproduce itself
isAutoencoder = $isAutoEncoder$ # input is only one sequence, meant to reproduce itself
attentionSpan = $maxLength$ # 0 to disable. We only support fixed-size attention windows for now. 0 means no attention; exactly 20 is needed for the g2p CMUDict task
useStabilizer = true
useEncoder = true # if false, this becomes a regular RNN
useNYUStyle = false # if true use thought vector for all inputs, NYU-style
attentionSpan = 20 # we only support fixed-size attention windows for now. 0 means no attention; exactly 20 is needed for the g2p CMUDict task
useBidirectionalEncoder = false
# import some namespaces
# import some names
Parameters = BS.Parameters
Constants = BS.Constants
Sequences = BS.Sequences
Loop = BS.Loop
Boolean = BS.Boolean
RecurrentLSTMP = BS.RNNs.RecurrentLSTMP
# dimensions
embeddingDim = $confVocabSize$ # 300
hiddenDim = 750 # 512 # 1024 # 200 --TODO: Kaisheng used 500
maxLayer = 2 # 1 # 0
hiddenDim = 512 #420 #768 # 1024 # 200 --TODO: Kaisheng used 500
attentionDim = 128 # dim of attention projection
maxLayer = 1 # 0
encoderDims[i:0..maxLayer] = hiddenDim # this defines the number of hidden layers in each
decoderDims[i:0..maxLayer] = hiddenDim # both are one LSTM layer only for now
@ -110,6 +120,7 @@ BrainScriptNetworkBuilder = (new ComputationNetwork [
#input = SparseInput(vocabDim, tag='feature'); # BUGBUG: Slice() not working for sparse, need to extend TensorView
input = Input(vocabDim, tag='feature');
# get out input and label data
streams = [
rawInput = input
out = if isAutoencoder
@ -120,6 +131,7 @@ BrainScriptNetworkBuilder = (new ComputationNetwork [
]
else [
# we encode input and label as a single input; this splits it into two
# This dance will become unnecessary once the new Reader API is fully hooked in.
separatorRow = 2 # row index of separator symbokl
isSeparator = RowSlice (separatorRow, 1, rawInput) # cut out the separator as a flag
inInput = Boolean.Or (FutureValue (1, inInput , defaultHiddenActivation=0), isSeparator) # flag sequence: word is input...
@ -129,15 +141,14 @@ BrainScriptNetworkBuilder = (new ComputationNetwork [
]
].out
# helpers
# helpers --TODO: move to CNTK.core.bs
First (x) = Slice (0, 1, x, axis=-1)
Last (x) = Slice (-1, 0, x, axis=-1)
# strip separators
# TODO: find out which one is the correct one
#inputSequence = Slice (0, -1, streams.input, axis=-1) # e.g. <s> A B C # TODO: process </s> as well, to trigger the thought vector
inputSequence = streams.input # e.g. <s> A B C </s>
labelSequence = Slice (1, 0, streams.labels, axis=-1) # e.g. A B C </s>
inputSequence = Pass ( streams.input ) # e.g. <s> A B C </s>
labelSequence = Pass (Slice (1, 0, streams.labels, axis=-1)) # e.g. A B C </s>
# ^^ use Pass() to make these easily accessible in MEL, e.g. for decoding
# embeddings --as long as we cannot read multiple sequences, we got one embedding
# Note: Embeddings are linear, so better stabilize. We really should use BatchNorm.
@ -151,49 +162,61 @@ BrainScriptNetworkBuilder = (new ComputationNetwork [
labelSentenceStart = First (streams.labels)
labelSentenceStartEmbedded = EmbedLabels (labelSentenceStart)
RecurrentLSTMPWithAttentionWindow2 (inputDim/*x.dim*/, outputDim/*h.dim*/, cellDim/*c.dim*/, x, projectedAttentionWindowBroadcast, attentionDim, attentionSpan, enableSelfStabilization=false) =
RecurrentLSTMPWithAttentionWindow2 (inputDim1/*x.dim*/, outputDim/*h.dim*/, cellDim1/*c.dim*/, x, projectedAttentionWindowBroadcast, attentionDim, attentionSpan, previousHook=BS.RNNs.PreviousHC, enableSelfStabilization=false) =
[
prevState =
[
h = Loop.Previous (lstmState.h) # hidden state(t-1)
c = Loop.Previous (lstmState.c) # cell(t-1)
]
prevState = previousHook (lstmState)
# compute additional hidden state from attention
W(x) = Parameters.WeightParam (attentionDim, outputDim) * Parameters.Stabilize (x, enabled=useStabilizer)
projectedH = W (prevState.h) # [cellDim]
tanHOut = Tanh (projectedAttentionWindowBroadcast.value + projectedH) # [attentionDim x attentionSpan]
v(x) = Parameters.WeightParam (1, attentionDim) * Parameters.Stabilize (x, enabled=useStabilizer) # [1 x attentionDim]
W(x) = TraceDense( Parameters.WeightParam (attentionDim, outputDim) , 'Wdec') * Parameters.Stabilize (x, enabled=false/*useStabilizer*/)
projectedH = W (prevState.h) # [outputDim] // [outputDim x D]
tanHOut = Tanh (TraceDense( projectedAttentionWindowBroadcast.projectedValue, 'hencp') + TraceDense ( projectedH, 'hdecp')) # [attentionDim x attentionSpan]
# ^^ [attDim x 1 x attSpan] + [attDim x D] -> [attDim x D x attSpan]
v(x) = TraceDenseTransposed( Parameters.WeightParam (1, attentionDim) ,'v') * Parameters.Stabilize (x, enabled=useStabilizer) # [1 x attentionDim]
u = v (tanHOut) # [1 x attentionSpan]
# [1 x D x attSpan]
uValid = u + Log (projectedAttentionWindowBroadcast.valid) # [1 x attentionSpan]
attentionWeights = Softmax (uValid) # [1 x attentionSpan]
weightedAttentionWindow = projectedAttentionWindowBroadcast.value .* attentionWeights # [attentionDim x attentionSpan]
weightedAttentionAverage = weightedAttentionWindow * BS.Constants.OnesTensor (attentionSpan) # [attentionDim]
# [1 x D x attSpan] + [1 x 1 x attSpan] -> [1 x D x attSpan]
attentionWeights = TraceDense( Softmax (uValid) ,'weights') # [1 x attentionSpan]
# [1 x D x attSpan] BUGBUG, needs to keep Softmax denoms separate over D
weightedAttentionWindow = projectedAttentionWindowBroadcast.value .* attentionWeights # [encoderHiddenDim x attentionSpan]
# [encDim x 1 x attSpan] .* [1 x D x attSpan] -> [encDim x D x attSpan] BUGBUG, needs to keep Softmax denoms separate over D
weightedAttentionAverage = weightedAttentionWindow * BS.Constants.OnesTensor (attentionSpan) # [encoderHiddenDim]
# [encDim x D]
# feed both to LSTM as a single agumented input, so that we can reuse the existing LSTM component
augmentedX = RowStack (weightedAttentionAverage : x)
enableSelfStabilization1 = enableSelfStabilization // TODO: BS syntax needs to allow to say ^.enableSelfStabilization
lstmState = BS.RNNs.LSTMP (attentionDim + inputDim, outputDim, cellDim, augmentedX, prevState, enableSelfStabilization=enableSelfStabilization1)
].lstmState // that's the value we return
RecurrentLSTMP2WithInitialState (inputDim, outputDim, cellDim, x, initialState, enableSelfStabilization=false) =
[
prevState =
[
isFirst = Loop.IsFirst (initialState.h)
h = Boolean.If (isFirst, initialState.h, Loop.Previous (lstmState.h)) // hidden state(t-1)
c = Boolean.If (isFirst, initialState.c, Loop.Previous (lstmState.c)) // cell(t-1)
]
enableSelfStabilization1 = enableSelfStabilization // TODO: BS syntax needs to allow to say ^.enableSelfStabilization
lstmState = BS.RNNs.LSTMP (inputDim, outputDim, cellDim, x, prevState, enableSelfStabilization=enableSelfStabilization1)
lstmState = BS.RNNs.LSTMP (outputDim, cellDim=cellDim1, augmentedX, inputDim=projectedAttentionWindowBroadcast.dim + inputDim1, prevState, enableSelfStabilization=enableSelfStabilization1)
].lstmState // that's the value we return
# encoder (processes inputEmbedded)
encoder = BS.RNNs.RecurrentLSTMP2Stack (inputEmbedded, embeddingDim, encoderDims, encoderDims, enableSelfStabilization=useStabilizer)
encoder =
if useBidirectionalEncoder
then
[
encoderOutputLayer = Length (encoderDims)-1
encoderOutput = encoder[encoderOutputLayer]
forwardEncoder = BS.RNNs.RecurrentLSTMP2Stack (encoderDims, /*cellDims=encoderDims,*/ inputEmbedded, inputDim=embeddingDim, enableSelfStabilization=useStabilizer)
NextHC (lstmState) = [
h = Loop.Next (lstmState.h) // hidden state(t-1)
c = Loop.Next (lstmState.c) // cell(t-1)
]
backwardEncoder = BS.RNNs.RecurrentLSTMP2Stack (encoderDims, /*encoderDims,*/ inputEmbedded, inputDim=embeddingDim, previousHook=NextHC, enableSelfStabilization=useStabilizer)
output = [
h = RowStack (forwardEncoder[encoderOutputLayer].h : backwardEncoder[encoderOutputLayer].h)
c = RowStack (forwardEncoder[encoderOutputLayer].c : backwardEncoder[encoderOutputLayer].c)
dim = 2 * encoderDims[encoderOutputLayer]
]
#dim = 2 * encoderDims[encoderOutputLayer]
]
else
[
encoderOutputLayer = Length (encoderDims)-1
encoder = BS.RNNs.RecurrentLSTMP2Stack (inputEmbedded, embeddingDim, encoderDims, encoderDims, enableSelfStabilization=useStabilizer)
output = encoder[encoderOutputLayer]
#dim = encoderDims[encoderOutputLayer]
]
encoderOutput = encoder.output # TODO: remove .output indirection, no longer needed
# that last frame should be fed as an additional input to every decoder step
# Three ways of passing encoder state:
@ -204,22 +227,25 @@ BrainScriptNetworkBuilder = (new ComputationNetwork [
thoughtVector = [
h = Last (encoderOutput.h)
c = Last (encoderOutput.c)
dim = encoder.output.dim
]
thoughtVectorDim = encoderDims[encoderOutputLayer]
thoughtVectorDim = thoughtVector.dim
thoughtVectorPadded = [ # padded with zeroes until end of target sequence
h = Sequences.BroadcastSequenceAs (labelsEmbedded, thoughtVector.h)
c = Sequences.BroadcastSequenceAs (labelsEmbedded, thoughtVector.c)
dim = thoughtVector.dim
]
# attention (fixed rolling window)
attentionWindow = Sequences.PastValueWindow (attentionSpan, encoderOutput.h)
attentionDim = thoughtVectorDim
attentionWindow = Sequences.PastValueWindow (attentionSpan, encoderOutput.h, axis=2) # BUGBUG: We need axis=3 for beam search. Track this down.
projectedAttentionWindowBroadcast = [
W(x) = Parameters.WeightParam (attentionDim, thoughtVectorDim) * Parameters.Stabilize (x, enabled=useStabilizer)
W(x) = TraceDense ( Parameters.WeightParam (attentionDim, thoughtVector.dim), 'Wenc') * Parameters.Stabilize (x, enabled=false/*useStabilizer*/)
#B = Parameters.BiasParam (vocabDim) # no bias in attention
value = Sequences.BroadcastSequenceAs (labelsEmbedded, W (attentionWindow.value)) # apply the projection columnwise to the attentionWindow tensor
value = Sequences.BroadcastSequenceAs (labelsEmbedded, attentionWindow.value)
projectedValue = Sequences.BroadcastSequenceAs (labelsEmbedded, W (attentionWindow.value)) # apply the projection columnwise to the attentionWindow tensor
valid = Sequences.BroadcastSequenceAs (labelsEmbedded, attentionWindow.valid)
dim = thoughtVector.dim
]
# NYU style: expand h to all, drop c
@ -227,37 +253,73 @@ BrainScriptNetworkBuilder = (new ComputationNetwork [
thoughtVectorEverywhere = Boolean.If (Loop.IsFirst (thoughtVectorPadded.h), # if first entry
/*then*/ thoughtVectorPadded.h, # then copy that
/*else*/ Loop.Previous (thoughtVectorEverywhere)) # else just propagate to the front
# TODO: create an indexSequence that contains all zeroes, basically broadcast a single-frame sequence across another sequence length
# TODO: use BroadcastSequenceAs()
# decoder
# NYU style:
# The decoder starts with hidden state 0
# and takes as input [thoughtVectorEverywhere; previous word].
delayedDecoderFeedback = Loop.PreviousOrDefault (defaultValue=labelSentenceStartEmbedded, labelsEmbedded)
# we bake into the LSTMs to multiply h and c with beamSearchReorderHook, which we will patch in decoding
# ReorderTopN (past_h_or_c) = Times (TraceState (past_h_or_c, 'past'), TraceDense (tokens.from, 'backp'))
beamSearchReorderHook = Pass (Constants.One)
# helper functions to delay h and c with possibility to later hook in a different matrix
PreviousHCFromThoughtVectorWithReorderingHook (lstmState) = [ # with thought vector and beam-search hook
isFirst = Loop.IsFirst (initialState.h)
# BUGBUG: Should be thoughtVector, but Scatter() can't expand from inside a loop
h = Boolean.If (isFirst, thoughtVectorPadded.h, Loop.Previous (lstmState.h/* * beamSearchReorderHook*/)) // hidden state(t-1)
c = Boolean.If (isFirst, thoughtVectorPadded.c, Loop.Previous (lstmState.c/* * beamSearchReorderHook*/)) // cell(t-1)
]
PreviousHCWithReorderingHook (lstmState) = [
h = Loop.Previous (lstmState.h/* * beamSearchReorderHook*/) // hidden state(t-1)
c = Loop.Previous (lstmState.c/* * beamSearchReorderHook*/) // cell(t-1)
]
decoderHistoryFromGroundTruth = labelsEmbedded # decoder input for training is ground truth...
decoderHistoryFromOutput = Pass (EmbedLabels (Hardmax (z)), tag='output') # ...but for testing, it's the output. Make an 'output' to make it a root that is kept
# during training, we use ground truth. For decoding, we will rewire decoderHistoryHook = decoderHistoryFromOutput
decoderHistoryHook = Pass (decoderHistoryFromGroundTruth) # this gets redirected in decoding to feed back decoding output instead
PreviousOrDefault1 (x, defaultValue=Constant (0)) = # a delay node with initial value --TODO: merge the two, then do in C++
[
flags = Loop.IsFirst (defaultValue/*x*/)
out = Boolean.If (flags,
/*then*/ defaultValue,
/*else*/ Loop.Previous (x))
].out
labelSentenceStartEmbeddedScattered = BS.Sequences.Scatter (Loop.IsFirst (labelSequence), labelSentenceStartEmbedded) # unfortunately needed presently
decoderInput = Pass (PreviousOrDefault1 (defaultValue=labelSentenceStartEmbeddedScattered, decoderHistoryHook))
decoderInputDim = embeddingDim #labelsEmbedded.dim
decoderInputDim = labelsEmbedded.dim #embeddingDim
decoderInput = Pass (delayedDecoderFeedback)
decoderOutputLayer = Length (decoderDims)-1
decoder[i:0..decoderOutputLayer] =
if i == 0
then if useEncoder && useNYUStyle then BS.RNNs.RecurrentLSTMP2 (thoughtVectorDim + decoderInputDim, decoderDims[i], decoderDims[i],
RowStack (thoughtVectorEverywhere : decoderInput),
then if useEncoder && useNYUStyle then BS.RNNs.RecurrentLSTMP2 (decoderDims[i],// cellDim=decoderDims[i],
RowStack (thoughtVectorEverywhere : decoderInput), inputDim=thoughtVectorDim + decoderInputDim,
previousHook=PreviousHCWithReorderingHook,
enableSelfStabilization=useStabilizer)
else if useEncoder && attentionSpan > 0 then RecurrentLSTMPWithAttentionWindow2 (thoughtVectorDim + decoderInputDim, decoderDims[i], decoderDims[i],
RowStack (thoughtVectorEverywhere : decoderInput),
else if useEncoder && attentionSpan > 0 then RecurrentLSTMPWithAttentionWindow2 (/*thoughtVectorDim + //<-BUGBUG*/ decoderInputDim, decoderDims[i], decoderDims[i],
/*RowStack (thoughtVectorEverywhere : //<-BUGBUG)*/ (decoderInput),
projectedAttentionWindowBroadcast, attentionDim, attentionSpan,
previousHook=PreviousHCWithReorderingHook,
enableSelfStabilization=useStabilizer)
else RecurrentLSTMP2WithInitialState (decoderInputDim, decoderDims[i], decoderDims[i],
decoderInput,
thoughtVectorPadded, # BUGBUG: Should be thoughtVector, but Scatter() can't expand from inside a loop
else BS.RNNs.RecurrentLSTMP2 (decoderDims[i],// cellDim=decoderDims[i],
decoderInput, inputDim=decoderInputDim,
previousHook=PreviousHCFromThoughtVectorWithReorderingHook, # Previous() function with thought vector as initial state
enableSelfStabilization=useStabilizer)
else BS.RNNs.RecurrentLSTMP2 (decoderDims[i-1], decoderDims[i], decoderDims[i],
decoder[i-1].h,
else BS.RNNs.RecurrentLSTMP2 (decoderDims[i],// cellDim=decoderDims[i],
decoder[i-1].h, inputDim=decoderDims[i-1] /*TODO: decoder[i-1].dim*/,
previousHook=PreviousHCWithReorderingHook,
enableSelfStabilization=useStabilizer)
#decoderDim = decoderDims[decoderOutputLayer]
decoderOutput = decoder[decoderOutputLayer].h
decoderDim = decoderOutput.dim
#decoderDim = decoderOutput.dim
decoderDim = decoderDims[decoderOutputLayer]
# and add a softmax layer on top
@ -280,7 +342,7 @@ reader = [
#randomize = "auto" # gets ignored
readerType = LMSequenceReader
mode = "softmax"
mode = "softmax" # TODO: find out what this means
nbruttsineachrecurrentiter = 0 # 0 means auto-fill given minibatch size
cacheBlockSize = 100000000 # read block size. This value is large enough to load entire corpus at once
@ -445,12 +507,12 @@ train = [
learningRatesPerSample = 0.007*2:0.0035 #0.01 #0.005 # 0.01
momentumAsTimeConstant = 2500
gradientClippingWithTruncation = true # TODO: clip and truncate? What is the difference?
clippingThresholdPerSample = 15.0
clippingThresholdPerSample = 1 # 15.0 # visibly impacts objectives, but not final result, so keep it for safety
maxEpochs = 50
numMBsToShowResult = 100
firstMBsToShowResult = 10
gradUpdateType = "none" # FSAdaGrad?
loadBestModel = true
loadBestModel = false # true # broken for some models (rereading overwrites something that got set by validation)
# tracing (enable these for debugging)
#traceNodeNamesReal = labelsEmbedded:decoderInput:"decoder[0].lstmState._privateInnards.ht":z.Plus_left.Times_right.result:z:ce
@ -475,6 +537,18 @@ train = [
]
]
#######################################
# DUMP CONFIG #
#######################################
# dumps the model, specifically the learnable parameters
dump = [
action = "dumpnode"
modelPath = "$dumpModelPath$"
outputFile = "$dumpModelPath$.txt"
]
#######################################
# TEST CONFIG #
#######################################
@ -639,7 +713,7 @@ write = [
# reduce back to a single column
topHyps = TraceSparse (topPathScores * OnesTensor (1 : topN), 'topHyps')
inputsOut = Pass (model.inputSequence)
inputsOut = Pass (model.streams_out_input/*inputSequence*/)
labelsOut = Pass (TraceOneHot (model.labelSequence, 'labels'))
decodeOut = Pass (TraceOneHot (top1, 'out'))
topNOut = Pass (topHyps)
@ -653,7 +727,7 @@ write = [
PreviousOrDefault1 (x, defaultValue=Constant (0)) = # a delay node with initial value --TODO: merge the two, then do in C++
[
flags = IsFirst (defaultValue/*x*/)
out = If (flags,
out = BS.Boolean.If (flags,
/*then*/ defaultValue,
/*else*/ Previous (x))
].out
@ -667,7 +741,7 @@ write = [
delayedDecoderFeedback = TraceDense (/*Loop.*/PreviousOrDefault1 (defaultValue=labelSentenceStartEmbeddedScattered, TraceDense (decoderFeedback, 'lemb')) , 'prev lemb')
greedyDecodingModel = BS.Network.Edit (modelAsTrained,
BS.Network.Editing.ReplaceLinksToNode (modelAsTrained.delayedDecoderFeedback, delayedDecoderFeedback),
BS.Network.Editing.ReplaceLinksToNode (modelAsTrained.decoderInput/*delayedDecoderFeedback*/, delayedDecoderFeedback),
modelAsTrained.z/*dummy for now since cannot pass empty set*/)
# beam search of width 'beamDepth'
@ -679,7 +753,7 @@ write = [
# decoder[0].prevState.h.elseVal = PastValue (decoder[0].lstmState._privateInnards.ht) : [512 x 1 x labelSequence.h.out.h.indexSequence.h.indexSequence.h] -> [512 x 1 x labelSequence.h.out.h.indexSequence.h.indexSequence.h]
# decoder[0].prevState.c.elseVal = PastValue (decoder[0].lstmState._privateInnards.ct) : [512 x 1 x labelSequence.h.out.h.indexSequence.h.indexSequence.h] -> [512 x 1 x labelSequence.h.out.h.indexSequence.h.indexSequence.h]
hiddenDim = modelAsTrained.delayedDecoderFeedback.dim
hiddenDim = modelAsTrained.decoderFeedback.dim
embeddingDim = modelAsTrained.decoderOutputEmbedded.dim
vocabSize = modelAsTrained.z.dim
@ -702,7 +776,37 @@ write = [
# - traceback is a right-to-left recurrence
# - output best hypo conditioned on the path (it is already known)
propagationEdits[i:0..8] = // TODO: implement and use { } syntax TODO: VV elseVal only for non-NYU?
# attention:
# tanHOut = Tanh (TraceDense( projectedAttentionWindowBroadcast.projectedValue, 'hencp') + TraceDense ( projectedH, 'hdecp')) # [attentionDim x attentionSpan]
# decoder[0].tanHOut.z = Plus (decoder[0].tanHOut.z.PlusArgs[0], decoder[0].tanHOut.z.PlusArgs[1]) : [128 x 20 x WhereNodeAxis1], [128] -> [128 x 20 x WhereNodeAxis1]
# patch PlusArgs[0]
# uValid = u + Log (projectedAttentionWindowBroadcast.valid) # [1 x attentionSpan]
# decoder[0].uValid = Plus (decoder[0].u, decoder[0].uValid.PlusArgs[1]) : [1 x 20 x WhereNodeAxis1], [1 x 20 x WhereNodeAxis1] -> [1 x 20 x WhereNodeAxis1]
# patch PlusArgs[1]
# weightedAttentionWindow = projectedAttentionWindowBroadcast.value .* attentionWeights # [encoderHiddenDim x attentionSpan]
# decoder[0].weightedAttentionWindow = ElementTimes (projectedAttentionWindowBroadcast.value.out, decoder[0].attentionWeights) : [512 x 20 x WhereNodeAxis1], [1 x 20 x WhereNodeAxis1] -> [512 x 20 x WhereNodeAxis1]
# patch ElementTimesArgs[0]
# each:
# node -> SplitDimension (node, /*axis=*/, 1 /*->0:1*/)
# e.g.
# [512 x 20 x *] -> [(0:1) x 20 x *hereNodeAxis13] -> [512 x 1 x 20 x *]
# decoder[0].weightedAttentionAverage = Times (decoder[0].weightedAttentionWindow, decoder[0].weightedAttentionAverage.TimesArgs[1]) : [512 x 1 x 20 x WhereNodeAxis11], [20] -> [512] FAILED
# change to outputRank=2
# attentionWeights = TraceDense( Softmax (uValid) ,'weights') # [1 x attentionSpan]
# decoder[0].attentionWeights.h = Softmax (decoder[0].uValid) : [1 x 3 x 20 x WhereNodeAxis21] -> [1 x 3 x 20 x WhereNodeAxis21]
# path SoftmaxArgs[0] to be column-wise over axis 3
ColumnwiseSoftmax (axis=1, z) = [ n = TraceDense( Softmax (z), 'smz') ; axis1 = axis ; d = TraceDense( ReducePlus (axis=axis1, n), 'denom') ; p = TraceDense( n .* Reciprocal (d), 'p') ].p
#Columnwise (f, beamDepth, z) = # TODO: Takes LogSoftmax over axis=1. it is more tricky to do this over arbitrary axes
#[
# cols[d:0..beamDepth-1] = f (Slice (d, d+1, z, axis=2) /*[:,d]*/ )
# out = Splice (cols, axis=2)
#].out
InjectDepth (node) = SplitDimension (node, /*axis=*/1, 1 /*->0:1*/)
propagationEdits[i:0..13] = // TODO: implement and use { } syntax TODO: VV elseVal only for non-NYU?
# non-NYU:
if i == 0 then (node => if node.name == 'decoder[0].prevState.h.elseVal' then TraceState (Previous (ReorderTopN (node.PastValueArgs[0])), 'propagated') else node) # inject reshuffling of hypotheses
else if i == 1 then (node => if node.name == 'decoder[0].prevState.c.elseVal' then TraceState (Previous (ReorderTopN (node.PastValueArgs[0])), 'propagated') else node)
@ -714,7 +818,13 @@ write = [
else if i == 5 then (node => if node.name == 'decoder[1].prevState.c' then TraceState (Previous (ReorderTopN (node.PastValueArgs[0])), 'propagated') else node)
else if i == 6 then (node => if node.name == 'decoder[2].prevState.h' then TraceState (Previous (ReorderTopN (node.PastValueArgs[0])), 'propagated') else node) # inject reshuffling of hypotheses
else if i == 7 then (node => if node.name == 'decoder[2].prevState.c' then TraceState (Previous (ReorderTopN (node.PastValueArgs[0])), 'propagated') else node)
else BS.Network.Editing.ReplaceLinksToNode (modelAsTrained.delayedDecoderFeedback, delayedDecoderFeedback)
# attention:
else if i == 8 then (node => if node.name != 'decoder[0].tanHOut.z' then node else InjectDepth (node.PlusArgs[0]) + node.PlusArgs[1])
else if i == 9 then (node => if node.name != 'decoder[0].uValid' then node else node.PlusArgs[0] + InjectDepth (node.PlusArgs[1]))
else if i == 10 then (node => if node.name != 'decoder[0].weightedAttentionWindow' then node else InjectDepth (node.ElementTimesArgs[0]) .* node.ElementTimesArgs[1])
else if i == 11 then (node => if node.name != 'decoder[0].weightedAttentionAverage' then node else Times (node.TimesArgs[0], node.TimesArgs[1], outputRank=2))
else if i == 12 then (node => if node.name != 'decoder[0].attentionWeights.h' then node else ColumnwiseSoftmax (axis=3, node.SoftmaxArgs[0]))
else BS.Network.Editing.ReplaceLinksToNode (modelAsTrained.decoderInput/*delayedDecoderFeedback*/, delayedDecoderFeedback)
# decoderFeedback must be updated to take actual decoder output
@ -808,7 +918,7 @@ write = [
# +-----+
# tokens.word:
#tokens.word = ReduceSum (axis=2, topPaths) # TODO: add an axis parameter to SumColumnElements()
#tokens.word = ReducePlus (axis=2, topPaths) # TODO: add an axis parameter to SumColumnElements()
# +-+
# |0|
# |0|-+
Просмотреть файл
@ -233,7 +233,7 @@ READER_SRC =\
$(SOURCEDIR)/Readers/ReaderLib/ChunkRandomizer.cpp \
$(SOURCEDIR)/Readers/ReaderLib/SequenceRandomizer.cpp \
$(SOURCEDIR)/Readers/ReaderLib/SequencePacker.cpp \
$(SOURCEDIR)/Readers/ReaderLib/BpttPacker.cpp \
$(SOURCEDIR)/Readers/ReaderLib/TruncatedBpttPacker.cpp \
$(SOURCEDIR)/Readers/ReaderLib/PackerBase.cpp \
$(SOURCEDIR)/Readers/ReaderLib/FramePacker.cpp \
Просмотреть файл
@ -1,6 +1,9 @@
# CNTK
## Latest news
*2016-04-25.* V 1.1 Binary release
CNTK v.1.1 binaries are on the [CNTK Releases page](https://github.com/Microsoft/CNTK/releases/tag/v1.1)
*2016-04-12.* CNTK is available as [Azure Virtual Machines](https://github.com/Microsoft/CNTK/wiki/CNTK-on-Azure) and [Docker Containers](https://github.com/Microsoft/CNTK/wiki/CNTK-Docker-Containers)
*2016-04-12.* Added support for ND convolution and ND pooling and CPU support for `cudnn` layout in convolution, pooling and batch normalization nodes.
@ -8,10 +11,6 @@ Read [documentation](https://github.com/Microsoft/CNTK/wiki/Full-NDL-Function-Re
*2016-04-05.* CUDA7.5 support for Windows Build: Windows project files have been updated to automatically utilize CUDA 7.5 if present
## March 2016
*2016-03-24.* New Text Reader (CNTKTextFormatReader) is available
Read description here https://github.com/Microsoft/CNTK/wiki/CNTKTextFormat-Reader
See [all news](https://github.com/Microsoft/CNTK/wiki/News).
## What is CNTK
Просмотреть файл
@ -24,36 +24,17 @@
<RootNamespace>CNTK</RootNamespace>
<ProjectName>ActionsLib</ProjectName>
</PropertyGroup>
<PropertyGroup Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
</PropertyGroup>
<Import Project="$(SolutionDir)\CNTK.Cpp.props" />
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings" />
<ImportGroup Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="$(DebugBuild)">
<LinkIncremental>true</LinkIncremental>
<PreBuildEventUseInBuild>false</PreBuildEventUseInBuild>
</PropertyGroup>
<PropertyGroup Condition="$(ReleaseBuild)">
<LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ExecutablePath)</ExecutablePath>
<PreBuildEventUseInBuild>false</PreBuildEventUseInBuild>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<AdditionalIncludeDirectories>$(SolutionDir)Source\SequenceTrainingLib;$(SolutionDir)Source\SGDLib;$(SolutionDir)Source\ComputationNetworkLib;$(SolutionDir)Source\CNTK;$(SolutionDir)Source\Math;$(SolutionDir)Source\Common\Include;$(SolutionDir)Source\CNTK\BrainScript;$(MSMPI_INC);$(NvmlInclude)</AdditionalIncludeDirectories>
@ -67,56 +48,33 @@
<AdditionalIncludeDirectories>$(SolutionDir)Source\multiverso;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(DebugBuild)">
<ItemDefinitionGroup>
<ClCompile>
<PrecompiledHeader>
</PrecompiledHeader>
<WarningLevel>Level4</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>_SCL_SECURE_NO_WARNINGS;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>true</SDLCheck>
<OpenMPSupport>true</OpenMPSupport>
<TreatWarningAsError>true</TreatWarningAsError>
<AdditionalOptions>/bigobj %(AdditionalOptions)</AdditionalOptions>
<PreprocessorDefinitions>WIN32;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Math.lib; kernel32.lib; user32.lib; shell32.lib; %(AdditionalDependencies)</AdditionalDependencies>
<DelayLoadDLLs>Math.dll; nvml.dll</DelayLoadDLLs>
<StackReserveSize>100000000</StackReserveSize>
<AdditionalDependencies>Math.lib;%(AdditionalDependencies)</AdditionalDependencies>
<DelayLoadDLLs>Math.dll</DelayLoadDLLs>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(DebugBuild)">
<ClCompile>
<PreprocessorDefinitions>_SCL_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalOptions>/bigobj %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(ReleaseBuild)">
<ClCompile>
<WarningLevel>Level4</WarningLevel>
<PrecompiledHeader>
</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>true</SDLCheck>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<AdditionalOptions>/d2Zi+ %(AdditionalOptions)</AdditionalOptions>
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>Math.lib; kernel32.lib; user32.lib; shell32.lib; %(AdditionalDependencies)</AdditionalDependencies>
<Profile>true</Profile>
<DelayLoadDLLs>Math.dll; nvml.dll</DelayLoadDLLs>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(CpuOnlyBuild)">
<ClCompile>
<PreprocessorDefinitions>%(PreprocessorDefinitions);CPUONLY</PreprocessorDefinitions>
</ClCompile>
<Link>
<DelayLoadDLLs>Math.dll</DelayLoadDLLs>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(GpuBuild)">
@ -125,9 +83,10 @@
</ClCompile>
<Link>
<AdditionalLibraryDirectories>%(AdditionalLibraryDirectories);$(CudaLibPath)</AdditionalLibraryDirectories>
<DelayLoadDLLs>%(DelayLoadDLLs);nvml.dll</DelayLoadDLLs>
</Link>
<PostBuildEvent>
<Command>if exist "%ProgramW6432%\NVIDIA Corporation\NVSMI" xcopy /I /D /Y "%ProgramW6432%\NVIDIA Corporation\NVSMI\nvml*.dll" $(TargetDir)</Command>
<Command>if exist "%ProgramW6432%\NVIDIA Corporation\NVSMI" xcopy /I /D /Y "%ProgramW6432%\NVIDIA Corporation\NVSMI\nvml*.dll" "$(TargetDir)"</Command>
<Message>Copying NVidia GDK extension DLL to target folder</Message>
</PostBuildEvent>
</ItemDefinitionGroup>
@ -148,13 +107,6 @@
<ClInclude Include="SimpleNetworkBuilder.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\Common\File.cpp">
<PrecompiledHeader Condition="$(DebugBuild)">NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\Common\fileutil.cpp">
<PrecompiledHeader Condition="$(DebugBuild)">NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\Common\TimerUtility.cpp" />
<ClCompile Include="NetworkDescriptionLanguage.cpp" />
<ClCompile Include="NetworkFactory.cpp" />
<ClCompile Include="SimpleNetworkBuilder.cpp" />
Просмотреть файл
@ -9,16 +9,18 @@
#include "NetworkDescriptionLanguage.h"
#include "NDLNetworkBuilder.h"
#include "ConvolutionalNodes.h"
#include "DeprecatedNodes.h"
#include "EvaluationNodes.h"
#include "InputAndParamNodes.h"
#include "LinearAlgebraNodes.h"
#include "NonlinearityNodes.h"
#include "ConvolutionalNodes.h"
#include "RecurrentNodes.h"
#include "PreComputeNodes.h"
#include "ReshapingNodes.h"
#include "RecurrentNodes.h"
#include "SpecialPurposeNodes.h"
#include "TrainingNodes.h"
#include "PreComputeNodes.h"
#include "EvaluationNodes.h"
using namespace std;
@ -156,6 +158,7 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable)
else if (EqualInsensitive(nodeType, OperationNameOf(CRFNode), L"CRF")) ret = true;
#endif
else if (EqualInsensitive(nodeType, OperationNameOf(ClassBasedCrossEntropyWithSoftmaxNode), L"CBCEWithSM")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(ClipNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(ConvolutionNode), L"Convolve")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(PoolingNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(CosDistanceNode), L"CosDist")) ret = true;
@ -170,11 +173,13 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable)
else if (EqualInsensitive(nodeType, OperationNameOf(ElementTimesNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(ErrorPredictionNode), L"ClassificationError")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(ExpNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(FloorNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(FutureValueNode))) ret = true;
#ifdef COMING_SOON
else if (EqualInsensitive(nodeType, OperationNameOf(GMMLogLikelihoodNode), L"GMMLL")) ret = true;
#endif
else if (EqualInsensitive(nodeType, OperationNameOf(HardmaxNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(IfNode), L"If")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(InputValue), L"Input")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(InvStdDevNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(KhatriRaoProductNode), L"ColumnwiseCrossProduct")) ret = true;
Просмотреть файл
@ -510,11 +510,11 @@ template <typename ElemType>
void DoTopologyPlot(const ConfigParameters& config)
{
wstring modelPath = config(L"modelPath");
wstring outputDotFile = config(L"outputDotFile"); // filename for the dot language output, if not specified, %modelpath%.dot will be used
wstring outputFile = config(L"outputFile"); // filename for the rendered topology plot
wstring outputDotFile = config(L"outputDotFile", L""); // filename for the dot language output, if not specified, %modelpath%.dot will be used
wstring outputFile = config(L"outputFile", L""); // filename for the rendered topology plot
// this can be empty, in that case no rendering will be done
// or if this is set, renderCmd must be set, so CNTK will call re
wstring renderCmd = config(L"renderCmd"); // if this option is set, then CNTK will call the render to convert the outdotFile to a graph
wstring renderCmd = config(L"renderCmd", L""); // if this option is set, then CNTK will call the render to convert the outdotFile to a graph
// e.g. "d:\Tools\graphviz\bin\dot.exe -Tpng -x <IN> -o<OUT>"
// where <IN> and <OUT> are two special placeholders
@ -544,7 +544,8 @@ void DoTopologyPlot(const ConfigParameters& config)
renderCmd = msra::strfun::ReplaceAll(renderCmd, wstring(L"<OUT>"), outputFile);
}
if (!renderCmd.empty())
{
fprintf(stderr, "Executing third-party tool for rendering dot:\n%ls\n", renderCmd.c_str());
#ifdef __unix__
auto rc = system(msra::strfun::utf8(renderCmd).c_str());
@ -552,6 +553,7 @@ void DoTopologyPlot(const ConfigParameters& config)
#else
_wsystem(renderCmd.c_str());
#endif
}
fprintf(stderr, "Done.\n");
}
Просмотреть файл
@ -15,10 +15,7 @@ Format(value, format) = new StringFunction [ what = 'Format' ; arg = value ; how
Replace(s, from, to) = new StringFunction [ what = 'Replace' ; arg = s ; replacewhat = from ; withwhat = to ]
Substr(s, begin, num) = new StringFunction [ what = 'Substr' ; arg = s ; pos = begin ; chars = num ]
Chr(c) = new StringFunction [ what = 'Chr' ; arg = c ]
Floor(x) = new NumericFunction [ what = 'Floor' ; arg = x ]
Length(x) = new NumericFunction [ what = 'Length' ; arg = x ]
Ceil(x) = -Floor(-x)
Round(x) = Floor(x+0.5)
Sign(x) = if x > 0 then 1 else if x < 0 then -1 else 0
Min(a,b) = if a < b then a else b
Max(a,b) = if a > b then a else b
@ -29,7 +26,7 @@ IsSameObject(a,b) = new CompareFunction [ what = 'IsSameObject' ; args = (a : b)
# ComputationNodes
##############################################################################
LearnableParameter(rows, cols, learningRateMultiplier = 1.0, init = 'uniform'/*|fixedValue|gaussian|fromFile*/, initValueScale = 1, value = 0, initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ dims = (rows : cols) ] /*plus the function args*/ ]
LearnableParameter (outputDim, inputDim, learningRateMultiplier = 1.0, init = 'uniform'/*|fixedValue|gaussian|fromFile*/, initValueScale = 1, value = 0, initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ dims = (outputDim : inputDim) ] /*plus the function args*/ ]
Parameter = LearnableParameter // deprecated
# TODO: make Parameter take tensor dims?
ParameterTensor(dims, learningRateMultiplier = 1.0, init = 'uniform'/*|fixedValue|gaussian|fromFile*/, initValueScale = 1, value = 0, initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]
@ -41,6 +38,7 @@ SparseInput(dims, dynamicAxis='', tag='feature') = new ComputationNode [ operati
ImageInput(imageWidth, imageHeight, imageChannels, imageLayout='CHW', dynamicAxis='', tag='feature') = new ComputationNode [ operation = 'InputValue' ; isImage = true /*plus the function args*/ ]
SparseImageInput(imageWidth, imageHeight, imageChannels, imageLayout='CHW', dynamicAxis='', tag='feature') = new ComputationNode [ operation = 'SparseInputValue' ; isImage = true /*plus the function args*/ ]
EnvironmentInput(propertyName, tag='') = new ComputationNode [ operation = 'EnvironmentInput' /*plus the function args*/ ]
# TODO: make 'dims' the first parameter, think ConstantTensor<dims> (val)
ConstantTensor(val, dims, tag='') = ParameterTensor(dims, learningRateMultiplier = 0, init = 'fixedValue', value = val)
Constant(val, rows = 1, cols = 1, tag='') = Parameter(rows, cols, learningRateMultiplier = 0, init = 'fixedValue', value = val)
PastValue(dims, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'PastValue' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]
@ -94,7 +92,9 @@ Delay = PastValue
BatchNormalization(input, scale, bias, runMean, runInvStdDev, spatial, normalizationTimeConstant = 0, blendTimeConstant = 0, epsilon = 0.00001, useCntkEngine = true, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'BatchNormalization' ; inputs = (input : scale : bias : runMean : runInvStdDev) /*plus the function args*/ ]
Abs(x, tag='') = new ComputationNode [ operation = 'Abs' ; inputs = x /*plus the function args*/ ]
Ceil(x, tag='') = Negate(Floor(Negate(x)), tag=tag)
ClassBasedCrossEntropyWithSoftmax(labelClassDescriptorVectorSequence, mainInputInfo, mainWeight, classLogProbsBeforeSoftmax, tag='') = new ComputationNode [ operation = 'ClassBasedCrossEntropyWithSoftmax' ; inputs = (labelClassDescriptorVectorSequence : mainInputInfo : mainWeight : classLogProbsBeforeSoftmax) /*plus the function args*/ ]
Clip(minValue, maxValue, x, tag='') = new ComputationNode [ operation = 'Clip' ; inputs = (minValue : maxValue : x) /* plus the function args*/ ]
ColumnElementTimes(aVectorSequence, anotherVectorSequence, tag='') = new ComputationNode [ operation = 'ColumnElementTimes' ; inputs = (aVectorSequence : anotherVectorSequence) /*plus the function args*/ ]
// TODO: ColumnElementTimes = ElementTimes
CosDistance(aVectorSequence, anotherVectorSequence, tag='') = new ComputationNode [ operation = 'CosDistance' ; inputs = (aVectorSequence : anotherVectorSequence) /*plus the function args*/ ]
@ -109,11 +109,13 @@ ElementTimes(aMatrix, anotherMatrix, tag='') = new ComputationNode [ operation =
ElementDivide(aMatrix, anotherMatrix, tag='') = ElementTimes(aMatrix, Reciprocal(anotherMatrix), tag=tag)
ErrorPrediction(labelVectorSequence, outVectorSequence, tag='') = new ComputationNode [ operation = 'ErrorPrediction' ; inputs = (labelVectorSequence : outVectorSequence) /*plus the function args*/ ]
Exp(x, tag='') = new ComputationNode [ operation = 'Exp' ; inputs = x /*plus the function args*/ ]
Floor(x, tag='') = new ComputationNode [ operation = 'Floor' ; inputs = x /*plus the function args*/ ]
GatherPacked(indexSequence, sourceData, tag='') = new ComputationNode [ operation = 'GatherPacked' ; inputs = (indexSequence : sourceData) /*plus the function args*/ ]
GMMLogLikelihood(unnormalizedPriorVector, meansAsRows, logStdDevAsRows, dataVectorSequence, tag='') = new ComputationNode [ operation = 'GMMLogLikelihood' ; inputs = (unnormalizedPriorVector : meansAsRows : logStdDevAsRows : dataVectorSequence) /*plus the function args*/ ]
InvStdDev(dataVectorSequence, tag='') = new ComputationNode [ operation = 'InvStdDev' ; inputs = dataVectorSequence /*plus the function args*/ ]
KhatriRaoProduct(leftMatrix, rightMatrix, tag='') = new ComputationNode [ operation = 'KhatriRaoProduct' ; inputs = (leftMatrix : rightMatrix) /*plus the function args*/ ]
Log(x, tag='') = new ComputationNode [ operation = 'Log' ; inputs = x /*plus the function args*/ ]
LogPlus(leftMatrix, rightMatrix, tag='') = new ComputationNode [ operation = 'LogPlus' ; inputs = (leftMatrix : rightMatrix) /*plus the function args*/ ]
LogSoftmax(z, tag='') = new ComputationNode [ operation = 'LogSoftmax' ; inputs = z /*plus the function args*/ ]
MatrixL1Reg(matrix, tag='') = new ComputationNode [ operation = 'MatrixL1Reg' ; inputs = matrix /*plus the function args*/ ]
MatrixL2Reg(matrix, tag='') = new ComputationNode [ operation = 'MatrixL2Reg' ; inputs = matrix /*plus the function args*/ ]
@ -127,6 +129,12 @@ PerDimMeanVarNormalization(dataVectorSequence, meanVector, invStdDevVector, tag=
Plus(leftMatrix, rightMatrix, tag='') = new ComputationNode [ operation = 'Plus' ; inputs = (leftMatrix : rightMatrix) /*plus the function args*/ ]
Reciprocal(z, tag='') = new ComputationNode [ operation = 'Reciprocal' ; inputs = z /*plus the function args*/ ]
RectifiedLinear(z, tag='') = new ComputationNode [ operation = 'RectifiedLinear' ; inputs = z /*plus the function args*/ ]
ReducePlus (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "Plus" /*plus the function args*/ ]
#ReduceLogPlus (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "LogPlus" /*plus the function args*/ ]
#ReduceMean (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "Mean" /*plus the function args*/ ]
#ReduceMax (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "Max" /*plus the function args*/ ]
#ReduceMin (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "Min" /*plus the function args*/ ]
Round(x, tag='') = Floor(Plus(x, ConstantTensor(0.5, (1))), tag=tag)
Scale(scalarScalingFactor, matrix, tag='') = new ComputationNode [ operation = 'Scale' ; inputs = (scalarScalingFactor : matrix) /*plus the function args*/ ]
// TODO: Scale = ElementTimes
ScatterPacked(cond, indexSequence, sourceData, tag='') = new ComputationNode [ operation = 'ScatterPacked' ; inputs = (cond : indexSequence : sourceData) /*plus the function args*/ ]
@ -136,8 +144,7 @@ Softmax(z, tag='') = new ComputationNode [ operation = 'Softmax' ; inputs = z /*
Hardmax(z, tag='') = new ComputationNode [ operation = 'Hardmax' ; inputs = z /*plus the function args*/ ]
Sqrt(z, tag='') = new ComputationNode [ operation = 'Sqrt' ; inputs = z /*plus the function args*/ ]
SquareError(aMatrix, anotherMatrix, tag='') = new ComputationNode [ operation = 'SquareError' ; inputs = (aMatrix : anotherMatrix) /*plus the function args*/ ]
SumColumnElements(z, tag='') = new ComputationNode [ operation = 'SumColumnElements' ; inputs = z /*plus the function args*/ ]
# ^^ TODO: Rename to SumElements? ReduceSum without axis?
SumColumnElements(z, tag='') = new ComputationNode [ operation = 'SumColumnElements' ; inputs = z /*plus the function args*/ ] // deprecated
SumElements(matrix, tag='') = new ComputationNode [ operation = 'SumElements' ; inputs = matrix /*plus the function args*/ ]
# ^^ TODO: Rename to ReduceSumMB?
Tanh(z, tag='') = new ComputationNode [ operation = 'Tanh' ; inputs = z /*plus the function args*/ ]
@ -212,7 +219,7 @@ Boolean = [
# select a value
# Note: This will be replaced by BrainScript 'if cond then thenVal else elseVal' and SwitchNode
If (cond, thenVal, elseVal) = cond .* thenVal + Not (cond) .* elseVal
If (cond, thenVal, elseVal, tag='') = new ComputationNode [ operation = 'If' ; inputs = (cond : thenVal : elseVal) /*plus the function args*/ ]
]
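
For illustration, a small standalone C++ check of the arithmetic select that the If node replaces; it assumes cond takes only the values 0 and 1, and none of the names below are CNTK code.

    #include <cstdio>

    int main()
    {
        double cond = 1.0, thenVal = 42.0, elseVal = -1.0;
        // cond .* thenVal + Not (cond) .* elseVal, with Not (cond) written as (1 - cond)
        double selected = cond * thenVal + (1.0 - cond) * elseVal;
        std::printf("%g\n", selected); // prints 42
    }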
##############################################################################
@ -223,16 +230,24 @@ Boolean = [
Sequences = [
# broadcast a single-step sequence to a multi-step sequence
BroadcastSequenceAs (type, data1) = [ # type=example sequence with desired length (outside of a loop), data1=1 time step
ZeroSequenceLike (x) = RowSlice (0, 1, x) .* Constants.Zero # BUGBUG: SumColumnElements() has a CPU/GPU problem
index = /*Constants.*/ZeroSequenceLike (type) # create an index sequence [ 0 0 0 ... ] of target length
packedIndex = PackedIndex (data1, index) # convert into internal packed index w.r.t. 'data1'
out = GatherPacked (packedIndex, data1) # copy data1[0] to all elements, total length like 'type'
# BUGBUG: This should work but gives worse results.
#ZeroSequenceLike (x) = RowSlice (0, 1, x) .* Constants.Zero # BUGBUG: SumColumnElements() has a CPU/GPU problem
#index = /*Constants.*/ZeroSequenceLike (type) # create an index sequence [ 0 0 0 ... ] of target length
#packedIndex = PackedIndex (data1, index) # convert into internal packed index w.r.t. 'data1'
#out = GatherPacked (packedIndex, data1) # copy data1[0] to all elements, total length like 'type'
# alternative (slower, older) implementation (10% slower end-to-end?)
# Gives nearly the same result, but not completely. Since Gather() above has an atomicAdd(), let's leave this on for now and check later.
dataPadded = Sequences.Scatter (Loop.IsFirst (type), data1) # padded with zeroes until end of target sequence
out = Boolean.If (Loop.IsFirst (dataPadded), # if first entry
/*then*/ dataPadded, # then copy that
/*else*/ Loop.Previous (out)) # else just propagate to the front
].out
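
As a rough illustration of the propagate-to-the-front recurrence above, here is a standalone C++ sketch (not CNTK code); dataPadded stands for a sequence whose first step carries the value and whose remaining steps are zero.

    #include <cstdio>
    #include <vector>

    int main()
    {
        std::vector<double> dataPadded = { 7, 0, 0, 0 }; // value at the first step, zero-padded afterwards
        std::vector<double> out(dataPadded.size());
        for (size_t t = 0; t < dataPadded.size(); t++)
            out[t] = (t == 0) ? dataPadded[t]   // if first entry, copy that
                              : out[t - 1];     // else just propagate to the front
        for (double v : out)
            std::printf("%g ", v);              // prints: 7 7 7 7
        std::printf("\n");
    }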
# rolling window over past N samples
# returns a record [ value=..., valid=... ]
# returns a record [ value=..., valid=... ], both being 1-step sequences of [dim x N]. N can optionally be moved to axes >2.
# This implementation is suboptimal in that it creates copies for the intermediate steps.
PastValueWindow (N, in) = [
PastValueWindow (N, in, axis=2) = [
delayLine[t:0..N-1] = [ # shift register for encoder, last N inputs
value = if t == 0
then in # delay 0: current value
@ -243,8 +258,12 @@ Sequences = [
]
# delayLine[t].value = value of t steps in the past
# delayLine[t].valid = true if we had a value t steps in the past
value = Slice (-1, 0, axis=-1, SplitDimension (RowStack (array[0..N-1](t=>delayLine[t].value)), 1, N)) # [i, delay]
valid = Slice (-1, 0, axis=-1, SplitDimension (RowStack (array[0..N-1](t=>delayLine[t].valid)), 1, N)) # [i, delay]
SplitStack (x) =
if axis == 2 then SplitDimension (x, 1, N)
else if axis > 2 then TransposeDimensions (SplitDimension (x, 1, N), 2, axis)
else Fail ("PastValueWindow: axis>2 required.") # BUGBUG: We also require that input is a single vector. Address later.
value = Slice (-1, 0, axis=-1, SplitStack (RowStack (array[0..N-1](t=>delayLine[t].value)))) # [i, delay]
valid = Slice (-1, 0, axis=-1, SplitStack (RowStack (array[0..N-1](t=>delayLine[t].valid)))) # [i, delay]
]
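
A hedged standalone C++ sketch of the delay-line idea behind PastValueWindow (not CNTK code): keep the last N inputs together with a validity flag for steps that lie before the start of the sequence.

    #include <cstdio>
    #include <deque>
    #include <utility>
    #include <vector>

    int main()
    {
        const size_t N = 3;
        std::vector<double> input = { 10, 20, 30, 40 };
        std::deque<std::pair<double, bool>> window(N, { 0.0, false }); // (value, valid), all invalid at start
        for (double x : input)
        {
            window.pop_back();                // drop the oldest entry
            window.push_front({ x, true });   // delay 0 = current value, delay t = t steps in the past
            std::printf("window:");
            for (const auto& e : window)
                std::printf(" (%g,%d)", e.first, e.second ? 1 : 0);
            std::printf("\n");
        }
    }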
# fold left/right: Reduce entire sequence by applying binaryOp, e.g. FoldL (Plus, 0, input)
@ -374,14 +393,17 @@ Parameters =
BiasParam (dim) = ParameterTensor ((dim), init='fixedValue', value=0.0)
ScalarParam() = BiasParam (1)
# route input through an extra scalar weight, for stabilization
Stabilize (x, enabled=true) =
# route input through an extra weight, for stabilization
StabilizeElements (x, inputDim=x.dim, enabled=true) =
if enabled
then [
beta = Exp (ScalarParam())
result = Scale (beta, x)
beta = Exp (BiasParam ((inputDim)))
result = beta .* x
].result
else x
# and the same with a scalar stabilizer shared across all components
Stabilize (x, enabled=true) = if enabled then StabilizeElements (x, inputDim=1, enabled=true) else x
]
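
For the self-stabilizer above, a tiny numeric C++ illustration (not CNTK code): the learnable bias is exponentiated, so the effective scale beta is always positive and starts at 1 when the bias is initialized to 0.

    #include <cmath>
    #include <cstdio>

    int main()
    {
        double biasParam = 0.25;           // stand-in for the learnable BiasParam, initialized near 0
        double beta = std::exp(biasParam); // beta = Exp (BiasParam (...))
        double x = 3.0;
        std::printf("beta = %g, stabilized value = %g\n", beta, beta * x); // result = beta .* x
    }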
##############################################################################
@ -393,19 +415,20 @@ RNNs =
# LSTMP -- LSTM function with projection and self-stabilization
# Projection is enabled by passing different values for outputDim and cellDim.
# This is the stateless version that takes the previous state as an input.
# It returns a dictionary with two members: h and c. prevState must be in the same format.
// TODO: Standardize on one parameter order. Is first dimension the output (like in math, strcpy, or functional style) or the input (listing inputs first)?
// If we change this, we'd need to fix the LSTM end-to-end test.
LSTMP (inputDim, outputDim, cellDim, x, prevState, enableSelfStabilization=false) =
# It returns a dictionary with three members: h and c, and dim=h.dim for convenience. prevState must have h and c.
LSTMP (outputDim, cellDim=outputDim, x, inputDim=x.dim, prevState, enableSelfStabilization=false) =
[
#inputDim = x.dim # get dimension from 'x' (if this works, we can remove the inputDim1 parameter)
_privateInnards = [ // encapsulate the privateInnards workings
# TODO: rename to just _
_privateInnards = [ // encapsulate the inner workings
dh = prevState.h // previous values
dc = prevState.c
// parameter macros--these carry their own weight matrices
B() = Parameters.BiasParam (cellDim)
#inputDim1 = inputDim
#W(v) = Parameters.WeightParam (cellDim, inputDim) * Parameters.StabilizeElements (v, inputDim=inputDim1, enabled=enableSelfStabilization) // input-to-hidden
# ^^ element-wise stab, use if input is a concatenation; vv stab for entire matrix
W(v) = Parameters.WeightParam (cellDim, inputDim) * Parameters.Stabilize (v, enabled=enableSelfStabilization) // input-to-hidden
H(h) = Parameters.WeightParam (cellDim, outputDim) * Parameters.Stabilize (h, enabled=enableSelfStabilization) // hidden-to-hidden
C(c) = Parameters.DiagWeightParam (cellDim) .* Parameters.Stabilize (c, enabled=enableSelfStabilization) // cell-to-hidden (note: applied elementwise)
@ -423,12 +446,6 @@ RNNs =
ht = ot .* Tanh (ct) // applied to tanh(cell(t))
]
// LSTM cell
# TODO: This is temporary test code for the new ShiftNode (until we switch PastValue() itself over)
#PastValueShift(dimDummy, input) = Shift(input, /*fromOffsets=*/-1, /*boundaryValue=*/Constant(0.1), dim=-1)
#PastValue1 = PastValue
#PastValue1 = PastValueShift
# our return values
c = _privateInnards.ct // cell value
h = if outputDim != cellDim // output/hidden state
@ -437,41 +454,44 @@ RNNs =
htp = Wmr * Parameters.Stabilize (_privateInnards.ht, enabled=enableSelfStabilization)
].htp // TODO: ^^ extend BS syntax to allow to say: then [ Wmr = WeightParam(outputDim, cellDim) ] in Wmr * Stabilize (...)
else _privateInnards.ht // no projection
dim = outputDim
]
# this implements a recurrent (stateful) LSTM with projection and self-stabilization
RecurrentLSTMP (inputDim, outputDim, cellDim, x, enableSelfStabilization=false) =
[
prevState =
[
# helper function to delay h and c
# Callers can provide their own, e.g. useful for beam decoding.
PreviousHC (lstmState) = [
h = Loop.Previous (lstmState.h) // hidden state(t-1)
c = Loop.Previous (lstmState.c) // cell(t-1)
]
enableSelfStabilization1 = enableSelfStabilization // TODO: BS syntax needs to allow to say ^.enableSelfStabilization
lstmState = LSTMP (inputDim, outputDim, cellDim, x, prevState, enableSelfStabilization=enableSelfStabilization1)
].lstmState.h // that's the value we return
# same as RecurrentLSTMP but returns both h and c
RecurrentLSTMP2 (inputDim, outputDim, cellDim, x, enableSelfStabilization=false) =
[
prevState =
[
h = Loop.Previous (lstmState.h) # hidden state(t-1)
c = Loop.Previous (lstmState.c) # cell(t-1)
# pass previousHook=BS.RNNs.NextHC instead of PreviousHC to get a right-to-left recurrence
NextHC (lstmState) = [
h = Loop.Next (lstmState.h) // hidden state(t+1)
c = Loop.Next (lstmState.c) // cell(t+1)
]
enableSelfStabilization1 = enableSelfStabilization // TODO: BS syntax needs to allow to say ^.enableSelfStabilization
lstmState = BS.RNNs.LSTMP (inputDim, outputDim, cellDim, x, prevState, enableSelfStabilization=enableSelfStabilization1)
].lstmState // that's the value we return
# this implements a recurrent (stateful) LSTM with projection and self-stabilization
# It returns a record (h,c). To use its output, say .h
# By default, this is left-to-right. Pass previousHook=BS.RNNs.NextHC for a right-to-left model.
# TODO: remove the -2 once this works
RecurrentLSTMP = RecurrentLSTMP2
RecurrentLSTMP2 (outputDim, cellDim=outputDim, x, inputDim=x.dim, previousHook=PreviousHC, enableSelfStabilization=false) =
[
prevState = previousHook (lstmState)
inputDim1 = inputDim ; cellDim1 = cellDim ; enableSelfStabilization1 = enableSelfStabilization // TODO: BS syntax needs to allow to say ^.enableSelfStabilization
lstmState = BS.RNNs.LSTMP (outputDim, cellDim=cellDim1, x, inputDim=inputDim1, prevState, enableSelfStabilization=enableSelfStabilization1)
].lstmState // we return the state record (h,c)
# a stack of recurrent LSTMs (unidirectional)
RecurrentLSTMP2Stack (input, inputDim, hiddenDims, cellDims, enableSelfStabilization=false) = [
useStabilizer = enableSelfStabilization
layer[i:0..Length (hiddenDims)-1] =
RecurrentLSTMP2 (if i == 0 then inputDim else hiddenDims[i-1],
hiddenDims[i], cellDims[i],
if i == 0 then input else layer[i-1].h,
RecurrentLSTMPStack = RecurrentLSTMP2Stack # TODO: remove the -2 name once this works
RecurrentLSTMP2Stack (hiddenDims, cellDims=hiddenDims, input, inputDim=input.dim, previousHook=PreviousHC, enableSelfStabilization=false) = [
previousHook1 = previousHook ; useStabilizer = enableSelfStabilization
layers[i:0..Length (hiddenDims)-1] =
RecurrentLSTMP2 (hiddenDims[i], cellDim=cellDims[i],
if i == 0 then input else layers[i-1].h, inputDim=if i == 0 then inputDim else hiddenDims[i-1] /*TODO: layers[i-1].dim*/,
previousHook=previousHook1,
enableSelfStabilization=useStabilizer)
].layer
].layers
]
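
A minimal C++ sketch (not CNTK code, dimensions are made up) of the layer-stacking rule used by RecurrentLSTMP2Stack above: layer i reads the network input for i == 0 and the previous layer's output otherwise.

    #include <cstdio>
    #include <vector>

    int main()
    {
        size_t inputDim = 40;                               // e.g. 40 filterbank features
        std::vector<size_t> hiddenDims = { 512, 512, 256 };
        for (size_t i = 0; i < hiddenDims.size(); i++)
        {
            size_t layerInputDim = (i == 0) ? inputDim : hiddenDims[i - 1];
            std::printf("layer %zu: %zu -> %zu\n", i, layerInputDim, hiddenDims[i]);
        }
    }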
##############################################################################


@ -80,7 +80,7 @@
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>ActionsLib.lib; SGDLib.lib; ComputationNetworkLib.lib; Math.lib; kernel32.lib; user32.lib; shell32.lib; SequenceTrainingLib.lib; %(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>ActionsLib.lib; SGDLib.lib; ComputationNetworkLib.lib; Math.lib; Common.lib; kernel32.lib; user32.lib; shell32.lib; SequenceTrainingLib.lib; %(AdditionalDependencies)</AdditionalDependencies>
<DelayLoadDLLs>Math.dll; msmpi.dll; nvml.dll; $(CudaRuntimeDll)</DelayLoadDLLs>
<StackReserveSize>100000000</StackReserveSize>
</Link>
@ -107,7 +107,7 @@
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>ActionsLib.lib; SGDLib.lib; ComputationNetworkLib.lib; Math.lib; kernel32.lib; user32.lib; shell32.lib; SequenceTrainingLib.lib; %(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>ActionsLib.lib; SGDLib.lib; ComputationNetworkLib.lib; Math.lib; Common.lib; kernel32.lib; user32.lib; shell32.lib; SequenceTrainingLib.lib; %(AdditionalDependencies)</AdditionalDependencies>
<Profile>true</Profile>
<DelayLoadDLLs>Math.dll; msmpi.dll; nvml.dll; $(CudaRuntimeDll)</DelayLoadDLLs>
<StackReserveSize>100000000</StackReserveSize>
@ -124,12 +124,8 @@
<DelayLoadDLLs>Math.dll; msmpi.dll;</DelayLoadDLLs>
</Link>
<PostBuildEvent>
<Message Condition="'$(Configuration)|$(Platform)'=='Release_CpuOnly|x64'">Copying dependencies</Message>
<Command Condition="'$(Configuration)|$(Platform)'=='Release_CpuOnly|x64'">xcopy /I /D /Y $(ProjectDir)BrainScript\CNTKCoreLib\CNTK.core.bs $(TargetDir)</Command>
</PostBuildEvent>
<PostBuildEvent>
<Message Condition="'$(Configuration)|$(Platform)'=='Debug_CpuOnly|x64'">Copying dependencies</Message>
<Command Condition="'$(Configuration)|$(Platform)'=='Debug_CpuOnly|x64'">xcopy /I /D /Y $(ProjectDir)BrainScript\CNTKCoreLib\CNTK.core.bs $(TargetDir)</Command>
<Message>Copying dependencies</Message>
<Command>xcopy /I /D /Y "$(ProjectDir)BrainScript\CNTKCoreLib\CNTK.core.bs" "$(TargetDir)"</Command>
</PostBuildEvent>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(GpuBuild)">
@ -140,7 +136,7 @@
<AdditionalLibraryDirectories>%(AdditionalLibraryDirectories);$(CudaLibPath)</AdditionalLibraryDirectories>
</Link>
<PostBuildEvent>
<Command>xcopy /I /D /Y $(ProjectDir)BrainScript\CNTKCoreLib\CNTK.core.bs $(TargetDir) &amp;&amp; if exist "%ProgramW6432%\NVIDIA Corporation\NVSMI" xcopy /I /D /Y "%ProgramW6432%\NVIDIA Corporation\NVSMI\nvml*.dll" $(TargetDir)</Command>
<Command>xcopy /I /D /Y "$(ProjectDir)BrainScript\CNTKCoreLib\CNTK.core.bs" "$(TargetDir)" &amp;&amp; if exist "%ProgramW6432%\NVIDIA Corporation\NVSMI" xcopy /I /D /Y "%ProgramW6432%\NVIDIA Corporation\NVSMI\nvml*.dll" "$(TargetDir)"</Command>
<Message>Copying dependencies</Message>
</PostBuildEvent>
</ItemDefinitionGroup>
@ -192,21 +188,6 @@
<ClInclude Include="targetver.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\Common\Config.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\Common\DataReader.cpp" />
<ClCompile Include="..\Common\DataWriter.cpp" />
<ClCompile Include="..\Common\ExceptionWithCallStack.cpp" />
<ClCompile Include="..\Common\File.cpp">
<PrecompiledHeader Condition="$(DebugBuild)">NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\Common\fileutil.cpp">
<PrecompiledHeader Condition="$(DebugBuild)">NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\Common\Include\ConcStack.h" />
<ClCompile Include="..\Common\TimerUtility.cpp" />
<ClCompile Include="..\Common\MPIWrapper.cpp" />
<ClCompile Include="BrainScript\BrainScriptEvaluator.cpp" />
<ClCompile Include="BrainScript\BrainScriptParser.cpp" />
<ClCompile Include="BrainScript\BrainScriptTest.cpp" />


@ -0,0 +1,73 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Debug_CpuOnly|x64">
<Configuration>Debug_CpuOnly</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release_CpuOnly|x64">
<Configuration>Release_CpuOnly</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{86883653-8A61-4038-81A0-2379FAE4200A}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>CNTK</RootNamespace>
<ProjectName>Common</ProjectName>
</PropertyGroup>
<PropertyGroup Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
</PropertyGroup>
<Import Project="$(SolutionDir)\CNTK.Cpp.props" />
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings" />
<ImportGroup Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<ItemDefinitionGroup>
<ClCompile>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<AdditionalIncludeDirectories>$(SolutionDir)Source\Common\Include;$(SolutionDir)Source\Math;$(MSMPI_INC)</AdditionalIncludeDirectories>
<PreprocessorDefinitions>WIN32;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</ClCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(DebugBuild)">
<ClCompile>
<PreprocessorDefinitions>_SCL_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalOptions>/bigobj %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(ReleaseBuild)">
<ClCompile>
<AdditionalOptions>/d2Zi+ %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(CpuOnlyBuild)">
<ClCompile>
<PreprocessorDefinitions>%(PreprocessorDefinitions);CPUONLY</PreprocessorDefinitions>
</ClCompile>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="Config.cpp" />
<ClCompile Include="DataReader.cpp" />
<ClCompile Include="DataWriter.cpp" />
<ClCompile Include="Eval.cpp" />
<ClCompile Include="ExceptionWithCallStack.cpp" />
<ClCompile Include="File.cpp" />
<ClCompile Include="fileutil.cpp" />
<ClCompile Include="MPIWrapper.cpp" />
<ClCompile Include="TimerUtility.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" />
</Project>


@ -149,8 +149,15 @@ void File::Init(const wchar_t* filename, int fileOptions)
/*static*/ wstring File::DirectoryPathOf(wstring path)
{
#ifdef _WIN32
// Win32 accepts forward slashes, but it seems that PathRemoveFileSpec() does not
// TODO:
// "PathCchCanonicalize does the / to \ conversion as a part of the canonicalization, its
// probably a good idea to do that anyway since I suspect that the '..' characters might
// confuse the other PathCch functions" [Larry Osterman]
// "Consider GetFullPathName both for canonicalization and last element finding." [Jay Krell]
path = msra::strfun::ReplaceAll<wstring>(path, L"/", L"\\");
HRESULT hr;
path = msra::strfun::ReplaceAll<wstring>(path, L"/", L"\\"); // Win32 accepts forward slashes, but it seems that PathRemoveFileSpec() does not
if (IsWindows8OrGreater()) // PathCchRemoveFileSpec() only available on Windows 8+
{
typedef HRESULT(*PathCchRemoveFileSpecProc)(_Inout_updates_(_Inexpressible_(cchPath)) PWSTR, _In_ size_t);
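
A simplified, portable C++ sketch of what DirectoryPathOf() does, assuming no Win32 path APIs; it only normalizes forward slashes and strips the last path element, and is not the actual CNTK code path.

    #include <cstdio>
    #include <string>

    std::wstring DirectoryPathOfSimple(std::wstring path)
    {
        for (auto& ch : path)          // Win32 accepts forward slashes, so normalize them first
            if (ch == L'/')
                ch = L'\\';
        size_t pos = path.find_last_of(L'\\');
        return pos == std::wstring::npos ? std::wstring(L".") : path.substr(0, pos);
    }

    int main()
    {
        std::wprintf(L"%ls\n", DirectoryPathOfSimple(L"C:/models/lstm/model.dnn").c_str()); // prints C:\models\lstm
    }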


@ -36,6 +36,9 @@ enum NodeGroup
};
// IEvaluateModel - interface used by decoders and other components that need just evaluator functionality in DLL form
// NOTICE: This interface is a public interface for evaluating models in CNTK.
// Changes to this interface may affect other projects, such as Argon and LatGen,
// and therefore need to be communicated to those groups.
template <class ElemType>
class IEvaluateModel // Evaluate Model Interface
{


@ -487,7 +487,7 @@ private:
void CheckIsValid() const
{
if (m_numFramesDeclared != GetNumCols())
LogicError("MBLayout: Attempting to read out flags, but only only %d out of %d frames have been defined.",
LogicError("MBLayout: Attempting to read out flags, but only %d out of %d frames have been defined.",
(int) m_numFramesDeclared, (int) (m_numTimeSteps * m_numParallelSequences));
}


@ -701,6 +701,13 @@ public:
return s;
}
// pretty-printing, wstring version
operator std::wstring() const
{
std::string s = this->operator std::string();
return msra::strfun::utf16(s);
}
private:
// reset m_strides and m_offset to represent a canonical no-strides column-major tensor
void InitAsNoSlice()


@ -29,8 +29,17 @@ struct ComputationEnvironment
{
// networkOperationMode tells whether we are training or inferring, which affects some nodes' behavior
NetworkOperationMode m_networkOperationMode = NetworkOperationMode::inferring; // by default, a network is always able to infer
bool IsInferring() const { return m_networkOperationMode == NetworkOperationMode::inferring; }
bool IsTraining() const { return m_networkOperationMode == NetworkOperationMode::training; }
bool IsPreComputing() const { return m_networkOperationMode == NetworkOperationMode::preComputing; }
// set the new mode and return the old one
NetworkOperationMode SetOperationMode(NetworkOperationMode mode)
{
NetworkOperationMode oldMode = m_networkOperationMode;
m_networkOperationMode = mode;
return oldMode;
}
// more properties should be added here as needed
};
typedef std::shared_ptr<ComputationEnvironment> ComputationEnvironmentPtr;
@ -48,12 +57,11 @@ public:
ScopedNetworkOperationMode(const std::shared_ptr<ComputationNetwork>& net, NetworkOperationMode networkOperationMode) :
m_environment(net->Environment())
{
m_previousNetworkOperationMode = m_environment.m_networkOperationMode;
m_environment.m_networkOperationMode = networkOperationMode;
m_previousNetworkOperationMode = m_environment.SetOperationMode(networkOperationMode);
}
~ScopedNetworkOperationMode() // destructor restores the previous mode
{
m_environment.m_networkOperationMode = m_previousNetworkOperationMode;
m_environment.SetOperationMode(m_previousNetworkOperationMode);
}
};
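
The set-and-return-old refactoring above exists to support the RAII helper; here is a minimal standalone C++ sketch of that pattern (names like DemoEnvironment and ScopedMode are illustrative, not CNTK types).

    #include <cstdio>

    enum class Mode { inferring, training, preComputing };

    struct DemoEnvironment
    {
        Mode m_mode = Mode::inferring;
        Mode SetMode(Mode mode) { Mode old = m_mode; m_mode = mode; return old; } // set the new mode, return the old one
    };

    class ScopedMode
    {
        DemoEnvironment& m_env;
        Mode m_previous;
    public:
        ScopedMode(DemoEnvironment& env, Mode mode) : m_env(env), m_previous(env.SetMode(mode)) {}
        ~ScopedMode() { m_env.SetMode(m_previous); } // destructor restores the previous mode
    };

    int main()
    {
        DemoEnvironment env;
        {
            ScopedMode trainingScope(env, Mode::training);
            std::printf("inside scope, training: %d\n", env.m_mode == Mode::training ? 1 : 0);
        }
        std::printf("after scope, inferring: %d\n", env.m_mode == Mode::inferring ? 1 : 0);
    }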


@ -776,8 +776,8 @@ void ComputationNetwork::DescribeNetworkUsingDot(list<ComputationArc>& arcs,
for (const auto& x : allnodes)
{
line.clear();
line = msra::strfun::wstrprintf(L" \"%ls\" [ label = \"%ls [%s%ls]\\n%ls\" ] ;\n",
x->GetName().c_str(), x->GetName().c_str(), string(x->GetSampleLayout()).c_str(), x->GetMBLayoutAxisString().c_str(),
line = msra::strfun::wstrprintf(L" \"%ls\" [ label = \"%ls [%ls%ls]\\n%ls\" ] ;\n",
x->GetName().c_str(), x->GetName().c_str(), wstring(x->GetSampleLayout()).c_str(), x->HasMBLayout() ? L" x *" : L"",
x->OperationName().c_str());
fstream << line;
}
@ -851,7 +851,7 @@ void ComputationNetwork::DescribeNetworkUsingDot(list<ComputationArc>& arcs,
fstream << L"\n}\n";
}
void ComputationNetwork::PlotNetworkTopology(const wstring outputFile) // [1/13/2015 erw] plot network topology using dot language
void ComputationNetwork::PlotNetworkTopology(const wstring& outputFile)
{
VerifyIsCompiled("PlotNetworkTopology");
// ValidateNetwork(false, true);


@ -188,6 +188,7 @@ public:
void AllocateAllMatrices(const std::vector<ComputationNodeBasePtr>& evalRootNodes, const std::vector<ComputationNodeBasePtr>& outValueRootNodes, ComputationNodeBasePtr trainRootNode);
private:
template <class ElemType> void PrintMemorySharingStructure(const std::vector<ComputationNodeBasePtr>& nodes);
void ReleaseMatricesAfterEvalForChildren(ComputationNodeBasePtr n, std::unordered_map<ComputationNodeBasePtr, int>& parentCount);
void AllocateGradientMatricesForInputs(ComputationNodeBasePtr parentNode);
@ -832,7 +833,7 @@ private:
public:
void DescribeNetworkUsingDot(std::list<ComputationArc>& arcs, std::wstring outFile);
void PlotNetworkTopology(const std::wstring outputFile);
void PlotNetworkTopology(const std::wstring& outputFile);
// -----------------------------------------------------------------------
// scripting integration


@ -9,18 +9,19 @@
#include "Basics.h"
#include "ComputationNetworkBuilder.h"
#include "ComputationNode.h"
#include "ConvolutionalNodes.h"
#include "DeprecatedNodes.h"
#include "EvaluationNodes.h"
#include "InputAndParamNodes.h"
#include "LinearAlgebraNodes.h"
#include "NonlinearityNodes.h"
#include "ConvolutionalNodes.h"
#include "RecurrentNodes.h"
#include "ReshapingNodes.h"
#include "PreComputeNodes.h"
#include "TrainingNodes.h"
#include "EvaluationNodes.h"
#include "ReshapingNodes.h"
#include "RecurrentNodes.h"
#include "SpecialPurposeNodes.h"
#include "TrainingNodes.h"
#include <string>
@ -39,6 +40,7 @@ static shared_ptr<ComputationNode<ElemType>> CreateStandardNode(const std::wstri
#endif
if (nodeType == OperationNameOf(AbsNode)) return New<AbsNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ClassBasedCrossEntropyWithSoftmaxNode))return New<ClassBasedCrossEntropyWithSoftmaxNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ClipNode)) return New<ClipNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(CosDistanceNode)) return New<CosDistanceNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(CosDistanceWithNegativeSamplesNode)) return New<CosDistanceWithNegativeSamplesNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(CosineNode)) return New<CosineNode<ElemType>>(forward<_Types>(_Args)...);
@ -53,12 +55,14 @@ static shared_ptr<ComputationNode<ElemType>> CreateStandardNode(const std::wstri
else if (nodeType == OperationNameOf(EnvironmentInputNode)) return New<EnvironmentInputNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ErrorPredictionNode)) return New<ErrorPredictionNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ExpNode)) return New<ExpNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(FloorNode)) return New<FloorNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(FutureValueNode)) return New<FutureValueNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(GatherPackedNode)) return New<GatherPackedNode<ElemType>>(forward<_Types>(_Args)...);
#ifdef COMING_SOON
else if (nodeType == OperationNameOf(GMMLogLikelihoodNode)) return New<GMMLogLikelihoodNode<ElemType>>(forward<_Types>(_Args)...);
#endif
else if (nodeType == OperationNameOf(HardmaxNode)) return New<HardmaxNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(IfNode)) return New<IfNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(InvStdDevNode)) return New<InvStdDevNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(KhatriRaoProductNode)) return New<KhatriRaoProductNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(LogNode)) return New<LogNode<ElemType>>(forward<_Types>(_Args)...);
@ -79,6 +83,7 @@ static shared_ptr<ComputationNode<ElemType>> CreateStandardNode(const std::wstri
else if (nodeType == OperationNameOf(ReconcileDynamicAxisNode)) return New<ReconcileDynamicAxisNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ReciprocalNode)) return New<ReciprocalNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(RectifiedLinearNode)) return New<RectifiedLinearNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ReduceElementsNode)) return New<ReduceElementsNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ReshapeNode)) return New<ReshapeNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(RowRepeatNode)) return New<RowRepeatNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(RowStackNode)) return New<RowStackNode<ElemType>>(forward<_Types>(_Args)...);
@ -417,6 +422,12 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Class
return net.AddNodeToNetAndAttachInputs(New<ClassBasedCrossEntropyWithSoftmaxNode<ElemType>>(net.GetDeviceId(), nodeName), { label, prediction, input_weight, cls_log_post_prob });
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Clip(const ComputationNodePtr a, const ComputationNodePtr b, const ComputationNodePtr c, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ClipNode<ElemType>>(net.GetDeviceId(), nodeName), { a, b, c });
}
#ifdef COMING_SOON
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CRF(const ComputationNodePtr label,
@ -530,12 +541,24 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Abs(c
return net.AddNodeToNetAndAttachInputs(New<AbsNode<ElemType>>(net.GetDeviceId(), nodeName), { a });
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Floor(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<FloorNode<ElemType>>(net.GetDeviceId(), nodeName), { a });
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Hardmax(const ComputationNodePtr a, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<HardmaxNode<ElemType>>(net.GetDeviceId(), nodeName), { a });
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::If(const ComputationNodePtr a, const ComputationNodePtr b, const ComputationNodePtr c, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<IfNode<ElemType>>(net.GetDeviceId(), nodeName), { a, b, c });
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Softmax(const ComputationNodePtr a, const std::wstring nodeName)
{


@ -99,6 +99,7 @@ public:
#endif
ComputationNodePtr Abs(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr ClassCrossEntropyWithSoftmax(const ComputationNodePtr label, const ComputationNodePtr prediction, const ComputationNodePtr input_weight, const ComputationNodePtr cls_log_post_prob, const std::wstring nodeName = L"");
ComputationNodePtr Clip(const ComputationNodePtr a, const ComputationNodePtr b, const ComputationNodePtr c, const std::wstring nodeName = L"");
ComputationNodePtr Cos(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr CosDistance(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr CrossEntropy(const ComputationNodePtr label, const ComputationNodePtr prediction, const std::wstring nodeName = L"");
@ -111,11 +112,13 @@ public:
ComputationNodePtr DynamicAxis(const ComputationNodePtr a, const std::wstring& nodeName = L"");
ComputationNodePtr ErrorPrediction(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr Exp(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr Floor(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr FutureValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, size_t timeStep, const std::wstring nodeName = L"");
#ifdef COMING_SOON
ComputationNodePtr GMMLogLikelihood(const ComputationNodePtr unnormedPrior, const ComputationNodePtr mean, const ComputationNodePtr logStddev, const ComputationNodePtr feature, const std::wstring nodeName = L"");
#endif
ComputationNodePtr Hardmax(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr If(const ComputationNodePtr a, const ComputationNodePtr b, const ComputationNodePtr c, const std::wstring nodeName = L"");
ComputationNodePtr InvStdDev(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr KhatriRaoProduct(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr Log(const ComputationNodePtr a, const std::wstring nodeName = L"");


@ -105,7 +105,7 @@ ComputationNodeBasePtr ComputationNetwork::GetNestedNetwork(const ComputationNod
ComputationNetwork::PARTraversalFlowControlNode::PARTraversalFlowControlNode(const std::vector<shared_ptr<SEQTraversalFlowControlNode>>& recurrentInfo, const std::list<ComputationNodeBasePtr>& allNodes /*must be in eval order*/)
{
// traverse the network in evaluation order and create a new list that replaces all recurrence by a SEQTraversalFlowControlNode
set<shared_ptr<IComputationNode>> loopsSeen; // for consistency check only
std::set<shared_ptr<IComputationNode>> loopsSeen; // for consistency check only
for (auto nodeIter = allNodes.begin(); nodeIter != allNodes.end();)
{
shared_ptr<SEQTraversalFlowControlNode> recInfo = FindInRecurrentLoops(recurrentInfo, *nodeIter); // check if this node participates in a recurrent loop
@ -803,6 +803,40 @@ void ComputationNetwork::MarkValueNonSharableNodes()
}
}
template <class ElemType>
void ComputationNetwork::PrintMemorySharingStructure(const std::vector<ComputationNodeBasePtr>& nodes)
{
std::map <const Matrix<ElemType>*, std::set<wstring>> memSharingStructure;
for (auto& n : nodes)
{
ComputationNode<ElemType>* node = n->As<ComputationNode<ElemType>>();
std::set<std::pair<const Matrix<ElemType>*, const std::wstring>> matrixInfo = node->GetMatrixInfo();
for (const auto&item : matrixInfo)
{
const Matrix<ElemType>* matrix = item.first;
if (memSharingStructure.find(matrix) == memSharingStructure.end())
memSharingStructure.insert(std::pair<const Matrix<ElemType>*, std::set<wstring>>(matrix, std::set<wstring>()));
std::set<wstring>& s = memSharingStructure[matrix];
s.insert(item.second);
}
}
fprintf(stderr, "\nMemory Sharing Structure:\n\n");
for (const auto& item : memSharingStructure)
{
const std::set<wstring>& s = item.second;
fprintf(stderr, "%p: {", item.first);
for (const auto& memShareInfo: s)
{
fprintf(stderr, "[%ls] ", memShareInfo.c_str());
}
fprintf(stderr, "}\n");
}
fprintf(stderr, "\n");
}
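
To show what the grouping above achieves, a toy standalone C++ example (not CNTK code): names that map to the same matrix address end up in one set, which is what makes buffer sharing visible in the printout.

    #include <cstdio>
    #include <map>
    #include <set>
    #include <string>

    int main()
    {
        double bufferA = 0, bufferB = 0;                      // stand-ins for two Matrix objects
        std::map<const void*, std::set<std::string>> sharing;
        sharing[&bufferA].insert("node1 Value");
        sharing[&bufferA].insert("node2 Gradient");           // node2 reuses node1's buffer
        sharing[&bufferB].insert("node3 Value");
        for (const auto& item : sharing)
            std::printf("%p: %zu user(s)\n", (void*) item.first, item.second.size());
    }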
// this function will need to be called before actual validation and execution to
// predetermine how to share matrices to reduce memory usage.
// TODO: find a simple topological order and allocateEvalMatrices on that order directly
@ -947,6 +981,18 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
}
m_areMatricesAllocated = true;
//print the memory sharing structure
std::vector<ComputationNodeBasePtr> allNodes = GetAllNodes();
if (allNodes.size() == 0)
LogicError("Network has no computation node.");
if (allNodes[0]->Is<ComputationNode<float>>())
PrintMemorySharingStructure<float>(allNodes);
else if (allNodes[0]->Is<ComputationNode<double>>())
PrintMemorySharingStructure<double>(allNodes);
else
LogicError("Unexpected node precision type.");
}
void ComputationNetwork::ReleaseMatricesAfterEvalForChildren(ComputationNodeBasePtr n, std::unordered_map<ComputationNodeBasePtr, int>& parentCount)


@ -24,95 +24,45 @@
<RootNamespace>CNTK</RootNamespace>
<ProjectName>ComputationNetworkLib</ProjectName>
</PropertyGroup>
<PropertyGroup Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
</PropertyGroup>
<Import Project="$(SolutionDir)\CNTK.Cpp.props" />
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings" />
<ImportGroup Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="$(DebugBuild)">
<LinkIncremental>true</LinkIncremental>
<PreBuildEventUseInBuild>false</PreBuildEventUseInBuild>
</PropertyGroup>
<PropertyGroup Condition="$(ReleaseBuild)">
<LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ExecutablePath)</ExecutablePath>
<PreBuildEventUseInBuild>false</PreBuildEventUseInBuild>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<AdditionalIncludeDirectories>$(SolutionDir)Source\SequenceTrainingLib;$(SolutionDir)Source\Math;$(SolutionDir)Source\Common\Include;$(SolutionDir)Source\CNTK\BrainScript;$(SolutionDir)Source\ActionsLib;$(MSMPI_INC);$(NvmlInclude)</AdditionalIncludeDirectories>
<PrecompiledHeader>
</PrecompiledHeader>
<PreprocessorDefinitions>WIN32;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</ClCompile>
<Link>
<AdditionalLibraryDirectories>$(MSMPI_LIB64);$(OutDir);$(NvmlLib)</AdditionalLibraryDirectories>
<AdditionalDependencies>Math.lib;%(AdditionalDependencies)</AdditionalDependencies>
<DelayLoadDLLs>Math.dll</DelayLoadDLLs>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(DebugBuild)">
<ClCompile>
<PrecompiledHeader>
</PrecompiledHeader>
<WarningLevel>Level4</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>_SCL_SECURE_NO_WARNINGS;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>true</SDLCheck>
<OpenMPSupport>true</OpenMPSupport>
<TreatWarningAsError>true</TreatWarningAsError>
<PreprocessorDefinitions>_SCL_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalOptions>/bigobj %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Math.lib; kernel32.lib; user32.lib; shell32.lib; %(AdditionalDependencies)</AdditionalDependencies>
<DelayLoadDLLs>Math.dll; nvml.dll; $(CudaRuntimeDll)</DelayLoadDLLs>
<StackReserveSize>100000000</StackReserveSize>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(ReleaseBuild)">
<ClCompile>
<WarningLevel>Level4</WarningLevel>
<PrecompiledHeader>
</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>true</SDLCheck>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<AdditionalOptions>/d2Zi+ %(AdditionalOptions)</AdditionalOptions>
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>Math.lib; kernel32.lib; user32.lib; shell32.lib; %(AdditionalDependencies)</AdditionalDependencies>
<Profile>true</Profile>
<DelayLoadDLLs>Math.dll; nvml.dll; $(CudaRuntimeDll)</DelayLoadDLLs>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(CpuOnlyBuild)">
<ClCompile>
<PreprocessorDefinitions>CPUONLY;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</ClCompile>
<Link>
<DelayLoadDLLs>Math.dll</DelayLoadDLLs>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(GpuBuild)">
<ClCompile>
@ -120,9 +70,10 @@
</ClCompile>
<Link>
<AdditionalLibraryDirectories>%(AdditionalLibraryDirectories);$(CudaLibPath)</AdditionalLibraryDirectories>
<DelayLoadDLLs>%(DelayLoadDLLs);nvml.dll;$(CudaRuntimeDll)</DelayLoadDLLs>
</Link>
<PostBuildEvent>
<Command>if exist "%ProgramW6432%\NVIDIA Corporation\NVSMI" xcopy /I /D /Y "%ProgramW6432%\NVIDIA Corporation\NVSMI\nvml*.dll" $(TargetDir)</Command>
<Command>if exist "%ProgramW6432%\NVIDIA Corporation\NVSMI" xcopy /I /D /Y "%ProgramW6432%\NVIDIA Corporation\NVSMI\nvml*.dll" "$(TargetDir)"</Command>
<Message>Copying NVidia GDK extension DLL to target folder</Message>
</PostBuildEvent>
</ItemDefinitionGroup>
@ -144,6 +95,7 @@
<ClInclude Include="ComputationNetworkBuilder.h" />
<ClInclude Include="ComputationNode.h" />
<ClInclude Include="ConvolutionalNodes.h" />
<ClInclude Include="DeprecatedNodes.h" />
<ClInclude Include="PreComputeNodes.h" />
<ClInclude Include="SpecialPurposeNodes.h" />
<ClInclude Include="EvaluationNodes.h" />
@ -159,13 +111,6 @@
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\Common\BestGpu.cpp" />
<ClCompile Include="..\Common\File.cpp">
<PrecompiledHeader Condition="$(DebugBuild)">NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\Common\fileutil.cpp">
<PrecompiledHeader Condition="$(DebugBuild)">NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\Common\TimerUtility.cpp" />
<ClCompile Include="ComputationNetwork.cpp" />
<ClCompile Include="ComputationNetworkAnalysis.cpp" />
<ClCompile Include="ComputationNetworkBuilder.cpp" />


@ -138,6 +138,9 @@
<ClInclude Include="ComputationEnvironment.h">
<Filter>Environment</Filter>
</ClInclude>
<ClInclude Include="DeprecatedNodes.h">
<Filter>Nodes</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Filter Include="Common">


@ -158,6 +158,84 @@ void ComputationNodeBase::ValidateBinaryZip(bool isFinalValidationPass, bool all
SetDims(TensorShape(dims), HasMBLayout());
}
// N-ary zip operation, e.g. a ternary zip for Clip().
// If allowBroadcast then one input may be a sub-dimension of the other (with an MBLayout only for rows, otherwise for cols, too).
// This also helpfully resizes the children if not yet sized.
void ComputationNodeBase::ValidateNaryZip(bool isFinalValidationPass, bool allowBroadcast, size_t numInputs)
{
assert(m_inputs.size() == numInputs);
ComputationNodeBase::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase(isFinalValidationPass);
ValidateInferNaryInputDims(numInputs);
// check minibatch layout consistency for all possible pairs (n choose 2)
if (isFinalValidationPass)
for (size_t i = 0; i < numInputs; i++)
for (size_t j = i+1; j < numInputs; j++)
if (Input(i)->GetMBLayout() != Input(j)->GetMBLayout() && Input(i)->HasMBLayout() && Input(j)->HasMBLayout())
LogicError("%ls: Minibatch layouts are not the same between arguments and might get out of sync during runtime. If this is by design, use ReconcileDynamicAxis() to forward layouts between nodes.", NodeDescription().c_str());
// result has tensor shape with dimensions being the max over all inputs
let shape0 = GetInputSampleLayout(0);
// dims is max over all inputs
size_t maxRank = shape0.GetRank();
for (size_t i = 1; i < numInputs; i++)
{
let shape = GetInputSampleLayout(i);
if (shape.GetRank() > maxRank)
maxRank = shape.GetRank();
}
SmallVector<size_t> dims = shape0.GetDims();
dims.resize(maxRank, 1); // pad with 1
// first check for invalid dimensions
for (size_t k = 0; k < maxRank; k++)
{
size_t maxDim = 0;
TensorShape maxShape = shape0; // arbitrary; this is just used for the error message
for (size_t i = 0; i < numInputs; i++)
{
let currentShape = GetInputSampleLayout(i);
size_t currentRank = currentShape.GetRank();
// make sure that the rank of this input is bigger than the current index (otherwise, these are implied singleton dimensions that do not need to be checked)
if (currentRank > k)
{
size_t currentDim = currentShape[k];
if (currentDim > 1 && maxDim != currentDim && maxDim > 1) // 1=broadcasting, 0=not known yet, meant to be inferred
{
InvalidArgument("%ls: Input dimensions [%s] and [%s] are not compatible.",
NodeDescription().c_str(), string(maxShape).c_str(), string(currentShape).c_str());
}
else if (currentDim > maxDim)
{
maxDim = currentDim;
maxShape = currentShape;
}
}
}
}
// now set up the right dims
for (size_t k = 0; k < maxRank; k++)
{
for (size_t i = 0; i < numInputs; i++)
{
let shape = GetInputSampleLayout(i);
if (shape.GetRank() > k)
{
size_t dim = shape[k];
if (dims[k] <= 1 && dim != 0)
dims[k] = dim;
}
}
}
SetDims(TensorShape(dims), HasMBLayout());
}
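
A hedged standalone C++ sketch of the broadcast-shape rule implemented above (ignoring the to-be-inferred case where a dimension is still 0): every result dimension is the maximum over all inputs, a 1 broadcasts, and two different dimensions greater than 1 are an error. None of the names below are CNTK code.

    #include <algorithm>
    #include <stdexcept>
    #include <vector>

    std::vector<size_t> BroadcastShape(const std::vector<std::vector<size_t>>& shapes)
    {
        size_t maxRank = 0;
        for (const auto& s : shapes)
            maxRank = std::max(maxRank, s.size());
        std::vector<size_t> dims(maxRank, 1);          // pad with 1 (implied singleton dimensions)
        for (size_t k = 0; k < maxRank; k++)
        {
            for (const auto& s : shapes)
            {
                size_t dim = (k < s.size()) ? s[k] : 1;
                if (dim == 1)
                    continue;                          // 1 = broadcasting
                if (dims[k] != 1 && dims[k] != dim)
                    throw std::invalid_argument("input dimensions are not compatible");
                dims[k] = dim;
            }
        }
        return dims;
    }

    // e.g. BroadcastShape ({ {3, 1}, {3, 4}, {1} }) yields {3, 4}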
// unary reduce-to-(1,1) operation, e.g. MatrixL1RegNode
void ComputationNodeBase::ValidateUnaryReduce(bool isFinalValidationPass)
{
@ -215,6 +293,30 @@ void ComputationNodeBase::ValidateInferBinaryInputDims()
}
}
// as above but for N-ary cases
void ComputationNodeBase::ValidateInferNaryInputDims(size_t numInputs)
{
// limited inference of children dimensions
// if a dimension is not specified, we assume the operands' dimensions should be the same
// NOTE: The assert is set to check if >= numInputs since this is called from nodes which have more than 'numInputs' children.
// The number of children is formally verified elsewhere, so this will not break consistency.
assert(m_inputs.size() >= numInputs);
for (size_t index = 0; index < numInputs; index++)
{
const auto& in = Input(index);
for (size_t indexOther = 0; indexOther < numInputs; indexOther++)
{
if (indexOther != index)
{
const auto& other = Input(indexOther);
// borrow any unset dimension on one input from the other input
in->ValidateInferInputDimsFrom(other->GetSampleLayout());
}
}
}
}
// in case of an error, we just back out, and leave it to outside code to detect errors
template <class ElemType>
void ComputationNode<ElemType>::ValidateInferInputDimsFrom(const TensorShape& otherShape)
@ -252,7 +354,7 @@ TensorShape ComputationNodeBase::GetTensorShape(size_t rank) const
TensorShape tensorShape = GetSampleLayout(); // TODO: Do we need to expect this tensor to have arbitrary strides? In case it came out of a Slice, Reshape, or Transpose op in-place?
if (HasMBLayout())
{
size_t i = rank;
size_t i = (rank != SIZE_MAX) ? rank : tensorShape.GetRank();
tensorShape.AppendInPlace(i++, GetMBLayout()->GetNumParallelSequences());
tensorShape.AppendInPlace(i++, GetMBLayout()->GetNumTimeSteps());
}


@ -649,8 +649,10 @@ protected:
void ValidateUnaryMap(bool isFinalValidationPass);
void ValidateUnaryReduce(bool isFinalValidationPass);
void ValidateInferBinaryInputDims();
void ValidateInferNaryInputDims(size_t numInputs);
void ValidateBinaryZip(bool isFinalValidationPass, bool allowBroadcast);
void ValidateBinaryReduce(bool isFinalValidationPass);
void ValidateNaryZip(bool isFinalValidationPass, bool allowBroadcast, size_t numInputs);
void InferMBLayoutFromInputsForStandardCase(bool isFinalValidationPass);
virtual void ValidateInferInputDimsFrom(const TensorShape&) = 0; // (implemented by ComputationNode<ElemType>)
@ -1318,7 +1320,7 @@ public:
void UpdateFunctionValuesSize()
{
UpdateDataSize(Value());
Value().CollapseDataLocationAfterWriting(); // actually before writing, should change the name
Value().CollapseDataLocation(); // actually before writing, should change the name
}
// -----------------------------------------------------------------------
@ -1420,6 +1422,16 @@ public:
// memory sharing
// -----------------------------------------------------------------------
// this function is for displaying memory sharing information
// TODO: customize this function for all nodes that use temp internal matrices.
virtual std::set<std::pair<const Matrix<ElemType>*, const std::wstring>> GetMatrixInfo()
{
std::set<std::pair<const Matrix<ElemType>*, const std::wstring>> matrixInfo;
matrixInfo.insert(make_pair(&Value(), NodeName() + L" Value" + msra::strfun::utf16(ShapeDescription())));
matrixInfo.insert(make_pair(&Gradient(), NodeName() + L" Gradient" + msra::strfun::utf16(ShapeDescription())));
return matrixInfo;
}
// request matrices needed to do node function value evaluation
virtual void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override
{
@ -1961,7 +1973,9 @@ protected:
using Base::Validate; \
using Base::ValidateBinaryReduce; \
using Base::ValidateBinaryZip; \
using Base::ValidateNaryZip; \
using Base::ValidateInferBinaryInputDims; \
using Base::ValidateInferNaryInputDims; \
using Base::ValidateInferInputDimsFrom; \
using Base::ValidateUnaryMap; \
using Base::ValidateUnaryReduce; \


@ -0,0 +1,64 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "Basics.h"
#include "ComputationNode.h"
#include "Matrix.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
// SumColumnElements (input)
// Sums up all elements in each sample (column) of the input. Every sample
// will be reduced to a scalar. This is equivalent to multiplying with a row of ones.
// This is deprecated, in favor of ReduceElements().
// -----------------------------------------------------------------------
template <class ElemType>
class SumColumnElementsNode : public ComputationNode<ElemType>, public NumInputs<1>
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName() { return L"SumColumnElements"; }
public:
DeclareConstructorFromConfigWithNumInputs(SumColumnElementsNode);
SumColumnElementsNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name)
{
}
virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
{
auto sliceInputValue = Input(0)->ValueFor(fr);
auto sliceOutputValue = ValueFor(fr); // row vector
Matrix<ElemType>::VectorSum(sliceInputValue, sliceOutputValue, true);
}
virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange& fr) override
{
auto sliceInputGrad = Input(0)->GradientFor(fr);
auto sliceOutputGrad = GradientFor(fr);
sliceInputGrad += sliceOutputGrad; // here the assumption is that sliceOutputGrad is a row vector
}
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; }
virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase(isFinalValidationPass);
SetDims(TensorShape(1), Input(0)->HasMBLayout()); // each column is reduced to a scalar
}
};
template class SumColumnElementsNode<float>;
template class SumColumnElementsNode<double>;
}}}
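
A tiny plain C++ illustration (not CNTK code) of the equivalence stated in the comment above: summing each column is the same as multiplying the matrix from the left with a row of ones.

    #include <cstdio>
    #include <vector>

    int main()
    {
        const size_t rows = 3, cols = 2;
        std::vector<double> X = { 1, 2, 3,    // column 0 (column-major, as CNTK stores matrices)
                                  4, 5, 6 };  // column 1
        for (size_t j = 0; j < cols; j++)
        {
            double sum = 0;                   // equals [1 1 1] * X(:,j)
            for (size_t i = 0; i < rows; i++)
                sum += X[j * rows + i];
            std::printf("column %zu sums to %g\n", j, sum); // 6 and 15
        }
    }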


@ -29,12 +29,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
class PlusNode : public BinaryElementWiseNode<ElemType>
{
typedef BinaryElementWiseNode<ElemType> Base;
UsingBinaryElementwiseNodeBaseMembers;
static const std::wstring TypeName()
{
return L"Plus";
}
typedef BinaryElementWiseNode<ElemType> Base; UsingBinaryElementwiseNodeBaseMembers;
static const std::wstring TypeName() { return L"Plus"; }
public:
DeclareConstructorFromConfigWithNumInputs(PlusNode);
@ -76,12 +72,8 @@ template class PlusNode<double>;
template <class ElemType>
class LogPlusNode : public BinaryElementWiseNode<ElemType>
{
typedef BinaryElementWiseNode<ElemType> Base;
UsingBinaryElementwiseNodeBaseMembers;
static const std::wstring TypeName()
{
return L"LogPlus";
}
typedef BinaryElementWiseNode<ElemType> Base; UsingBinaryElementwiseNodeBaseMembers;
static const std::wstring TypeName() { return L"LogPlus"; }
public:
DeclareConstructorFromConfigWithNumInputs(LogPlusNode);
@ -113,6 +105,7 @@ public:
if (Input(inputIndex)->ReducesInTimeWrt(Input(1 - inputIndex)))
Input(1 - inputIndex)->MaskMissingValueColumnsToZero(fr);
// TODO: would be nice to state the derivative here in a comment
inputGradient.AddElementwiseProductWithLogSumDerivativeOf(gradient, input0, input1);
}
};
@ -497,8 +490,7 @@ template class TimesNode<double>;
// This differs from TimesNode in that A is transposed, where A must be a
// rank-1 or rank-2 tensor.
// A common use of transposition is trace(X'X) where X is a matrix of samples.
// This can NOT be implemented with this node. Instead, use
// SumColumnElements (ElementTimes (X, X))
// This can be more efficiently implemented as ReducePlus (ElementTimes (X, X))
// -----------------------------------------------------------------------
template <class ElemType>
@ -653,6 +645,9 @@ template class DiagTimesNode<double>;
// When applied to minibatch data, this will sum across all sequences in the
// minibatch, like a training-criterion node. This is one of the few operations
// that cross the boundary between input sequences.
// Note that SGD itself aggregates over samples in a criterion node.
// So the only proper use of this node is for multi-task learning, where
// different nodes have different numbers of samples (sequence length).
// -----------------------------------------------------------------------
template <class ElemType>
@ -697,63 +692,6 @@ public:
template class SumElementsNode<float>;
template class SumElementsNode<double>;
// -----------------------------------------------------------------------
// SumColumnElementsNode (input)
// Sums up all elements in each sample (column) of the input. Every sample
// will be reduced to a scalar. This is equivalent to multiplying with a row of ones.
// TODO: This should be deprecated, in favor of a reduce node.
// TODO: Implement this with the tensor library.
// axis=0: all elements; axis>0: only that axis; axis<0: time (implemented in BS)
// -----------------------------------------------------------------------
template <class ElemType>
class SumColumnElementsNode : public ComputationNode<ElemType>, public NumInputs<1>
{
typedef ComputationNode<ElemType> Base;
UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName()
{
return L"SumColumnElements";
}
public:
DeclareConstructorFromConfigWithNumInputs(SumColumnElementsNode);
SumColumnElementsNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name)
{
}
virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange& fr) override
{
auto sliceInputGrad = Input(0)->GradientFor(fr);
auto sliceOutputGrad = GradientFor(fr);
sliceInputGrad += sliceOutputGrad; // here the assumption is that sliceOutputGrad is a row vector
}
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; }
virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
{
auto sliceInputValue = Input(0)->ValueFor(fr);
auto sliceOutputValue = ValueFor(fr); // row vector
Matrix<ElemType>::VectorSum(sliceInputValue, sliceOutputValue, true);
}
virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase(isFinalValidationPass);
SetDims(TensorShape(1), Input(0)->HasMBLayout()); // each column is reduced to a scalar
}
};
template class SumColumnElementsNode<float>;
template class SumColumnElementsNode<double>;
// -----------------------------------------------------------------------
// TransposeDimensions (input, axis1, axis2)
// - swaps index dimensions axis1 and axis2. The values are 1-based; 1 stands for the leading dimension.

View file

@ -29,9 +29,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
enum GradientOperationType
{
UnaryGradient,
BinaryWithInputGradient,
BinaryWithOutputGradient
noGradient,
unaryGradient,
binaryWithInputGradient,
binaryWithOutputGradient
};
template <class ElemType, ElementWiseOperator opForward, ElementWiseOperator opBackward, GradientOperationType opType>
@ -56,19 +57,20 @@ public:
virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override
{
assert(inputIndex == 0);
inputIndex;
assert(inputIndex == 0), inputIndex;
// get the args
size_t rank = DetermineElementwiseTensorRank();
auto sliceOutputGrad = GradientTensorFor(rank, fr); // propagate from this one...
auto sliceInputGrad = Input(0)->GradientTensorFor(rank, fr); // ...to this one
// we expect a constant conditional expression here -- suppress the warning that leads to an error
// TODO: alternative: assign to a non-const variable and test that.
#pragma warning( push )
#pragma warning( disable : 4127 )
if (opType == UnaryGradient)
GradientOperationType opTypeHolder = opType; // preventing pragma warning C4127
if (opTypeHolder == noGradient)
{
// Do nothing
}
else if (opTypeHolder == unaryGradient)
{
sliceInputGrad.DoUnaryOpOf(1, sliceOutputGrad, 1, opBackward, opSum);
}
@ -76,11 +78,10 @@ public:
{
// If gradient can be compute from output rather than input, then that's better for mem sharing (and faster in most cases).
// Not possible for Cos().
auto sliceValue = (opType == BinaryWithOutputGradient) ? ValueTensorFor(rank, fr) : // using input or output value
auto sliceValue = (opType == binaryWithOutputGradient) ? ValueTensorFor(rank, fr) : // using input or output value
Input(0)->ValueTensorFor(rank, fr);
sliceInputGrad.DoBinaryOpOf(1, sliceOutputGrad, sliceValue, 1, opBackward, opSum);
}
#pragma warning( pop )
}
virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
@ -90,11 +91,11 @@ public:
virtual bool OutputUsedInComputingInputNodesGradients() const override
{
return opType == BinaryWithOutputGradient;
return opType == binaryWithOutputGradient;
}
virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override
{
return opType == BinaryWithInputGradient;
return opType == binaryWithInputGradient;
}
};
@ -107,6 +108,7 @@ public:
// RectifiedLinearNode (input)
// LogNode (input)
// ExpNode (input)
// FloorNode (input)
// CosineNode (input)
// SinNode (input)
// Abs(input)
@ -137,18 +139,19 @@ public:
}
// Name Forward and Backward opcodes Gradient optype
DeclareUnaryElementWiseWithOpCodeNode(Pass, Copy, Copy, UnaryGradient);
DeclareUnaryElementWiseWithOpCodeNode(Sigmoid, Sigmoid, ElementwiseProductWithSigmoidDerivativeFromOutput, BinaryWithOutputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Tanh, Tanh, ElementwiseProductWithTanhDerivativeFromOutput, BinaryWithOutputGradient);
DeclareUnaryElementWiseWithOpCodeNode(RectifiedLinear, LinearRectifier, ElementwiseProductWithLinearRectifierDerivativeFromOutput, BinaryWithOutputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Log, Log, ElementwiseProductWithLogDerivativeFromOutput, BinaryWithOutputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Exp, Exp, ElementwiseProduct, BinaryWithOutputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Cosine, Cosine, ElementwiseProductWithCosDerivative, BinaryWithInputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Sin, Sin, ElementwiseProductWithSinDerivative, BinaryWithInputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Abs, Abs, ElementwiseProductWithAbsDerivative, BinaryWithInputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Negate, Negate, Negate, UnaryGradient);
DeclareUnaryElementWiseWithOpCodeNode(Sqrt, Sqrt, ElementwiseProductWithSqrtDerivative, BinaryWithOutputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Reciprocal, Reciprocal, ElementwiseProductWithReciprocalDerivative, BinaryWithOutputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Abs, Abs, ElementwiseProductWithAbsDerivative, binaryWithInputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Cosine, Cosine, ElementwiseProductWithCosDerivative, binaryWithInputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Exp, Exp, ElementwiseProduct, binaryWithOutputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Floor, Floor, None, noGradient);
DeclareUnaryElementWiseWithOpCodeNode(Log, Log, ElementwiseProductWithLogDerivativeFromOutput, binaryWithOutputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Negate, Negate, Negate, unaryGradient);
DeclareUnaryElementWiseWithOpCodeNode(Pass, Copy, Copy, unaryGradient);
DeclareUnaryElementWiseWithOpCodeNode(Reciprocal, Reciprocal, ElementwiseProductWithReciprocalDerivative, binaryWithOutputGradient);
DeclareUnaryElementWiseWithOpCodeNode(RectifiedLinear, LinearRectifier, ElementwiseProductWithLinearRectifierDerivativeFromOutput, binaryWithOutputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Sigmoid, Sigmoid, ElementwiseProductWithSigmoidDerivativeFromOutput, binaryWithOutputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Sin, Sin, ElementwiseProductWithSinDerivative, binaryWithInputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Sqrt, Sqrt, ElementwiseProductWithSqrtDerivative, binaryWithOutputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Tanh, Tanh, ElementwiseProductWithTanhDerivativeFromOutput, binaryWithOutputGradient);
#pragma pop_macro("DeclareUnaryElementWiseWithOpCodeNode")
@ -423,4 +426,132 @@ public:
template class HardmaxNode<float>;
template class HardmaxNode<double>;
// -----------------------------------------------------------------------
// If (flag, ifValue, elseValue)
// -----------------------------------------------------------------------
// Similar to C's ternary operator "flag ? ifValue : elseValue": element-wise, if the first input is != 0, the second input is returned, else the third
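// Example with hypothetical element-wise values: flag = [1, 0, -2], ifValue = [10, 20, 30], elseValue = [1, 2, 3] -> result = [10, 2, 30].
// The three inputs may broadcast against each other (see ValidateNaryZip in Validate below).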
template <class ElemType>
class IfNode : public ComputationNode<ElemType>, public NumInputs<3>
{
typedef ComputationNode<ElemType> Base;
UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName()
{
return L"If";
}
public:
DeclareConstructorFromConfigWithNumInputs(IfNode);
IfNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name)
{
}
virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override { return childIndex == 0; }
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
{
size_t rank = DetermineElementwiseTensorRank();
auto result = ValueTensorFor(rank, fr);
auto input0 = Input(0)->ValueTensorFor(rank, fr.AllowBroadcast());
auto input1 = Input(1)->ValueTensorFor(rank, fr.AllowBroadcast());
auto input2 = Input(2)->ValueTensorFor(rank, fr.AllowBroadcast());
result.AssignCondOf(input0, input1, input2);
}
virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override
{
if (inputIndex == 0) // derivative of the first input (the flag) is always 0 => no action.
return;
size_t rank = DetermineElementwiseTensorRank();
auto gradient = GradientTensorFor(rank, fr);
auto input0 = Input(0)-> ValueTensorFor(rank, fr.AllowBroadcast());
auto inputGradient = Input(inputIndex)->GradientTensorFor(rank, fr.AllowBroadcast());
// if reduction then mask the respective input(s) (zero out the gaps)
if (Input(inputIndex)->ReducesInTimeWrt(shared_from_this()))
MaskMissingGradientColumnsToZero(fr);
if (inputIndex == 1)
{
inputGradient.AddCopyIfOf(input0, gradient);
}
else if (inputIndex == 2)
{
inputGradient.AddCopyIfNotOf(input0, gradient);
}
}
virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
{
ValidateNaryZip(isFinalValidationPass, /* allow broadcast */ true, /* num Inputs */ 3);
}
};
template class IfNode<float>;
template class IfNode<double>;
// -----------------------------------------------------------------------
// ClipNode (minValue, maxValue, tensor)
// -----------------------------------------------------------------------
// This node clips the values in a tensor element-wise to ensure they are within minValue <= x <= maxValue
// The gradient (per element) is (ge(x, minValue) AND le(x, maxValue)), or in other words, 1 if the value has
// not been clipped, and 0 if the value has been clipped.
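// Example with hypothetical values: minValue = 0, maxValue = 1, tensor = [-0.5, 0.3, 2.0] -> output = [0, 0.3, 1];
// only the middle element was left unclipped, so only it receives a non-zero gradient.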
template <class ElemType>
class ClipNode : public ComputationNode<ElemType>, public NumInputs<3>
{
typedef ComputationNode<ElemType> Base;
UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName()
{
return L"Clip";
}
public:
DeclareConstructorFromConfigWithNumInputs(ClipNode);
ClipNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name)
{
}
virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
{
size_t rank = DetermineElementwiseTensorRank();
auto result = ValueTensorFor(rank, fr);
auto input0 = Input(0)->ValueTensorFor(rank, fr.AllowBroadcast());
auto input1 = Input(1)->ValueTensorFor(rank, fr.AllowBroadcast());
auto input2 = Input(2)->ValueTensorFor(rank, fr.AllowBroadcast());
result.AssignClipOf(input0, input1, input2);
}
virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override
{
// there is only a gradient for the input tensor that is to be clipped
if (inputIndex == 2)
{
size_t rank = DetermineElementwiseTensorRank();
auto gradient = GradientTensorFor(rank, fr);
auto inputGradient = Input(inputIndex)->GradientTensorFor(rank, fr.AllowBroadcast());
auto input = Input(inputIndex)->ValueTensorFor(rank, fr.AllowBroadcast());
auto output = ValueTensorFor(rank, fr.AllowBroadcast());
inputGradient.AddCopyIfEqualOf(input, output, gradient);
}
}
virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
{
ValidateNaryZip(isFinalValidationPass, /* allow broadcast */ true, /* num Inputs */ 3);
}
};
template class ClipNode<float>;
template class ClipNode<double>;
}}}

View file

@ -23,6 +23,133 @@
namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
// ReduceElements (op, axis=, input)
// -----------------------------------------------------------------------
template <class ElemType>
/*virtual*/ void ReduceElementsNode<ElemType>::CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const /*override*/
{
Base::CopyTo(nodeP, newName, flags);
auto node = dynamic_pointer_cast<ReduceElementsNode<ElemType>>(nodeP);
node->m_axis = m_axis;
node->m_operation = m_operation;
node->m_op = m_op;
}
template <class ElemType>
/*virtual*/ void ReduceElementsNode<ElemType>::Load(File& fstream, size_t modelVersion) /*override*/
{
Base::Load(fstream, modelVersion);
fstream >> m_axis >> m_operation;
ValidateOp();
}
template <class ElemType>
/*virtual*/ void ReduceElementsNode<ElemType>::Save(File& fstream) const /*override*/
{
Base::Save(fstream);
fstream << m_axis << m_operation; // note: we serialize the string and not the opcode, since opcodes may change
}
template <class ElemType>
/*virtual*/ void ReduceElementsNode<ElemType>::ForwardProp(const FrameRange& fr) /*override*/
{
// get the args
size_t rank = DetermineElementwiseTensorRank();
auto result = ValueTensorFor(rank, fr);
auto input = Input(0)->ValueTensorFor(rank, fr);
// the actual operation is a Copy with a reduction op
result.DoUnaryOpOf(0, input, 1, ElementWiseOperator::opCopy, m_op);
// note: we can implement "Mean" by passing 1/dim for alpha
}
template <class ElemType>
/*virtual*/ void ReduceElementsNode<ElemType>::BackpropTo(const size_t inputIndex, const FrameRange& fr) /*override*/
{
assert(inputIndex == 0), inputIndex;
// get the args
size_t rank = DetermineElementwiseTensorRank();
auto sliceOutputGrad = GradientTensorFor(rank, fr); // propagate from this one...
auto sliceInputGrad = Input(0)->GradientTensorFor(rank, fr); // ...to this one
// gradients are not as simple as passing an op-code, unfortunately
switch (m_op)
{
case ElementWiseOperator::opSum:
// "Plus": broadcast the gradient
sliceInputGrad.AddCopyOf(sliceOutputGrad);
break;
// more coming
// "LogPlus": softmax
// f(x) = log(sum_i exp x_i), hence gradient is:
// df / dx_i = 1 / (sum_j exp x_j) * exp x_i = (Softmax(x))_i = exp(x_i - ReduceLogPlus(x))
// targetGradient = gradientFromTop .* Exp (inputValue - outputValue) --TODO: verify
// i.e. compute the difference of input and output, then Exp in-place. No, would need temp memory. So needs its own opcode AddScaledExpOfDiff(). Ternary.
// "Max": Copy the gradient only to the max value. targetGradient += gradientFromTop .* (outputValue == inputValue). Needs its own opcode. --TODO: verify
}
}
template <class ElemType>
/*virtual*/ bool ReduceElementsNode<ElemType>::OutputUsedInComputingInputNodesGradients() const /*override*/
{
switch (m_op)
{
case ElementWiseOperator::opSum: return false;
// will be different e.g. for LogPlus, Max, and Min
}
LogicError("Should not get here.");
}
template <class ElemType>
/*virtual*/ bool ReduceElementsNode<ElemType>::InputUsedInComputingInputNodesGradients(size_t inputIndex) const /*override*/
{
switch (m_op)
{
case ElementWiseOperator::opSum: return false;
// will be different for LogPlus, Max, and Min
}
LogicError("Should not get here.");
}
// map the operation, specified as a string, to an ElementWiseOperator opcode
template <class ElemType>
void ReduceElementsNode<ElemType>::ValidateOp()
{
if (m_operation == L"Plus") m_op = ElementWiseOperator::opSum;
// more here
else InvalidArgument("%ls was given an invalid operation code '%ls'. Allowed are: 'Plus'. And a few more soon.", NodeDescription().c_str(), m_operation.c_str());
}
template <class ElemType>
/*virtual*/ void ReduceElementsNode<ElemType>::Validate(bool isFinalValidationPass) /*override*/
{
Base::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase(isFinalValidationPass);
// validate the opcode (in case we got instantiated empty and never updated)
ValidateOp();
let shape = Input(0)->GetSampleLayout();
auto dims = shape.GetDims();
if (m_axis == 0)
dims = { 1 }; // entire sample is reduced to a scalar
else if (m_axis - 1 >= 0 && m_axis - 1 < dims.size())
dims[m_axis - 1] = 1; // one axis is reduced to a scalar
else if (isFinalValidationPass)
InvalidArgument("The shape of %ls [%s] has no axis %d", NodeDescription().c_str(), string(shape).c_str(), m_axis);
SetDims(TensorShape(dims), Input(0)->HasMBLayout());
}
template class ReduceElementsNode<float>;
template class ReduceElementsNode<double>;
// -----------------------------------------------------------------------
// Where(bitVector) -- extract indices of non-0 values in a sequence
// -----------------------------------------------------------------------
@ -74,7 +201,7 @@ template <class ElemType>
indexSequence.push_back(t);
// Note: The above accesses m_value directly on the CPU, putting it into BOTH state, possibly for other consumers as well.
}
input.CollapseDataLocationAfterWriting(); // BUGBUG: Move back, since BOTH state is broken at present.
input.CollapseDataLocation(); // BUGBUG: Move back, since BOTH state is broken at present.
// create a new MBLayout
let& outMBLayout = GetMBLayout();
outMBLayout->InitAsPackedSequences(SequenceLengthVector(sequences, indexSequences), /*temp*/m_placementBuffer, /*temp*/m_rowAllocationsBuffer);
@ -158,7 +285,7 @@ template <class ElemType>
}
}
// Note: maybe this is no longer needed, now that we do the same inside UpdateFunctionValueSize() for all nodes.
result.CollapseDataLocationAfterWriting(); // BUGBUG: Move back, since BOTH state is broken at present.
result.CollapseDataLocation(); // BUGBUG: Move back, since BOTH state is broken at present.
}
template <class ElemType>
@ -223,12 +350,12 @@ template <class ElemType>
// inherit MBLayout from indexData
m_pMBLayout = Input(INDEXDATA)->GetMBLayout();
if (isFinalValidationPass && (!Input(INDEXDATA)->HasMBLayout()))
LogicError("%ls requires first argument (index data) to have a time dimension.", this->NodeDescription().c_str());
LogicError("%ls requires first argument (index data) to have a time dimension.", NodeDescription().c_str());
bool sourceHasTimeDimension = Input(SOURCEDATA)->HasMBLayout();
if (isFinalValidationPass && Input(INDEXDATA)->GetSampleLayout().GetNumElements() != 1)
InvalidArgument("%ls requires the first argument (index data) to be a scalar time sequence.", this->NodeDescription().c_str());
InvalidArgument("%ls requires the first argument (index data) to be a scalar time sequence.", NodeDescription().c_str());
// inherit tensor dimension from sourceData, minus the last (column or time) dimension. TODO this needs to become simpler...
if (sourceHasTimeDimension)

View file

@ -170,6 +170,56 @@ private:
template class ReshapeNode<float>;
template class ReshapeNode<double>;
// -----------------------------------------------------------------------
// ReduceElements (op, axis=, input)
// Reduces (e.g. sums up) all elements in each sample (column) of the input.
// The optional axis can be 0 (meaning all elements) or a specific axis.
// Allowed operations:
// - "Plus"
// - "LogPlus" --not implemented yet
// - "Mean" --not implemented yet
// - "Max" --not implemented yet
// - "Min" --not implemented yet
// - "All" --not implemented yet
// - "Any" --not implemented yet
// -----------------------------------------------------------------------
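// Shape example (hypothetical input with sample layout [5 x 7], following Validate() in ReshapingNodes.cpp):
//   axis=1 -> [1 x 7], axis=2 -> [5 x 1], axis=0 -> [1] (the entire sample reduces to a scalar).
// Only "Plus" is accepted as the operation so far; see ValidateOp().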
template <class ElemType>
class ReduceElementsNode : public ComputationNode<ElemType>, public NumInputs<1>
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName() { return L"ReduceElements"; }
void ValidateOp();
public:
ReduceElementsNode(DEVICEID_TYPE deviceId, const wstring& name, const std::wstring& operation = std::wstring(), int axis = 0) :
Base(deviceId, name), m_operation(operation), m_axis(axis), m_op((ElementWiseOperator)-1/*invalid*/)
{
if (!m_operation.empty()) // verify validity already here out of courtesy (would otherwise be caught in Validate())
ValidateOp();
}
ReduceElementsNode(const ScriptableObjects::IConfigRecordPtr configp) :
ReduceElementsNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"reductionOp"), configp->Get(L"axis"))
{
AttachInputsFromConfig(configp, this->GetExpectedNumInputs());
}
virtual void /*ComputationNodeBase::*/ CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override;
virtual void /*ComputationNodeBase::*/ Load(File& fstream, size_t modelVersion) override;
virtual void /*ComputationNodeBase::*/ Save(File& fstream) const override;
virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override;
virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override;
virtual bool /*ComputationNodeBase::*/ OutputUsedInComputingInputNodesGradients() const override;
virtual bool /*ComputationNodeBase::*/ InputUsedInComputingInputNodesGradients(size_t childIndex) const override;
virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override;
private:
int m_axis;
std::wstring m_operation; // the operation as a string, e.g. "Plus"; mapped to m_op by ValidateOp()
ElementWiseOperator m_op; // the operation mapped to our internal opCode
};
// -----------------------------------------------------------------------
// ReconcileDynamicAxis (dataInput, layoutInput)
// This node copies data from 'dataInput' while it propagates the minibatch-layout information from 'layoutInput'.
@ -1320,10 +1370,10 @@ reshaping
reductions
----------
- ReduceSum
- these are/will be implemented as a node for samples, and as recurrences for sequences
- ReducePlus
- sum over all elements of a dimension, or over time
- we already got: SumColumnElements
- ReduceMax
- ReduceMax, ReduceMin
- max
- can use MaxPooling?
- ReduceMean
@ -1332,12 +1382,12 @@ reductions
- ArgMax, ArgMin
- we already have that somewhere, for evaluation
- All, Any
- logical test --must be done over sequences
- logical test
- TF also has:
- reduce_prod, reduce_min
- reduce_prod
- segment_sum etc.; we use sequences
- listdiff
- where: indices of 'true' values -> 2D tensor of coordinates
- where: indices of 'true' values -> 2D tensor of coordinates (unlike our Where)
- unique (1D only)
- edit_distance
- invert_permutation: invert a permutation index vector

View file

@ -1451,7 +1451,11 @@ public:
Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
if (m_dropoutRate > 0)
if (Environment().IsInferring() || m_dropoutRate <= 0)
{
sliceOutputValue.SetValue(sliceInput0Value);
}
else
{
// determine drop-out mask for this minibatch
auto sliceMask = DataFor(*m_maskOfDropout, fr);
@ -1460,10 +1464,6 @@ public:
// apply dropout mask
sliceOutputValue.AssignElementProductOf(sliceMask, sliceInput0Value);
}
else
{
sliceOutputValue.SetValue(sliceInput0Value);
}
}
virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override

View file

@ -100,6 +100,11 @@ void CNTKEval<ElemType>::GetNodeDimensions(std::map<std::wstring, size_t>& dimen
{
case nodeInput:
{
if (outputNodes.size() == 0)
{
LogicError("No Output nodes found: Cannot determine Input node dimensions due to lack of Output nodes.\n(are 'outputNodeNames' and/or 'OutputNodes' properly defined in the configuration file?)");
}
auto& nodes = m_net->InputNodes(outputNodes[0]);
for (auto& node : nodes)
{

View file

@ -4,6 +4,10 @@
//
// CNTKEval.h - Include file for the CNTK Evaluation DLL
//
// NOTICE: This interface is a public interface for evaluating models in CNTK.
// Changes to this interface may affect other projects, such as Argon and LatGen,
// and therefore need to be communicated with such groups.
//
#pragma once
#include <string>

View file

@ -75,7 +75,7 @@
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>ComputationNetworkLib.lib; Math.lib; ActionsLib.lib; kernel32.lib; user32.lib; shell32.lib; SequenceTrainingLib.lib; %(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>ComputationNetworkLib.lib; Math.lib; Common.lib; ActionsLib.lib; kernel32.lib; user32.lib; shell32.lib; SequenceTrainingLib.lib; %(AdditionalDependencies)</AdditionalDependencies>
<DelayLoadDLLs>Math.dll; nvml.dll; $(CudaRuntimeDll)</DelayLoadDLLs>
</Link>
</ItemDefinitionGroup>
@ -99,7 +99,7 @@
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>ComputationNetworkLib.lib; Math.lib; ActionsLib.lib; kernel32.lib; user32.lib; shell32.lib; SequenceTrainingLib.lib; %(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>ComputationNetworkLib.lib; Math.lib; Common.lib; ActionsLib.lib; kernel32.lib; user32.lib; shell32.lib; SequenceTrainingLib.lib; %(AdditionalDependencies)</AdditionalDependencies>
<Profile>true</Profile>
<DelayLoadDLLs>Math.dll; nvml.dll; $(CudaRuntimeDll)</DelayLoadDLLs>
</Link>
@ -120,7 +120,7 @@
<AdditionalLibraryDirectories>%(AdditionalLibraryDirectories);$(CudaLibPath)</AdditionalLibraryDirectories>
</Link>
<PostBuildEvent>
<Command>if exist "%ProgramW6432%\NVIDIA Corporation\NVSMI" xcopy /I /D /Y "%ProgramW6432%\NVIDIA Corporation\NVSMI\nvml*.dll" $(TargetDir)</Command>
<Command>if exist "%ProgramW6432%\NVIDIA Corporation\NVSMI" xcopy /I /D /Y "%ProgramW6432%\NVIDIA Corporation\NVSMI\nvml*.dll" "$(TargetDir)"</Command>
<Message>Copying NVidia GDK extension DLL to target folder</Message>
</PostBuildEvent>
</ItemDefinitionGroup>
@ -140,22 +140,6 @@
<ItemGroup>
<ClCompile Include="..\CNTK\BrainScript\BrainScriptEvaluator.cpp" />
<ClCompile Include="..\CNTK\BrainScript\BrainScriptParser.cpp" />
<ClCompile Include="..\Common\Config.cpp" />
<ClCompile Include="..\Common\DataReader.cpp" />
<ClCompile Include="..\Common\Eval.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug_CpuOnly|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release_CpuOnly|x64'">true</ExcludedFromBuild>
</ClCompile>
<ClCompile Include="..\Common\ExceptionWithCallStack.cpp" />
<ClCompile Include="..\Common\File.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\Common\fileutil.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\Common\TimerUtility.cpp" />
<ClCompile Include="dllmain.cpp">
<CompileAsManaged>false</CompileAsManaged>
<PrecompiledHeader>

View file

@ -4135,6 +4135,111 @@ void CPUMatrix<ElemType>::ConvolutionBackwardKernel(const CPUMatrix<ElemType>& i
}
}
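// The Unroll* methods below support the GEMM convolution engine (see ConvolutionEngine.cpp): they rewrite
// convolution inputs/outputs into unrolled matrices so that the convolution and its gradients can each be
// computed as a single GEMM call. Per output location, mpRowCol holds the base input column while
// mpRowRun/runs encode the kernel offsets and a mask of valid (non-padded) positions.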
template <class ElemType>
void CPUMatrix<ElemType>::UnrollConvolutionInput(size_t unrollCols, size_t mapOutSize, const CPUMatrix<int>& mpRowCol,
const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& output) const
{
size_t batchSize = GetNumCols();
#pragma omp parallel for
for (int64_t sample = 0; sample < (int64_t)batchSize; sample++)
{
for (size_t row = 0; row < mapOutSize; row++)
{
int colBase = mpRowCol(row, 0);
assert(0 <= colBase && colBase < GetNumRows());
int i0 = mpRowRun(row, 0);
int skip = runs(i0++, 0);
int size = runs(i0++, 0);
int imask = i0 + size;
for (int i = 0; i < size; i++)
{
if (runs(imask + i, 0) == 0)
continue;
int dcol = runs(i0 + i, 0);
assert(0 <= colBase + dcol && colBase + dcol < GetNumRows());
output.Data()[(row * batchSize + sample) * unrollCols + skip + i] = (*this)(colBase + dcol, sample);
}
}
}
}
template <class ElemType>
void CPUMatrix<ElemType>::UnrollConvolutionOutput(size_t unrollCols, size_t mapInCount, size_t mapOutCount, const CPUMatrix<int>& mpRowCol,
const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& output) const
{
assert((mpRowCol.GetNumRows() % mapOutCount) == 0);
size_t mapOutSize = mpRowCol.GetNumRows() / mapOutCount;
size_t batchSize = GetNumCols();
size_t kernelSize = runs(1, 0);
assert((kernelSize % mapInCount) == 0);
size_t kernelMapSize = kernelSize / mapInCount;
#pragma omp parallel for
for (int64_t sample = 0; sample < (int64_t)GetNumCols(); sample++)
{
for (size_t row = 0; row < mapOutSize; row++)
{
int colBase = mpRowCol(row, 0);
int i0 = mpRowRun(row, 0);
int skip = runs(i0++, 0);
int size = runs(i0++, 0);
int imask = i0 + size;
for (int i = 0; i < std::min(size, (int)kernelMapSize); i++)
{
if (runs(imask + i, 0) == 0)
continue;
int dcol = runs(i0 + i, 0);
size_t isrc = row;
size_t idst = ((colBase + dcol) * batchSize + sample) * unrollCols + ((skip + i) % kernelMapSize) * mapOutCount;
for (size_t outMap = 0; outMap < mapOutCount; outMap++, isrc += mapOutSize)
{
assert(isrc < GetNumElements());
assert(idst + outMap < output.GetNumElements());
output.Data()[idst + outMap] = (*this)(isrc, sample);
}
}
}
}
}
template <class ElemType>
void CPUMatrix<ElemType>::UnrollConvolutionInputForKernelBackprop(size_t mapOutSize, const CPUMatrix<int>& mpRowCol,
const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& output) const
{
size_t batchSize = GetNumCols();
size_t unrollCols = mapOutSize * batchSize;
#pragma omp parallel for
for (int64_t sample = 0; sample < (int64_t)batchSize; sample++)
{
for (size_t row = 0; row < mapOutSize; row++)
{
int colBase = mpRowCol(row, 0);
assert(0 <= colBase && colBase < GetNumRows());
int i0 = mpRowRun(row, 0);
int skip = runs(i0++, 0);
int size = runs(i0++, 0);
int imask = i0 + size;
for (int i = 0; i < size; i++)
{
if (runs(imask + i, 0) == 0)
continue;
int dcol = runs(i0 + i, 0);
assert(0 <= colBase + dcol && colBase + dcol < GetNumRows());
size_t idst = (skip + i) * unrollCols + row * batchSize + sample;
assert(idst < output.GetNumElements());
output.Data()[idst] = (*this)(colBase + dcol, sample);
}
}
}
}
template <class ElemType>
void CPUMatrix<ElemType>::MaxPoolingForward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices, CPUMatrix<ElemType>& output) const
{

View file

@ -348,6 +348,13 @@ public:
void ConvolutionBackwardKernel(const CPUMatrix<ElemType>& in, const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIwht,
const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& kernelGrad) const;
void UnrollConvolutionInput(size_t unrollCols, size_t mapOutSize, const CPUMatrix<int>& mpRowCol,
const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& output) const;
void UnrollConvolutionOutput(size_t unrollCols, size_t mapInCount, size_t mapOutCount, const CPUMatrix<int>& mpRowCol,
const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& output) const;
void UnrollConvolutionInputForKernelBackprop(size_t mapOutSize, const CPUMatrix<int>& mpRowCol,
const CPUMatrix<int>& mpRowRun, const CPUMatrix<int>& runs, CPUMatrix<ElemType>& output) const;
void MaxPoolingForward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices, CPUMatrix<ElemType>& output) const;
void MaxPoolingBackward(const CPUMatrix<ElemType>& out, const CPUMatrix<ElemType>& in,
const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices,

View file

@ -266,6 +266,38 @@ void CPUSparseMatrix<ElemType>::SetValue(const CPUSparseMatrix<ElemType>& v)
}
}
template <class ElemType>
void CPUSparseMatrix<ElemType>::MaskColumnsValue(const CPUMatrix<char>& columnsMask, ElemType val)
{
VerifyWritable(__func__);
size_t n = GetNumCols();
if (n != columnsMask.GetNumCols())
RuntimeError("Matrix and column mask must have equal number of columns.");
if (val != 0)
LogicError("MaskColumnsValue is not implemented for a non-zero mask for sparse matrices.");
#ifdef _DEBUG
if (GetFormat() == MatrixFormat::matrixFormatSparseCSC)
{
// Get the binary columns mask
char* maskedCols = columnsMask.Data();
// If we're CSC, we only need to verify that the columns to be zeroed are empty.
GPUSPARSE_INDEX_TYPE* colVector = SecondaryIndexLocation();
#pragma omp parallel for
for (long j = 0; j < n; j++)
if (maskedCols[j] == 0 && colVector[j + 1] != colVector[j])
LogicError("CPUSparseMatrix attempted to mask column %d, but it has %d elements in it.", (int)j, (int)(colVector[j + 1] - colVector[j]));
}
else
NOT_IMPLEMENTED;
#endif
}
template <class ElemType>
void CPUSparseMatrix<ElemType>::Print(const char* matrixName) const
{

View file

@ -83,6 +83,7 @@ public:
void SetValue(const size_t row, const size_t col, ElemType val);
void SetValue(const CPUSparseMatrix<ElemType>& /*val*/);
void MaskColumnsValue(const CPUMatrix<char>& columnsMask, ElemType val);
size_t BufferSize() const
{

View file

@ -73,15 +73,15 @@ private:
enum ElementWiseOperator
{
// nullary
opConstOne,
opConstOne, opNone,
// unary (or binary with constant parameter)
opCopy,
opNegate, opNot, opAbs, opReciprocal,
opNegate, opNot, opAbs, opFloor, opReciprocal,
opSigmoid, opTanh, opSqr, opSqrt, opExp, opLog, opLinearRectifier, opCosine, opSin,
// unary ops for use by Matrix class only (there is no TensorView implementation)
opSigmoidDerivative, opLinearRectifierDerivative, opNegativeSine,
// binary
opSum, opDifference, opElementwiseProduct, opElementwiseQuotient, opLogSum,
opCopyIf, opCopyIfNot, opSum, opDifference, opElementwiseProduct, opElementwiseQuotient, opLogSum,
opMax, opMin,
opLT, opEQ, opGT, opGE, opNE, opLE, // Note: must obey this order: (sgn(a-b) == -1, 0, +1), (sgn(a-b) != -1, 0, +1)
opAnd, opOr, opXor, opMaskNegative,
@ -95,7 +95,8 @@ enum ElementWiseOperator
// ternary
opCond /*a ? b : c*/,
opClip, /*clip a within interval b..c*/
opElementwiseProductWithLogSumDerivative
opElementwiseProductWithLogSumDerivative,
opCopyIfEqual
// Note: not all that's implemented in CNTK ComputationNodes has an opcode yet.
};
@ -108,6 +109,7 @@ enum ElementWiseOperator
Macro(Negate); \
Macro(Not); \
Macro(Abs); \
Macro(Floor); \
Macro(Reciprocal); \
Macro(Sigmoid); \
Macro(Tanh); \
@ -120,6 +122,8 @@ enum ElementWiseOperator
Macro(Sin);
#define ForAllBinaryOps(Macro) \
Macro(CopyIf); \
Macro(CopyIfNot); \
Macro(Sum); \
Macro(Difference); \
Macro(ElementwiseProduct); \
@ -151,6 +155,7 @@ enum ElementWiseOperator
#define ForAllTernaryOps(Macro) \
Macro(Cond); \
Macro(CopyIfEqual); \
Macro(Clip); \
Macro(ElementwiseProductWithLogSumDerivative);

View file

@ -210,13 +210,13 @@ protected:
InvalidArgument("Pooling type %d is not supported.", (int)m_poolKind);
}
private:
protected:
static bool IsGpu(DEVICEID_TYPE deviceId)
{
return deviceId >= 0;
}
private:
protected:
using IntMatPtr = std::unique_ptr<Matrix<int>>;
Matrix<int> m_mpRowCol;
@ -511,6 +511,308 @@ private:
bool m_gpuSparse1D;
};
//------------------------------------------------------------------
// GEMM convolution engine implementation.
// This engine supports arbitrary convolution configurations with full
// sharing and is implemented using the unroll + GEMM technique
// (High performance convolutional neural networks for document processing; Chellapilla, Puri, Simard)
// Uses reference engine for pooling operations.
//------------------------------------------------------------------
template <class ElemType>
class GemmConvolutionEngine : public ReferenceConvolutionEngine<ElemType>
{
public:
using Base = ReferenceConvolutionEngine<ElemType>;
using typename Base::Mat;
public:
GemmConvolutionEngine(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, PoolKind poolKind)
: Base(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind)
{
}
protected:
using typename Base::IntMatPtr;
using Base::IsGpu;
using Base::m_geometry;
using Base::m_deviceId;
using Base::m_imageLayout;
using Base::m_maxTempMemSizeInSamples;
using Base::m_mpRowCol;
using Base::m_mpRowIwht;
using Base::m_mpRowRun;
using Base::m_runs;
void EnsureCompatible() override
{
if (m_imageLayout != ImageLayoutKind::CHW)
LogicError("GEMM convolution engine supports only CHW/cudnn layout.");
if (IsGpu(m_deviceId))
LogicError("GEMM convolution engine currently supports only CPU device.");
}
// A note on notation used in the documentation for the next 3 functions:
// for simplicity we use cuDNN-style notation for 2D convolutions (though this engine supports arbitrary convolution configuration)
// where N - is the number of samples in a batch, C, H, W are number of channels, height and width of the input respectively.
// For the output we use K as the number of output feature maps and H', W' as height and width of the output.
// We also use column-major notation everywhere (as opposed to cuDNN which uses row-major) to follow CNTK rules.
// For kernels we use X, Y, Z to represent width, height and depth. This engine requires Z == C which is
// not a significant restriction as tensors of higher dimensions (+1) can be used to describe the same convolution configuration.
// Example: [WHC x N] - is a matrix of WHC rows by N columns and represents a convolution input
// where each column is a sample that has layout of WHC, so W dimension stride is 1.
//
// The forward method consists of 3 parts:
// 1. Unrolling convolution input (in) into a matrix: [WHC x N] -> [XYC x NW'H']
// Using this format allows to perform convolution for the whole minibatch as a single GEMM operation
// which is not possible with WHCN format. Alternatively, CWHN format (used in legacy engine) could be used
// but this would require both unrolling the input and transforming the weight matrix.
// 2. Performing matrix multiplication of unrolled input with weight matrix:
// [XYC x NW'H']^T * [XYC x K] -> [NW'H' x K]
// 3. Reshape and transpose result: [NW'H' x K] -> [N x W'H'K]^T -> [W'H'K x N]
// In case minibatch size == 1 this step is not required and step 2 writes results directly to output (out).
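// Worked example with hypothetical sizes (N=2, C=3, H=W=5, a 3x3 kernel, K=4 output maps, stride 1,
// no padding, so W'=H'=3): the unrolled input is [XYC x NW'H'] = [27 x 18], the kernel matrix is
// [XYC x K] = [27 x 4], the GEMM yields [NW'H' x K] = [18 x 4], which is reshaped and transposed
// into the output [W'H'K x N] = [36 x 2].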
void ForwardCore(const Mat& in, const Mat& kernel, Mat& out, Mat& workspace) override
{
size_t batchSize = in.GetNumCols();
size_t subBatchSize = m_maxTempMemSizeInSamples == 0 ? batchSize : min(batchSize, m_maxTempMemSizeInSamples);
size_t mapCount = m_geometry->GetMapCount(m_geometry->InputShape().GetRank() - 1);
size_t mapOutSize = m_geometry->OutputShape().GetNumElements() / mapCount;
size_t unrollRows = mapOutSize * subBatchSize;
size_t unrollCols = m_geometry->KernelShape().GetNumElements();
// Reserve space for unrolled inputs and, if needed, intermediate outputs.
// Intermediate outputs will be transposed to final outputs after GEMM operation.
// Transpose is not required if subBatchSize == 1.
workspace.Resize(unrollRows, unrollCols + (subBatchSize > 1 ? mapCount : 0));
for (size_t start = 0; start < batchSize; start += subBatchSize)
{
size_t curBatchSize = min(subBatchSize, batchSize - start);
auto inputSlice = in.ColumnSlice(start, curBatchSize);
auto unrolledInput = workspace.ColumnSlice(0, unrollCols);
if (curBatchSize != subBatchSize)
{
unrolledInput.Reshape(mapOutSize, subBatchSize * unrollCols);
unrolledInput = unrolledInput.ColumnSlice(0, curBatchSize * unrollCols);
}
// Need to reshape (soft transpose) as matrices are column-major.
unrolledInput.Reshape(unrollCols, mapOutSize * curBatchSize);
// Unroll inputs.
unrolledInput.SetValue(0);
inputSlice.UnrollConvolutionInput(unrollCols, mapOutSize, m_mpRowCol, *m_mpRowRun, *m_runs, unrolledInput);
// cudnn layout uses row-major kernel weight matrix.
auto kern = kernel.ColumnSlice(0, kernel.GetNumCols());
kern.Reshape(kernel.GetNumCols(), kernel.GetNumRows());
// Perform matrix multiplication of unrolled inputs with weights.
// If there is just one sample in the sub-batch then compute result directly to the output matrix.
if (curBatchSize == 1)
{
auto outSlice = out.ColumnSlice(start, 1);
outSlice.Reshape(mapOutSize, mapCount);
Mat::Multiply(unrolledInput, true, kern, false, outSlice);
}
else
{
auto outTempSlice = workspace.ColumnSlice(unrollCols, mapCount);
if (curBatchSize != subBatchSize)
{
outTempSlice.Reshape(mapOutSize, subBatchSize * mapCount);
outTempSlice = outTempSlice.ColumnSlice(0, curBatchSize * mapCount);
outTempSlice.Reshape(mapOutSize * curBatchSize, mapCount);
}
Mat::Multiply(unrolledInput, true, kern, false, outTempSlice);
outTempSlice.Reshape(curBatchSize, mapOutSize * mapCount);
auto outSlice = out.ColumnSlice(start, curBatchSize);
outSlice.AssignTransposeOf(outTempSlice);
}
}
}
// The backward data method works by representing this operation as a "reverse" convolution
// in case kernel's last dimension is equal to input dimension. Gradients matrix (grad) becomes
// an output of such reverse convolution.
// There are 4 steps:
// 1. Transpose and reshape kernel weights: [XYC x K]^T -> [K x XYC] -> [KXY x C]
// 2. Unroll convolution output (here source gradients, srcGrad):
// [W'H'K x N] -> [KXY x NWH]
// 3. Performing matrix multiplication of unrolled scrGrad with transposed weights:
// [KXY x NWH]^T * [KXY x C] -> [NWH x C]
// 4. Reshape and transpose outputs (grad): [NWH x C] -> [N x WHC]^T -> [WHC x N]
// In case minibatch size == 1 this step is not required and step 3 writes results directly to output (grad).
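// Continuing the hypothetical example above (N=2, C=3, H=W=5, 3x3 kernel, K=4, W'=H'=3): the
// transposed/reshaped kernel is [KXY x C] = [36 x 3], the unrolled source gradients are
// [KXY x NWH] = [36 x 50], the GEMM yields [NWH x C] = [50 x 3], which is reshaped and transposed
// into the input gradients [WHC x N] = [75 x 2].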
void BackwardDataCore(const Mat& srcGrad, const Mat& kernel, Mat& grad, Mat& workspace) override
{
size_t batchSize = srcGrad.GetNumCols();
size_t subBatchSize = m_maxTempMemSizeInSamples == 0 ? batchSize : min(batchSize, m_maxTempMemSizeInSamples);
const auto& inT = m_geometry->InputShape();
const auto& kernT = m_geometry->KernelShape();
size_t dimCount = inT.GetRank();
assert(kernT[dimCount - 1] == inT[dimCount - 1]);
if (kernT[dimCount - 1] != inT[dimCount - 1])
{
RuntimeError("GEMM convolution engine does not support this convolution configuration. "
"It is possible to make GEMM engine work with this configuration by defining "
"input/output/kernel using tensors of higher(+1) dimension. Geometry: %s", ((string)*m_geometry).c_str());
}
size_t mapInCount = kernT[dimCount - 1];
size_t mapOutCount = m_geometry->GetMapCount(dimCount - 1);
size_t mapInSize = inT.GetNumElements() / mapInCount;
size_t unrollRows = mapInSize * subBatchSize;
size_t unrollCols = kernel.GetNumElements() / mapInCount;
// Reserve space for:
// 1. Transposed kernel weights.
// 2. Unrolled source gradients.
// 3. Intermediate gradients (optional).
// Intermediate outputs will be transposed to final outputs after GEMM operation.
// Transpose is not required if subBatchSize == 1.
size_t kernCols = kernel.GetNumElements();
workspace.Resize(1, kernCols + unrollRows * (unrollCols + (subBatchSize > 1 ? mapInCount : 0)));
auto kern = kernel.ColumnSlice(0, kernel.GetNumCols());
// cudnn layout uses row-major kernel weight matrix.
kern.Reshape(kernel.GetNumCols(), kernel.GetNumRows());
// Now transpose and reshape to [KXY x C].
auto kernTran = workspace.ColumnSlice(0, kernCols);
// Reshape to transpose shape, AssignTransposeOf requires that.
kernTran.Reshape(kern.GetNumCols(), kern.GetNumRows());
kernTran.AssignTransposeOf(kern);
kern = kernTran.ColumnSlice(0, kernTran.GetNumCols());
// Reshape to final shape.
kern.Reshape(kernel.GetNumElements() / mapInCount, mapInCount);
for (size_t start = 0; start < batchSize; start += subBatchSize)
{
size_t curBatchSize = min(subBatchSize, batchSize - start);
auto srcGradSlice = srcGrad.ColumnSlice(start, curBatchSize);
auto unrolledSrcGrad = workspace.ColumnSlice(kernCols, unrollRows * unrollCols);
if (curBatchSize != subBatchSize)
unrolledSrcGrad = unrolledSrcGrad.ColumnSlice(0, mapInSize * curBatchSize * unrollCols);
// Need to reshape (soft transpose) as matrices are column-major.
unrolledSrcGrad.Reshape(unrollCols, mapInSize * curBatchSize);
// Unroll outputs (source gradients).
unrolledSrcGrad.SetValue(0);
srcGradSlice.UnrollConvolutionOutput(unrollCols, mapInCount, mapOutCount, m_mpRowCol, *m_mpRowRun, *m_runs, unrolledSrcGrad);
// Perform matrix multiplication of unrolled outputs with weights.
// If there is just one sample in the sub-batch then compute result directly to the output matrix.
if (curBatchSize == 1)
{
auto gradSlice = grad.ColumnSlice(start, 1);
gradSlice.Reshape(mapInSize, mapInCount);
Mat::MultiplyAndAdd(unrolledSrcGrad, true, kern, false, gradSlice);
}
else
{
// Need to transpose existing destination gradients first so we can add new values to them.
auto gradTempSlice = workspace.ColumnSlice(kernCols + unrollRows * unrollCols, unrollRows * mapInCount);
if (curBatchSize != subBatchSize)
gradTempSlice = gradTempSlice.ColumnSlice(0, mapInSize * curBatchSize * mapInCount);
gradTempSlice.Reshape(curBatchSize, mapInSize * mapInCount);
auto gradSlice = grad.ColumnSlice(start, curBatchSize);
gradTempSlice.AssignTransposeOf(gradSlice);
gradTempSlice.Reshape(mapInSize * curBatchSize, mapInCount);
// Multiply unrolled srcGrad with weights and add to grad.
Mat::MultiplyAndAdd(unrolledSrcGrad, true, kern, false, gradTempSlice);
// Reshape and transpose grads back to original form.
gradTempSlice.Reshape(curBatchSize, mapInSize * mapInCount);
gradSlice.AssignTransposeOf(gradTempSlice);
}
}
}
// The backward kernel method consists of 3 parts:
// 1. Transpose and reshape convolution output matrix (srcGrad) into [NW'H' x K] layout.
// This step is not needed if current minibatch size == 1 and srcGrad are used instead.
// 2. Unrolling convolution input (in) into a matrix of [NW'H' x XYC] layout.
// 3. Performing matrix multiplication of unrolled input with transposed output:
// [NW'H' x XYC]^T * [NW'H' x K] -> [XYC x K] - kernel gradients.
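// Same hypothetical example (N=2, C=3, H=W=5, 3x3 kernel, K=4, W'=H'=3): the transposed source
// gradients are [NW'H' x K] = [18 x 4], the unrolled input is [NW'H' x XYC] = [18 x 27], and the
// GEMM accumulates kernel gradients of shape [XYC x K] = [27 x 4].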
void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*allowReuse*/, Mat& workspace) override
{
size_t batchSize = srcGrad.GetNumCols();
size_t subBatchSize = m_maxTempMemSizeInSamples == 0 ? batchSize : min(batchSize, m_maxTempMemSizeInSamples);
const auto& inT = m_geometry->InputShape();
const auto& kernT = m_geometry->KernelShape();
const auto& outT = m_geometry->OutputShape();
size_t dimCount = inT.GetRank();
size_t mapOutCount = m_geometry->GetMapCount(dimCount - 1);
size_t mapOutSize = outT.GetNumElements() / mapOutCount;
assert(kernT[dimCount - 1] == inT[dimCount - 1]);
if (kernT[dimCount - 1] != inT[dimCount - 1])
{
RuntimeError("GEMM convolution engine does not support this convolution configuration. "
"It is possible to make GEMM engine work with this configuration by defining "
"input/output/kernel using tensors of higher(+1) dimension. Geometry: %s", ((string)*m_geometry).c_str());
}
size_t unrollRows = kernT.GetNumElements();
size_t unrollCols = mapOutSize * subBatchSize;
// Reserve space for:
// 1. Unrolled inputs.
// 2. Transposed source gradients (optional).
workspace.Resize(unrollCols, unrollRows + (subBatchSize > 1 ? mapOutCount : 0));
for (size_t start = 0; start < batchSize; start += subBatchSize)
{
size_t curBatchSize = min(subBatchSize, batchSize - start);
// 1. Transpose and reshape srcGrad.
auto srcGradSlice = srcGrad.ColumnSlice(start, curBatchSize);
if (curBatchSize > 1)
{
auto srcGradTranSlice = workspace.ColumnSlice(unrollRows, mapOutCount);
if (curBatchSize != subBatchSize)
{
srcGradTranSlice.Reshape(mapOutCount * mapOutSize, subBatchSize);
srcGradTranSlice = srcGradTranSlice.ColumnSlice(0, curBatchSize);
}
// Reshape to transposed shape - required by AssignTransposeOf.
srcGradTranSlice.Reshape(srcGradSlice.GetNumCols(), srcGradSlice.GetNumRows());
srcGradTranSlice.AssignTransposeOf(srcGradSlice);
srcGradSlice = srcGradTranSlice.ColumnSlice(0, srcGradTranSlice.GetNumCols());
}
srcGradSlice.Reshape(mapOutSize * curBatchSize, mapOutCount);
// 2. Unroll inputs.
auto inputSlice = in.ColumnSlice(start, curBatchSize);
auto unrolledInputSlice = workspace.ColumnSlice(0, unrollRows);
if (curBatchSize != subBatchSize)
{
unrolledInputSlice.Reshape(mapOutSize * unrollRows, subBatchSize);
unrolledInputSlice = unrolledInputSlice.ColumnSlice(0, curBatchSize);
}
unrolledInputSlice.Reshape(mapOutSize * curBatchSize, unrollRows);
unrolledInputSlice.SetValue(0);
inputSlice.UnrollConvolutionInputForKernelBackprop(mapOutSize, m_mpRowCol, *m_mpRowRun, *m_runs, unrolledInputSlice);
// cudnn layout uses row-major kernel weight matrix.
auto kernGrad = kernelGrad.ColumnSlice(0, kernelGrad.GetNumCols());
kernGrad.Reshape(kernelGrad.GetNumCols(), kernelGrad.GetNumRows());
// 3. Multiply.
Mat::MultiplyAndAdd(unrolledInputSlice, true, srcGradSlice, false, kernGrad);
}
}
public:
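// Note: the GEMM engine is selected only on CPU devices (deviceId < 0) and only when the geometry
// uses full weight sharing in every kernel dimension.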
static bool IsSupported(DEVICEID_TYPE deviceId, ConvolveGeometryPtr geometry)
{
return deviceId < 0 &&
find(begin(geometry->Sharing()), end(geometry->Sharing()), false) == end(geometry->Sharing());
}
};
template <class ElemType>
std::unique_ptr<ConvolutionEngine<ElemType>> ConvolutionEngine<ElemType>::Create(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, PoolKind poolKind,
@ -539,6 +841,12 @@ std::unique_ptr<ConvolutionEngine<ElemType>> ConvolutionEngine<ElemType>::Create
return CuDnnConvolutionEngineFactory<ElemType>::Create(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind);
}
if (isEnabled(ConvolutionEngineKind::Gemm) && GemmConvolutionEngine<ElemType>::IsSupported(deviceId, geometry))
{
fprintf(stderr, "\nUsing GEMM convolution engine for geometry: %s.\n", engStr.c_str());
return std::make_unique<GemmConvolutionEngine<ElemType>>(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind);
}
if (!isEnabled(ConvolutionEngineKind::Reference))
RuntimeError("Reference convolution is disabled and no other engine supports such configuration (or is disabled).");
fprintf(stderr, "\nUsing reference convolution engine for geometry: %s.\n", engStr.c_str());

View file

@ -18,11 +18,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
enum class ConvolutionEngineKind
{
None = 0,
Reference = 1,
CuDnn = 1 << 1,
Legacy = 1 << 2,
Reference = 1, // Reference, lookup-based implementation. Very slow but works for any convo configuration.
CuDnn = 1 << 1, // cuDNN, works only for 2D/3D convos with full sharing.
Legacy = 1 << 2, // Legacy, for backwards compatibility. REVIEW alexeyk: implement sparse version and remove Legacy altogether.
Gemm = 1 << 3, // Uses convolution unrolling+GEMM technique. Works only for convos with full sharing.
All = Reference | CuDnn | Legacy
All = Reference | CuDnn | Legacy | Gemm
};
enum class PoolKind

View file

@ -492,6 +492,57 @@ void GPUSparseMatrix<ElemType>::SetValue(const GPUMatrix<ElemType>& denseMatrix,
}
}
template <class ElemType>
GPUSPARSE_INDEX_TYPE* GPUSparseMatrix<ElemType>::GetCondensedVector() const
{
if (GetFormat() == MatrixFormat::matrixFormatSparseCSC || GetFormat() == MatrixFormat::matrixFormatSparseCSR)
{
PrepareDevice();
GPUSPARSE_INDEX_TYPE* pArray = new GPUSPARSE_INDEX_TYPE[SecondaryIndexCount()];
CUDA_CALL(cudaMemcpy(pArray, SecondaryIndexLocation(), sizeof(GPUSPARSE_INDEX_TYPE) * SecondaryIndexCount(), cudaMemcpyDeviceToHost));
return pArray;
}
else
{
return NULL;
}
}
template <class ElemType>
void GPUSparseMatrix<ElemType>::MaskColumnsValue(const GPUMatrix<char>& columnsMask, ElemType val)
{
VerifyWritable(__func__);
size_t n = GetNumCols();
if (n != columnsMask.GetNumCols())
RuntimeError("Matrix and column mask must have equal number of columns");
if (val != 0)
LogicError("MaskColumnsValue is not implemented for a non-zero mask for sparse matrices.");
#ifdef _DEBUG
if (GetFormat() == MatrixFormat::matrixFormatSparseCSC)
{
// TODO: We could do this on the GPU, but for now C++ is easier.
// Download the binary columns mask
char* maskedCols = columnsMask.CopyToArray();
// If we're CSC, we only need to verify that the columns to be zeroed are empty, since val == 0.
// So just download the condensed column vector.
GPUSPARSE_INDEX_TYPE* colVector = GetCondensedVector();
// Verify that if the column is to be masked, there are no elements in it.
#pragma omp parallel for
for (long j = 0; j < n; j++)
if (maskedCols[j] == 0 && colVector[j + 1] != colVector[j])
RuntimeError("GPUSparseMatrix attempted to mask column %d, but it has %d elements in it.", (int)j, (int)(colVector[j + 1] - colVector[j]));
// free the host-side copies created above (GetCondensedVector() allocates with new[]; CopyToArray() is assumed to do the same)
delete[] maskedCols;
delete[] colVector;
}
else
NOT_IMPLEMENTED;
#endif
}
template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::operator=(const GPUSparseMatrix<ElemType>& deepCopy)
{

View file

@ -286,6 +286,9 @@ public:
void SetValue(const GPUMatrix<ElemType>& denseMatrix, const MatrixFormat matrixFormat);
void SetValue(const GPUMatrix<ElemType>& denseMatrix);
GPUSPARSE_INDEX_TYPE* GetCondensedVector() const;
void MaskColumnsValue(const GPUMatrix<char>& columnsMask, ElemType val);
void Reshape(const size_t numRows, const size_t numCols);
void ResizeAsAndCopyIndexFrom(const GPUSparseMatrix<ElemType>& a, const bool growOnly = true);

View file

@ -77,12 +77,12 @@
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>libacml_mp_dll.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>libacml_mp_dll.lib;Common.lib;%(AdditionalDependencies)</AdditionalDependencies>
<DelayLoadDLLs>libacml_mp_dll.dll; $(CudaDlls); %(DelayLoadDLLs)</DelayLoadDLLs>
<Profile>true</Profile>
</Link>
<PostBuildEvent>
<Command>xcopy /D /I /Y "$(ACML_PATH)\lib\*.dll" $(OutputPath)</Command>
<Command>xcopy /D /I /Y "$(ACML_PATH)\lib\*.dll" "$(OutputPath)"</Command>
<Message>Copying ACML DLLs</Message>
</PostBuildEvent>
<CudaCompile>
@ -119,12 +119,12 @@
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>libacml_mp_dll.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>libacml_mp_dll.lib;Common.lib;%(AdditionalDependencies)</AdditionalDependencies>
<Profile>true</Profile>
<DelayLoadDLLs>libacml_mp_dll.dll; $(CudaDlls); %(DelayLoadDLLs)</DelayLoadDLLs>
</Link>
<PostBuildEvent>
<Command>xcopy /D /I /Y "$(ACML_PATH)\lib\*.dll" $(OutputPath)</Command>
<Command>xcopy /D /I /Y "$(ACML_PATH)\lib\*.dll" "$(OutputPath)"</Command>
<Message>Copying ACML DLLs</Message>
</PostBuildEvent>
<CudaCompile>
@ -189,13 +189,6 @@
<ClInclude Include="targetver.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\Common\ExceptionWithCallStack.cpp" />
<ClCompile Include="..\Common\File.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\Common\fileutil.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="BatchNormalizationEngine.cpp" />
<ClCompile Include="ConvolutionEngine.cpp" />
<ClCompile Include="CPUSparseMatrix.cpp" />

View file

@ -16,19 +16,11 @@
<RootNamespace>Math</RootNamespace>
<ProjectName>MathCUDA</ProjectName>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<Import Project="$(SolutionDir)\CNTK.Cpp.props" />
<PropertyGroup>
<ConfigurationType>StaticLibrary</ConfigurationType>
<PlatformToolset>v120</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<UseDebugLibraries>false</UseDebugLibraries>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<Import Project="$(SolutionDir)\CNTK.Cpp.props" />
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<Choose>
<When Condition="Exists('$(CUDNN_PATH)')">
@ -58,37 +50,28 @@
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<LinkIncremental>true</LinkIncremental>
<PropertyGroup Condition="$(DebugBuild)">
<CudaCodeGen>$(CNTK_CUDA_CODEGEN_DEBUG)</CudaCodeGen>
<CudaCodeGen Condition="'$(CudaCodeGen)'==''">compute_20,compute_20;compute_30,sm_30</CudaCodeGen>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<PropertyGroup Condition="$(ReleaseBuild)">
<CudaCodeGen>$(CNTK_CUDA_CODEGEN_RELEASE)</CudaCodeGen>
<CudaCodeGen Condition="'$(CudaCodeGen)'==''">compute_30,sm_30;compute_35,sm_35;compute_50,sm_50;</CudaCodeGen>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<WarningLevel>Level4</WarningLevel>
<PreprocessorDefinitions>NO_SYNC; WIN32; _WINDOWS; _USRDLL; MATH_EXPORTS; $(CuDnnDefine); %(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>true</SDLCheck>
<AdditionalIncludeDirectories>$(SolutionDir)Source\Common\include;$(ACML_PATH)\include;$(CudaInclude);$(CUB_PATH);$(CuDnnIncPath)</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>$(SolutionDir)Source\Common\include;$(CudaInclude);$(CUB_PATH);$(CuDnnIncPath)</AdditionalIncludeDirectories>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<FloatingPointModel>Fast</FloatingPointModel>
<OpenMPSupport>true</OpenMPSupport>
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>$(CudaLibs);libacml_mp_dll.lib;%(AdditionalDependencies)</AdditionalDependencies>
<Profile>true</Profile>
<AdditionalDependencies>$(CudaLibs);%(AdditionalDependencies)</AdditionalDependencies>
<DelayLoadDLLs>$(CudaDlls);%(DelayLoadDLLs)</DelayLoadDLLs>
</Link>
<Lib>
<AdditionalLibraryDirectories>$(OutDir);$(ACML_PATH)\lib;$(CudaLibPath);$(CuDnnLibPath)</AdditionalLibraryDirectories>
<AdditionalLibraryDirectories>$(OutDir);$(CudaLibPath);$(CuDnnLibPath)</AdditionalLibraryDirectories>
<AdditionalDependencies>$(CuDnnLib)</AdditionalDependencies>
</Lib>
<CudaCompile>
@ -102,34 +85,22 @@
<AdditionalOptions>-Xcudafe "--diag_suppress=field_without_dll_interface" %(AdditionalOptions)</AdditionalOptions>
</CudaCompile>
<PostBuildEvent>
<Command>for %%l in ($(CudaDlls)) do if exist "$(CudaPath)\bin\%%l" xcopy /D /Y "$(CudaPath)\bin\%%l*" $(OutputPath)
if exist "$(CuDnnDll)" (xcopy /Y "$(CuDnnDll)" $(OutputPath))
<Command>for %%l in ($(CudaDlls)) do if exist "$(CudaPath)\bin\%%l" xcopy /D /Y "$(CudaPath)\bin\%%l*" "$(OutputPath)"
if exist "$(CuDnnDll)" xcopy /Y "$(CuDnnDll)" "$(OutputPath)"
</Command>
</PostBuildEvent>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<ItemDefinitionGroup Condition="$(DebugBuild)">
<ClCompile>
<PreprocessorDefinitions>_DEBUG; %(PreprocessorDefinitions)</PreprocessorDefinitions>
<Optimization>Disabled</Optimization>
<MinimalRebuild>false</MinimalRebuild>
</ClCompile>
<Link>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ItemDefinitionGroup Condition="$(ReleaseBuild)">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>NDEBUG; %(PreprocessorDefinitions)</PreprocessorDefinitions>
<EnableParallelCodeGeneration>true</EnableParallelCodeGeneration>
<FloatingPointExceptions>false</FloatingPointExceptions>
<AdditionalOptions>/d2Zi+ %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<Link>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
<CudaCompile>
<HostDebugInfo>false</HostDebugInfo>
</CudaCompile>
@ -183,7 +154,6 @@ if exist "$(CuDnnDll)" (xcopy /Y "$(CuDnnDll)" $(OutputPath))
<CudaCompile Include="GPUWatcher.cu">
<FileType>CppCode</FileType>
</CudaCompile>
<ClCompile Include="..\Common\ExceptionWithCallStack.cpp" />
<ClCompile Include="cudalattice.cpp" />
<ClCompile Include="cudalib.cpp" />
<CudaCompile Include="CuDnnConvolutionEngine.cu">
@ -222,6 +192,6 @@ if exist "$(CuDnnDll)" (xcopy /Y "$(CuDnnDll)" $(OutputPath))
</ImportGroup>
<Target Name="CheckDependencies">
<Error Condition="!Exists('$(CUB_PATH)')" Text="CNTK requires the NVIDIA CUB library to build. Please see https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#cuda for installation instructions." />
<Warning Condition="!Exists('$(CUDNN_PATH)')" Text="CNTK requires the NVIDIA cuDNN library to build. Please see https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#cuda for installation instructions." />
<Error Condition="!Exists('$(CUDNN_PATH)')" Text="CNTK requires the NVIDIA cuDNN library to build. Please see https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#cuda for installation instructions." />
</Target>
</Project>

Просмотреть файл

@ -1152,12 +1152,11 @@ void Matrix<ElemType>::MaskColumnsValue(const Matrix<char>& columnsMask, ElemTyp
else if (GetDeviceId() != columnsMask.GetDeviceId() && columnsMask.GetCurrentMatrixLocation() != BOTH)
RuntimeError("MaskColumnsValue: Matrix and column mask must be on the same device.");
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->MaskColumnsValue(*columnsMask.m_CPUMatrix, val),
m_GPUMatrix->MaskColumnsValue(*columnsMask.m_GPUMatrix, val),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
DISPATCH_MATRIX_ON_FLAG(this, this,
{ m_CPUMatrix->MaskColumnsValue(*columnsMask.m_CPUMatrix, val); },
{ m_GPUMatrix->MaskColumnsValue(*columnsMask.m_GPUMatrix, val); },
{ m_CPUSparseMatrix->MaskColumnsValue(*columnsMask.m_CPUMatrix, val); },
{ m_GPUSparseMatrix->MaskColumnsValue(*columnsMask.m_GPUMatrix, val); });
}
template <class ElemType>
@ -4031,6 +4030,63 @@ void Matrix<ElemType>::ConvolutionBackwardKernel(const Matrix<ElemType>& in, con
NOT_IMPLEMENTED);
}
template <class ElemType>
void Matrix<ElemType>::UnrollConvolutionInput(size_t unrollCols, size_t mapOutSize, const Matrix<int>& mpRowCol,
const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& output) const
{
assert(mpRowCol.GetNumCols() == 1);
assert(mpRowRun.GetNumCols() == 1);
assert(runs.GetNumCols() == 1);
DecideAndMoveToRightDevice(*this, output);
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->UnrollConvolutionInput(unrollCols, mapOutSize, *(mpRowCol.m_CPUMatrix),
*(mpRowRun.m_CPUMatrix), *(runs.m_CPUMatrix), *(output.m_CPUMatrix)),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
template <class ElemType>
void Matrix<ElemType>::UnrollConvolutionOutput(size_t unrollCols, size_t mapInCount, size_t mapOutCount, const Matrix<int>& mpRowCol,
const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& output) const
{
assert(mpRowCol.GetNumCols() == 1);
assert(mpRowRun.GetNumCols() == 1);
assert(runs.GetNumCols() == 1);
DecideAndMoveToRightDevice(*this, output);
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->UnrollConvolutionOutput(unrollCols, mapInCount, mapOutCount, *(mpRowCol.m_CPUMatrix),
*(mpRowRun.m_CPUMatrix), *(runs.m_CPUMatrix), *(output.m_CPUMatrix)),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
template <class ElemType>
void Matrix<ElemType>::UnrollConvolutionInputForKernelBackprop(size_t mapOutSize, const Matrix<int>& mpRowCol,
const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& output) const
{
assert(mpRowCol.GetNumCols() == 1);
assert(mpRowRun.GetNumCols() == 1);
assert(runs.GetNumCols() == 1);
DecideAndMoveToRightDevice(*this, output);
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->UnrollConvolutionInputForKernelBackprop(mapOutSize, *(mpRowCol.m_CPUMatrix),
*(mpRowRun.m_CPUMatrix), *(runs.m_CPUMatrix), *(output.m_CPUMatrix)),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
template <class ElemType>
void Matrix<ElemType>::MaxPoolingForward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& output) const
{

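The MaskColumnsValue change above routes the call to all four storage formats (CPU/GPU, dense/sparse) instead of only the dense ones. A minimal sketch of the dispatch-on-location idea, using a simplified enum and free functions as stand-ins for CNTK's DISPATCH_MATRIX_ON_FLAG macro and member matrices:

#include <stdexcept>

enum class MatrixLocation { CpuDense, GpuDense, CpuSparse, GpuSparse };

// Hypothetical stand-ins for the per-implementation MaskColumnsValue calls.
void MaskCpuDense()  {}
void MaskGpuDense()  {}
void MaskCpuSparse() {}
void MaskGpuSparse() {}

// One statement block per storage format, mirroring the four branches the macro now receives.
void MaskColumnsValueDispatch(MatrixLocation loc)
{
    switch (loc)
    {
    case MatrixLocation::CpuDense:  MaskCpuDense();  break;
    case MatrixLocation::GpuDense:  MaskGpuDense();  break;
    case MatrixLocation::CpuSparse: MaskCpuSparse(); break;
    case MatrixLocation::GpuSparse: MaskGpuSparse(); break;
    default: throw std::logic_error("unknown matrix location");
    }
}
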
Просмотреть файл

@ -121,7 +121,7 @@ public:
~Matrix();
// workaround to bugs in BOTH implementation: force to collapse to home location
void CollapseDataLocationAfterWriting() const
void CollapseDataLocation() const
{
SetDataLocation(GetDeviceId() < 0 ? CurrentDataLocation::CPU : CurrentDataLocation::GPU, GetMatrixType());
}
@ -479,6 +479,13 @@ public:
void ConvolutionBackwardKernel(const Matrix<ElemType>& in, const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIwht,
const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& kernelGrad) const;
void UnrollConvolutionInput(size_t unrollCols, size_t mapOutSize, const Matrix<int>& mpRowCol,
const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& output) const;
void UnrollConvolutionOutput(size_t unrollCols, size_t mapInCount, size_t mapOutCount, const Matrix<int>& mpRowCol,
const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& output) const;
void UnrollConvolutionInputForKernelBackprop(size_t mapOutSize, const Matrix<int>& mpRowCol,
const Matrix<int>& mpRowRun, const Matrix<int>& runs, Matrix<ElemType>& output) const;
void MaxPoolingForward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& output) const;
void MaxPoolingBackward(const Matrix<ElemType>& out, const Matrix<ElemType>& in,
const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices,

Просмотреть файл

@ -92,6 +92,17 @@ void GPUSparseMatrix<ElemType>::SetValue(const CPUSparseMatrix<ElemType>& deepCo
{
}
template <class ElemType>
GPUSPARSE_INDEX_TYPE* GPUSparseMatrix<ElemType>::GetCondensedVector() const
{
return NULL;
}
template <class ElemType>
void GPUSparseMatrix<ElemType>::MaskColumnsValue(const GPUMatrix<char>& columnsMask, ElemType val)
{
}
template <class ElemType>
GPUSparseMatrix<ElemType>& GPUSparseMatrix<ElemType>::operator=(const GPUSparseMatrix<ElemType>& deepCopy)
{

Просмотреть файл

@ -47,6 +47,7 @@ OverloadUnaryMathFns(sqrt);
OverloadUnaryMathFns(fabs);
OverloadUnaryMathFns(cos);
OverloadUnaryMathFns(sin);
OverloadUnaryMathFns(floor);
#pragma pop_macro("OverloadUnaryMathFns")
@ -194,6 +195,7 @@ DefUnaryOp(Copy, a);
DefUnaryOp(Negate, -a);
DefUnaryOp(Not, !a);
DefUnaryOp(Abs, fabs_(a));
DefUnaryOp(Floor, floor_(a));
DefUnaryOp(Sigmoid, Sigmoid(a));
DefUnaryOp(Tanh, tanh_(a));
DefUnaryOp(Sqr, Sqr(a));
@ -214,7 +216,8 @@ DefUnaryOp(Reciprocal, a == 0 ? 0 : 1 / a);
return expr; \
}
//#define DefBinaryOp(op, expr) template<class ElemType> DECL ElemType Op ## op(const ElemType & a, ElemType b, int i = 0) { UNUSED(i); return expr; }
DefBinaryOp(CopyIf, a != 0 ? b : 0);
DefBinaryOp(CopyIfNot, a == 0 ? b : 0);
DefBinaryOp(Sum, a + b);
DefBinaryOp(Difference, a - b);
DefBinaryOp(ElementwiseProduct, a* b);
@ -255,7 +258,8 @@ DefBinaryOp(SqrOfDifference, Sqr(a - b));
}
DefTernaryOp(Cond, a ? b : c);
DefTernaryOp(Clip, a < b ? b : (a > c ? c : a));
DefTernaryOp(CopyIfEqual, a == b ? c : 0); // CopyIfEqual(a,b)(c) -- if a==b copy c, otherwise 0; used for gradient of clip, min, max, etc.
DefTernaryOp(Clip, c < a ? a : (c > b ? b : c)); // Clip(min,max)(data) => a=min, b=max, c=data
DefTernaryOp(ElementwiseProductWithLogSumDerivative, a * Sigmoid(c - b));
#pragma pop_macro("DefTernaryOp")
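
The ternary-op changes swap Clip's argument order so the value being clipped arrives as the third argument, and add CopyIfEqual for the gradients of clip, min, and max. A small stand-alone sketch of the same semantics (plain template functions here, not CNTK's DefTernaryOp macro):

#include <cassert>

// Clip(min, max)(data): a = lower bound, b = upper bound, c = the value to clip.
template <class T>
T OpClip(T a, T b, T c) { return c < a ? a : (c > b ? b : c); }

// CopyIfEqual(a, b)(c): pass c through only where a == b, otherwise 0.
// Used to route a gradient only to the input that produced the output.
template <class T>
T OpCopyIfEqual(T a, T b, T c) { return a == b ? c : T(0); }

int main()
{
    assert(OpClip(0.0, 1.0, 1.7) == 1.0);          // clipped to the upper bound
    assert(OpClip(0.0, 1.0, 0.3) == 0.3);          // inside the range, unchanged
    assert(OpCopyIfEqual(0.3, 0.3, 0.5) == 0.5);   // gradient flows where output == input
    assert(OpCopyIfEqual(1.0, 1.7, 0.5) == 0.0);   // blocked where the value was clipped
    return 0;
}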

Просмотреть файл

@ -71,7 +71,7 @@
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>Math.lib;Common.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(ReleaseBuild)">
@ -92,7 +92,7 @@
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>Math.lib;Common.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<Profile>true</Profile>
</Link>
</ItemDefinitionGroup>
@ -106,21 +106,9 @@
<ClInclude Include="targetver.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\Common\DataReader.cpp" />
<ClCompile Include="..\..\Common\DataWriter.cpp" />
<ClCompile Include="..\..\Common\ExceptionWithCallStack.cpp" />
<ClCompile Include="..\..\Common\File.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\fileutil.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="BinaryFile.cpp" />
<ClCompile Include="BinaryReader.cpp" />
<ClCompile Include="BinaryWriter.cpp" />
<ClCompile Include="..\..\Common\Config.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="Exports.cpp" />
<ClCompile Include="dllmain.cpp">
<CompileAsManaged>false</CompileAsManaged>

Просмотреть файл

@ -47,6 +47,12 @@ CNTKTextFormatReader::CNTKTextFormatReader(MemoryProviderPtr provider,
randomizer->Initialize(nullptr, config);
m_transformer = randomizer;
// TODO: add "frameMode" config parameter
m_packer = std::make_shared<SequencePacker>(
m_provider,
m_transformer,
GetStreamDescriptions());
}
catch (const std::runtime_error& e)
{
@ -61,18 +67,13 @@ std::vector<StreamDescriptionPtr> CNTKTextFormatReader::GetStreamDescriptions()
void CNTKTextFormatReader::StartEpoch(const EpochConfiguration& config)
{
if (config.m_totalEpochSizeInSamples <= 0)
if (config.m_totalEpochSizeInSamples == 0)
{
RuntimeError("Unsupported minibatch size '%d'.", (int)config.m_totalEpochSizeInSamples);
RuntimeError("Epoch size cannot be 0.");
}
m_transformer->StartEpoch(config);
// TODO: add "frameMode" config parameter
m_packer = std::make_shared<SequencePacker>(
m_provider,
m_transformer,
config.m_minibatchSizeInSamples,
GetStreamDescriptions());
m_packer->StartEpoch(config);
}
Minibatch CNTKTextFormatReader::ReadMinibatch()

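The epoch-size check above changes from `<= 0` to `== 0`, and the error message now names the right quantity. Since m_totalEpochSizeInSamples is an unsigned size_t, the old comparison could never catch a negative value anyway; a small illustration, assuming nothing beyond standard C++:

#include <cstddef>
#include <stdexcept>

void ValidateEpochSize(std::size_t totalEpochSizeInSamples)
{
    // For an unsigned size_t, "x <= 0" is just a misleading way of writing "x == 0";
    // the message should also talk about the epoch size, not the minibatch size.
    if (totalEpochSizeInSamples == 0)
        throw std::runtime_error("Epoch size cannot be 0.");
}
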
Просмотреть файл

@ -62,7 +62,7 @@
<Link>
<SubSystem>Windows</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>ReaderLib.lib;Math.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>ReaderLib.lib;Math.lib;Common.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(OutDir)</AdditionalLibraryDirectories>
</Link>
</ItemDefinitionGroup>
@ -100,17 +100,6 @@
<ClInclude Include="targetver.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\Common\DataReader.cpp" />
<ClCompile Include="..\..\Common\ExceptionWithCallStack.cpp" />
<ClCompile Include="..\..\Common\File.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\fileutil.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\Config.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="Indexer.cpp" />
<ClCompile Include="TextConfigHelper.cpp" />
<ClCompile Include="TextParser.cpp" />

Просмотреть файл

@ -219,8 +219,6 @@ void TextParser<ElemType>::TextDataChunk::GetSequence(size_t sequenceId, std::ve
{
auto it = m_sequencePtrMap.find(sequenceId);
assert(it != m_sequencePtrMap.end());
//TODO: Remove pragma once new randomizer is in master.
#pragma omp atomic
++m_sequenceRequestCount;
result.reserve(it->second.size());
copy(it->second.begin(), it->second.end(), back_inserter(result));
@ -230,9 +228,6 @@ template <class ElemType>
ChunkPtr TextParser<ElemType>::GetChunk(size_t chunkId)
{
ChunkPtr chunk;
//TODO: Remove pragma once new randomizer is in master.
#pragma omp critical
{
auto it = m_chunkCache.find(chunkId);
if (it != m_chunkCache.end())
{
@ -256,7 +251,6 @@ ChunkPtr TextParser<ElemType>::GetChunk(size_t chunkId)
{
const auto& chunk = *(it.second.get());
size_t numSequencesUsed = 0;
#pragma omp atomic
numSequencesUsed += chunk.m_sequenceRequestCount;
size_t numSequencesLeft = chunk.m_sequences.size() - numSequencesUsed;
if (numSequencesLeft < minNumSequencesLeft)
@ -276,7 +270,6 @@ ChunkPtr TextParser<ElemType>::GetChunk(size_t chunkId)
chunk = textChunk;
}
}
return chunk;
}

Просмотреть файл

@ -69,7 +69,7 @@
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>Math.lib;Common.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(ReleaseBuild)">
@ -90,7 +90,7 @@
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>Math.lib;Common.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<Profile>true</Profile>
</Link>
</ItemDefinitionGroup>

Просмотреть файл

@ -54,7 +54,7 @@
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>ReaderLib.lib;Math.lib;kernel32.lib;user32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>ReaderLib.lib;Common.lib;Math.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories>$(OutDir)</AdditionalLibraryDirectories>
</Link>
</ItemDefinitionGroup>
@ -96,27 +96,12 @@
<ClInclude Include="UtteranceDescription.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\Common\Config.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\File.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\TimerUtility.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="ConfigHelper.cpp" />
<ClCompile Include="Exports.cpp" />
<ClCompile Include="dllmain.cpp">
<CompileAsManaged>false</CompileAsManaged>
<PrecompiledHeader />
</ClCompile>
<ClCompile Include="..\..\Common\fileutil.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\ExceptionWithCallStack.cpp">
<PrecompiledHeader />
</ClCompile>
<ClCompile Include="HTKDataDeserializer.cpp" />
<ClCompile Include="HTKMLFReader.cpp" />
<ClCompile Include="MLFDataDeserializer.cpp" />

Просмотреть файл

@ -129,7 +129,8 @@ public:
}
catch (...)
{
ReleaseData();
// Releasing all data
m_frames.resize(0, 0);
throw;
}
}

Просмотреть файл

@ -210,7 +210,8 @@ void HTKDataDeserializer::GetSequencesForChunk(size_t chunkId, vector<SequenceDe
for (size_t i = 0; i < chunk.GetNumberOfUtterances(); ++i)
{
auto utterance = chunk.GetUtterance(i);
size_t major = utterance->GetId();
// Currently we do not support a common prefix, so simply use the utterance id as the sequence key.
size_t sequence = utterance->GetId();
if (m_frameMode)
{
@ -219,8 +220,8 @@ void HTKDataDeserializer::GetSequencesForChunk(size_t chunkId, vector<SequenceDe
{
SequenceDescription f;
f.m_chunkId = chunkId;
f.m_key.m_major = major;
f.m_key.m_minor = k;
f.m_key.m_sequence = sequence;
f.m_key.m_sample = k;
f.m_id = offsetInChunk++;
f.m_isValid = true;
f.m_numberOfSamples = 1;
@ -232,8 +233,8 @@ void HTKDataDeserializer::GetSequencesForChunk(size_t chunkId, vector<SequenceDe
// Creating sequence description per utterance.
SequenceDescription f;
f.m_chunkId = chunkId;
f.m_key.m_major = major;
f.m_key.m_minor = 0;
f.m_key.m_sequence = sequence;
f.m_key.m_sample = 0;
f.m_id = offsetInChunk++;
f.m_isValid = true;
f.m_numberOfSamples = utterance->GetNumberOfFrames();
@ -432,7 +433,7 @@ static SequenceDescription s_InvalidSequence{0, 0, 0, false};
void HTKDataDeserializer::GetSequenceDescriptionByKey(const KeyType& key, SequenceDescription& d)
{
assert(!m_primary);
auto iter = m_keyToChunkLocation.find(key.m_major);
auto iter = m_keyToChunkLocation.find(key.m_sequence);
if (iter == m_keyToChunkLocation.end())
{
// Unknown sequence. Return invalid.
@ -443,7 +444,7 @@ void HTKDataDeserializer::GetSequenceDescriptionByKey(const KeyType& key, Sequen
const auto& chunk = m_chunks[iter->second.first];
const auto& sequence = chunk.GetUtterance(iter->second.second);
d.m_chunkId = sequence->GetChunkId();
d.m_id = m_frameMode ? sequence->GetStartFrameIndexInsideChunk() + key.m_minor : sequence->GetIndexInsideChunk();
d.m_id = m_frameMode ? sequence->GetStartFrameIndexInsideChunk() + key.m_sample : sequence->GetIndexInsideChunk();
d.m_isValid = true;
d.m_numberOfSamples = m_frameMode ? 1 : sequence->GetNumberOfFrames();
}

Просмотреть файл

@ -13,7 +13,7 @@
#include "StringUtil.h"
#include "FramePacker.h"
#include "SequencePacker.h"
#include "BpttPacker.h"
#include "TruncatedBpttPacker.h"
#include "BlockRandomizer.h"
#include "NoRandomizer.h"
@ -136,22 +136,6 @@ HTKMLFReader::HTKMLFReader(MemoryProviderPtr provider,
m_streams.push_back(stream);
}
}
}
std::vector<StreamDescriptionPtr> HTKMLFReader::GetStreamDescriptions()
{
assert(!m_streams.empty());
return m_streams;
}
void HTKMLFReader::StartEpoch(const EpochConfiguration& config)
{
if (config.m_totalEpochSizeInSamples <= 0)
{
RuntimeError("Unsupported minibatch size '%d'.", (int)config.m_totalEpochSizeInSamples);
}
m_randomizer->StartEpoch(config);
// TODO: should we unify sample and sequence mode packers into a single one.
// TODO: functionally they are the same, the only difference is how we handle
@ -164,20 +148,35 @@ void HTKMLFReader::StartEpoch(const EpochConfiguration& config)
switch (m_packingMode)
{
case PackingMode::sample:
m_packer = std::make_shared<FramePacker>(
m_provider,
m_randomizer,
config.m_minibatchSizeInSamples,
m_streams);
m_packer = std::make_shared<FramePacker>(m_provider, m_randomizer, m_streams);
break;
case PackingMode::sequence:
m_packer = std::make_shared<SequencePacker>(
m_provider,
m_randomizer,
config.m_minibatchSizeInSamples,
m_streams);
m_packer = std::make_shared<SequencePacker>(m_provider, m_randomizer, m_streams);
break;
case PackingMode::truncated:
m_packer = std::make_shared<TruncatedBPTTPacker>(m_provider, m_randomizer, m_streams);
break;
default:
LogicError("Unsupported type of packer '%d'.", (int)m_packingMode);
}
}
std::vector<StreamDescriptionPtr> HTKMLFReader::GetStreamDescriptions()
{
assert(!m_streams.empty());
return m_streams;
}
void HTKMLFReader::StartEpoch(const EpochConfiguration& config)
{
if (config.m_totalEpochSizeInSamples == 0)
{
RuntimeError("Epoch size cannot be 0.");
}
if (m_packingMode == PackingMode::truncated)
{
size_t minibatchSize = config.m_minibatchSizeInSamples;
size_t truncationLength = m_truncationLength;
@ -192,16 +191,21 @@ void HTKMLFReader::StartEpoch(const EpochConfiguration& config)
minibatchSize = numParallelSequences * truncationLength;
}
m_packer = std::make_shared<BpttPacker>(
m_provider,
m_randomizer,
minibatchSize,
truncationLength,
m_streams);
break;
EpochConfiguration bpttConfig;
bpttConfig.m_numberOfWorkers = config.m_numberOfWorkers;
bpttConfig.m_workerRank = config.m_workerRank;
bpttConfig.m_totalEpochSizeInSamples = config.m_totalEpochSizeInSamples;
bpttConfig.m_epochIndex = config.m_epochIndex;
bpttConfig.m_minibatchSizeInSamples = minibatchSize;
bpttConfig.m_truncationSize = truncationLength;
m_randomizer->StartEpoch(bpttConfig);
m_packer->StartEpoch(bpttConfig);
}
default:
LogicError("Unsupported type of packer '%d'.", (int)m_packingMode);
else
{
m_randomizer->StartEpoch(config);
m_packer->StartEpoch(config);
}
}
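
In truncated-BPTT mode the reader now derives the effective minibatch size from the truncation length and forwards both in a dedicated EpochConfiguration. A hedged sketch of the arithmetic, assuming (one plausible reading of the elided lines above) that the number of parallel sequences is the requested minibatch size divided by the truncation length, with a minimum of one:

#include <cstddef>
#include <cstdio>

// Effective minibatch size in samples: a whole number of parallel sequences,
// each contributing exactly `truncationLength` samples per minibatch.
std::size_t EffectiveBpttMinibatch(std::size_t requestedMinibatch, std::size_t truncationLength)
{
    std::size_t numParallelSequences = requestedMinibatch / truncationLength;
    if (numParallelSequences == 0)
        numParallelSequences = 1; // never fewer than one sequence
    return numParallelSequences * truncationLength;
}

int main()
{
    // E.g. a requested minibatch of 2048 samples with truncation length 20
    // packs 102 parallel sequences, i.e. 2040 samples per minibatch.
    std::printf("%zu\n", EffectiveBpttMinibatch(2048, 20));
    return 0;
}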

Просмотреть файл

@ -94,7 +94,7 @@ MLFDataDeserializer::MLFDataDeserializer(CorpusDescriptorPtr corpus, const Confi
if (!stringRegistry.TryGet(l.first, id))
continue;
description.m_key.m_major = id;
description.m_key.m_sequence = id;
const auto& utterance = l.second;
description.m_sequenceStart = m_classIds.size();
@ -130,18 +130,18 @@ MLFDataDeserializer::MLFDataDeserializer(CorpusDescriptorPtr corpus, const Confi
description.m_numberOfSamples = numberOfFrames;
totalFrames += numberOfFrames;
m_utteranceIndex.push_back(m_frames.size());
m_keyToSequence[description.m_key.m_major] = m_utteranceIndex.size() - 1;
m_keyToSequence[description.m_key.m_sequence] = m_utteranceIndex.size() - 1;
// TODO: Should be created by chunks only.
MLFFrame f;
f.m_chunkId = 0;
f.m_numberOfSamples = 1;
f.m_key.m_major = description.m_key.m_major;
f.m_key.m_sequence = description.m_key.m_sequence;
f.m_isValid = description.m_isValid;
for (size_t k = 0; k < description.m_numberOfSamples; ++k)
{
f.m_id = m_frames.size();
f.m_key.m_minor = k;
f.m_key.m_sample = k;
f.m_index = description.m_sequenceStart + k;
m_frames.push_back(f);
}
@ -208,8 +208,8 @@ void MLFDataDeserializer::GetSequencesForChunk(size_t, std::vector<SequenceDescr
for (size_t i = 0; i < m_frames.size(); ++i)
{
SequenceDescription f;
f.m_key.m_major = m_frames[i].m_key.m_major;
f.m_key.m_minor = m_frames[i].m_key.m_minor;
f.m_key.m_sequence = m_frames[i].m_key.m_sequence;
f.m_key.m_sample = m_frames[i].m_key.m_sample;
f.m_id = m_frames[i].m_id;
f.m_chunkId = m_frames[i].m_chunkId;
f.m_numberOfSamples = 1;
@ -223,8 +223,8 @@ void MLFDataDeserializer::GetSequencesForChunk(size_t, std::vector<SequenceDescr
for (size_t i = 0; i < m_utteranceIndex.size() - 1; ++i)
{
SequenceDescription f;
f.m_key.m_major = m_frames[m_utteranceIndex[i]].m_key.m_major;
f.m_key.m_minor = 0;
f.m_key.m_sequence = m_frames[m_utteranceIndex[i]].m_key.m_sequence;
f.m_key.m_sample = 0;
f.m_id = i;
f.m_chunkId = m_frames[m_utteranceIndex[i]].m_chunkId;
f.m_numberOfSamples = m_utteranceIndex[i + 1] - m_utteranceIndex[i];
@ -305,7 +305,7 @@ static SequenceDescription s_InvalidSequence { 0, 0, 0, false };
void MLFDataDeserializer::GetSequenceDescriptionByKey(const KeyType& key, SequenceDescription& result)
{
auto sequenceId = m_keyToSequence.find(key.m_major);
auto sequenceId = m_keyToSequence.find(key.m_sequence);
if (sequenceId == m_keyToSequence.end())
{
result = s_InvalidSequence;
@ -314,13 +314,13 @@ void MLFDataDeserializer::GetSequenceDescriptionByKey(const KeyType& key, Sequen
if (m_frameMode)
{
size_t index = m_utteranceIndex[sequenceId->second] + key.m_minor;
size_t index = m_utteranceIndex[sequenceId->second] + key.m_sample;
result = m_frames[index];
}
else
{
result.m_key.m_major = key.m_major;
result.m_key.m_minor = 0;
result.m_key.m_sequence = key.m_sequence;
result.m_key.m_sample = 0;
result.m_id = sequenceId->second;
result.m_chunkId = m_frames[m_utteranceIndex[sequenceId->second]].m_chunkId;
result.m_numberOfSamples = m_utteranceIndex[sequenceId->second + 1] - m_utteranceIndex[sequenceId->second];

Просмотреть файл

@ -70,7 +70,7 @@
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>Math.lib;Common.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(SolutionDir)$(Platform)\$(Configuration)\</AdditionalLibraryDirectories>
<AdditionalLibraryDirectories Condition="'$(Configuration)|$(Platform)'=='Debug_CpuOnly|x64'">$(SolutionDir)$(Platform)\$(Configuration)\</AdditionalLibraryDirectories>
</Link>
@ -94,7 +94,7 @@
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>Math.lib;Common.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<Profile>true</Profile>
<AdditionalLibraryDirectories Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(SolutionDir)$(Platform)\$(Configuration)\</AdditionalLibraryDirectories>
<AdditionalLibraryDirectories Condition="'$(Configuration)|$(Platform)'=='Release_CpuOnly|x64'">$(SolutionDir)$(Platform)\$(Configuration)\</AdditionalLibraryDirectories>
@ -123,11 +123,6 @@
<ClInclude Include="utterancesourcemulti.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\Common\DataReader.cpp" />
<ClCompile Include="..\..\Common\ExceptionWithCallStack.cpp" />
<ClCompile Include="..\..\Common\TimerUtility.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="Exports.cpp" />
<ClCompile Include="DataWriterLocal.cpp" />
<ClCompile Include="dllmain.cpp">
@ -135,9 +130,6 @@
<PrecompiledHeader>
</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\fileutil.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="HTKMLFReader.cpp" />
<ClCompile Include="HTKMLFWriter.cpp" />
<ClCompile Include="latticearchive.cpp" />

Просмотреть файл

@ -153,9 +153,9 @@ class minibatchutterancesourcemulti : public minibatchsource
try // this function supports retrying since we read from the unreliable network, i.e. do not return in a broken state
{
msra::asr::htkfeatreader reader; // feature reader (we reinstantiate it for each block, i.e. we reopen the file actually)
auto_timer* p_pageintimer = nullptr;
std::unique_ptr<auto_timer> pageintimer = nullptr;
if (verbosity > 2)
p_pageintimer = new auto_timer();
pageintimer.reset(new auto_timer());
// if this is the first feature read ever, we explicitly open the first file to get the information such as feature dimension
if (featdim == 0)
{
@ -181,21 +181,29 @@ class minibatchutterancesourcemulti : public minibatchsource
fprintf(stderr, "requiredata: %d utterances read\n", (int)utteranceset.size());
if (verbosity > 2)
{
if (p_pageintimer != nullptr)
if (pageintimer != nullptr)
{
double pageintime = (double)(*p_pageintimer);
double pageintime = (double)(*pageintimer);
#ifdef _MSC_VER
fprintf(stderr, "Chunk read statistics; Total time = %.8g, Num Frames read = %Iu, Num bytes per frame = %Iu, Avg I/O bandwidth = %.2g MB/sec).\n",
pageintime, totalframes, featdim * sizeof(float), (double)(featdim * sizeof(float) * totalframes / 1024 / 1024 / pageintime));
#else
fprintf(stderr, "Chunk read statistics; Total time = %.8g, Num Frames read = %zu, Num bytes per frame = %zu, Avg I/O bandwidth = %.2g MB/sec).\n",
pageintime, totalframes, featdim * sizeof(float), (double)(featdim * sizeof(float) * totalframes / 1024 / 1024 / pageintime));
#endif
}
}
}
}
catch (...)
{
releasedata();
// Clean up in a non-throwable way in order not to hide the original exception.
cleandata();
throw;
}
}
// page out data for this chunk
void releasedata() const
{
@ -203,6 +211,12 @@ class minibatchutterancesourcemulti : public minibatchsource
LogicError("releasedata: cannot page out virgin block");
if (!isinram())
LogicError("releasedata: called when data is not memory");
cleandata();
}
private:
void cleandata() const
{
// release frames
frames.resize(0, 0);
// release lattice data

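The page-in timer above moves from a raw new'ed pointer to std::unique_ptr, so the timer is released even if reading throws. A minimal sketch of the same pattern with a stand-in timer type (auto_timer itself is CNTK-internal):

#include <chrono>
#include <cstdio>
#include <memory>

// Stand-in for the internal auto_timer: measures elapsed seconds since construction.
class ScopedTimer
{
    std::chrono::steady_clock::time_point m_start = std::chrono::steady_clock::now();
public:
    double Elapsed() const
    {
        return std::chrono::duration<double>(std::chrono::steady_clock::now() - m_start).count();
    }
};

void ReadChunk(int verbosity)
{
    std::unique_ptr<ScopedTimer> timer;   // owns nothing unless verbose
    if (verbosity > 2)
        timer.reset(new ScopedTimer());   // freed automatically, even on exceptions
    // ... read the chunk; this may throw ...
    if (timer)
        std::printf("chunk read in %.3f seconds\n", timer->Elapsed());
}
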
Просмотреть файл

@ -22,7 +22,7 @@ public:
virtual ~ByteReader() = default;
virtual void Register(size_t seqId, const std::string& path) = 0;
virtual cv::Mat Read(size_t seqId, const std::string& path) = 0;
virtual cv::Mat Read(size_t seqId, const std::string& path, bool grayscale) = 0;
DISABLE_COPY_AND_MOVE(ByteReader);
};
@ -31,7 +31,7 @@ class FileByteReader : public ByteReader
{
public:
void Register(size_t, const std::string&) override {}
cv::Mat Read(size_t seqId, const std::string& path) override;
cv::Mat Read(size_t seqId, const std::string& path, bool grayscale) override;
};
#ifdef USE_ZIP
@ -41,7 +41,7 @@ public:
ZipByteReader(const std::string& zipPath);
void Register(size_t seqId, const std::string& path) override;
cv::Mat Read(size_t seqId, const std::string& path) override;
cv::Mat Read(size_t seqId, const std::string& path, bool grayscale) override;
private:
using ZipPtr = std::unique_ptr<zip_t, void(*)(zip_t*)>;

Просмотреть файл

@ -77,6 +77,7 @@ ImageConfigHelper::ImageConfigHelper(const ConfigParameters& config)
m_mapPath = config(L"file");
m_grayscale = config(L"grayscale", false);
std::string rand = config(L"randomize", "auto");
if (AreEqualIgnoreCase(rand, "auto"))

Просмотреть файл

@ -46,6 +46,11 @@ public:
return m_randomize;
}
bool UseGrayscale() const
{
return m_grayscale;
}
bool IsMultiViewCrop() const
{
return m_multiViewCrop;
@ -61,6 +66,7 @@ private:
int m_cpuThreadCount;
bool m_randomize;
bool m_multiViewCrop;
bool m_grayscale;
};
typedef std::shared_ptr<ImageConfigHelper> ImageConfigHelperPtr;

Просмотреть файл

@ -77,7 +77,7 @@ public:
const auto& imageSequence = m_description;
auto image = std::make_shared<DeserializedImage>();
image->m_image = std::move(m_parent.ReadImage(m_description.m_id, imageSequence.m_path));
image->m_image = std::move(m_parent.ReadImage(m_description.m_id, imageSequence.m_path, m_parent.m_grayscale));
auto& cvImage = image->m_image;
if (!cvImage.data)
@ -119,6 +119,7 @@ ImageDataDeserializer::ImageDataDeserializer(const ConfigParameters& config)
ImageConfigHelper configHelper(config);
m_streams = configHelper.GetStreams();
assert(m_streams.size() == 2);
m_grayscale = configHelper.UseGrayscale();
const auto& label = m_streams[configHelper.GetLabelStreamId()];
const auto& feature = m_streams[configHelper.GetFeatureStreamId()];
@ -214,8 +215,8 @@ void ImageDataDeserializer::CreateSequenceDescriptions(std::string mapPath, size
description.m_chunkId = curId;
description.m_path = imagePath;
description.m_classId = cid;
description.m_key.m_major = description.m_id;
description.m_key.m_minor = 0;
description.m_key.m_sequence = description.m_id;
description.m_key.m_sample = 0;
m_imageSequences.push_back(description);
RegisterByteReader(description.m_id, description.m_path, knownReaders);
@ -266,20 +267,23 @@ void ImageDataDeserializer::RegisterByteReader(size_t seqId, const std::string&
#endif
}
cv::Mat ImageDataDeserializer::ReadImage(size_t seqId, const std::string& path)
cv::Mat ImageDataDeserializer::ReadImage(size_t seqId, const std::string& path, bool grayscale)
{
assert(!path.empty());
ImageDataDeserializer::SeqReaderMap::const_iterator r;
if (m_readers.empty() || (r = m_readers.find(seqId)) == m_readers.end())
return m_defaultReader.Read(seqId, path);
return (*r).second->Read(seqId, path);
return m_defaultReader.Read(seqId, path, grayscale);
return (*r).second->Read(seqId, path, grayscale);
}
cv::Mat FileByteReader::Read(size_t, const std::string& path)
cv::Mat FileByteReader::Read(size_t, const std::string& path, bool grayscale)
{
assert(!path.empty());
if (grayscale)
return cv::imread(path, cv::IMREAD_GRAYSCALE);
else
return cv::imread(path, cv::IMREAD_COLOR);
}
}}}

Просмотреть файл

@ -56,10 +56,13 @@ private:
// Element type of the feature/label stream (currently float/double only).
ElementType m_featureElementType;
// whether images shall be loaded in grayscale
bool m_grayscale;
// Not using nocase_compare here as it's not correct on Linux.
using PathReaderMap = std::unordered_map<std::string, std::shared_ptr<ByteReader>>;
void RegisterByteReader(size_t seqId, const std::string& path, PathReaderMap& knownReaders);
cv::Mat ReadImage(size_t seqId, const std::string& path);
cv::Mat ReadImage(size_t seqId, const std::string& path, bool grayscale);
// REVIEW alexeyk: can potentially use vector instead of map. Need to handle default reader and resizing though.
using SeqReaderMap = std::unordered_map<size_t, std::shared_ptr<ByteReader>>;

Просмотреть файл

@ -38,13 +38,17 @@ ImageReader::ImageReader(MemoryProviderPtr provider,
auto deserializer = std::make_shared<ImageDataDeserializer>(config);
TransformerPtr randomizer;
// Request multi-threaded randomizer operation to speed up CPU-intensive image-decoding and transformations.
const bool multithreadedGetNextSequences = true;
if (configHelper.ShouldRandomize())
{
randomizer = std::make_shared<BlockRandomizer>(0, 1, deserializer, BlockRandomizer::DecimationMode::sequence, false);
// We do not use legacy randomization.
bool useLegacyRandomization = false;
randomizer = std::make_shared<BlockRandomizer>(0, 1, deserializer, BlockRandomizer::DecimationMode::sequence, useLegacyRandomization, multithreadedGetNextSequences);
}
else
{
randomizer = std::make_shared<NoRandomizer>(deserializer);
randomizer = std::make_shared<NoRandomizer>(deserializer, multithreadedGetNextSequences);
}
randomizer->Initialize(nullptr, config);
@ -66,6 +70,11 @@ ImageReader::ImageReader(MemoryProviderPtr provider,
}
m_transformer = last;
m_packer = std::make_shared<FramePacker>(
m_provider,
m_transformer,
m_streams);
}
std::vector<StreamDescriptionPtr> ImageReader::GetStreamDescriptions()
@ -76,17 +85,13 @@ std::vector<StreamDescriptionPtr> ImageReader::GetStreamDescriptions()
void ImageReader::StartEpoch(const EpochConfiguration& config)
{
if (config.m_totalEpochSizeInSamples <= 0)
if (config.m_totalEpochSizeInSamples == 0)
{
RuntimeError("Unsupported minibatch size '%u'.", (int)config.m_totalEpochSizeInSamples);
RuntimeError("Epoch size cannot be 0.");
}
m_transformer->StartEpoch(config);
m_packer = std::make_shared<FramePacker>(
m_provider,
m_transformer,
config.m_minibatchSizeInSamples,
m_streams);
m_packer->StartEpoch(config);
}
Minibatch ImageReader::ReadMinibatch()

Просмотреть файл

@ -75,7 +75,7 @@
<Link>
<SubSystem>Windows</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>ReaderLib.lib;Math.lib;$(OpenCVLib);$(ZipLibs);%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>ReaderLib.lib;Common.lib;Math.lib;$(OpenCVLib);$(ZipLibs);%(AdditionalDependencies)</AdditionalDependencies>
</Link>
<PostBuildEvent>
<Command>if "$(HasOpenCv)" == "true" xcopy /I /D /Y "$(OPENCV_PATH)\x64\vc12\bin\opencv_world300.dll" "$(TargetDir)"
@ -126,18 +126,6 @@ if "$(UseZip)" == "true" if exist "$(ZLIB_PATH)\bin\zlib1.dll" (xcopy /I /D /Y "
<ClInclude Include="targetver.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\Common\DataReader.cpp" />
<ClCompile Include="..\..\Common\ExceptionWithCallStack.cpp" />
<ClCompile Include="..\..\Common\File.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\fileutil.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\Config.cpp">
<PrecompiledHeader Condition="$(DebugBuild)">NotUsing</PrecompiledHeader>
<PrecompiledHeader Condition="$(ReleaseBuild)">NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="ImageConfigHelper.cpp" />
<ClCompile Include="ImageDataDeserializer.cpp" />
<ClCompile Include="dllmain.cpp" />

Просмотреть файл

@ -56,7 +56,7 @@ void ZipByteReader::Register(size_t seqId, const std::string& path)
m_zips.push(std::move(zipFile));
}
cv::Mat ZipByteReader::Read(size_t seqId, const std::string& path)
cv::Mat ZipByteReader::Read(size_t seqId, const std::string& path, bool grayscale)
{
// Find index of the file in .zip file.
auto r = m_seqIdToIndex.find(seqId);
@ -99,7 +99,11 @@ cv::Mat ZipByteReader::Read(size_t seqId, const std::string& path)
}
m_zips.push(std::move(zipFile));
cv::Mat img = cv::imdecode(cv::Mat(1, (int)size, CV_8UC1, contents.data()), cv::IMREAD_COLOR);
cv::Mat img;
if (grayscale)
img = cv::imdecode(cv::Mat(1, (int)size, CV_8UC1, contents.data()), cv::IMREAD_GRAYSCALE);
else
img = cv::imdecode(cv::Mat(1, (int)size, CV_8UC1, contents.data()), cv::IMREAD_COLOR);
assert(nullptr != img.data);
m_workspace.push(std::move(contents));
return img;

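Both byte readers now take a grayscale flag and pick the OpenCV decode mode accordingly. A small usage sketch; it assumes OpenCV is available and that `buffer` holds an encoded image (JPEG/PNG) in memory, as in the zip reader above:

#include <opencv2/opencv.hpp>
#include <vector>

// Decodes an in-memory image either as 1-channel grayscale or 3-channel BGR.
cv::Mat DecodeImage(const std::vector<unsigned char>& buffer, bool grayscale)
{
    int flags = grayscale ? cv::IMREAD_GRAYSCALE : cv::IMREAD_COLOR;
    cv::Mat img = cv::imdecode(buffer, flags); // vector<uchar> is accepted as InputArray
    // Grayscale yields a 1-channel CV_8UC1 matrix, color a 3-channel CV_8UC3 (BGR) one,
    // so downstream transforms and mean files must agree with the chosen channel count.
    return img;
}
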
Просмотреть файл

@ -69,7 +69,7 @@
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>Math.lib;Common.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(ReleaseBuild)">
@ -90,7 +90,7 @@
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>Math.lib;Common.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<Profile>true</Profile>
</Link>
</ItemDefinitionGroup>
@ -107,18 +107,6 @@
<ClInclude Include="SequenceParser.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\Common\DataReader.cpp" />
<ClCompile Include="..\..\Common\DataWriter.cpp" />
<ClCompile Include="..\..\Common\ExceptionWithCallStack.cpp" />
<ClCompile Include="..\..\Common\File.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\fileutil.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\Config.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="Exports.cpp" />
<ClCompile Include="dllmain.cpp">
<CompileAsManaged Condition="$(DebugBuild)">false</CompileAsManaged>

Просмотреть файл

@ -72,7 +72,7 @@
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>Math.lib;Common.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(ReleaseBuild)">
@ -93,7 +93,7 @@
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>Math.lib;Common.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<Profile>true</Profile>
</Link>
</ItemDefinitionGroup>
@ -109,17 +109,6 @@
<ClInclude Include="LUSequenceParser.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\Common\DataReader.cpp" />
<ClCompile Include="..\..\Common\ExceptionWithCallStack.cpp" />
<ClCompile Include="..\..\Common\File.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\fileutil.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\Config.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="DataWriterLocal.cpp" />
<ClCompile Include="Exports.cpp" />
<ClCompile Include="dllmain.cpp">

Просмотреть файл

@ -69,7 +69,7 @@
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>Math.lib;Common.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(ReleaseBuild)">
@ -90,7 +90,7 @@
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>Math.lib;Common.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<Profile>true</Profile>
</Link>
</ItemDefinitionGroup>
@ -105,27 +105,6 @@
<ClInclude Include="targetver.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\Common\DataReader.cpp">
<PrecompiledHeader Condition="$(DebugBuild)">NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\DataWriter.cpp">
<PrecompiledHeader Condition="$(DebugBuild)">NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\ExceptionWithCallStack.cpp">
<PrecompiledHeader Condition="$(DebugBuild)">NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\File.cpp">
<PrecompiledHeader Condition="$(DebugBuild)">NotUsing</PrecompiledHeader>
<PrecompiledHeader Condition="$(ReleaseBuild)">NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\fileutil.cpp">
<PrecompiledHeader Condition="$(DebugBuild)">NotUsing</PrecompiledHeader>
<PrecompiledHeader Condition="$(ReleaseBuild)">NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\Config.cpp">
<PrecompiledHeader Condition="$(DebugBuild)">NotUsing</PrecompiledHeader>
<PrecompiledHeader Condition="$(ReleaseBuild)">NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="dllmain.cpp">
<PrecompiledHeader Condition="$(DebugBuild)">NotUsing</PrecompiledHeader>
</ClCompile>

Просмотреть файл

@ -21,7 +21,8 @@ BlockRandomizer::BlockRandomizer(
size_t randomizationRangeInSamples,
IDataDeserializerPtr deserializer,
DecimationMode decimationMode,
bool useLegacyRandomization)
bool useLegacyRandomization,
bool multithreadedGetNextSequence)
: m_verbosity(verbosity),
m_deserializer(deserializer),
m_decimationMode(decimationMode),
@ -31,7 +32,8 @@ BlockRandomizer::BlockRandomizer(
m_epochStartPosition(0),
m_sweepTotalNumberOfSamples(0),
m_lastSeenChunkId(SIZE_MAX),
m_chunkRandomizer(std::make_shared<ChunkRandomizer>(deserializer, randomizationRangeInSamples, useLegacyRandomization))
m_chunkRandomizer(std::make_shared<ChunkRandomizer>(deserializer, randomizationRangeInSamples, useLegacyRandomization)),
m_multithreadedGetNextSequences(multithreadedGetNextSequence)
{
assert(deserializer != nullptr);
@ -116,11 +118,7 @@ Sequences BlockRandomizer::GetNextSequences(size_t sampleCount)
result.m_data.resize(m_streams.size(), std::vector<SequenceDataPtr>(decimated.size()));
// TODO: This will be changed, when we move transformers under the randomizer.
// TODO: Randomizer should not deal with multithreading.
#pragma omp parallel for ordered schedule(dynamic)
for (int i = 0; i < decimated.size(); ++i)
{
auto process = [&](int i) -> void {
const auto& description = decimated[i];
std::vector<SequenceDataPtr> sequence;
auto it = m_chunks.find(description.m_chunk->m_chunkId);
@ -134,6 +132,19 @@ Sequences BlockRandomizer::GetNextSequences(size_t sampleCount)
{
result.m_data[j][i] = sequence[j];
}
};
// TODO: This will change once transformers move under the randomizer; multithreading should not be handled here.
if (m_multithreadedGetNextSequences)
{
#pragma omp parallel for schedule(dynamic)
for (int i = 0; i < decimated.size(); ++i)
process(i);
}
else
{
for (int i = 0; i < decimated.size(); ++i)
process(i);
}
m_sequenceRandomizer->ReleaseChunks();

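The randomizer now wraps the per-sequence work in a `process` lambda and only parallelizes the loop when multithreaded fetching was requested, so both paths share one body. A stripped-down sketch of that pattern (the work itself is just a placeholder here):

#include <cstddef>
#include <vector>

void FillResults(std::vector<double>& results, bool multithreaded)
{
    // Single definition of the per-item work, shared by both paths.
    auto process = [&](int i) { results[i] = i * 0.5; };

    if (multithreaded)
    {
        // The pragma is ignored when OpenMP is disabled, so this stays portable.
        #pragma omp parallel for schedule(dynamic)
        for (int i = 0; i < (int)results.size(); ++i)
            process(i);
    }
    else
    {
        for (int i = 0; i < (int)results.size(); ++i)
            process(i);
    }
}
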
Просмотреть файл

@ -47,7 +47,8 @@ public:
size_t randomizationRangeInSamples,
IDataDeserializerPtr deserializer,
DecimationMode decimationMode = DecimationMode::chunk,
bool useLegacyRandomization = false);
bool useLegacyRandomization = false,
bool multithreadedGetNextSequences = false);
virtual void Initialize(TransformerPtr, const ConfigParameters&) override {};
@ -118,6 +119,10 @@ private:
// Decimation mode.
DecimationMode m_decimationMode;
// Whether to get sequences using multiple threads.
// TODO temporary; should go away when transformers are moved closer to the deserializer
bool m_multithreadedGetNextSequences;
// General configuration
int m_verbosity;
};

Просмотреть файл

@ -10,11 +10,22 @@
namespace Microsoft { namespace MSR { namespace CNTK {
// Sequence key, used for correlations between sequences between different deserializers.
// Sequence key, used for correlations of sequences between different deserializers.
// TODO: In many cases sequence keys share the same prefix. Splitting the sequence key on
// sequence prefix and suffix will allow us to store keys more efficiently.
// The sample identifies a particular sample inside the sequence. In the future it will be hidden, so that deserializers won't know about
// sequence or sample mode, exposing only sequences.
struct KeyType
{
size_t m_major;
size_t m_minor;
// Possible sequence common prefix.
// size_t m_prefix;
// Identifies sequence between different deserializers.
size_t m_sequence;
// Sample id.
size_t m_sample;
};
class Chunk;

Просмотреть файл

@ -16,9 +16,8 @@ public:
FramePacker(
MemoryProviderPtr memoryProvider,
TransformerPtr transformer,
size_t minibatchSize,
const std::vector<StreamDescriptionPtr>& streams) :
SequencePacker(memoryProvider, transformer, minibatchSize, streams)
SequencePacker(memoryProvider, transformer, streams)
{
}

Просмотреть файл

@ -11,13 +11,14 @@
namespace Microsoft { namespace MSR { namespace CNTK {
NoRandomizer::NoRandomizer(IDataDeserializerPtr deserializer)
NoRandomizer::NoRandomizer(IDataDeserializerPtr deserializer, bool multithreadedGetNextSequences)
: m_deserializer(deserializer),
m_samplePositionInEpoch(0),
m_currentChunkPosition(SIZE_MAX),
m_globalSamplePosition(0),
m_totalNumberOfSamples(0),
m_currentSequencePositionInChunk(0)
m_currentSequencePositionInChunk(0),
m_multithreadedGetNextSequences(multithreadedGetNextSequences)
{
assert(deserializer != nullptr);
m_streams = m_deserializer->GetStreamDescriptions();
@ -172,22 +173,60 @@ Sequences NoRandomizer::GetNextSequences(size_t sampleCount)
}
result.m_data.resize(m_streams.size(), std::vector<SequenceDataPtr>(subsetSize));
for (int i = 0; i < subsetSize; ++i)
// Collect all the chunks that we need
std::map<size_t, ChunkPtr> chunks;
if (m_currentChunk != nullptr)
{
std::vector<SequenceDataPtr> sequence;
const auto& sequenceDescription = descriptions[start + i];
if (sequenceDescription.m_chunkId != m_currentChunkId)
{
m_currentChunk = m_deserializer->GetChunk(sequenceDescription.m_chunkId);
m_currentChunkId = sequenceDescription.m_chunkId;
chunks[m_currentChunkId] = m_currentChunk;
}
m_currentChunk->GetSequence(sequenceDescription.m_id, sequence);
for (int i = 0; i < subsetSize; ++i)
{
const auto& sequenceDescription = descriptions[start + i];
auto it = chunks.find(sequenceDescription.m_chunkId);
if (it == chunks.end())
{
chunks[sequenceDescription.m_chunkId] = m_deserializer->GetChunk(sequenceDescription.m_chunkId);
}
}
auto process = [&](int i) -> void {
std::vector<SequenceDataPtr> sequence;
const auto& sequenceDescription = descriptions[start + i];
auto it = chunks.find(sequenceDescription.m_chunkId);
if (it == chunks.end())
{
LogicError("Invalid chunk requested.");
}
it->second->GetSequence(sequenceDescription.m_id, sequence);
for (int j = 0; j < m_streams.size(); ++j)
{
result.m_data[j][i] = sequence[j];
}
};
// TODO: This will change once transformers move under the (no-)randomizer; multithreading should not be handled here.
if (m_multithreadedGetNextSequences)
{
#pragma omp parallel for schedule(dynamic)
for (int i = 0; i < subsetSize; ++i)
process(i);
}
else
{
for (int i = 0; i < subsetSize; ++i)
process(i);
}
// Keep the last chunk for next time
m_currentChunkId = descriptions[start + subsetSize - 1].m_chunkId;
auto it = chunks.find(m_currentChunkId);
assert(it != chunks.end());
m_currentChunk = it->second;
return result;
}
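
GetNextSequences now gathers every chunk the minibatch needs into a map up front, so the possibly parallel per-sequence loop only does look-ups and GetChunk is never called concurrently. A reduced sketch of that two-phase structure, with placeholder types standing in for the deserializer and chunk interfaces:

#include <cstddef>
#include <functional>
#include <map>
#include <memory>
#include <vector>

struct Chunk { /* holds the decoded data of one chunk */ };
using ChunkPtr = std::shared_ptr<Chunk>;

struct SequenceInfo { std::size_t m_chunkId; };

// Phase 1: serially fetch each distinct chunk exactly once.
// Phase 2 (not shown): per-sequence work, possibly parallel, only reads from the map.
void PrefetchChunks(const std::vector<SequenceInfo>& descriptions,
                    const std::function<ChunkPtr(std::size_t)>& getChunk,
                    std::map<std::size_t, ChunkPtr>& chunks)
{
    for (const auto& d : descriptions)
        if (chunks.find(d.m_chunkId) == chunks.end())
            chunks[d.m_chunkId] = getChunk(d.m_chunkId);
}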

Просмотреть файл

@ -21,7 +21,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
class NoRandomizer : public Transformer
{
public:
NoRandomizer(IDataDeserializerPtr deserializer);
NoRandomizer(IDataDeserializerPtr deserializer, bool multithreadedGetNextSequences = false);
virtual void Initialize(TransformerPtr next, const ConfigParameters& readerConfig) override;
virtual void StartEpoch(const EpochConfiguration& config) override;
@ -43,6 +43,10 @@ private:
IDataDeserializerPtr m_deserializer;
// Whether to get sequences using multiple threads.
// TODO temporary; should go away when transformers are moved closer to the deserializer
bool m_multithreadedGetNextSequences;
// Stream descriptions
std::vector<StreamDescriptionPtr> m_streams;

Просмотреть файл

@ -15,6 +15,9 @@ namespace Microsoft { namespace MSR { namespace CNTK {
class Packer
{
public:
// Sets current epoch configuration.
virtual void StartEpoch(const EpochConfiguration& config) = 0;
virtual Minibatch ReadMinibatch() = 0;
virtual ~Packer() {}
};

Просмотреть файл

@ -24,23 +24,26 @@ void PackerBase::StreamBuffer::Resize(size_t newSize)
});
}
void PackerBase::StartEpoch(const EpochConfiguration& config)
{
m_minibatchSize = config.m_minibatchSizeInSamples;
if (m_minibatchSize == 0)
{
LogicError("Minibatch size cannot be zero.");
}
}
PackerBase::PackerBase(MemoryProviderPtr memoryProvider,
TransformerPtr transformer,
size_t minibatchSize,
const std::vector<StreamDescriptionPtr>& streams) :
m_transformer(transformer),
m_minibatchSize(minibatchSize),
m_minibatchSize(0),
m_outputStreamDescriptions(streams)
{
m_inputStreamDescriptions = m_transformer->GetStreamDescriptions();
assert(m_inputStreamDescriptions.size() != 0);
assert(m_inputStreamDescriptions.size() == m_outputStreamDescriptions.size());
if (m_minibatchSize == 0)
{
LogicError("Minibatch size cannot be zero.");
}
m_streamBuffers.reserve(m_outputStreamDescriptions.size());
// Sanity checks:

Просмотреть файл
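Packers are now constructed once without a minibatch size and receive it per epoch through StartEpoch, matching the readers above that create the packer in their constructors. A minimal sketch of that split, with the validation moved into StartEpoch:

#include <cstddef>
#include <stdexcept>

struct EpochConfig
{
    std::size_t m_minibatchSizeInSamples = 0;
};

class PackerSketch
{
    std::size_t m_minibatchSize = 0; // unknown until the first epoch starts
public:
    void StartEpoch(const EpochConfig& config)
    {
        if (config.m_minibatchSizeInSamples == 0)
            throw std::logic_error("Minibatch size cannot be zero.");
        m_minibatchSize = config.m_minibatchSizeInSamples;
    }
    std::size_t MinibatchSize() const { return m_minibatchSize; }
};
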

@ -35,7 +35,6 @@ protected:
PackerBase(MemoryProviderPtr memoryProvider,
TransformerPtr transformer,
size_t minibatchSize,
const std::vector<StreamDescriptionPtr>& streams);
typedef std::vector<SequenceDataPtr> StreamBatch;
@ -71,6 +70,10 @@ protected:
// Minibatch size in samples.
size_t m_minibatchSize;
public:
// Sets current epoch configuration.
virtual void StartEpoch(const EpochConfiguration& config) override;
};
inline void PackerBase::PackSparseSampleAsDense(char* destination, SparseSequenceDataPtr sequence,

Просмотреть файл

@ -29,6 +29,7 @@ struct EpochConfiguration
size_t m_minibatchSizeInSamples; // Maximum minibatch size for the epoch in samples
size_t m_totalEpochSizeInSamples; // Total size of the epoch in samples
size_t m_epochIndex; // Current epoch index [0 .. max number of epochs)
size_t m_truncationSize; // Truncation size in samples for truncated BPTT mode.
};
// Supported primitive element types, will be extended in the future.

Просмотреть файл

@ -22,20 +22,11 @@
<ProjectGuid>{F0A9637C-20DA-42F0-83D4-23B4704DE602}</ProjectGuid>
<RootNamespace>ReaderLib</RootNamespace>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<Import Project="$(SolutionDir)\CNTK.Cpp.props" />
<PropertyGroup Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<PlatformToolset>v120</PlatformToolset>
<CharacterSet>MultiByte</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<Import Project="$(SolutionDir)\CNTK.Cpp.props" />
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
@ -45,39 +36,10 @@
<PropertyGroup Label="UserMacros" />
<ItemDefinitionGroup>
<ClCompile>
<WarningLevel>Level4</WarningLevel>
<SDLCheck>true</SDLCheck>
<TreatWarningAsError>true</TreatWarningAsError>
<AdditionalIncludeDirectories>$(SolutionDir)Source\Common\Include;$(SolutionDir)Source\Math</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<GenerateDebugInformation>true</GenerateDebugInformation>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup>
<ClCompile>
<OpenMPSupport>true</OpenMPSupport>
</ClCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(DebugBuild)">
<ClCompile>
<Optimization>Disabled</Optimization>
</ClCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(ReleaseBuild)">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</ClCompile>
<Link>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClInclude Include="BpttPacker.h" />
<ClInclude Include="Bundler.h" />
<ClInclude Include="ChunkRandomizer.h" />
<ClInclude Include="DataDeserializerBase.h" />
@ -98,9 +60,9 @@
<ClInclude Include="Reader.h" />
<ClInclude Include="ReaderShim.h" />
<ClInclude Include="Transformer.h" />
<ClInclude Include="TruncatedBpttPacker.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="BpttPacker.cpp" />
<ClCompile Include="Bundler.cpp" />
<ClCompile Include="ChunkRandomizer.cpp" />
<ClCompile Include="NoRandomizer.cpp" />
@ -110,6 +72,7 @@
<ClCompile Include="ReaderShim.cpp" />
<ClCompile Include="SequencePacker.cpp" />
<ClCompile Include="SequenceRandomizer.cpp" />
<ClCompile Include="TruncatedBpttPacker.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">

View file

@ -61,7 +61,7 @@
<ClInclude Include="FramePacker.h">
<Filter>Packers</Filter>
</ClInclude>
<ClInclude Include="BpttPacker.h">
<ClInclude Include="TruncatedBpttPacker.h">
<Filter>Packers</Filter>
</ClInclude>
</ItemGroup>
@ -93,7 +93,7 @@
<ClCompile Include="FramePacker.cpp">
<Filter>Packers</Filter>
</ClCompile>
<ClCompile Include="BpttPacker.cpp">
<ClCompile Include="TruncatedBpttPacker.cpp">
<Filter>Packers</Filter>
</ClCompile>
</ItemGroup>

View file

@ -61,6 +61,12 @@ void ReaderShim<ElemType>::StartDistributedMinibatchLoop(
size_t numSubsets,
size_t requestedEpochSamples /*= requestDataSize*/)
{
// For adaptive minibatch, make sure there are no outstanding reads.
if (m_prefetchTask.valid())
{
m_prefetchTask.wait();
}
EpochConfiguration config;
config.m_workerRank = subsetNum;
config.m_numberOfWorkers = numSubsets;
@ -71,12 +77,6 @@ void ReaderShim<ElemType>::StartDistributedMinibatchLoop(
m_reader->StartEpoch(config);
m_endOfEpoch = false;
// For adaptive minibatch, make sure there are no outstanding reads.
if (m_prefetchTask.valid())
{
m_prefetchTask.wait();
}
m_prefetchTask = std::async(m_launchType, [this]()
{
return m_reader->ReadMinibatch();
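The reordering above is the substance of this hunk: the wait on m_prefetchTask now happens before the new EpochConfiguration is built and applied, so an in-flight read from the previous epoch (possibly issued with a different, adaptive minibatch size) completes before the reader is reconfigured. A self-contained sketch of the same prefetch pattern, using only std::async/std::future and illustrative payloads instead of CNTK types:

#include <future>
#include <iostream>

int main()
{
    std::future<int> prefetch; // stands in for m_prefetchTask

    // A read issued under the previous epoch configuration.
    prefetch = std::async(std::launch::async, [] { return 42; });

    // Reconfiguration point: drain the outstanding read first
    // (ReadMinibatch is reduced to returning an int here), ...
    if (prefetch.valid())
        std::cout << "last minibatch of previous epoch: " << prefetch.get() << "\n";

    // ... then it is safe to apply the new configuration and issue the next prefetch.
    prefetch = std::async(std::launch::async, [] { return 43; });
    std::cout << "first minibatch of new epoch: " << prefetch.get() << "\n";
    return 0;
}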

View file

@ -17,9 +17,8 @@ public:
SequencePacker(
MemoryProviderPtr memoryProvider,
TransformerPtr transformer,
size_t minibatchSize,
const std::vector<StreamDescriptionPtr>& streams) :
PackerBase(memoryProvider, transformer, minibatchSize, streams)
PackerBase(memoryProvider, transformer, streams)
{
}

View file

@ -76,11 +76,16 @@ namespace Microsoft { namespace MSR { namespace CNTK {
std::vector<RandomizedSequenceDescription> result;
result.reserve(sampleCount);
bool firstSequence = true;
while (samples > 0 && m_currentChunkCursor < m_randomizedChunks.size())
{
size_t sequenceOffsetInsideChunk = m_currentSequenceCursor - m_randomizedChunks[m_currentChunkCursor].m_sequencePositionStart;
RandomizedSequenceDescription* sequence = &m_sequenceWindow[m_currentChunkCursor - m_chunkWindowBegin][sequenceOffsetInsideChunk];
if (firstSequence || samples >= (int)sequence->m_numberOfSamples)
{
firstSequence = false;
result.push_back(*sequence);
samples -= (int)sequence->m_numberOfSamples;
m_currentSequenceCursor++;
m_currentSampleCursor += (int)sequence->m_numberOfSamples;
@ -89,24 +94,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Moving to the next chunk.
MoveChunkCursor();
}
}
while (samples > 0 && m_currentChunkCursor < m_randomizedChunks.size())
{
sequenceOffsetInsideChunk = m_currentSequenceCursor - m_randomizedChunks[m_currentChunkCursor].m_sequencePositionStart;
sequence = &m_sequenceWindow[m_currentChunkCursor - m_chunkWindowBegin][sequenceOffsetInsideChunk];
if (samples - sequence->m_numberOfSamples >= 0)
{
result.push_back(*sequence);
m_currentSequenceCursor++;
// Always decrease the available number of samples.
samples -= (int)sequence->m_numberOfSamples;
m_currentSampleCursor += (int)sequence->m_numberOfSamples;
if (sequenceOffsetInsideChunk + 1 >= m_randomizedChunks[m_currentChunkCursor].m_original->m_numberOfSequences)
{
// Moving to the next chunk.
MoveChunkCursor();
}
}
}
return result;
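The merged loop above always accepts the first sequence, even when it is longer than the remaining sample budget, and then keeps accepting sequences only while they still fit; this guarantees a non-empty result when the very first sequence alone exceeds the requested sample count. A standalone sketch of that selection rule with made-up lengths (chunk-window handling omitted as a simplification):

#include <cstdio>
#include <vector>

int main()
{
    std::vector<int> sequenceLengths = {35, 10, 12, 40}; // samples per sequence (example)
    int samples = 50;                                    // requested sample count
    bool firstSequence = true;

    for (size_t cursor = 0; samples > 0 && cursor < sequenceLengths.size(); ++cursor)
    {
        int len = sequenceLengths[cursor];
        if (!firstSequence && samples < len)
            break; // does not fit into the remaining budget; stop (sketch simplification)

        firstSequence = false;
        samples -= len;
        std::printf("picked sequence %zu (%d samples), remaining budget %d\n",
                    cursor, len, samples);
    }
    return 0;
}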

View file

@ -56,7 +56,7 @@ public:
size_t streamId = appliedStreamIds[j];
auto& allSamples = samples.m_data[streamId];
#pragma omp parallel for ordered schedule(dynamic)
#pragma omp parallel for schedule(dynamic)
for (int i = 0; i < allSamples.size(); ++i)
{
allSamples[i] = Apply(allSamples[i], *m_inputStreams[streamId], *outputStreams[streamId]);

View file

@ -7,7 +7,7 @@
#define _SCL_SECURE_NO_WARNINGS
#include <deque>
#include "BpttPacker.h"
#include "TruncatedBpttPacker.h"
#include "ElementTypeUtils.h"
namespace Microsoft { namespace MSR { namespace CNTK {
@ -105,24 +105,51 @@ struct SequenceBuffer
vector<Slot> m_slots;
};
BpttPacker::BpttPacker(
TruncatedBPTTPacker::TruncatedBPTTPacker(
MemoryProviderPtr memoryProvider,
TransformerPtr transformer,
size_t minibatchSize,
size_t truncationSize,
const vector<StreamDescriptionPtr>& streams)
: PackerBase(memoryProvider, transformer, minibatchSize, streams),
m_truncationSize(truncationSize)
: PackerBase(memoryProvider, transformer, streams),
m_truncationSize(0)
{
auto sparseOutput = find_if(m_outputStreamDescriptions.begin(), m_outputStreamDescriptions.end(), [](const StreamDescriptionPtr& s){ return s->m_storageType == StorageType::sparse_csc; });
if (sparseOutput != m_outputStreamDescriptions.end())
{
// TODO: add support for sparse.
RuntimeError("Sparse output is not supported in BPTT mode.");
}
// Preparing layouts.
for (int i = 0; i < m_outputStreamDescriptions.size(); ++i)
{
auto pMBLayout = make_shared<MBLayout>();
pMBLayout->SetUniqueAxisName(L"TruncatedBPTTPacker");
m_currentLayouts.push_back(pMBLayout);
}
}
void TruncatedBPTTPacker::StartEpoch(const EpochConfiguration& config)
{
if (m_minibatchSize != config.m_minibatchSizeInSamples ||
m_truncationSize != config.m_truncationSize)
{
m_minibatchSize = config.m_minibatchSizeInSamples;
m_truncationSize = config.m_truncationSize;
if (m_minibatchSize == 0)
{
LogicError("Minibatch size cannot be zero.");
}
if (m_truncationSize == 0)
{
LogicError("Truncation size cannot be zero.");
}
// Estimating the number of parallel sequences to pack (slots) from the minibatch size and truncation size.
m_numParallelSequences = max(1, (int)floor(m_minibatchSize / m_truncationSize));
m_sequenceBufferPerStream.clear();
// Preparing the buffers.
for (int i = 0; i < m_outputStreamDescriptions.size(); ++i)
{
@ -130,9 +157,7 @@ BpttPacker::BpttPacker(
auto& buffer = m_streamBuffers[i];
buffer.Resize(m_numParallelSequences * m_truncationSize * GetSampleSize(stream));
m_sequenceBufferPerStream.push_back(make_shared<SequenceBuffer>(m_numParallelSequences));
auto pMBLayout = make_shared<MBLayout>();
pMBLayout->SetUniqueAxisName(L"BpttPacker");
m_currentLayouts.push_back(pMBLayout);
}
}
// Filling in the initial set of sequences
@ -142,7 +167,7 @@ BpttPacker::BpttPacker(
}
}
Minibatch BpttPacker::ReadMinibatch()
Minibatch TruncatedBPTTPacker::ReadMinibatch()
{
Minibatch result;
@ -174,7 +199,7 @@ Minibatch BpttPacker::ReadMinibatch()
}
// Packs a slot of sequences into the minibatch.
void BpttPacker::PackSlot(size_t streamIndex, size_t slotIndex, size_t& sequenceId)
void TruncatedBPTTPacker::PackSlot(size_t streamIndex, size_t slotIndex, size_t& sequenceId)
{
auto& slot = m_sequenceBufferPerStream[streamIndex]->m_slots[slotIndex];
@ -274,7 +299,7 @@ void BpttPacker::PackSlot(size_t streamIndex, size_t slotIndex, size_t& sequence
}
}
void BpttPacker::ReadSequencesToSlot(size_t slotIndex)
void TruncatedBPTTPacker::ReadSequencesToSlot(size_t slotIndex)
{
const auto& slot = m_sequenceBufferPerStream.front()->m_slots[slotIndex];
while (m_truncationSize > slot.AvailableNumberOfSamples())
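To make the slot arithmetic in StartEpoch above concrete: the packer lays out floor(minibatchSize / truncationSize) parallel slots (at least one), and each stream buffer then holds slots * truncationSize samples. A standalone sketch with illustrative numbers:

#include <algorithm>
#include <cstdio>

int main()
{
    // Same computation as TruncatedBPTTPacker::StartEpoch above, with example values.
    size_t minibatchSize = 2048; // config.m_minibatchSizeInSamples
    size_t truncationSize = 20;  // config.m_truncationSize

    // Number of parallel sequences (slots) packed side by side.
    size_t numParallelSequences = std::max<size_t>(1, minibatchSize / truncationSize); // 102

    // Each stream buffer is resized to slots * truncation * per-sample size;
    // here: 102 * 20 = 2040 samples per truncated minibatch.
    std::printf("slots = %zu, samples per minibatch = %zu\n",
                numParallelSequences, numParallelSequences * truncationSize);
    return 0;
}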

View file

@ -18,18 +18,18 @@ typedef std::shared_ptr<SequenceBuffer> SequenceBufferPtr;
// A BPTT packer that densely packs samples in parallel for GPU consumption.
// TODO: Currently supports only packing of streams with sequences of equal length.
class BpttPacker : public PackerBase
class TruncatedBPTTPacker : public PackerBase
{
public:
BpttPacker(
TruncatedBPTTPacker(
MemoryProviderPtr memoryProvider,
TransformerPtr transformer,
size_t minibatchSize,
size_t truncationSize,
const std::vector<StreamDescriptionPtr>& streams);
virtual Minibatch ReadMinibatch() override;
virtual void StartEpoch(const EpochConfiguration& config) override;
private:
// Reads sequences to slot with the specified index.
// Number of slots = m_numParallelSequences
@ -65,6 +65,6 @@ private:
std::vector<MBLayoutPtr> m_currentLayouts;
};
typedef std::shared_ptr<BpttPacker> BpttPackerPtr;
typedef std::shared_ptr<TruncatedBPTTPacker> TruncatedBPTTPackerPtr;
}}}

View file

@ -72,7 +72,7 @@
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>Math.lib;Common.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(ReleaseBuild)">
@ -93,7 +93,7 @@
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>Math.lib;Common.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<Profile>true</Profile>
</Link>
</ItemDefinitionGroup>
@ -114,20 +114,6 @@
<ClInclude Include="targetver.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\Common\DataReader.cpp" />
<ClCompile Include="..\..\Common\DataWriter.cpp">
<PrecompiledHeader Condition="$(DebugBuild)">NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\ExceptionWithCallStack.cpp" />
<ClCompile Include="..\..\Common\File.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\fileutil.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\Config.cpp">
<PrecompiledHeader Condition="$(ReleaseBuild)">NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="dllmain.cpp" />
<ClCompile Include="SparsePCReader.cpp">
<PrecompiledHeader Condition="$(ReleaseBuild)">Use</PrecompiledHeader>

View file

@ -71,7 +71,7 @@
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>Math.lib;Common.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(ReleaseBuild)">
@ -92,7 +92,7 @@
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>Math.lib;Common.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<Profile>true</Profile>
</Link>
</ItemDefinitionGroup>
@ -108,18 +108,6 @@
<ClInclude Include="UCIParser.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\Common\DataReader.cpp" />
<ClCompile Include="..\..\Common\DataWriter.cpp" />
<ClCompile Include="..\..\Common\ExceptionWithCallStack.cpp" />
<ClCompile Include="..\..\Common\File.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\fileutil.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\..\Common\Config.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="Exports.cpp" />
<ClCompile Include="dllmain.cpp">
<CompileAsManaged Condition="$(DebugBuild)">false</CompileAsManaged>

View file

@ -8,6 +8,7 @@
#include "Basics.h"
#include "Matrix.h"
#include "TensorView.h"
#include <memory> // for shared_ptr and make_shared
#include <limits> // for isnan() and numeric_limits --TODO: is that the right header?
@ -40,9 +41,9 @@ struct CriterionAccumulator
{
// constructor
CriterionAccumulator(size_t numCriteria, DEVICEID_TYPE deviceId) :
m_aggregateCriterionValues(1, numCriteria, deviceId)
m_aggregateCriterionValues(make_shared<Matrix<ElemType>> (1, numCriteria, deviceId))
{
m_aggregateCriterionValues.SetValue(0);
m_aggregateCriterionValues->SetValue(0);
m_aggregateSampleCounts.assign(numCriteria, 0);
}
// 'i' is the index of the element we add into (multiple eval criteria share the same matrix object)
@ -63,7 +64,7 @@ struct CriterionAccumulator
if (m_aggregateSampleCounts[i] == 0)
return EpochCriterion(0, 0); // avoid unnecessary GPU access
else
return EpochCriterion(m_aggregateCriterionValues(0, i), m_aggregateSampleCounts[i]);
return EpochCriterion(m_aggregateCriterionValues->GetValue(0, i), m_aggregateSampleCounts[i]);
}
private:
@ -73,23 +74,41 @@ private:
const CriterionAccumulator& Accumulate(const std::vector<ComputationNodeBasePtr>& nodes, size_t i, size_t legacyNumSamples)
{
const auto& node = nodes[i]; // multiple nodes are managed by this struct
float beta = reset ? 0 : 1;
// Note: A future change will be that criterion nodes emit criteria per frame.
// In that case, we will do masking and an implicit reduction right here using TensorView.
size_t beta = reset ? 0 : 1;
size_t numSamples = GetNumSamples(nodes[i], legacyNumSamples);
#if 1
// For criterion nodes that emit criteria per frame, we will at this point
// do masking and an implicit reduction.
// get a TensorView of the criterion values to aggregate
FrameRange fr(node->GetMBLayout());
node->MaskMissingValueColumnsToZero(fr); // set gaps to zero, so that we can aggregate
auto criterionValue = node->As<ComputationNode<ElemType>>()->ValueTensorFor(SIZE_MAX, fr);
// get a TensorView of our aggregator
TensorShape shape{ m_aggregateCriterionValues->GetNumRows(), m_aggregateCriterionValues->GetNumCols() };
shape.NarrowTo(1, i, i + 1); // narrow to the single element that corresponds to the accumulator value
auto criterionAccumulator = TensorView<ElemType>(m_aggregateCriterionValues, shape);
// accumulate
// Note: If criterion is > [1 x 1] then inverse broadcasting will kick in and aggregate.
criterionAccumulator.DoCopyOf((float) beta, criterionValue, 1);
m_aggregateSampleCounts[i] = m_aggregateSampleCounts[i] * beta + numSamples;
#else
// temp solution until we add TensorView reduction
if (beta == 0)
{
Matrix<ElemType>::AssignElementToElement(dynamic_pointer_cast<ComputationNode<ElemType>>(node)->Value(),
0, 0, m_aggregateCriterionValues, 0, i);
0, 0, *m_aggregateCriterionValues, 0, i);
m_aggregateSampleCounts[i] = numSamples;
}
else if (numSamples > 0) // avoid unnecessary GPU access
{
Matrix<ElemType>::AddElementToElement(dynamic_pointer_cast<ComputationNode<ElemType>>(node)->Value(),
0, 0, m_aggregateCriterionValues, 0, i);
0, 0, *m_aggregateCriterionValues, 0, i);
m_aggregateSampleCounts[i] += numSamples;
}
#endif
return *this;
}
// get the number of samples
@ -102,7 +121,7 @@ private:
}
private:
Matrix<ElemType> m_aggregateCriterionValues; // [1 x N]
shared_ptr<Matrix<ElemType>> m_aggregateCriterionValues; // [1 x N]
vector<size_t> m_aggregateSampleCounts; // [N]
};
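A short note on the accumulation above: beta selects between resetting (0) and accumulating (1), so slot i of the aggregate is updated as aggregate = beta * aggregate + sum(criterion values), with the sum coming from the implicit reduction when the criterion node emits more than a [1 x 1] value; the sample count follows the same rule. A tiny standalone sketch of that update (plain C++, no Matrix/TensorView, illustrative values):

#include <cstdio>
#include <vector>

int main()
{
    // Update rule mirrored from CriterionAccumulator::Accumulate above:
    //   aggregate   = beta * aggregate   + sum(per-frame criterion values)
    //   sampleCount = beta * sampleCount + numSamples
    // beta == 0 on reset, beta == 1 when accumulating across minibatches.
    double aggregate = 0.0;
    size_t sampleCount = 0;
    size_t beta = 1; // accumulate (0 would reset)

    std::vector<double> perFrameCriterion = {0.7, 0.4, 0.9}; // example per-frame values
    double sum = 0.0;
    for (double v : perFrameCriterion)
        sum += v;

    aggregate = beta * aggregate + sum;
    sampleCount = beta * sampleCount + perFrameCriterion.size();

    std::printf("aggregate criterion = %.2f over %zu samples\n", aggregate, sampleCount);
    return 0;
}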

View file

@ -24,107 +24,49 @@
<RootNamespace>CNTK</RootNamespace>
<ProjectName>SGDLib</ProjectName>
</PropertyGroup>
<PropertyGroup Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
</PropertyGroup>
<Import Project="$(SolutionDir)\CNTK.Cpp.props" />
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
<ConfigurationType>StaticLibrary</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings" />
<ImportGroup Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup>
<PreBuildEventUseInBuild>false</PreBuildEventUseInBuild>
</PropertyGroup>
<PropertyGroup>
<LinkIncremental>$(DebugBuild)</LinkIncremental>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<AdditionalIncludeDirectories>$(SolutionDir)Source\SequenceTrainingLib;$(SolutionDir)Source\ComputationNetworkLib;$(SolutionDir)Source\Math;$(SolutionDir)Source\Common\Include;$(SolutionDir)Source\CNTK\BrainScript;$(MSMPI_INC);$(NvmlInclude)</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories Condition="'$(CNTK_ENABLE_1BitSGD)'=='true'">$(SolutionDir)Source\1BitSGD;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories Condition="'$(CNTK_ENABLE_ASGD)'=='true'">$(SolutionDir)Source\multiverso\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<PrecompiledHeader>
</PrecompiledHeader>
<PreprocessorDefinitions>WIN32;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions Condition="'$(CNTK_ENABLE_1BitSGD)'=='true'">QUANTIZED_GRADIENT_AGGREGATION;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions Condition="'$(CNTK_ENABLE_ASGD)'=='true'">MULTIVERSO_SUPPORT;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</ClCompile>
<Link>
<AdditionalLibraryDirectories>$(MSMPI_LIB64);$(OutDir);$(NvmlLibPath)</AdditionalLibraryDirectories>
<AdditionalDependencies>ComputationNetworkLib.lib; Math.lib; %(AdditionalDependencies)</AdditionalDependencies>
<DelayLoadDLLs>Math.dll</DelayLoadDLLs>
</Link>
</ItemDefinitionGroup>
<!-- TODO can we merge with above? -->
<ItemDefinitionGroup Condition="'$(CNTK_ENABLE_1BitSGD)'=='true'">
<ClCompile>
<AdditionalIncludeDirectories>$(SolutionDir)Source\1BitSGD;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(CNTK_ENABLE_ASGD)'=='true'">
<ClCompile>
<AdditionalIncludeDirectories>$(SolutionDir)Source\multiverso\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
</ClCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(DebugBuild)">
<ClCompile>
<PrecompiledHeader>
</PrecompiledHeader>
<WarningLevel>Level4</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>_SCL_SECURE_NO_WARNINGS;WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions Condition="'$(CNTK_ENABLE_ASGD)'=='true'">MULTIVERSO_SUPPORT;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions Condition="'$(CNTK_ENABLE_1BitSGD)'=='true'">QUANTIZED_GRADIENT_AGGREGATION;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>true</SDLCheck>
<OpenMPSupport>true</OpenMPSupport>
<TreatWarningAsError>true</TreatWarningAsError>
<PreprocessorDefinitions>_SCL_SECURE_NO_WARNINGS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalOptions>/bigobj %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>ComputationNetworkLib.lib; Math.lib; kernel32.lib; user32.lib; shell32.lib; %(AdditionalDependencies)</AdditionalDependencies>
<DelayLoadDLLs>Math.dll; nvml.dll; $(CudaRuntimeDll)</DelayLoadDLLs>
<StackReserveSize>100000000</StackReserveSize>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(ReleaseBuild)">
<ClCompile>
<WarningLevel>Level4</WarningLevel>
<PrecompiledHeader>
</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions Condition="'$(CNTK_ENABLE_1BitSGD)'=='true'">QUANTIZED_GRADIENT_AGGREGATION;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions Condition="'$(CNTK_ENABLE_ASGD)'=='true'">MULTIVERSO_SUPPORT;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>true</SDLCheck>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<AdditionalOptions>/d2Zi+ %(AdditionalOptions)</AdditionalOptions>
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>ComputationNetworkLib.lib; Math.lib; kernel32.lib; user32.lib; shell32.lib; %(AdditionalDependencies)</AdditionalDependencies>
<Profile>true</Profile>
<DelayLoadDLLs>Math.dll; nvml.dll; cudart64_70.dll</DelayLoadDLLs>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(CpuOnlyBuild)">
<ClCompile>
<PreprocessorDefinitions>CPUONLY;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</ClCompile>
<Link>
<DelayLoadDLLs>Math.dll</DelayLoadDLLs>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(GpuBuild)">
<ClCompile>
@ -132,9 +74,10 @@
</ClCompile>
<Link>
<AdditionalLibraryDirectories>%(AdditionalLibraryDirectories);$(CudaLibPath)</AdditionalLibraryDirectories>
<DelayLoadDLLs>%(DelayLoadDLLs);nvml.dll;$(CudaRuntimeDll)</DelayLoadDLLs>
</Link>
<PostBuildEvent>
<Command>if exist "%ProgramW6432%\NVIDIA Corporation\NVSMI" xcopy /I /D /Y "%ProgramW6432%\NVIDIA Corporation\NVSMI\nvml*.dll" $(TargetDir)</Command>
<Command>if exist "%ProgramW6432%\NVIDIA Corporation\NVSMI" xcopy /I /D /Y "%ProgramW6432%\NVIDIA Corporation\NVSMI\nvml*.dll" "$(TargetDir)"</Command>
<Message>Copying NVidia GDK extension DLL to target folder</Message>
</PostBuildEvent>
</ItemDefinitionGroup>
@ -181,18 +124,6 @@
<ClInclude Include="targetver.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\Common\Config.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\Common\DataReader.cpp" />
<ClCompile Include="..\Common\DataWriter.cpp" />
<ClCompile Include="..\Common\File.cpp">
<PrecompiledHeader Condition="$(DebugBuild)">NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\Common\fileutil.cpp">
<PrecompiledHeader Condition="$(DebugBuild)">NotUsing</PrecompiledHeader>
</ClCompile>
<ClCompile Include="..\Common\TimerUtility.cpp" />
<ClCompile Include="Profiler.cpp" />
<ClCompile Include="SGD.cpp" />
<ClCompile Include="stdafx.cpp" />

Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше