Merge remote-tracking branch 'origin/master' into jdroppo/cudnn-rnn-lstm

Conflicts:
	Source/Math/Math.vcxproj
	Source/Math/Math.vcxproj.filters
	Source/Math/MathCUDA.vcxproj.filters
This commit is contained in:
Jasha Droppo 2016-05-20 11:20:07 -07:00
Родитель 1360f0fb15 f76a944bce
Коммит 5558144363
694 изменённых файлов: 505895 добавлений и 464976 удалений

4
.gitattributes поставляемый
Просмотреть файл

@ -40,6 +40,10 @@ run-test-common text eol=lf
run-timit-test-common text eol=lf
make_binary_drop_linux text eol=lf
# Used from Unix / Cygwin 'md5sum -c', needs to have LF line endings:
Tests/EndToEndTests/Examples/Speech/TIMIT/WriteBottleneck/expected_output_md5sum.*.txt eol=lf
Tests/EndToEndTests/Examples/Speech/TIMIT/WriteScaledLogLike/expected_output_md5sum.*.txt eol=lf
Makefile text
*.sln text
*.vcxproj text

4
.gitignore поставляемый
Просмотреть файл

@ -152,7 +152,9 @@ ModelManifest.xml
# Python
*.pyc
__pychache__/
__pycache__/
contrib/Python/doc/_build/*
contrib/Python/_cntk_default/*
# =========================
# Windows detritus

408
CNTK.sln
Просмотреть файл

@ -9,6 +9,7 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTK", "Source\CNTK\CNTK.vc
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
{EB2BE26F-6BD4-4274-971F-86D080779DD1} = {EB2BE26F-6BD4-4274-971F-86D080779DD1}
{F0A9637C-20DA-42F0-83D4-23B4704DE602} = {F0A9637C-20DA-42F0-83D4-23B4704DE602}
{EAD17188-072C-4726-B840-A769C36DAD1B} = {EAD17188-072C-4726-B840-A769C36DAD1B}
{DE3C54E5-D7D0-47AF-A783-DFDCE59E7937} = {DE3C54E5-D7D0-47AF-A783-DFDCE59E7937}
EndProjectSection
@ -654,8 +655,6 @@ EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Config", "Config", "{850008BC-36B0-4A0A-BD0C-B6D5C2184227}"
ProjectSection(SolutionItems) = preProject
Examples\Text\PennTreebank\Config\rnn.cntk = Examples\Text\PennTreebank\Config\rnn.cntk
Examples\Text\PennTreebank\Config\S2SAutoEncoder.cntk = Examples\Text\PennTreebank\Config\S2SAutoEncoder.cntk
Examples\Text\PennTreebank\Config\S2SLib.bs = Examples\Text\PennTreebank\Config\S2SLib.bs
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "SLU", "SLU", "{E6DC3B7D-303D-4A54-B040-D8DCF8C56E17}"
@ -710,45 +709,30 @@ EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Text", "Text", "{439BE0E0-FABE-403D-BF2C-A41FB8A60616}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "MNIST", "MNIST", "{63C6816D-66BF-487E-B541-094142C8272B}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\Examples\Image\MNIST\README.txt = Tests\EndToEndTests\Examples\Image\MNIST\README.txt
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "02_Convolution", "02_Convolution", "{6F1D0CE1-0F18-4B4C-9581-1F2146C8D300}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\Examples\Image\MNIST\02_Convolution\baseline.linux.debug.cpu.txt = Tests\EndToEndTests\Examples\Image\MNIST\02_Convolution\baseline.linux.debug.cpu.txt
Tests\EndToEndTests\Examples\Image\MNIST\02_Convolution\baseline.linux.debug.gpu.txt = Tests\EndToEndTests\Examples\Image\MNIST\02_Convolution\baseline.linux.debug.gpu.txt
Tests\EndToEndTests\Examples\Image\MNIST\02_Convolution\baseline.linux.release.cpu.txt = Tests\EndToEndTests\Examples\Image\MNIST\02_Convolution\baseline.linux.release.cpu.txt
Tests\EndToEndTests\Examples\Image\MNIST\02_Convolution\baseline.linux.release.gpu.txt = Tests\EndToEndTests\Examples\Image\MNIST\02_Convolution\baseline.linux.release.gpu.txt
Tests\EndToEndTests\Examples\Image\MNIST\02_Convolution\baseline.windows.debug.cpu.txt = Tests\EndToEndTests\Examples\Image\MNIST\02_Convolution\baseline.windows.debug.cpu.txt
Tests\EndToEndTests\Examples\Image\MNIST\02_Convolution\baseline.windows.debug.gpu.txt = Tests\EndToEndTests\Examples\Image\MNIST\02_Convolution\baseline.windows.debug.gpu.txt
Tests\EndToEndTests\Examples\Image\MNIST\02_Convolution\baseline.windows.release.cpu.txt = Tests\EndToEndTests\Examples\Image\MNIST\02_Convolution\baseline.windows.release.cpu.txt
Tests\EndToEndTests\Examples\Image\MNIST\02_Convolution\baseline.windows.release.gpu.txt = Tests\EndToEndTests\Examples\Image\MNIST\02_Convolution\baseline.windows.release.gpu.txt
Tests\EndToEndTests\Examples\Image\MNIST\02_Convolution\baseline.linux.txt = Tests\EndToEndTests\Examples\Image\MNIST\02_Convolution\baseline.linux.txt
Tests\EndToEndTests\Examples\Image\MNIST\02_Convolution\baseline.windows.txt = Tests\EndToEndTests\Examples\Image\MNIST\02_Convolution\baseline.windows.txt
Tests\EndToEndTests\Examples\Image\MNIST\02_Convolution\run-test = Tests\EndToEndTests\Examples\Image\MNIST\02_Convolution\run-test
Tests\EndToEndTests\Examples\Image\MNIST\02_Convolution\testcases.yml = Tests\EndToEndTests\Examples\Image\MNIST\02_Convolution\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "01_OneHidden", "01_OneHidden", "{A0B366FE-2EEA-4E32-9AED-12C46409C30C}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\Examples\Image\MNIST\01_OneHidden\baseline.linux.debug.cpu.txt = Tests\EndToEndTests\Examples\Image\MNIST\01_OneHidden\baseline.linux.debug.cpu.txt
Tests\EndToEndTests\Examples\Image\MNIST\01_OneHidden\baseline.linux.debug.gpu.txt = Tests\EndToEndTests\Examples\Image\MNIST\01_OneHidden\baseline.linux.debug.gpu.txt
Tests\EndToEndTests\Examples\Image\MNIST\01_OneHidden\baseline.linux.release.cpu.txt = Tests\EndToEndTests\Examples\Image\MNIST\01_OneHidden\baseline.linux.release.cpu.txt
Tests\EndToEndTests\Examples\Image\MNIST\01_OneHidden\baseline.linux.release.gpu.txt = Tests\EndToEndTests\Examples\Image\MNIST\01_OneHidden\baseline.linux.release.gpu.txt
Tests\EndToEndTests\Examples\Image\MNIST\01_OneHidden\baseline.windows.debug.cpu.txt = Tests\EndToEndTests\Examples\Image\MNIST\01_OneHidden\baseline.windows.debug.cpu.txt
Tests\EndToEndTests\Examples\Image\MNIST\01_OneHidden\baseline.windows.debug.gpu.txt = Tests\EndToEndTests\Examples\Image\MNIST\01_OneHidden\baseline.windows.debug.gpu.txt
Tests\EndToEndTests\Examples\Image\MNIST\01_OneHidden\baseline.windows.release.cpu.txt = Tests\EndToEndTests\Examples\Image\MNIST\01_OneHidden\baseline.windows.release.cpu.txt
Tests\EndToEndTests\Examples\Image\MNIST\01_OneHidden\baseline.windows.release.gpu.txt = Tests\EndToEndTests\Examples\Image\MNIST\01_OneHidden\baseline.windows.release.gpu.txt
Tests\EndToEndTests\Examples\Image\MNIST\01_OneHidden\baseline.linux.txt = Tests\EndToEndTests\Examples\Image\MNIST\01_OneHidden\baseline.linux.txt
Tests\EndToEndTests\Examples\Image\MNIST\01_OneHidden\baseline.windows.txt = Tests\EndToEndTests\Examples\Image\MNIST\01_OneHidden\baseline.windows.txt
Tests\EndToEndTests\Examples\Image\MNIST\01_OneHidden\run-test = Tests\EndToEndTests\Examples\Image\MNIST\01_OneHidden\run-test
Tests\EndToEndTests\Examples\Image\MNIST\01_OneHidden\testcases.yml = Tests\EndToEndTests\Examples\Image\MNIST\01_OneHidden\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "03_ConvBatchNorm", "03_ConvBatchNorm", "{BD783D50-47E2-485F-BDAF-29BD40D84645}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\Examples\Image\MNIST\03_ConvBatchNorm\baseline.linux.debug.cpu.txt = Tests\EndToEndTests\Examples\Image\MNIST\03_ConvBatchNorm\baseline.linux.debug.cpu.txt
Tests\EndToEndTests\Examples\Image\MNIST\03_ConvBatchNorm\baseline.linux.debug.gpu.txt = Tests\EndToEndTests\Examples\Image\MNIST\03_ConvBatchNorm\baseline.linux.debug.gpu.txt
Tests\EndToEndTests\Examples\Image\MNIST\03_ConvBatchNorm\baseline.linux.release.cpu.txt = Tests\EndToEndTests\Examples\Image\MNIST\03_ConvBatchNorm\baseline.linux.release.cpu.txt
Tests\EndToEndTests\Examples\Image\MNIST\03_ConvBatchNorm\baseline.linux.release.gpu.txt = Tests\EndToEndTests\Examples\Image\MNIST\03_ConvBatchNorm\baseline.linux.release.gpu.txt
Tests\EndToEndTests\Examples\Image\MNIST\03_ConvBatchNorm\baseline.windows.debug.cpu.txt = Tests\EndToEndTests\Examples\Image\MNIST\03_ConvBatchNorm\baseline.windows.debug.cpu.txt
Tests\EndToEndTests\Examples\Image\MNIST\03_ConvBatchNorm\baseline.windows.debug.gpu.txt = Tests\EndToEndTests\Examples\Image\MNIST\03_ConvBatchNorm\baseline.windows.debug.gpu.txt
Tests\EndToEndTests\Examples\Image\MNIST\03_ConvBatchNorm\baseline.windows.release.cpu.txt = Tests\EndToEndTests\Examples\Image\MNIST\03_ConvBatchNorm\baseline.windows.release.cpu.txt
Tests\EndToEndTests\Examples\Image\MNIST\03_ConvBatchNorm\baseline.windows.release.gpu.txt = Tests\EndToEndTests\Examples\Image\MNIST\03_ConvBatchNorm\baseline.windows.release.gpu.txt
Tests\EndToEndTests\Examples\Image\MNIST\03_ConvBatchNorm\baseline.linux.txt = Tests\EndToEndTests\Examples\Image\MNIST\03_ConvBatchNorm\baseline.linux.txt
Tests\EndToEndTests\Examples\Image\MNIST\03_ConvBatchNorm\baseline.windows.txt = Tests\EndToEndTests\Examples\Image\MNIST\03_ConvBatchNorm\baseline.windows.txt
Tests\EndToEndTests\Examples\Image\MNIST\03_ConvBatchNorm\run-test = Tests\EndToEndTests\Examples\Image\MNIST\03_ConvBatchNorm\run-test
Tests\EndToEndTests\Examples\Image\MNIST\03_ConvBatchNorm\testcases.yml = Tests\EndToEndTests\Examples\Image\MNIST\03_ConvBatchNorm\testcases.yml
EndProjectSection
@ -888,12 +872,21 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "NetworkTests", "Tests\UnitT
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
{EB2BE26F-6BD4-4274-971F-86D080779DD1} = {EB2BE26F-6BD4-4274-971F-86D080779DD1}
{F0A9637C-20DA-42F0-83D4-23B4704DE602} = {F0A9637C-20DA-42F0-83D4-23B4704DE602}
{EAD17188-072C-4726-B840-A769C36DAD1B} = {EAD17188-072C-4726-B840-A769C36DAD1B}
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Text", "Text", "{8656B71D-E24C-4AC2-8BE4-C07B415A3E15}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "SequenceClassification", "SequenceClassification", "{E53E63A0-FAA9-4416-9AD1-08A8FB87FEE1}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\Text\SequenceClassification\baseline.linux.cpu.txt = Tests\EndToEndTests\Text\SequenceClassification\baseline.linux.cpu.txt
Tests\EndToEndTests\Text\SequenceClassification\baseline.linux.gpu.txt = Tests\EndToEndTests\Text\SequenceClassification\baseline.linux.gpu.txt
Tests\EndToEndTests\Text\SequenceClassification\baseline.windows.cpu.txt = Tests\EndToEndTests\Text\SequenceClassification\baseline.windows.cpu.txt
Tests\EndToEndTests\Text\SequenceClassification\baseline.windows.gpu.txt = Tests\EndToEndTests\Text\SequenceClassification\baseline.windows.gpu.txt
Tests\EndToEndTests\Text\SequenceClassification\run-test = Tests\EndToEndTests\Text\SequenceClassification\run-test
Tests\EndToEndTests\Text\SequenceClassification\testcases.yml = Tests\EndToEndTests\Text\SequenceClassification\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Miscellaneous", "Miscellaneous", "{8629430A-821E-43BA-AEC5-8B2CF31A2A7A}"
EndProject
@ -976,6 +969,306 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "SLU", "SLU", "{181664AC-4C9
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Common", "Source\Common\Common.vcxproj", "{86883653-8A61-4038-81A0-2379FAE4200A}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CompositeDataReader", "Source\Readers\CompositeDataReader\CompositeDataReader.vcxproj", "{7B7A563D-AA8E-4660-A805-D50235A02120}"
ProjectSection(ProjectDependencies) = postProject
{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
{F0A9637C-20DA-42F0-83D4-23B4704DE602} = {F0A9637C-20DA-42F0-83D4-23B4704DE602}
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "CNTKTextFormatReader", "CNTKTextFormatReader", "{99FAAACE-C360-43CF-B706-20621F164484}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Examples", "Examples", "{629761D1-7A05-409A-B62B-FC1CCC0D6EED}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Image", "Image", "{D4302516-C77F-4FAF-82FB-18DB39F5A53B}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "ParallelTraining", "ParallelTraining", "{06BE675D-80DD-419A-8E00-26953EF11F25}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\CNTKTextFormatReader\ParallelTraining\SimpleMultiGPU.cntk = Tests\EndToEndTests\CNTKTextFormatReader\ParallelTraining\SimpleMultiGPU.cntk
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Speech", "Speech", "{5642F047-490B-4ABD-8113-8563C872B39F}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Image", "Image", "{2B6CCAB6-A92A-483C-9FDB-8412FA4DC42F}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Other", "Other", "{225F5A3A-7CAF-4C71-9143-3AD2AC4D47A3}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "MNIST", "MNIST", "{EBD36FD9-FE5B-420E-A572-DC6117300DB3}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\run-test-common = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\run-test-common
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Miscellaneous", "Miscellaneous", "{08D284FA-2914-4B35-A89C-896DBA2B4484}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "CIFAR-10", "CIFAR-10", "{95FAC6A0-6AE7-4947-9DFD-498FE71311AD}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\run-test-common = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\run-test-common
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Config", "Config", "{A877E526-89C1-422E-9F90-4DDE84135A36}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\Config\01_Conv.cntk = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\Config\01_Conv.cntk
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\Config\02_BatchNormConv.cntk = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\Config\02_BatchNormConv.cntk
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\Config\05_ConvLocal.cntk = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\Config\05_ConvLocal.cntk
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "01_Convolution", "01_Convolution", "{071D8449-D080-4141-869D-600CC3C2A0BE}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\01_Convolution\baseline.linux.txt = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\01_Convolution\baseline.linux.txt
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\01_Convolution\baseline.windows.txt = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\01_Convolution\baseline.windows.txt
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\01_Convolution\README.txt = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\01_Convolution\README.txt
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\01_Convolution\run-test = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\01_Convolution\run-test
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\01_Convolution\testcases.yml = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\01_Convolution\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "02_BatchNormConv", "02_BatchNormConv", "{D3A74C52-BC74-4DA3-BE93-8F4241D54EE0}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\02_BatchNormConv\baseline.linux.gpu.txt = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\02_BatchNormConv\baseline.linux.gpu.txt
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\02_BatchNormConv\baseline.windows.txt = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\02_BatchNormConv\baseline.windows.txt
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\02_BatchNormConv\README.txt = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\02_BatchNormConv\README.txt
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\02_BatchNormConv\run-test = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\02_BatchNormConv\run-test
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\02_BatchNormConv\testcases.yml = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\02_BatchNormConv\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "05_ConvLocal", "05_ConvLocal", "{EC466625-BC66-41DF-B55A-EB28AFABE24E}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\05_ConvLocal\baseline.linux.gpu.txt = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\05_ConvLocal\baseline.linux.gpu.txt
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\05_ConvLocal\baseline.windows.txt = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\05_ConvLocal\baseline.windows.txt
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\05_ConvLocal\README.txt = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\05_ConvLocal\README.txt
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\05_ConvLocal\run-test = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\05_ConvLocal\run-test
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\05_ConvLocal\testcases.yml = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\Miscellaneous\CIFAR-10\05_ConvLocal\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "01_OneHidden", "01_OneHidden", "{34D578DB-0101-45C4-9DF0-37DE9AB87C65}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\01_OneHidden\baseline.linux.txt = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\01_OneHidden\baseline.linux.txt
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\01_OneHidden\baseline.windows.txt = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\01_OneHidden\baseline.windows.txt
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\01_OneHidden\README.txt = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\01_OneHidden\README.txt
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\01_OneHidden\run-test = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\01_OneHidden\run-test
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\01_OneHidden\testcases.yml = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\01_OneHidden\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "02_Convolution", "02_Convolution", "{1FE04815-E02E-498C-B276-6D058D46D754}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\02_Convolution\baseline.linux.txt = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\02_Convolution\baseline.linux.txt
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\02_Convolution\baseline.windows.txt = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\02_Convolution\baseline.windows.txt
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\02_Convolution\README.txt = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\02_Convolution\README.txt
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\02_Convolution\run-test = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\02_Convolution\run-test
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\02_Convolution\testcases.yml = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\02_Convolution\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "03_ConvBatchNorm", "03_ConvBatchNorm", "{2A125ED5-9C8A-4BDF-A200-862104289608}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\03_ConvBatchNorm\baseline.linux.txt = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\03_ConvBatchNorm\baseline.linux.txt
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\03_ConvBatchNorm\baseline.windows.txt = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\03_ConvBatchNorm\baseline.windows.txt
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\03_ConvBatchNorm\README.txt = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\03_ConvBatchNorm\README.txt
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\03_ConvBatchNorm\run-test = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\03_ConvBatchNorm\run-test
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\03_ConvBatchNorm\testcases.yml = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\03_ConvBatchNorm\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Config", "Config", "{E9207003-B860-4D57-B2CA-09AF52FF191F}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\Config\01_OneHidden.cntk = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\Config\01_OneHidden.cntk
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\Config\01_OneHidden.ndl = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\Config\01_OneHidden.ndl
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\Config\02_Convolution.cntk = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\Config\02_Convolution.cntk
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\Config\02_Convolution.ndl = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\Config\02_Convolution.ndl
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\Config\03_ConvBatchNorm.cntk = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\Config\03_ConvBatchNorm.cntk
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\Config\03_ConvBatchNorm.ndl = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\Config\03_ConvBatchNorm.ndl
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\Config\Macros.ndl = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Image\MNIST\Config\Macros.ndl
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Simple2d", "Simple2d", "{50420947-E502-40B4-8739-2C0BADD93BEE}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "MultiGpu", "MultiGpu", "{935E5A95-888D-4922-AB5A-E9C11D65E974}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Other\Simple2d\MultiGpu\baseline.linux.cpu.txt = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Other\Simple2d\MultiGpu\baseline.linux.cpu.txt
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Other\Simple2d\MultiGpu\baseline.linux.gpu.txt = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Other\Simple2d\MultiGpu\baseline.linux.gpu.txt
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Other\Simple2d\MultiGpu\baseline.windows.cpu.txt = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Other\Simple2d\MultiGpu\baseline.windows.cpu.txt
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Other\Simple2d\MultiGpu\baseline.windows.gpu.txt = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Other\Simple2d\MultiGpu\baseline.windows.gpu.txt
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Other\Simple2d\MultiGpu\README.txt = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Other\Simple2d\MultiGpu\README.txt
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Other\Simple2d\MultiGpu\run-test = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Other\Simple2d\MultiGpu\run-test
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Other\Simple2d\MultiGpu\testcases.yml = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Other\Simple2d\MultiGpu\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Simple", "Simple", "{773313DD-69DD-463F-ADC9-E8A902A5223C}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Other\Simple2d\Simple\baseline.linux.cpu.txt = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Other\Simple2d\Simple\baseline.linux.cpu.txt
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Other\Simple2d\Simple\baseline.linux.gpu.txt = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Other\Simple2d\Simple\baseline.linux.gpu.txt
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Other\Simple2d\Simple\baseline.windows.cpu.txt = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Other\Simple2d\Simple\baseline.windows.cpu.txt
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Other\Simple2d\Simple\baseline.windows.gpu.txt = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Other\Simple2d\Simple\baseline.windows.gpu.txt
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Other\Simple2d\Simple\README.txt = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Other\Simple2d\Simple\README.txt
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Other\Simple2d\Simple\run-test = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Other\Simple2d\Simple\run-test
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Other\Simple2d\Simple\testcases.yml = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Other\Simple2d\Simple\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Config", "Config", "{C8E2EF3B-CCBF-4BDD-8127-2252626FB22B}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Other\Simple2d\Config\Multigpu.cntk = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Other\Simple2d\Config\Multigpu.cntk
Tests\EndToEndTests\CNTKTextFormatReader\Examples\Other\Simple2d\Config\Simple.cntk = Tests\EndToEndTests\CNTKTextFormatReader\Examples\Other\Simple2d\Config\Simple.cntk
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "QuickE2E", "QuickE2E", "{A4F79A83-DE30-40FA-88F4-86304C89AC7F}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\CNTKTextFormatReader\Image\QuickE2E\baseline.linux.txt = Tests\EndToEndTests\CNTKTextFormatReader\Image\QuickE2E\baseline.linux.txt
Tests\EndToEndTests\CNTKTextFormatReader\Image\QuickE2E\baseline.windows.txt = Tests\EndToEndTests\CNTKTextFormatReader\Image\QuickE2E\baseline.windows.txt
Tests\EndToEndTests\CNTKTextFormatReader\Image\QuickE2E\Image_QuickE2E.cntk = Tests\EndToEndTests\CNTKTextFormatReader\Image\QuickE2E\Image_QuickE2E.cntk
Tests\EndToEndTests\CNTKTextFormatReader\Image\QuickE2E\README.txt = Tests\EndToEndTests\CNTKTextFormatReader\Image\QuickE2E\README.txt
Tests\EndToEndTests\CNTKTextFormatReader\Image\QuickE2E\run-test = Tests\EndToEndTests\CNTKTextFormatReader\Image\QuickE2E\run-test
Tests\EndToEndTests\CNTKTextFormatReader\Image\QuickE2E\testcases.yml = Tests\EndToEndTests\CNTKTextFormatReader\Image\QuickE2E\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Simple", "Simple", "{CC47AF62-2558-455F-81CB-36901AF033B0}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\CNTKTextFormatReader\Speech\Simple\baseline.linux.cpu.txt = Tests\EndToEndTests\CNTKTextFormatReader\Speech\Simple\baseline.linux.cpu.txt
Tests\EndToEndTests\CNTKTextFormatReader\Speech\Simple\baseline.linux.gpu.txt = Tests\EndToEndTests\CNTKTextFormatReader\Speech\Simple\baseline.linux.gpu.txt
Tests\EndToEndTests\CNTKTextFormatReader\Speech\Simple\baseline.windows.cpu.txt = Tests\EndToEndTests\CNTKTextFormatReader\Speech\Simple\baseline.windows.cpu.txt
Tests\EndToEndTests\CNTKTextFormatReader\Speech\Simple\baseline.windows.gpu.txt = Tests\EndToEndTests\CNTKTextFormatReader\Speech\Simple\baseline.windows.gpu.txt
Tests\EndToEndTests\CNTKTextFormatReader\Speech\Simple\README.txt = Tests\EndToEndTests\CNTKTextFormatReader\Speech\Simple\README.txt
Tests\EndToEndTests\CNTKTextFormatReader\Speech\Simple\run-test = Tests\EndToEndTests\CNTKTextFormatReader\Speech\Simple\run-test
Tests\EndToEndTests\CNTKTextFormatReader\Speech\Simple\Speech_Simple.cntk = Tests\EndToEndTests\CNTKTextFormatReader\Speech\Simple\Speech_Simple.cntk
Tests\EndToEndTests\CNTKTextFormatReader\Speech\Simple\testcases.yml = Tests\EndToEndTests\CNTKTextFormatReader\Speech\Simple\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "NoQuantization", "NoQuantization", "{1BA5209D-3EB6-48E7-BE8A-0622315070C0}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Data", "Data", "{AA14A8DB-669D-447B-A97F-8B726BF30188}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\CNTKTextFormatReader\ParallelTraining\Data\SimpleDataTrain.txt = Tests\EndToEndTests\CNTKTextFormatReader\ParallelTraining\Data\SimpleDataTrain.txt
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "SinglePrecision", "SinglePrecision", "{CA248859-AA91-47D6-AC05-3542AB27E290}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\CNTKTextFormatReader\ParallelTraining\NoQuantization\SinglePrecision\baseline.cpu.txt = Tests\EndToEndTests\CNTKTextFormatReader\ParallelTraining\NoQuantization\SinglePrecision\baseline.cpu.txt
Tests\EndToEndTests\CNTKTextFormatReader\ParallelTraining\NoQuantization\SinglePrecision\baseline.gpu.txt = Tests\EndToEndTests\CNTKTextFormatReader\ParallelTraining\NoQuantization\SinglePrecision\baseline.gpu.txt
Tests\EndToEndTests\CNTKTextFormatReader\ParallelTraining\NoQuantization\SinglePrecision\baseline.windows.cpu.txt = Tests\EndToEndTests\CNTKTextFormatReader\ParallelTraining\NoQuantization\SinglePrecision\baseline.windows.cpu.txt
Tests\EndToEndTests\CNTKTextFormatReader\ParallelTraining\NoQuantization\SinglePrecision\baseline.windows.gpu.txt = Tests\EndToEndTests\CNTKTextFormatReader\ParallelTraining\NoQuantization\SinglePrecision\baseline.windows.gpu.txt
Tests\EndToEndTests\CNTKTextFormatReader\ParallelTraining\NoQuantization\SinglePrecision\run-test = Tests\EndToEndTests\CNTKTextFormatReader\ParallelTraining\NoQuantization\SinglePrecision\run-test
Tests\EndToEndTests\CNTKTextFormatReader\ParallelTraining\NoQuantization\SinglePrecision\testcases.yml = Tests\EndToEndTests\CNTKTextFormatReader\ParallelTraining\NoQuantization\SinglePrecision\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "DoublePrecision", "DoublePrecision", "{8B6E9318-5ED0-49BF-945B-072E0D90A886}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\CNTKTextFormatReader\ParallelTraining\NoQuantization\DoublePrecision\baseline.cpu.txt = Tests\EndToEndTests\CNTKTextFormatReader\ParallelTraining\NoQuantization\DoublePrecision\baseline.cpu.txt
Tests\EndToEndTests\CNTKTextFormatReader\ParallelTraining\NoQuantization\DoublePrecision\baseline.gpu.txt = Tests\EndToEndTests\CNTKTextFormatReader\ParallelTraining\NoQuantization\DoublePrecision\baseline.gpu.txt
Tests\EndToEndTests\CNTKTextFormatReader\ParallelTraining\NoQuantization\DoublePrecision\baseline.windows.cpu.txt = Tests\EndToEndTests\CNTKTextFormatReader\ParallelTraining\NoQuantization\DoublePrecision\baseline.windows.cpu.txt
Tests\EndToEndTests\CNTKTextFormatReader\ParallelTraining\NoQuantization\DoublePrecision\baseline.windows.gpu.txt = Tests\EndToEndTests\CNTKTextFormatReader\ParallelTraining\NoQuantization\DoublePrecision\baseline.windows.gpu.txt
Tests\EndToEndTests\CNTKTextFormatReader\ParallelTraining\NoQuantization\DoublePrecision\run-test = Tests\EndToEndTests\CNTKTextFormatReader\ParallelTraining\NoQuantization\DoublePrecision\run-test
Tests\EndToEndTests\CNTKTextFormatReader\ParallelTraining\NoQuantization\DoublePrecision\testcases.yml = Tests\EndToEndTests\CNTKTextFormatReader\ParallelTraining\NoQuantization\DoublePrecision\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "SparseDSSM", "SparseDSSM", "{1FB54750-B668-4AC3-966F-ED504020AC06}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\Text\SparseDSSM\baseline.cpu.txt = Tests\EndToEndTests\Text\SparseDSSM\baseline.cpu.txt
Tests\EndToEndTests\Text\SparseDSSM\baseline.gpu.txt = Tests\EndToEndTests\Text\SparseDSSM\baseline.gpu.txt
Tests\EndToEndTests\Text\SparseDSSM\baseline.windows.cpu.txt = Tests\EndToEndTests\Text\SparseDSSM\baseline.windows.cpu.txt
Tests\EndToEndTests\Text\SparseDSSM\baseline.windows.gpu.txt = Tests\EndToEndTests\Text\SparseDSSM\baseline.windows.gpu.txt
Tests\EndToEndTests\Text\SparseDSSM\dssm.cntk = Tests\EndToEndTests\Text\SparseDSSM\dssm.cntk
Tests\EndToEndTests\Text\SparseDSSM\dssm.ndl = Tests\EndToEndTests\Text\SparseDSSM\dssm.ndl
Tests\EndToEndTests\Text\SparseDSSM\run-test = Tests\EndToEndTests\Text\SparseDSSM\run-test
Tests\EndToEndTests\Text\SparseDSSM\testcases.yml = Tests\EndToEndTests\Text\SparseDSSM\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "WriteCommand", "WriteCommand", "{3E9BD61F-1F0A-4966-BE17-803AEFD1DFA4}"
ProjectSection(SolutionItems) = preProject
tests\endtoendtests\Speech\DNN\WriteCommand\baseline.cpu.txt = tests\endtoendtests\Speech\DNN\WriteCommand\baseline.cpu.txt
tests\endtoendtests\Speech\DNN\WriteCommand\baseline.gpu.txt = tests\endtoendtests\Speech\DNN\WriteCommand\baseline.gpu.txt
tests\endtoendtests\Speech\DNN\WriteCommand\baseline.windows.cpu.txt = tests\endtoendtests\Speech\DNN\WriteCommand\baseline.windows.cpu.txt
tests\endtoendtests\Speech\DNN\WriteCommand\baseline.windows.gpu.txt = tests\endtoendtests\Speech\DNN\WriteCommand\baseline.windows.gpu.txt
tests\endtoendtests\Speech\DNN\WriteCommand\cntk.cntk = tests\endtoendtests\Speech\DNN\WriteCommand\cntk.cntk
tests\endtoendtests\Speech\DNN\WriteCommand\Output.ScaledLogLikelihood.cpu = tests\endtoendtests\Speech\DNN\WriteCommand\Output.ScaledLogLikelihood.cpu
tests\endtoendtests\Speech\DNN\WriteCommand\Output.ScaledLogLikelihood.gpu = tests\endtoendtests\Speech\DNN\WriteCommand\Output.ScaledLogLikelihood.gpu
tests\endtoendtests\Speech\DNN\WriteCommand\Output.ScaledLogLikelihood.windows.cpu = tests\endtoendtests\Speech\DNN\WriteCommand\Output.ScaledLogLikelihood.windows.cpu
tests\endtoendtests\Speech\DNN\WriteCommand\Output.ScaledLogLikelihood.windows.gpu = tests\endtoendtests\Speech\DNN\WriteCommand\Output.ScaledLogLikelihood.windows.gpu
Tests\endtoendtests\Speech\DNN\WriteCommand\run-test = Tests\endtoendtests\Speech\DNN\WriteCommand\run-test
tests\endtoendtests\Speech\DNN\WriteCommand\testcases.yml = tests\endtoendtests\Speech\DNN\WriteCommand\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "ParallelBufferedAsyncGradientAggregation", "ParallelBufferedAsyncGradientAggregation", "{5560DDD4-1E6E-4F41-B9BD-F52A19DF0B31}"
ProjectSection(SolutionItems) = preProject
tests\endtoendtests\speech\dnn\ParallelBufferedAsyncGradientAggregation\baseline.cpu.txt = tests\endtoendtests\speech\dnn\ParallelBufferedAsyncGradientAggregation\baseline.cpu.txt
tests\endtoendtests\speech\dnn\ParallelBufferedAsyncGradientAggregation\baseline.gpu.txt = tests\endtoendtests\speech\dnn\ParallelBufferedAsyncGradientAggregation\baseline.gpu.txt
tests\endtoendtests\speech\dnn\ParallelBufferedAsyncGradientAggregation\baseline.windows.cpu.txt = tests\endtoendtests\speech\dnn\ParallelBufferedAsyncGradientAggregation\baseline.windows.cpu.txt
tests\endtoendtests\speech\dnn\ParallelBufferedAsyncGradientAggregation\baseline.windows.gpu.txt = tests\endtoendtests\speech\dnn\ParallelBufferedAsyncGradientAggregation\baseline.windows.gpu.txt
Tests\endtoendtests\Speech\DNN\ParallelBufferedAsyncGradientAggregation\run-test = Tests\endtoendtests\Speech\DNN\ParallelBufferedAsyncGradientAggregation\run-test
tests\endtoendtests\speech\dnn\ParallelBufferedAsyncGradientAggregation\testcases.yml = tests\endtoendtests\speech\dnn\ParallelBufferedAsyncGradientAggregation\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "ParallelCrossValidation", "ParallelCrossValidation", "{9834E864-A8CD-4D28-A3C9-F79FE0F421AE}"
ProjectSection(SolutionItems) = preProject
tests\endtoendtests\speech\dnn\ParallelCrossValidation\baseline.cpu.txt = tests\endtoendtests\speech\dnn\ParallelCrossValidation\baseline.cpu.txt
tests\endtoendtests\speech\dnn\ParallelCrossValidation\baseline.gpu.txt = tests\endtoendtests\speech\dnn\ParallelCrossValidation\baseline.gpu.txt
tests\endtoendtests\speech\dnn\ParallelCrossValidation\baseline.windows.cpu.txt = tests\endtoendtests\speech\dnn\ParallelCrossValidation\baseline.windows.cpu.txt
tests\endtoendtests\speech\dnn\ParallelCrossValidation\baseline.windows.gpu.txt = tests\endtoendtests\speech\dnn\ParallelCrossValidation\baseline.windows.gpu.txt
tests\endtoendtests\speech\dnn\ParallelCrossValidation\cntkcv.cntk = tests\endtoendtests\speech\dnn\ParallelCrossValidation\cntkcv.cntk
Tests\endtoendtests\Speech\DNN\ParallelCrossValidation\run-test = Tests\endtoendtests\Speech\DNN\ParallelCrossValidation\run-test
tests\endtoendtests\speech\dnn\ParallelCrossValidation\testcases.yml = tests\endtoendtests\speech\dnn\ParallelCrossValidation\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "ParallelNoQuantizationBufferedAsyncGradientAggregation", "ParallelNoQuantizationBufferedAsyncGradientAggregation", "{40F65441-A7B7-4425-8E75-CD74AB262F3F}"
ProjectSection(SolutionItems) = preProject
tests\endtoendtests\speech\dnn\ParallelNoQuantizationBufferedAsyncGradientAggregation\baseline.cpu.txt = tests\endtoendtests\speech\dnn\ParallelNoQuantizationBufferedAsyncGradientAggregation\baseline.cpu.txt
tests\endtoendtests\speech\dnn\ParallelNoQuantizationBufferedAsyncGradientAggregation\baseline.gpu.txt = tests\endtoendtests\speech\dnn\ParallelNoQuantizationBufferedAsyncGradientAggregation\baseline.gpu.txt
tests\endtoendtests\speech\dnn\ParallelNoQuantizationBufferedAsyncGradientAggregation\baseline.windows.cpu.txt = tests\endtoendtests\speech\dnn\ParallelNoQuantizationBufferedAsyncGradientAggregation\baseline.windows.cpu.txt
tests\endtoendtests\speech\dnn\ParallelNoQuantizationBufferedAsyncGradientAggregation\baseline.windows.gpu.txt = tests\endtoendtests\speech\dnn\ParallelNoQuantizationBufferedAsyncGradientAggregation\baseline.windows.gpu.txt
Tests\endtoendtests\Speech\DNN\ParallelNoQuantizationBufferedAsyncGradientAggregation\run-test = Tests\endtoendtests\Speech\DNN\ParallelNoQuantizationBufferedAsyncGradientAggregation\run-test
tests\endtoendtests\speech\dnn\ParallelNoQuantizationBufferedAsyncGradientAggregation\testcases.yml = tests\endtoendtests\speech\dnn\ParallelNoQuantizationBufferedAsyncGradientAggregation\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "PlotDNN", "PlotDNN", "{4D6F731C-4A6D-4E21-AC3C-9E1F26E5547E}"
ProjectSection(SolutionItems) = preProject
Tests\endtoendtests\Speech\DNN\PlotDNN\baseline.txt = Tests\endtoendtests\Speech\DNN\PlotDNN\baseline.txt
tests\endtoendtests\speech\dnn\PlotDNN\cntkSpeech.dnn.dot = tests\endtoendtests\speech\dnn\PlotDNN\cntkSpeech.dnn.dot
tests\endtoendtests\speech\dnn\PlotDNN\plot.cntk = tests\endtoendtests\speech\dnn\PlotDNN\plot.cntk
Tests\endtoendtests\Speech\DNN\PlotDNN\run-test = Tests\endtoendtests\Speech\DNN\PlotDNN\run-test
tests\endtoendtests\speech\dnn\PlotDNN\testcases.yml = tests\endtoendtests\speech\dnn\PlotDNN\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "ParallelBM", "ParallelBM", "{36C42845-0D48-4A46-9C67-2B593A80A09C}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\Speech\DNN\ParallelBM\baseline.linux.cpu.txt = Tests\EndToEndTests\Speech\DNN\ParallelBM\baseline.linux.cpu.txt
Tests\EndToEndTests\Speech\DNN\ParallelBM\baseline.linux.gpu.txt = Tests\EndToEndTests\Speech\DNN\ParallelBM\baseline.linux.gpu.txt
Tests\EndToEndTests\Speech\DNN\ParallelBM\baseline.windows.cpu.txt = Tests\EndToEndTests\Speech\DNN\ParallelBM\baseline.windows.cpu.txt
Tests\EndToEndTests\Speech\DNN\ParallelBM\baseline.windows.gpu.txt = Tests\EndToEndTests\Speech\DNN\ParallelBM\baseline.windows.gpu.txt
Tests\EndToEndTests\Speech\DNN\ParallelBM\cntk.cntk = Tests\EndToEndTests\Speech\DNN\ParallelBM\cntk.cntk
Tests\EndToEndTests\Speech\DNN\ParallelBM\run-test = Tests\EndToEndTests\Speech\DNN\ParallelBM\run-test
Tests\EndToEndTests\Speech\DNN\ParallelBM\testcases.yml = Tests\EndToEndTests\Speech\DNN\ParallelBM\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "SequenceToSequence", "SequenceToSequence", "{A1521DC4-C8EC-47BD-9E63-7BE30ED2EC26}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Miscellaneous", "Miscellaneous", "{85A05261-41D0-41DF-80B5-ADB6ABB54632}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "G2P", "G2P", "{4AD12278-9705-4BBA-B2C3-D6D5856AADC3}"
ProjectSection(SolutionItems) = preProject
Examples\SequenceToSequence\Miscellaneous\G2P\G2P.cntk = Examples\SequenceToSequence\Miscellaneous\G2P\G2P.cntk
Examples\SequenceToSequence\Miscellaneous\G2P\README.txt = Examples\SequenceToSequence\Miscellaneous\G2P\README.txt
EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CPPEvalClient", "Source\Extensibility\CPPEvalClient\CPPEvalClient.vcxproj", "{578D52A0-3928-4405-A016-F016E8B49031}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "ExperimentalHtkmlfReader", "ExperimentalHtkmlfReader", "{977ECCB7-598D-4548-B95B-BACA9CC7D98B}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "DNN", "DNN", "{1DBB2575-F5C8-43F4-B982-D05D6ADC2F9B}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "LSTM", "LSTM", "{772A0DB3-4710-4281-8AA9-A9F1F7C543D3}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "QuickE2E", "QuickE2E", "{FE3592CF-3EB9-4502-BB95-E2AB974C0FB5}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "SVD", "SVD", "{BA6A65C5-92A2-4040-ADC3-0727A45694F6}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "FullUtterance", "FullUtterance", "{3BDF52CD-7F3C-42BC-AB78-CF5BBC5F4AB4}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\Speech\ExperimentalHtkmlfReader\LSTM\FullUtterance\baseline.cpu.txt = Tests\EndToEndTests\Speech\ExperimentalHtkmlfReader\LSTM\FullUtterance\baseline.cpu.txt
Tests\EndToEndTests\Speech\ExperimentalHtkmlfReader\LSTM\FullUtterance\baseline.gpu.txt = Tests\EndToEndTests\Speech\ExperimentalHtkmlfReader\LSTM\FullUtterance\baseline.gpu.txt
Tests\EndToEndTests\Speech\ExperimentalHtkmlfReader\LSTM\FullUtterance\baseline.windows.cpu.txt = Tests\EndToEndTests\Speech\ExperimentalHtkmlfReader\LSTM\FullUtterance\baseline.windows.cpu.txt
Tests\EndToEndTests\Speech\ExperimentalHtkmlfReader\LSTM\FullUtterance\baseline.windows.gpu.txt = Tests\EndToEndTests\Speech\ExperimentalHtkmlfReader\LSTM\FullUtterance\baseline.windows.gpu.txt
Tests\EndToEndTests\Speech\ExperimentalHtkmlfReader\LSTM\FullUtterance\cntk.cntk = Tests\EndToEndTests\Speech\ExperimentalHtkmlfReader\LSTM\FullUtterance\cntk.cntk
Tests\EndToEndTests\Speech\ExperimentalHtkmlfReader\LSTM\FullUtterance\run-test = Tests\EndToEndTests\Speech\ExperimentalHtkmlfReader\LSTM\FullUtterance\run-test
Tests\EndToEndTests\Speech\ExperimentalHtkmlfReader\LSTM\FullUtterance\testcases.yml = Tests\EndToEndTests\Speech\ExperimentalHtkmlfReader\LSTM\FullUtterance\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Truncated", "Truncated", "{1141DC61-E014-4DEC-9157-F6B1FC055C7A}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug_CpuOnly|x64 = Debug_CpuOnly|x64
@ -1206,6 +1499,22 @@ Global
{86883653-8A61-4038-81A0-2379FAE4200A}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
{86883653-8A61-4038-81A0-2379FAE4200A}.Release|x64.ActiveCfg = Release|x64
{86883653-8A61-4038-81A0-2379FAE4200A}.Release|x64.Build.0 = Release|x64
{7B7A563D-AA8E-4660-A805-D50235A02120}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64
{7B7A563D-AA8E-4660-A805-D50235A02120}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64
{7B7A563D-AA8E-4660-A805-D50235A02120}.Debug|x64.ActiveCfg = Debug|x64
{7B7A563D-AA8E-4660-A805-D50235A02120}.Debug|x64.Build.0 = Debug|x64
{7B7A563D-AA8E-4660-A805-D50235A02120}.Release_CpuOnly|x64.ActiveCfg = Release_CpuOnly|x64
{7B7A563D-AA8E-4660-A805-D50235A02120}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
{7B7A563D-AA8E-4660-A805-D50235A02120}.Release|x64.ActiveCfg = Release|x64
{7B7A563D-AA8E-4660-A805-D50235A02120}.Release|x64.Build.0 = Release|x64
{578D52A0-3928-4405-A016-F016E8B49031}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64
{578D52A0-3928-4405-A016-F016E8B49031}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64
{578D52A0-3928-4405-A016-F016E8B49031}.Debug|x64.ActiveCfg = Debug|x64
{578D52A0-3928-4405-A016-F016E8B49031}.Debug|x64.Build.0 = Debug|x64
{578D52A0-3928-4405-A016-F016E8B49031}.Release_CpuOnly|x64.ActiveCfg = Release_CpuOnly|x64
{578D52A0-3928-4405-A016-F016E8B49031}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
{578D52A0-3928-4405-A016-F016E8B49031}.Release|x64.ActiveCfg = Release|x64
{578D52A0-3928-4405-A016-F016E8B49031}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
@ -1337,6 +1646,53 @@ Global
{EC780385-7580-4D15-914B-1D878A295CBC} = {E53E63A0-FAA9-4416-9AD1-08A8FB87FEE1}
{D11F76CC-DB6D-4CB4-B3B7-AB139DE2F5FA} = {E53E63A0-FAA9-4416-9AD1-08A8FB87FEE1}
{181664AC-4C95-4798-A923-09B879215B33} = {8656B71D-E24C-4AC2-8BE4-C07B415A3E15}
{99FAAACE-C360-43CF-B706-20621F164484} = {6E565B48-1923-49CE-9787-9BBB9D96F4C5}
{629761D1-7A05-409A-B62B-FC1CCC0D6EED} = {99FAAACE-C360-43CF-B706-20621F164484}
{D4302516-C77F-4FAF-82FB-18DB39F5A53B} = {99FAAACE-C360-43CF-B706-20621F164484}
{06BE675D-80DD-419A-8E00-26953EF11F25} = {99FAAACE-C360-43CF-B706-20621F164484}
{5642F047-490B-4ABD-8113-8563C872B39F} = {99FAAACE-C360-43CF-B706-20621F164484}
{2B6CCAB6-A92A-483C-9FDB-8412FA4DC42F} = {629761D1-7A05-409A-B62B-FC1CCC0D6EED}
{225F5A3A-7CAF-4C71-9143-3AD2AC4D47A3} = {629761D1-7A05-409A-B62B-FC1CCC0D6EED}
{EBD36FD9-FE5B-420E-A572-DC6117300DB3} = {2B6CCAB6-A92A-483C-9FDB-8412FA4DC42F}
{08D284FA-2914-4B35-A89C-896DBA2B4484} = {2B6CCAB6-A92A-483C-9FDB-8412FA4DC42F}
{95FAC6A0-6AE7-4947-9DFD-498FE71311AD} = {08D284FA-2914-4B35-A89C-896DBA2B4484}
{A877E526-89C1-422E-9F90-4DDE84135A36} = {95FAC6A0-6AE7-4947-9DFD-498FE71311AD}
{071D8449-D080-4141-869D-600CC3C2A0BE} = {95FAC6A0-6AE7-4947-9DFD-498FE71311AD}
{D3A74C52-BC74-4DA3-BE93-8F4241D54EE0} = {95FAC6A0-6AE7-4947-9DFD-498FE71311AD}
{EC466625-BC66-41DF-B55A-EB28AFABE24E} = {95FAC6A0-6AE7-4947-9DFD-498FE71311AD}
{34D578DB-0101-45C4-9DF0-37DE9AB87C65} = {EBD36FD9-FE5B-420E-A572-DC6117300DB3}
{1FE04815-E02E-498C-B276-6D058D46D754} = {EBD36FD9-FE5B-420E-A572-DC6117300DB3}
{2A125ED5-9C8A-4BDF-A200-862104289608} = {EBD36FD9-FE5B-420E-A572-DC6117300DB3}
{E9207003-B860-4D57-B2CA-09AF52FF191F} = {EBD36FD9-FE5B-420E-A572-DC6117300DB3}
{50420947-E502-40B4-8739-2C0BADD93BEE} = {225F5A3A-7CAF-4C71-9143-3AD2AC4D47A3}
{935E5A95-888D-4922-AB5A-E9C11D65E974} = {50420947-E502-40B4-8739-2C0BADD93BEE}
{773313DD-69DD-463F-ADC9-E8A902A5223C} = {50420947-E502-40B4-8739-2C0BADD93BEE}
{C8E2EF3B-CCBF-4BDD-8127-2252626FB22B} = {50420947-E502-40B4-8739-2C0BADD93BEE}
{A4F79A83-DE30-40FA-88F4-86304C89AC7F} = {D4302516-C77F-4FAF-82FB-18DB39F5A53B}
{CC47AF62-2558-455F-81CB-36901AF033B0} = {5642F047-490B-4ABD-8113-8563C872B39F}
{1BA5209D-3EB6-48E7-BE8A-0622315070C0} = {06BE675D-80DD-419A-8E00-26953EF11F25}
{AA14A8DB-669D-447B-A97F-8B726BF30188} = {06BE675D-80DD-419A-8E00-26953EF11F25}
{CA248859-AA91-47D6-AC05-3542AB27E290} = {1BA5209D-3EB6-48E7-BE8A-0622315070C0}
{8B6E9318-5ED0-49BF-945B-072E0D90A886} = {1BA5209D-3EB6-48E7-BE8A-0622315070C0}
{86883653-8A61-4038-81A0-2379FAE4200A} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
{7B7A563D-AA8E-4660-A805-D50235A02120} = {33EBFE78-A1A8-4961-8938-92A271941F94}
{1FB54750-B668-4AC3-966F-ED504020AC06} = {8656B71D-E24C-4AC2-8BE4-C07B415A3E15}
{3E9BD61F-1F0A-4966-BE17-803AEFD1DFA4} = {6994C86D-A672-4254-824A-51F4DFEB807F}
{5560DDD4-1E6E-4F41-B9BD-F52A19DF0B31} = {6994C86D-A672-4254-824A-51F4DFEB807F}
{9834E864-A8CD-4D28-A3C9-F79FE0F421AE} = {6994C86D-A672-4254-824A-51F4DFEB807F}
{40F65441-A7B7-4425-8E75-CD74AB262F3F} = {6994C86D-A672-4254-824A-51F4DFEB807F}
{4D6F731C-4A6D-4E21-AC3C-9E1F26E5547E} = {6994C86D-A672-4254-824A-51F4DFEB807F}
{36C42845-0D48-4A46-9C67-2B593A80A09C} = {6994C86D-A672-4254-824A-51F4DFEB807F}
{A1521DC4-C8EC-47BD-9E63-7BE30ED2EC26} = {47755F2E-D674-4175-9E38-8EA053455072}
{85A05261-41D0-41DF-80B5-ADB6ABB54632} = {A1521DC4-C8EC-47BD-9E63-7BE30ED2EC26}
{4AD12278-9705-4BBA-B2C3-D6D5856AADC3} = {85A05261-41D0-41DF-80B5-ADB6ABB54632}
{578D52A0-3928-4405-A016-F016E8B49031} = {60F87E25-BC87-4782-8E20-1621AAEBB113}
{977ECCB7-598D-4548-B95B-BACA9CC7D98B} = {C47CDAA5-6D6C-429E-BC89-7CA0F868FDC8}
{1DBB2575-F5C8-43F4-B982-D05D6ADC2F9B} = {977ECCB7-598D-4548-B95B-BACA9CC7D98B}
{772A0DB3-4710-4281-8AA9-A9F1F7C543D3} = {977ECCB7-598D-4548-B95B-BACA9CC7D98B}
{FE3592CF-3EB9-4502-BB95-E2AB974C0FB5} = {977ECCB7-598D-4548-B95B-BACA9CC7D98B}
{BA6A65C5-92A2-4040-ADC3-0727A45694F6} = {977ECCB7-598D-4548-B95B-BACA9CC7D98B}
{3BDF52CD-7F3C-42BC-AB78-CF5BBC5F4AB4} = {772A0DB3-4710-4281-8AA9-A9F1F7C543D3}
{1141DC61-E014-4DEC-9157-F6B1FC055C7A} = {772A0DB3-4710-4281-8AA9-A9F1F7C543D3}
EndGlobalSection
EndGlobal

Просмотреть файл

@ -25,7 +25,8 @@ DNN = [
err = ErrorPrediction(labels, ol)
# Special Nodes
errTop5 = ErrorPrediction(labels, ol, Const(1), tag="eval")
# errTop1 can be used to compute, for example, top-5 error by changing Const(1) to Const(5).
errTop1 = ErrorPrediction(labels, ol, Const(1), tag="eval")
FeatureNodes = (features)
LabelNodes = (labels)
CriterionNodes = (ce)

Просмотреть файл

@ -26,7 +26,8 @@ ndlMacros = "$ConfigDir$/Macros.ndl"
traceLevel=1
numMBsToShowResult=500
prefetch=true
# Note: turn off prefetching; known to crash UCIFastReader occasionally.
prefetch=false
# If set to true, always initialize the network on CPU, making initialization consistent across CPU and GPU targets (for testing).
initOnCPUOnly=true

Просмотреть файл

@ -16,7 +16,8 @@ imageLayout = "cudnn"
# If set to true, always initialize the network on CPU, making initialization consistent across CPU and GPU targets (for testing).
initOnCPUOnly=true
prefetch = "true"
# Note: turn off prefetching; known to crash UCIFastReader occasionally.
prefetch = "false"
command = Train:Test

Просмотреть файл

@ -16,7 +16,8 @@ imageLayout = "cudnn"
# If set to true, always initialize the network on CPU, making initialization consistent across CPU and GPU targets (for testing).
initOnCPUOnly=true
prefetch = "true"
# Note: turn off prefetching; known to crash UCIFastReader occasionally.
prefetch = "false"
command = Train:Test

Просмотреть файл

@ -13,7 +13,8 @@ imageLayout = "cudnn"
# override the above as follows when running on CPU:
# deviceId = -1
prefetch = "true"
# Note: turn off prefetching; known to crash UCIFastReader occasionally.
prefetch = "false"
command = Train:Test

Просмотреть файл

@ -0,0 +1,21 @@
<?xml version="1.0"?>
<opencv_storage>
<EigVal type_id="opencv-matrix">
<rows>1</rows>
<cols>3</cols>
<dt>f</dt>
<data>
0.2175 0.0188 0.0045
</data>
</EigVal>
<EigVec type_id="opencv-matrix">
<rows>3</rows>
<cols>3</cols>
<dt>f</dt>
<data>
-0.5675 0.7192 0.4009
-0.5808 -0.0045 -0.8140
-0.5836 -0.6948 0.4203
</data>
</EigVec>
</opencv_storage>

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -73,12 +73,12 @@ ResNetNode2AInc(inp, outMap, inWCount, wCount, kW, kH, wScale, bValue, scValue,
]
# Standard building block for ResNet with padding (option B).
ResNetNode2BInc(inp, outMap, inMap, inWCount, wCount, kW, kH, wScale, bValue, scValue, bnTimeConst)
ResNetNode2BInc(inp, outMap, inMap, inWCount, wCount, kW, kH, wScale, bValue, scValue, bnTimeConst, stride1x1, stride3x3)
[
# First convolution layer.
c1 = ConvBNReLULayer(inp, outMap, inWCount, kW, kH, 2, 2, wScale, bValue, scValue, bnTimeConst)
c1 = ConvBNReLULayer(inp, outMap, inWCount, kW, kH, stride1x1, stride1x1, wScale, bValue, scValue, bnTimeConst)
# Second convolution layer, no ReLU.
c2 = ConvBNLayer(c1, outMap, wCount, kW, kH, 1, 1, wScale, bValue, scValue, bnTimeConst)
c2 = ConvBNLayer(c1, outMap, wCount, kW, kH, stride3x3, stride3x3, wScale, bValue, scValue, bnTimeConst)
# Projection convolution layer.
c_proj = Conv1x1(inp, outMap, inMap, 2, 2, wScale, bValue, scValue, bnTimeConst)
@ -116,12 +116,12 @@ ResNetNode3AInc(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue
y = RectifiedLinear(p)
]
ResNetNode3BInc(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue, bnTimeConst, projStride)
ResNetNode3BInc(inp, inMap, convMap, outMap, convWCount, wScale, bValue, scValue, bnTimeConst, projStride, stride1x1, stride3x3)
[
# 1x1 reducing convolution.
c1 = Conv1x1ReLU(inp, convMap, inMap, projStride, projStride, wScale, bValue, scValue, bnTimeConst)
c1 = Conv1x1ReLU(inp, convMap, inMap, stride1x1, stride1x1, wScale, bValue, scValue, bnTimeConst)
# 3x3 convolution.
c2 = ConvBNReLULayer(c1, convMap, convWCount, 3, 3, 1, 1, wScale, bValue, scValue, bnTimeConst)
c2 = ConvBNReLULayer(c1, convMap, convWCount, 3, 3, stride3x3, stride3x3, wScale, bValue, scValue, bnTimeConst)
# 1x1 expanding convolution, no ReLU.
c3 = Conv1x1(c2, outMap, convMap, 1, 1, wScale, bValue, scValue, bnTimeConst)
# Input-to-output mapping convolution.

Просмотреть файл

@ -1,12 +1,22 @@
# CNTK example: ImageNet ResNet
**Disclaimer: network configurations and experiment settings in this folder try to follow those published in the [ResNet paper](http://arxiv.org/abs/1512.03385) as closely as possible. However, these samples are NOT endorsed or verified by the researchers who published the original work. It is NOT guaranteed that you get the same (or even close) results as those in the paper.**
## Overview
|Data: |The ILSVRC2012 dataset (http://www.image-net.org/challenges/LSVRC/2012/) of images.
|:---------|:---
|Purpose |This example demonstrates usage of the NDL (Network Description Language) to define networks similar to ResNet.
|Network |NDLNetworkBuilder, deep convolutional networks resembling ResNet networks.
|Network |NDLNetworkBuilder, deep convolutional residual networks (ResNet).
|Training |Stochastic gradient descent with momentum.
## Details
The network configurations and experiment settings in this folder resemble the ones in the original [ResNet paper](http://arxiv.org/abs/1512.03385) with a few minor changes inspired by [this work](https://github.com/facebook/fb.resnet.torch).
The following table contains results as well as links to pre-trained models that can be used in various applications.
| Network | Top-1 error | Top-5 error | Model
| ------------- | ----------- | ----------- | ----------
| ResNet-18 | 29.57 | 10.41 | [Download](https://www.cntk.ai/resnet/ResNet_18.model)
| ResNet-34 | 27.31 | 8.97 | [Download](https://www.cntk.ai/resnet/ResNet_34.model)
| ResNet-50 | 24.74 | 7.56 | [Download](https://www.cntk.ai/resnet/ResNet_50.model)
## Notes
This work is an implementation of ResNets in CNTK. If you are interested in the original implementation of ResNet, follow [this link](https://github.com/KaimingHe/deep-residual-networks).

Просмотреть файл

@ -18,10 +18,9 @@ stderr="$OutputDir$/ResNet_152"
traceLevel=1
numMBsToShowResult=500
Proj64to256Filename = "$ConfigDir$/64to256.txt"
Proj256to512Filename = "$ConfigDir$/256to512.txt"
Proj512to1024Filename = "$ConfigDir$/512to1024.txt"
Proj1024to2048Filename = "$ConfigDir$/1024to2048.txt"
# Strides for increasing layers. Defaults (paper) are 2 for 1x1 and 1 for 3x3.
stride1x1=1
stride3x3=2
Train=[
action="train"
@ -36,7 +35,7 @@ Train=[
minibatchSize=256
# Note that learning rates are 10x more than in the paper due to a different
# momentum update rule in CNTK: v{t + 1} = lr*(1 - momentum)*g{t + 1} + momentum*v{t}
learningRatesPerMB=1.0*35:0.1*35:0.01
learningRatesPerMB=1.0*30:0.1*30:0.01*30:0.001
momentumPerMB=0.9
maxEpochs=125
gradUpdateType="None"
@ -79,8 +78,21 @@ Train=[
jitterType="UniRatio"
# Interpolation to use when scaling image to width x height size.
# Possible values: nearest, linear, cubic, lanczos. Default: linear.
interpolations="Linear"
# Stores mean values for each pixel in OpenCV matrix XML format.
interpolations="cubic"
# Aspect ratio jitter radius. Default is 0 (disabled).
aspectRatioRadius=0:0.2
# Brightness, contrast and color jittering. Default is 0 (disabled).
# Using 0 in the first epoch so the network can process original images.
brightnessRadius=0:0.2
contrastRadius=0:0.2
saturationRadius=0:0.4
# Intensity jittering: enabled if file is specified and intensityStdDev > 0.
# The file stores 1x3 vector (eigenvalues) and 3x3 matrix (eigenvectors) in OpenCV XML format.
intensityFile="$ConfigDir$/ImageNet1K_intensity.xml"
# StdDev for intensity jittering. Start from the second epoch.
intensityStdDev=0:0.1
# Mean subtraction: enabled if file is specified.
# The file stores mean values for each pixel in OpenCV matrix XML format.
meanFile="$ConfigDir$/ImageNet1K_mean.xml"
]
labels=[

Просмотреть файл

@ -47,11 +47,11 @@ DNN=[
pool1vs = 2
pool1 = MaxPooling(conv1, pool1W, pool1H, pool1hs, pool1vs, imageLayout = "cudnn")
rn1_1 = ResNetNode3BInc(pool1, cMap1, cMap1, cMap3, 576, convWScale, convBValue, scValue, bnTimeConst, 1)
rn1_1 = ResNetNode3BInc(pool1, cMap1, cMap1, cMap3, 576, convWScale, convBValue, scValue, bnTimeConst, 1, 1, 1)
rn1_2 = ResNetNode3A(rn1_1, cMap3, cMap1, cMap3, 576, convWScale, convBValue, scValue, bnTimeConst)
rn1_3 = ResNetNode3A(rn1_2, cMap3, cMap1, cMap3, 576, convWScale, convBValue, scValue, bnTimeConst)
rn2_1 = ResNetNode3BInc(rn1_3, cMap3, cMap2, cMap4, 1152, convWScale, convBValue, scValue, bnTimeConst, 2)
rn2_1 = ResNetNode3BInc(rn1_3, cMap3, cMap2, cMap4, 1152, convWScale, convBValue, scValue, bnTimeConst, 2, $stride1x1$, $stride3x3$)
rn2_2 = ResNetNode3A(rn2_1, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue, bnTimeConst)
rn2_3 = ResNetNode3A(rn2_2, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue, bnTimeConst)
rn2_4 = ResNetNode3A(rn2_3, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue, bnTimeConst)
@ -60,7 +60,7 @@ DNN=[
rn2_7 = ResNetNode3A(rn2_6, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue, bnTimeConst)
rn2_8 = ResNetNode3A(rn2_7, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue, bnTimeConst)
rn3_1 = ResNetNode3BInc(rn2_8, cMap4, cMap3, cMap5, 2304, convWScale, convBValue, scValue, bnTimeConst, 2)
rn3_1 = ResNetNode3BInc(rn2_8, cMap4, cMap3, cMap5, 2304, convWScale, convBValue, scValue, bnTimeConst, 2, $stride1x1$, $stride3x3$)
rn3_2 = ResNetNode3A(rn3_1, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, bnTimeConst)
rn3_3 = ResNetNode3A(rn3_2, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, bnTimeConst)
rn3_4 = ResNetNode3A(rn3_3, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, bnTimeConst)
@ -97,7 +97,7 @@ DNN=[
rn3_35= ResNetNode3A(rn3_34, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, bnTimeConst)
rn3_36= ResNetNode3A(rn3_35, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, bnTimeConst)
rn4_1 = ResNetNode3BInc(rn3_36, cMap5, cMap4, cMap6, 4608, convWScale, convBValue, scValue, bnTimeConst, 2)
rn4_1 = ResNetNode3BInc(rn3_36, cMap5, cMap4, cMap6, 4608, convWScale, convBValue, scValue, bnTimeConst, 2, $stride1x1$, $stride3x3$)
rn4_2 = ResNetNode3A(rn4_1, cMap6, cMap4, cMap6, 4608, convWScale, convBValue, scValue, bnTimeConst)
rn4_3 = ResNetNode3A(rn4_2, cMap6, cMap4, cMap6, 4608, convWScale, convBValue, scValue, bnTimeConst)

Просмотреть файл

@ -18,10 +18,14 @@ stderr="$OutputDir$/ResNet_18"
traceLevel=1
numMBsToShowResult=500
# Strides for increasing layers. Defaults (paper) are 2 for 1x1 and 1 for 3x3.
stride1x1=1
stride3x3=2
Train=[
action="train"
modelPath="$ModelDir$/ResNet_18"
NDLNetworkBuilder=[
networkDescription="$ConfigDir$/ResNet_18.ndl"
]
@ -31,7 +35,7 @@ Train=[
minibatchSize=256
# Note that learning rates are 10x more than in the paper due to a different
# momentum update rule in CNTK: v{t + 1} = lr*(1 - momentum)*g{t + 1} + momentum*v{t}
learningRatesPerMB=1.0*35:0.1*35:0.01
learningRatesPerMB=1.0*30:0.1*30:0.01*30:0.001
momentumPerMB=0.9
maxEpochs=125
gradUpdateType="None"
@ -74,8 +78,21 @@ Train=[
jitterType="UniRatio"
# Interpolation to use when scaling image to width x height size.
# Possible values: nearest, linear, cubic, lanczos. Default: linear.
interpolations="Linear"
# Stores mean values for each pixel in OpenCV matrix XML format.
interpolations="cubic"
# Aspect ratio jitter radius. Default is 0 (disabled).
aspectRatioRadius=0:0.2
# Brightness, contrast and color jittering. Default is 0 (disabled).
# Using 0 in the first epoch so the network can process original images.
brightnessRadius=0:0.2
contrastRadius=0:0.2
saturationRadius=0:0.4
# Intensity jittering: enabled if file is specified and intensityStdDev > 0.
# The file stores 1x3 vector (eigenvalues) and 3x3 matrix (eigenvectors) in OpenCV XML format.
intensityFile="$ConfigDir$/ImageNet1K_intensity.xml"
# StdDev for intensity jittering. Start from the second epoch.
intensityStdDev=0:0.1
# Mean subtraction: enabled if file is specified.
# The file stores mean values for each pixel in OpenCV matrix XML format.
meanFile="$ConfigDir$/ImageNet1K_mean.xml"
]
labels=[

Просмотреть файл

@ -45,15 +45,15 @@ DNN=[
rn1_2 = ResNetNode2A(rn1_1, cMap1, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
cMap2 = 128
rn2_1 = ResNetNode2BInc(rn1_2, cMap2, cMap1, 576, 1152, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn2_1 = ResNetNode2BInc(rn1_2, cMap2, cMap1, 576, 1152, kW, kH, convWScale, convBValue, scValue, bnTimeConst, $stride1x1$, $stride3x3$)
rn2_2 = ResNetNode2A(rn2_1, cMap2, 1152, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
cMap3 = 256
rn3_1 = ResNetNode2BInc(rn2_2, cMap3, cMap2, 1152, 2304, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn3_1 = ResNetNode2BInc(rn2_2, cMap3, cMap2, 1152, 2304, kW, kH, convWScale, convBValue, scValue, bnTimeConst, $stride1x1$, $stride3x3$)
rn3_2 = ResNetNode2A(rn3_1, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
cMap4 = 512
rn4_1 = ResNetNode2BInc(rn3_2, cMap4, cMap3, 2304, 4608, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn4_1 = ResNetNode2BInc(rn3_2, cMap4, cMap3, 2304, 4608, kW, kH, convWScale, convBValue, scValue, bnTimeConst, $stride1x1$, $stride3x3$)
rn4_2 = ResNetNode2A(rn4_1, cMap4, 4608, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn4_3 = ResNetNode2A(rn4_2, cMap4, 4608, kW, kH, convWScale, convBValue, scValue, bnTimeConst)

Просмотреть файл

@ -18,9 +18,9 @@ stderr="$OutputDir$/ResNet_34"
traceLevel=1
numMBsToShowResult=500
Proj64to128Filename = "$ConfigDir$/64to128.txt"
Proj128to256Filename = "$ConfigDir$/128to256.txt"
Proj256to512Filename = "$ConfigDir$/256to512.txt"
# Strides for increasing layers. Defaults (paper) are 2 for 1x1 and 1 for 3x3.
stride1x1=1
stride3x3=2
Train=[
action="train"
@ -35,7 +35,7 @@ Train=[
minibatchSize=256
# Note that learning rates are 10x more than in the paper due to a different
# momentum update rule in CNTK: v{t + 1} = lr*(1 - momentum)*g{t + 1} + momentum*v{t}
learningRatesPerMB=1.0*35:0.1*35:0.01
learningRatesPerMB=1.0*30:0.1*30:0.01*30:0.001
momentumPerMB=0.9
maxEpochs=125
gradUpdateType="None"
@ -78,8 +78,21 @@ Train=[
jitterType="UniRatio"
# Interpolation to use when scaling image to width x height size.
# Possible values: nearest, linear, cubic, lanczos. Default: linear.
interpolations="Linear"
# Stores mean values for each pixel in OpenCV matrix XML format.
interpolations="cubic"
# Aspect ratio jitter radius. Default is 0 (disabled).
aspectRatioRadius=0:0.2
# Brightness, contrast and color jittering. Default is 0 (disabled).
# Using 0 in the first epoch so the network can process original images.
brightnessRadius=0:0.2
contrastRadius=0:0.2
saturationRadius=0:0.4
# Intensity jittering: enabled if file is specified and intensityStdDev > 0.
# The file stores 1x3 vector (eigenvalues) and 3x3 matrix (eigenvectors) in OpenCV XML format.
intensityFile="$ConfigDir$/ImageNet1K_intensity.xml"
# StdDev for intensity jittering. Start from the second epoch.
intensityStdDev=0:0.1
# Mean subtraction: enabled if file is specified.
# The file stores mean values for each pixel in OpenCV matrix XML format.
meanFile="$ConfigDir$/ImageNet1K_mean.xml"
]
labels=[

Просмотреть файл

@ -46,13 +46,13 @@ DNN=[
rn1_3 = ResNetNode2A(rn1_2, cMap1, 576, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
cMap2 = 128
rn2_1 = ResNetNode2BInc(rn1_3, cMap2, cMap1, 576, 1152, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn2_1 = ResNetNode2BInc(rn1_3, cMap2, cMap1, 576, 1152, kW, kH, convWScale, convBValue, scValue, bnTimeConst, $stride1x1$, $stride3x3$)
rn2_2 = ResNetNode2A(rn2_1, cMap2, 1152, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn2_3 = ResNetNode2A(rn2_2, cMap2, 1152, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn2_4 = ResNetNode2A(rn2_3, cMap2, 1152, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
cMap3 = 256
rn3_1 = ResNetNode2BInc(rn2_4, cMap3, cMap2, 1152, 2304, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn3_1 = ResNetNode2BInc(rn2_4, cMap3, cMap2, 1152, 2304, kW, kH, convWScale, convBValue, scValue, bnTimeConst, $stride1x1$, $stride3x3$)
rn3_2 = ResNetNode2A(rn3_1, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn3_3 = ResNetNode2A(rn3_2, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn3_4 = ResNetNode2A(rn3_3, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
@ -60,7 +60,7 @@ DNN=[
rn3_6 = ResNetNode2A(rn3_5, cMap3, 2304, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
cMap4 = 512
rn4_1 = ResNetNode2BInc(rn3_6, cMap4, cMap3, 2304, 4608, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn4_1 = ResNetNode2BInc(rn3_6, cMap4, cMap3, 2304, 4608, kW, kH, convWScale, convBValue, scValue, bnTimeConst, $stride1x1$, $stride3x3$)
rn4_2 = ResNetNode2A(rn4_1, cMap4, 4608, kW, kH, convWScale, convBValue, scValue, bnTimeConst)
rn4_3 = ResNetNode2A(rn4_2, cMap4, 4608, kW, kH, convWScale, convBValue, scValue, bnTimeConst)

Просмотреть файл

@ -18,10 +18,9 @@ stderr="$OutputDir$/ResNet_50"
traceLevel=1
numMBsToShowResult=500
Proj64to256Filename = "$ConfigDir$/64to256.txt"
Proj256to512Filename = "$ConfigDir$/256to512.txt"
Proj512to1024Filename = "$ConfigDir$/512to1024.txt"
Proj1024to2048Filename = "$ConfigDir$/1024to2048.txt"
# Strides for increasing layers. Defaults (paper) are 2 for 1x1 and 1 for 3x3.
stride1x1=1
stride3x3=2
Train=[
action="train"
@ -36,7 +35,7 @@ Train=[
minibatchSize=256
# Note that learning rates are 10x more than in the paper due to a different
# momentum update rule in CNTK: v{t + 1} = lr*(1 - momentum)*g{t + 1} + momentum*v{t}
learningRatesPerMB=1.0*35:0.1*35:0.01
learningRatesPerMB=1.0*30:0.1*30:0.01*30:0.001
momentumPerMB=0.9
maxEpochs=125
gradUpdateType="None"
@ -79,8 +78,21 @@ Train=[
jitterType="UniRatio"
# Interpolation to use when scaling image to width x height size.
# Possible values: nearest, linear, cubic, lanczos. Default: linear.
interpolations="Linear"
# Stores mean values for each pixel in OpenCV matrix XML format.
interpolations="cubic"
# Aspect ratio jitter radius. Default is 0 (disabled).
aspectRatioRadius=0:0.2
# Brightness, contrast and color jittering. Default is 0 (disabled).
# Using 0 in the first epoch so the network can process original images.
brightnessRadius=0:0.2
contrastRadius=0:0.2
saturationRadius=0:0.4
# Intensity jittering: enabled if file is specified and intensityStdDev > 0.
# The file stores 1x3 vector (eigenvalues) and 3x3 matrix (eigenvectors) in OpenCV XML format.
intensityFile="$ConfigDir$/ImageNet1K_intensity.xml"
# StdDev for intensity jittering. Start from the second epoch.
intensityStdDev=0:0.1
# Mean subtraction: enabled if file is specified.
# The file stores mean values for each pixel in OpenCV matrix XML format.
meanFile="$ConfigDir$/ImageNet1K_mean.xml"
]
labels=[

Просмотреть файл

@ -47,23 +47,23 @@ DNN=[
pool1vs = 2
pool1 = MaxNDPooling(conv1, pool1W, pool1H, pool1hs, pool1vs)
rn1_1 = ResNetNode3BInc(pool1, cMap1, cMap1, cMap3, 576, convWScale, convBValue, scValue, bnTimeConst, 1)
rn1_1 = ResNetNode3BInc(pool1, cMap1, cMap1, cMap3, 576, convWScale, convBValue, scValue, bnTimeConst, 1, 1, 1)
rn1_2 = ResNetNode3A(rn1_1, cMap3, cMap1, cMap3, 576, convWScale, convBValue, scValue, bnTimeConst)
rn1_3 = ResNetNode3A(rn1_2, cMap3, cMap1, cMap3, 576, convWScale, convBValue, scValue, bnTimeConst)
rn2_1 = ResNetNode3BInc(rn1_3, cMap3, cMap2, cMap4, 1152, convWScale, convBValue, scValue, bnTimeConst, 2)
rn2_1 = ResNetNode3BInc(rn1_3, cMap3, cMap2, cMap4, 1152, convWScale, convBValue, scValue, bnTimeConst, 2, $stride1x1$, $stride3x3$)
rn2_2 = ResNetNode3A(rn2_1, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue, bnTimeConst)
rn2_3 = ResNetNode3A(rn2_2, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue, bnTimeConst)
rn2_4 = ResNetNode3A(rn2_3, cMap4, cMap2, cMap4, 1152, convWScale, convBValue, scValue, bnTimeConst)
rn3_1 = ResNetNode3BInc(rn2_4, cMap4, cMap3, cMap5, 2304, convWScale, convBValue, scValue, bnTimeConst, 2)
rn3_1 = ResNetNode3BInc(rn2_4, cMap4, cMap3, cMap5, 2304, convWScale, convBValue, scValue, bnTimeConst, 2, $stride1x1$, $stride3x3$)
rn3_2 = ResNetNode3A(rn3_1, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, bnTimeConst)
rn3_3 = ResNetNode3A(rn3_2, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, bnTimeConst)
rn3_4 = ResNetNode3A(rn3_3, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, bnTimeConst)
rn3_5 = ResNetNode3A(rn3_4, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, bnTimeConst)
rn3_6 = ResNetNode3A(rn3_5, cMap5, cMap3, cMap5, 2304, convWScale, convBValue, scValue, bnTimeConst)
rn4_1 = ResNetNode3BInc(rn3_6, cMap5, cMap4, cMap6, 4608, convWScale, convBValue, scValue, bnTimeConst, 2)
rn4_1 = ResNetNode3BInc(rn3_6, cMap5, cMap4, cMap6, 4608, convWScale, convBValue, scValue, bnTimeConst, 2, $stride1x1$, $stride3x3$)
rn4_2 = ResNetNode3A(rn4_1, cMap6, cMap4, cMap6, 4608, convWScale, convBValue, scValue, bnTimeConst)
rn4_3 = ResNetNode3A(rn4_2, cMap6, cMap4, cMap6, 4608, convWScale, convBValue, scValue, bnTimeConst)

Просмотреть файл

@ -0,0 +1,603 @@
|labels 0 1 |features -1 -1
|labels 0 1 |features -1 -0.99
|labels 0 1 |features -1 -0.98
|labels 0 1 |features -1 -0.97
|labels 0 1 |features -1 -0.96
|labels 0 1 |features -1 -0.95
|labels 0 1 |features -1 -0.94
|labels 0 1 |features -1 -0.93
|labels 0 1 |features -1 -0.92
|labels 0 1 |features -1 -0.91
|labels 0 1 |features -1 -0.9
|labels 0 1 |features -1 -0.89
|labels 0 1 |features -1 -0.88
|labels 0 1 |features -1 -0.87
|labels 0 1 |features -1 -0.86
|labels 0 1 |features -1 -0.85
|labels 0 1 |features -1 -0.84
|labels 0 1 |features -1 -0.83
|labels 0 1 |features -1 -0.82
|labels 0 1 |features -1 -0.81
|labels 0 1 |features -1 -0.8
|labels 0 1 |features -1 -0.79
|labels 0 1 |features -1 -0.78
|labels 0 1 |features -1 -0.77
|labels 0 1 |features -1 -0.76
|labels 0 1 |features -1 -0.75
|labels 0 1 |features -1 -0.74
|labels 0 1 |features -1 -0.73
|labels 0 1 |features -1 -0.72
|labels 0 1 |features -1 -0.71
|labels 0 1 |features -1 -0.7
|labels 0 1 |features -1 -0.69
|labels 0 1 |features -1 -0.68
|labels 0 1 |features -1 -0.67
|labels 0 1 |features -1 -0.66
|labels 0 1 |features -1 -0.65
|labels 0 1 |features -1 -0.64
|labels 0 1 |features -1 -0.63
|labels 0 1 |features -1 -0.62
|labels 0 1 |features -1 -0.61
|labels 0 1 |features -1 -0.6
|labels 0 1 |features -1 -0.59
|labels 0 1 |features -1 -0.58
|labels 0 1 |features -1 -0.57
|labels 0 1 |features -1 -0.56
|labels 0 1 |features -1 -0.55
|labels 0 1 |features -1 -0.54
|labels 0 1 |features -1 -0.53
|labels 0 1 |features -1 -0.52
|labels 0 1 |features -1 -0.51
|labels 0 1 |features -1 -0.5
|labels 0 1 |features -1 -0.49
|labels 0 1 |features -1 -0.48
|labels 0 1 |features -1 -0.47
|labels 0 1 |features -1 -0.46
|labels 0 1 |features -1 -0.45
|labels 0 1 |features -1 -0.44
|labels 0 1 |features -1 -0.43
|labels 0 1 |features -1 -0.42
|labels 0 1 |features -1 -0.41
|labels 0 1 |features -1 -0.4
|labels 0 1 |features -1 -0.39
|labels 0 1 |features -1 -0.38
|labels 0 1 |features -1 -0.37
|labels 0 1 |features -1 -0.36
|labels 0 1 |features -1 -0.35
|labels 0 1 |features -1 -0.34
|labels 0 1 |features -1 -0.33
|labels 0 1 |features -1 -0.32
|labels 0 1 |features -1 -0.31
|labels 0 1 |features -1 -0.3
|labels 0 1 |features -1 -0.29
|labels 0 1 |features -1 -0.28
|labels 0 1 |features -1 -0.27
|labels 0 1 |features -1 -0.26
|labels 0 1 |features -1 -0.25
|labels 0 1 |features -1 -0.24
|labels 0 1 |features -1 -0.23
|labels 0 1 |features -1 -0.22
|labels 0 1 |features -1 -0.21
|labels 0 1 |features -1 -0.2
|labels 0 1 |features -1 -0.19
|labels 0 1 |features -1 -0.18
|labels 0 1 |features -1 -0.17
|labels 0 1 |features -1 -0.16
|labels 0 1 |features -1 -0.15
|labels 0 1 |features -1 -0.14
|labels 0 1 |features -1 -0.13
|labels 0 1 |features -1 -0.12
|labels 0 1 |features -1 -0.11
|labels 0 1 |features -1 -0.1
|labels 1 0 |features -1 -0.09
|labels 1 0 |features -1 -0.08
|labels 1 0 |features -1 -0.07
|labels 1 0 |features -1 -0.06
|labels 1 0 |features -1 -0.05
|labels 1 0 |features -1 -0.04
|labels 1 0 |features -1 -0.03
|labels 1 0 |features -1 -0.02
|labels 1 0 |features -1 -0.01
|labels 1 0 |features -1 0
|labels 1 0 |features -1 0.01
|labels 1 0 |features -1 0.02
|labels 1 0 |features -1 0.03
|labels 1 0 |features -1 0.04
|labels 1 0 |features -1 0.05
|labels 1 0 |features -1 0.06
|labels 1 0 |features -1 0.07
|labels 1 0 |features -1 0.08
|labels 1 0 |features -1 0.09
|labels 1 0 |features -1 0.1
|labels 1 0 |features -1 0.11
|labels 1 0 |features -1 0.12
|labels 1 0 |features -1 0.13
|labels 1 0 |features -1 0.14
|labels 1 0 |features -1 0.15
|labels 1 0 |features -1 0.16
|labels 1 0 |features -1 0.17
|labels 1 0 |features -1 0.18
|labels 1 0 |features -1 0.19
|labels 1 0 |features -1 0.2
|labels 1 0 |features -1 0.21
|labels 1 0 |features -1 0.22
|labels 1 0 |features -1 0.23
|labels 1 0 |features -1 0.24
|labels 1 0 |features -1 0.25
|labels 1 0 |features -1 0.26
|labels 1 0 |features -1 0.27
|labels 1 0 |features -1 0.28
|labels 1 0 |features -1 0.29
|labels 1 0 |features -1 0.3
|labels 1 0 |features -1 0.31
|labels 1 0 |features -1 0.32
|labels 1 0 |features -1 0.33
|labels 1 0 |features -1 0.34
|labels 1 0 |features -1 0.35
|labels 1 0 |features -1 0.36
|labels 1 0 |features -1 0.37
|labels 1 0 |features -1 0.38
|labels 1 0 |features -1 0.39
|labels 1 0 |features -1 0.4
|labels 1 0 |features -1 0.41
|labels 1 0 |features -1 0.42
|labels 1 0 |features -1 0.43
|labels 1 0 |features -1 0.44
|labels 1 0 |features -1 0.45
|labels 1 0 |features -1 0.46
|labels 1 0 |features -1 0.47
|labels 1 0 |features -1 0.48
|labels 1 0 |features -1 0.49
|labels 1 0 |features -1 0.5
|labels 1 0 |features -1 0.51
|labels 1 0 |features -1 0.52
|labels 1 0 |features -1 0.53
|labels 1 0 |features -1 0.54
|labels 1 0 |features -1 0.55
|labels 1 0 |features -1 0.56
|labels 1 0 |features -1 0.57
|labels 1 0 |features -1 0.58
|labels 1 0 |features -1 0.59
|labels 1 0 |features -1 0.6
|labels 1 0 |features -1 0.61
|labels 1 0 |features -1 0.62
|labels 1 0 |features -1 0.63
|labels 1 0 |features -1 0.64
|labels 1 0 |features -1 0.65
|labels 1 0 |features -1 0.66
|labels 1 0 |features -1 0.67
|labels 1 0 |features -1 0.68
|labels 1 0 |features -1 0.69
|labels 1 0 |features -1 0.7
|labels 1 0 |features -1 0.71
|labels 1 0 |features -1 0.72
|labels 1 0 |features -1 0.73
|labels 1 0 |features -1 0.74
|labels 1 0 |features -1 0.75
|labels 1 0 |features -1 0.76
|labels 1 0 |features -1 0.77
|labels 1 0 |features -1 0.78
|labels 1 0 |features -1 0.79
|labels 1 0 |features -1 0.8
|labels 1 0 |features -1 0.81
|labels 1 0 |features -1 0.82
|labels 1 0 |features -1 0.83
|labels 1 0 |features -1 0.84
|labels 1 0 |features -1 0.85
|labels 1 0 |features -1 0.86
|labels 1 0 |features -1 0.87
|labels 1 0 |features -1 0.88
|labels 1 0 |features -1 0.89
|labels 1 0 |features -1 0.9
|labels 1 0 |features -1 0.91
|labels 1 0 |features -1 0.92
|labels 1 0 |features -1 0.93
|labels 1 0 |features -1 0.94
|labels 1 0 |features -1 0.95
|labels 1 0 |features -1 0.96
|labels 1 0 |features -1 0.97
|labels 1 0 |features -1 0.98
|labels 1 0 |features -1 0.99
|labels 1 0 |features -1 0
|labels 0 1 |features 0 -1
|labels 0 1 |features 0 -0.99
|labels 0 1 |features 0 -0.98
|labels 0 1 |features 0 -0.97
|labels 0 1 |features 0 -0.96
|labels 0 1 |features 0 -0.95
|labels 0 1 |features 0 -0.94
|labels 0 1 |features 0 -0.93
|labels 0 1 |features 0 -0.92
|labels 0 1 |features 0 -0.91
|labels 0 1 |features 0 -0.9
|labels 0 1 |features 0 -0.89
|labels 0 1 |features 0 -0.88
|labels 0 1 |features 0 -0.87
|labels 0 1 |features 0 -0.86
|labels 0 1 |features 0 -0.85
|labels 0 1 |features 0 -0.84
|labels 0 1 |features 0 -0.83
|labels 0 1 |features 0 -0.82
|labels 0 1 |features 0 -0.81
|labels 0 1 |features 0 -0.8
|labels 0 1 |features 0 -0.79
|labels 0 1 |features 0 -0.78
|labels 0 1 |features 0 -0.77
|labels 0 1 |features 0 -0.76
|labels 0 1 |features 0 -0.75
|labels 0 1 |features 0 -0.74
|labels 0 1 |features 0 -0.73
|labels 0 1 |features 0 -0.72
|labels 0 1 |features 0 -0.71
|labels 0 1 |features 0 -0.7
|labels 0 1 |features 0 -0.69
|labels 0 1 |features 0 -0.68
|labels 0 1 |features 0 -0.67
|labels 0 1 |features 0 -0.66
|labels 0 1 |features 0 -0.65
|labels 0 1 |features 0 -0.64
|labels 0 1 |features 0 -0.63
|labels 0 1 |features 0 -0.62
|labels 0 1 |features 0 -0.61
|labels 0 1 |features 0 -0.6
|labels 0 1 |features 0 -0.59
|labels 0 1 |features 0 -0.58
|labels 0 1 |features 0 -0.57
|labels 0 1 |features 0 -0.56
|labels 0 1 |features 0 -0.55
|labels 0 1 |features 0 -0.54
|labels 0 1 |features 0 -0.53
|labels 0 1 |features 0 -0.52
|labels 0 1 |features 0 -0.51
|labels 0 1 |features 0 -0.5
|labels 0 1 |features 0 -0.49
|labels 0 1 |features 0 -0.48
|labels 0 1 |features 0 -0.47
|labels 0 1 |features 0 -0.46
|labels 0 1 |features 0 -0.45
|labels 0 1 |features 0 -0.44
|labels 0 1 |features 0 -0.43
|labels 0 1 |features 0 -0.42
|labels 0 1 |features 0 -0.41
|labels 0 1 |features 0 -0.4
|labels 0 1 |features 0 -0.39
|labels 0 1 |features 0 -0.38
|labels 0 1 |features 0 -0.37
|labels 0 1 |features 0 -0.36
|labels 0 1 |features 0 -0.35
|labels 0 1 |features 0 -0.34
|labels 0 1 |features 0 -0.33
|labels 0 1 |features 0 -0.32
|labels 0 1 |features 0 -0.31
|labels 0 1 |features 0 -0.3
|labels 0 1 |features 0 -0.29
|labels 0 1 |features 0 -0.28
|labels 0 1 |features 0 -0.27
|labels 0 1 |features 0 -0.26
|labels 0 1 |features 0 -0.25
|labels 0 1 |features 0 -0.24
|labels 0 1 |features 0 -0.23
|labels 0 1 |features 0 -0.22
|labels 0 1 |features 0 -0.21
|labels 0 1 |features 0 -0.2
|labels 0 1 |features 0 -0.19
|labels 0 1 |features 0 -0.18
|labels 0 1 |features 0 -0.17
|labels 0 1 |features 0 -0.16
|labels 0 1 |features 0 -0.15
|labels 0 1 |features 0 -0.14
|labels 0 1 |features 0 -0.13
|labels 0 1 |features 0 -0.12
|labels 0 1 |features 0 -0.11
|labels 0 1 |features 0 -0.1
|labels 1 0 |features 0 -0.09
|labels 1 0 |features 0 -0.08
|labels 1 0 |features 0 -0.07
|labels 1 0 |features 0 -0.06
|labels 1 0 |features 0 -0.05
|labels 1 0 |features 0 -0.04
|labels 1 0 |features 0 -0.03
|labels 1 0 |features 0 -0.02
|labels 1 0 |features 0 -0.01
|labels 1 0 |features 0 0
|labels 1 0 |features 0 0.01
|labels 1 0 |features 0 0.02
|labels 1 0 |features 0 0.03
|labels 1 0 |features 0 0.04
|labels 1 0 |features 0 0.05
|labels 1 0 |features 0 0.06
|labels 1 0 |features 0 0.07
|labels 1 0 |features 0 0.08
|labels 1 0 |features 0 0.09
|labels 1 0 |features 0 0.1
|labels 1 0 |features 0 0.11
|labels 1 0 |features 0 0.12
|labels 1 0 |features 0 0.13
|labels 1 0 |features 0 0.14
|labels 1 0 |features 0 0.15
|labels 1 0 |features 0 0.16
|labels 1 0 |features 0 0.17
|labels 1 0 |features 0 0.18
|labels 1 0 |features 0 0.19
|labels 1 0 |features 0 0.2
|labels 1 0 |features 0 0.21
|labels 1 0 |features 0 0.22
|labels 1 0 |features 0 0.23
|labels 1 0 |features 0 0.24
|labels 1 0 |features 0 0.25
|labels 1 0 |features 0 0.26
|labels 1 0 |features 0 0.27
|labels 1 0 |features 0 0.28
|labels 1 0 |features 0 0.29
|labels 1 0 |features 0 0.3
|labels 1 0 |features 0 0.31
|labels 1 0 |features 0 0.32
|labels 1 0 |features 0 0.33
|labels 1 0 |features 0 0.34
|labels 1 0 |features 0 0.35
|labels 1 0 |features 0 0.36
|labels 1 0 |features 0 0.37
|labels 1 0 |features 0 0.38
|labels 1 0 |features 0 0.39
|labels 1 0 |features 0 0.4
|labels 1 0 |features 0 0.41
|labels 1 0 |features 0 0.42
|labels 1 0 |features 0 0.43
|labels 1 0 |features 0 0.44
|labels 1 0 |features 0 0.45
|labels 1 0 |features 0 0.46
|labels 1 0 |features 0 0.47
|labels 1 0 |features 0 0.48
|labels 1 0 |features 0 0.49
|labels 1 0 |features 0 0.5
|labels 1 0 |features 0 0.51
|labels 1 0 |features 0 0.52
|labels 1 0 |features 0 0.53
|labels 1 0 |features 0 0.54
|labels 1 0 |features 0 0.55
|labels 1 0 |features 0 0.56
|labels 1 0 |features 0 0.57
|labels 1 0 |features 0 0.58
|labels 1 0 |features 0 0.59
|labels 1 0 |features 0 0.6
|labels 1 0 |features 0 0.61
|labels 1 0 |features 0 0.62
|labels 1 0 |features 0 0.63
|labels 1 0 |features 0 0.64
|labels 1 0 |features 0 0.65
|labels 1 0 |features 0 0.66
|labels 1 0 |features 0 0.67
|labels 1 0 |features 0 0.68
|labels 1 0 |features 0 0.69
|labels 1 0 |features 0 0.7
|labels 1 0 |features 0 0.71
|labels 1 0 |features 0 0.72
|labels 1 0 |features 0 0.73
|labels 1 0 |features 0 0.74
|labels 1 0 |features 0 0.75
|labels 1 0 |features 0 0.76
|labels 1 0 |features 0 0.77
|labels 1 0 |features 0 0.78
|labels 1 0 |features 0 0.79
|labels 1 0 |features 0 0.8
|labels 1 0 |features 0 0.81
|labels 1 0 |features 0 0.82
|labels 1 0 |features 0 0.83
|labels 1 0 |features 0 0.84
|labels 1 0 |features 0 0.85
|labels 1 0 |features 0 0.86
|labels 1 0 |features 0 0.87
|labels 1 0 |features 0 0.88
|labels 1 0 |features 0 0.89
|labels 1 0 |features 0 0.9
|labels 1 0 |features 0 0.91
|labels 1 0 |features 0 0.92
|labels 1 0 |features 0 0.93
|labels 1 0 |features 0 0.94
|labels 1 0 |features 0 0.95
|labels 1 0 |features 0 0.96
|labels 1 0 |features 0 0.97
|labels 1 0 |features 0 0.98
|labels 1 0 |features 0 0.99
|labels 1 0 |features 0 1
|labels 0 1 |features 1 -1
|labels 0 1 |features 1 -0.99
|labels 0 1 |features 1 -0.98
|labels 0 1 |features 1 -0.97
|labels 0 1 |features 1 -0.96
|labels 0 1 |features 1 -0.95
|labels 0 1 |features 1 -0.94
|labels 0 1 |features 1 -0.93
|labels 0 1 |features 1 -0.92
|labels 0 1 |features 1 -0.91
|labels 0 1 |features 1 -0.9
|labels 0 1 |features 1 -0.89
|labels 0 1 |features 1 -0.88
|labels 0 1 |features 1 -0.87
|labels 0 1 |features 1 -0.86
|labels 0 1 |features 1 -0.85
|labels 0 1 |features 1 -0.84
|labels 0 1 |features 1 -0.83
|labels 0 1 |features 1 -0.82
|labels 0 1 |features 1 -0.81
|labels 0 1 |features 1 -0.8
|labels 0 1 |features 1 -0.79
|labels 0 1 |features 1 -0.78
|labels 0 1 |features 1 -0.77
|labels 0 1 |features 1 -0.76
|labels 0 1 |features 1 -0.75
|labels 0 1 |features 1 -0.74
|labels 0 1 |features 1 -0.73
|labels 0 1 |features 1 -0.72
|labels 0 1 |features 1 -0.71
|labels 0 1 |features 1 -0.7
|labels 0 1 |features 1 -0.69
|labels 0 1 |features 1 -0.68
|labels 0 1 |features 1 -0.67
|labels 0 1 |features 1 -0.66
|labels 0 1 |features 1 -0.65
|labels 0 1 |features 1 -0.64
|labels 0 1 |features 1 -0.63
|labels 0 1 |features 1 -0.62
|labels 0 1 |features 1 -0.61
|labels 0 1 |features 1 -0.6
|labels 0 1 |features 1 -0.59
|labels 0 1 |features 1 -0.58
|labels 0 1 |features 1 -0.57
|labels 0 1 |features 1 -0.56
|labels 0 1 |features 1 -0.55
|labels 0 1 |features 1 -0.54
|labels 0 1 |features 1 -0.53
|labels 0 1 |features 1 -0.52
|labels 0 1 |features 1 -0.51
|labels 0 1 |features 1 -0.5
|labels 0 1 |features 1 -0.49
|labels 0 1 |features 1 -0.48
|labels 0 1 |features 1 -0.47
|labels 0 1 |features 1 -0.46
|labels 0 1 |features 1 -0.45
|labels 0 1 |features 1 -0.44
|labels 0 1 |features 1 -0.43
|labels 0 1 |features 1 -0.42
|labels 0 1 |features 1 -0.41
|labels 0 1 |features 1 -0.4
|labels 0 1 |features 1 -0.39
|labels 0 1 |features 1 -0.38
|labels 0 1 |features 1 -0.37
|labels 0 1 |features 1 -0.36
|labels 0 1 |features 1 -0.35
|labels 0 1 |features 1 -0.34
|labels 0 1 |features 1 -0.33
|labels 0 1 |features 1 -0.32
|labels 0 1 |features 1 -0.31
|labels 0 1 |features 1 -0.3
|labels 0 1 |features 1 -0.29
|labels 0 1 |features 1 -0.28
|labels 0 1 |features 1 -0.27
|labels 0 1 |features 1 -0.26
|labels 0 1 |features 1 -0.25
|labels 0 1 |features 1 -0.24
|labels 0 1 |features 1 -0.23
|labels 0 1 |features 1 -0.22
|labels 0 1 |features 1 -0.21
|labels 0 1 |features 1 -0.2
|labels 0 1 |features 1 -0.19
|labels 0 1 |features 1 -0.18
|labels 0 1 |features 1 -0.17
|labels 0 1 |features 1 -0.16
|labels 0 1 |features 1 -0.15
|labels 0 1 |features 1 -0.14
|labels 0 1 |features 1 -0.13
|labels 0 1 |features 1 -0.12
|labels 0 1 |features 1 -0.11
|labels 0 1 |features 1 -0.1
|labels 0 1 |features 1 -0.09
|labels 0 1 |features 1 -0.08
|labels 0 1 |features 1 -0.07
|labels 0 1 |features 1 -0.06
|labels 0 1 |features 1 -0.05
|labels 0 1 |features 1 -0.04
|labels 0 1 |features 1 -0.03
|labels 0 1 |features 1 -0.02
|labels 0 1 |features 1 -0.01
|labels 1 0 |features 1 0
|labels 1 0 |features 1 0.01
|labels 1 0 |features 1 0.02
|labels 1 0 |features 1 0.03
|labels 1 0 |features 1 0.04
|labels 1 0 |features 1 0.05
|labels 1 0 |features 1 0.06
|labels 1 0 |features 1 0.07
|labels 1 0 |features 1 0.08
|labels 1 0 |features 1 0.09
|labels 1 0 |features 1 0.1
|labels 1 0 |features 1 0.11
|labels 1 0 |features 1 0.12
|labels 1 0 |features 1 0.13
|labels 1 0 |features 1 0.14
|labels 1 0 |features 1 0.15
|labels 1 0 |features 1 0.16
|labels 1 0 |features 1 0.17
|labels 1 0 |features 1 0.18
|labels 1 0 |features 1 0.19
|labels 1 0 |features 1 0.2
|labels 1 0 |features 1 0.21
|labels 1 0 |features 1 0.22
|labels 1 0 |features 1 0.23
|labels 1 0 |features 1 0.24
|labels 1 0 |features 1 0.25
|labels 1 0 |features 1 0.26
|labels 1 0 |features 1 0.27
|labels 1 0 |features 1 0.28
|labels 1 0 |features 1 0.29
|labels 1 0 |features 1 0.3
|labels 1 0 |features 1 0.31
|labels 1 0 |features 1 0.32
|labels 1 0 |features 1 0.33
|labels 1 0 |features 1 0.34
|labels 1 0 |features 1 0.35
|labels 1 0 |features 1 0.36
|labels 1 0 |features 1 0.37
|labels 1 0 |features 1 0.38
|labels 1 0 |features 1 0.39
|labels 1 0 |features 1 0.4
|labels 1 0 |features 1 0.41
|labels 1 0 |features 1 0.42
|labels 1 0 |features 1 0.43
|labels 1 0 |features 1 0.44
|labels 1 0 |features 1 0.45
|labels 1 0 |features 1 0.46
|labels 1 0 |features 1 0.47
|labels 1 0 |features 1 0.48
|labels 1 0 |features 1 0.49
|labels 1 0 |features 1 0.5
|labels 1 0 |features 1 0.51
|labels 1 0 |features 1 0.52
|labels 1 0 |features 1 0.53
|labels 1 0 |features 1 0.54
|labels 1 0 |features 1 0.55
|labels 1 0 |features 1 0.56
|labels 1 0 |features 1 0.57
|labels 1 0 |features 1 0.58
|labels 1 0 |features 1 0.59
|labels 1 0 |features 1 0.6
|labels 1 0 |features 1 0.61
|labels 1 0 |features 1 0.62
|labels 1 0 |features 1 0.63
|labels 1 0 |features 1 0.64
|labels 1 0 |features 1 0.65
|labels 1 0 |features 1 0.66
|labels 1 0 |features 1 0.67
|labels 1 0 |features 1 0.68
|labels 1 0 |features 1 0.69
|labels 1 0 |features 1 0.7
|labels 1 0 |features 1 0.71
|labels 1 0 |features 1 0.72
|labels 1 0 |features 1 0.73
|labels 1 0 |features 1 0.74
|labels 1 0 |features 1 0.75
|labels 1 0 |features 1 0.76
|labels 1 0 |features 1 0.77
|labels 1 0 |features 1 0.78
|labels 1 0 |features 1 0.79
|labels 1 0 |features 1 0.8
|labels 1 0 |features 1 0.81
|labels 1 0 |features 1 0.82
|labels 1 0 |features 1 0.83
|labels 1 0 |features 1 0.84
|labels 1 0 |features 1 0.85
|labels 1 0 |features 1 0.86
|labels 1 0 |features 1 0.87
|labels 1 0 |features 1 0.88
|labels 1 0 |features 1 0.89
|labels 1 0 |features 1 0.9
|labels 1 0 |features 1 0.91
|labels 1 0 |features 1 0.92
|labels 1 0 |features 1 0.93
|labels 1 0 |features 1 0.94
|labels 1 0 |features 1 0.95
|labels 1 0 |features 1 0.96
|labels 1 0 |features 1 0.97
|labels 1 0 |features 1 0.98
|labels 1 0 |features 1 0.99
|labels 1 0 |features 1 1

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -0,0 +1,491 @@
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE file in the project root for full license information.
############################################################################
# G2P.cntk #
# #
# Example for sequence-to-sequence modeling for grapheme-to-phoneme #
# (aka letter-to-sound) conversion on the CMUDict #
############################################################################
# directory defaults (if not overridden)
# G2P (grapheme-to-phoneme) sequence-to-sequence example: top-level configuration.
# All paths are derived from RunRootDir, which is typically overridden on the command line.
RunRootDir = "../.." # default if not overridden
DataDir = "$RunRootDir$/Data"
CacheDir = "$DataDir$/cache" # (not used currently)
ExpRootDir = "$RunRootDir$"
# command to execute
command = train
#command = write
#command = dump
makeMode = false # false = do not skip the command even if its outputs exist (NOTE(review): presumably disables make-style resume — confirm)
# experiment id
deviceId = 0 # set the GPU device here, or "auto" to auto-select; or override from the command line.
ExpId = g2p-1-$deviceId$ # choose a meaningful id here. This is used for unique directory and filenames.
#ExpId = g2p-1-0 # change to different id when decoding a different model
# directories
ExpDir = "$ExpRootDir$/$ExpId$"
ModelDir = "$ExpDir$/Models"
stderr = $ExpDir$/G2P # prefix for the log file(s)
precision = "float"
traceLevel = 1
modelPath = "$ModelDir$/G2P.dnn"
# decoding config --used by the "write" command ("write" decodes and writes the result)
beamDepth = 3 # 0=predict; 1=greedy; >1=beam
decodeModel = 9 # epoch number of the checkpoint to decode
decodeModelPath = "$modelPath$.$decodeModel$" # note: epoch to decode is appended to the model path
decodeOutputPath = "$decodeModelPath$.$beamDepth$" # results are written next to the model, with beamDepth appended
# dump config --used by the "dump" command, for inspecting the model parameters
dumpModelPath = "$modelPath$.2" # put the epoch id here
# top-level model configuration
hiddenDim = 512 # hidden dimension of encoder/decoder LSTM layers
maxLayer = 2 # highest hidden-layer index (see network definition: e.g. 2 for 3 hidden layers)
isBidirectional = false # true = bi-directional LSTM encoder
# comment/uncomment this or the next block to switch between readers
# Note: Currently this configuration cannot reach the same result with CNTKTextFormatReader.
# This is being investigated. For now, please use the LMSequenceReader.
# --- begin uncomment for LMSequenceReader ---
readerType = "LMSequenceReader"
useCNTKTextFormatReader = false
inputVocabSize = 69
labelVocabSize = 69
shareEmbeddings = true
fileExt = "joint"
# --- end uncomment ---
# --- begin uncomment for CNTKTextFormatReader ---
#readerType = "CNTKTextFormatReader"
#useCNTKTextFormatReader = true
#inputVocabSize = 29 # 26 letters plus start, end, apostrophe
#labelVocabSize = 41 # 39 phonemes (~AX missing), plus start and end symbol (in index 0)
#shareEmbeddings = false
#fileExt = "ctf"
# --- end uncomment ---
# corpus
maxLength = 20 # 0 disables attention
isAutoEncoder=false # true = single input sequence that reproduces itself (see network definition)
startSymbol = "<s>"
trainFile = "g014b2b.train-dev-20-21.bsf.$fileExt$"
validFile = "g014b2b.train-dev-1-21.bsf.$fileExt$"
testFile = "g014b2b.test.bsf.$fileExt$"
vocabFile = "g014b2b.wl"
# some reader variables that occur multiple times
# (shared between the train, cvReader, and write reader sections below)
cntkReaderInputDef = [ rawInput = [ alias = "s" ; dim = $inputVocabSize$ ; format = "sparse" ] ; rawLabels = [ alias = "t" ; dim = $labelVocabSize$ ; format = "sparse" ] ]
lmSequenceReaderInputDef = [ dim = 0 ]
lmSequenceReaderInputLabelsDef = [ dim = 1 ; labelType = "category" ; labelDim = "$inputVocabSize$" ; labelMappingFile = "$DataDir$/$vocabFile$" ; beginSequence = "</s>" ; endSequence = "</s>" ]
#######################################
# network definition #
#######################################
BrainScriptNetworkBuilder = (new ComputationNetwork [
# Sequence-to-sequence network: LSTM encoder + LSTM decoder with softmax output.
# The encoder state is handed to the decoder in one of three ways (selected below):
# as initial state, appended to every frame (NYU style), or via attention.
# import general config options from outside config values
useCNTKTextFormatReader = $useCNTKTextFormatReader$
inputVocabDim = $inputVocabSize$
labelVocabDim = $labelVocabSize$
isAutoencoder = $isAutoEncoder$ # input is only one sequence, meant to reproduce itself
attentionSpan = $maxLength$ # attention window, must be large enough for largest input sequence. 0 to disable. Exactly 20 is needed for the g2p CMUDict task
useBidirectionalEncoder = $isBidirectional$ # bi-directional LSTM for encoder
shareEmbeddings = $shareEmbeddings$
hiddenDim = $hiddenDim$
attentionDim = 128 # dim of attention projection
maxLayer = $maxLayer$ # e.g. 2 for 3 hidden layers
useStabilizer = true
useEncoder = true # if false, this becomes a regular RNN
useNYUStyle = false # if true use thought vector for all inputs, NYU-style
# dimensions
embeddingDim = 300
# embedding is skipped (identity) for small vocabularies; see EmbedInput/EmbedLabels below
inputEmbeddingDim = if inputVocabDim < 300 then inputVocabDim else embeddingDim
labelEmbeddingDim = if labelVocabDim < 300 then labelVocabDim else embeddingDim
encoderDims[i:0..maxLayer] = hiddenDim # this defines the number of hidden layers in each
decoderDims[i:0..maxLayer] = hiddenDim # both are one LSTM layer only for now
#############################################################
# inputs
#############################################################
# inputs and axes must be defined on top-scope level in order to get a clean node name from BrainScript.
inputAxis = DynamicAxis()
rawInput = if useCNTKTextFormatReader && !isAutoencoder
then Input (inputVocabDim, dynamicAxis=inputAxis, tag='feature')
else Input (inputVocabDim, tag='feature')
rawLabels = if useCNTKTextFormatReader && !isAutoencoder
then Input (labelVocabDim, tag='label')
else rawInput
# get out input and label data
# Specifically, if the input and label is on a single line, we must split it in two.
streams = [
out = if isAutoencoder || useCNTKTextFormatReader then [
input = TraceSparse (rawInput, 'inp')
labels = TraceSparse (rawLabels, 'lbl')
]
else [
# LMSequenceReader case: input and labels arrive as one joint sequence,
# separated by the token in row 'separatorRow'; split via recurrent flags.
separatorRow = 2 # row index of separator symbol
isSeparator = RowSlice (separatorRow, 1, rawInput) # cut out the separator as a flag
inInput = BS.Boolean.Or (FutureValue (1, inInput , defaultHiddenActivation=0), isSeparator) # flag sequence: word is input...
inLabels = BS.Boolean.Or (PastValue (1, inLabels, defaultHiddenActivation=0), isSeparator) # ...or labels
input = BS.Sequences.Gather (inInput, rawInput) # use flags to split raw input into input and labels
labels = BS.Sequences.Gather (inLabels, rawInput) # (both have different lengths)
]
].out
# inputs and labels are expected to be surrounded by sentence delimiters, e.g. <s> A B C </s> ==> <s> D E F </s>
# The encoder uses all tokens of 'input', while for the target labels we must exclude the initial sentence start, which is only used as the LM history.
inputSequence = Pass (streams.input) # e.g. <s> A B C </s>
labelSequence = Pass (Slice (1, 0, streams.labels, axis=-1)) # e.g. D E F </s>
labelSentenceStart = Pass (BS.Sequences.First (streams.labels)) # e.g. <s>
inputSequenceDim = inputVocabDim
labelSequenceDim = labelVocabDim
isFirstLabel = BS.Loop.IsFirst (labelSequence)
#############################################################
# embeddings --as long as we cannot read multiple sequences, we got one embedding
#############################################################
# Note: Embeddings are linear. Should we use BatchNormalization?
# note: this is assumed to be applied transposed, hence the swapped dimensions. Actually--why? Still needed?
Einput = BS.Parameters.WeightParam (inputSequenceDim, inputEmbeddingDim)
Elabels = if shareEmbeddings then Einput else BS.Parameters.WeightParam (labelSequenceDim, labelEmbeddingDim)
# when vocab dim == embedding dim, embedding is the identity (no parameters applied)
EmbedInput (x) = if inputSequenceDim == inputEmbeddingDim then x else TransposeTimes (Einput, x)
EmbedLabels (x) = if labelSequenceDim == labelEmbeddingDim then x else TransposeTimes (Elabels, x)
inputEmbedded = EmbedInput (inputSequence)
labelsEmbedded = EmbedLabels (labelSequence)
labelSentenceStartEmbedded = Pass (EmbedLabels (labelSentenceStart)) # TODO: remove Pass() if not actually needed in decoder
labelSentenceStartEmbeddedScattered = BS.Sequences.Scatter (isFirstLabel, labelSentenceStartEmbedded) # unfortunately needed presently
S(x) = BS.Parameters.Stabilize (x, enabled=useStabilizer) # self-stabilizer shorthand, used throughout
#############################################################
# encoder (processes inputEmbedded)
#############################################################
# TODO: do not reverse our inputs; instead, if needed, use a backwards-running loop here
encoderFunction = if useBidirectionalEncoder then BS.RNNs.RecurrentBirectionalLSTMPStack else BS.RNNs.RecurrentLSTMPStack
encoder = encoderFunction (encoderDims, cellDims=encoderDims, S(inputEmbedded), inputDim=inputEmbeddingDim,
previousHook=BS.RNNs.PreviousHC,
enableSelfStabilization=useStabilizer)
# take the topmost layer of the encoder stack as the encoder output
encoderOutput = encoder[Length (encoderDims)-1]
# There are three ways of passing encoder state:
# 1. as initial state for decoder (Google style)
# 2. as side information for every decoder step (NYU style)
# 3. attention
# get the final encoder state for use as the initial state
# For beam decoding, we will also inject a second dimension.
thoughtVector = [
h = ReshapeDimension (BS.Sequences.Last (encoderOutput.h), 1, (dim:1))
c = ReshapeDimension (BS.Sequences.Last (encoderOutput.c), 1, (dim:1))
dim = encoderOutput.dim
]
thoughtVectorBroadcast = [ # broadcast to all time steps of the target sequence
h = BS.Sequences.BroadcastSequenceAs (labelsEmbedded, thoughtVector.h)
c = BS.Sequences.BroadcastSequenceAs (labelsEmbedded, thoughtVector.c)
dim = thoughtVector.dim
]
#############################################################
# decoder reordering hook: propagation of beam hypotheses
#############################################################
# we bake into the LSTMs to multiply h and c with the 'beamSearchReorderHook' matrix, which is
# a dummy in training but will be patched through model editing for beam decoding.
# Specifically, the decoder will replace this by a per-sample matrix that reorders hypotheses according to
# how they propagate. E.g. the 2nd best in a frame may be the history of the 3rd best in the subsequent frame
beamSearchReorderHook = Pass (BS.Constants.OnesTensor (1:1))
# helper functions to delay h and c that apply beam-search reordering, if so configured
PreviousHCWithReorderingHook (lstmState, layerIndex=0) = [
h = BS.Loop.Previous (lstmState.h * beamSearchReorderHook) // hidden state(t-1)
c = BS.Loop.Previous (lstmState.c * beamSearchReorderHook) // cell(t-1)
dim = lstmState.dim
]
# like above, but layer 0 is seeded with the thought vector at the first time step
PreviousHCFromThoughtVectorWithReorderingHook (lstmState, layerIndex=0) =
if layerIndex > 0 then PreviousHCWithReorderingHook (lstmState, layerIndex=1)
else [ # with both thought vector and beam-search hook
isFirst = BS.Loop.IsFirst (labelsEmbedded)
h = BS.Boolean.If (isFirst, thoughtVectorBroadcast.h, BS.Loop.Previous (lstmState.h * beamSearchReorderHook))
c = BS.Boolean.If (isFirst, thoughtVectorBroadcast.c, BS.Loop.Previous (lstmState.c * beamSearchReorderHook))
dim = lstmState.dim
]
#############################################################
# decoder history hook: LM history, from ground truth vs. output
#############################################################
# these are the two choices for the input to the decoder network
decoderHistoryFromGroundTruth = labelsEmbedded # for training, decoder input is ground truth...
decoderHistoryFromOutput = Pass (EmbedLabels (Hardmax (z))) # ...but for (greedy) decoding, the decoder's output is its previous input
# during training, we use ground truth. For decoding, we will rewire decoderHistoryHook = decoderHistoryFromOutput
decoderHistoryHook = Pass (decoderHistoryFromGroundTruth) # this gets redirected in decoding to feed back decoding output instead
#############################################################
# decoder
#############################################################
# first step consumes the scattered sentence-start embedding; later steps consume the (delayed) history hook
decoderInput = Pass (BS.Boolean.If (isFirstLabel, labelSentenceStartEmbeddedScattered, BS.Loop.Previous (decoderHistoryHook)))
decoderInputDim = labelEmbeddingDim
decoderDynamicAxis = labelsEmbedded
FixedWindowAttentionHook = BS.Seq2Seq.CreateAugmentWithFixedWindowAttentionHook (attentionDim, attentionSpan, decoderDynamicAxis, encoderOutput, enableSelfStabilization=useStabilizer)
# some parameters to the decoder stack depend on the mode
decoderParams =
# with attention
if useEncoder && attentionSpan > 0 then [
previousHook = PreviousHCWithReorderingHook # add reordering for beam search
augmentInputHook = FixedWindowAttentionHook # input gets augmented by the attention window
augmentInputDim = encoderOutput.dim
]
# with thought vector appended to every frame
else if useEncoder && useNYUStyle then [
previousHook = PreviousHCWithReorderingHook
augmentInputHook (input, lstmState) = S(thoughtVectorBroadcast.h) # each input frame gets augmented by the thought vector
augmentInputDim = thoughtVector.dim
]
# thought vector as initial state for decoder
else [
previousHook = PreviousHCFromThoughtVectorWithReorderingHook # Previous() function with thought vector as initial state
augmentInputHook = BS.RNNs.NoAuxInputHook
augmentInputDim = 0
]
# this is the decoder LSTM stack
decoder = BS.RNNs.RecurrentLSTMPStack (decoderDims, cellDims=decoderDims,
S(decoderInput), inputDim=decoderInputDim,
augmentInputHook=decoderParams.augmentInputHook, augmentInputDim=decoderParams.augmentInputDim,
previousHook=decoderParams.previousHook,
enableSelfStabilization=useStabilizer)
decoderOutputLayer = Length (decoder)-1
decoderOutput = decoder[decoderOutputLayer].h
decoderDim = decoderDims[decoderOutputLayer]
#############################################################
# softmax output layer
#############################################################
W = BS.Parameters.WeightParam (labelSequenceDim, decoderDim)
B = BS.Parameters.BiasParam (labelSequenceDim)
z = W * S(decoderOutput) + B; // top-level input to Softmax
#############################################################
# training criteria
#############################################################
# ce/errs below are written out explicitly (ReduceLogSum/TransposeTimes) instead of
# using CrossEntropyWithSoftmax/ErrorPrediction; the commented-out variants are alternatives.
#ce = Pass (ReduceLogSum (z) - ReduceSum (labelSequence .* z ), tag='criterion')
#errs = Pass (BS.Constants.One - ReduceSum (labelSequence .* Hardmax (z)), tag='evaluation')
#ce2 = Negate (ReduceSum (labelSequence .* LogSoftmax (z)), tag='evaluation')
#ce1 = CrossEntropyWithSoftmax (labelSequence, z, tag='evaluation') // this is the training objective
#errs = ErrorPrediction (labelSequence, z, tag='evaluation') // this also gets tracked
ce = Pass (ReduceLogSum (z) - TransposeTimes (labelSequence, z), tag='criterion')
errs = Pass (BS.Constants.One - TransposeTimes (labelSequence, Hardmax (z)), tag='evaluation')
# score output for decoding
scoreSequence = Pass (z)
#############################################################
# some helper functions
#############################################################
# these trace functions log their parameter's value
TraceState (h, what) = Transpose (Trace (Transpose (h), say=what, logFirst=10, logFrequency=100, logGradientToo=false, onlyUpToRow=9, onlyUpToT=3, format=[ type = "real" ; transpose = false ; precisionFormat = ".4" ]))
TraceDense (h, what) = Trace (h, say=what, logFirst=10, logFrequency=100, logGradientToo=false, onlyUpToRow=21, onlyUpToT=25, format=[ type = "real" ; transpose = false ; precisionFormat = ".4" ])
TraceDenseTransposed (h, what) = Trace (h, say=what, logFirst=10, logFrequency=100, logGradientToo=false, onlyUpToRow=9, onlyUpToT=25, format=[ type = "real" ; transpose = true ; precisionFormat = ".4" ])
TraceOneHot (h, what) = Trace (h, say=what, logFirst=10, logFrequency=100, logGradientToo=false, format=[ type = "category" ; transpose = false ])
TraceSparse (h, what) = Trace (h, say=what, logFirst=10, logFrequency=100, logGradientToo=false, format=[ type = "sparse" ; transpose = false ])
])
#######################################
# TRAINING CONFIG #
#######################################
train = [
action = "train"
traceLevel = 1
epochSize = 0 # (for quick tests, this can be overridden with something small)
# BrainScriptNetworkBuilder is defined in outer scope
SGD = [
# colon-separated values form a per-epoch schedule; the last value repeats for remaining epochs
minibatchSize = 144:144:288:576
learningRatesPerSample = 0.007*2:0.0035 # "0.007*2" = 0.007 for the first 2 epochs, then 0.0035
momentumAsTimeConstant = 1100
gradientClippingWithTruncation = true # (as opposed to clipping the Frobenius norm of the matrix)
clippingThresholdPerSample = 2.3 # visibly impacts objectives, but not final result, so keep it for safety
maxEpochs = 50
numMBsToShowResult = 100
firstMBsToShowResult = 10
gradUpdateType = "none" # FSAdaGrad?
loadBestModel = false # true # broken for some models (rereading overwrites something that got set by validation)
# tracing (enable these for debugging)
#traceNodeNamesReal = labelsEmbedded:decoderInput:"decoder[0].lstmState._privateInnards.ht":z.Plus_left.Times_right.result:z:ce
#traceNodeNamesReal = labelsEmbedded:decoderInput:z:ce
#traceNodeNamesCategory = inputSequence.out:labelSequence
dropoutRate = 0.0
# settings for Auto Adjust Learning Rate
AutoAdjust = [
autoAdjustLR = "adjustAfterEpoch"
reduceLearnRateIfImproveLessThan = 0.001
continueReduce = false
increaseLearnRateIfImproveMoreThan = 1000000000 # effectively disables LR increase
learnRateDecreaseFactor = 0.5
learnRateIncreaseFactor = 1.382
numMiniBatch4LRSearch = 100
numPrevLearnRates = 5
numBestSearchEpoch = 1
]
]
# reader definitions
# Both readers carry the settings for LMSequenceReader and CNTKTextFormatReader;
# each reader type ignores the parameters specific to the other.
reader = [
readerType = "$readerType$"
file = "$DataDir$/$trainFile$"
randomize = "auto"
# specific to CNTKTextFormatReader
skipSequenceIds = "false"
maxErrors = 100
traceLevel = 2
chunkSizeInBytes = 30000000 # large enough for entire data set
input = $cntkReaderInputDef$
# specific to LMSequenceReader
mode = "softmax" # TODO: find out what this means
nbruttsineachrecurrentiter = 0 # 0 means auto-fill given minibatch size
cacheBlockSize = 100000000 # read block size. This value is large enough to load entire corpus at once
rawInput = $lmSequenceReaderInputDef$
inputLabelsDef = $lmSequenceReaderInputLabelsDef$
outputDummy = [ labelType = "none" ]
]
# cross-validation reader: same structure as 'reader', but on the validation set, unrandomized
cvReader = [
readerType = "$readerType$"
file = "$DataDir$/$validFile$"
randomize = "none"
# specific to CNTKTextFormatReader
skipSequenceIds = "false"
maxErrors = 100
traceLevel = 2
input = $cntkReaderInputDef$
# specific to LMSequenceReader
mode = "softmax" # TODO: find out what this means
nbruttsineachrecurrentiter = 0 # 0 means auto-fill given minibatch size
cacheBlockSize = 100000000 # read block size. This value is large enough to load entire corpus at once
rawInput = $lmSequenceReaderInputDef$
inputLabelsDef = $lmSequenceReaderInputLabelsDef$
outputDummy = [ labelType = "none" ]
]
]
#######################################
# DUMP CONFIG #
#######################################
# dumps the model, specifically the learnable parameters
# writes a text dump of the model at 'dumpModelPath' (configured at the top of this file)
dump = [
action = "dumpnode"
modelPath = "$dumpModelPath$"
outputFile = "$dumpModelPath$.txt"
]
#######################################
# WRITE CONFIG #
#######################################
# This will decode the test set. The beamDepth parameter specifies the decoding mode:
# beamDepth = 0: word prediction given ground truth history (only useful for perplexity measurement)
# beamDepth = 1: greedy decoding: At each time step, choose a word greedily
# beamDepth > 1: beam decoder. Keep 'beamDepth' best hypotheses, and output their globally best at the end.
write = [
action = "write"
# select the decoder
BrainScriptNetworkBuilder = (
# beamDepth = 0 will decode with the unmodified model.
# beamDepth = 1 will modify the model to use the decoding output as the decoder's input.
# beamDepth > 1 will modify the model to track multiple hypotheses and select the globally best at the end.
if $beamDepth$ == 0 then BS.Network.Load ("$decodeModelPath$")
else if $beamDepth$ == 1 then BS.Seq2Seq.GreedySequenceDecoderFrom (BS.Network.Load ("$decodeModelPath$"))
else BS.Seq2Seq.BeamSearchSequenceDecoderFrom (BS.Network.Load ("$decodeModelPath$"), $beamDepth$)
)
outputPath = $decodeOutputPath$
#outputPath = "-" # "-" will write to stdout; useful for debugging
# declare the nodes we want to write out
# not all decoder configs have the same node names, so we just list them all
#outputNodeNames = inputsOut:labelsOut:decodeOut:network.beamDecodingModel.inputsOut:network.beamDecodingModel.labelsOut:network.beamDecodingModel.decodeOut
# output format
# We configure the output to emit a flat sequence of token strings.
format = [
type = "category"
transpose = false
labelMappingFile = "$DataDir$/$vocabFile$"
]
minibatchSize = 8192 # choose this to be big enough for the longest sentence
traceLevel = 1
epochSize = 0
# test-set reader: same structure as the training reader, but unrandomized and one sequence at a time
reader = [
readerType = "$readerType$"
file = "$DataDir$/$testFile$"
randomize = "none"
# specific to CNTKTextFormatReader
skipSequenceIds = "false"
maxErrors = 100
traceLevel = 2
input = $cntkReaderInputDef$
# specific to LMSequenceReader
mode = "softmax" # TODO: find out what this means
nbruttsineachrecurrentiter = 1 # 1 means one sequence at a time
# BUGBUG: ^^ =0 currently produces bad output. I suspect Times (data, data)
cacheBlockSize = 100000000 # read block size. This value is large enough to load entire corpus at once
rawInput = $lmSequenceReaderInputDef$
inputLabelsDef = $lmSequenceReaderInputLabelsDef$
outputDummy = [ labelType = "none" ]
]
]

Просмотреть файл

@ -0,0 +1,23 @@
This example demonstrates the use of CNTK for letter-to-sound conversion using a
sequence-to-sequence model with attention.
The code supports a number of alternative configurations. As currently configured, it implements:
* a 3-hidden layer unidirectional LSTM encoder network, all hidden dimensions are 512
* a 3-hidden layer unidirectional LSTM decoder network, all hidden dimensions are 512
* encoder state is passed to the decoder by means of attention, with projection dimension 128 and maximum input length of 20 tokens
* embedding disabled (the vocabulary is very small)
* beam decoder with beam width 3
This example uses the CMUDict as a corpus. The data or a conversion script will be included soon.
To Use:
=======
Modify the following in G2P.cntk:
* pathnames
* deviceId to specify CPU (-1) or GPU (>=0 or "auto")
Run:
* command line: cntk configFile=Examples/SequenceToSequence/Miscellaneous/G2P/G2P.cntk RunRootDir=g2p
* VS Debugger: configFile=$(SolutionDir)Examples/SequenceToSequence/Miscellaneous/G2P/G2P.cntk RunRootDir=$(SolutionDir)g2p

Просмотреть файл

@ -10,7 +10,6 @@ write=[
# deviceId=-1 for CPU, >=0 for GPU devices
deviceId=$DeviceNumber$
traceLevel=1
useValidation=true
printValues=true

Просмотреть файл

@ -12,7 +12,6 @@ TrainDNN=[
# deviceId=-1 for CPU, >=0 for GPU devices
deviceId=$DeviceNumber$
traceLevel=1
useValidation=true
NDLNetworkBuilder=[
ndlMacros=$NdlDir$/default_macros.ndl
@ -98,7 +97,6 @@ TrainLSTM=[
# deviceId=-1 for CPU, >=0 for GPU devices
deviceId=$DeviceNumber$
traceLevel=1
useValidation=true
NDLNetworkBuilder=[
ndlMacros=$NdlDir$/default_macros.ndl
networkDescription=$ndlfile$
@ -183,7 +181,6 @@ TrainPACRNN=[
# deviceId=-1 for CPU, >=0 for GPU devices
deviceId=$DeviceNumber$
traceLevel=1
useValidation=true
NDLNetworkBuilder=[
# ndlMacros=$NdlDir$/default_macros.ndl
@ -286,7 +283,6 @@ write=[
# deviceId=-1 for CPU, >=0 for GPU devices
deviceId=$DeviceNumber$
traceLevel=1
useValidation=true
printValues=true

Просмотреть файл

@ -12,7 +12,6 @@ TrainDNN=[
# deviceId=-1 for CPU, >=0 for GPU devices
deviceId=$DeviceNumber$
traceLevel=1
useValidation=true
NDLNetworkBuilder=[
ndlMacros=$NdlDir$/default_macros.ndl

Просмотреть файл

@ -10,7 +10,6 @@ TrainModel=[
# deviceId=-1 for CPU, >=0 for GPU devices
deviceId=$DeviceNumber$
traceLevel=1
useValidation=true
NDLNetworkBuilder=[
ndlMacros=$NdlDir$/default_macros.ndl
networkDescription=$NdlDir$/model.ndl

Просмотреть файл

@ -11,7 +11,6 @@ TrainModel=[
# deviceId=-1 for CPU, >=0 for GPU devices
deviceId=$DeviceNumber$
traceLevel=1
useValidation=true
NDLNetworkBuilder=[
ndlMacros=$NdlDir$/default_macros.ndl
networkDescription=$NdlDir$/model.ndl

Просмотреть файл

@ -20,7 +20,6 @@ TrainModel=[
# deviceId=-1 for CPU, >=0 for GPU devices
deviceId=$DeviceNumber$
traceLevel=1
useValidation=true
NDLNetworkBuilder=[
ndlMacros=$NdlDir$/default_macros.ndl

Просмотреть файл

@ -19,7 +19,6 @@ TrainModel=[
# deviceId=-1 for CPU, >=0 for GPU devices
deviceId=$DeviceNumber$
traceLevel=1
useValidation=true
NDLNetworkBuilder=[
ndlMacros=$NdlDir$/default_macros.ndl

Просмотреть файл

@ -12,7 +12,6 @@ write=[
# deviceId=-1 for CPU, >=0 for GPU devices
deviceId=$DeviceNumber$
traceLevel=1
useValidation=true
printValues=true

Просмотреть файл

@ -12,7 +12,6 @@ write=[
# deviceId=-1 for CPU, >=0 for GPU devices
deviceId=$DeviceNumber$
traceLevel=1
useValidation=true
printValues=true

Просмотреть файл

@ -20,7 +20,6 @@ speechTrainNDL=[
deviceId=-1
traceLevel=1
useValidation=true
NDLNetworkBuilder=[
networkDescription=$ConfigFolder$\LSTM_1layer.ndl

Просмотреть файл

@ -20,7 +20,6 @@ speechTrainNDL=[
deviceId=0
traceLevel=1
useValidation=true
NDLNetworkBuilder=[
networkDescription=$ConfigFolder$\LSTM_1layer.ndl

Просмотреть файл

@ -12,7 +12,6 @@ TIMIT_TrainLSTM=[
# deviceId=-1 for CPU, >=0 for GPU devices
deviceId=$DeviceNumber$
traceLevel=1
useValidation=true
truncated=true
@ -87,4 +86,4 @@ TIMIT_TrainLSTM=[
labelMappingFile=$MlfDir$/TIMIT.statelist
]
]
]
]

Просмотреть файл

@ -40,7 +40,6 @@ deviceId=-1
epochSize=4430000
# which is 886 * 5000
defaultHiddenActivity=0.1
useValidation=true
rnnType=CLASSLM
# rnnType=LSTM
@ -307,7 +306,6 @@ test=[
# which is 886 * 5000
recurrentLayer=1
defaultHiddenActivity=0.1
useValidation=true
rnnType=CLASSLM
modelPath=$ExpFolder$\modelRnnCNTK
@ -410,4 +408,4 @@ test=[
]
]
]
]
]

Просмотреть файл

@ -41,7 +41,6 @@ train=[
# which is 886 * 5000
recurrentLayer=1
defaultHiddenActivity=0.1
useValidation=true
rnnType=CLASSLM
# rnnType=LSTM
@ -308,7 +307,6 @@ test=[
# which is 886 * 5000
recurrentLayer=1
defaultHiddenActivity=0.1
useValidation=true
rnnType=CLASSLM
modelPath=$ExpFolder$\modelRnnCNTK
@ -411,4 +409,4 @@ test=[
]
]
]
]
]

Просмотреть файл

@ -30,7 +30,6 @@ epochSize=4430000
# which is 886 * 5000
# recurrentLayer=1
defaultHiddenActivity=0.0
useValidation=true
rnnType=CLASSLM
# rnnType=LSTM
@ -297,7 +296,6 @@ test=[
# which is 886 * 5000
recurrentLayer=1
defaultHiddenActivity=0.1
useValidation=true
rnnType=CLASSLM
modelPath=$ExpFolder$\modelRnnCNTK
@ -400,4 +398,4 @@ test=[
]
]
]
]
]

Просмотреть файл

@ -31,7 +31,6 @@ epochSize=4430000
# which is 886 * 5000
recurrentLayer=1
defaultHiddenActivity=0.0
useValidation=true
rnnType=CLASSLM
# rnnType=LSTM
@ -298,7 +297,6 @@ test=[
# which is 886 * 5000
recurrentLayer=1
defaultHiddenActivity=0.1
useValidation=true
rnnType=CLASSLM
modelPath=$ExpFolder$\modelRnnCNTK
@ -401,4 +399,4 @@ test=[
]
]
]
]
]

Просмотреть файл

@ -31,7 +31,6 @@ train=[
# which is 886 * 5000
recurrentLayer=1
defaultHiddenActivity=0.0
useValidation=true
rnnType=CLASSLM
# rnnType=LSTM
@ -297,7 +296,6 @@ test=[
# which is 886 * 5000
recurrentLayer=1
defaultHiddenActivity=0.1
useValidation=true
rnnType=CLASSLM
modelPath=$ExpFolder$\modelRnnCNTK
@ -400,4 +398,4 @@ test=[
]
]
]
]
]

Разница между файлами не показана из-за своего большого размера Загрузить разницу

Просмотреть файл

@ -1,48 +0,0 @@
# TODO: must sort this out. For now, this is just shared stuff between training and decoding.
# these depend on beamDepth parameter for now, fix this
TraceState (h, what) =
if enableTracing
then Transpose (Trace (Transpose (h), say=what, logFirst=10, logFrequency=traceFrequency, logGradientToo=false, onlyUpToRow=beamDepth*beamDepth, onlyUpToT=3, format=formatDense))
else h
TraceDense (h, what) =
if enableTracing
then Trace (h, say=what, logFirst=10, logFrequency=traceFrequency, logGradientToo=false, onlyUpToRow=21/*beamDepth*beamDepth*/, onlyUpToT=25, format=formatDense)
else h
TraceDenseTransposed (h, what) =
if enableTracing
then Trace (h, say=what, logFirst=10, logFrequency=traceFrequency, logGradientToo=false, onlyUpToRow=beamDepth*beamDepth, onlyUpToT=25, format=formatDenseTransposed)
else h
TraceOneHot (h, what) =
if enableTracing
then Trace (h, say=what, logFirst=10, logFrequency=traceFrequency, logGradientToo=false, /*onlyUpToRow=beamDepth*beamDepth, onlyUpToT=15,*/ format=formatOneHot)
else h
TraceSparse (h, what) =
if enableTracing
then Trace (h, say=what, logFirst=10, logFrequency=traceFrequency, logGradientToo=false, /*onlyUpToRow=beamDepth*beamDepth, onlyUpToT=3,*/ format=formatSparse)
else h
Trace (node, say='', logFrequency=traceFrequency, logFirst=10, logGradientToo=false, onlyUpToRow=100000000, onlyUpToT=100000000, format=[], tag='') = new ComputationNode [
operation = 'Trace' ; inputs = node
]
formatDense = [
type = "real"
transpose = false
precisionFormat = ".4"
]
formatDenseTransposed = [
type = "real"
transpose = true
precisionFormat = ".4"
]
formatOneHot = [
type = "category"
transpose = false
labelMappingFile = tracingLabelMappingFile
]
formatSparse = [
type = "sparse"
transpose = false
labelMappingFile = tracingLabelMappingFile
]

Просмотреть файл

@ -171,6 +171,17 @@ ifndef CNTK_CUDA_GENCODE
endif
endif
# Should we relocate *.gcno and *.gcda files using -fprofile-dir option?
# Use GCOV_PREFIX and GCOV_PREFIX_STRIP if relocating:
# For example, if the object file /user/build/foo.o was built with -fprofile-arcs, the final executable will try to create the data file
# /user/build/foo.gcda when running on the target system. This will fail if the corresponding directory does not exist and it is unable
# to create it. This can be overcome by, for example, setting the environment as GCOV_PREFIX=/target/run and GCOV_PREFIX_STRIP=1.
# Such a setting will name the data file /target/run/build/foo.gcda
ifdef CNTK_CODE_COVERAGE
CXXFLAGS += -fprofile-arcs -ftest-coverage
LDFLAGS += -lgcov --coverage
endif
ifeq ("$(BUILDTYPE)","debug")
ifdef CNTK_CUDA_CODEGEN_DEBUG
GENCODE_FLAGS := $(CNTK_CUDA_CODEGEN_DEBUG)
@ -243,6 +254,7 @@ READER_SRC =\
$(SOURCEDIR)/Readers/ReaderLib/TruncatedBpttPacker.cpp \
$(SOURCEDIR)/Readers/ReaderLib/PackerBase.cpp \
$(SOURCEDIR)/Readers/ReaderLib/FramePacker.cpp \
$(SOURCEDIR)/Readers/ReaderLib/ChunkCache.cpp \
COMMON_SRC =\
$(SOURCEDIR)/Common/Config.cpp \
@ -257,10 +269,12 @@ COMMON_SRC =\
MATH_SRC =\
$(SOURCEDIR)/Math/CPUMatrix.cpp \
$(SOURCEDIR)/Math/CPUSparseMatrix.cpp \
$(SOURCEDIR)/Math/CPURNGHandle.cpp \
$(SOURCEDIR)/Math/MatrixQuantizerImpl.cpp \
$(SOURCEDIR)/Math/MatrixQuantizerCPU.cpp \
$(SOURCEDIR)/Math/QuantizedMatrix.cpp \
$(SOURCEDIR)/Math/Matrix.cpp \
$(SOURCEDIR)/Math/RNGHandle.cpp \
$(SOURCEDIR)/Math/TensorView.cpp \
$(SOURCEDIR)/Math/CUDAPageLockedMemAllocator.cpp \
$(SOURCEDIR)/Math/ConvolutionEngine.cpp \
@ -272,6 +286,7 @@ MATH_SRC +=\
$(SOURCEDIR)/Math/GPUTensor.cu \
$(SOURCEDIR)/Math/GPUSparseMatrix.cu \
$(SOURCEDIR)/Math/GPUWatcher.cu \
$(SOURCEDIR)/Math/GPURNGHandle.cu \
$(SOURCEDIR)/Math/MatrixQuantizerGPU.cu \
$(SOURCEDIR)/Math/CuDnnCommon.cu \
$(SOURCEDIR)/Math/CuDnnConvolutionEngine.cu \
@ -341,6 +356,24 @@ $(LIBDIR)/HTKMLFReader.so: $(HTKMLFREADER_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ -l$(CNTKMATH)
########################################
# CompositeDataReader plugin
########################################
COMPOSITEDATAREADER_SRC =\
$(SOURCEDIR)/Readers/CompositeDataReader/CompositeDataReader.cpp \
$(SOURCEDIR)/Readers/CompositeDataReader/Exports.cpp \
COMPOSITEDATAREADER_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(COMPOSITEDATAREADER_SRC))
COMPOSITEDATAREADER:=$(LIBDIR)/CompositeDataReader.so
ALL+=$(COMPOSITEDATAREADER)
SRC+=$(COMPOSITEDATAREADER_SRC)
$(LIBDIR)/CompositeDataReader.so: $(COMPOSITEDATAREADER_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ -l$(CNTKMATH)
########################################
# ExperimentalHTKMLFReader plugin
########################################
@ -554,9 +587,10 @@ ifeq (,$(wildcard Source/1BitSGD/*.h))
$(error Build with 1bit-SGD was requested but cannot find the code. Please check https://github.com/Microsoft/CNTK/wiki/Enabling-1bit-SGD for instructions)
endif
INCLUDEPATH += $(SOURCEDIR)/1BitSGD
INCLUDEPATH += $(SOURCEDIR)/1BitSGD
COMMON_FLAGS += -DQUANTIZED_GRADIENT_AGGREGATION
COMMON_FLAGS += -DCNTK_PARALLEL_TRAINING_SUPPORT
# temporarily adding to 1bit, need to work with others to fix it
endif
########################################

Просмотреть файл

@ -1,16 +1,19 @@
# CNTK
## Latest news
*2016-05-16.* An example illustrating [Using CNTK with ResNet](https://github.com/Microsoft/CNTK/tree/master/Examples/Image/Miscellaneous/ImageNet/ResNet) is added to the codebase. The example contains some pre-trained models that can be used in various applications.
*2016-05-16.* CNTK Wiki now has [FAQ Page](https://github.com/Microsoft/CNTK/wiki/CNTK-FAQ)
*2016-05-05.* CNTK now supports *BlockMomentum* Stochastic Gradient Descent (SGD) algorithm.
See the details in the [Multiple GPUs and machines Wiki section](https://github.com/Microsoft/CNTK/wiki/Multiple-GPUs-and-machines)
*2016-05-03.* New transformations are implemented for **Image Reader**.
See the description in the [Image Reader Wiki section](https://github.com/Microsoft/CNTK/wiki/Image-reader)
*2016-04-25.* V 1.1 Binary release
CNTK v.1.1 binaries are on the [CNTK Releases page](https://github.com/Microsoft/CNTK/releases/tag/v1.1)
*2016-04-12.* CNTK is available as [Azure Virtual Machines](https://github.com/Microsoft/CNTK/wiki/CNTK-on-Azure) and [Docker Containers](https://github.com/Microsoft/CNTK/wiki/CNTK-Docker-Containers)
*2016-04-12.* Added support for ND convolution and ND pooling and CPU support for `cudnn` layout in convolution, pooling and batch normalization nodes.
Read [documentation](https://github.com/Microsoft/CNTK/wiki/Full-NDL-Function-Reference) on convolution, pooling and batch normalization nodes.
*2016-04-05.* CUDA7.5 support for Windows Build: Windows project files have been updated to automatically utilize CUDA 7.5 if present
See [all news](https://github.com/Microsoft/CNTK/wiki/News).
## What is CNTK

@ -1 +1 @@
Subproject commit f57be8b8caeddf385a44a14acc587f4e5168152d
Subproject commit 18fcb1a9378432ae179948b0f1e281115a2c7d86

Просмотреть файл

@ -61,6 +61,8 @@ static void DoEvalBase(const ConfigParameters& config, IDataReader& reader)
size_t maxSamplesInRAM = config(L"maxSamplesInRAM", (size_t)SIZE_MAX);
size_t numSubminiBatches = config(L"numSubminibatches", (size_t)1);
bool enableDistributedMBReading = config(L"distributedMBReading", false);
ConfigArray evalNodeNames = config(L"evalNodeNames", "");
vector<wstring> evalNodeNamesVector;
for (int i = 0; i < evalNodeNames.size(); ++i)
@ -75,7 +77,7 @@ static void DoEvalBase(const ConfigParameters& config, IDataReader& reader)
config(L"traceNodeNamesCategory", ConfigParameters::Array(stringargvector())),
config(L"traceNodeNamesSparse", ConfigParameters::Array(stringargvector())));
SimpleEvaluator<ElemType> eval(net, MPIWrapper::GetInstance(), numMBsToShowResult, traceLevel, maxSamplesInRAM, numSubminiBatches);
SimpleEvaluator<ElemType> eval(net, MPIWrapper::GetInstance(), enableDistributedMBReading, numMBsToShowResult, traceLevel, maxSamplesInRAM, numSubminiBatches);
eval.Evaluate(&reader, evalNodeNamesVector, mbSize[0], epochSize);
}
@ -125,6 +127,8 @@ void DoCrossValidate(const ConfigParameters& config)
size_t maxSamplesInRAM = config(L"maxSamplesInRAM", (size_t)SIZE_MAX);
size_t numSubminiBatches = config(L"numSubminibatches", (size_t)1);
bool enableDistributedMBReading = config(L"distributedMBReading", false);
ConfigArray evalNodeNames = config(L"evalNodeNames", "");
vector<wstring> evalNodeNamesVector;
for (int i = 0; i < evalNodeNames.size(); ++i)
@ -157,7 +161,7 @@ void DoCrossValidate(const ConfigParameters& config)
cvModels.push_back(cvModelPath);
auto net = ComputationNetwork::CreateFromFile<ElemType>(deviceId, cvModelPath);
SimpleEvaluator<ElemType> eval(net, MPIWrapper::GetInstance(), numMBsToShowResult, traceLevel, maxSamplesInRAM, numSubminiBatches);
SimpleEvaluator<ElemType> eval(net, MPIWrapper::GetInstance(), enableDistributedMBReading, numMBsToShowResult, traceLevel, maxSamplesInRAM, numSubminiBatches);
fprintf(stderr, "Model %ls --> \n", cvModelPath.c_str());
auto evalErrors = eval.Evaluate(&cvDataReader, evalNodeNamesVector, mbSize[0], epochSize);

Просмотреть файл

@ -89,7 +89,7 @@ bool TryGetNetworkFactory(const ConfigRecordType& config, function<ComputationNe
L"precision = '%ls'\n" // 'float' or 'double'
L"network = %ls", // source code of expression that evaluates to a ComputationNetwork
(int)deviceId, ElemTypeName<ElemType>(), sourceOfNetwork.c_str());
let expr = BS::ParseConfigDictFromString(sourceOfBS, move(includePaths));
let expr = BS::ParseConfigDictFromString(sourceOfBS, L"BrainScriptNetworkBuilder", move(includePaths));
// the rest is done in a lambda that is only evaluated when a virgin network is needed
// Note that evaluating the BrainScript *is* instantiating the network, so the evaluate call must be inside the lambda.

Просмотреть файл

@ -874,27 +874,48 @@ public:
{
let &config = *configp;
double &us = *this; // we write to this
let arg = config[L"arg"];
let whatArg = config[L"what"];
wstring what = whatArg;
if (what == L"Floor")
us = floor((double) arg);
else if (what == L"Length")
if (what == L"Floor" || what == L"Length") // one-arg functions
{
if (arg.Is<String>())
us = (double) ((wstring &) arg).size();
else // otherwise expect an array
let arg = config[L"arg"];
if (what == L"Floor")
{
let & arr = arg.AsRef<ConfigArray>();
let range = arr.GetIndexRange();
us = (double) (range.second + 1 - range.first);
us = floor((double)arg);
}
else if (what == L"Length")
{
if (arg.Is<String>())
us = (double)((wstring &)arg).size();
else // otherwise expect an array
{
let & arr = arg.AsRef<ConfigArray>();
let range = arr.GetIndexRange();
us = (double)(range.second + 1 - range.first);
}
}
}
else if (what == L"Mod" || what == L"IntDiv") //two-arg int functions
{
let argsArg = config[L"args"];
let& args = argsArg.AsRef<ConfigArray>();
auto range = args.GetIndexRange();
if (range.second != range.first + 1)
argsArg.Fail(L"Mod/IntDiv expects two arguments");
let arg1 = (int)args.At(range.first);
let arg2 = (int)args.At(range.second);
if (what == L"Mod")
us = (int)(arg1 % arg2);
else if (what == L"IntDiv")
us = (int)(arg1 / arg2);
}
else
whatArg.Fail(L"Unknown 'what' value to NumericFunction: " + what);
}
};
// CompareFunctions
// - IsSameObject()
class CompareFunction : public BoxOf<Bool>

Просмотреть файл

@ -122,11 +122,10 @@ struct Issue
issues.back().AddMarkup(symbol, location.charPos);
}
// print it backwards
if (!locations.empty()) // (be resilient to some throwers not having a TextrLocation; to be avoided)
if (!locations.empty()) // (be resilient to some throwers not having a TextLocation; to be avoided)
{
let& firstLoc = issues.front().location;
fprintf(stderr, "\n%ls while %ls line %d char %d of %ls\n", errorKind, kind, (int) firstLoc.lineNo + 1 /*report 1-based*/, (int) firstLoc.charPos + 1, firstLoc.GetSourceFile().path.c_str());
fprintf(stderr, "see location marked ^ and parent contexts marked 0..9, a..z, A..Z:\n\n");
fprintf(stderr, "[CALL STACK]\n");
for (auto i = issues.rbegin(); i != issues.rend(); i++)
{
let& issue = *i;
@ -135,9 +134,11 @@ struct Issue
const auto line = (where.lineNo == lines.size()) ? L"(end)" : lines[where.lineNo].c_str();
fprintf(stderr, " %ls\n %ls\n", line, issue.markup.c_str());
}
fprintf(stderr, "%ls while %ls: %ls(%d)", errorKind, kind, firstLoc.GetSourceFile().path.c_str(), (int)firstLoc.lineNo + 1 /*report 1-based*/);
}
fprintf(stderr, "%ls: %ls\n", errorKind, what);
fflush(stderr);
else
fprintf(stderr, "%ls while %ls", errorKind, kind);
fprintf(stderr, ": %ls\n", what), fflush(stderr);
}
/*static*/ vector<SourceFile> TextLocation::sourceFileMap;
@ -941,7 +942,7 @@ public:
static void Test()
{
let parserTest = L"a=1\na1_=13;b=2 // cmt\ndo = (print\n:train:eval) ; x = array[1..13] (i=>1+i*print.message==13*42) ; print = new PrintAction [ message = 'Hello World' ]";
ParseConfigDictFromString(parserTest, vector<wstring>())->Dump();
ParseConfigDictFromString(parserTest, L"Test", vector<wstring>())->Dump();
}
};
@ -950,9 +951,9 @@ static ExpressionPtr Parse(SourceFile&& sourceFile, vector<wstring>&& includePat
{
return Parser(move(sourceFile), move(includePaths)).ParseRecordMembersToDict();
}
ExpressionPtr ParseConfigDictFromString(wstring text, vector<wstring>&& includePaths)
ExpressionPtr ParseConfigDictFromString(wstring text, wstring location, vector<wstring>&& includePaths)
{
return Parse(SourceFile(L"(command line)", text), move(includePaths));
return Parse(SourceFile(location, text), move(includePaths));
}
//ExpressionPtr ParseConfigDictFromFile(wstring path, vector<wstring> includePaths)
//{

Просмотреть файл

@ -78,9 +78,9 @@ public:
virtual const wchar_t* kind() const = 0; // e.g. "warning" or "error"
// pretty-print this as an error message
void /*ScriptingException::*/ PrintError() const
void /*ScriptingException::*/ PrintError(const std::wstring& linePrefix) const
{
TextLocation::PrintIssue(locations, L"error", kind(), msra::strfun::utf16(what()).c_str());
TextLocation::PrintIssue(locations, linePrefix.c_str(), kind(), msra::strfun::utf16(what()).c_str());
}
void AddLocation(TextLocation where)
{
@ -134,7 +134,7 @@ struct Expression
typedef Expression::ExpressionPtr ExpressionPtr; // circumvent some circular definition problem
// access the parser through one of these functions
ExpressionPtr ParseConfigDictFromString(wstring text, vector<wstring>&& includePaths); // parses a list of dictionary members, returns a dictionary expression
ExpressionPtr ParseConfigDictFromString(wstring text, wstring location, vector<wstring>&& includePaths); // parses a list of dictionary members, returns a dictionary expression
// TODO: These rvalue references are no longer adding value, change to const<>&
//ExpressionPtr ParseConfigDictFromFile(wstring path, vector<wstring> includePaths); // likewise, but from a file path
ExpressionPtr ParseConfigExpression(const wstring& sourceText, vector<wstring>&& includePaths); // parses a single expression from sourceText, which is meant to contain an include statement, hence includePaths

Просмотреть файл

@ -178,7 +178,7 @@ void SomeTests()
{
fprintf(stderr, "\n### Test %d ###\n\n", (int) i), fflush(stderr);
let parserTest = parserTests[i];
let expr = ParseConfigDictFromString(standardFunctions + computationNodes + commonMacros + parserTest, vector<wstring>());
let expr = ParseConfigDictFromString(standardFunctions + computationNodes + commonMacros + parserTest, L"Test", vector<wstring>());
//expr->Dump();
Do(expr);
if (oneOnly)
@ -187,7 +187,8 @@ void SomeTests()
}
catch (const ConfigException& err)
{
err.PrintError();
err.PrintError(L"error");
}
}
} } } // namespaces
}}} // namespaces

Просмотреть файл

@ -21,6 +21,9 @@ Min(a,b) = if a < b then a else b
Max(a,b) = if a > b then a else b
Fac(n) = if n > 1 then Fac(n-1) * n else 1
IsSameObject(a,b) = new CompareFunction [ what = 'IsSameObject' ; args = (a : b) ]
Mod(x, y) = new NumericFunction [ what = 'Mod' ; args = (x:y) ]
IntDiv(x, y) = new NumericFunction [ what = 'IntDiv' ; args = (x:y) ]
##############################################################################
# ComputationNodes
@ -182,6 +185,8 @@ CosDistanceWithNegativeSamples(aVectorSequence, anotherVectorSequence, numShifts
Cosine(x, tag='') = new ComputationNode [ operation = 'Cosine' ; inputs = x /*plus the function args*/ ]
CrossEntropy(refProbVectorSequence, outProbVectorSequence, tag='') = new ComputationNode [ operation = 'CrossEntropy' ; inputs = (refProbVectorSequence : outProbVectorSequence) /*plus the function args*/ ]
CrossEntropyWithSoftmax(labelVectorSequence, outProbVectorSequence, tag='') = new ComputationNode [ operation = 'CrossEntropyWithSoftmax' ; inputs = (labelVectorSequence : outProbVectorSequence) /*plus the function args*/ ]
# once ReduceLogSum becomes proper C++, CrossEntropyWithSoftmax() will become this:
NewCrossEntropyWithSoftmax (labelSequence, z, tag='') = [ tag1 = tag; out = Minus (ReduceLogSum (z), ReduceSum (labelSequence .* z), tag=tag1) ].out
DiagTimes(diagonalMatrixAsColumnVector, matrix, tag='') = new ComputationNode [ operation = 'DiagTimes' ; inputs = (diagonalMatrixAsColumnVector : matrix) /*plus the function args*/ ]
// TODO: DiagTimes = ElementTimes
Dropout(activationVectorSequence, tag='') = new ComputationNode [ operation = 'Dropout' ; inputs = activationVectorSequence /*plus the function args*/ ]
@ -197,6 +202,7 @@ KhatriRaoProduct(leftMatrix, rightMatrix, tag='') = new ComputationNode [ operat
Log(x, tag='') = new ComputationNode [ operation = 'Log' ; inputs = x /*plus the function args*/ ]
LogPlus(leftMatrix, rightMatrix, tag='') = new ComputationNode [ operation = 'LogPlus' ; inputs = (leftMatrix : rightMatrix) /*plus the function args*/ ]
LogSoftmax(z, tag='') = new ComputationNode [ operation = 'LogSoftmax' ; inputs = z /*plus the function args*/ ]
# TODO: ^^ along axis, like Softmax
MatrixL1Reg(matrix, tag='') = new ComputationNode [ operation = 'MatrixL1Reg' ; inputs = matrix /*plus the function args*/ ]
MatrixL2Reg(matrix, tag='') = new ComputationNode [ operation = 'MatrixL2Reg' ; inputs = matrix /*plus the function args*/ ]
Mean(dataVectorSequence, tag='') = new ComputationNode [ operation = 'Mean' ; inputs = dataVectorSequence /*plus the function args*/ ]
@ -209,27 +215,42 @@ PerDimMeanVarNormalization(dataVectorSequence, meanVector, invStdDevVector, tag=
Plus(leftMatrix, rightMatrix, tag='') = new ComputationNode [ operation = 'Plus' ; inputs = (leftMatrix : rightMatrix) /*plus the function args*/ ]
Reciprocal(z, tag='') = new ComputationNode [ operation = 'Reciprocal' ; inputs = z /*plus the function args*/ ]
RectifiedLinear(z, tag='') = new ComputationNode [ operation = 'RectifiedLinear' ; inputs = z /*plus the function args*/ ]
ReducePlus (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "Plus" /*plus the function args*/ ]
#ReduceLogPlus (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "LogPlus" /*plus the function args*/ ]
ReduceSum (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "Sum" /*plus the function args*/ ]
# the following is a temporary workaround until we have the C++ version
ReduceLogSum (z, axis=0, tag='') = if axis != 0 then Fail("ReduceLogSum for now only supports axis=0.")
else [ tag1=tag ; axis1=axis ; out = RowSlice (0, 1, z - LogSoftmax (z), tag=tag1) ].out
#ReduceLogSum (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "LogSum" /*plus the function args*/ ]
#ReduceMean (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "Mean" /*plus the function args*/ ]
#ReduceMax (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "Max" /*plus the function args*/ ]
#ReduceMin (z, axis=0, tag='') = new ComputationNode [ operation = 'ReduceElements' ; inputs = z ; reductionOp = "Min" /*plus the function args*/ ]
RNN(A, B, hiddenSize=10, numLayers=1, bidirectional=false, rnnMode='LSTM', tag='') = new ComputationNode [ operation = 'RNN' ; inputs = ( A : B ) /*plus the function args*/ ]
Round(x, tag='') = Floor(Plus(x, ConstantTensor(0.5, (1))), tag=tag)
Scale(scalarScalingFactor, matrix, tag='') = new ComputationNode [ operation = 'Scale' ; inputs = (scalarScalingFactor : matrix) /*plus the function args*/ ]
// TODO: Scale = ElementTimes
# TODO: Scale = ElementTimes
ScatterPacked(cond, indexSequence, sourceData, tag='') = new ComputationNode [ operation = 'ScatterPacked' ; inputs = (cond : indexSequence : sourceData) /*plus the function args*/ ]
Sigmoid(z, tag='') = new ComputationNode [ operation = 'Sigmoid' ; inputs = z /*plus the function args*/ ]
Sin(z, tag='') = new ComputationNode [ operation = 'Sin' ; inputs = z /*plus the function args*/ ]
Softmax(z, tag='') = new ComputationNode [ operation = 'Softmax' ; inputs = z /*plus the function args*/ ]
Softmax (z, axis=0, tag='') = # TODO: replace this with more efficient version below once we have ReduceLogSum
if axis == 0 then new ComputationNode [ operation = 'Softmax' ; inputs = z /*plus the function args*/ ]
else
[
numerator = Softmax (z) # do a first Softmax to bring it into harmless numeric range
denominator = ReduceSum (axis=axis1, numerator) ; axis1 = axis # reduce along axis
P = numerator .* Reciprocal (denominator) # normalize numerator by the sum along the given axis
# TODO: This is not efficient. Once we have ReduceLogSum, it will be this:
#Z = ReduceLogSum (axis=axis0, z) # reduce along axis
#P = Exp (z - Z)
].P
Hardmax(z, tag='') = new ComputationNode [ operation = 'Hardmax' ; inputs = z /*plus the function args*/ ]
Sqrt(z, tag='') = new ComputationNode [ operation = 'Sqrt' ; inputs = z /*plus the function args*/ ]
SquareError(aMatrix, anotherMatrix, tag='') = new ComputationNode [ operation = 'SquareError' ; inputs = (aMatrix : anotherMatrix) /*plus the function args*/ ]
SumColumnElements(z, tag='') = new ComputationNode [ operation = 'SumColumnElements' ; inputs = z /*plus the function args*/ ] // deprecated
SumColumnElements(z, tag='') = new ComputationNode [ operation = 'SumColumnElements' ; inputs = z /*plus the function args*/ ] # deprecated
SumElements(matrix, tag='') = new ComputationNode [ operation = 'SumElements' ; inputs = matrix /*plus the function args*/ ]
# ^^ TODO: Rename to ReduceSumMB?
Tanh(z, tag='') = new ComputationNode [ operation = 'Tanh' ; inputs = z /*plus the function args*/ ]
TimeReverse(vectorSequence, tag='') = new ComputationNode [ operation = 'TimeReverse' ; inputs = vectorSequence /*plus the function args*/ ]
Trace (node, say='', logFrequency=traceFrequency, logFirst=10, logGradientToo=false, onlyUpToRow=100000000, onlyUpToT=100000000, format=[], tag='') = new ComputationNode [ operation = 'Trace' ; inputs = node ]
TransposeTimes(leftMatrix, rightMatrix, tag='') = new ComputationNode [ operation = 'TransposeTimes' ; inputs = (leftMatrix : rightMatrix) /*plus the function args*/ ]
Where(cond, tag='') = new ComputationNode [ operation = 'Where' ; inputs = cond /*plus the function args*/ ]
@ -275,6 +296,8 @@ Constants = [
# is this like Sequences.Repeat?
True = 1
False = 0
None = ConstantTensor (42, (1))
IsNone (x) = IsSameObject (x, None)
]
##############################################################################
@ -301,6 +324,7 @@ Boolean = [
# select a value
# Note: This will be replaced by BrainScript 'if cond then thenVal else elseVal' and SwitchNode
If (cond, thenVal, elseVal, tag='') = new ComputationNode [ operation = 'If' ; inputs = (cond : thenVal : elseVal) /*plus the function args*/ ]
#If (cond, thenVal, elseVal) = cond .* thenVal + Not (cond) .* elseVal
]
##############################################################################
@ -329,13 +353,25 @@ Sequences = [
# returns a record [ value=..., valid=... ], both being 1-step sequences of [dim x N]. N can optionally be moved to axes >2.
# This implementation is suboptimal in that it creates copies for the intermediate steps.
PastValueWindow (N, in, axis=2) = [
isLast = Loop.IsLast (in)
isLastIndex = PackedIndex (in, Where (isLast))
GatherLast (x) = GatherPacked (isLastIndex, x) # 'cond' matches 'x'
onesLikeIn = Constants.OnesLike (in)
delayLine[t:0..N-1] = [ # shift register for encoder, last N inputs
value = if t == 0
then in # delay 0: current value
else Loop.PastValue (0, in, timeStep=t)
then in # delay 0: current value
else PastValue (0, in, timeStep=t, defaultHiddenActivation=0)
valid = if t == 0
then Constants.One
else Constants.One - PastValue (1, Constants.ZeroesLike (in), timeStep=t, defaultHiddenActivation=1)
then onesLikeIn # BUGBUG: if I say Constant.Ones here, it outputs 0. Ones has no MBLayout
else PastValue (1, onesLikeIn, timeStep=t, defaultHiddenActivation=0)
TraceDenseTransposed (h, what) = h
# Trace (h, say=what, logFirst=10, logFrequency=100, logGradientToo=false, onlyUpToRow=9, onlyUpToT=25, format=[ type = "real" ; transpose = true ; precisionFormat = ".4" ])
lastValue = TraceDenseTransposed( GatherLast (value) ,'dvalue') # [i, delay]
lastValid = TraceDenseTransposed( GatherLast (valid) ,'dvalid') # [i, delay]
]
# delayLine[t].value = value of t steps in the past
# delayLine[t].valid = true if we had a value t steps in the past
@ -343,8 +379,8 @@ Sequences = [
if axis == 2 then SplitDimension (x, 1, N)
else if axis > 2 then TransposeDimensions (SplitDimension (x, 1, N), 2, axis)
else Fail ("PastValueWindow: axis>2 required.") # BUGBUG: We also require that input is a single vector. Address later.
value = Slice (-1, 0, axis=-1, SplitStack (RowStack (array[0..N-1](t=>delayLine[t].value)))) # [i, delay]
valid = Slice (-1, 0, axis=-1, SplitStack (RowStack (array[0..N-1](t=>delayLine[t].valid)))) # [i, delay]
value = SplitStack (RowStack (array[0..N-1](t=>delayLine[t].lastValue))) # [i, delay]
valid = SplitStack (RowStack (array[0..N-1](t=>delayLine[t].lastValid))) # [i, delay]
]
# fold left/right: Reduce entire sequence by applying binaryOp, e.g. FoldL (Plus, 0, input)
@ -369,47 +405,48 @@ Sequences = [
# sequence-altering LINQ-like operators
# These generate new data packing (MBLayouts)
# TakeWhile and DropWhile
TakeWhile (predicate, x) = Filter ( _WhilePredicate (PastValue, predicate), x)
SkipWhile (predicate, x) = Filter (!_WhilePredicate (PastValue, predicate), x)
_WhilePredicate (DelayFn, predicate, input) =
[
whilePredicateRec = Boolean.And (DelayFn (whilePredicateRec, defaultHiddenActivation=Boolean.True), predicate)
].whilePredicateRec
# TODO: do we need operations from the back?
# First and Take
# LINQ allows predicates as well.
First (x) = Take (1, x)
Take (N, x) = _Take (PastValue, N, x)
_Take (DelayFn, N, x) = [
selected = Loop._IsWithin (DelayFn, N, x)
out = Gather (selected, x)
].out
# Last and TakeRight
Last (x) = TakeRight (1, x)
TakeRight (N, x) = _Take (FutureValue, N, x)
Skip (N, x) = if N > 0 then _Skip (PastValue, N, x) else x
_Skip (DelayFn, N, x) = [ // TODO: merge with _Take
selected = Loop._IsWithin (DelayFn, N, x)
out = Gather (Boolean.Not (selected), x)
].out
ElementAt (n, x) = [ // not efficient, as it filters twice. Better AND the predicates. TODO: what if n is out of range? ElementAtOrDefault
startMask = Skip (n, x) // ...000111...
mask = startMask - PastValue (0, startMask) // ...000100...
out = Gather (mask, x)
]
Single (predicate, x) = x
First (x) = Slice (0, 1, x, axis=-1)
Last (x) = Slice (-1, 0, x, axis=-1)
# TakeWhile and DropWhile
#TakeWhile (predicate, x) = Filter ( _WhilePredicate (PastValue, predicate), x)
#SkipWhile (predicate, x) = Filter (!_WhilePredicate (PastValue, predicate), x)
#_WhilePredicate (DelayFn, predicate, input) =
#[
# whilePredicateRec = Boolean.And (DelayFn (whilePredicateRec, defaultHiddenActivation=Boolean.True), predicate)
#].whilePredicateRec
# TODO: do we need operations from the back?
#Take (N, x) = _Take (PastValue, N, x)
#TakeRight (N, x) = _Take (FutureValue, N, x)
#_Take (DelayFn, N, x) = [
# selected = Loop._IsWithin (DelayFn, N, x)
# out = Gather (selected, x)
#].out
#
#Skip (N, x) = if N > 0 then _Skip (PastValue, N, x) else x
#_Skip (DelayFn, N, x) = [ // TODO: merge with _Take
# selected = Loop._IsWithin (DelayFn, N, x)
# out = Gather (Boolean.Not (selected), x)
#].out
#ElementAt (n, x) = [ // not efficient, as it filters twice. Better AND the predicates. TODO: what if n is out of range? ElementAtOrDefault
# startMask = Skip (n, x) // ...000111...
# mask = startMask - PastValue (0, startMask) // ...000100...
# out = Gather (mask, x)
#]
#Single (predicate, x) = x
#FirstOrDefault (x) = ? // can empty sequences exist or even be represented by CNTK?
Average (x) = Sum (x) / Loop.Count(x) // TODO: patch opQuotient to check 0/0 = 0
Sum (x) = FoldL (Plus, 0, x)
LogSum (x) = FoldL (LogPlus, 0, x)
#Average (x) = Sum (x) / Loop.Count(x) // TODO: patch opQuotient to check 0/0 = 0
#Sum (x) = FoldL (Plus, 0, x)
#LogSum (x) = FoldL (LogPlus, 0, x)
#Max (x) = FoldL (^.Max, ?, x) // TODO: name clash; need to implement ^.
#Min (x) = FoldL (^.Min, ?, x) // TODO: what's the init value?
All (x) = FoldL (Boolean.And, OnesLike (x), x)
Any (x) = FoldL (Boolean.Or, ZeroesLike (x), x)
#All (x) = FoldL (Boolean.And, OnesLike (x), x)
#Any (x) = FoldL (Boolean.Or, ZeroesLike (x), x)
# Join to create 2D fields for s2s attention?
@ -478,10 +515,24 @@ Parameters =
StabilizeElements (x, inputDim=x.dim, enabled=true) =
if enabled
then [
beta = Exp (BiasParam ((inputDim)))
result = beta .* x
#beta = Exp (BiasParam ((inputDim))) # init value is 0
#beta = ParameterTensor ((inputDim), init='fixedValue', value=1.0) # init value is 1
# or SoftPlus: ln(1+e^beta)
#beta = Log (Constants.One + Exp (ParameterTensor ((inputDim), init='fixedValue', value=0.54132485/*ln (e-1)*/))) # init value is 1
# sharpened Softplus: 1/f ln(1+e^{f*beta})
# this behaves linear for weights around 1, yet guarantees positiveness
f = ConstantTensor (4, (1))
fInv = Reciprocal (f)
beta = fInv .* Log (Constants.One + Exp (f .* ParameterTensor ((inputDim), init='fixedValue', value=0.99537863/* 1/f*ln (e^f-1) */))) # init value is 1
TraceDense (h, what) = h # delete h and uncomment Trace to trace the beta values. They are a valuable indicator.
//Trace (h, say=what, logFirst=10, logFrequency=100, logGradientToo=false, onlyUpToRow=9, onlyUpToT=25, format=[ type = "real" ; transpose = false ; precisionFormat = ".6" ])
result = TraceDense ( beta, 'beta') .* x
].result
else x
else x
# and the same with a scalar stabilizer shared across all components
Stabilize (x, enabled=true) = if enabled then StabilizeElements (x, inputDim=1, enabled=true) else x
@ -494,37 +545,46 @@ Parameters =
RNNs =
[
# LSTMP -- LSTM function with projection and self-stabilization
# Projection it enabled by passing different values for outputDim and cellDim.
# Projection is enabled by passing different values for outputDim and cellDim.
# This is the stateless version that takes the previous state as an input.
# It returns a dictionary with three members: h and c, and dim=h.dim for convenience. prevState must have h and c.
LSTMP (outputDim, cellDim=outputDim, x, inputDim=x.dim, prevState, enableSelfStabilization=false) =
# This function also takes an optional auxiliary input, e.g. for suporting attention models.
LSTMP (outputDim, cellDim=outputDim, x, inputDim=x.dim, aux=Constants.None, auxDim=aux.dim, prevState, enableSelfStabilization=false) =
[
S(x) = Parameters.Stabilize (x, enabled=enableSelfStabilization)
# TODO: rename to just _
_privateInnards = [ // encapsulate the inner workings
_privateInnards = [ // encapsulate the inner workings
dh = prevState.h // previous values
dc = prevState.c
// parameter macros--these carry their own weight matrices
dhs = S(dh) // previous values, stabilized
dcs = S(dc)
# note: input does not get a stabilizer here, user is meant to do that outside
// parameter macros
# note: each invocation comes with its own set of weights
B() = Parameters.BiasParam (cellDim)
W() = Parameters.WeightParam (cellDim, inputDim) // input
A() = Parameters.WeightParam (cellDim, auxDim) // aux input
H() = Parameters.WeightParam (cellDim, outputDim) // hidden-to-hidden
C() = Parameters.DiagWeightParam (cellDim) // cell-to-hiddden (note: applied elementwise)
#inputDim1 = inputDim
#W(v) = Parameters.WeightParam (cellDim, inputDim) * Parameters.StabilizeElements (v, inputDim=inputDim1, enabled=enableSelfStabilization) // input-to-hidden
# ^^ element-wise stab, use if input is a concatenation; vv stab for entire matrix
W(v) = Parameters.WeightParam (cellDim, inputDim) * Parameters.Stabilize (v, enabled=enableSelfStabilization) // input-to-hidden
H(h) = Parameters.WeightParam (cellDim, outputDim) * Parameters.Stabilize (h, enabled=enableSelfStabilization) // hidden-to-hidden
C(c) = Parameters.DiagWeightParam (cellDim) .* Parameters.Stabilize (c, enabled=enableSelfStabilization) // cell-to-hiddden (note: applied elementwise)
# projected contribution from input(s) and bias
pin() = if Constants.IsNone (aux)
then B() + W() * x
else B() + W() * x + A() * aux
// note: the W(x) here are all different, they all come with their own set of weights; same for H(dh), C(dc), and B()
it = Sigmoid (W(x) + B() + H(dh) + C(dc)) // input gate(t)
bit = it .* Tanh (W(x) + (H(dh) + B())) // applied to tanh of input network
it = Sigmoid (pin() + H() * dhs + C() .* dcs) // input gate(t)
bit = it .* Tanh (pin() + H() * dhs) // applied to tanh of input network
ft = Sigmoid (W(x) + B() + H(dh) + C(dc)) // forget-me-not gate(t)
bft = ft .* dc // applied to cell(t-1)
ft = Sigmoid (pin() + H() * dhs + C() .* dcs) // forget-me-not gate(t)
bft = ft .* dc // applied to cell(t-1)
ct = bft + bit // c(t) is sum of both
ct = bft + bit // c(t) is sum of both
ot = Sigmoid (W(x) + B() + H(dh) + C(ct)) // output gate(t)
ht = ot .* Tanh (ct) // applied to tanh(cell(t))
ot = Sigmoid (pin() + H() * dhs + C() .* S(ct)) // output gate(t)
ht = ot .* Tanh (ct) // applied to tanh(cell(t))
]
# our return values
@ -532,51 +592,397 @@ RNNs =
h = if outputDim != cellDim // output/hidden state
then [ // project
Wmr = Parameters.WeightParam (outputDim, cellDim);
htp = Wmr * Parameters.Stabilize (_privateInnards.ht, enabled=enableSelfStabilization)
].htp // TODO: ^^ extend BS syntax to allow to say: then [ Wmr = WeightParam(outputDim, cellDim) ] in Wmr * Stabilize (...)
htp = Wmr * S(_privateInnards.ht)
].htp
else _privateInnards.ht // no projection
dim = outputDim
]
# helper function to delay h and c
# Callers can provide their own, e.g. useful for beam decoding.
PreviousHC (lstmState) = [
PreviousHC (lstmState, layerIndex=0) = [
h = Loop.Previous (lstmState.h) // hidden state(t-1)
c = Loop.Previous (lstmState.c) // cell(t-1)
dim = lstmState.dim
]
# pass previousHook=BS.RNNs.NextHC instead of PreviousHC to get a right-to-left recurrence
NextHC (lstmState) = [
NextHC (lstmState, layerIndex=0) = [
h = Loop.Next (lstmState.h) // hidden state(t-1)
c = Loop.Next (lstmState.c) // cell(t-1)
dim = lstmState.dim
]
NoAuxInputHook (input, lstmState) = Constants.None
# this implements a recurrent (stateful) LSTM with projection and self-stabilization
# It returns a record (h,c). To use its output, say .h
# By default, this is left-to-right. Pass previousHook=BS.RNNs.NextHC for a right-to-left model.
# TODO: remove the -2 once this works
RecurrentLSTMP = RecurrentLSTMP2
RecurrentLSTMP2 (outputDim, cellDim=outputDim.dim, x, inputDim=x.dim, previousHook=PreviousHC, enableSelfStabilization=false) =
RecurrentLSTMP (outputDim/*h.dim*/, cellDim=outputDim,
x, inputDim=x.dim,
previousHook=BS.RNNs.PreviousHC,
augmentInputHook=NoAuxInputHook, augmentInputDim=0,
layerIndex=0,
enableSelfStabilization=false) =
[
prevState = previousHook (lstmState)
inputDim1 = inputDim ; cellDim1 = cellDim ; enableSelfStabilization1 = enableSelfStabilization // TODO: BS syntax needs to allow to say ^.enableSelfStabilization
lstmState = BS.RNNs.LSTMP (outputDim, cellDim=cellDim1, x, inputDim=inputDim1, prevState, enableSelfStabilization=enableSelfStabilization1)
].lstmState // we return the state record (h,c)
enableSelfStabilization1 = enableSelfStabilization ; cellDim1 = cellDim ; inputDim1 = inputDim ; layerIndex1 = layerIndex # workaround
prevState = previousHook (lstmState, layerIndex=layerIndex1) # recurrent memory. E.g. Previous or Next, with or without initial state, beam reordering etc.
auxInput = augmentInputHook(x, prevState) # optionally augment input. Constants.None if none.
lstmState = BS.RNNs.LSTMP (outputDim, cellDim=cellDim1, x, inputDim=inputDim1, aux=auxInput, auxDim=augmentInputDim, prevState, enableSelfStabilization=enableSelfStabilization1)
].lstmState // that's the value we return
# a stack of recurrent LSTMs (unidirectional)
RecurrentLSTMPStack = RecurrentLSTMP2Stack # TODO: remove the -2 name once this works
RecurrentLSTMP2Stack (hiddenDims, cellDims=hiddenDims, input, inputDim=input.dim, previousHook=PreviousHC, enableSelfStabilization=false) = [
previousHook1 = previousHook ; useStabilizer = enableSelfStabilization
layers[i:0..Length (hiddenDims)-1] =
RecurrentLSTMP2 (hiddenDims[i], cellDim=cellDims[i],
if i == 0 then input else layers[i-1].h, inputDim=if i == 0 then inputDim else hiddenDims[i-1] /*TODO: layers[i-1].dim*/,
previousHook=previousHook1,
enableSelfStabilization=useStabilizer)
RecurrentLSTMPStack (layerDims, cellDims=layerDims,
input, inputDim=input.dim,
previousHook=PreviousHC,
augmentInputHook=NoAuxInputHook, augmentInputDim=0,
enableSelfStabilization=false) =
[
previousHook1 = previousHook ; useStabilizer = enableSelfStabilization ; augmentInputHook1 = augmentInputHook ; augmentInputDim1 = augmentInputDim
layers[i:0..Length (layerDims)-1] =
RecurrentLSTMP (layerDims[i], cellDim=cellDims[i],
if i == 0 then input else Parameters.Stabilize (layers[i-1].h, enabled=useStabilizer), inputDim=if i == 0 then inputDim else layers[i-1].dim,
previousHook=previousHook1,
augmentInputHook=if i == 0 then augmentInputHook1 else NoAuxInputHook, augmentInputDim=if i == 0 then augmentInputDim1 else 0,
layerIndex=i,
enableSelfStabilization=useStabilizer)
].layers
# a stack of recurrent LSTMs (bidirectional)
# TODO: Should we define layerDims as the total (sum of both forward and backward direction)?
RecurrentBirectionalLSTMPStack (layerDims, cellDims=layerDims, input, inputDim=input.dim, previousHook=PreviousHC, nextHook=NextHC, enableSelfStabilization=false) = [
previousHook1 = previousHook ; nextHook1 = nextHook ; useStabilizer = enableSelfStabilization
layers[i:0..Length (layerDims)-1] =
[
v = if i == 0 then input else Parameters.Stabilize (layers[i-1].h, enabled=useStabilizer)
vDim = if i == 0 then inputDim else layers[i-1].dim
fwd = RecurrentLSTMP (layerDims[i], cellDim=cellDims[i],
v, inputDim=vDim,
previousHook=previousHook1,
layerIndex=i,
enableSelfStabilization=useStabilizer)
bwd = RecurrentLSTMP (layerDims[i], cellDim=cellDims[i],
v, inputDim=vDim,
previousHook=nextHook1,
layerIndex=i,
enableSelfStabilization=useStabilizer)
h = Splice ((fwd.h : bwd.h), axis=1)
c = Splice ((fwd.c : bwd.c), axis=1)
dim = layerDims[i] * 2 # output dimension
]
].layers
]
##############################################################################
# Network operations
# sequence-to-sequence models
# This implements attention model and beam decoding.
##############################################################################
Seq2Seq =
[
# attention model
# The attention model is an additional input vector to the LSTM.
# Here, it is implemented by augmenting this vector to the regular input of the LSTM.
# The RecurrentLSTMP function does this inside through an optional lambda that the caller can pass in.
# This function creates such a lambda, which augments the input vector from a fixed-size attention window.
CreateAugmentWithFixedWindowAttentionHook (attentionDim, attentionSpan, decoderDynamicAxis, encoderOutput, enableSelfStabilization=false) =
[
# Parameters:
#   attentionDim       : dimension of the projected space in which attention energies are computed
#   attentionSpan      : number of encoder frames kept in the fixed rolling attention window
#   decoderDynamicAxis : dynamic axis of the decoder sequence that the window is broadcast to
#   encoderOutput      : record with fields .h (encoder hidden states) and .dim (their dimension)
# Returns: the AugmentInputHook lambda defined at the bottom (note the ].AugmentInputHook selector).
# attention (fixed rolling window)
attentionWindow = Sequences.PastValueWindow (attentionSpan, encoderOutput.h, axis=2) # BUGBUG: We should have this in axis=3 right away for beam search. Track this down.
S(x) = Parameters.Stabilize (x, enabled=enableSelfStabilization)
# project it for Tanh() expression
# expected to be [attentionDim x 1 x attentionSpan], where that 1 is the axis of the beam in beam decoding
projectedAttentionWindowBroadcast = [
W = Parameters.WeightParam (attentionDim, encoderOutput.dim)
# inject an additional singleton dimension at second axis, as a stand-in for the beam depth in decoding
InjectBeamDepth (node) = SplitDimension (node, /*axis*/1, /*N:*/1)
#projectedValue = Sequences.BroadcastSequenceAs (decoderDynamicAxis, InjectBeamDepth (W * attentionWindow.value)) # apply the projection columnwise to the attentionWindow tensor
projectedValue = if enableSelfStabilization # apply the projection columnwise to the attentionWindow tensor
then Sequences.BroadcastSequenceAs (decoderDynamicAxis, InjectBeamDepth (W * S(attentionWindow.value .* attentionWindow.valid))) # (mask invalid frames for stabilizer)
else Sequences.BroadcastSequenceAs (decoderDynamicAxis, InjectBeamDepth (W * attentionWindow.value))
value = Sequences.BroadcastSequenceAs (decoderDynamicAxis, InjectBeamDepth ( attentionWindow.value))
valid = Sequences.BroadcastSequenceAs (decoderDynamicAxis, InjectBeamDepth ( attentionWindow.valid))
dim = encoderOutput.dim
]
# the return value of this function is this lambda, which gets passed to the RecurrentLSTMP() function as the augmentInputHook parameter
# input is unused here; attention depends only on the previous decoder state (prevState) and the window above.
AugmentInputHook (input, prevState) =
[
# compute additional hidden state from attention
outputDim = prevState.dim
W = Parameters.WeightParam (attentionDim, outputDim)
projectedH = W * S(prevState.h) # [outputDim] or [outputDim x D] in beam search
tanHOut = Tanh (projectedAttentionWindowBroadcast.projectedValue + projectedH) # [attentionDim x beamDepth x attentionSpan]
# You can enable (uncomment) these Trace macros to enable tracing of the attention weights, which is a useful indicator.
TraceDense (h, what) = h
//Trace (h, say=what, logFirst=10, logFrequency=100, logGradientToo=false, onlyUpToRow=9, onlyUpToT=25, format=[ type = "real" ; transpose = false ; precisionFormat = ".4" ])
TraceDenseTransposed (h, what) = h
//Trace (h, say=what, logFirst=10, logFrequency=100, logGradientToo=false, onlyUpToRow=9, onlyUpToT=25, format=[ type = "real" ; transpose = true ; precisionFormat = ".4" ])
v = TraceDenseTransposed( Parameters.WeightParam (1, attentionDim) ,'v') # [1 x attentionDim]
u = v * S(tanHOut .* projectedAttentionWindowBroadcast.valid) # [1 x beamDepth x attentionSpan]
# ^^ mask 'v' for purpose of stabilization; TODO: don't do that if no stabilization
uValid = u + Log (projectedAttentionWindowBroadcast.valid) # [1 x beamDepth x attentionSpan]  (Log(0) pushes invalid frames to -INF before Softmax)
attentionWeights = Softmax (uValid, axis=3) # [1 x beamDepth x attentionSpan]
weightedAttentionWindow = projectedAttentionWindowBroadcast.value .* TraceDense( attentionWeights ,'weights') # [encoderHiddenDim x beamDepth x attentionSpan]
# TODO: use ReduceSum:
# this is the auxiliary input to the LSTMP function
weightedAttentionAverage = S(Times (weightedAttentionWindow, BS.Constants.OnesTensor (attentionSpan), outputRank=2)) # [encoderHiddenDim x beamDepth]
].weightedAttentionAverage
].AugmentInputHook
# helper macro that extracts top D hypotheses from a 2D tensor
# input: scores[w,n] w = word index, n = hyp index in beam (n=0 is the best one)
# output: [w,n1,n2] n1 = input hyp index (prev top N); n2 = output hyp index (new top N)
# e.g. 4 words, beam 3; view this as 3 [4x3] planes "drawn" 3-dimensionally, with depth being the 3rd tensor index
GetTopNTensor (D, scores) = [
# Extracts the top D entries of 'scores' by peeling them off one at a time:
# each recursion step takes the Hardmax (a one-hot over (w,n)) of what is left,
# then masks that winner out with a large negative value so the next step
# finds the runner-up. The D one-hot planes are then spliced along axis 3.
# recurse over up to D elements
# In each recursion:
# - pick the best over (w,n)
# - subtract it out from scores
recursion[n:0..D-1] =
[
curBestScores = if n == 0 # scores excluding paths better than rank n
then scores # top: just the path scores
else recursion[n - 1].nextBestScores # next: path scores after removing all we already got
best = Hardmax (curBestScores) # best = one-hot over (w,n)
nextBestScores = curBestScores + Constant (-1e30) .* best # set the ones we've already got to -INF
# TODO: use proper -INF; e.g. -1/0 in BS. Needs to be tested thoroughly.
]
# splice them together into a single tensor
asArray[n:0..D-1] = recursion[n].best # this is a BS array consisting only of the 'best' field ('from r in recursion select r.best')
spliced = Splice (axis = 3, asArray) # convert BS array index n to tensor index n1
].spliced
# Create a greedy decoder model from an existing trained model.
# The input model is expected to have these nodes:
# - decoderHistoryFromOutput: the decoding output of a time step (Hardmax (outputProbability))
# - decoderHistoryHook: a node that is the word sequence that will be used as the history for the next time step
# In training, this is the label sequence.
# In greedy decoding, it must be decoderHistoryHook = decoderHistoryFromOutput
# - z: scaled log prediction probability --TODO: rename this: scoreSequence = Pass (z)
# - inputSequence
# - labelSequence (only passed through for scoring, not used in decoding)
# The returned model has the following one-hot outputs:
# - decodedSequence --TODO: currently decodeOut; rename this
# - inputSequence
# - labelSequence
# To decode greedily, in "write" or "eval" specify the model as:
# BrainScriptNetworkBuilder = (BS.S2S.GreedySequenceDecoderFrom (BS.Network.Load ("$decodeModelPath$")))
GreedySequenceDecoderFrom (modelAsTrained) = [
# Turn a trained seq2seq model into a greedy decoder: the Hardmax of the output
# distribution is fed back as the history for the next time step (instead of the
# ground-truth labels used during training). See the comment block above for the
# node names the input model is expected to expose.
scoreSequence = modelAsTrained.z
decodeOut = Pass ( Hardmax (scoreSequence), tag='output')
inputsOut = Pass (modelAsTrained.inputSequence, tag='output')
labelsOut = Pass (modelAsTrained.labelSequence, tag='output')
# rewire the history input to come from the decoder's own output, and add the three output roots
model = BS.Network.Edit (modelAsTrained,
#BS.Network.Editing.ReplaceLinksToNode (modelAsTrained.decoderInput/*delayedDecoderFeedback*/, delayedDecoderFeedback),
BS.Network.Editing.ReplaceLinksToNode (modelAsTrained.decoderHistoryHook, modelAsTrained.decoderHistoryFromOutput),
decodeOut : inputsOut : labelsOut)
].model
# turning a regular LSTM to a top-N beam-search decoder:
# - add a depth axis of dimension N to all nodes inside the decoder loop
# - only needs the init signal for PastValue to be that
# - h and c must be shuffled versions of their PastValue
# - since what are the top N in one time step is not the top N in the next
# - reshuffling and adding depth to the init signal can be done at the same place
# - decoder output must determine the top N and a reshuffling matrix for h and c
# - the current Hardmax needs to be replaced by something that outputs these (output depth N)
# - we get a N^2 depth: [V x (input set) x (top N output hypos)]
# - reshuffling matrix is reduction over V (multiply with row of V ones) plus possibly a transposition
# - we need an accumulated path score
# - start value constructed by stacking a 0 and N-1 -INF
# - for testing, we can output the current best in each step
# - that's a Slice()
# - traceback is a right-to-left recurrence
# - output best hypo conditioned on the path (it is already known)
# beam search of width 'beamDepth'
BeamSearchSequenceDecoderFrom (modelAsTrained, beamDepth) = [
# Turn a trained seq2seq model into a top-N beam-search decoder of width 'beamDepth'.
# Dimensions used in comments below: V = vocabulary size, D/Dprev/Dnew = beam depth
# (previous/new time step's hypothesis rank). The decoder expands all D hypotheses,
# keeps the top D expanded paths, reorders the LSTM state accordingly via
# beamSearchReorderHook, and finally recovers the best path with a right-to-left
# traceback recurrence. The annotated ASCII diagrams further down illustrate each tensor.
scoreSequence = modelAsTrained.z
vocabSize = scoreSequence.dim
# TODO: use ReduceSum
ReduceAxis (axisDim, x, axis=1) = # unfortunately, we must feed in the dimension of the axis, it can't be inferred
if axis == 1 then Times (Constants.OnesTensor (axisDim), x, outputRank=0)
else if axis == 2 then ReduceAxis (axisDim, TransposeDimensions (x, 1, 2), axis=1)
else Fail("ReduceAxis: Only supports axes 1 and 2.")
# === BEGIN DECODER ===
# constants for initial score and final traceback
initialPathScores = FirstAndOther (0, LOGZERO, beamDepth, axis = 2) # [1 x D]: [ 0, -INF, -INF, -INF, ... ]
finalHyp = FirstAndOther (1, 0, beamDepth, axis = 1) # [D] the final token is the top-scoring hypothesis, that is, hyp[0]
# path expansion of the D hypotheses that were best in previous time step (ordered as in previous time step)
logLLs = Columnwise (LogSoftmax, beamDepth, scoreSequence) # [V x Dprev] log P(w|hist)
expandedPathScores = logLLs + Boolean.If (Loop.IsFirst (logLLs), initialPathScores, Loop.Previous (tokens.score)) # [V x Dprev] log (P(w|hist) * P(hist)) for all top D hypotheses
# determine top D of expanded paths
topPaths = GetTopNTensor (beamDepth, expandedPathScores) # [V x Dprev] -> [V x Dprev x Dnew]
topPathScores = topPaths .* expandedPathScores # [V x Dprev x Dnew]
# form new decoding token, by reducing topPaths(Scores) along relevant dimensions
tokens = [ # [. x Dnew]
from = ReduceAxis (axis=1, vocabSize, topPaths) # [Dprev x Dnew], reduced over V
word = ReduceAxis (axis=2, beamDepth, topPaths) # [V x Dnew], reduced over Dprev
score = Constants.OnesTensor (1/*output dim*/ : /*reduction dims: */vocabSize : beamDepth/*Dprev*/) * topPathScores # [1 x Dnew], reduced over [V x Dprev] and inserted a '1'
]
# network feedback for next time step
# BUGBUG: Need to import EmbedLabels functionality from models
decoderFeedback = /*EmbedLabels*/ (tokens.word) # [embeddingDim x Dnew]
delayedDecoderFeedback = Boolean.If (Loop.IsFirst (labelSentenceStartEmbeddedScattered), labelSentenceStartEmbeddedScattered, Loop.Previous (decoderFeedback))
# final traceback
traceback = Boolean.If (Loop.IsLast (modelAsTrained.labelSentenceStartEmbeddedScattered/*tokens.from*/), finalHyp, Loop.Next (tokens.from * traceback)) # [D] one-hot, multiplying tokens.from from the left will select another one-hot row of tokens.from
decodeHyp = Times (topPaths, traceback, outputRank=2) # [V x Dprev] 2D one-hot, selected the best hyp according to traceback
decode = decodeHyp * Constants.OnesTensor (beamDepth) # [V] reduces over Dprev -> 1D one-hot
# TODO: Can this be done in one ^^ go?
# === END DECODER ===
# propagate LSTM state to the right top-N rank given where that rank came from in the previous time step
# PropagateTopN:
# tokens.from: [Dprev, Dnew]
# v--------- best came from input hyp[1]
# v------- second best came from input hyp[0]
# v----- third best came from input hyp[2]
# 0 1 0
# 1 0 0
# 0 0 1
# tokens.from[:,n] one-hot encodes the best predecessor at top-N rank n
# each column is a one-hot vector
# multiplying with such a column from the right will select the column represented by the one-hot value
# logLLs: get decoder log likelihoods
# initialPathScores: decoder start token: 0 for first hyp, -INF for the others
LOGZERO = -1e30
# expandedPathScores: path expansion, [V x 1] + [1 x D] -> [V x D]
# topPaths:
# +-----+
# |0 0 0|
# |0 0 0|-+
# |0 1 0|0| means word[2] in input hyp[1] was the best
# |0 0 0|0|-+
# +-----+0|0|
# |1 0 0|0| means word[3] in input hyp[0] was the second best
# +-----+1| means word[2] in input hyp[2] was the third best
# |0 0 0|
# +-----+
# tokens.word:
#tokens.word = ReduceSum (axis=2, topPaths) # TODO: add an axis parameter to SumColumnElements()
# +-+
# |0|
# |0|-+
# |1|0| means word[2] in input hyp[1] was the best
# |0|0|-+
# +-+0|0|
# |1|0| means word[3] in input hyp[0] was the second best
# +-+1| means word[2] in input hyp[2] was the third best
# |0|
# +-+
# tokens.from:
# before dropping the first dimension: [V x Dprev x Dnew]
# +-----+
# |0 1 0| means input hyp[1] gave rise to the best
# +-----+-+
# |1 0 0| means input hyp[0] gave rise to second best
# +-----+-+
# |0 0 1| means input hyp[2] gave rise to third best
# +-----+
# after: [Dprev x Dnew] e.g. "0 1 0" goes into first column, vertically
# v--------- best came from input hyp[1]
# v------- second best came from input hyp[0]
# v----- third best came from input hyp[2]
# 0 1 0
# 1 0 0
# 0 0 1
# tokens.from[:,n] one-hot encodes the best predecessor at top-N rank n
# topPathScores:
# +-----+
# |0 0 0|
# |0 0 0|-+
# |0 x 0|0| x denotes the accumulated path score max_w P(w|hyp[1])
# |0 0 0|0|-+
# +-----+0|0|
# |y 0 0|0| y denotes the accumulated path score max_w P(w|hyp[0])
# +-----+z| z denotes the accumulated path score max_w P(w|hyp[2])
# |0 0 0|
# +-----+
# traceback:
# last state: take Hardmax over tokens.score
# previous states: multiply with respective tokens.from matrix
# -> hyp index for every time step
# then finally use that to select the actual output TODO: That's a sample-wise matrix product between two sequences!!!
# TODO: condition must be 1-dim, not 2-dim tensor, so we use labelSentenceStartEmbeddedScattered instead of tokens.from
# +-+
# |0|
# |1| means at this time step, hyp[1] was the best globally
# |0|
# +-+
# decode: and the actual decoding output
# This is the one to output (top sentence-level hypothesis after traceback).
# traceback : [Dnew]
# topPaths : [V x Dprev x Dnew]
# +-----+
# |0 0 0|
# |0 0 0|-+
# |0 1 0|0| means word[2] in input hyp[1] was the best
# |0 0 0|0|-+
# +-----+0|0|
# |1 0 0|0| means word[3] in input hyp[0] was the second best
# +-----+1| means word[2] in input hyp[2] was the third best
# |0 0 0|
# +-----+
# helper macros --> move to BS.core.bs
# Columnwise: apply f to each of the beamDepth columns of z separately, then re-splice.
Columnwise (f, beamDepth, z) = # TODO: Takes LogSoftmax over axis=1. it is more tricky to do this over arbitrary axes
[
cols[d:0..beamDepth-1] = f (Slice (d, d+1, z, axis=2) /*[:,d]*/ )
out = Splice (cols, axis=2)
].out
# FirstAndOther: build a length-N constant vector [firstVal, otherVals, otherVals, ...] along the given axis.
FirstAndOther (firstVal, otherVals, N, axis = 1) = if N == 1 then ConstantTensor (firstVal, (1)) else [
axis1 = axis # TODO: Is this really necessary? Why? Then we need the syntax axis = ^.axis or ^axis
out = if axis == 1 # maybe this can be unified or pushed into Splice?
then RowStack (ConstantTensor (firstVal, (1)) : ConstantTensor (otherVals, (N -1))) # col vector: [ 1; 0; 0; 0 ... ]
else Splice (Constant (firstVal) : ConstantTensor (otherVals, (1 : N -1)), axis = axis1 /*, axis*/) # row vector: [ 0, -INF, -INF, -INF, ... ]
].out
# rewire the trained network: reorder LSTM state by beam rank, feed the beam's words back as history,
# and add the three output roots
model = BS.Network.Edit (modelAsTrained,
(
BS.Network.Editing.ReplaceLinksToNode (modelAsTrained.beamSearchReorderHook, tokens.from) : # reorder LSTM states
BS.Network.Editing.ReplaceLinksToNode (modelAsTrained.decoderHistoryHook, decoderFeedback) # feed decoder output back in
),
(inputsOut : labelsOut : decodeOut)) # additional roots
inputsOut = Pass (modelAsTrained.inputSequence, tag='output')
labelsOut = Pass (modelAsTrained.labelSequence, tag='output')
decodeOut = Pass (decode, tag='output')
].model
]
##############################################################################
# Network-level operations
# These operations will have undefined behavior for input values != 0 or 1.
##############################################################################

Просмотреть файл

@ -707,14 +707,14 @@ int wmain1(int argc, wchar_t* argv[]) // called from wmain which is a wrapper th
catch (const ScriptableObjects::ScriptingException& err)
{
fprintf(stderr, "\n");
LOGPRINTF(stderr, "EXCEPTION occurred: %s\n", err.what());
err.PrintError();
err.PrintError(ProgressTracing::GetTimeStampPrefix() + L"EXCEPTION occurred");
return EXIT_FAILURE;
}
catch (const IExceptionWithCallStackBase& err)
{
fprintf(stderr, "\n");
LOGPRINTF(stderr, "EXCEPTION occurred: %s\n%s", dynamic_cast<const std::exception&>(err).what(), err.CallStack());
fprintf(stderr, "%s", err.CallStack());
LOGPRINTF(stderr, "EXCEPTION occurred: %s\n", dynamic_cast<const std::exception&>(err).what());
return EXIT_FAILURE;
}
catch (const std::exception& err)

Просмотреть файл

@ -54,7 +54,7 @@
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<AdditionalIncludeDirectories>$(SolutionDir)Source\ActionsLib;$(SolutionDir)Source\SequenceTrainingLib;$(SolutionDir)Source\SGDLib;$(SolutionDir)Source\ComputationNetworkLib;$(SolutionDir)Source\Math;$(SolutionDir)Source\Common\Include;$(SolutionDir)Source\CNTK\BrainScript;$(MSMPI_INC);$(NvmlInclude)</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>$(SolutionDir)Source\Readers\ReaderLib;$(SolutionDir)Source\ActionsLib;$(SolutionDir)Source\SequenceTrainingLib;$(SolutionDir)Source\SGDLib;$(SolutionDir)Source\ComputationNetworkLib;$(SolutionDir)Source\Math;$(SolutionDir)Source\Common\Include;$(SolutionDir)Source\CNTK\BrainScript;$(MSMPI_INC);$(NvmlInclude)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<AdditionalLibraryDirectories>$(MSMPI_LIB64);$(OutDir);$(NvmlLibPath)</AdditionalLibraryDirectories>
@ -146,6 +146,7 @@
<ClInclude Include="..\Common\Include\Basics.h" />
<ClInclude Include="..\Common\Include\BestGpu.h" />
<ClInclude Include="..\Common\Include\DataReader.h" />
<ClInclude Include="..\Common\Include\CompositeDataReader.h" />
<ClInclude Include="..\Common\Include\ExceptionWithCallStack.h" />
<ClInclude Include="..\Common\Include\StringUtil.h" />
<ClInclude Include="..\Common\Include\TensorShape.h" />
@ -164,6 +165,20 @@
<ClInclude Include="..\Math\Matrix.h" />
<ClInclude Include="..\ComputationNetworkLib\PreComputeNodes.h" />
<ClInclude Include="..\ComputationNetworkLib\MatrixPool.h" />
<ClInclude Include="..\Readers\ReaderLib\BlockRandomizer.h" />
<ClInclude Include="..\Readers\ReaderLib\Bundler.h" />
<ClInclude Include="..\Readers\ReaderLib\ChunkRandomizer.h" />
<ClInclude Include="..\Readers\ReaderLib\DataDeserializer.h" />
<ClInclude Include="..\Readers\ReaderLib\MemoryProvider.h" />
<ClInclude Include="..\Readers\ReaderLib\NoRandomizer.h" />
<ClInclude Include="..\Readers\ReaderLib\Packer.h" />
<ClInclude Include="..\Readers\ReaderLib\Reader.h" />
<ClInclude Include="..\Readers\ReaderLib\SampleModePacker.h" />
<ClInclude Include="..\Readers\ReaderLib\SequencePacker.h" />
<ClInclude Include="..\Readers\ReaderLib\SequenceRandomizer.h" />
<ClInclude Include="..\Readers\ReaderLib\StringToIdMap.h" />
<ClInclude Include="..\Readers\ReaderLib\Transformer.h" />
<ClInclude Include="..\Readers\ReaderLib\TransformerBase.h" />
<ClInclude Include="..\SGDLib\DataReaderHelpers.h" />
<ClInclude Include="..\SGDLib\SGD.h" />
<ClInclude Include="..\SGDLib\SimpleEvaluator.h" />

Просмотреть файл

@ -47,6 +47,9 @@
<ClCompile Include="..\Common\ExceptionWithCallStack.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\Common\CompositeDataReader.cpp">
<Filter>Common</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\Common\Include\fileutil.h">
@ -166,6 +169,51 @@
<ClInclude Include="..\Common\Include\ExceptionWithCallStack.h">
<Filter>Common\Include</Filter>
</ClInclude>
<ClInclude Include="..\Readers\ReaderLib\BlockRandomizer.h">
<Filter>from ReaderLib</Filter>
</ClInclude>
<ClInclude Include="..\Readers\ReaderLib\Bundler.h">
<Filter>from ReaderLib</Filter>
</ClInclude>
<ClInclude Include="..\Readers\ReaderLib\ChunkRandomizer.h">
<Filter>from ReaderLib</Filter>
</ClInclude>
<ClInclude Include="..\Readers\ReaderLib\DataDeserializer.h">
<Filter>from ReaderLib</Filter>
</ClInclude>
<ClInclude Include="..\Readers\ReaderLib\MemoryProvider.h">
<Filter>from ReaderLib</Filter>
</ClInclude>
<ClInclude Include="..\Readers\ReaderLib\NoRandomizer.h">
<Filter>from ReaderLib</Filter>
</ClInclude>
<ClInclude Include="..\Readers\ReaderLib\Packer.h">
<Filter>from ReaderLib</Filter>
</ClInclude>
<ClInclude Include="..\Readers\ReaderLib\Reader.h">
<Filter>from ReaderLib</Filter>
</ClInclude>
<ClInclude Include="..\Readers\ReaderLib\SampleModePacker.h">
<Filter>from ReaderLib</Filter>
</ClInclude>
<ClInclude Include="..\Readers\ReaderLib\SequencePacker.h">
<Filter>from ReaderLib</Filter>
</ClInclude>
<ClInclude Include="..\Readers\ReaderLib\SequenceRandomizer.h">
<Filter>from ReaderLib</Filter>
</ClInclude>
<ClInclude Include="..\Readers\ReaderLib\StringToIdMap.h">
<Filter>from ReaderLib</Filter>
</ClInclude>
<ClInclude Include="..\Readers\ReaderLib\Transformer.h">
<Filter>from ReaderLib</Filter>
</ClInclude>
<ClInclude Include="..\Readers\ReaderLib\TransformerBase.h">
<Filter>from ReaderLib</Filter>
</ClInclude>
<ClInclude Include="..\Common\Include\CompositeDataReader.h">
<Filter>Common\Include</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<Text Include="modelEditor.txt">
@ -224,6 +272,9 @@
<Filter Include="BrainScript\CNTKCoreLib">
<UniqueIdentifier>{899f31fa-5906-4485-8875-14ad2c43ed8f}</UniqueIdentifier>
</Filter>
<Filter Include="from ReaderLib">
<UniqueIdentifier>{28bc457a-d2f4-4f42-a9aa-89f22e909ab0}</UniqueIdentifier>
</Filter>
</ItemGroup>
<ItemGroup>
<None Include="prebuild.bat">

Просмотреть файл

@ -89,6 +89,8 @@ DataReader::DataReader(const ConfigRecordType& config)
string precision = config(L"precision", "float");
bool hasMultipleReaders = config.Exists(L"readers");
// In case when deserializers are specified, use the new logic to compose them.
bool hasDeserializers = config.Exists(L"deserializers");
if (hasMultipleReaders)
{
vector<wstring> ioNames = config(L"readers", ConfigRecordType::Array(stringargvector()));
@ -103,6 +105,16 @@ DataReader::DataReader(const ConfigRecordType& config)
getReaderProc(&m_dataReaders[ioName]); // instantiates the reader with the default constructor (no config processed at this point)
}
}
else if (hasDeserializers)
{
// Creating Composite Data Reader that allow to combine deserializers.
// This should be changed to link statically when SGD uses the new interfaces.
wstring ioName = L"ioName";
GetReaderProc getReaderProc = (GetReaderProc)Plugin::Load(config(L"readerType", L"CompositeDataReader"), GetReaderName(precision));
m_ioNames.push_back(ioName);
assert(getReaderProc != nullptr);
getReaderProc(&m_dataReaders[ioName]);
}
else // legacy
{
wstring ioName = L"ioName";
@ -202,7 +214,7 @@ bool DataReader::GetMinibatch(StreamMinibatchInputs& matrices)
if (nbr > 0)
m_dataReaders[m_ioNames[i]]->SetNumParallelSequences(nbr); // the first one determines the param of all others --TODO: This is flimsy.
bRet &= m_dataReaders[m_ioNames[i]]->GetMinibatch(matrices);
size_t thisNbr = m_dataReaders[m_ioNames[i]]->GetNumParallelSequences();
size_t thisNbr = m_dataReaders[m_ioNames[i]]->GetNumParallelSequencesForFixingBPTTMode();
if (nbr == 0)
nbr = thisNbr;
else if (thisNbr != nbr)
@ -235,15 +247,15 @@ bool DataReader::GetHmmData(msra::asr::simplesenonehmm* hmm)
return bRet;
}
size_t DataReader::GetNumParallelSequences()
size_t DataReader::GetNumParallelSequencesForFixingBPTTMode()
{
size_t nNbr = 0;
for (size_t i = 0; i < m_ioNames.size(); i++)
{
IDataReader* ptr = m_dataReaders[m_ioNames[i]];
if (nNbr == 0)
nNbr = ptr->GetNumParallelSequences();
else if (nNbr != ptr->GetNumParallelSequences())
nNbr = ptr->GetNumParallelSequencesForFixingBPTTMode();
else if (nNbr != ptr->GetNumParallelSequencesForFixingBPTTMode())
LogicError("GetNumParallelSequences: number of slices in each minibatch not consistent for these streams");
}
return nNbr;

Просмотреть файл

@ -168,7 +168,10 @@ public:
{
NOT_IMPLEMENTED;
};
virtual size_t GetNumParallelSequences() = 0;
// TODO: Should be removed when BPTT follows proper minibatch size.
virtual size_t GetNumParallelSequencesForFixingBPTTMode() = 0;
//virtual int GetSentenceEndIdFromOutputLabel() { return -1; }
virtual void SetNumParallelSequences(const size_t sz)
{
@ -337,7 +340,7 @@ public:
virtual bool GetMinibatch4SE(std::vector<shared_ptr<const msra::dbn::latticepair>>& latticeinput, vector<size_t>& uids, vector<size_t>& boundaries, vector<size_t>& extrauttmap);
virtual bool GetHmmData(msra::asr::simplesenonehmm* hmm);
size_t GetNumParallelSequences();
size_t GetNumParallelSequencesForFixingBPTTMode();
//int GetSentenceEndIdFromOutputLabel();
//bool RequireSentenceSeg() const override;

Просмотреть файл

@ -28,6 +28,32 @@
namespace Microsoft { namespace MSR { namespace CNTK {
// Base interface shared by the two model-evaluation interfaces declared below
// (the legacy IEvaluateModel and the extended IEvaluateModelExtended):
// covers initialization, network creation, and resource teardown.
template <typename ElemType>
class IEvaluateModelBase
{
public:
//
// Load a model based on configuration. The syntax is the same as when calling the cntk executable.
// e.g. "modelFile=model.dat deviceId=0".
// numCPUThreads can be used to set the thread count of BLAS.
//
virtual void Init(const std::string& config) = 0;
//
// Create a network based on an (NDL) network description.
//
virtual void CreateNetwork(const std::string& networkDescription) = 0;
//
// Free resources
//
virtual void Destroy() = 0;
};
// ------------------------------------------------------------------------
// Basic (legacy) interface
// ------------------------------------------------------------------------
enum NodeGroup
{
nodeInput, // an input node
@ -39,33 +65,54 @@ enum NodeGroup
// NOTICE: This interface is a public interface for evaluating models in CNTK.
// Changes to this interface may affect other projects, such as Argon and LatGen,
// and therefore need to be communicated with such groups.
template <class ElemType>
class IEvaluateModel // Evaluate Model Interface
template <typename ElemType>
class IEvaluateModel : public IEvaluateModelBase<ElemType> // Evaluate Model Interface
{
public:
virtual void Init(const std::string& config) = 0;
virtual void Destroy() = 0;
virtual void CreateNetwork(const std::string& networkDescription) = 0;
//
// Retrieves the (flattened) dimensions
//
virtual void GetNodeDimensions(std::map<std::wstring, size_t>& dimensions, NodeGroup nodeGroup) = 0;
//
// Allocate resources for a particular output.
//
virtual void StartEvaluateMinibatchLoop(const std::wstring& outputNodeName) = 0;
//
// Evaluate a model in frame mode. This does not support dynamic axes or sparse input data.
// Given a feature vector of dimension d, the inputs may contain n * d elements. The output will then be computed
// for n samples.
// inputs - map from node name to array of input tensors, flattened to vector
// outputs - map from node name to output vector, outputs vectors need to be preallocated by caller, sizing will
// happen during evaluation
//
virtual void Evaluate(std::map<std::wstring, std::vector<ElemType>*>& inputs, std::map<std::wstring, std::vector<ElemType>*>& outputs) = 0;
//
// Evaluate - Evaluate using the network without input and provide the outputs
// outputs - map from node name to output vector, outputs vectors need to be preallocated by caller, sizing will
// happen during evaluation
//
virtual void Evaluate(std::map<std::wstring, std::vector<ElemType>*>& outputs) = 0;
virtual void ResetState() = 0;
};
// GetEval - get a evaluator type from the DLL
// since we have 2 evaluator types based on template parameters, exposes 2 exports
// could be done directly with the templated name, but that requires mangled C++ names
template <class ElemType>
template <typename ElemType>
void EVAL_API GetEval(IEvaluateModel<ElemType>** peval);
extern "C" EVAL_API void GetEvalF(IEvaluateModel<float>** peval);
extern "C" EVAL_API void GetEvalD(IEvaluateModel<double>** peval);
// Data Reader class
// interface for clients of the Data Reader
// mirrors the IEvaluateModel interface, except the Init method is private (use the constructor)
template <class ElemType>
template <typename ElemType>
class Eval : public IEvaluateModel<ElemType>, protected Plugin
{
private:
@ -84,6 +131,7 @@ public:
// modelPath=c:\models\model.dnn (model path, if not specified, must call LoadModel() method before Evaluate()
// minibatchSize=1024 (minibatch size used during evaluation if < passed data size)
Eval(const std::string& config);
virtual ~Eval();
// CreateNetwork - create a network based on the network description
@ -101,14 +149,146 @@ public:
// Evaluate - Evaluate using the model with the given inputs and outputs
// inputs - map from node name to input vector
// outputs - map from node name to output vector, outputs vectors need to be preallocated by caller, sizing will happen during evaluation
// outputs - map from node name to output vector, outputs vectors need to be preallocated by caller, sizing will
// happen during evaluation
virtual void Evaluate(std::map<std::wstring, std::vector<ElemType>*>& inputs, std::map<std::wstring, std::vector<ElemType>*>& outputs);
// Evaluate - Evaluate using the network without input, and provide the outputs
// outputs - map from node name to output vector, outputs vectors need to be preallocated by caller, sizing will happen during evaluation
// outputs - map from node name to output vector, outputs vectors need to be preallocated by caller, sizing will
// happen during evaluation
virtual void Evaluate(std::map<std::wstring, std::vector<ElemType>*>& outputs);
virtual void Init(const std::string& config);
virtual void ResetState();
};
// ------------------------------------------------------------------------
// Extended interface
// ------------------------------------------------------------------------
//
// A buffer to keep data for all samples in a (variable length) sequence
// from a single input or output.
// This is used for both dense and sparse data.
//
template<typename ElemType>
struct VariableBuffer
{
// Number of samples (entries along the dynamic axis) currently held in this buffer.
size_t m_numberOfSamples = 0;
//
// All elements of a sequence, concatenated.
//
std::vector<ElemType> m_buffer;
// In case of sparse data, the following is also used. Otherwise, the
// contents are ignored.
// E.g. a sequence of three sparse vectors with 2 / 4 / 2 non-zero values
// could be represented as the following:
// colIdx: 0 2 6 8
// v v v v
// indices 1 3 2 3 5 6 2 7
// buffer 0 1 2 3 4 5 6 7
//
// For every element in buffer, an entry in this array gives its position.
// For every vector the entries must be ascending.
//
std::vector<int> m_indices;
//
// Contains numberOfsamples + 1 indices into the buffer. The first entry
// is always 0. The last entry points after the last element.
// See http://docs.nvidia.com/cuda/cusparse/#compressed-sparse-column-format-csc
//
std::vector<int> m_colIndices;
};
//
// Meta data
//
// Meta data describing a single input or output variable: name, element type,
// dense/sparse storage, flattened element count, and its dynamic axis.
struct VariableLayout
{
// Element type of the variable's data.
enum DataType
{
Float32,
Float64
};
// How the variable's data is laid out in a VariableBuffer.
enum StorageType
{
Undetermined,
Dense,
Sparse,
};
// Name of the input
std::wstring m_name;
DataType m_dataType;
StorageType m_storageType;
// Dimension of the tensor, flattened to 1 dimension, for one entry on the dynamic axis.
// E.g. for a tensor [2,3,*] this would be 6.
int m_numElements;
// Name of the axis, potentially shared between inputs. For any two inputs sharing the same
// dynamic axis, the sequence cardinality must be the same.
std::wstring m_dynamicAxisName;
};
template <typename ElemType>
using Variables = std::vector<VariableBuffer<ElemType>>;
using VariableSchema = std::vector<VariableLayout>;
//
// Extended interface, allowing for sparse input.
//
template <typename ElemType>
class IEvaluateModelExtended : public IEvaluateModelBase<ElemType>
{
public:
//
// GetOutputSchema - retrieve information about tensor shapes and memory layout of the outputs for this
// model.
//
virtual VariableSchema GetOutputSchema() const = 0;
//
// Allocate internal state for calling ForwardPass(). The call restricts the network (inputs and outputs)
// to the functions represented by the output name.
//
virtual void StartForwardEvaluation(std::vector<std::wstring> outputs) = 0;
//
// GetVariableLayout - retrieve information about tensor shapes and memory layout of inputs necessary for a
// particular output. By default this returns all available inputs. After StartForwardEvaluation(), this
// returns all the inputs necessary to compute the outputs.
//
virtual VariableSchema GetInputSchema() const = 0;
//
// Evaluate - Evaluate (perform a forward pass for) a single unit using the model with the given inputs and
// outputs.
// The layout and shape of the data in inputs vector must match the schema returned by GetInputLayouts.
// This method is not reentrant, as the forward pass keeps internal state.
// outputId - output to compute values for. See GetOutputLayouts()
// inputs - vector of input buffers, one for every input as given by GetInputLayouts()
// outputs - map from node name to output vector, outputs vectors need to be preallocated by caller, sizing
// will happen during evaluation.
// Called after StartForwardEvaluation()
// NOTE(review): the parameter is named 'output' (singular) but holds one buffer per
// output variable, per the comment above — consider renaming for clarity.
//
virtual void ForwardPass(const Variables<ElemType>& inputs, Variables<ElemType>& output) = 0;
};
template <typename ElemType>
void EVAL_API GetEvalExtended(IEvaluateModelExtended<ElemType>** peval);
extern "C" EVAL_API void GetEvalExtendedF(IEvaluateModelExtended<float>** peval);
extern "C" EVAL_API void GetEvalExtendedD(IEvaluateModelExtended<double>** peval);
} } }

Просмотреть файл

@ -4,25 +4,25 @@
//
#pragma once
#include "Basics.h"
#include <chrono>
#include "TimerUtility.h"
#include <string>
namespace Microsoft { namespace MSR { namespace CNTK {
// TODO: make this proper C++ functions with variadic templates and a name that reflects their difference to fprintf(stderr) which already implies printing to log
// If the Tracing flag is set, print out a timestamp with no new line at the end
#define PREPENDTS(stream) \
do \
{ \
if (ProgressTracing::GetTimestampingFlag()) \
{ \
std::time_t tt = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); \
char mbstr[30]; \
if (std::strftime(mbstr, sizeof(mbstr), "%m/%d/%Y %H:%M:%S", std::localtime(&tt))) \
fprintf(stream, "%s: ", mbstr); \
char mbstr[30]; \
fprintf(stream, "%s: ", ProgressTracing::Timestamp(mbstr)); \
} \
} while(0)
// TODO: make this proper C++ functions with variadic templates and a name that reflects their difference to fprintf(stderr) which already implies printing to log
// Print out a log message. If the Tracing flag is set, prepend with a timestamp
#define LOGPRINTF(stream, ...) \
do \
@ -80,6 +80,22 @@ public:
// TODO: timestampFlag or timestampingFlag? (Or timeStampFlag?)
}
template<unsigned int N>
static const char* Timestamp(char(&buf)[N])
{
std::time_t tt = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
if (!std::strftime(buf, _countof(buf), "%m/%d/%Y %H:%M:%S", std::localtime(&tt)))
LogicError("Timestamp: Buffer too small.");
return buf;
}
// helper to return a time-stamp prefix if time-stamping enabled, complete with ': ' at its end
static std::wstring GetTimeStampPrefix()
{
char mbstr[30];
return GetTimestampingFlag() ? msra::strfun::wstrprintf(L"%s: ", Timestamp(mbstr)) : L"";
}
static void SetTracingFlag()
{
auto& us = GetStaticInstance();
@ -167,4 +183,5 @@ public:
return newNumItersSinceLastPrintOfProgress;
}
};
} } }
}}}

Просмотреть файл

@ -25,11 +25,11 @@ class ScriptingException : public runtime_error
{
public:
template <typename M>
ScriptingException(const M &msg)
: runtime_error(msg)
ScriptingException(const M &msg) :
runtime_error(msg)
{
}
virtual void PrintError() const = 0;
virtual void PrintError(const std::wstring& linePrefix) const = 0;
};
// -----------------------------------------------------------------------

Просмотреть файл

@ -17,7 +17,6 @@
#include "latticestorage.h"
#include "simple_checked_arrays.h"
#include "fileutil.h"
#include <stdint.h>
#include <vector>
#include <string>
#include <unordered_map>

Просмотреть файл

@ -491,27 +491,30 @@ void ComputationNetwork::CollectInputAndLearnableParametersRec(const Computation
}
template <class ElemType>
/*static*/ void ComputationNetwork::SetDropoutRate(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double& prevDropoutRate, unsigned long& dropOutSeed)
/*static*/ void ComputationNetwork::SetDropoutRate(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double& prevDropoutRate, size_t randSeedBase)
{
list<ComputationNodeBasePtr> dropoutNodes = net->GetNodesWithType(OperationNameOf(DropoutNode), criterionNode);
if (dropoutRate != prevDropoutRate)
{
fprintf(stderr, "Setting dropout rate to %.8g.\n", dropoutRate);
// TODO: Change this to use an interface that is independent of <ElemType>.
list<ComputationNodeBasePtr> dropoutNodes = net->GetNodesWithType(OperationNameOf(DropoutNode), criterionNode);
if (dropoutNodes.size() == 0 && dropoutRate > 0)
fprintf(stderr, "WARNING: there is no dropout node.\n");
else
{
for (auto& nodeIter: dropoutNodes)
{
auto node = dynamic_pointer_cast<DropoutNode<ElemType>>(nodeIter);
node->SetDropoutRate(dropoutRate);
node->SetRandomSeed(dropOutSeed++);
}
}
prevDropoutRate = dropoutRate;
fprintf(stderr, "WARNING: Attempting to set dropout rate, but there is no dropout node in the network.\n");
}
// Each dropout node gets a distinct seed. The actual seed for each dropout node is computed as follows:
// seed = (((parallelWorkerIdx * maxEpochs) + currentEpochNum) /*i.e. randSeedBase*/ * dropoutNodes.size()) + dropoutNodeIdx
size_t randSeed = randSeedBase * dropoutNodes.size();
for (auto& nodeIter : dropoutNodes)
{
auto node = dynamic_pointer_cast<DropoutNode<ElemType>>(nodeIter);
if (dropoutRate != prevDropoutRate)
node->SetDropoutRate(dropoutRate);
node->SetRandomSeed(randSeed);
randSeed++;
}
prevDropoutRate = dropoutRate;
}
template <class ElemType>
@ -1441,7 +1444,7 @@ template void ComputationNetwork::InitLearnableParameters<float>(const Computati
template void ComputationNetwork::Read<float>(const wstring& fileName);
template void ComputationNetwork::ReadPersistableParameters<float>(File& fstream, bool create);
template void ComputationNetwork::PerformSVDecomposition<float>(const map<wstring, float>& SVDConfig, size_t alignedsize);
template /*static*/ void ComputationNetwork::SetDropoutRate<float>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double& prevDropoutRate, unsigned long& dropOutSeed);
template /*static*/ void ComputationNetwork::SetDropoutRate<float>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double& prevDropoutRate, size_t randSeedBase);
template /*static*/ void ComputationNetwork::SetBatchNormalizationTimeConstants<float>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double normalizationTimeConstant, double& prevNormalizationTimeConstant, double blendTimeConstant, double& prevBlendTimeConstant);
template void ComputationNetwork::SetSeqParam<float>(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, const double& hsmoothingWeight, const double& frameDropThresh, const bool& doreferencealign,
const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR);
@ -1451,7 +1454,7 @@ template void ComputationNetwork::InitLearnableParameters<double>(const Computat
template void ComputationNetwork::Read<double>(const wstring& fileName);
template void ComputationNetwork::ReadPersistableParameters<double>(File& fstream, bool create);
template void ComputationNetwork::PerformSVDecomposition<double>(const map<wstring, float>& SVDConfig, size_t alignedsize);
template /*static*/ void ComputationNetwork::SetDropoutRate<double>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double& prevDropoutRate, unsigned long& dropOutSeed);
template /*static*/ void ComputationNetwork::SetDropoutRate<double>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double& prevDropoutRate, size_t randSeedBase);
template /*static*/ void ComputationNetwork::SetBatchNormalizationTimeConstants<double>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double normalizationTimeConstant, double& prevNormalizationTimeConstant, double blendTimeConstant, double& prevBlendTimeConstant);
template void ComputationNetwork::SetSeqParam<double>(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, const double& hsmoothingWeight, const double& frameDropThresh, const bool& doreferencealign,
const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR);

Просмотреть файл

@ -428,7 +428,7 @@ public:
// TODO: Why are all these static, but then take a network as the first argument? --> make them class members
template <class ElemType>
static void SetDropoutRate(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double& prevDropoutRate, unsigned long& dropOutSeed);
static void SetDropoutRate(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double dropoutRate, double& prevDropoutRate, size_t randSeedBase);
template <class ElemType>
static void SetBatchNormalizationTimeConstants(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode,
@ -478,6 +478,47 @@ public:
return std::vector<ComputationNodeBasePtr>{node};
}
std::vector<ComputationNodeBasePtr> OutputNodesByName(const std::vector<std::wstring>& outputNodeNames)
{
std::vector<ComputationNodeBasePtr> outputNodes;
if (outputNodeNames.size() == 0)
{
if (OutputNodes().size() == 0)
RuntimeError("There is no default output node specified in the network.");
outputNodes = OutputNodes();
}
else
{
for (int i = 0; i < outputNodeNames.size(); i++)
outputNodes.push_back(GetNodeFromName(outputNodeNames[i]));
}
return outputNodes;
}
// Collect all input nodes that outputNodes depend on.
std::vector<ComputationNodeBasePtr> InputNodesForOutputs(const std::vector<std::wstring>& outputNodeNames)
{
// use map to remove duplicated items
auto outputNodes = OutputNodesByName(outputNodeNames);
std::set<ComputationNodeBasePtr> inputNodesMap;
for (auto& onode : outputNodes)
{
for (auto& inode : InputNodes(onode))
inputNodesMap.insert(inode);
}
std::vector<ComputationNodeBasePtr> inputNodes;
for (auto& inode : inputNodesMap)
inputNodes.push_back(inode);
return inputNodes;
}
// these are specified as such by the user
const std::vector<ComputationNodeBasePtr>& FeatureNodes() const { return m_featureNodes ; }
const std::vector<ComputationNodeBasePtr>& LabelNodes() const { return m_labelNodes ; }

Просмотреть файл

@ -105,7 +105,7 @@ ComputationNodeBasePtr ComputationNetwork::GetNestedNetwork(const ComputationNod
ComputationNetwork::PARTraversalFlowControlNode::PARTraversalFlowControlNode(const std::vector<shared_ptr<SEQTraversalFlowControlNode>>& recurrentInfo, const std::list<ComputationNodeBasePtr>& allNodes /*must be in eval order*/)
{
// traverse the network in evaluation order and create a new list that replaces all recurrence by a SEQTraversalFlowControlNode
std::set<shared_ptr<IComputationNode>> loopsSeen; // for consistency check only
set<shared_ptr<IComputationNode>> loopsSeen; // for consistency check only
for (auto nodeIter = allNodes.begin(); nodeIter != allNodes.end();)
{
shared_ptr<SEQTraversalFlowControlNode> recInfo = FindInRecurrentLoops(recurrentInfo, *nodeIter); // check if this node participates in a recurrent loop
@ -853,18 +853,22 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
VerifyIsCompiled("AllocateAllMatrices");
// Due to special topology, if a node is solely induced by parameters, its function value should not be shared
MarkValueNonSharableNodes();
bool performingBackPropagation = (trainRootNode != nullptr);
// Create a composite Eval order with the specified nodes as roots
std::vector<ComputationNodeBasePtr> forwardPropRoots;
forwardPropRoots.insert(forwardPropRoots.end(), evalRootNodes.begin(), evalRootNodes.end());
forwardPropRoots.insert(forwardPropRoots.end(), outValueRootNodes.begin(), outValueRootNodes.end());
if (trainRootNode != nullptr)
forwardPropRoots.push_back(trainRootNode);
// Mark all the eval, output and criterion roots as non-shareable
for (auto& rootNode : forwardPropRoots)
rootNode->MarkValueNonSharable();
// Due to special topology, if a node is solely induced by parameters, its function value should not be shared
MarkValueNonSharableNodes();
bool performingBackPropagation = (trainRootNode != nullptr);
// Create a composite Eval order with the specified nodes as roots
// For each node determine parents and whether the output of the
// node is needed during back propagation
std::unordered_map<ComputationNodeBasePtr, bool> outputValueNeededDuringBackProp;

Просмотреть файл

@ -34,8 +34,11 @@ void ComputationNode<ElemType>::Backprop(const FrameRange& fr, bool childrenInTh
#if 1 // keep enabled once this works
#if 1 // log the cases where this is needed
if (m_needsGradient && !m_gradientInitialized)
//LogicError("%ls %ls operation: Backprop called with uninitialized gradient.", NodeName().c_str(), OperationName().c_str());
fprintf(stderr, "%ls %ls operation: Initializing gradient out of line.\n", NodeName().c_str(), OperationName().c_str());
{
static size_t c = 0;
if (c++ < 100)
fprintf(stderr, "%ls %ls operation: Initializing gradient out of line.\n", NodeName().c_str(), OperationName().c_str());
}
#endif
if (m_needsGradient)
LazyZeroGradient(); // set gradient to 0 if this is the first time
@ -70,6 +73,8 @@ void ComputationNode<ElemType>::Backprop(const FrameRange& fr, bool childrenInTh
// fprintf(stderr, "BackpropTo %d %d %ls %ls\n", (int)fr.timeIdxInSeq, (int)i, NodeName().c_str(), OperationName().c_str());
BackpropTo(i, fr); // this computes partial wrt to the child and sums the gradient value in the child
//child->DebugLogMinibatch(/*gradient*/true);
}
#ifdef DISPLAY_DEBUG
else

Просмотреть файл

@ -230,6 +230,10 @@ public:
{
m_evalTimeStamp = s_timeStampCounter;
}
void SetEvalTimeStampOutdatedWrtAll()
{
m_evalTimeStamp = 0;
}
int64_t GetEvalTimeStamp() const
{
return m_evalTimeStamp;
@ -938,7 +942,7 @@ public:
if (m_value)
{
node->CreateValueMatrixIfNull();
node->m_value->SetValue(*m_value);
node->m_value->SetValue(*m_value);
}
else
node->m_value = nullptr;
@ -1320,7 +1324,7 @@ public:
void UpdateFunctionValuesSize()
{
UpdateDataSize(Value());
Value().CollapseDataLocation(); // actually before writing, should change the name
Value().CollapseDataLocation();
}
// -----------------------------------------------------------------------
@ -1549,6 +1553,7 @@ public:
void Trace()
{
//DebugLogMinibatch();
#if 0
static const std::set<std::wstring> toLog{
L"labelSentenceStartEmbedded",

Просмотреть файл

@ -219,6 +219,7 @@ template class ElementTimesNode<double>;
// If A is minibatch data, then this operation is currently not efficient.
// TODO: Implement this with TensorView::DoElementwiseProductOf() and stride magic
// TODO: Transpose flags for all matrices, inputs and outputs?
// TODO: allow outputRank < 0 meaning to denote "all but", from right
// -----------------------------------------------------------------------
template <class ElemType, bool m_transpose>
@ -232,6 +233,16 @@ public:
{
}
virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
{
Base::CopyTo(nodeP, newName, flags);
if (flags & CopyNodeFlags::copyNodeValue)
{
auto node = dynamic_pointer_cast<TimesNodeBase<ElemType, m_transpose>>(nodeP);
node->m_outputRank = m_outputRank;
}
}
void Save(File& fstream) const
{
Base::Save(fstream);
@ -255,6 +266,8 @@ private:
auto input = inputIndex < 0 ? this : Input(inputIndex).get();
auto data = gradient ? input->GradientPtr() : input->ValuePtr();
size_t rank = input->GetSampleLayout().GetRank();
if (inputIndex == 0 && m_transpose && rank == 1) // transposing a 1D tensor implies it is really a 2D tensor. Note that m_transpose applies to left operand only.
rank = 2;
if (!Input(0)->HasMBLayout()) // left input is no MB data: run normally
return input->DataTensorFor(data, rank, fr);
auto tensorShape = input->GetOneSampleTensorSliceFor(rank, fr);
@ -309,17 +322,18 @@ public:
{
// currently we only support one combination when the input is sparse
// If input data is sparse, then gradient is block sparse.
// BUGBUG: This does not accumulate into the Input(0)->Gradient, which might cause problems elsewhere.
if (Input(1)->Value().GetMatrixType() == SPARSE && Input(0)->Gradient().GetMatrixType() == DENSE && Gradient().GetMatrixType() == DENSE)
Input(0)->Gradient().SwitchToMatrixType(SPARSE, MatrixFormat::matrixFormatSparseBlockCol, false);
auto input0Gradient = OneSampleTensorFor(0, /*gradient=*/true, fr.AllowBroadcast());
auto input0Gradient = OneSampleTensorFor(0, /*gradient=*/true, fr.AllowBroadcast());
auto input1 = OneSampleTensorFor(1, /*gradient=*/false, fr.AllowBroadcast());
auto outputGradient = OneSampleTensorFor(-1, /*gradient=*/true, fr);
input0Gradient.AddMatrixProductOf(m_transpose/*transC*/, outputGradient, false/*transA*/, input1, true/*transB*/);
}
else if (inputIndex == 1) // right derivative
{
auto input0 = OneSampleTensorFor(0, /*gradient=*/false, fr.AllowBroadcast());
auto input1Gradient = OneSampleTensorFor(1, /*gradient=*/true, fr.AllowBroadcast());
auto input0 = OneSampleTensorFor(0, /*gradient=*/false, fr.AllowBroadcast());
auto input1Gradient = OneSampleTensorFor(1, /*gradient=*/true, fr.AllowBroadcast());
auto outputGradient = OneSampleTensorFor(-1, /*gradient=*/true, fr);
input1Gradient.AddMatrixProductOf(false/*transC*/, input0, !m_transpose/*transA*/, outputGradient, false/*transB*/);
}
@ -422,9 +436,6 @@ public:
std::swap(dimsA[0], dimsA[1]);
// update if LearnableParameter
Input(0)->ValidateInferInputDimsFrom(TensorShape(dimsA));
// and verify once again
if (isFinalValidationPass && Input(0)->GetSampleLayout().GetDims() != dimsA)
InvalidArgument("%ls %ls operation: Left [%s] and right [%s] operands' shapes are not compatible.", NodeName().c_str(), OperationName().c_str(), dimsAstring.c_str(), dimsBstring.c_str());
}
}
@ -490,7 +501,7 @@ template class TimesNode<double>;
// This differs from TimesNode in that A is transposed, where A must be a
// rank-1 or rank-2 tensor.
// A common use of transposition is trace(X'X) where X is a matrix of samples.
// This can be more efficiently implemented as ReducePlus (ElementTimes (X, X))
// This can be more efficiently implemented as ReduceSum (ElementTimes (X, X))
// -----------------------------------------------------------------------
template <class ElemType>
@ -564,7 +575,7 @@ public:
Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
sliceOutputValue.SetValue(sliceInput1Value);
sliceOutputValue.AssignValuesOf(sliceInput1Value);
sliceOutputValue.ColumnElementMultiplyWith(Input(0)->ValueAsMatrix());
}

Просмотреть файл

@ -17,9 +17,12 @@
namespace Microsoft { namespace MSR { namespace CNTK {
// MatrixPool -- class to support memory sharing
// Despite the gather general name of this class, it is specifically designed to support the memory sharing of ComputationNodes.
// Note: see #define SUPRESS_MEMSHARING below as for how to temporarily disable memory sharing altogether, for debugging
class MatrixPool
{
vector<shared_ptr<Matrix<float>>> m_releasedFloatMatrices;
vector<shared_ptr<Matrix<float>>> m_releasedFloatMatrices;
vector<shared_ptr<Matrix<double>>> m_releasedDoubleMatrices;
template <class ElemType>
@ -30,9 +33,12 @@ public:
template <class ElemType>
void Release(shared_ptr<Matrix<ElemType>> freeMatrix)
{
vector<shared_ptr<Matrix<ElemType>>>& releasedMatrices = GetReleasedMatrices<ElemType>();
if (freeMatrix == nullptr || freeMatrix->GetMatrixType() == SPARSE)
RuntimeError("MatrixPool::Release: freeMatrix should not be null or sparse.");
LogicError("MatrixPool::Release: freeMatrix should not be null or sparse.");
//#define SUPRESS_MEMSHARING // #define this to disable memory sharing through this structure
// TODO: Make this a runtime option.
#ifndef SUPRESS_MEMSHARING
vector<shared_ptr<Matrix<ElemType>>>& releasedMatrices = GetReleasedMatrices<ElemType>();
#ifdef _DEBUG
for (int i = 0; i < releasedMatrices.size(); i++)
{
@ -42,6 +48,7 @@ public:
#endif
releasedMatrices.push_back(freeMatrix);
#endif
}
template <class ElemType>
@ -65,4 +72,5 @@ public:
return matrixPtr;
}
};
} } }
}}}

Просмотреть файл

@ -448,8 +448,18 @@ public:
{
}
virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override { return childIndex == 0; }
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
virtual void /*IComputationNode::*/ BeginForwardProp() override // called before first iteration step of ForwardProp()
{
Base::BeginForwardProp();
// we switch result to dense as a work-around because ColumnSlice doesn't support all the sparse formats
// TODO: This is a stopgap. Is this the right thing to do? It changes the matrix type in-place.
Value().SwitchToMatrixType(MatrixType::DENSE, MatrixFormat::matrixFormatDense, false);
}
virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
{
ValidateNaryZip(isFinalValidationPass, /* allow broadcast */ true, /* num Inputs */ 3);
}
virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
{
@ -485,10 +495,8 @@ public:
}
}
virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
{
ValidateNaryZip(isFinalValidationPass, /* allow broadcast */ true, /* num Inputs */ 3);
}
virtual bool InputUsedInComputingInputNodesGradients(size_t childIndex) const override { return childIndex == 0; }
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
};
template class IfNode<float>;

Просмотреть файл

@ -321,7 +321,7 @@ public:
inp = Input(0)->ValueFor(frDelayed.Sequence(id));
// inp = Input(0)->ValueFor(FrameRange(m_pMBLayout, t_delayed).Sequence(id));
out.SetValue(inp);
out.AssignValuesOf(inp);
}
}
}
@ -358,7 +358,7 @@ public:
inp = Input(0)->ValueFor(frDelayed);
// inp = Input(0)->ValueFor(FrameRange(m_pMBLayout, t_delayed));
out.SetValue(inp);
out.AssignValuesOf(inp);
}
}

Просмотреть файл

@ -31,10 +31,13 @@ template <class ElemType>
/*virtual*/ void ReduceElementsNode<ElemType>::CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const /*override*/
{
Base::CopyTo(nodeP, newName, flags);
auto node = dynamic_pointer_cast<ReduceElementsNode<ElemType>>(nodeP);
node->m_axis = m_axis;
node->m_operation = m_operation;
node->m_op = m_op;
if (flags & CopyNodeFlags::copyNodeValue)
{
auto node = dynamic_pointer_cast<ReduceElementsNode<ElemType>>(nodeP);
node->m_axis = m_axis;
node->m_operation = m_operation;
node->m_op = m_op;
}
}
template <class ElemType>
@ -60,7 +63,7 @@ template <class ElemType>
auto result = ValueTensorFor(rank, fr);
auto input = Input(0)->ValueTensorFor(rank, fr);
// the actual operation is a Copy with a reduction op
// the actual operation is a Copy with reduction, where the magic is in the reduction op
result.DoUnaryOpOf(0, input, 1, ElementWiseOperator::opCopy, m_op);
// note: we can implement "Mean" by passing 1/dim for alpha
}
@ -79,7 +82,7 @@ template <class ElemType>
switch (m_op)
{
case ElementWiseOperator::opSum:
// "Plus": broadcast the gradient
// "Sum": broadcast the gradient
sliceInputGrad.AddCopyOf(sliceOutputGrad);
break;
@ -121,9 +124,13 @@ template <class ElemType>
template <class ElemType>
void ReduceElementsNode<ElemType>::ValidateOp()
{
#if 1 // legacy with initial experiments, delete this soon
if (m_operation == L"Plus") m_op = ElementWiseOperator::opSum;
else
#endif
if (m_operation == L"Sum") m_op = ElementWiseOperator::opSum;
// more here
else InvalidArgument("%ls was given an invalid operation code '%ls'. Allowed are: 'Plus'. And a few more soon.", NodeDescription().c_str(), m_operation.c_str());
else InvalidArgument("%ls was given an invalid operation code '%ls'. Allowed are: 'Sum'. And a few more soon.", NodeDescription().c_str(), m_operation.c_str());
}
template <class ElemType>

Просмотреть файл

@ -150,12 +150,12 @@ public:
virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
{
ValueFor(fr).SetValue(Input(0)->ValueFor(fr));
ValueFor(fr).AssignValuesOf(Input(0)->ValueFor(fr));
}
virtual void /*ComputationNode::*/ BackpropTo(const size_t inputIndex, const FrameRange& fr) override
{
Input(inputIndex)->GradientFor(fr).SetValue(GradientFor(fr));
Input(inputIndex)->GradientFor(fr).AssignValuesOf(GradientFor(fr));
}
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
@ -175,13 +175,16 @@ template class ReshapeNode<double>;
// Reduces (e.g. sums up) all elements in each sample (column) of the input.
// The optional axis can be 0 (meaning all elements) or a specific axis.
// Allowed operations:
// - "Plus"
// - "LogPlus" --not implemented yet
// - "Sum"
// - "LogSum" --not implemented yet
// - "Mean" --not implemented yet
// - "Max" --not implemented yet
// - "Min" --not implemented yet
// - "All" --not implemented yet
// - "Any" --not implemented yet
// TODO:
// - move to a different header, since it's not really Reshaping
// - consider to change to pass in a set of axes instead of only one
// -----------------------------------------------------------------------
template <class ElemType>
@ -216,7 +219,7 @@ public:
private:
int m_axis;
std::wstring m_operation; // the operation as a string, e.g. "Plus", see GetOpcode()
std::wstring m_operation; // the operation as a string, e.g. "Sum", see ValidateOp()
ElementWiseOperator m_op; // the operation mapped to our internal opCode
};
@ -252,7 +255,7 @@ public:
Input(1)->NodeName().c_str(), Input(1)->OperationName().c_str());
// copy the data from 'dataInput'
ValueFor(fr).SetValue(Input(0)->ValueFor(fr.WithLayout(Input(0)->GetMBLayout()))); // just propagate through
ValueFor(fr).AssignValuesOf(Input(0)->ValueFor(fr.WithLayout(Input(0)->GetMBLayout()))); // just propagate through
// TODO: Once we do in-place, the above must include a copy-to-self check (either here or inside the matrix lib).
}
@ -653,6 +656,7 @@ public:
WhereNode(DEVICEID_TYPE deviceId, const wstring& name) :
Base(deviceId, name)
{
MarkValueNonSharable();
}
virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override;
@ -693,6 +697,7 @@ public:
PackedIndexNode(DEVICEID_TYPE deviceId, const wstring& name) :
Base(deviceId, name)
{
MarkValueNonSharable();
}
virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override;
@ -1077,7 +1082,7 @@ public:
// (We still need to copy the values since there is currently no way to point to an input function value while reshaping at the same time.)
if (!m_pMBLayout || factor() == 1)
{
Value().Reshaped(newCols * m_numTargetRows, 1).SetValue(Input(0)->Value().Reshaped(cols * rows, 1)); // copy the values as one long vector
Value().Reshaped(newCols * m_numTargetRows, 1).AssignValuesOf(Input(0)->Value().Reshaped(cols * rows, 1)); // copy the values as one long vector
}
// layout case: reshape semantics happens across parallel seqeunces, i.e. requiring data shuffling
else
@ -1371,7 +1376,7 @@ reductions
----------
- these are/will be implemented as a node for samples, and as recurrences for sequences
- ReducePlus
- ReduceSum
- sum over all elements of a dimension, or over time
- ReduceMax, ReduceMin
- max

Просмотреть файл

@ -7,6 +7,7 @@
#include "Basics.h"
#include "ComputationNode.h"
#include "BatchNormalizationEngine.h"
#include "RNGHandle.h"
#include <map>
#include <string>
@ -177,6 +178,7 @@ public:
// first compute the softmax (column-wise)
// Note that we need both log and non-log for gradient computation.
m_logSoftmaxOfRight->AssignLogSoftmaxOf(Input(1)->ValueFor(fr), true);
// BUGBUG: No need to compute m_softmaxOfRight in ForwardProp, should be moved to BackpropTo().
m_softmaxOfRight->SetValue(*m_logSoftmaxOfRight);
m_softmaxOfRight->InplaceExp();
// flatten all gaps to zero, such that gaps will contribute zero to the sum
@ -780,7 +782,7 @@ private:
case 3:
{
Matrix<ElemType> grd_t = Input(CLASSPROBINDATA)->GradientFor(fr);
grd_t.SetValue(Input(CLASSPROBINDATA)->DataFor(m_clsSoftmax, fr));
grd_t.AssignValuesOf(Input(CLASSPROBINDATA)->DataFor(m_clsSoftmax, fr));
ComputeCEPartialToSoftmaxInputs(grd_t, Gradient(), c_t);
break;
}
@ -811,7 +813,7 @@ private:
size_t idx_in_class = y_t - lft_bnd;
ComputeCEPartialToSoftmaxInputs(softMax, Gradient(), idx_in_class);
m_grdToSoftMaxInput.ColumnSlice(sz, nbr_wrd).SetValue(softMax);
m_grdToSoftMaxInput.ColumnSlice(sz, nbr_wrd).AssignValuesOf(softMax);
});
m_needRecomputeGradientToSoftmaxInput = false;
@ -1459,8 +1461,7 @@ public:
{
// determine drop-out mask for this minibatch
auto sliceMask = DataFor(*m_maskOfDropout, fr);
sliceMask.SetUniformRandomMask((ElemType) m_dropoutRate, (ElemType)(1.0 / (1.0 - m_dropoutRate)) /*pre-scaled*/, m_randomSeed);
m_randomSeed += 1073807359; // 1073807359 is a very large prime number to avoid collision with other dropout nodes
sliceMask.SetUniformRandomMask((ElemType)m_dropoutRate, (ElemType)(1.0 / (1.0 - m_dropoutRate)) /*pre-scaled*/, GetRNGHandle());
// apply dropout mask
sliceOutputValue.AssignElementProductOf(sliceMask, sliceInput0Value);
}
@ -1482,6 +1483,18 @@ public:
void SetRandomSeed(const unsigned long val)
{
m_randomSeed = (unsigned long) val;
// Upon change of the seed, reset RNGHandle to force the creation of a new RNGHandle
// during forward propagation
m_RNGHandle = nullptr;
}
RNGHandle& GetRNGHandle()
{
if (m_RNGHandle == nullptr)
m_RNGHandle = RNGHandle::Create(ValuePtr()->GetDeviceId(), m_randomSeed);
return *m_RNGHandle;
}
virtual void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
@ -1512,6 +1525,7 @@ public:
private:
double m_dropoutRate;
unsigned long m_randomSeed;
std::shared_ptr<RNGHandle> m_RNGHandle;
shared_ptr<Matrix<ElemType>> m_maskOfDropout;
};
@ -1765,10 +1779,10 @@ public:
}
m_bnEng->Forward(sliceInputValue, scale, bias, expAvgFactor, blendFactor, runMean, runInvStdDev,
sliceOutputValue, m_epsilon, *m_saveMean, *m_saveInvStdDev);
sliceOutputValue, m_epsilon, *m_saveMean, *m_saveInvStdDev);
m_mbCount++;
}
m_mbCount++;
}
void Validate(bool isFinalValidationPass) override
{

Просмотреть файл

@ -18,6 +18,11 @@
#endif
#include "BestGpu.h"
#include "MPIWrapper.h"
#include "DataDeserializer.h"
#include "SequencePacker.h"
#include "NoRandomizer.h"
#include "HeapMemoryProvider.h"
#include "InputAndParamNodes.h"
// TODO: Temporary mechanism to enable memory sharing for
// node output value matrices. This will go away when the
@ -26,7 +31,50 @@ bool g_shareNodeValueMatrices = false;
namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
template <typename ElemType>
void CNTKEvalBase<ElemType>::Init(const std::string& config)
{
m_config.Parse(config);
size_t nThreads = m_config("numCPUThreads", "1");
CPUMatrix<ElemType>::SetNumThreads(nThreads);
g_shareNodeValueMatrices = m_config(L"shareNodeValueMatrices", false);
}
// CreateNetwork - create a network based on the network description
// networkDescription - network description
template <typename ElemType>
void CNTKEvalBase<ElemType>::CreateNetwork(const std::string& networkDescription)
{
ConfigParameters config;
config.Parse(networkDescription);
std::vector<wstring> outputNodeNames;
m_net = GetModelFromConfig<ConfigParameters, ElemType>(config, outputNodeNames);
if (m_net == nullptr)
{
LogicError("Unable to construct network from description");
}
}
// Destroy - cleanup and remove this class
// NOTE: this destroys the object, and it can't be used past this point
template <typename ElemType>
void CNTKEvalBase<ElemType>::Destroy()
{
// cleanup everything
m_net.reset();
}
// ----------------------------------------------------------------------------
// Basic interface
// ----------------------------------------------------------------------------
template <typename ElemType>
void EVAL_API GetEval(IEvaluateModel<ElemType>** peval)
{
*peval = new CNTKEval<ElemType>();
@ -41,51 +89,11 @@ extern "C" EVAL_API void GetEvalD(IEvaluateModel<double>** peval)
GetEval(peval);
}
template <class ElemType>
void CNTKEval<ElemType>::Init(const std::string& config)
{
m_start = 0;
m_config.Parse(config);
size_t nThreads = m_config("numCPUThreads", "1");
CPUMatrix<ElemType>::SetNumThreads(nThreads);
g_shareNodeValueMatrices = m_config(L"shareNodeValueMatrices", false);
}
// Destroy - cleanup and remove this class
// NOTE: this destroys the object, and it can't be used past this point
template <class ElemType>
void CNTKEval<ElemType>::Destroy()
{
// cleanup everything
m_net.reset();
delete m_reader;
delete m_writer;
delete this;
}
// CreateNetwork - create a network based on the network description
// networkDescription - network description
template <class ElemType>
void CNTKEval<ElemType>::CreateNetwork(const std::string& networkDescription)
{
ConfigParameters config;
config.Parse(networkDescription);
std::vector<wstring> outputNodeNames;
m_net = GetModelFromConfig<ConfigParameters, ElemType>(config, outputNodeNames);
if (m_net == nullptr)
{
LogicError("Unable to construct network from description");
}
}
// GetNodeDimensions - Get the node dimensions of the specified nodes
// dimensions - map from name of node to dimension of the node, will be appended to for Input/Output scenarios
// nodeGroup - type of node we are requesting (input/output/specified)
// NOTE: when nodeGroup==specified the dimensions map is expected to be populated with the string names of the nodes requested, dimensions will be modified return the current value.
template <class ElemType>
template <typename ElemType>
void CNTKEval<ElemType>::GetNodeDimensions(std::map<std::wstring, size_t>& dimensions, NodeGroup nodeGroup)
{
if (m_net == NULL)
@ -137,7 +145,7 @@ void CNTKEval<ElemType>::GetNodeDimensions(std::map<std::wstring, size_t>& dimen
// StartEvaluateMinibatchLoop - Prepare network for Evaluate() calls.
// outputNodeName - name of node that will be evaluated
template <class ElemType>
template <typename ElemType>
void CNTKEval<ElemType>::StartEvaluateMinibatchLoop(const std::wstring& outputNodeName)
{
m_net->StartEvaluateMinibatchLoop(m_net->GetNodeFromName(outputNodeName));
@ -146,7 +154,7 @@ void CNTKEval<ElemType>::StartEvaluateMinibatchLoop(const std::wstring& outputNo
// Evaluate - evaluate using the model with the given inputs and outputs
// inputs - map from node name to input vector
// outputs - map from node name to output vector, outputs vectors need to be preallocated by caller, sizing will happen during evaluation
template <class ElemType>
template <typename ElemType>
void CNTKEval<ElemType>::Evaluate(std::map<std::wstring, std::vector<ElemType>*>& inputs, std::map<std::wstring, std::vector<ElemType>*>& outputs)
{
size_t minibatchSize = m_config(L"minibatchSize", (size_t) 10240);
@ -183,7 +191,7 @@ void CNTKEval<ElemType>::Evaluate(std::map<std::wstring, std::vector<ElemType>*>
// Evaluate - evaluate using the model with the given inputs and outputs
// outputs - map from node name to output vector, outputs vectors need to be preallocated by caller, sizing will happen during evaluation
template <class ElemType>
template <typename ElemType>
void CNTKEval<ElemType>::Evaluate(std::map<std::wstring, std::vector<ElemType>*>& outputs)
{
// get the evaluation names from the output string
@ -206,14 +214,168 @@ void CNTKEval<ElemType>::Evaluate(std::map<std::wstring, std::vector<ElemType>*>
eval.WriteOutput(*m_writer, outNodeNames);
}
// ResetState - Reset the cell state when we get start of an utterance
template <class ElemType>
void CNTKEval<ElemType>::ResetState()
template <typename ElemType>
void CNTKEval<ElemType>::Destroy()
{
m_start = 1 - m_start;
CNTKEvalBase<ElemType>::Destroy();
delete m_reader;
delete m_writer;
delete this;
}
// instantiate all the combinations we expect to be used
template class CNTKEval<double>;
template class CNTKEval<float>;
// ----------------------------------------------------------------------------
// Extended interface
// ----------------------------------------------------------------------------
// Build a VariableLayout descriptor (name, element type, storage kind, sample
// dimension, dynamic-axis name) for a single computation node.
// n - the node to describe; its Value matrix determines dense vs. sparse storage.
// NOTE(review): n->GetMBLayout() is dereferenced unconditionally here, while
// ForwardPass() tolerates a null MB layout -- confirm every node passed in
// carries a layout.
template<typename ElemType>
VariableLayout CNTKEvalExtended<ElemType>::ToVariableLayout(const ComputationNodeBasePtr n)
{
    // The cast fails (yields null) when the node's value matrix is not of ElemType,
    // in which case the storage kind is reported as Undetermined.
    auto matrix = dynamic_pointer_cast<Matrix<ElemType>>(n->ValuePtr());
    return VariableLayout
    {
        /* name */ n->GetName(),
        /* type */ sizeof(ElemType) == sizeof(float) ? VariableLayout::Float32 : VariableLayout::Float64,
        /* storage */ matrix ? matrix->GetMatrixType() == MatrixType::DENSE ? VariableLayout::Dense :
                               matrix->GetMatrixType() == MatrixType::SPARSE ? VariableLayout::Sparse :
                               VariableLayout::Undetermined :
                               VariableLayout::Undetermined,
        /* dimension */ n->GetSampleLayout().GetNumElements(),
        /* dynamic axis */ wstring(n->GetMBLayout()->GetAxisName())
    };
}
// Prepare the network for ForwardPass() calls on the given outputs: resolves
// the output node set and the input nodes they depend on, allocates the
// forward-pass matrices, and caches the input matrix bindings.
// outputNodeNames - names of the nodes whose values will be requested.
template<typename ElemType>
void CNTKEvalExtended<ElemType>::StartForwardEvaluation(std::vector<wstring> outputNodeNames)
{
    // Keep the network in inference mode for the lifetime of this scoped object.
    m_scopedNetworkOperationMode = make_shared<ScopedNetworkOperationMode>(m_net, NetworkOperationMode::inferring);

    // Resolve the requested outputs and the inputs feeding them.
    m_outputNodes = m_net->OutputNodesByName(outputNodeNames);
    m_inputNodes = m_net->InputNodesForOutputs(outputNodeNames);

    // Allocate memory for the forward computation.
    m_net->AllocateAllMatrices({}, m_outputNodes, nullptr);
    m_net->StartEvaluateMinibatchLoop(m_outputNodes);
    m_inputMatrices = DataReaderHelpers::RetrieveInputMatrices(m_inputNodes);
}
// Returns a layout description (name/type/storage/dimension/axis) for every
// output node of the loaded network.
template<typename ElemType>
VariableSchema CNTKEvalExtended<ElemType>::GetOutputSchema() const
{
    VariableSchema outputLayouts;
    const auto& outputNodes = m_net->OutputNodes();
    for (const auto& node : outputNodes)
        outputLayouts.push_back(ToVariableLayout(node));
    return outputLayouts;
}
// Returns a layout description for every input node feeding the requested
// outputs. Falls back to all of the network's input nodes when
// StartForwardEvaluation() has not populated m_inputNodes yet.
template<typename ElemType>
VariableSchema CNTKEvalExtended<ElemType>::GetInputSchema() const
{
    auto nodes = m_inputNodes;
    if (nodes.empty())
    {
        // Default to all input nodes.
        nodes = m_net->InputNodesForOutputs({});
    }

    VariableSchema inputLayouts;
    for (const auto& node : nodes)
        inputLayouts.push_back(ToVariableLayout(node));
    return inputLayouts;
}
// Run one forward pass: bind the caller-provided input buffers to the input
// nodes, evaluate each requested output node, and copy its value matrix into
// the corresponding output buffer.
// inputs - one VariableBuffer per input node, in m_inputMatrices iteration order.
// output - preallocated buffers, one per entry of m_outputNodes, in that order.
template<typename ElemType>
void CNTKEvalExtended<ElemType>::ForwardPass(const Variables<ElemType>& inputs, Variables<ElemType>& output)
{
    // The caller must supply exactly one buffer per bound input node.
    if (inputs.size() != (size_t)std::distance(m_inputMatrices.begin(), m_inputMatrices.end()))
    {
        RuntimeError("Expected %d inputs, but got %d", (int)std::distance(m_inputMatrices.begin(), m_inputMatrices.end()), (int)inputs.size());
    }

    int i = 0;
    for (auto& input : m_inputMatrices)
    {
        // NOTE(review): this copies the caller's buffer; a const reference would avoid it -- confirm intended.
        VariableBuffer<ElemType> buffer = inputs[i];
        int numRows = input.second.sampleLayout.GetNumElements();
        int numCols = buffer.m_numberOfSamples;
        shared_ptr<Matrix<ElemType>> matrix = dynamic_pointer_cast<Matrix<ElemType>>(input.second.matrix);
        auto type = matrix->GetMatrixType();
        // Describe the minibatch as a single parallel sequence spanning all columns.
        input.second.pMBLayout->Init(1, numCols);
        input.second.pMBLayout->AddSequence(0, 0, 0, numCols);

        if (type == MatrixType::DENSE)
        {
            matrix->SetValue(numRows, numCols, matrix->GetDeviceId(), buffer.m_buffer.data(), matrixFlagNormal);
        }
        else if (type == MatrixType::SPARSE)
        {
            // In the sparse case the m_data layout is identical to CUDA's CSC layout
            // (see http://docs.nvidia.com/cuda/cusparse/#compressed-sparse-column-format-csc).
            matrix->SetMatrixFromCSCFormat(buffer.m_colIndices.data(), buffer.m_indices.data(), buffer.m_buffer.data(), buffer.m_buffer.size(), numRows, numCols);
        }

        ++i;
    }

    // Mark the inputs as updated so the forward pass recomputes their dependents.
    ComputationNetwork::BumpEvalTimeStamp(m_inputNodes);

    // NOTE(review): this loop's 'i' shadows the input index declared above.
    for (int i = 0; i < m_outputNodes.size(); ++i)
    {
        auto node = m_outputNodes[i];
        m_net->ForwardProp(node);
        shared_ptr<Matrix<ElemType>> outputMatrix = dynamic_pointer_cast<Matrix<ElemType>>(node->ValuePtr());
        auto pMBLayout = node->GetMBLayout();
        if (!pMBLayout)
        {
            // Nodes without a layout are wrapped in a synthetic one-sample frame layout.
            pMBLayout = make_shared<MBLayout>();
            pMBLayout->InitAsFrameMode(1); // treat this as if we have one single sample
        }

        const auto& seq = pMBLayout->GetAllSequences();
        if (seq.size() != 1)
        {
            RuntimeError("Only 1 sequence supported by this API"); // TODO
        }

        // Copy the full output matrix into the caller's (resized) buffer.
        std::vector<ElemType>& vec = output[i].m_buffer;
        vec.resize(outputMatrix->GetNumElements());
        // const_cast is redundant here (vec is non-const), kept as-is.
        ElemType* data = const_cast<ElemType*>(vec.data());
        size_t numElements = outputMatrix->GetNumElements();
        outputMatrix->CopyToArray(data, numElements);
    }
}
// Destroy - tear down base-class state and delete this instance.
// NOTE: this destroys the object; it must not be used past this point.
template <typename ElemType>
void CNTKEvalExtended<ElemType>::Destroy()
{
    CNTKEvalBase<ElemType>::Destroy();
    delete this;
}
// Factory: create a new extended-interface evaluator of the requested element type.
template <typename ElemType>
void EVAL_API GetEvalExtended(IEvaluateModelExtended<ElemType>** peval)
{
    *peval = new CNTKEvalExtended<ElemType>();
}

// C-callable factory entry points (float / double) for consumers that load
// the evaluation DLL dynamically.
extern "C" EVAL_API void GetEvalExtendedF(IEvaluateModelExtended<float>** peval)
{
    GetEvalExtended(peval);
}

extern "C" EVAL_API void GetEvalExtendedD(IEvaluateModelExtended<double>** peval)
{
    GetEvalExtended(peval);
}
template class CNTKEvalExtended<double>;
template class CNTKEvalExtended<float>;
} } }

Просмотреть файл

@ -22,48 +22,97 @@
namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
class CNTKEval : public IEvaluateModel<ElemType>
template <typename ElemType>
class CNTKEvalBase : public IEvaluateModelBase<ElemType>
{
protected:
typedef shared_ptr<ComputationNode<ElemType>> ComputationNodePtr;
EvalReader<ElemType>* m_reader;
EvalWriter<ElemType>* m_writer;
ConfigParameters m_config;
ComputationNetworkPtr m_net;
std::map<std::wstring, size_t> m_dimensions;
size_t m_start;
public:
// constructor
CNTKEval()
: m_reader(nullptr), m_net(nullptr)
{
}
CNTKEvalBase() : m_net(nullptr) { }
public:
// CreateNetwork - create a network based on the network description
// networkDescription - network description
virtual void CreateNetwork(const std::string& networkDescription);
// GetNodeDimensions - Get the node dimensions of the specified nodes
// dimensions - map from name of node to dimension of the node
// nodeGroup - type of node we are requesting (input/output/specified)
virtual void GetNodeDimensions(std::map<std::wstring, size_t>& dimensions, NodeGroup nodeGroup);
// StartEvaluateMinibatchLoop - Prepare network for Evaluate() calls.
// outputNodeName - name of node that will be evaluated
virtual void StartEvaluateMinibatchLoop(const std::wstring& outputNodeName);
// Evaluate - Evalute using the model with the given inputs and outputs
// inputs - map from node name to input vector
// outputs - map from node name to output vector, outputs vectors need to be preallocated by caller, sizing will happen during evaluation
virtual void Evaluate(std::map<std::wstring, std::vector<ElemType>*>& inputs, std::map<std::wstring, std::vector<ElemType>*>& outputs);
// Evaluate - Evalute using the model with the given inputs and outputs
// outputs - map from node name to output vector, outputs vectors need to be preallocated by caller, sizing will happen during evaluation
virtual void Evaluate(std::map<std::wstring, std::vector<ElemType>*>& outputs);
virtual void Init(const std::string& config);
virtual void Destroy();
virtual void ResetState();
};
// ------------------------------------------------------------------------
// Basic interface
// ------------------------------------------------------------------------
// Implementation of the basic (legacy) evaluation interface: Evaluate() calls
// over maps of named node vectors, driven by an internal reader/writer pair.
template <typename ElemType>
class CNTKEval : public CNTKEvalBase<ElemType>, public IEvaluateModel<ElemType>
{
    EvalReader<ElemType>* m_reader; // feeds caller-provided input vectors into the network
    EvalWriter<ElemType>* m_writer; // copies network outputs back into caller vectors
    std::map<std::wstring, size_t> m_dimensions;
    size_t m_start; // utterance-start flag, toggled by ResetState()

public:
    CNTKEval() : CNTKEvalBase<ElemType>(), m_reader(nullptr), m_writer(nullptr) {}

    // GetNodeDimensions - get the dimensions of the nodes in the given group.
    virtual void GetNodeDimensions(std::map<std::wstring, size_t>& dimensions, NodeGroup nodeGroup);

    // StartEvaluateMinibatchLoop - prepare the network for Evaluate() calls on the named output node.
    virtual void StartEvaluateMinibatchLoop(const std::wstring& outputNodeName);

    // Evaluate - run the model over the given inputs; output vectors are sized during evaluation.
    virtual void Evaluate(std::map<std::wstring, std::vector<ElemType>*>& inputs, std::map<std::wstring, std::vector<ElemType>*>& outputs);

    // Evaluate - run the model with no external inputs, filling the output vectors.
    virtual void Evaluate(std::map<std::wstring, std::vector<ElemType>*>& outputs);

    virtual void Destroy() override;

    virtual void CreateNetwork(const std::string& networkDescription) override
    {
        CNTKEvalBase<ElemType>::CreateNetwork(networkDescription);
    }

    virtual void Init(const std::string& config) override
    {
        CNTKEvalBase<ElemType>::Init(config);
        m_start = 0;
    }

    // ResetState - signal the start of a new utterance by toggling m_start.
    virtual void ResetState() override
    {
        m_start = 1 - m_start;
    }
};
// ------------------------------------------------------------------------
// Extended interface
// ------------------------------------------------------------------------
// Implementation of the extended evaluation interface: schema discovery plus
// ForwardPass() over typed variable buffers (dense or sparse CSC input).
template <typename ElemType>
class CNTKEvalExtended : public CNTKEvalBase<ElemType>, public IEvaluateModelExtended<ElemType>
{
    virtual VariableSchema GetOutputSchema() const override;

    virtual void StartForwardEvaluation(std::vector<wstring> outputs) override;

    virtual VariableSchema GetInputSchema() const override;

    virtual void ForwardPass(const Variables<ElemType>& inputs, Variables<ElemType>& output) override;

    virtual void Destroy() override;

    virtual void CreateNetwork(const std::string& networkDescription) override
    {
        CNTKEvalBase<ElemType>::CreateNetwork(networkDescription);
    }

    virtual void Init(const std::string& config) override
    {
        CNTKEvalBase<ElemType>::Init(config);
    }

private:
    // Translate a computation node into its externally visible layout description.
    static VariableLayout ToVariableLayout(const ComputationNodeBasePtr n);

    std::vector<ComputationNodeBasePtr> m_outputNodes;                        // outputs resolved by StartForwardEvaluation()
    std::shared_ptr<ScopedNetworkOperationMode> m_scopedNetworkOperationMode; // keeps the network in inference mode while alive
    std::vector<ComputationNodeBasePtr> m_inputNodes;                         // inputs feeding the requested outputs
    StreamMinibatchInputs m_inputMatrices;                                    // input-node matrix bindings
};
} } }

Просмотреть файл

@ -55,8 +55,8 @@
<TargetName>EvalDll</TargetName>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<AdditionalIncludeDirectories>$(SolutionDir)Source\SGDLib;$(SolutionDir)Source\ComputationNetworkLib;$(SolutionDir)Source\SequenceTrainingLib;$(SolutionDir)Source\Math;$(SolutionDir)Source\Common\Include;$(SolutionDir)Source\CNTK\BrainScript;$(SolutionDir)Source\ActionsLib;$(MSMPI_INC);$(NvmlInclude)</AdditionalIncludeDirectories>
<ClCompile>
<AdditionalIncludeDirectories>$(SolutionDir)Source\Readers\ReaderLib;$(SolutionDir)Source\SGDLib;$(SolutionDir)Source\ComputationNetworkLib;$(SolutionDir)Source\SequenceTrainingLib;$(SolutionDir)Source\Math;$(SolutionDir)Source\Common\Include;$(SolutionDir)Source\CNTK\BrainScript;$(SolutionDir)Source\ActionsLib;$(MSMPI_INC);$(NvmlInclude)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<AdditionalLibraryDirectories>$(SolutionDir)Source\ComputationNetworkLib;$(SolutionDir)Source\Math;$(MSMPI_LIB64);$(SolutionDir)$(Platform)\$(Configuration);$(NvmlLibPath)</AdditionalLibraryDirectories>
@ -99,7 +99,7 @@
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>ComputationNetworkLib.lib; Math.lib; Common.lib; ActionsLib.lib; kernel32.lib; user32.lib; shell32.lib; SequenceTrainingLib.lib; %(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>ComputationNetworkLib.lib; Math.lib; Common.lib; ActionsLib.lib; kernel32.lib; user32.lib; shell32.lib; SequenceTrainingLib.lib;ReaderLib.lib; %(AdditionalDependencies)</AdditionalDependencies>
<Profile>true</Profile>
<DelayLoadDLLs>Math.dll; nvml.dll; $(CudaRuntimeDll)</DelayLoadDLLs>
</Link>
@ -153,4 +153,4 @@
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>
</Project>

Просмотреть файл

@ -2,39 +2,18 @@
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<ClCompile Include="CNTKEval.cpp" />
<ClCompile Include="..\Common\fileutil.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\Common\File.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\Common\TimerUtility.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="dllmain.cpp">
<Filter>Misc</Filter>
</ClCompile>
<ClCompile Include="stdafx.cpp">
<Filter>Misc</Filter>
</ClCompile>
<ClCompile Include="..\Common\Config.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\Common\Eval.cpp">
<Filter>For External Use</Filter>
</ClCompile>
<ClCompile Include="..\Common\ExceptionWithCallStack.cpp">
<Filter>Common</Filter>
</ClCompile>
<ClCompile Include="..\CNTK\BrainScript\BrainScriptEvaluator.cpp">
<Filter>BrainScript</Filter>
</ClCompile>
<ClCompile Include="..\CNTK\BrainScript\BrainScriptParser.cpp">
<Filter>BrainScript</Filter>
</ClCompile>
<ClCompile Include="..\Common\DataReader.cpp">
<Filter>Common</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="EvalReader.h" />

Просмотреть файл

@ -150,7 +150,7 @@ public:
return true;
}
size_t GetNumParallelSequences()
size_t GetNumParallelSequencesForFixingBPTTMode()
{
return 1;
}

Просмотреть файл

@ -0,0 +1,99 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// CPPEvalClient.cpp : Sample application using the evaluation interface from C++
//
#include "stdafx.h"

#include <stdexcept>

#include "eval.h"
using namespace Microsoft::MSR::CNTK;
// Used for retrieving the model appropriate for the element type (float / double)
template<typename ElemType>
using GetEvalProc = void(*)(IEvaluateModel<ElemType>**);
typedef std::pair<std::wstring, std::vector<float>*> MapEntry;
typedef std::map<std::wstring, std::vector<float>*> Layer;
/// <summary>
/// Program for demonstrating how to run model evaluations using the native evaluation interface
/// </summary>
/// <description>
/// This program is a native C++ client using the native evaluation interface
/// located in the <see cref="eval.h"/> file.
/// The CNTK evaluation dll (EvalDLL.dll), must be found through the system's path.
/// The other requirement is that Eval.h be included
/// In order to run this program the model must already exist in the example. To create the model,
/// first run the example in <CNTK>/Examples/Image/MNIST. Once the model file 01_OneHidden is created,
/// you can run this client.
/// This program demonstrates the usage of the Evaluate method requiring the input and output layers as parameters.
int _tmain(int argc, _TCHAR* argv[])
{
    // Get the binary path (current working directory)
    argc = 0; // unused; assignment silences the unreferenced-parameter warning
    std::wstring wapp(argv[0]);
    std::string app(wapp.begin(), wapp.end());
    std::string path = app.substr(0, app.rfind("\\"));

    // Load the eval library
    auto hModule = LoadLibrary(L"evaldll.dll");
    if (hModule == nullptr)
    {
        // Throw by value so the standard catch-by-reference idiom works.
        // (The previous `throw new std::exception(...)` threw a heap pointer,
        // which bypasses `catch (const std::exception&)` and leaks.)
        throw std::runtime_error("Cannot find evaldll.dll library");
    }

    // Get the factory method to the evaluation engine
    std::string func = "GetEvalF";
    auto procAddress = GetProcAddress(hModule, func.c_str());
    auto getEvalProc = (GetEvalProc<float>)procAddress;

    // Native model evaluation instance
    IEvaluateModel<float> *model;
    getEvalProc(&model);

    // This relative path assumes launching from CNTK's binary folder
    const std::string modelWorkingDirectory = path + "\\..\\..\\Examples\\Image\\MNIST\\Data\\";
    const std::string modelFilePath = modelWorkingDirectory + "..\\Output\\Models\\01_OneHidden";

    // Load model
    model->CreateNetwork("modelPath=\"" + modelFilePath + "\"");

    // get the model's layers dimensions
    std::map<std::wstring, size_t> inDims;
    std::map<std::wstring, size_t> outDims;
    model->GetNodeDimensions(inDims, NodeGroup::nodeInput);
    model->GetNodeDimensions(outDims, NodeGroup::nodeOutput);

    // Generate dummy input values in the appropriate structure and size.
    // size_t index: the dimension is a size_t, so this avoids a signed/unsigned mismatch.
    auto inputLayerName = inDims.begin()->first;
    std::vector<float> inputs;
    for (size_t i = 0; i < inDims[inputLayerName]; i++)
    {
        inputs.push_back(static_cast<float>(i % 255));
    }

    // Allocate the output values layer
    std::vector<float> outputs;

    // Setup the maps for inputs and output
    Layer inputLayer;
    inputLayer.insert(MapEntry(inputLayerName, &inputs));
    Layer outputLayer;
    auto outputLayerName = outDims.begin()->first;
    outputLayer.insert(MapEntry(outputLayerName, &outputs));

    // We can call the evaluate method and get back the results (single layer)...
    model->Evaluate(inputLayer, outputLayer);

    // Output the results (standard range-for instead of the non-standard MSVC `for each ... in`)
    for (auto& value : outputs)
    {
        fprintf(stderr, "%f\n", value);
    }

    return 0;
}

Просмотреть файл

@ -0,0 +1,119 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Debug_CpuOnly|x64">
<Configuration>Debug_CpuOnly</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release_CpuOnly|x64">
<Configuration>Release_CpuOnly</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{578D52A0-3928-4405-A016-F016E8B49031}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>CPPEvalClient</RootNamespace>
</PropertyGroup>
<Import Project="$(SolutionDir)\CNTK.Cpp.props" />
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
<UseIntelMKL>No</UseIntelMKL>
</PropertyGroup>
<PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
<UseIntelMKL>No</UseIntelMKL>
<UseIntelIPP>false</UseIntelIPP>
</PropertyGroup>
<!--Importing CPP defaults must occur after declaring the desired toolset above
Otherwise, the build may default back to an previous toolset -->
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings" />
<ImportGroup Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup>
<!-- TODO intentional for all? -->
<LinkIncremental>false</LinkIncremental>
<TargetName>CPPEvalClient</TargetName>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<WarningLevel>Level4</WarningLevel>
<AdditionalIncludeDirectories>$(SolutionDir)Source\Common\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<PreprocessorDefinitions>WIN32;UNICODE;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>true</SDLCheck>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<FloatingPointModel>Fast</FloatingPointModel>
<OpenMPSupport>true</OpenMPSupport>
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<AdditionalLibraryDirectories>$(OutDir)</AdditionalLibraryDirectories>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>%(AdditionalDependencies)</AdditionalDependencies>
<DelayLoadDLLs>%(DelayLoadDLLs)</DelayLoadDLLs>
<Profile>true</Profile>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(DebugBuild)">
<ClCompile>
<PreprocessorDefinitions>_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<Optimization>Disabled</Optimization>
<MinimalRebuild>false</MinimalRebuild>
</ClCompile>
<Link />
<ProjectReference>
<LinkLibraryDependencies>false</LinkLibraryDependencies>
</ProjectReference>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(ReleaseBuild)">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<EnableParallelCodeGeneration>true</EnableParallelCodeGeneration>
<FloatingPointExceptions>false</FloatingPointExceptions>
<AdditionalOptions>/d2Zi+ %(AdditionalOptions)</AdditionalOptions>
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
</ClCompile>
<Link>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
<ProjectReference>
<LinkLibraryDependencies>true</LinkLibraryDependencies>
</ProjectReference>
</ItemDefinitionGroup>
<ItemGroup>
<ClInclude Include="stdafx.h" />
<ClInclude Include="targetver.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="CPPEvalClient.cpp" />
<ClCompile Include="stdafx.cpp">
<PrecompiledHeader>Create</PrecompiledHeader>
</ClCompile>
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" />
</Project>

Просмотреть файл

@ -0,0 +1,33 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
<Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
<Extensions>h;hh;hpp;hxx;hm;inl;inc;xsd</Extensions>
</Filter>
<Filter Include="Resource Files">
<UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
<Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
</Filter>
</ItemGroup>
<ItemGroup>
<ClInclude Include="stdafx.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="targetver.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
<ItemGroup>
<ClCompile Include="stdafx.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="CPPEvalClient.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
</Project>

Просмотреть файл

@ -0,0 +1,12 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// stdafx.cpp : source file that includes just the standard includes
// CPPEvalClient.pch will be the pre-compiled header
// stdafx.obj will contain the pre-compiled type information
#include "stdafx.h"
// TODO: reference any additional headers you need in STDAFX.H
// and not in this file

Просмотреть файл

@ -0,0 +1,19 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// stdafx.h : include file for standard system include files,
// or project specific include files that are used frequently, but
// are changed infrequently
//
#pragma once

#include "targetver.h"

#include <stdio.h>
#include <tchar.h>

// This is a windows only application
#include "Windows.h"

Просмотреть файл

@ -0,0 +1,13 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
// Including SDKDDKVer.h defines the highest available Windows platform.
// If you wish to build your application for a previous Windows platform, include WinSDKVer.h and
// set the _WIN32_WINNT macro to the platform you wish to support before including SDKDDKVer.h.
#include <SDKDDKVer.h>

Просмотреть файл

@ -9,7 +9,6 @@ using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net.Configuration;
namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
{
@ -63,6 +62,8 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
{
try
{
string outputLayerName;
// The examples assume the executable is running from the data folder
// We switch the current directory to the data folder (assuming the executable is in the <CNTK>/x64/Debug|Release folder
Environment.CurrentDirectory = Path.Combine(initialDirectory, @"..\..\Examples\Image\MNIST\Data\");
@ -70,22 +71,22 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
using (var model = new IEvaluateModelManagedF())
{
// Initialize model evaluator
string config = GetFileContents(Path.Combine(Environment.CurrentDirectory, @"..\Config\01_OneHidden.cntk"));
model.Init(config);
// Load model
string modelFilePath = Path.Combine(Environment.CurrentDirectory, @"..\Output\Models\01_OneHidden");
model.CreateNetwork(string.Format("deviceId=-1\nmodelPath=\"{0}\"", modelFilePath));
model.CreateNetwork(string.Format("modelPath=\"{0}\"", modelFilePath), deviceId:-1);
// Generate random input values in the appropriate structure and size
var inputs = GetDictionary("features", 28*28, 255);
var inDims = model.GetNodeDimensions(NodeGroup.nodeInput);
var inputs = GetDictionary(inDims.First().Key, inDims.First().Value, 255);
// We request the output layer names(s) and dimension, we'll use the first one.
var outDims = model.GetNodeDimensions(NodeGroup.nodeOutput);
outputLayerName = outDims.First().Key;
// We can call the evaluate method and get back the results (single layer)...
outputs = model.Evaluate(inputs, "ol.z", 10);
outputs = model.Evaluate(inputs, outputLayerName);
}
OutputResults("ol.z", outputs);
OutputResults(outputLayerName, outputs);
}
catch (CNTKException ex)
{
@ -112,20 +113,20 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
using (var model = new IEvaluateModelManagedF())
{
// Initialize model evaluator
string config = GetFileContents(Path.Combine(Environment.CurrentDirectory, @"..\Config\01_OneHidden.cntk"));
model.Init(config);
// Load model
string modelFilePath = Path.Combine(Environment.CurrentDirectory, @"..\Output\Models\01_OneHidden");
model.CreateNetwork(string.Format("deviceId=-1\nmodelPath=\"{0}\"", modelFilePath));
model.CreateNetwork(string.Format("modelPath=\"{0}\"", modelFilePath), deviceId:-1);
// Generate random input values in the appropriate structure and size
var inputs = GetDictionary("features", 28*28, 255);
var inDims = model.GetNodeDimensions(NodeGroup.nodeInput);
var inputs = GetDictionary(inDims.First().Key, inDims.First().Value, 255);
// We request the output layer names(s) and dimension, we'll use the first one.
var outDims = model.GetNodeDimensions(NodeGroup.nodeOutput);
string outputLayerName = outDims.First().Key;
// We can preallocate the output structure and pass it in (multiple output layers)
outputs = GetDictionary("ol.z", 10, 1);
outputs = GetDictionary(outputLayerName, outDims[outputLayerName], 1);
model.Evaluate(inputs, outputs);
}
@ -154,24 +155,26 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
Environment.CurrentDirectory = initialDirectory;
List<float> outputs;
string outputLayerName;
using (var model = new IEvaluateModelManagedF())
{
// Initialize model evaluator
model.Init("deviceId=-1");
// Create the network
string networkDescription = GetFileContents(Path.Combine(workingDirectory, @"AddOperatorConstant.cntk"));
model.CreateNetwork(networkDescription);
// This network (AddOperatorConstant.cntk) is a simple network consisting of a single binary operator (Plus)
// operating over a single input and a constant
string networkDescription = File.ReadAllText(Path.Combine(workingDirectory, @"AddOperatorConstant.cntk"));
model.CreateNetwork(networkDescription, deviceId:-1);
// Generate random input values in the appropriate structure and size
var inputs = new Dictionary<string, List<float>>() { { "features", new List<float>() { { 1.0f } } } };
// Generate random input value in the appropriate structure and size
var inputs = new Dictionary<string, List<float>>() { { "features", new List<float>() { 1.0f } } };
// We can call the evaluate method and get back the results (single layer)...
outputs = model.Evaluate(inputs, "ol", 1);
var outDims = model.GetNodeDimensions(NodeGroup.nodeOutput);
outputLayerName = outDims.First().Key;
outputs = model.Evaluate(inputs, outputLayerName);
}
OutputResults("ol", outputs);
OutputResults(outputLayerName, outputs);
}
catch (CNTKException ex)
{
@ -199,12 +202,11 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
using (var model = new IEvaluateModelManagedF())
{
// Initialize model evaluator
model.Init("deviceId=-1");
// Create the network
string networkDescription = GetFileContents(Path.Combine(workingDirectory, @"AddOperatorConstantNoInput.cntk"));
model.CreateNetwork(networkDescription);
// This network (AddOperatorConstantNoInput.cntk) is a simple network consisting of a single binary operator (Plus)
// operating over a two constants, therefore no input is necessary.
string networkDescription = File.ReadAllText(Path.Combine(workingDirectory, @"AddOperatorConstantNoInput.cntk"));
model.CreateNetwork(networkDescription, deviceId:-1);
// We can call the evaluate method and get back the results (single layer)...
outputs = model.Evaluate("ol", 1);
@ -273,16 +275,6 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
return dict;
}
/// <summary>
/// Reads the configuration file and returns the contents as a string
/// </summary>
/// <returns>The content of the configuration file</returns>
static string GetFileContents(string filePath)
{
var lines = System.IO.File.ReadAllLines(filePath);
return string.Join("\n", lines);
}
/// <summary>
/// Creates a list of random numbers
/// </summary>

Просмотреть файл

@ -33,6 +33,14 @@ ref class CNTKException;
template<typename ElemType>
using GetEvalProc = void(*)(IEvaluateModel<ElemType>**);
/// Enumeration for the types of nodes a caller can query dimensions for
public enum class NodeGroup
{
    nodeInput,    // an input node
    nodeOutput,   // an output node
    nodeSpecified // nodes explicitly named by the caller in the dimensions map
};
/// Managed wrapper for the native evaluation model
template<typename ElemType>
public ref class IEvaluateModelManaged : IDisposable
@ -110,11 +118,24 @@ public:
}
}
/// <summary>Creates a network based from the network description in the configuration</summary>
/// <param name="networkDescription">The configuration file containing the network description</param>
/// <param name="deviceId">The device ID to specify for the network</param>
/// <exception cref="ObjectDisposedException">Thrown when the evaluator has already been disposed</exception>
void CreateNetwork(String^ networkDescription, int deviceId)
{
    if (m_eval == nullptr)
    {
        throw gcnew ObjectDisposedException("Object has been disposed.");
    }

    // Prepend the device selection to the description, then delegate to the
    // single-argument overload.
    this->CreateNetwork(String::Format("deviceId={0}\n{1}", deviceId, networkDescription));
}
/// <summary>Evaluates the model using a single forward feed pass and retrieves the output layer data</summary>
/// <param name="outputKey"></param>
/// <param name="outputSize"></param>
/// <returns>Results for specified layer</returns>
List<ElemType>^ Evaluate(String^ outputKey, int outputSize)
__declspec(deprecated) List<ElemType>^ Evaluate(String^ outputKey, int outputSize)
{
if (m_eval == nullptr)
{
@ -179,6 +200,75 @@ public:
}
}
/// <summary>Evaluates the model using a single forward feed pass and retrieves the output layer data</summary>
/// <param name="outputKey"></param>
/// <param name="outputSize"></param>
/// <returns>Results for specified layer</returns>
List<ElemType>^ Evaluate(String^ outputKey)
{
if (m_eval == nullptr)
{
throw gcnew ObjectDisposedException("Object has been disposed.");
}
std::map<std::wstring, std::vector<ElemType>*> stdOutputs;
try
{
std::vector<shared_ptr<std::vector<ElemType>>> sharedOutputVectors;
int outputSize = GetNodeDimensions(NodeGroup::nodeOutput)[outputKey];
List<ElemType>^ outputs = gcnew List<ElemType>(outputSize);
for (int i = 0; i < outputSize; i++)
{
outputs->Add(*(gcnew ElemType));
}
Dictionary<String^, List<ElemType>^>^ outputMap = gcnew Dictionary<String^, List<ElemType>^>();
outputMap->Add(outputKey, outputs);
for each (auto item in outputMap)
{
pin_ptr<const WCHAR> key = PtrToStringChars(item.Key);
shared_ptr<std::vector<ElemType>> ptr = CopyList(item.Value);
sharedOutputVectors.push_back(ptr);
stdOutputs.insert(MapEntry(key, ptr.get()));
}
try
{
m_eval->Evaluate(stdOutputs);
}
catch (const exception& ex)
{
throw GetCustomException(ex);
}
auto enumerator = outputMap->Keys->GetEnumerator();
for (auto& map_item : stdOutputs)
{
// Retrieve the layer key
enumerator.MoveNext();
String^ key = enumerator.Current;
std::vector<ElemType> &refVec = *(map_item.second);
int index = 0;
// Copy output to CLI structure
for (auto& vec : refVec)
{
outputMap[key][index++] = vec;
}
}
return outputMap[outputKey];
}
catch (Exception^)
{
throw;
}
}
/// <summary>Evaluates the model against input data and retrieves the output layer data</summary>
/// <param name="inputs"></param>
/// <param name="outputs"></param>
@ -250,7 +340,7 @@ public:
/// <param name="outputKey"></param>
/// <param name="outputSize"></param>
/// <returns>Results for specified layer</returns>
List<ElemType>^ Evaluate(Dictionary<String^, List<ElemType>^>^ inputs, String^ outputKey, int outputSize)
__declspec(deprecated) List<ElemType>^ Evaluate(Dictionary<String^, List<ElemType>^>^ inputs, String^ outputKey, int outputSize)
{
List<ElemType>^ outputs = gcnew List<ElemType>(outputSize);
for (int i = 0; i < outputSize; i++)
@ -266,6 +356,63 @@ public:
return outputMap[outputKey];
}
/// <summary>Evaluates the model against input data and retrieves the desired output layer data</summary>
/// <param name="inputs"></param>
/// <param name="outputKey"></param>
/// <returns>Results for requested layer</returns>
List<ElemType>^ Evaluate(Dictionary<String^, List<ElemType>^>^ inputs, String^ outputKey)
{
auto outDims = GetNodeDimensions(NodeGroup::nodeOutput);
int outputSize = outDims[outputKey];
List<ElemType>^ outputs = gcnew List<ElemType>(outputSize);
for (int i = 0; i < outputSize; i++)
{
outputs->Add(*(gcnew ElemType));
}
Dictionary<String^, List<ElemType>^>^ outputMap = gcnew Dictionary<String^, List<ElemType>^>();
outputMap->Add(outputKey, outputs);
Evaluate(inputs, outputMap);
return outputMap[outputKey];
}
/// <summary>Returns the layer(s) and associated dimensions for the specified node group
/// <param name="nodeGroup">The node type to query for</param>
/// <returns>A dictionary mapping layer names to their dimension</returns>
Dictionary<String^, int>^ GetNodeDimensions(NodeGroup nodeGroup)
{
if (m_eval == nullptr)
{
throw gcnew ObjectDisposedException("Object has been disposed.");
}
std::map<std::wstring, size_t> stdDims;
try
{
Microsoft::MSR::CNTK::NodeGroup gr(GetNodeGroup(nodeGroup));
m_eval->GetNodeDimensions(stdDims, gr);
}
catch (const exception& ex)
{
throw GetCustomException(ex);
}
Dictionary<String^, int>^ dims = gcnew Dictionary<String^, int>();
for (auto& map_item : stdDims)
{
String^ key = gcnew String(map_item.first.c_str());
int dim = static_cast<int>(map_item.second);
dims->Add(key, dim);
}
return dims;
}
~IEvaluateModelManaged()
{
if (m_eval == nullptr)
@ -336,6 +483,23 @@ private:
return gcnew CNTKException(gcnew System::String(ex.what()));
}
}
/// <summary Converts a managed (CLI) enum NodeGroup to a native NodeGroup
/// <param name="nodeGroup">The managed (CLI) NodeGroup to convert to native</param>
Microsoft::MSR::CNTK::NodeGroup GetNodeGroup(NodeGroup nodeGroup)
{
switch ((int)nodeGroup)
{
case Microsoft::MSR::CNTK::NodeGroup::nodeInput:
return Microsoft::MSR::CNTK::NodeGroup::nodeInput;
case Microsoft::MSR::CNTK::NodeGroup::nodeOutput:
return Microsoft::MSR::CNTK::NodeGroup::nodeOutput;
case Microsoft::MSR::CNTK::NodeGroup::nodeSpecified:
return Microsoft::MSR::CNTK::NodeGroup::nodeSpecified;
default:
throw gcnew CNTKRuntimeException(String::Format("Cannot convert native NodeGroup with value: {0} to corresponding managed NodeGroup.",(int)nodeGroup), "");
}
}
};
/// <summary>Managed float-specific model evaluation class</summary>
@ -420,19 +584,35 @@ public:
// explanation to this behavior
void emit()
{
Dictionary<String^, List<float>^>^ nullDictF = nullptr;
Dictionary<String^, List<double>^>^ nullDictD = nullptr;
IEvaluateModelManagedF f;
f.Init("");
f.Evaluate(nullptr, nullptr);
f.Evaluate(nullptr, "", 0);
f.Evaluate("", 0);
f.Evaluate(nullptr, nullDictF);
f.Evaluate(nullptr, "");
f.Evaluate("");
f.CreateNetwork("");
f.CreateNetwork("", 0);
f.GetNodeDimensions(NodeGroup::nodeSpecified);
IEvaluateModelManagedD d;
d.Init("");
d.Evaluate(nullptr, nullptr);
d.Evaluate(nullptr, nullDictD);
d.Evaluate(nullptr, "");
d.Evaluate("");
d.CreateNetwork("");
d.CreateNetwork("", 0);
d.GetNodeDimensions(NodeGroup::nodeSpecified);
// Deprecated code, hush warnings locally only
#pragma warning(push)
#pragma warning(disable: 4996)
f.Evaluate(nullptr, "", 0);
f.Evaluate("", 0);
d.Evaluate(nullptr, "", 0);
d.Evaluate("", 0);
d.CreateNetwork("");
#pragma warning(pop)
}
}}}}}

Просмотреть файл

@ -659,8 +659,8 @@ CPUMatrix<ElemType>& CPUMatrix<ElemType>::DoGatherColumnsOf(ElemType beta, const
#pragma omp parallel for // TODO: Depending in circumstance, it may be more efficient to parallelize over rows.
foreach_column(jOut, us)
{
auto jInF = idx(0, jOut); // this is the column we need to get
if (jInF < 0) // negative index means gap
auto jInF = idx(0, jOut); // this is the column we need to get
if (std::isnan(jInF) || jInF < 0) // negative index means gap
continue;
size_t jIn = (size_t)jInF;
if (jIn >= a.GetNumCols())
@ -691,8 +691,8 @@ CPUMatrix<ElemType>& CPUMatrix<ElemType>::DoScatterColumnsOf(ElemType beta, cons
#pragma omp parallel for // TODO: Depending in circumstance, it may be more efficient to parallelize over rows.
foreach_column(jIn, a)
{
auto jOutF = idx(0, jIn); // this is the column we copy/add into
if (jOutF < 0) // negative index means gap
auto jOutF = idx(0, jIn); // this is the column we copy/add into
if (std::isnan(jOutF) || jOutF < 0) // negative index means gap
continue;
size_t jOut = (size_t)jOutF;
if (jOut >= GetNumCols())
@ -715,11 +715,12 @@ void CPUMatrix<ElemType>::SetValue(const ElemType v)
}
else
{
ElemType* bufPtr = Data();
ElemType* bufPtr = Data();
long m = (long) GetNumElements();
// 2-way thread parallelism is sufficient for the memory bound
// operation of just setting the values of an array.
const unsigned SETVALUE_NUM_THREADS = 2;
UNUSED(SETVALUE_NUM_THREADS); // in case OMP is turned off.
#pragma omp parallel for num_threads(SETVALUE_NUM_THREADS)
// four-way unrolling
for (long i = 0; i < (m & ~3); i += 4)
@ -852,6 +853,26 @@ void CPUMatrix<ElemType>::SetValue(const CPUMatrix<ElemType>& deepCopyFrom)
SetValue(deepCopyFrom.GetNumRows(), deepCopyFrom.GetNumCols(), deepCopyFrom.Data(), 0);
}
#if 0
template <class ElemType>
void CPUMatrix<ElemType>::SetValue(const GPUMatrix<ElemType>& /*deepCopyFrom*/)
{
NOT_IMPLEMENTED;
}
template <class ElemType>
void CPUMatrix<ElemType>::SetValue(const CPUSparseMatrix<ElemType>& deepCopyFrom)
{
deepCopyFrom.AssignColumnSliceToDense(*this, 0, deepCopyFrom.GetNumCols());
}
template <class ElemType>
void CPUMatrix<ElemType>::SetValue(const GPUSparseMatrix<ElemType>& /*deepCopyFrom*/)
{
NOT_IMPLEMENTED;
}
#endif
template <class ElemType>
void CPUMatrix<ElemType>::SetValue(const size_t numRows, const size_t numCols, ElemType* pArray, const size_t matrixFlags)
{
@ -1093,18 +1114,15 @@ void CPUMatrix<ElemType>::AddGaussianRandomValue(const ElemType mean, const Elem
//maskRate: percentage of values masked out (similar to dropout rate)
//scaleValue: which scale value to set to the left ones (unmasked items).
template <class ElemType>
void CPUMatrix<ElemType>::SetUniformRandomMask(const ElemType maskRate, const ElemType scaleValue, unsigned long seed)
void CPUMatrix<ElemType>::SetUniformRandomMask(const ElemType maskRate, const ElemType scaleValue, RNGHandle& rngHandle)
{
if (IsEmpty())
LogicError("SetUniformRandomValue: Matrix is empty.");
CPURNGHandle* cpuRNGHandle = dynamic_cast<CPURNGHandle*>(&rngHandle);
assert(cpuRNGHandle != nullptr);
auto& us = *this;
#ifdef _MSC_VER // TODO: check if available under GCC/Linux
std::ranlux64_base_01 generator;
generator.seed(seed == USE_TIME_BASED_SEED ? (unsigned long) time(NULL) : seed);
#else
std::default_random_engine generator(seed == USE_TIME_BASED_SEED ? (unsigned long) time(NULL) : seed);
#endif
std::uniform_real_distribution<ElemType> r(0, 1);
long m = (long) GetNumRows(), n = (long) GetNumCols();
@ -1114,19 +1132,19 @@ void CPUMatrix<ElemType>::SetUniformRandomMask(const ElemType maskRate, const El
// four-way unrolling
for (long i = 0; i < (m & ~3); i += 4)
{
v = r(generator);
v = r(cpuRNGHandle->Generator());
us(i, j) = v <= maskRate ? 0 : scaleValue;
v = r(generator);
v = r(cpuRNGHandle->Generator());
us(i + 1, j) = v <= maskRate ? 0 : scaleValue;
v = r(generator);
v = r(cpuRNGHandle->Generator());
us(i + 2, j) = v <= maskRate ? 0 : scaleValue;
v = r(generator);
v = r(cpuRNGHandle->Generator());
us(i + 3, j) = v <= maskRate ? 0 : scaleValue;
}
// handle remaining stuffs
for (long i = m & ~3; i < m; i++)
{
v = r(generator);
v = r(cpuRNGHandle->Generator());
us(i, j) = v <= maskRate ? 0 : scaleValue;
}
}
@ -1365,7 +1383,6 @@ void CPUMatrix<ElemType>::RequireSize(const size_t numRows, const size_t numCols
// Resize() -- change matrix size
// This function is cheap if the matrix size does not change.
// Current content is not preserved.
// BUGBUG: There is code that relies on zero initialization (without, we get subtle variations of output). That is wrong--we should initialize to QNaN and see where it fails.
// If growOnly is true, resize will not reallocate memory if the current memory is large enough (i.e., will not shrink).
// If this object does not own its memory then new memory cannot be allocated (one can still shrink and/or reshape).
template <class ElemType>
@ -1394,8 +1411,9 @@ void CPUMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, boo
}
// success
m_numRows = numRows;
m_numCols = numCols;
m_sliceViewOffset = 0;
m_numRows = numRows;
m_numCols = numCols;
}
// allocated by the callee but should be deleted by the caller
@ -6290,6 +6308,10 @@ template CPUMatrix<char>& CPUMatrix<char>::operator=(CPUMatrix<char>&&);
template void CPUMatrix<char>::SetValue(const char);
template void CPUMatrix<char>::SetValue(const size_t numRows, const size_t numCols, char* pArray, size_t matrixFlags);
template void CPUMatrix<char>::SetValue(CPUMatrix<char> const&);
//template void CPUMatrix<char>::SetValue(GPUMatrix<char> const&);
//template void CPUMatrix<char>::SetValue(CPUSparseMatrix<char> const&);
//template void CPUMatrix<char>::SetValue(GPUSparseMatrix<char> const&);
template void CPUMatrix<char>::RequireSize(const size_t numRows, const size_t numCols, bool growOnly);
template void CPUMatrix<char>::Resize(const size_t numRows, const size_t numCols, bool growOnly);
template CPUMatrix<int>::CPUMatrix(const size_t, const size_t, int*, const size_t);

Просмотреть файл

@ -8,11 +8,16 @@
#include "File.h"
#include "Helpers.h"
#include "CommonMatrix.h"
#include "CPURNGHandle.h"
#include <vector>
#include <stdio.h>
#include <ctime>
#include <limits.h>
//#include "GPUMatrix.h"
//#include "CPUSparseMatrix.h"
//#include "GPUSparseMatrix.h"
// NOTE NOTE NOTE:
// use CPUSingleMatrix and CPUDoubleMatrix instead of using the template directly
///////////////////////////////////////////////
@ -127,6 +132,9 @@ public:
void SetValue(const ElemType v);
void SetValue(const CPUMatrix<ElemType>& deepCopyFrom);
//void SetValue(const GPUMatrix<ElemType>& deepCopyFrom);
//void SetValue(const CPUSparseMatrix<ElemType>& deepCopyFrom);
//void SetValue(const GPUSparseMatrix<ElemType>& deepCopyFrom);
void SetValue(const size_t numRows, const size_t numCols, ElemType* pArray, size_t matrixFlags = matrixFlagNormal);
void MaskColumnsValue(const CPUMatrix<char>& columnsMask, ElemType val);
@ -139,7 +147,7 @@ public:
void SetDiagonalValue(const CPUMatrix<ElemType>& vector);
void SetUniformRandomValue(const ElemType low, const ElemType high, unsigned long seed = USE_TIME_BASED_SEED);
void SetGaussianRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed = USE_TIME_BASED_SEED);
void SetUniformRandomMask(const ElemType maskRate, const ElemType scaleValue, unsigned long seed = USE_TIME_BASED_SEED);
void SetUniformRandomMask(const ElemType maskRate, const ElemType scaleValue, RNGHandle& rngHandle);
void AddGaussianRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed = USE_TIME_BASED_SEED);
CPUMatrix<ElemType> Transpose();

Просмотреть файл

@ -0,0 +1,24 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// CPUMatrix.cpp : full implementation of all matrix functions on the CPU side
//
#include "stdafx.h"
#include "CPURNGHandle.h"
namespace Microsoft { namespace MSR { namespace CNTK {
CPURNGHandle::CPURNGHandle(int deviceId, unsigned long seed)
: RNGHandle(deviceId)
{
#ifdef _MSC_VER // TODO: check if available under GCC/Linux
m_generator.reset(new std::ranlux64_base_01());
m_generator->seed(seed);
#else
m_generator.reset(new std::default_random_engine(seed));
#endif
}
}}}

Просмотреть файл

@ -0,0 +1,42 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// CPUMatrix.cpp : full implementation of all matrix functions on the CPU side
//
#pragma once
#include "RNGHandle.h"
#include <memory>
#include <random>
namespace Microsoft { namespace MSR { namespace CNTK {
class CPURNGHandle : public RNGHandle
{
public:
CPURNGHandle(int deviceId, unsigned long seed);
#ifdef _MSC_VER // TODO: check if available under GCC/Linux
std::ranlux64_base_01& Generator()
{
return *m_generator;
}
private:
std::unique_ptr<std::ranlux64_base_01> m_generator;
#else
std::default_random_engine& Generator()
{
return *m_generator;
}
private:
std::unique_ptr<std::default_random_engine> m_generator;
#endif
};
}}}

Просмотреть файл

@ -264,8 +264,36 @@ void CPUSparseMatrix<ElemType>::SetValue(const CPUSparseMatrix<ElemType>& v)
memcpy(RowLocation(), v.RowLocation(), v.RowSize());
memcpy(ColLocation(), v.ColLocation(), v.ColSize());
}
if (v.m_sliceViewOffset > 0)
{
CPUSPARSE_INDEX_TYPE* loc = (GetFormat() == matrixFormatSparseCSC) ? ColLocation() : RowLocation();
size_t len = (GetFormat() == matrixFormatSparseCSC) ? ColSize() : RowSize();
CPUSPARSE_INDEX_TYPE offset = loc[0];
for (size_t c = 0; c < len; c++)
loc[c] -= offset;
}
}
#if 0
template <class ElemType>
void CPUSparseMatrix<ElemType>::SetValue(const CPUMatrix<ElemType>& /*v*/)
{
NOT_IMPLEMENTED;
}
template <class ElemType>
void CPUSparseMatrix<ElemType>::SetValue(const GPUMatrix<ElemType>& /*v*/)
{
NOT_IMPLEMENTED;
}
template <class ElemType>
void CPUSparseMatrix<ElemType>::SetValue(const GPUSparseMatrix<ElemType>& /*v*/)
{
NOT_IMPLEMENTED;
}
#endif
template <class ElemType>
void CPUSparseMatrix<ElemType>::MaskColumnsValue(const CPUMatrix<char>& columnsMask, ElemType val)
{
@ -392,7 +420,7 @@ CPUSparseMatrix<ElemType> CPUSparseMatrix<ElemType>::ColumnSlice(size_t startCol
}
template <class ElemType>
CPUMatrix<ElemType> CPUSparseMatrix<ElemType>::CopyColumnSliceToDense(size_t startColumn, size_t numCols) const
void CPUSparseMatrix<ElemType>::AssignColumnSliceToDense(CPUMatrix<ElemType>& slice, size_t startColumn, size_t numCols) const
{
if (startColumn + numCols > m_numCols)
InvalidArgument("The slice (%d+%d) is out of range of the source matrix (%d).", (int) startColumn, (int) numCols, (int) m_numCols);
@ -400,8 +428,10 @@ CPUMatrix<ElemType> CPUSparseMatrix<ElemType>::CopyColumnSliceToDense(size_t sta
if (GetFormat() != MatrixFormat::matrixFormatSparseCSC)
NOT_IMPLEMENTED;
CPUMatrix<ElemType> slice(m_numRows, numCols);
// We can either error out or RequireSize. Because RequireSize will error out if it's not allowed, I think this makes more sense.
slice.RequireSize(m_numRows, numCols);
memset(slice.Data(), 0, sizeof(ElemType) * slice.GetNumElements());
#pragma omp parallel for
for (long j = 0; j < numCols; j++)
{
@ -416,6 +446,14 @@ CPUMatrix<ElemType> CPUSparseMatrix<ElemType>::CopyColumnSliceToDense(size_t sta
}
}
}
template <class ElemType>
CPUMatrix<ElemType> CPUSparseMatrix<ElemType>::CopyColumnSliceToDense(size_t startColumn, size_t numCols) const
{
CPUMatrix<ElemType> slice(m_numRows, numCols);
AssignColumnSliceToDense(slice, startColumn, numCols);
return slice;
}
@ -1339,15 +1377,20 @@ template CPUSparseMatrix<char>::CPUSparseMatrix(CPUSparseMatrix<char> const&);
template CPUSparseMatrix<char>::CPUSparseMatrix(CPUSparseMatrix<char>&&);
template CPUSparseMatrix<char>& CPUSparseMatrix<char>::operator=(CPUSparseMatrix<char>&& moveFrom);
template void CPUSparseMatrix<char>::SetValue(size_t, size_t, char);
//template void CPUSparseMatrix<char>::SetValue(CPUMatrix<char> const&);
//template void CPUSparseMatrix<char>::SetValue(GPUMatrix<char> const&);
template void CPUSparseMatrix<char>::SetValue(CPUSparseMatrix<char> const&);
//template void CPUSparseMatrix<char>::SetValue(GPUSparseMatrix<char> const&);
template char* CPUSparseMatrix<char>::Data() const;
template char* CPUSparseMatrix<char>::Data();
template void CPUSparseMatrix<char>::Reset(void);
template void CPUSparseMatrix<char>::Resize(const size_t, const size_t, const size_t, const bool);
template void CPUSparseMatrix<char>::RequireSizeAndAllocate(const size_t, const size_t, const size_t, const bool, bool);
template void CPUSparseMatrix<char>::RequireSizeAndAllocate(const size_t, const size_t, const size_t, const MatrixFormat, const bool, bool);
template CPUSparseMatrix<char>::~CPUSparseMatrix();
template CPUSparseMatrix<char> CPUSparseMatrix<char>::ColumnSlice(size_t startColumn, size_t numCols) const;
template CPUMatrix<char> CPUSparseMatrix<char>::CopyColumnSliceToDense(size_t startColumn, size_t numCols) const;
template void CPUSparseMatrix<char>::AssignColumnSliceToDense(CPUMatrix<char>&, size_t startColumn, size_t numCols) const;
template CPUSparseMatrix<char>& CPUSparseMatrix<char>::operator=(const CPUSparseMatrix<char>& deepCopyFrom);
template CPUSparseMatrix<int>::CPUSparseMatrix(const MatrixFormat, const size_t, const size_t, const size_t);

Просмотреть файл

@ -6,6 +6,8 @@
#include <stdio.h>
#include "CPUMatrix.h"
//#include "GPUMatrix.h"
//#include "GPUSparseMatrix.h"
#include <map>
#include <unordered_map>
@ -82,7 +84,11 @@ public:
public:
void SetValue(const size_t row, const size_t col, ElemType val);
//void SetValue(const CPUMatrix<ElemType>& /*val*/);
//void SetValue(const GPUMatrix<ElemType>& /*val*/);
void SetValue(const CPUSparseMatrix<ElemType>& /*val*/);
//void SetValue(const GPUSparseMatrix<ElemType>& /*val*/);
void MaskColumnsValue(const CPUMatrix<char>& columnsMask, ElemType val);
size_t BufferSize() const
@ -98,6 +104,7 @@ public:
CPUSparseMatrix<ElemType> ColumnSlice(size_t startColumn, size_t numCols) const;
CPUMatrix<ElemType> CopyColumnSliceToDense(size_t startColumn, size_t numCols) const;
void AssignColumnSliceToDense(CPUMatrix<ElemType>& slice, size_t startColumn, size_t numCols) const;
CPUMatrix<ElemType> DiagonalToDense() const;

Просмотреть файл

@ -175,7 +175,7 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize, con
assert(gridDim.y == 1);
assert(gridDim.z == 1);
assert(::isfinite(epsilon) && epsilon > 0);
assert(::isfinite(expAvgFactor) && expAvgFactor > 0);
assert(::isfinite(expAvgFactor) && expAvgFactor >= 0);
int irowSrcBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
if (irowSrcBase >= vectorSize)

Просмотреть файл

@ -269,4 +269,4 @@ __global__ void kAveragePoolingBackward(int batchSize, const int* mpRowCol, cons
}
}
} } }
}}}

Просмотреть файл

@ -312,7 +312,7 @@ protected:
if (in.GetMatrixType() == MatrixType::DENSE || m_gpuSparse1D)
inputSubBatch = in.ColumnSlice(startSampleId, smallBatchSize);
else
inputSubBatch.SetValue(in.ColumnSlice(startSampleId, smallBatchSize), in.GetFormat());
inputSubBatch.SetValue(in.ColumnSlice(startSampleId, smallBatchSize));
if (m_gpuSparseOpt)
{

Просмотреть файл

@ -313,10 +313,12 @@ private:
template <typename TAlgo, typename TFinder, typename TStaticFinder>
void FindBestAlgo(size_t batchSize, TAlgo& algo, TFinder finder, TStaticFinder staticFinder)
{
if (!algo.NeedAutotuning(batchSize))
return;
m_inT.UpdateBatchSize(batchSize);
m_outT.UpdateBatchSize(batchSize);
if (!algo.NeedAutotuning(batchSize))
return;
using CuDnnAlgoT = decltype(TAlgo::Algo);
CuDnnAlgoT algoPerf[MaxAlgoCount];
int calgo = 0;
@ -327,7 +329,7 @@ private:
{
decltype(CuDnnAlgoT::algo) noMemAlgo;
CUDNN_CALL(staticFinder(noMemAlgo));
algo.CurMBSize = batchSize;
algo.MaxAllowedMBSizeForCurrentAlgo = batchSize;
algo.Algo = algoPerf[0];
algo.Algo.algo = noMemAlgo;
algo.Algo.memory = 0;
@ -347,7 +349,7 @@ private:
});
if (res == algoPerf + calgo)
RuntimeError("cuDNN could not find suitable algorithm for the current convolution configuration.");
algo.CurMBSize = batchSize;
algo.MaxAllowedMBSizeForCurrentAlgo = batchSize;
algo.Algo = *res;
// Find fastest algorithm that does NOT require workspace. It is used as a fallback algo in Forward function.
res = std::find_if(algoPerf, algoPerf + calgo,
@ -380,13 +382,14 @@ private:
using CuDnnAlgoT = decltype(T::algo);
ConvAlgoInfo()
: CurMBSize(0)
: MaxAllowedMBSizeForCurrentAlgo(0)
{
Algo.status = CUDNN_STATUS_NOT_INITIALIZED;
NoWorkspaceAlgo = (CuDnnAlgoT)-1;
}
// Current mini-batch size, needed for re-computing statistics in auto-tuner.
size_t CurMBSize;
size_t MaxAllowedMBSizeForCurrentAlgo;
T Algo;
CuDnnAlgoT NoWorkspaceAlgo;
@ -399,7 +402,7 @@ private:
// We also need to reset auto-tuning status at the beginning of each epoch but ComputationNode currently does not provide such notification.
// We assume no other dimensions of tensors can change so we don't check it.
// REVIEW alexeyk: review once we get response from NVIDIA.
return (Algo.status != CUDNN_STATUS_SUCCESS || batchSize > CurMBSize);
return (Algo.status != CUDNN_STATUS_SUCCESS || batchSize > MaxAllowedMBSizeForCurrentAlgo);
}
};
@ -432,7 +435,8 @@ bool CuDnnConvolutionEngineFactory<ElemType>::IsSupported(DEVICEID_TYPE deviceId
// REVIEW alexeyk: IsSupported check should be performed by cuDNN itself. Is there a good way to do that?
cudaDeviceProp props = {0};
if (cudaGetDeviceProperties(&props, deviceId) != cudaSuccess || props.major < 3)
// Note that cudaGetDeviceProperties also sets CUDA last error so need to check/clear both.
if (deviceId < 0 || (cudaGetDeviceProperties(&props, deviceId) | cudaGetLastError()) != cudaSuccess || props.major < 3)
return false;
const auto& input = geometry->InputShape();

Просмотреть файл

@ -11,7 +11,7 @@
#include "GPUMatrix.h"
#include "GPUMatrixCUDAKernels.cuh"
#include "GPUSparseMatrix.h"
//#include "GPUSparseMatrix.h"
#include "GPUTensor.h"
#include "CommonMatrix.h"
#define TENSOR_OPS_DECL __device__ __host__
@ -450,7 +450,7 @@ template <class ElemType>
GPUMatrix<ElemType>::GPUMatrix(const GPUMatrix<ElemType>& deepCopyFrom)
{
ZeroInit();
SetValue(deepCopyFrom);
SetValue(deepCopyFrom);
}
template <class ElemType>
@ -886,11 +886,11 @@ __global__ void _doGatherColumnsOf(ElemType* us, size_t usStride, const ElemType
CUDA_LONG jOut = id / usStride; // col index into 'us' and 'idx'
auto jInF = idx[jOut * idxStride]; // this is the column we need to get
if (jInF < 0) // negative index means gap
if (::isnan(jInF) || jInF < 0) // negative index means gap
return;
size_t jIn = (size_t)jInF;
if (jIn >= aCols)
return; // actually a failure
//if (jIn >= aCols)
// return; // actually a failure
const ElemType& ra = a[ i + jIn * aStride ];
ElemType& rus = us[id/*i + jOut * usStride*/];
@ -929,6 +929,21 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::DoGatherColumnsOf(ElemType beta, const
return *this;
}
// little helper for debugging
template <class ElemType>
static void Peek(const GPUMatrix<ElemType>& m, const char* which)
{
size_t rows = m.GetNumRows();
size_t cols = m.GetNumCols();
ElemType buf[10000] = { 0 };
size_t n = min(rows * cols, _countof(buf));
CUDA_CALL(cudaMemcpy(buf, m.Data(), sizeof(ElemType) * n, cudaMemcpyDeviceToHost));
UNUSED(which); UNUSED(rows); UNUSED(cols); sin(1.0f); // set breakpoint here
//CUDA_CALL(cudaMemcpy(const_cast<ElemType*>(m.Data()), buf, sizeof(ElemType) * n, cudaMemcpyHostToDevice));
}
#define ALLOW_ATOMIC_SCATTER // allow to disable this, until we know atomicAdd() works properly here
template <class ElemType>
__global__ void _doScatterColumnsOf(ElemType* us, size_t usStride, size_t usCols, const ElemType* idx, size_t idxStride, const ElemType* a, size_t aStride, const ElemType alpha, CUDA_LONG numElements)
{
@ -941,35 +956,26 @@ __global__ void _doScatterColumnsOf(ElemType* us, size_t usStride, size_t usCols
CUDA_LONG i = id % aStride; // row index into 'a' and 'us'
CUDA_LONG jIn = id / aStride; // col index into 'a' and 'idx'
auto jOutF = idx[jIn * idxStride]; // this is the column we copy/add into
if (jOutF < 0) // negative index means gap
auto jOutF = idx[jIn * idxStride]; // this is the column we copy/add into
if (::isnan(jOutF) || jOutF < 0) // negative index means gap
return;
size_t jOut = (size_t)jOutF;
if (jOut >= usCols)
return; // actually a failure --TODO: This should not be necessary. Why is it?
//if (jOut >= usCols)
// return; // actually a failure --TODO: This should not be necessary. Why is it?
const ElemType& ra = a[id/*i + jIn * aStride*/];
ElemType& rus = us[ i + jOut * usStride ];
ElemType res = ra * alpha;
if (res != 0) // avoid memory conflict if e.g. an entire column has no gradient
#ifdef ALLOW_ATOMIC_SCATTER
atomicAdd(&rus, res); // rus += res;
#else
rus += res;
#endif
// Note: atomicAdd() is supposed to be fast in case of no conflict (the simple case of Scatter())
}
// little helper for debugging
template <class ElemType>
static void Peek(const GPUMatrix<ElemType>& m, const char* which)
{
size_t rows = m.GetNumRows();
size_t cols = m.GetNumCols();
ElemType buf[10000] = { 0 };
size_t n = min(rows * cols, _countof(buf));
CUDA_CALL(cudaMemcpy(buf, m.Data(), sizeof(ElemType) * n, cudaMemcpyDeviceToHost));
UNUSED(which); UNUSED(rows); UNUSED(cols); sin(1.0f); // set breakpoint here
//CUDA_CALL(cudaMemcpy(const_cast<ElemType*>(m.Data()), buf, sizeof(ElemType) * n, cudaMemcpyHostToDevice));
}
// *this[:,idx[j]] = a[:,j] * alpha + *this[:,idx[j]] * beta
template <class ElemType>
GPUMatrix<ElemType>& GPUMatrix<ElemType>::DoScatterColumnsOf(ElemType beta, const GPUMatrix<ElemType>& idx, const GPUMatrix<ElemType>& a, ElemType alpha)
@ -987,6 +993,27 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::DoScatterColumnsOf(ElemType beta, cons
auto& us = *this;
#ifndef ALLOW_ATOMIC_SCATTER // verify that atomicAdd is not needed --this is not efficient
{
vector<ElemType> buf(idx.GetNumRows() * idx.GetNumCols()); // idx(,)are the column(s) we copy/add into
CUDA_CALL(cudaMemcpy(buf.data(), idx.Data(), sizeof(ElemType) * buf.size(), cudaMemcpyDeviceToHost));
vector<bool> writtenTo(GetNumCols(), false); // remember whether an output column is in fact a target
for (size_t i = 0; i < buf.size(); i++)
{
auto colF = buf[i];
if (std::isnan(colF) || colF < 0)
continue;
size_t col = (size_t)colF;
if (col >= GetNumCols())
LogicError("DoScatterColumnsOf: Index value out of bounds.");
if (writtenTo[col])
LogicError("DoScatterColumnsOf: #ifndef ALLOW_ATOMIC_SCATTER then columns must be unique. Column idx(%d,%d)=%d is used twice.", (int)(i % idx.GetNumCols()), (int)(i / idx.GetNumCols()), (int)col);
else
writtenTo[col] = true;
}
}
#endif
// pre-scale with beta upfront
// Scatter may add more than one source column to the same target, so we must pre-scale with beta, and then just keep adding.
Scale(beta, us); // if beta is 0, then this will be a memset()
@ -1091,9 +1118,29 @@ void GPUMatrix<ElemType>::SetValue(const GPUMatrix<ElemType>& deepCopyFrom)
if (this == &deepCopyFrom)
return;
SetValue(deepCopyFrom.GetNumRows(), deepCopyFrom.GetNumCols(), deepCopyFrom.GetComputeDeviceId(), deepCopyFrom.Data(), matrixFlagSetValueOnDevice);
SetValue(deepCopyFrom.GetNumRows(), deepCopyFrom.GetNumCols(), deepCopyFrom.GetComputeDeviceId(), deepCopyFrom.Data(), matrixFlagSetValueOnDevice);
}
#if 0
template <class ElemType>
void GPUMatrix<ElemType>::SetValue(const CPUMatrix<ElemType>& /*deepCopyFrom*/)
{
NOT_IMPLEMENTED;
}
template <class ElemType>
void GPUMatrix<ElemType>::SetValue(const CPUSparseMatrix<ElemType>& /*deepCopyFrom*/)
{
NOT_IMPLEMENTED;
}
template <class ElemType>
void GPUMatrix<ElemType>::SetValue(const GPUSparseMatrix<ElemType>& deepCopyFrom)
{
deepCopyFrom.CopyToDenseMatrix(*this);
}
#endif
template <class ElemType>
void GPUMatrix<ElemType>::SetValue(const size_t numRows, const size_t numCols, int deviceId, ElemType* pArray, size_t matrixFlags)
{
@ -1225,21 +1272,22 @@ void GPUMatrix<ElemType>::SetGaussianRandomValue(const ElemType mean, const Elem
//maskRate: percentage of values masked out (similar to dropout rate)
//scaleValue: which scale value to set to the left ones (unmasked items).
template <class ElemType>
void GPUMatrix<ElemType>::SetUniformRandomMask(const ElemType maskRate, const ElemType scaleValue, unsigned long seed)
void GPUMatrix<ElemType>::SetUniformRandomMask(const ElemType maskRate, const ElemType scaleValue, RNGHandle& rngHandle)
{
PrepareDevice();
CreateCurandObject(seed, __FUNCTION__); // TODO call ResetCurandObject() instead?
GPURNGHandle* gpuRNGHandle = dynamic_cast<GPURNGHandle*>(&rngHandle);
assert(gpuRNGHandle != nullptr);
cudaEvent_t done = nullptr;
CUDA_CALL(cudaEventCreate(&done)); // TODO: why not condition on do_sync, so that we can use SyncGuard?
if (sizeof(ElemType) == sizeof(float))
CURAND_CALL(curandGenerateUniform((((curandGenerator_t*) s_curandGenerator)[0]), reinterpret_cast<float*>(Data()), GetNumElements()));
CURAND_CALL(curandGenerateUniform(gpuRNGHandle->Generator(), reinterpret_cast<float*>(Data()), GetNumElements()));
else
CURAND_CALL(curandGenerateUniformDouble((((curandGenerator_t*) s_curandGenerator)[0]), reinterpret_cast<double*>(Data()), GetNumElements()));
CURAND_CALL(curandGenerateUniformDouble(gpuRNGHandle->Generator(), reinterpret_cast<double*>(Data()), GetNumElements()));
CUDA_CALL(cudaEventRecord(done));
CUDA_CALL(cudaEventSynchronize(done));
CUDA_CALL(cudaEventDestroy(done));
// CURAND_CALL(curandDestroyGenerator(gen));
size_t N = GetNumElements();
size_t blocksPerGrid = (size_t) ceil(N / (double) GridDim::maxThreadsPerBlock);
@ -1420,29 +1468,27 @@ void GPUMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, boo
if (GetNumRows() == numRows && GetNumCols() == numCols)
return;
size_t numElements = numRows * numCols;
if (numElements > GetSizeAllocated() || // grow allocation
(!growOnly && numElements != GetSizeAllocated())) // shrink allocation if not growOnly
{
// reallocate buffer if numElements > 0
ElemType* pArray = nullptr;
if (numElements > 0)
pArray = TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), numRows, numCols);
// If the buffer exists, free it
if (Buffer())
TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), Buffer());
SetBuffer(pArray, numElements * sizeof(ElemType));
SetSizeAllocated(numElements);
}
// success
m_sliceViewOffset = 0;
m_numRows = numRows;
m_numCols = numCols;
size_t numElements = GetNumElements();
if (numElements > GetSizeAllocated() || (!growOnly && numElements != GetSizeAllocated()))
{
if (IsEmpty())
{
SetSizeAllocated(0);
SetBuffer(nullptr, 0);
}
else
{
if (Buffer())
{
TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), Buffer());
}
SetSizeAllocated(numElements);
SetBuffer(TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), m_numRows, m_numCols), numElements * sizeof(ElemType));
CUDA_CALL(cudaMemset(Buffer(), 0, sizeof(ElemType) * GetSizeAllocated()));
}
}
m_sliceViewOffset = 0;
}
template <class ElemType>
@ -2711,7 +2757,7 @@ void GPUMatrix<ElemType>::VectorMax(GPUMatrix<ElemType>& maxIndexes, GPUMatrix<E
reinterpret_cast<uint8_t*&>(inIdx) += sizeof(uint64_t) - cbAlign;
outIdx = inIdx + celt;
void* ptmp = outIdx + celt;
assert(reinterpret_cast<ElemType*>(reinterpret_cast<uint8_t*>(ptmp) + cbtemp) <= workspace->Data()+ workspace->GetNumElements());
assert(reinterpret_cast<ElemType*>(reinterpret_cast<uint8_t*>(ptmp) + cbtemp) <= workspace->Data() + workspace->GetNumElements());
// Initialize indices.
const int ThreadsPerBlock = 128;
@ -4390,7 +4436,10 @@ template GPUMatrix<char>& GPUMatrix<char>::operator=(GPUMatrix<char>&&);
template GPUMatrix<char>::GPUMatrix(int);
template void GPUMatrix<char>::SetValue(const char);
template void GPUMatrix<char>::SetValue(const size_t numRows, const size_t numCols, int deviceId, char* pArray, size_t matrixFlags);
//template void GPUMatrix<char>::SetValue(CPUMatrix<char> const&);
template void GPUMatrix<char>::SetValue(GPUMatrix<char> const&);
//template void GPUMatrix<char>::SetValue(CPUSparseMatrix<char> const&);
//template void GPUMatrix<char>::SetValue(GPUSparseMatrix<char> const&);
template GPUMatrix<int>::GPUMatrix(const size_t, const size_t, int, int*, const size_t);
template GPUMatrix<int>::~GPUMatrix();

Просмотреть файл

@ -11,6 +11,7 @@
#include "TensorShape.h" // only for SmallVector; I was hoping to keep this out
#include "BestGpu.h" // for CPUONLY macro
#include "ConcStack.h"
#include "GPURNGHandle.h"
#include <string>
#include <vector>
#include <array>
@ -19,6 +20,10 @@
#include <memory> // for unique_ptr
#include <limits.h> // for ULONG_MAX
//#include "CPUMatrix.h"
//#include "CPUSparseMatrix.h"
//#include "GPUSparseMatrix.h"
#ifndef _WIN32
#include <unistd.h>
#endif
@ -225,14 +230,17 @@ public:
void MaskColumnsValue(const GPUMatrix<char>& columnsMask, ElemType val);
//void SetValue(const CPUMatrix<ElemType>& deepCopyFrom);
void SetValue(const GPUMatrix<ElemType>& deepCopyFrom);
//void SetValue(const CPUSparseMatrix<ElemType>& deepCopyFrom);
//void SetValue(const GPUSparseMatrix<ElemType>& deepCopyFrom);
void SetValue(const size_t numRows, const size_t numCols, int deviceId, ElemType* pArray, size_t matrixFlags = matrixFlagNormal);
void SetDiagonalValue(const ElemType v);
void SetDiagonalValue(const GPUMatrix<ElemType>& vector);
void SetUniformRandomValue(const ElemType low, const ElemType high, unsigned long seed = USE_TIME_BASED_SEED);
void SetGaussianRandomValue(const ElemType mean, const ElemType sigma, unsigned long seed = USE_TIME_BASED_SEED);
void SetUniformRandomMask(const ElemType maskRate, const ElemType scaleValue, unsigned long seed = USE_TIME_BASED_SEED);
void SetUniformRandomMask(const ElemType maskRate, const ElemType scaleValue, RNGHandle& rngHandle);
GPUMatrix<ElemType> Transpose() const;
GPUMatrix<ElemType>& AssignTransposeOf(const GPUMatrix<ElemType>& a);
@ -642,7 +650,10 @@ public:
{
m_done = nullptr;
if (DoSync())
{
CUDA_CALL(cudaGetLastError());
CUDA_CALL(cudaEventCreate(&m_done));
}
}
~SyncGuard()
{

Просмотреть файл

@ -47,6 +47,7 @@
// NVIDIA should fix their CUDA 8.0 headers
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 600
// CUDA atomicAdd() only exists for 'float'. This is the 'double' version.
// TODO: This may need to be guarded by CUDA version; newer devices may support this.
static __inline__ __device__ double atomicAdd(double* address, double val)
{
unsigned long long int* address_as_ull = (unsigned long long int*) address;
@ -100,7 +101,7 @@ static INT CeilDiv(INT a, INT2 b) // ceil(a/b)
struct GridDim
{
static const CUDA_LONG maxThreadsPerBlock = 512; // use this many threads per block
static const CUDA_LONG maxWarpsPerBlock = 16; // use this many warps per block
static const CUDA_LONG maxWarpsPerBlock = 16; // use this many warps per block. This means 512 threads for warpSize=32
// use these for launching
// GridDim grid(NN);
@ -123,6 +124,7 @@ struct GridDim
CUDA_LONG warpsPerProc = CeilDiv(N, numProcs * warpSize);
// if too many warps per block then reduce #warps
// This limits the number of threads to 512.
if (warpsPerProc > maxWarpsPerBlock)
{
CUDA_LONG overBy = CeilDiv(warpsPerProc, maxWarpsPerBlock); // we are over by this factor
@ -130,7 +132,7 @@ struct GridDim
}
// put it back together
m_threadsPerBlock = warpsPerProc * warpSize;
m_threadsPerBlock = warpsPerProc * warpSize; // =a multiple of 32 that is as close to 512 as makes sense given NN
m_blocksPerGrid = CeilDiv(N, m_threadsPerBlock);
if (m_blocksPerGrid == 1)
m_threadsPerBlock = N; // don't launch more than necessary --TODO: Does this make a difference at all?
@ -151,13 +153,18 @@ struct GridDim
return props;
}
static size_t GetCurrentDeviceId()
{
int deviceId;
cudaGetDevice(&deviceId);
return (size_t)deviceId;
}
// get device properties of current device
static const cudaDeviceProp& GetDeviceProps()
{
static std::vector<cudaDeviceProp> props = CacheDeviceProps(); // thread-safe according to C++ standard
int deviceId;
cudaGetDevice(&deviceId);
return props[deviceId];
return props[GetCurrentDeviceId()];
}
// compute our location on the grid
@ -3157,7 +3164,8 @@ __global__ void _scaleSparseBlockAndAddToDense(
rhs[IDX2C(row, col, numRows)] += alpha * lhsValues[index];
}
// compute predictions in cross entory node
#if 0
// compute predictions in cross entropy node
template <class ElemType>
__global__ void _computePrediction(
int nv,
@ -3340,6 +3348,7 @@ __global__ void _computeGradientOfInput(
atomicAdd(&grd[IDX2C(h, j, numrows)], sum);
}
#endif
template <class ElemType>
__global__ void computeNCEForwardProp(
@ -3718,6 +3727,8 @@ __global__ void _assignNceDerivativeNew(
atomicAdd(&c[wid], -er);
}
}
#if 0
// compute gradients of weights in cross entropy node
template <class ElemType>
__global__ void _computeGradientOfWeight(
@ -3779,6 +3790,7 @@ __global__ void _computeGradientOfWeight(
blockIds[ii] = i;
}
}
#endif
// used in clipping gradients
template <class ElemType>

Некоторые файлы не были показаны из-за слишком большого количества измененных файлов Показать больше