Merge from master branch

2015-05-20 17:11:52 -07:00 · 2015-05-20 17:11:52 -07:00 · b1f7263326
--- a/.gitignore
+++ b/.gitignore
@ -171,6 +171,7 @@ core
 # =========================
 # prebuild file 
 # =========================
-MachineLearning/cn/buildinfo.h
+MachineLearning/CNTK/buildinfo.h
+MachineLearning/CNTK/buildinfo.h$$


--- a/CNTK.sln
+++ b/CNTK.sln
@ -0,0 +1,244 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio 2013
+VisualStudioVersion = 12.0.21005.1
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKMath", "Math\Math\Math.vcxproj", "{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}"
+	ProjectSection(ProjectDependencies) = postProject
+		{B3DD765E-694E-4494-BAD7-37BBF2942517} = {B3DD765E-694E-4494-BAD7-37BBF2942517}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTK", "MachineLearning\CNTK\CNTK.vcxproj", "{E6F26F9A-FF64-4F0A-B749-CD309EE357EE}"
+	ProjectSection(ProjectDependencies) = postProject
+		{33D2FD22-DEF2-4507-A58A-368F641AEBE5} = {33D2FD22-DEF2-4507-A58A-368F641AEBE5}
+		{D667AF32-028A-4A5D-BE19-F46776F0F6B2} = {D667AF32-028A-4A5D-BE19-F46776F0F6B2}
+		{9A2F2441-5972-4EA8-9215-4119FCE0FB68} = {9A2F2441-5972-4EA8-9215-4119FCE0FB68}
+		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
+		{014DA766-B37B-4581-BC26-963EA5507931} = {014DA766-B37B-4581-BC26-963EA5507931}
+		{62836DC1-DF77-4B98-BF2D-45C943B7DDC6} = {62836DC1-DF77-4B98-BF2D-45C943B7DDC6}
+		{1D5787D4-52E4-45DB-951B-82F220EE0C6A} = {1D5787D4-52E4-45DB-951B-82F220EE0C6A}
+		{E6646FFE-3588-4276-8A15-8D65C22711C1} = {E6646FFE-3588-4276-8A15-8D65C22711C1}
+	EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Unit Tests", "Unit Tests", "{D45DF403-6781-444E-B654-A96868C5BE68}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKMathTest", "Math\CNTKMathTest\CNTKMathTest.vcxproj", "{6CEE834A-8104-46A8-8902-64C81BD7928F}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "HTKMLFReader", "DataReader\HTKMLFReader\HTKMLFReader.vcxproj", "{33D2FD22-DEF2-4507-A58A-368F641AEBE5}"
+	ProjectSection(ProjectDependencies) = postProject
+		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MathPerformanceTests", "Math\MathPerformanceTests\MathPerformanceTests.vcxproj", "{668BEED5-AC07-4F35-B3AE-EE65A7F9C976}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "UCIFastReader", "DataReader\UCIFastReader\UCIFastReader.vcxproj", "{E6646FFE-3588-4276-8A15-8D65C22711C1}"
+	ProjectSection(ProjectDependencies) = postProject
+		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
+		{1D5787D4-52E4-45DB-951B-82F220EE0C6A} = {1D5787D4-52E4-45DB-951B-82F220EE0C6A}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "BinaryReader", "DataReader\BinaryReader\BinaryReader.vcxproj", "{1D5787D4-52E4-45DB-951B-82F220EE0C6A}"
+	ProjectSection(ProjectDependencies) = postProject
+		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LUSequenceReader", "DataReader\LUSequenceReader\LUSequenceReader.vcxproj", "{62836DC1-DF77-4B98-BF2D-45C943B7DDC6}"
+	ProjectSection(ProjectDependencies) = postProject
+		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKEval", "MachineLearning\CNTKEval\CNTKEval.vcxproj", "{482999D1-B7E2-466E-9F8D-2119F93EAFD9}"
+	ProjectSection(ProjectDependencies) = postProject
+		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKEvalTest", "MachineLearning\CNTKEval\CNTKEvalTest\CNTKEvalTest.vcxproj", "{0F30EBCF-09F3-4EED-BF54-4214BCE53FEC}"
+	ProjectSection(ProjectDependencies) = postProject
+		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
+		{482999D1-B7E2-466E-9F8D-2119F93EAFD9} = {482999D1-B7E2-466E-9F8D-2119F93EAFD9}
+	EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Reader Plugins", "Reader Plugins", "{33EBFE78-A1A8-4961-8938-92A271941F94}"
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "CNTK Core", "CNTK Core", "{DD043083-71A4-409A-AA91-F9C548DCF7EC}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKMathCUDA", "Math\Math\CNTKMathCUDA.vcxproj", "{B3DD765E-694E-4494-BAD7-37BBF2942517}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LMSequenceReader", "DataReader\LMSequenceReader\LMSequenceReader.vcxproj", "{9A2F2441-5972-4EA8-9215-4119FCE0FB68}"
+	ProjectSection(ProjectDependencies) = postProject
+		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "DSSMReader", "DataReader\DSSMReader\DSSMReader.vcxproj", "{014DA766-B37B-4581-BC26-963EA5507931}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LibSVMBinaryReader", "DataReader\LibSVMBinaryReader\LibSVMBinaryReader.vcxproj", "{D667AF32-028A-4A5D-BE19-F46776F0F6B2}"
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Linux build files", "Linux build files", "{3ED0465D-23E7-4855-9694-F788717B6533}"
+	ProjectSection(SolutionItems) = preProject
+		Makefile.cpu = Makefile.cpu
+		Makefile.gpu = Makefile.gpu
+		Makefile_kaldi.cpu = Makefile_kaldi.cpu
+		Makefile_kaldi.gpu = Makefile_kaldi.gpu
+		README = README
+	EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Documentation", "Documentation", "{065AF55D-AF02-448B-BFCD-52619FDA4BD0}"
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Tutorial", "Tutorial", "{98D2C32B-0C1F-4E19-A626-65F7BA4600CF}"
+	ProjectSection(SolutionItems) = preProject
+		Documentation\Tutorial\CNTK-Tutorial-ICASSP2015.pdf = Documentation\Tutorial\CNTK-Tutorial-ICASSP2015.pdf
+	EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "CNTK-TechReport", "CNTK-TechReport", "{EA67F51F-1FE8-462D-9F3E-01161685AD59}"
+	ProjectSection(SolutionItems) = preProject
+		Documentation\CNTK-TechReport\lyx\CNTKBook-20150518.pdf = Documentation\CNTK-TechReport\lyx\CNTKBook-20150518.pdf
+	EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Documents", "Documents", "{DE1A06BA-EC5C-4E0D-BCA8-3EA555310C58}"
+	ProjectSection(SolutionItems) = preProject
+		Documentation\Documents\Configuration Files.docx = Documentation\Documents\Configuration Files.docx
+		Documentation\Documents\External Buffer Behavior.docx = Documentation\Documents\External Buffer Behavior.docx
+		Documentation\Documents\Model Editing Language.docx = Documentation\Documents\Model Editing Language.docx
+		Documentation\Documents\Network Description Language.docx = Documentation\Documents\Network Description Language.docx
+	EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "License", "License", "{63024704-A2D7-497E-AD4B-5C10C6AA1374}"
+	ProjectSection(SolutionItems) = preProject
+		license\MSR Computational Network Toolkit_MSR-LA (2014-03-28).docx = license\MSR Computational Network Toolkit_MSR-LA (2014-03-28).docx
+	EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "lyx", "lyx", "{F9BEB27E-8AF5-464E-8D45-0000D5AFA2D3}"
+	ProjectSection(SolutionItems) = preProject
+		Documentation\CNTK-TechReport\lyx\#CNTKBook_CNTK_Programmer_Chapter.lyx# = Documentation\CNTK-TechReport\lyx\#CNTKBook_CNTK_Programmer_Chapter.lyx#
+		Documentation\CNTK-TechReport\lyx\CNTKBook-master.lyx = Documentation\CNTK-TechReport\lyx\CNTKBook-master.lyx
+		Documentation\CNTK-TechReport\lyx\CNTKBook_Abstract.lyx = Documentation\CNTK-TechReport\lyx\CNTKBook_Abstract.lyx
+		Documentation\CNTK-TechReport\lyx\CNTKBook_ASRDecoder_Chapter.lyx = Documentation\CNTK-TechReport\lyx\CNTKBook_ASRDecoder_Chapter.lyx
+		Documentation\CNTK-TechReport\lyx\CNTKBook_CN_Chapter.lyx = Documentation\CNTK-TechReport\lyx\CNTKBook_CN_Chapter.lyx
+		Documentation\CNTK-TechReport\lyx\CNTKBook_CNTK_Adv_Chapter.lyx = Documentation\CNTK-TechReport\lyx\CNTKBook_CNTK_Adv_Chapter.lyx
+		Documentation\CNTK-TechReport\lyx\CNTKBook_CNTK_Chapter.lyx = Documentation\CNTK-TechReport\lyx\CNTKBook_CNTK_Chapter.lyx
+		Documentation\CNTK-TechReport\lyx\CNTKBook_CNTK_Programmer_Chapter.lyx = Documentation\CNTK-TechReport\lyx\CNTKBook_CNTK_Programmer_Chapter.lyx
+		Documentation\CNTK-TechReport\lyx\CNTKBook_ExampleSetup_Chapter.lyx = Documentation\CNTK-TechReport\lyx\CNTKBook_ExampleSetup_Chapter.lyx
+		Documentation\CNTK-TechReport\lyx\CNTKBook_Introduction.lyx = Documentation\CNTK-TechReport\lyx\CNTKBook_Introduction.lyx
+		Documentation\CNTK-TechReport\lyx\references.bib = Documentation\CNTK-TechReport\lyx\references.bib
+	EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "figures", "figures", "{889C1CCF-92B3-450B-B00D-FC9A9D5BE464}"
+	ProjectSection(SolutionItems) = preProject
+		Documentation\CNTK-TechReport\figures\CN+TrainingCriterion.pdf = Documentation\CNTK-TechReport\figures\CN+TrainingCriterion.pdf
+		Documentation\CNTK-TechReport\figures\CN+TrainingCriterion.png = Documentation\CNTK-TechReport\figures\CN+TrainingCriterion.png
+		Documentation\CNTK-TechReport\figures\CN-1HiddenNN.pdf = Documentation\CNTK-TechReport\figures\CN-1HiddenNN.pdf
+		Documentation\CNTK-TechReport\figures\CN-1HiddenNN.png = Documentation\CNTK-TechReport\figures\CN-1HiddenNN.png
+		Documentation\CNTK-TechReport\figures\CN-2Inputs.pdf = Documentation\CNTK-TechReport\figures\CN-2Inputs.pdf
+		Documentation\CNTK-TechReport\figures\CN-2Inputs.png = Documentation\CNTK-TechReport\figures\CN-2Inputs.png
+		Documentation\CNTK-TechReport\figures\CN-EfficientGradient.pdf = Documentation\CNTK-TechReport\figures\CN-EfficientGradient.pdf
+		Documentation\CNTK-TechReport\figures\CN-EfficientGradient.png = Documentation\CNTK-TechReport\figures\CN-EfficientGradient.png
+		Documentation\CNTK-TechReport\figures\CN-NaiveGradient.pdf = Documentation\CNTK-TechReport\figures\CN-NaiveGradient.pdf
+		Documentation\CNTK-TechReport\figures\CN-NaiveGradient.png = Documentation\CNTK-TechReport\figures\CN-NaiveGradient.png
+		Documentation\CNTK-TechReport\figures\CN-ShareWeight.pdf = Documentation\CNTK-TechReport\figures\CN-ShareWeight.pdf
+		Documentation\CNTK-TechReport\figures\CN-ShareWeight.png = Documentation\CNTK-TechReport\figures\CN-ShareWeight.png
+		Documentation\CNTK-TechReport\figures\CN-WithDelayNode.pdf = Documentation\CNTK-TechReport\figures\CN-WithDelayNode.pdf
+		Documentation\CNTK-TechReport\figures\CN-WithDelayNode.png = Documentation\CNTK-TechReport\figures\CN-WithDelayNode.png
+		Documentation\CNTK-TechReport\figures\CNNComputation.pdf = Documentation\CNTK-TechReport\figures\CNNComputation.pdf
+		Documentation\CNTK-TechReport\figures\CNNComputation.png = Documentation\CNTK-TechReport\figures\CNNComputation.png
+		Documentation\CNTK-TechReport\figures\CNTKArch.pdf = Documentation\CNTK-TechReport\figures\CNTKArch.pdf
+		Documentation\CNTK-TechReport\figures\CNTKArch.png = Documentation\CNTK-TechReport\figures\CNTKArch.png
+		Documentation\CNTK-TechReport\figures\ConfusionData1.png = Documentation\CNTK-TechReport\figures\ConfusionData1.png
+		Documentation\CNTK-TechReport\figures\ConfusionData100.png = Documentation\CNTK-TechReport\figures\ConfusionData100.png
+		Documentation\CNTK-TechReport\figures\SequenceBatch.pdf = Documentation\CNTK-TechReport\figures\SequenceBatch.pdf
+		Documentation\CNTK-TechReport\figures\SequenceBatch.png = Documentation\CNTK-TechReport\figures\SequenceBatch.png
+		Documentation\CNTK-TechReport\figures\SimpleDemoDataReference.png = Documentation\CNTK-TechReport\figures\SimpleDemoDataReference.png
+		Documentation\CNTK-TechReport\figures\SimpleDemoErrorRateReference.png = Documentation\CNTK-TechReport\figures\SimpleDemoErrorRateReference.png
+		Documentation\CNTK-TechReport\figures\SimpleDemoOutputReference.png = Documentation\CNTK-TechReport\figures\SimpleDemoOutputReference.png
+		Documentation\CNTK-TechReport\figures\simpleRNN.png = Documentation\CNTK-TechReport\figures\simpleRNN.png
+		Documentation\CNTK-TechReport\figures\SpeechErrorRate.png = Documentation\CNTK-TechReport\figures\SpeechErrorRate.png
+	EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Other", "Other", "{39E42C4B-A078-4CA4-9D92-B883D8129601}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|x64 = Debug|x64
+		Release|x64 = Release|x64
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}.Debug|x64.ActiveCfg = Debug|x64
+		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}.Debug|x64.Build.0 = Debug|x64
+		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}.Release|x64.ActiveCfg = Release|x64
+		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}.Release|x64.Build.0 = Release|x64
+		{E6F26F9A-FF64-4F0A-B749-CD309EE357EE}.Debug|x64.ActiveCfg = Debug|x64
+		{E6F26F9A-FF64-4F0A-B749-CD309EE357EE}.Debug|x64.Build.0 = Debug|x64
+		{E6F26F9A-FF64-4F0A-B749-CD309EE357EE}.Release|x64.ActiveCfg = Release|x64
+		{E6F26F9A-FF64-4F0A-B749-CD309EE357EE}.Release|x64.Build.0 = Release|x64
+		{6CEE834A-8104-46A8-8902-64C81BD7928F}.Debug|x64.ActiveCfg = Debug|x64
+		{6CEE834A-8104-46A8-8902-64C81BD7928F}.Debug|x64.Build.0 = Debug|x64
+		{6CEE834A-8104-46A8-8902-64C81BD7928F}.Release|x64.ActiveCfg = Release|x64
+		{33D2FD22-DEF2-4507-A58A-368F641AEBE5}.Debug|x64.ActiveCfg = Debug|x64
+		{33D2FD22-DEF2-4507-A58A-368F641AEBE5}.Debug|x64.Build.0 = Debug|x64
+		{33D2FD22-DEF2-4507-A58A-368F641AEBE5}.Release|x64.ActiveCfg = Release|x64
+		{33D2FD22-DEF2-4507-A58A-368F641AEBE5}.Release|x64.Build.0 = Release|x64
+		{668BEED5-AC07-4F35-B3AE-EE65A7F9C976}.Debug|x64.ActiveCfg = Debug|x64
+		{668BEED5-AC07-4F35-B3AE-EE65A7F9C976}.Debug|x64.Build.0 = Debug|x64
+		{668BEED5-AC07-4F35-B3AE-EE65A7F9C976}.Release|x64.ActiveCfg = Release|x64
+		{E6646FFE-3588-4276-8A15-8D65C22711C1}.Debug|x64.ActiveCfg = Debug|x64
+		{E6646FFE-3588-4276-8A15-8D65C22711C1}.Debug|x64.Build.0 = Debug|x64
+		{E6646FFE-3588-4276-8A15-8D65C22711C1}.Release|x64.ActiveCfg = Release|x64
+		{E6646FFE-3588-4276-8A15-8D65C22711C1}.Release|x64.Build.0 = Release|x64
+		{1D5787D4-52E4-45DB-951B-82F220EE0C6A}.Debug|x64.ActiveCfg = Debug|x64
+		{1D5787D4-52E4-45DB-951B-82F220EE0C6A}.Debug|x64.Build.0 = Debug|x64
+		{1D5787D4-52E4-45DB-951B-82F220EE0C6A}.Release|x64.ActiveCfg = Release|x64
+		{1D5787D4-52E4-45DB-951B-82F220EE0C6A}.Release|x64.Build.0 = Release|x64
+		{62836DC1-DF77-4B98-BF2D-45C943B7DDC6}.Debug|x64.ActiveCfg = Debug|x64
+		{62836DC1-DF77-4B98-BF2D-45C943B7DDC6}.Debug|x64.Build.0 = Debug|x64
+		{62836DC1-DF77-4B98-BF2D-45C943B7DDC6}.Release|x64.ActiveCfg = Release|x64
+		{62836DC1-DF77-4B98-BF2D-45C943B7DDC6}.Release|x64.Build.0 = Release|x64
+		{482999D1-B7E2-466E-9F8D-2119F93EAFD9}.Debug|x64.ActiveCfg = Debug|x64
+		{482999D1-B7E2-466E-9F8D-2119F93EAFD9}.Debug|x64.Build.0 = Debug|x64
+		{482999D1-B7E2-466E-9F8D-2119F93EAFD9}.Release|x64.ActiveCfg = Release|x64
+		{482999D1-B7E2-466E-9F8D-2119F93EAFD9}.Release|x64.Build.0 = Release|x64
+		{0F30EBCF-09F3-4EED-BF54-4214BCE53FEC}.Debug|x64.ActiveCfg = Debug|x64
+		{0F30EBCF-09F3-4EED-BF54-4214BCE53FEC}.Debug|x64.Build.0 = Debug|x64
+		{0F30EBCF-09F3-4EED-BF54-4214BCE53FEC}.Release|x64.ActiveCfg = Release|x64
+		{B3DD765E-694E-4494-BAD7-37BBF2942517}.Debug|x64.ActiveCfg = Debug|x64
+		{B3DD765E-694E-4494-BAD7-37BBF2942517}.Debug|x64.Build.0 = Debug|x64
+		{B3DD765E-694E-4494-BAD7-37BBF2942517}.Release|x64.ActiveCfg = Release|x64
+		{B3DD765E-694E-4494-BAD7-37BBF2942517}.Release|x64.Build.0 = Release|x64
+		{9A2F2441-5972-4EA8-9215-4119FCE0FB68}.Debug|x64.ActiveCfg = Debug|x64
+		{9A2F2441-5972-4EA8-9215-4119FCE0FB68}.Debug|x64.Build.0 = Debug|x64
+		{9A2F2441-5972-4EA8-9215-4119FCE0FB68}.Release|x64.ActiveCfg = Release|x64
+		{9A2F2441-5972-4EA8-9215-4119FCE0FB68}.Release|x64.Build.0 = Release|x64
+		{014DA766-B37B-4581-BC26-963EA5507931}.Debug|x64.ActiveCfg = Debug|x64
+		{014DA766-B37B-4581-BC26-963EA5507931}.Debug|x64.Build.0 = Debug|x64
+		{014DA766-B37B-4581-BC26-963EA5507931}.Release|x64.ActiveCfg = Release|x64
+		{014DA766-B37B-4581-BC26-963EA5507931}.Release|x64.Build.0 = Release|x64
+		{D667AF32-028A-4A5D-BE19-F46776F0F6B2}.Debug|x64.ActiveCfg = Debug|x64
+		{D667AF32-028A-4A5D-BE19-F46776F0F6B2}.Debug|x64.Build.0 = Debug|x64
+		{D667AF32-028A-4A5D-BE19-F46776F0F6B2}.Release|x64.ActiveCfg = Release|x64
+		{D667AF32-028A-4A5D-BE19-F46776F0F6B2}.Release|x64.Build.0 = Release|x64
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+	GlobalSection(NestedProjects) = preSolution
+		{E6F26F9A-FF64-4F0A-B749-CD309EE357EE} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
+		{482999D1-B7E2-466E-9F8D-2119F93EAFD9} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
+		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
+		{B3DD765E-694E-4494-BAD7-37BBF2942517} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
+		{6CEE834A-8104-46A8-8902-64C81BD7928F} = {D45DF403-6781-444E-B654-A96868C5BE68}
+		{668BEED5-AC07-4F35-B3AE-EE65A7F9C976} = {D45DF403-6781-444E-B654-A96868C5BE68}
+		{0F30EBCF-09F3-4EED-BF54-4214BCE53FEC} = {D45DF403-6781-444E-B654-A96868C5BE68}
+		{E6646FFE-3588-4276-8A15-8D65C22711C1} = {33EBFE78-A1A8-4961-8938-92A271941F94}
+		{1D5787D4-52E4-45DB-951B-82F220EE0C6A} = {33EBFE78-A1A8-4961-8938-92A271941F94}
+		{62836DC1-DF77-4B98-BF2D-45C943B7DDC6} = {33EBFE78-A1A8-4961-8938-92A271941F94}
+		{33D2FD22-DEF2-4507-A58A-368F641AEBE5} = {33EBFE78-A1A8-4961-8938-92A271941F94}
+		{9A2F2441-5972-4EA8-9215-4119FCE0FB68} = {33EBFE78-A1A8-4961-8938-92A271941F94}
+		{014DA766-B37B-4581-BC26-963EA5507931} = {33EBFE78-A1A8-4961-8938-92A271941F94}
+		{D667AF32-028A-4A5D-BE19-F46776F0F6B2} = {33EBFE78-A1A8-4961-8938-92A271941F94}
+		{3ED0465D-23E7-4855-9694-F788717B6533} = {39E42C4B-A078-4CA4-9D92-B883D8129601}
+		{98D2C32B-0C1F-4E19-A626-65F7BA4600CF} = {065AF55D-AF02-448B-BFCD-52619FDA4BD0}
+		{EA67F51F-1FE8-462D-9F3E-01161685AD59} = {065AF55D-AF02-448B-BFCD-52619FDA4BD0}
+		{DE1A06BA-EC5C-4E0D-BCA8-3EA555310C58} = {065AF55D-AF02-448B-BFCD-52619FDA4BD0}
+		{63024704-A2D7-497E-AD4B-5C10C6AA1374} = {065AF55D-AF02-448B-BFCD-52619FDA4BD0}
+		{F9BEB27E-8AF5-464E-8D45-0000D5AFA2D3} = {EA67F51F-1FE8-462D-9F3E-01161685AD59}
+		{889C1CCF-92B3-450B-B00D-FC9A9D5BE464} = {EA67F51F-1FE8-462D-9F3E-01161685AD59}
+	EndGlobalSection
+EndGlobal
--- a/CNTKSolution/CNTKSolution.sln
+++ b/CNTKSolution/CNTKSolution.sln
@ -1,154 +0,0 @@
-
-Microsoft Visual Studio Solution File, Format Version 12.00
-# Visual Studio 2013
-VisualStudioVersion = 12.0.21005.1
-MinimumVisualStudioVersion = 10.0.40219.1
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKMath", "..\Math\Math\Math.vcxproj", "{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}"
-	ProjectSection(ProjectDependencies) = postProject
-		{B3DD765E-694E-4494-BAD7-37BBF2942517} = {B3DD765E-694E-4494-BAD7-37BBF2942517}
-	EndProjectSection
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "cn", "..\MachineLearning\cn\cn.vcxproj", "{E6F26F9A-FF64-4F0A-B749-CD309EE357EE}"
-	ProjectSection(ProjectDependencies) = postProject
-		{33D2FD22-DEF2-4507-A58A-368F641AEBE5} = {33D2FD22-DEF2-4507-A58A-368F641AEBE5}
-		{9A2F2441-5972-4EA8-9215-4119FCE0FB68} = {9A2F2441-5972-4EA8-9215-4119FCE0FB68}
-		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
-		{B3DD765E-694E-4494-BAD7-37BBF2942517} = {B3DD765E-694E-4494-BAD7-37BBF2942517}
-		{62836DC1-DF77-4B98-BF2D-45C943B7DDC6} = {62836DC1-DF77-4B98-BF2D-45C943B7DDC6}
-		{1D5787D4-52E4-45DB-951B-82F220EE0C6A} = {1D5787D4-52E4-45DB-951B-82F220EE0C6A}
-		{E6646FFE-3588-4276-8A15-8D65C22711C1} = {E6646FFE-3588-4276-8A15-8D65C22711C1}
-	EndProjectSection
-EndProject
-Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Unit Tests", "Unit Tests", "{D45DF403-6781-444E-B654-A96868C5BE68}"
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKMathTest", "..\Math\CNTKMathTest\CNTKMathTest.vcxproj", "{6CEE834A-8104-46A8-8902-64C81BD7928F}"
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "HTKMLFReader", "..\DataReader\HTKMLFReader\HTKMLFReader.vcxproj", "{33D2FD22-DEF2-4507-A58A-368F641AEBE5}"
-	ProjectSection(ProjectDependencies) = postProject
-		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
-	EndProjectSection
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MathPerformanceTests", "..\Math\MathPerformanceTests\MathPerformanceTests.vcxproj", "{668BEED5-AC07-4F35-B3AE-EE65A7F9C976}"
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "UCIFastReader", "..\DataReader\UCIFastReader\UCIFastReader.vcxproj", "{E6646FFE-3588-4276-8A15-8D65C22711C1}"
-	ProjectSection(ProjectDependencies) = postProject
-		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
-		{1D5787D4-52E4-45DB-951B-82F220EE0C6A} = {1D5787D4-52E4-45DB-951B-82F220EE0C6A}
-	EndProjectSection
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "BinaryReader", "..\DataReader\BinaryReader\BinaryReader.vcxproj", "{1D5787D4-52E4-45DB-951B-82F220EE0C6A}"
-	ProjectSection(ProjectDependencies) = postProject
-		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
-	EndProjectSection
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LUSequenceReader", "..\DataReader\LUSequenceReader\LUSequenceReader.vcxproj", "{62836DC1-DF77-4B98-BF2D-45C943B7DDC6}"
-	ProjectSection(ProjectDependencies) = postProject
-		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
-	EndProjectSection
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKEval", "..\MachineLearning\CNTKEval\CNTKEval.vcxproj", "{482999D1-B7E2-466E-9F8D-2119F93EAFD9}"
-	ProjectSection(ProjectDependencies) = postProject
-		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
-	EndProjectSection
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKEvalTest", "..\MachineLearning\CNTKEval\CNTKEvalTest\CNTKEvalTest.vcxproj", "{0F30EBCF-09F3-4EED-BF54-4214BCE53FEC}"
-	ProjectSection(ProjectDependencies) = postProject
-		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
-		{482999D1-B7E2-466E-9F8D-2119F93EAFD9} = {482999D1-B7E2-466E-9F8D-2119F93EAFD9}
-	EndProjectSection
-EndProject
-Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Reader Plugins", "Reader Plugins", "{33EBFE78-A1A8-4961-8938-92A271941F94}"
-EndProject
-Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "CNTK", "CNTK", "{DD043083-71A4-409A-AA91-F9C548DCF7EC}"
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKMathCUDA", "..\Math\Math\CNTKMathCUDA.vcxproj", "{B3DD765E-694E-4494-BAD7-37BBF2942517}"
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LMSequenceReader", "..\DataReader\LMSequenceReader\LMSequenceReader.vcxproj", "{9A2F2441-5972-4EA8-9215-4119FCE0FB68}"
-	ProjectSection(ProjectDependencies) = postProject
-		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}
-	EndProjectSection
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "DSSMReader", "..\DataReader\DSSMReader\DSSMReader.vcxproj", "{014DA766-B37B-4581-BC26-963EA5507931}"
-EndProject
-Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "LibSVMBinaryReader", "..\DataReader\LibSVMBinaryReader\LibSVMBinaryReader.vcxproj", "{D667AF32-028A-4A5D-BE19-F46776F0F6B2}"
-EndProject
-Global
-	GlobalSection(SolutionConfigurationPlatforms) = preSolution
-		Debug|x64 = Debug|x64
-		Release|x64 = Release|x64
-	EndGlobalSection
-	GlobalSection(ProjectConfigurationPlatforms) = postSolution
-		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}.Debug|x64.ActiveCfg = Debug|x64
-		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}.Debug|x64.Build.0 = Debug|x64
-		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}.Release|x64.ActiveCfg = Release|x64
-		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5}.Release|x64.Build.0 = Release|x64
-		{E6F26F9A-FF64-4F0A-B749-CD309EE357EE}.Debug|x64.ActiveCfg = Debug|x64
-		{E6F26F9A-FF64-4F0A-B749-CD309EE357EE}.Debug|x64.Build.0 = Debug|x64
-		{E6F26F9A-FF64-4F0A-B749-CD309EE357EE}.Release|x64.ActiveCfg = Release|x64
-		{E6F26F9A-FF64-4F0A-B749-CD309EE357EE}.Release|x64.Build.0 = Release|x64
-		{6CEE834A-8104-46A8-8902-64C81BD7928F}.Debug|x64.ActiveCfg = Debug|x64
-		{6CEE834A-8104-46A8-8902-64C81BD7928F}.Debug|x64.Build.0 = Debug|x64
-		{6CEE834A-8104-46A8-8902-64C81BD7928F}.Release|x64.ActiveCfg = Release|x64
-		{33D2FD22-DEF2-4507-A58A-368F641AEBE5}.Debug|x64.ActiveCfg = Debug|x64
-		{33D2FD22-DEF2-4507-A58A-368F641AEBE5}.Debug|x64.Build.0 = Debug|x64
-		{33D2FD22-DEF2-4507-A58A-368F641AEBE5}.Release|x64.ActiveCfg = Release|x64
-		{33D2FD22-DEF2-4507-A58A-368F641AEBE5}.Release|x64.Build.0 = Release|x64
-		{668BEED5-AC07-4F35-B3AE-EE65A7F9C976}.Debug|x64.ActiveCfg = Debug|x64
-		{668BEED5-AC07-4F35-B3AE-EE65A7F9C976}.Debug|x64.Build.0 = Debug|x64
-		{668BEED5-AC07-4F35-B3AE-EE65A7F9C976}.Release|x64.ActiveCfg = Release|x64
-		{E6646FFE-3588-4276-8A15-8D65C22711C1}.Debug|x64.ActiveCfg = Debug|x64
-		{E6646FFE-3588-4276-8A15-8D65C22711C1}.Debug|x64.Build.0 = Debug|x64
-		{E6646FFE-3588-4276-8A15-8D65C22711C1}.Release|x64.ActiveCfg = Release|x64
-		{E6646FFE-3588-4276-8A15-8D65C22711C1}.Release|x64.Build.0 = Release|x64
-		{1D5787D4-52E4-45DB-951B-82F220EE0C6A}.Debug|x64.ActiveCfg = Debug|x64
-		{1D5787D4-52E4-45DB-951B-82F220EE0C6A}.Debug|x64.Build.0 = Debug|x64
-		{1D5787D4-52E4-45DB-951B-82F220EE0C6A}.Release|x64.ActiveCfg = Release|x64
-		{1D5787D4-52E4-45DB-951B-82F220EE0C6A}.Release|x64.Build.0 = Release|x64
-		{62836DC1-DF77-4B98-BF2D-45C943B7DDC6}.Debug|x64.ActiveCfg = Debug|x64
-		{62836DC1-DF77-4B98-BF2D-45C943B7DDC6}.Debug|x64.Build.0 = Debug|x64
-		{62836DC1-DF77-4B98-BF2D-45C943B7DDC6}.Release|x64.ActiveCfg = Release|x64
-		{62836DC1-DF77-4B98-BF2D-45C943B7DDC6}.Release|x64.Build.0 = Release|x64
-		{482999D1-B7E2-466E-9F8D-2119F93EAFD9}.Debug|x64.ActiveCfg = Debug|x64
-		{482999D1-B7E2-466E-9F8D-2119F93EAFD9}.Debug|x64.Build.0 = Debug|x64
-		{482999D1-B7E2-466E-9F8D-2119F93EAFD9}.Release|x64.ActiveCfg = Release|x64
-		{482999D1-B7E2-466E-9F8D-2119F93EAFD9}.Release|x64.Build.0 = Release|x64
-		{0F30EBCF-09F3-4EED-BF54-4214BCE53FEC}.Debug|x64.ActiveCfg = Debug|x64
-		{0F30EBCF-09F3-4EED-BF54-4214BCE53FEC}.Debug|x64.Build.0 = Debug|x64
-		{0F30EBCF-09F3-4EED-BF54-4214BCE53FEC}.Release|x64.ActiveCfg = Release|x64
-		{B3DD765E-694E-4494-BAD7-37BBF2942517}.Debug|x64.ActiveCfg = Debug|x64
-		{B3DD765E-694E-4494-BAD7-37BBF2942517}.Debug|x64.Build.0 = Debug|x64
-		{B3DD765E-694E-4494-BAD7-37BBF2942517}.Release|x64.ActiveCfg = Release|x64
-		{B3DD765E-694E-4494-BAD7-37BBF2942517}.Release|x64.Build.0 = Release|x64
-		{9A2F2441-5972-4EA8-9215-4119FCE0FB68}.Debug|x64.ActiveCfg = Debug|x64
-		{9A2F2441-5972-4EA8-9215-4119FCE0FB68}.Debug|x64.Build.0 = Debug|x64
-		{9A2F2441-5972-4EA8-9215-4119FCE0FB68}.Release|x64.ActiveCfg = Release|x64
-		{9A2F2441-5972-4EA8-9215-4119FCE0FB68}.Release|x64.Build.0 = Release|x64
-		{014DA766-B37B-4581-BC26-963EA5507931}.Debug|x64.ActiveCfg = Debug|x64
-		{014DA766-B37B-4581-BC26-963EA5507931}.Debug|x64.Build.0 = Debug|x64
-		{014DA766-B37B-4581-BC26-963EA5507931}.Release|x64.ActiveCfg = Release|x64
-		{014DA766-B37B-4581-BC26-963EA5507931}.Release|x64.Build.0 = Release|x64
-		{D667AF32-028A-4A5D-BE19-F46776F0F6B2}.Debug|x64.ActiveCfg = Debug|x64
-		{D667AF32-028A-4A5D-BE19-F46776F0F6B2}.Debug|x64.Build.0 = Debug|x64
-		{D667AF32-028A-4A5D-BE19-F46776F0F6B2}.Release|x64.ActiveCfg = Release|x64
-		{D667AF32-028A-4A5D-BE19-F46776F0F6B2}.Release|x64.Build.0 = Release|x64
-	EndGlobalSection
-	GlobalSection(SolutionProperties) = preSolution
-		HideSolutionNode = FALSE
-	EndGlobalSection
-	GlobalSection(NestedProjects) = preSolution
-		{E6F26F9A-FF64-4F0A-B749-CD309EE357EE} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
-		{482999D1-B7E2-466E-9F8D-2119F93EAFD9} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
-		{60BDB847-D0C4-4FD3-A947-0C15C08BCDB5} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
-		{B3DD765E-694E-4494-BAD7-37BBF2942517} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
-		{6CEE834A-8104-46A8-8902-64C81BD7928F} = {D45DF403-6781-444E-B654-A96868C5BE68}
-		{668BEED5-AC07-4F35-B3AE-EE65A7F9C976} = {D45DF403-6781-444E-B654-A96868C5BE68}
-		{0F30EBCF-09F3-4EED-BF54-4214BCE53FEC} = {D45DF403-6781-444E-B654-A96868C5BE68}
-		{E6646FFE-3588-4276-8A15-8D65C22711C1} = {33EBFE78-A1A8-4961-8938-92A271941F94}
-		{1D5787D4-52E4-45DB-951B-82F220EE0C6A} = {33EBFE78-A1A8-4961-8938-92A271941F94}
-		{62836DC1-DF77-4B98-BF2D-45C943B7DDC6} = {33EBFE78-A1A8-4961-8938-92A271941F94}
-		{33D2FD22-DEF2-4507-A58A-368F641AEBE5} = {33EBFE78-A1A8-4961-8938-92A271941F94}
-		{9A2F2441-5972-4EA8-9215-4119FCE0FB68} = {33EBFE78-A1A8-4961-8938-92A271941F94}
-		{014DA766-B37B-4581-BC26-963EA5507931} = {33EBFE78-A1A8-4961-8938-92A271941F94}
-		{D667AF32-028A-4A5D-BE19-F46776F0F6B2} = {33EBFE78-A1A8-4961-8938-92A271941F94}
-	EndGlobalSection
-EndGlobal
--- a/CheckInSuites/ASR/config/README.txt
+++ b/CheckInSuites/ASR/config/README.txt
@ -4,7 +4,7 @@ These scripts are similar to those in the TIMIT folder of the ExampleSetups exce
 The globals_cpu.config and globals_gpu.config differ only in which device they use and where the results are stored. 

 To test on CPU:
-cn.exe configFile=globals_cpu.config+<DesiredConfigFile>
+cntk configFile=globals_cpu.config+<DesiredConfigFile>

 To test on GPU:
-cn.exe configFile=globals_gpu.config+<DesiredConfigFile>
+cntk configFile=globals_gpu.config+<DesiredConfigFile>
--- a/CheckInSuites/ASR/config/globals_cpu.config
+++ b/CheckInSuites/ASR/config/globals_cpu.config
@ -1,4 +1,4 @@
-WorkDir=D:\gitroot\cntk\ExampleSetups\ASR\TIMIT
+WorkDir=..\..\..\ExampleSetups\ASR\TIMIT    # assumes current directory is location of this config file; override on cmd line if not correct
 LibDir=$WorkDir$\lib
 ScpDir=$LibDir$\scp
 FBankScpShort=$ScpDir$\TIMIT.train.scp.fbank.fullpath.100
--- a/CheckInSuites/ASR/config/globals_gpu.config
+++ b/CheckInSuites/ASR/config/globals_gpu.config
@ -1,4 +1,4 @@
-WorkDir=D:\gitroot\cntk\ExampleSetups\ASR\TIMIT
+WorkDir=..\..\..\ExampleSetups\ASR\TIMIT    # assumes current directory is location of this config file; override on cmd line if not correct
 LibDir=$WorkDir$\lib
 ScpDir=$LibDir$\scp
 FBankScpShort=$ScpDir$\TIMIT.train.scp.fbank.fullpath.100
--- a/CheckInSuites/ASR/config/runall.bat
+++ b/CheckInSuites/ASR/config/runall.bat
@ -1,14 +1,14 @@
 set cnpath=d:\gitroot\cntk\CNTKSolution\x64\Release
 set proc=%1
 echo on
-%cnpath%\cn.exe configFile=globals_%proc%.config+TIMIT_TrainSimpleNetwork.config
-%cnpath%\cn.exe configFile=globals_%proc%.config+TIMIT_TrainNDLNetwork.config
-%cnpath%\cn.exe configFile=globals_%proc%.config+TIMIT_TrainAutoEncoder.config
-%cnpath%\cn.exe configFile=globals_%proc%.config+TIMIT_TrainMultiInput.config
-%cnpath%\cn.exe configFile=globals_%proc%.config+TIMIT_TrainMultiTask.config
-%cnpath%\cn.exe configFile=globals_%proc%.config+TIMIT_EvalSimpleNetwork.config
-%cnpath%\cn.exe configFile=globals_%proc%.config+TIMIT_WriteScaledLogLike.config
-%cnpath%\cn.exe configFile=globals_%proc%.config+TIMIT_WriteBottleneck.config
+%cnpath%\cntk configFile=globals_%proc%.config+TIMIT_TrainSimpleNetwork.config
+%cnpath%\cntk configFile=globals_%proc%.config+TIMIT_TrainNDLNetwork.config
+%cnpath%\cntk configFile=globals_%proc%.config+TIMIT_TrainAutoEncoder.config
+%cnpath%\cntk configFile=globals_%proc%.config+TIMIT_TrainMultiInput.config
+%cnpath%\cntk configFile=globals_%proc%.config+TIMIT_TrainMultiTask.config
+%cnpath%\cntk configFile=globals_%proc%.config+TIMIT_EvalSimpleNetwork.config
+%cnpath%\cntk configFile=globals_%proc%.config+TIMIT_WriteScaledLogLike.config
+%cnpath%\cntk configFile=globals_%proc%.config+TIMIT_WriteBottleneck.config



--- a/CheckInSuites/SLU/README
+++ b/CheckInSuites/SLU/README
@ -1,4 +1,4 @@
-C:\dev\cntk3\CNTKSolution\x64\Release\cn.exe configFile=globals.config+rnnlu.config
+C:\dev\cntk3\CNTKSolution\x64\Release\cntk configFile=globals.config+rnnlu.config


 # expected results, which has a copy at Expected.log is 
@ -11,6 +11,6 @@ Finished Epoch[3]: [Training Set] Train Loss Per Sample = 3.6568716    EvalErr P
 Finished Epoch[3]: [Validation Set] Train Loss Per Sample = 2.6959986  EvalErr Per Sample = 2.6959986

 del /q c:\temp\exp\atis
-C:\dev\cntk3\CNTKSolution\x64\Release\cn.exe configFile=globals.config+rnnlu.ndl.config
+C:\dev\cntk3\CNTKSolution\x64\Release\cntk configFile=globals.config+rnnlu.ndl.config
 #should have the same output as above using simple network builder. 

--- a/Common/BestGpu.cpp
+++ b/Common/BestGpu.cpp
@ -4,15 +4,21 @@
 // </copyright>
 //

+// This file requires the NVML library. Unfortunately, this library does not install an environment variable for locating it.
+// On Windows, the SDK gets installed to "c:\Program Files\NVIDIA Corporation\GDK\gdk_win7_amd64_release\nvml" (/include, /lib).
+// From the SDK documentation:
+// "The NVML library can be found at: %ProgramW6432%\"NVIDIA Corporation"\NVSMI\ on Windows, but will not be added to the path. To dynamically link to NVML, add this path to the PATH environmental variable. To dynamically load NVML, call LoadLibrary with this path."
+// "On Linux the NVML library will be found on the standard library path. For 64-bit Linux, both the 32-bit and 64-bit NVML libraries will be installed."
+
 #define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms  --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
 #include "Platform.h"
 #include "BestGpu.h"
 #include "commandArgUtil.h" // for ConfigParameters
 #ifndef CPUONLY
 #pragma comment (lib, "cudart.lib")
-#pragma comment (lib, "nvml.lib")
 #include <cuda_runtime.h>
-#include <nvml.h>
+#include <nvml.h>                   // note: expected at "c:\Program Files\NVIDIA Corporation\GDK\gdk_win7_amd64_release\nvml\include" (Windows)
+#pragma comment (lib, "nvml.lib")   // note: expected at "c:\Program Files\NVIDIA Corporation\GDK\gdk_win7_amd64_release\nvml\lib" (Windows)
 #include <vector>
 #endif
 #include "CommonMatrix.h" // for CPUDEVICE and AUTOPLACEMATRIX
@ -63,11 +69,15 @@ enum BestGpuFlags
    bestGpuFavorMemory = 2, // favor memory
    bestGpuFavorUtilization = 4, // favor low utilization
    bestGpuFavorSpeed = 8, // favor fastest processor
+    bestGpuExclusiveLock = 16, // obtain mutex for selected GPU
    bestGpuRequery = 256, // rerun the last query, updating statistics
 };

 class BestGpu
 {
+#ifdef WIN32
+    std::map<int, HANDLE> m_GPUMutex;
+#endif
 private:
    bool m_initialized; // initialized
    bool m_nvmlData; // nvml Data is valid
@ -99,6 +109,7 @@ public:
    static const int AllDevices = -1;  // can be used to specify all GPUs in GetDevices() call
    static const int RequeryDevices = -2;  // Requery refreshing statistics and picking the same number as last query
    std::vector<int> GetDevices(int number = AllDevices, BestGpuFlags flags = bestGpuNormal ); // get multiple devices
+private:
 	bool LockDevice(int deviceID, bool trial=true);
 };
    
@ -167,49 +178,22 @@ DEVICEID_TYPE DeviceFromConfig(const ConfigParameters& config)
            }
        }
 #else
-        std::vector<int> devices = g_bestGpu->GetDevices(1,  bestGpuAvoidSharing);
-		// to return a vector of AVAILABLE devices and sorted by score 
-        deviceId = (DEVICEID_TYPE)devices[0];
-		if (bLockGPU)
-		{
-			if (!g_bestGpu->LockDevice(deviceId, false)) // formally lock it 
-			{
-				string message = msra::strfun::strprintf("DeviceFromConfig: Cannot capture and lock Device %d\n", deviceId);
-				throw std::runtime_error(message.c_str());
-			}
-		}
+        deviceId = (DEVICEID_TYPE)
+            g_bestGpu->GetDevice(BestGpuFlags(bLockGPU ? (bestGpuAvoidSharing | bestGpuExclusiveLock) : bestGpuAvoidSharing));
 #endif
    }
    else if (!_stricmp(val.c_str(), "All"))
    {
-        std::vector<int> devices = g_bestGpu->GetDevices(BestGpu::AllDevices, bestGpuNormal);
+        std::vector<int> devices =
+            g_bestGpu->GetDevices(BestGpu::AllDevices, BestGpuFlags(bLockGPU ? bestGpuNormal | bestGpuExclusiveLock : bestGpuNormal));
        deviceId = (DEVICEID_TYPE)devices[0];
-		if (bLockGPU) {
-			for (auto i : devices)
-			{
-				if (!g_bestGpu->LockDevice(i, false))
-				{
-					string message = msra::strfun::strprintf("DeviceFromConfig: Cannot capture and lock Device %d\n", i);
-					throw std::runtime_error(message.c_str());
-				}				
-			}
-		}
    }
    else if (val.size() == 2 && val[0] == '*' && isdigit(val[1]))
    {
        int number = (int)(val[1] - '0');
-        std::vector<int> devices = g_bestGpu->GetDevices(number, bestGpuNormal);
+        std::vector<int> devices =
+            g_bestGpu->GetDevices(number, BestGpuFlags(bLockGPU ? bestGpuNormal | bestGpuExclusiveLock : bestGpuNormal));
        deviceId = (DEVICEID_TYPE)devices[0];
-		if (bLockGPU){
-			for (size_t i = 0; i < number; i++)
-			{
-				if (!g_bestGpu->LockDevice((int)i, false))
-				{
-					string message = msra::strfun::strprintf("DeviceFromConfig: Cannot capture and lock Device %d\n", i);
-					throw std::runtime_error(message.c_str());
-				}
-			}
-		}
    }
    else
    {
@ -222,19 +206,10 @@ DEVICEID_TYPE DeviceFromConfig(const ConfigParameters& config)
        {
            argvector<int> allowed = arr;
            g_bestGpu->SetAllowedDevices(allowed);
-            std::vector<int> devices = g_bestGpu->GetDevices(BestGpu::AllDevices, bestGpuNormal);
+
+            std::vector<int> devices =
+                g_bestGpu->GetDevices(BestGpu::AllDevices, BestGpuFlags(bLockGPU ? bestGpuNormal | bestGpuExclusiveLock : bestGpuNormal));
            deviceId = (DEVICEID_TYPE)devices[0];
-			if (bLockGPU)
-			{
-				for (auto i : devices)
-				{
-					if (!g_bestGpu->LockDevice(i, false))
-					{
-						string message = msra::strfun::strprintf("DeviceFromConfig: Cannot capture and lock Device %d\n", i);
-						throw std::runtime_error(message.c_str());
-					}
-				}
-			}
        }
    }
    return deviceId;
@ -310,8 +285,17 @@ void BestGpu::Init()
        return;

    //get the count of objects
-    //cudaError_t err =
+    cudaError_t err =
    cudaGetDeviceCount(&m_deviceCount);
+    // TODO: use CUDA_CALL here
+    if (err != cudaSuccess)
+    {
+        const char* errmsg = cudaGetErrorString(err);
+        fprintf(stderr, "!!!!!!!!CUDA EXCEPTION: %s\n", errmsg);
+        throw std::runtime_error(errmsg);
+    }
+
+

    ProcessorData pdEmpty = { 0 };
    for (int i = 0; i < m_deviceCount; i++)
@ -342,6 +326,12 @@ BestGpu::~BestGpu()
    {
        nvmlShutdown();
    }
+#ifdef WIN32
+    for (auto it : m_GPUMutex)
+    {
+        ::CloseHandle(it.second);
+    }
+#endif
 }

 // GetNvmlData - Get data from the Nvidia Management Library
@ -504,6 +494,17 @@ std::vector<int> BestGpu::GetDevices(int number, BestGpuFlags p_bestFlags)
            break;
    }

+#ifdef WIN32
+    // this named global mutex lets only one process at a time on a machine query and lock GPUs (it is only waited on when devices will be locked)
+    wchar_t buffer[80];
+    wsprintf(buffer, L"Global\\DBN.exe GPGPU querying lock");
+    HANDLE h = ::CreateMutex(NULL, FALSE, buffer);
+    if (h == NULL)  // failure  --this should not really happen
+        throw std::runtime_error("DeviceFromConfig: unexpected failure");
+    if (bestFlags & bestGpuExclusiveLock) // only wait if we will be locking devices
+        ::WaitForSingleObject(h, INFINITE);
+#endif
+    
    {
 	// even if user do not want to lock the GPU, we still need to check whether a particular GPU is locked or not, 
 	// to respect other users' exclusive lock.
@ -534,6 +535,17 @@ std::vector<int> BestGpu::GetDevices(int number, BestGpuFlags p_bestFlags)
        best.push_back(-1);
    }

+    for (int z = 0; z<best.size() && z < number; z++)
+    {
+        LockDevice(best[z], false);
+    }
+
+#ifdef WIN32
+    // we have our device - let other processes play now
+    ::ReleaseMutex(h);
+    ::CloseHandle(h);
+#endif
+
    return best; // return the array of the best GPUs
 }

@ -680,6 +692,7 @@ bool BestGpu::LockDevice(int deviceID, bool trial)
 		{
 			fprintf(stderr, "LockDevice: Capture device %d and lock it for exclusive use\n", deviceID);
 		}
+        m_GPUMutex[deviceID] = h;
 		return true;
 	}
 	::CloseHandle(h);
--- a/Common/ConfigFile.cpp
+++ b/Common/ConfigFile.cpp
@ -58,8 +58,12 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                }
            }
        }
+        // now, configString is a concatenation of lines, including parameters from the command line, with comments stripped

+        // expand any lines of the form include=
        configString = config.ResolveIncludeStatements(configString, resolvedConfigFiles);
+
+        // convert into a ConfigDictionary--top-level expressions of the form var=val; if val is a block in braces, it is kept verbatim (not parsed inside)
        config.FileParse(configString);
        return configString;
    }
@ -182,7 +186,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        return ReadConfigFile(msra::strfun::utf16(filePath));
    }

-    // ReadConfigFile - read a configuration file, and return as a string
+    // ReadConfigFile - read a configuration file, and return all lines, stripped of comments, concatenated by newlines, as one long string (no other processing, expansion etc.)
    // filePath - the path to the config file to read
    // returns: a string with the concatentated file contents
    std::string ConfigParser::ReadConfigFile(const std::wstring &filePath)
@ -205,7 +209,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        while (!file.IsEOF())
        {
            file.GetLine(str);
-            str = PreprocessConfigLine(str);
+            str = StripComments(str);
            if (str != "")
            {
                configFile.append(str);
--- a/Common/DataReader.cpp
+++ b/Common/DataReader.cpp
@ -8,8 +8,9 @@

 #include "stdafx.h"
 #define DATAREADER_LOCAL
-#include "basetypes.h"
+#include "Basics.h"
 #include "DataReader.h"
+#include "commandArgUtil.h"

 namespace Microsoft { namespace MSR { namespace CNTK {

--- a/Common/DataWriter.cpp
+++ b/Common/DataWriter.cpp
@ -9,6 +9,7 @@
 #include "stdafx.h"
 #define DATAWRITER_LOCAL
 #include "DataWriter.h"
+#include "commandArgUtil.h"

 namespace Microsoft { namespace MSR { namespace CNTK {

--- a/Common/Eval.cpp
+++ b/Common/Eval.cpp
@ -9,7 +9,7 @@
 #define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms  --add this at the top of all CPP files that give "function or variable may be unsafe" warnings

 #include "stdafx.h"
-#include "basetypes.h"
+#include "Basics.h"
 #define EVAL_LOCAL
 #include "Eval.h"

--- a/Common/File.cpp
+++ b/Common/File.cpp
@ -8,7 +8,7 @@
 #define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms  --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
 #endif

-#include "basetypes.h"
+#include "Basics.h"
 #define FORMAT_SPECIALIZE // to get the specialized version of the format routines
 #include "fileutil.h"
 #include "File.h"
--- a/Common/FileTest/FileTest.cpp
+++ b/Common/FileTest/FileTest.cpp
@ -7,7 +7,7 @@
 //

 #include "stdafx.h"
-#include "basetypes.h"
+#include "Basics.h"
 #include "fileutil.h"
 #include "FileTest.h"
 #include "File.h"
--- a/Common/Include/Basics.h
+++ b/Common/Include/Basics.h
@ -0,0 +1,38 @@
+// Basics.h -- some shared generally useful pieces of code used by CNTK
+//
+// We also include a simple "emulation" layer for some proprietary MSVC CRT functions.
+
+#pragma once
+
+#ifndef _BASICS_H_
+#define _BASICS_H_
+
+#include "basetypes.h"  // TODO: gradually move over here all that's needed of basetypes.h, then remove basetypes.h.
+
+#define TWO_PI 6.283185307f // TODO: find the official standards-conforming definition of this and use it instead
+
+namespace Microsoft { namespace MSR { namespace CNTK {
+
+    using namespace std;
+
+    // Case-insensitive 'less' comparator (strict weak ordering), usable as the Compare argument of ordered containers such as std::map.
+    struct nocase_compare
+    {
+        // Each overload returns true iff 'left' orders before 'right' when case is ignored; strings equal up to case yield false both ways.
+        // std::string version; const-qualified because ordered containers may invoke the comparator through a const object.
+        bool operator()(const std::string& left, const std::string& right) const { return _stricmp(left.c_str(), right.c_str()) < 0; }
+        // std::wstring version of 'less' function, used in non-config classes
+        bool operator()(const std::wstring& left, const std::wstring& right) const { return _wcsicmp(left.c_str(), right.c_str()) < 0; }
+    };
+
+}}}
+
+// ===========================================================================
+// emulation of some MSVC proprietary CRT
+// ===========================================================================
+
+#ifndef _MSC_VER
+static inline int _wsystem(const wchar_t *command) { return system(msra::strfun::utf8(command).c_str()); }   // non-MSVC stand-in: forwards to system(); presumably utf8() converts the wide string to UTF-8 -- TODO confirm
+#endif
+
+#endif // _BASICS_H_
--- a/Common/Include/DataReader.h
+++ b/Common/Include/DataReader.h
@ -25,8 +25,7 @@
 #include "Matrix.h"
 #include <map>
 #include <string>
-#include "basetypes.h"
-#include "commandArgUtil.h"
+#include "Basics.h"

 namespace Microsoft { namespace MSR { namespace CNTK {

--- a/Common/Include/DataWriter.h
+++ b/Common/Include/DataWriter.h
@ -23,11 +23,10 @@
 #define DATAWRITER_API
 #endif

-#include "basetypes.h"
+#include "Basics.h"
 #include "Matrix.h"
 #include <map>
 #include <string>
-#include "commandArgUtil.h"


 namespace Microsoft { namespace MSR { namespace CNTK {
--- a/Common/Include/Eval.h
+++ b/Common/Include/Eval.h
@ -23,7 +23,7 @@
 #define EVAL_API
 #endif

-#include "basetypes.h"
+#include "Basics.h"
 #include <map>
 #include <vector>
 #include <string>
@ -68,8 +68,6 @@ class Eval : public IEvaluateModel<ElemType>, protected Plugin
 private:
    IEvaluateModel<ElemType> *m_eval;  // evaluation class pointer

-    virtual void Init(const std::string& config);
-
    void GetEvalClass(const std::string& config);

    // Destroy - cleanup and remove this class
@ -98,7 +96,7 @@ public:
    // inputs - map from node name to input vector
    // outputs - map from node name to output vector, outputs vectors need to be preallocated by caller, sizing will happen during evaluation
    virtual void Evaluate(std::map<std::wstring, std::vector<ElemType>*>& inputs, std::map<std::wstring, std::vector<ElemType>*>& outputs);
-
+    virtual void Init(const std::string& config);
    virtual void ResetState();
 };

--- a/Common/Include/basetypes.h
+++ b/Common/Include/basetypes.h
@ -1146,8 +1146,4 @@ static inline bool comparator(const pair<int, F>& l, const pair<int, F>& r)
    return l.second > r.second;
 }

-/// debug code for machine translation setup
-//#define DBG_SMT
-//#define DEBUG_DECODER
-
 #endif    // _BASETYPES_
--- a/Common/Include/commandArgUtil.h
+++ b/Common/Include/commandArgUtil.h
@ -4,7 +4,7 @@
 // </copyright>
 //
 #pragma once
-#include "basetypes.h"
+#include "Basics.h"
 #include <vector>
 #include <string>
 #include <map>
@ -31,24 +31,6 @@ namespace Microsoft { namespace MSR { namespace CNTK {
    static const std::size_t openBraceVarSize = openBraceVar.size();
    static const std::size_t closingBraceVarSize = openBraceVar.size();

-    // string comparison class, so we do case insensitive compares
-    class nocase_compare
-    {
-    public:
-        // std::string version of 'less' function
-        bool operator()(const std::string& left, const std::string& right)
-        {
-            // return false for equivalent, true for different
-            return _stricmp(left.c_str(), right.c_str()) < 0;
-        }
-        // std::wstring version of 'less' function, used in non-config classes
-        bool operator()(const std::wstring& left, const std::wstring& right)
-        {
-            // return false for equivalent, true for different
-            return _wcsicmp(left.c_str(), right.c_str()) < 0;
-        }
-    };
-
    // Trim - trim white space off the start and end of the string
    // str - string to trim
    // NOTE: if the entire string is empty, then the string will be set to an empty string
@ -305,7 +287,15 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            return current;
        }

-        //Parse - Parse the string
+        // Parse - Parse the string; segment string by top-level a=b expressions and call (virtual) ParseValue() on them.
+        // This function is used at lots of places for various purposes.
+        //  - (ConfigParameters from file) config-file parsing passes in expressions of the type a1=b1 \n a2=b2 \n ..., creates a ConfigDictionary entry for each top-level a=b expression, where b can be a block in braces
+        //  - (ConfigParameters) right-hand side that is an array of parameters [ a1=b1; a2=b2 ...], with surrounding braces
+        //  - (ConfigValue) individual values are also parsed
+        //  - (ConfigArray) same as ConfigValue--the array syntax (':') is not parsed here
+        //    The above all allow ';' or newline as a separator
+        //  - (NDLScript)
+        //  - more to be added
        // stringParse - string to parse
        // pos - postion to start parsing at
        void Parse(const std::string& stringParse, std::string::size_type pos=0)
@ -417,14 +407,14 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            while (tokenEnd != npos);
        }

-        // PreprocessConfigLine - This method removes the section of a config line corresponding to a comment.
+        // StripComments - This method removes the section of a config line corresponding to a comment.
        // configLine - The line within a config file to pre-process.
        // returns:
        //      If the entire line is whitespace, or if the entire line is a comment, simply return an empty string.
        //      If there is no comment, simply return the original 'configString'
        //      If there is a comment, remove the part of 'configString' corresponding to the comment
        //      Note that midline comments need to be preceded by whitespace, otherwise they are not treated as comments.
-        std::string PreprocessConfigLine(const std::string &configLine) const
+        std::string StripComments(const std::string &configLine) const
        {
            std::string::size_type pos = configLine.find_first_not_of(" \t");

@ -708,7 +698,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            if (configLine.find_first_of("\n") != std::string::npos)
                throw std::logic_error ("\"ResolveVariablesInSingleLine\" shouldn't be called with a string containing a newline character");

-            std::string newConfigLine = PreprocessConfigLine(configLine);
+            std::string newConfigLine = StripComments(configLine);
            std::size_t start = newConfigLine.find_first_of(openBraceVar);
            std::size_t end = 0;
            while (start != std::string::npos)
--- a/DataReader/UCIFastReader/minibatchsourcehelpers.h
+++ b/DataReader/UCIFastReader/minibatchsourcehelpers.h
@ -8,7 +8,7 @@

 #pragma once

-#include "basetypes.h"
+#include "Basics.h"
 #include <stdio.h>
 #include <vector>
 #include <algorithm>
@ -89,7 +89,7 @@ public:
                    retries++;
                }
            }
-            fprintf (stderr, "randomordering: %d retries for %d elements (%.1f%%) to ensure window condition\n", retries, map.size(), 100.0 * retries / map.size());
+            fprintf (stderr, "randomordering: %zu retries for %zu elements (%.1f%%) to ensure window condition\n", retries, map.size(), 100.0 * retries / map.size());
            // ensure the window condition
            foreach_index (t, map) assert ((size_t) t <= map[t] + randomizationrange/2 && map[t] < (size_t) t + randomizationrange/2);
 #if 0       // and a live check since I don't trust myself here yet
--- a/Common/Include/nvml.h
+++ b/Common/Include/nvml.h
--- a/Common/fileutil.cpp
+++ b/Common/fileutil.cpp
@ -18,7 +18,7 @@
 #endif
 #define _CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES 1
 #endif
-#include "basetypes.h"
+#include "Basics.h"
 #include "fileutil.h"
 #ifdef __unix__
 #include <sys/types.h>
--- a/DataReader/BinaryReader/BinaryReader.h
+++ b/DataReader/BinaryReader/BinaryReader.h
@ -6,6 +6,7 @@
 #pragma once
 #include "DataReader.h"
 #include "DataWriter.h"
+#include "commandArgUtil.h"
 #include <string>
 #include <map>
 #include <vector>
--- a/DataReader/BinaryReader/BinaryReader.vcxproj
+++ b/DataReader/BinaryReader/BinaryReader.vcxproj
@ -1,18 +1,10 @@
 <?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
-    <ProjectConfiguration Include="Debug|Win32">
-      <Configuration>Debug</Configuration>
-      <Platform>Win32</Platform>
-    </ProjectConfiguration>
    <ProjectConfiguration Include="Debug|x64">
      <Configuration>Debug</Configuration>
      <Platform>x64</Platform>
    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|Win32">
-      <Configuration>Release</Configuration>
-      <Platform>Win32</Platform>
-    </ProjectConfiguration>
    <ProjectConfiguration Include="Release|x64">
      <Configuration>Release</Configuration>
      <Platform>x64</Platform>
@ -32,25 +24,12 @@
    <RootNamespace>UCIReader</RootNamespace>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
-    <ConfigurationType>DynamicLibrary</ConfigurationType>
-    <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
-    <CharacterSet>Unicode</CharacterSet>
-  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
    <ConfigurationType>DynamicLibrary</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <PlatformToolset>v120</PlatformToolset>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
-    <ConfigurationType>DynamicLibrary</ConfigurationType>
-    <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
-    <WholeProgramOptimization>true</WholeProgramOptimization>
-    <CharacterSet>Unicode</CharacterSet>
-  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
    <ConfigurationType>DynamicLibrary</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
@ -61,54 +40,25 @@
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <PropertyGroup Label="UserMacros" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <LinkIncremental>true</LinkIncremental>
-    <IncludePath>..\..\Math\Math;..\..\Common\include;$(VCInstallDir)include;$(VCInstallDir)atlmfc\include;$(WindowsSDK_IncludePath);</IncludePath>
-    <LibraryPath>$(SolutionDir)$(Platform)\$(Configuration);$(VCInstallDir)lib;$(VCInstallDir)atlmfc\lib;$(WindowsSDK_LibraryPath_x86);</LibraryPath>
-  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
    <IncludePath>..\..\common\include;..\..\math\math;$(VCInstallDir)include;$(VCInstallDir)atlmfc\include;$(WindowsSDK_IncludePath);</IncludePath>
    <LibraryPath>$(SolutionDir)$(Platform)\$(Configuration);$(VCInstallDir)lib\amd64;$(VCInstallDir)atlmfc\lib\amd64;$(WindowsSDK_LibraryPath_x64);</LibraryPath>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <LinkIncremental>false</LinkIncremental>
-    <IncludePath>..\..\Math\Math;..\..\Common\include;$(VCInstallDir)include;$(VCInstallDir)atlmfc\include;$(WindowsSDK_IncludePath);</IncludePath>
-    <LibraryPath>$(SolutionDir)$(Platform)\$(Configuration);$(VCInstallDir)lib;$(VCInstallDir)atlmfc\lib;$(WindowsSDK_LibraryPath_x86);</LibraryPath>
+    <IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
    <IncludePath>..\..\common\include;..\..\math\math;$(VCInstallDir)include;$(VCInstallDir)atlmfc\include;$(WindowsSDK_IncludePath);</IncludePath>
    <LibraryPath>$(SolutionDir)$(Platform)\$(Configuration);$(VCInstallDir)lib\amd64;$(VCInstallDir)atlmfc\lib\amd64;$(WindowsSDK_LibraryPath_x64);</LibraryPath>
+    <IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
  </PropertyGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <ClCompile>
-      <PrecompiledHeader>Use</PrecompiledHeader>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;_USRDLL;UCIREADER_EXPORTS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <SDLCheck>true</SDLCheck>
-    </ClCompile>
-    <Link>
-      <SubSystem>Windows</SubSystem>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>CNTKmath.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
-      <AdditionalLibraryDirectories>..\..\math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration)</AdditionalLibraryDirectories>
-    </Link>
-  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <ClCompile>
      <PrecompiledHeader>Use</PrecompiledHeader>
@ -126,26 +76,6 @@
      <AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\;..\..\math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration)</AdditionalLibraryDirectories>
    </Link>
  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <PrecompiledHeader>Use</PrecompiledHeader>
-      <Optimization>MaxSpeed</Optimization>
-      <FunctionLevelLinking>true</FunctionLevelLinking>
-      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;_USRDLL;UCIREADER_EXPORTS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <SDLCheck>true</SDLCheck>
-    </ClCompile>
-    <Link>
-      <SubSystem>Windows</SubSystem>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>CNTKmath.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
-      <AdditionalLibraryDirectories>..\..\math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration)</AdditionalLibraryDirectories>
-      <Profile>true</Profile>
-    </Link>
-  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <ClCompile>
      <WarningLevel>Level4</WarningLevel>
@ -194,30 +124,21 @@
    <ClCompile Include="..\..\Common\fileutil.cpp">
      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
-      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">NotUsing</PrecompiledHeader>
    </ClCompile>
    <ClCompile Include="BinaryFile.cpp" />
    <ClCompile Include="BinaryReader.cpp" />
    <ClCompile Include="BinaryWriter.cpp" />
    <ClCompile Include="Exports.cpp" />
    <ClCompile Include="dllmain.cpp">
-      <CompileAsManaged Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">false</CompileAsManaged>
      <CompileAsManaged Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</CompileAsManaged>
-      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-      </PrecompiledHeader>
      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
      </PrecompiledHeader>
-      <CompileAsManaged Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</CompileAsManaged>
      <CompileAsManaged Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</CompileAsManaged>
-      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-      </PrecompiledHeader>
      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
      </PrecompiledHeader>
    </ClCompile>
    <ClCompile Include="stdafx.cpp">
-      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Create</PrecompiledHeader>
      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Create</PrecompiledHeader>
-      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Create</PrecompiledHeader>
      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Create</PrecompiledHeader>
    </ClCompile>
  </ItemGroup>
--- a/DataReader/BinaryReader/BinaryWriter.cpp
+++ b/DataReader/BinaryReader/BinaryWriter.cpp
@ -7,7 +7,7 @@
 //

 #include "stdafx.h"
-#include "basetypes.h"
+#include "Basics.h"
 #define DATAWRITER_EXPORTS  // creating the exports here
 #include "DataWriter.h"
 #include "BinaryReader.h"
--- a/DataReader/DSSMReader/DSSMReader.cpp
+++ b/DataReader/DSSMReader/DSSMReader.cpp
@ -94,12 +94,12 @@ void DSSMReader<ElemType>::WriteLabelFile()
            {
                labelFile << m_mapIdToLabel[i] << '\n';
            }
-            fprintf(stderr, "label file %ws written to disk\n", m_labelFileToWrite.c_str());
+            fprintf(stderr, "label file %ls written to disk\n", m_labelFileToWrite.c_str());
            m_labelFileToWrite.clear();
        }
        else if (!m_cachingWriter)
        {
-            fprintf(stderr, "WARNING: file %ws NOT written to disk yet, will be written the first time the end of the entire dataset is found.\n", m_labelFileToWrite.c_str());
+            fprintf(stderr, "WARNING: file %ls NOT written to disk yet, will be written the first time the end of the entire dataset is found.\n", m_labelFileToWrite.c_str());
        }
    }
 }
--- a/DataReader/DSSMReader/DSSMReader.h
+++ b/DataReader/DSSMReader/DSSMReader.h
@ -7,6 +7,7 @@
 #pragma once
 #include "DataReader.h"
 #include "DataWriter.h"
+#include "commandArgUtil.h"
 #include <string>
 #include <map>
 #include <vector>
--- a/DataReader/DSSMReader/DSSMReader.vcxproj
+++ b/DataReader/DSSMReader/DSSMReader.vcxproj
@ -1,18 +1,10 @@
 <?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
-    <ProjectConfiguration Include="Debug|Win32">
-      <Configuration>Debug</Configuration>
-      <Platform>Win32</Platform>
-    </ProjectConfiguration>
    <ProjectConfiguration Include="Debug|x64">
      <Configuration>Debug</Configuration>
      <Platform>x64</Platform>
    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|Win32">
-      <Configuration>Release</Configuration>
-      <Platform>Win32</Platform>
-    </ProjectConfiguration>
    <ProjectConfiguration Include="Release|x64">
      <Configuration>Release</Configuration>
      <Platform>x64</Platform>
@ -32,25 +24,12 @@
    <RootNamespace>DSSMReader</RootNamespace>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
-    <ConfigurationType>DynamicLibrary</ConfigurationType>
-    <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
-    <CharacterSet>Unicode</CharacterSet>
-  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
    <ConfigurationType>DynamicLibrary</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <PlatformToolset>v120</PlatformToolset>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
-    <ConfigurationType>DynamicLibrary</ConfigurationType>
-    <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
-    <WholeProgramOptimization>true</WholeProgramOptimization>
-    <CharacterSet>Unicode</CharacterSet>
-  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
    <ConfigurationType>DynamicLibrary</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
@ -61,55 +40,26 @@
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Label="UserMacros" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <LinkIncremental>true</LinkIncremental>
-    <IncludePath>..\..\Math\Math;..\..\Common\include;$(VCInstallDir)include;$(VCInstallDir)atlmfc\include;$(WindowsSDK_IncludePath);</IncludePath>
-    <LibraryPath>$(SolutionDir)$(Platform)\$(Configuration);$(VCInstallDir)lib;$(VCInstallDir)atlmfc\lib;$(WindowsSDK_LibraryPath_x86);</LibraryPath>
-  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
    <IncludePath>..\..\common\include;..\..\math\math;$(VCInstallDir)include;$(VCInstallDir)atlmfc\include;$(WindowsSDK_IncludePath);</IncludePath>
    <LibraryPath>$(SolutionDir)$(Platform)\$(Configuration);$(VCInstallDir)lib\amd64;$(VCInstallDir)atlmfc\lib\amd64;$(WindowsSDK_LibraryPath_x64);</LibraryPath>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <LinkIncremental>false</LinkIncremental>
-    <IncludePath>..\..\common\include;..\..\math\math;$(VCInstallDir)include;$(VCInstallDir)atlmfc\include;$(WindowsSDK_IncludePath);</IncludePath>
-    <LibraryPath>$(SolutionDir)$(Platform)\$(Configuration);$(VCInstallDir)lib\amd64;$(VCInstallDir)atlmfc\lib\amd64;$(WindowsSDK_LibraryPath_x64);</LibraryPath>
+    <IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
    <IncludePath>c:\Program Files\Microsoft MPI\Inc;..\..\common\include;..\..\math\math;$(VCInstallDir)include;$(VCInstallDir)atlmfc\include;$(WindowsSDK_IncludePath);</IncludePath>
    <LibraryPath>c:\Program Files\Microsoft MPI\Lib\amd64;$(SolutionDir)$(Platform)\$(Configuration);$(VCInstallDir)lib\amd64;$(VCInstallDir)atlmfc\lib\amd64;$(WindowsSDK_LibraryPath_x64);</LibraryPath>
+    <IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
  </PropertyGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <ClCompile>
-      <PrecompiledHeader>Use</PrecompiledHeader>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;_USRDLL;DSSMREADER_EXPORTS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <SDLCheck>true</SDLCheck>
-    </ClCompile>
-    <Link>
-      <SubSystem>Windows</SubSystem>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>CNTKmath.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
-      <AdditionalLibraryDirectories>..\..\math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration)</AdditionalLibraryDirectories>
-    </Link>
-  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <ClCompile>
      <PrecompiledHeader>NotUsing</PrecompiledHeader>
@ -128,26 +78,6 @@
      <AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration)\;..\..\math\$(Platform)\$(Configuration);..\$(Platform)\$(Configuration)</AdditionalLibraryDirectories>
    </Link>
  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <PrecompiledHeader>Use</PrecompiledHeader>
-      <Optimization>MaxSpeed</Optimization>
-      <FunctionLevelLinking>true</FunctionLevelLinking>
-      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>_CRT_SECURE_NO_WARNINGS;WIN32;NDEBUG;_WINDOWS;_USRDLL;DSSMREADER_EXPORTS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <SDLCheck>true</SDLCheck>
-    </ClCompile>
-    <Link>
-      <SubSystem>Windows</SubSystem>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>CNTKmath.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
-      <AdditionalLibraryDirectories>..\..\math\$(Platform)\$(Configuration);$(SolutionDir)$(Platform)\$(Configuration)\</AdditionalLibraryDirectories>
-      <Profile>true</Profile>
-    </Link>
-  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <ClCompile>
      <WarningLevel>Level4</WarningLevel>
@ -179,7 +109,6 @@
      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</ExcludedFromBuild>
    </ClInclude>
    <ClInclude Include="..\..\Common\Include\File.h">
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</ExcludedFromBuild>
      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</ExcludedFromBuild>
    </ClInclude>
    <ClInclude Include="..\..\Common\Include\fileutil.h">
@ -194,7 +123,6 @@
    <ClCompile Include="..\..\Common\ConfigFile.cpp">
      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
-      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
    </ClCompile>
    <ClCompile Include="..\..\Common\DataReader.cpp" />
    <ClCompile Include="..\..\Common\DataWriter.cpp">
@ -203,13 +131,10 @@
    <ClCompile Include="..\..\Common\File.cpp">
      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
-      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
    </ClCompile>
    <ClCompile Include="..\..\Common\fileutil.cpp">
      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">NotUsing</PrecompiledHeader>
      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">NotUsing</PrecompiledHeader>
-      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">NotUsing</PrecompiledHeader>
-      <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">true</ExcludedFromBuild>
    </ClCompile>
    <ClCompile Include="dllmain.cpp" />
    <ClCompile Include="DSSMReader.cpp" />
--- a/DataReader/DSSMReader/minibatchsourcehelpers.h
+++ b/DataReader/DSSMReader/minibatchsourcehelpers.h
@ -1,117 +0,0 @@
-//
-// <copyright file="minibatchsourcehelpers.h" company="Microsoft">
-//     Copyright (c) Microsoft Corporation.  All rights reserved.
-// </copyright>
-//
-// minibatchsourcehelpers.h -- helper classes for minibatch sources
-//
-
-#pragma once
-
-#include "basetypes.h"
-#include <stdio.h>
-#include <vector>
-#include <algorithm>
-
-namespace msra { namespace dbn {
-
-// ---------------------------------------------------------------------------
-// randomordering -- class to help manage randomization of input data
-// ---------------------------------------------------------------------------
-
-static inline size_t rand (const size_t begin, const size_t end)
-{
-    const size_t randno = ::rand() * RAND_MAX + ::rand();   // BUGBUG: still only covers 32-bit range
-    return begin + randno % (end - begin);
-}
-
-class randomordering                // note: NOT thread-safe at all
-{
-    // constants for randomization
-    const static size_t randomizeDisable=0;
-
-    typedef unsigned int INDEXTYPE; // don't use size_t, as this saves HUGE amounts of RAM
-    std::vector<INDEXTYPE> map;          // [t] -> t' indices in randomized order
-    size_t currentseed;             // seed for current sequence
-    size_t randomizationrange;      // t - randomizationrange/2 <= t' < t + randomizationrange/2 (we support this to enable swapping)
-                                    // special values (randomizeDisable)
-    void invalidate() { currentseed = (size_t) -1; }
-public:
-    randomordering() { invalidate(); randomizationrange = randomizeDisable;}
-
-    void resize (size_t len, size_t p_randomizationrange) { randomizationrange = p_randomizationrange; if (len > 0) map.resize (len); invalidate(); }
-
-    // return the randomized feature bounds for a time range
-    std::pair<size_t,size_t> bounds (size_t ts, size_t te) const
-    {
-        size_t tbegin = max (ts, randomizationrange/2) - randomizationrange/2;
-        size_t tend = min (te + randomizationrange/2, map.size());
-        return std::make_pair<size_t,size_t> (move(tbegin), move(tend));
-    }
-
-    // this returns the map directly (read-only) and will lazily initialize it for a given seed
-    const std::vector<INDEXTYPE> & operator() (size_t seed) //throw()
-    {
-        // if wrong seed then lazily recache the sequence
-        if (seed != currentseed && randomizationrange != randomizeDisable)
-        {
-            // test for numeric overflow
-            if (map.size()-1 != (INDEXTYPE) (map.size()-1))
-                throw std::runtime_error ("randomordering: INDEXTYPE has too few bits for this corpus");
-            // 0, 1, 2...
-            foreach_index (t, map) map[t] = (INDEXTYPE) t;
-
-            if (map.size() > RAND_MAX * (size_t) RAND_MAX)
-                throw std::runtime_error ("randomordering: too large training set: need to change to different random generator!");
-            srand ((unsigned int) seed);
-            size_t retries = 0;
-            foreach_index (t, map)
-            {
-                for (int tries = 0; tries < 5; tries++)
-                {
-                    // swap current pos with a random position
-                    // Random positions are limited to t+randomizationrange.
-                    // This ensures some locality suitable for paging with a sliding window.
-                    const size_t tbegin = max ((size_t) t, randomizationrange/2) - randomizationrange/2; // range of window  --TODO: use bounds() function above
-                    const size_t tend = min (t + randomizationrange/2, map.size());
-                    assert (tend >= tbegin);                    // (guard against potential numeric-wraparound bug)
-                    const size_t trand = rand (tbegin, tend);   // random number within windows
-                    assert ((size_t) t <= trand + randomizationrange/2 && trand < (size_t) t + randomizationrange/2);
-                    // if range condition is fulfilled then swap
-                    if (trand <= map[t] + randomizationrange/2 && map[t] < trand + randomizationrange/2
-                        && (size_t) t <= map[trand] + randomizationrange/2 && map[trand] < (size_t) t + randomizationrange/2)
-                    {
-                        ::swap (map[t], map[trand]);
-                        break;
-                    }
-                    // but don't multi-swap stuff out of its range (for swapping positions that have been swapped before)
-                    // instead, try again with a different random number
-                    retries++;
-                }
-            }
-            fprintf (stderr, "randomordering: %d retries for %d elements (%.1f%%) to ensure window condition\n", retries, map.size(), 100.0 * retries / map.size());
-            // ensure the window condition
-            foreach_index (t, map) assert ((size_t) t <= map[t] + randomizationrange/2 && map[t] < (size_t) t + randomizationrange/2);
-#if 0       // and a live check since I don't trust myself here yet
-            foreach_index (t, map) if (!((size_t) t <= map[t] + randomizationrange/2 && map[t] < (size_t) t + randomizationrange/2))
-            {
-                fprintf (stderr, "randomordering: windowing condition violated %d -> %d\n", t, map[t]);
-                throw std::logic_error ("randomordering: windowing condition violated");
-            }
-#endif
-#if 0       // test whether it is indeed a unique complete sequence
-            auto map2 = map;
-            ::sort (map2.begin(), map2.end());
-            foreach_index (t, map2) assert (map2[t] == (size_t) t);
-#endif
-            fprintf (stderr, "randomordering: recached sequence for seed %d: %d, %d, ...\n", (int) seed, (int) map[0], (int) map[1]);
-            currentseed = seed;
-        }
-        return map; // caller can now access it through operator[]
-    }
-    size_t CurrentSeed() {return currentseed;}
-};
-
-typedef unsigned short CLASSIDTYPE; // type to store state ids; don't use size_t --saves HUGE amounts of RAM
-
-};};
--- a/DataReader/HTKMLFReader/HTKMLFReader.cpp
+++ b/DataReader/HTKMLFReader/HTKMLFReader.cpp
@ -303,6 +303,25 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                if (n!=numFiles)
                    throw std::runtime_error (msra::strfun::strprintf ("number of files in each scriptfile inconsistent (%d vs. %d)", numFiles,n));

+            /*
+                Do "..." expansion if the SCP file uses relative path names.
+                "..." in an SCP entry stands for the directory that contains the SCP file.
+                For example, if the SCP file is "//aaa/bbb/ccc/ddd.scp"
+                and contains entries like
+                    .../file1.feat
+                    .../file2.feat
+                    etc.
+                the features will be read from
+                    //aaa/bbb/ccc/file1.feat
+                    //aaa/bbb/ccc/file2.feat
+                    etc.
+                This works well if you store the SCP file together with the features but
+                do not want a different SCP file every time you move or create new features.
+            */
+            wstring scpdircached;
+            for (auto & entry : filelist)
+                ExpandDotDotDot(entry, scriptpath, scpdircached);
+
            infilesmulti.push_back(filelist);
        }

@ -1564,6 +1583,29 @@ namespace Microsoft { namespace MSR { namespace CNTK {
        }
    }

+    // Expands the "..." placeholder of an SCP entry: the first "..." in featPath is replaced
+    // by the directory part of scpPath (everything before its last '/' or '\').
+    //  featPath     - [in/out] feature path taken from the SCP file; unchanged if it has no "..."
+    //  scpPath      - path of the SCP file that the entry was read from
+    //  scpDirCached - [in/out] caches the SCP directory between calls; pass the same string,
+    //                 initially empty, for all entries of one SCP file
+    template<class ElemType>
+    void HTKMLFReader<ElemType>::ExpandDotDotDot(wstring & featPath, const wstring & scpPath, wstring & scpDirCached)
+    {
+        if (scpDirCached.empty()) // not computed yet (also recomputed if the directory is the root, which is harmless)
+        {
+            const auto sepPos = scpPath.find_last_of(L"/\\");
+            if (sepPos == wstring::npos)
+                scpDirCached = L".";   // bare file name: use the current directory, so ".../x" becomes "./x" and not the root path "/x"
+            else
+                scpDirCached = scpPath.substr(0, sepPos);
+        }
+        // substitute the first occurrence only
+        const size_t dotsPos = featPath.find(L"...");
+        if (dotsPos != wstring::npos)
+            featPath.replace(dotsPos, 3, scpDirCached); // 3 == length of "..."
+    }
+
    template class HTKMLFReader<float>;
    template class HTKMLFReader<double>;
    }}}
--- a/DataReader/HTKMLFReader/HTKMLFReader.h
+++ b/DataReader/HTKMLFReader/HTKMLFReader.h
@ -6,6 +6,7 @@
 // HTKMLFReader.h - Include file for the MTK and MLF format of features and samples 
 #pragma once
 #include "DataReader.h"
+#include "commandArgUtil.h" // for intargvector

 namespace Microsoft { namespace MSR { namespace CNTK {

@ -88,6 +89,10 @@ private:

    
    size_t ReadLabelToTargetMappingFile (const std::wstring& labelToTargetMappingFile, const std::wstring& labelListFile, std::vector<std::vector<ElemType>>& labelToTargetMap);
+    
+    void ExpandDotDotDot(wstring & featPath, const wstring & scpPath, wstring & scpDirCached);
+
+    
    enum InputOutputTypes
    {
        real,
--- a/DataReader/HTKMLFReader/HTKMLFReader.vcxproj
+++ b/DataReader/HTKMLFReader/HTKMLFReader.vcxproj
@ -1,18 +1,10 @@
 <?xml version="1.0" encoding="utf-8"?>
 <Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
-    <ProjectConfiguration Include="Debug|Win32">
-      <Configuration>Debug</Configuration>
-      <Platform>Win32</Platform>
-    </ProjectConfiguration>
    <ProjectConfiguration Include="Debug|x64">
      <Configuration>Debug</Configuration>
      <Platform>x64</Platform>
    </ProjectConfiguration>
-    <ProjectConfiguration Include="Release|Win32">
-      <Configuration>Release</Configuration>
-      <Platform>Win32</Platform>
-    </ProjectConfiguration>
    <ProjectConfiguration Include="Release|x64">
      <Configuration>Release</Configuration>
      <Platform>x64</Platform>
@ -32,25 +24,12 @@
    </SccProvider>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
-    <ConfigurationType>DynamicLibrary</ConfigurationType>
-    <UseDebugLibraries>true</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
-    <CharacterSet>Unicode</CharacterSet>
-  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="Configuration">
    <ConfigurationType>DynamicLibrary</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <PlatformToolset>v120</PlatformToolset>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
-    <ConfigurationType>DynamicLibrary</ConfigurationType>
-    <UseDebugLibraries>false</UseDebugLibraries>
-    <PlatformToolset>v120</PlatformToolset>
-    <WholeProgramOptimization>true</WholeProgramOptimization>
-    <CharacterSet>Unicode</CharacterSet>
-  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
    <ConfigurationType>DynamicLibrary</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
@ -61,53 +40,25 @@
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'" Label="PropertySheets">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
-  <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
-  </ImportGroup>
  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <PropertyGroup Label="UserMacros" />
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <LinkIncremental>true</LinkIncremental>
-    <IncludePath>..\..\common\include;..\..\math\math;$(VCInstallDir)include;$(VCInstallDir)atlmfc\include;$(WindowsSDK_IncludePath);</IncludePath>
-    <LibraryPath>$(SolutionDir)$(Platform)\$(Configuration);$(VCInstallDir)lib;$(VCInstallDir)atlmfc\lib;$(WindowsSDK_LibraryPath_x86);</LibraryPath>
-  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <LinkIncremental>true</LinkIncremental>
    <IncludePath>..\..\common\include;..\..\math\math;$(VCInstallDir)include;$(VCInstallDir)atlmfc\include;$(WindowsSDK_IncludePath);</IncludePath>
    <LibraryPath>$(SolutionDir)$(Platform)\$(Configuration);$(VCInstallDir)lib\amd64;$(VCInstallDir)atlmfc\lib\amd64;$(WindowsSDK_LibraryPath_x64);</LibraryPath>
-  </PropertyGroup>
-  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <LinkIncremental>false</LinkIncremental>
-    <IncludePath>..\..\common\include;..\..\math\math;$(VCInstallDir)include;$(VCInstallDir)atlmfc\include;$(WindowsSDK_IncludePath);</IncludePath>
-    <LibraryPath>$(SolutionDir)$(Platform)\$(Configuration);$(VCInstallDir)lib;$(VCInstallDir)atlmfc\lib;$(WindowsSDK_LibraryPath_x86);</LibraryPath>
+    <IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
    <IncludePath>..\..\common\include;..\..\math\math;$(VCInstallDir)include;$(VCInstallDir)atlmfc\include;$(WindowsSDK_IncludePath);</IncludePath>
    <LibraryPath>$(SolutionDir)$(Platform)\$(Configuration);$(VCInstallDir)lib\amd64;$(VCInstallDir)atlmfc\lib\amd64;$(WindowsSDK_LibraryPath_x64);</LibraryPath>
+    <IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
  </PropertyGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-    <ClCompile>
-      <PrecompiledHeader>Use</PrecompiledHeader>
-      <WarningLevel>Level3</WarningLevel>
-      <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;_USRDLL;HTKMLFREADER_EXPORTS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <SDLCheck>true</SDLCheck>
-    </ClCompile>
-    <Link>
-      <SubSystem>Windows</SubSystem>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <AdditionalDependencies>CNTKMath.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
-    </Link>
-  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <ClCompile>
      <PrecompiledHeader>Use</PrecompiledHeader>
@ -123,25 +74,6 @@
      <AdditionalDependencies>CNTKMath.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
    </Link>
  </ItemDefinitionGroup>
-  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-    <ClCompile>
-      <WarningLevel>Level3</WarningLevel>
-      <PrecompiledHeader>Use</PrecompiledHeader>
-      <Optimization>MaxSpeed</Optimization>
-      <FunctionLevelLinking>true</FunctionLevelLinking>
-      <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;_USRDLL;HTKMLFREADER_EXPORTS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
-      <SDLCheck>true</SDLCheck>
-    </ClCompile>
-    <Link>
-      <SubSystem>Windows</SubSystem>
-      <GenerateDebugInformation>true</GenerateDebugInformation>
-      <EnableCOMDATFolding>true</EnableCOMDATFolding>
-      <OptimizeReferences>true</OptimizeReferences>
-      <AdditionalDependencies>CNTKMath.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
-      <Profile>true</Profile>
-    </Link>
-  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <ClCompile>
      <WarningLevel>Level4</WarningLevel>
@ -199,16 +131,10 @@
    <ClCompile Include="DataReader.cpp" />
    <ClCompile Include="DataWriter.cpp" />
    <ClCompile Include="dllmain.cpp">
-      <CompileAsManaged Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">false</CompileAsManaged>
      <CompileAsManaged Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">false</CompileAsManaged>
-      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
-      </PrecompiledHeader>
      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
      </PrecompiledHeader>
-      <CompileAsManaged Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</CompileAsManaged>
      <CompileAsManaged Condition="'$(Configuration)|$(Platform)'=='Release|x64'">false</CompileAsManaged>
-      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
-      </PrecompiledHeader>
      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
      </PrecompiledHeader>
    </ClCompile>
@ -217,9 +143,7 @@
    <ClCompile Include="HTKMLFWriter.cpp" />
    <ClCompile Include="latticearchive.cpp" />
    <ClCompile Include="stdafx.cpp">
-      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">Create</PrecompiledHeader>
      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">Create</PrecompiledHeader>
-      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">Create</PrecompiledHeader>
      <PrecompiledHeader Condition="'$(Configuration)|$(Platform)'=='Release|x64'">Create</PrecompiledHeader>
    </ClCompile>
  </ItemGroup>
--- a/DataReader/HTKMLFReader/HTKMLFWriter.cpp
+++ b/DataReader/HTKMLFReader/HTKMLFWriter.cpp
@ -12,18 +12,10 @@

 #include "htkfeatio.h"                  // for reading HTK features
 #include "ssematrix.h"
-//#include "latticearchive.h"             // for reading HTK phoneme lattices (MMI training)
-//#include "simplesenonehmm.h"            // for MMI scoring
-//#include "msra_mgram.h"                 // for unigram scores of ground-truth path in sequence training
-
-//#include "rollingwindowsource.h"        // minibatch sources
-//#include "utterancesource.h"
-//#include "readaheadsource.h"
-//#include "chunkevalsource.h"
-//#include "minibatchiterator.h"

 #define DATAWRITER_EXPORTS  // creating the exports here
 #include "DataWriter.h"
+#include "commandArgUtil.h"
 #include "HTKMLFWriter.h"
 #ifdef LEAKDETECT
 #include <vld.h> // for memory leak detection
--- a/DataReader/HTKMLFReader/utterancesourcemulti.h
+++ b/DataReader/HTKMLFReader/utterancesourcemulti.h
@ -879,6 +879,7 @@ private:
            auto & chunkdata = randomizedchunks[m][k].getchunkdata();
            if (chunkdata.isinram())
            {
+                if (verbosity)
                fprintf (stderr, "releaserandomizedchunk: paging out randomized chunk %d (frame range [%d..%d]), %d resident in RAM\n",
                     k, randomizedchunks[m][k].globalts, randomizedchunks[m][k].globalte()-1, chunksinram-1);
                chunkdata.releasedata();
--- a/DataReader/HTKMLFReader_linux/HTKMLFReader.cpp
+++ b/DataReader/HTKMLFReader_linux/HTKMLFReader.cpp
@ -25,6 +25,7 @@
 #include "minibatchiterator.h"
 #define DATAREADER_EXPORTS  // creating the exports here
 #include "DataReader.h"
+#include "commandArgUtil.h"
 #include "HTKMLFReader.h"
 #ifdef LEAKDETECT
 #include <vld.h> // for memory leak detection
@ -419,10 +420,10 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 #ifdef __unix__
                char* tempFile;
                //GetTempFileName(pageFilePath.c_str(), L"CNTK", 0, tempFile);
-                tempFile = pageFilePath.c_str();
+                tempFile = (char*) pageFilePath.c_str();
                int fid = mkstemp(tempFile);
                unlink (tempFile);
-                close (tempFile);
+                close (fid);
                pagePaths.push_back(GetWC(tempFile));
 #endif
            }
@ -525,7 +526,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                n++;
            }

-            fprintf (stderr, " %d entries\n", n);
+            fprintf (stderr, " %zu entries\n", n);

            if (i==0)
                numFiles=n;
@ -760,7 +761,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            for (auto iter=matrices.begin();iter!=matrices.end();iter++)
            {
                if (m_nameToTypeMap.find(iter->first)==m_nameToTypeMap.end())
-                    throw std::runtime_error(msra::strfun::strprintf("minibatch requested for input node %ws not found in reader - cannot generate input\n",iter->first.c_str()));
+                    throw std::runtime_error(msra::strfun::strprintf("minibatch requested for input node %S not found in reader - cannot generate input\n",iter->first.c_str()));

            }
            m_checkDictionaryKeys=false;
@ -1183,7 +1184,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            {
                if (matrices.find(iter->first)==matrices.end())
                {
-                    fprintf(stderr,"GetMinibatchToWrite: feature node %ws specified in reader not found in the network\n",iter->first.c_str());
+                    fprintf(stderr,"GetMinibatchToWrite: feature node %S specified in reader not found in the network\n",iter->first.c_str());
                    throw std::runtime_error("GetMinibatchToWrite: feature node specified in reader not found in the network.");
                }
            }
@ -1215,7 +1216,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                {
                    reader.read (path, featkind, sampperiod, feat);   // whole file read as columns of feature vectors
                });
-                fprintf (stderr, "evaluate: reading %d frames of %S\n", feat.cols(), ((wstring)path).c_str());
+                fprintf (stderr, "evaluate: reading %zu frames of %S\n", feat.cols(), ((wstring)path).c_str());
                m_fileEvalSource->AddFile(feat, featkind, sampperiod, i);
            }
            m_inputFileIndex++;
--- a/DataReader/HTKMLFReader_linux/HTKMLFWriter.cpp
+++ b/DataReader/HTKMLFReader_linux/HTKMLFWriter.cpp
@ -25,6 +25,7 @@

 #define DATAWRITER_EXPORTS  // creating the exports here
 #include "DataWriter.h"
+#include "commandArgUtil.h"
 #include "HTKMLFWriter.h"
 #ifdef LEAKDETECT
 #include <vld.h> // for memory leak detection
@ -94,7 +95,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
                n++;
            }

-            fprintf (stderr, " %d entries\n", n);
+            fprintf (stderr, " %zu entries\n", n);

            if (i==0)
                numFiles=n;
@ -175,7 +176,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            msra::asr::htkfeatwriter::write (outputFile, "USER", sampPeriod, output);
        });
                        
-        fprintf (stderr, "evaluate: writing %d frames of %S\n", output.cols(), outputFile.c_str());
+        fprintf (stderr, "evaluate: writing %zu frames of %S\n", output.cols(), outputFile.c_str());


    }
--- a/DataReader/HTKMLFReader_linux/biggrowablevectors.h
+++ b/DataReader/HTKMLFReader_linux/biggrowablevectors.h
@ -99,20 +99,20 @@ public:

    template<typename VALTYPE> void push_back (VALTYPE e)   // VALTYPE could be an rvalue reference
    {
-        size_t i = size();
-        resize_without_commit (i + 1);
-        auto & block = getblockptr (i);
+        size_t i = this->size();
+        this->resize_without_commit (i + 1);
+        auto & block = this->getblockptr (i);
        if (block.get() == NULL)
            block.reset (new std::vector<ELEMTYPE> (this->elementsperblock));
-        (*block)[getblockt (i)] = e;
+        (*block)[this->getblockt (i)] = e;
    }

-          ELEMTYPE & operator[] (size_t t)       { return getblock(t)[getblockt (t)]; }    // get an element
-    const ELEMTYPE & operator[] (size_t t) const { return getblock(t)[getblockt (t)]; }    // get an element
+          ELEMTYPE & operator[] (size_t t)       { return this->getblock(t)[this->getblockt (t)]; }    // get an element
+    const ELEMTYPE & operator[] (size_t t) const { return this->getblock(t)[this->getblockt (t)]; }    // get an element

    void resize (const size_t n)
    {
-        resize_without_commit (n);
+        this->resize_without_commit (n);
        foreach_index (i, this->blocks)
            if (this->blocks[i].get() == NULL)
                this->blocks[i].reset (new std::vector<ELEMTYPE> (this->elementsperblock));
--- a/DataReader/HTKMLFReader_linux/chunkevalsource.h
+++ b/DataReader/HTKMLFReader_linux/chunkevalsource.h
@ -61,7 +61,7 @@ namespace msra { namespace dbn {
                unsigned int sampperiod = sampperiods[k];
                size_t n = numframes[k];
                msra::files::make_intermediate_dirs (outfile);
-                fprintf (stderr, "saveandflush: writing %d frames to %S\n", n, outfile.c_str());
+                fprintf (stderr, "saveandflush: writing %zu frames to %S\n", n, outfile.c_str());
                msra::dbn::matrixstripe thispred (pred, firstframe, n);
                // some sanity check for the data we've written
                const size_t nansinf = thispred.countnaninf();
@ -174,7 +174,7 @@ namespace msra { namespace dbn {
                unsigned int sampperiod = sampperiods[index][k];
                size_t n = numframes[k];
                msra::files::make_intermediate_dirs (outfile);
-                fprintf (stderr, "saveandflush: writing %d frames to %S\n", n, outfile.c_str());
+                fprintf (stderr, "saveandflush: writing %zu frames to %S\n", n, outfile.c_str());
                msra::dbn::matrixstripe thispred (pred, firstframe, n);
                // some sanity check for the data we've written
                const size_t nansinf = thispred.countnaninf();
--- a/DataReader/HTKMLFReader_linux/htkfeatio.h
+++ b/DataReader/HTKMLFReader_linux/htkfeatio.h
@ -888,7 +888,7 @@ public:
    template<typename WORDSYMBOLTABLE, typename UNITSYMBOLTABLE>
    void read (const wstring & path, const set<wstring> & restricttokeys, const WORDSYMBOLTABLE * wordmap, const UNITSYMBOLTABLE * unitmap, const double htkTimeToFrame)
    {
-        if (!restricttokeys.empty() && size() >= restricttokeys.size()) // no need to even read the file if we are there (we support multiple files)
+        if (!restricttokeys.empty() && this->size() >= restricttokeys.size()) // no need to even read the file if we are there (we support multiple files)
            return;

        fprintf (stderr, "htkmlfreader: reading MLF file %S ...", path.c_str());
@ -896,19 +896,19 @@ public:

        vector<char> buffer;    // buffer owns the characters--don't release until done
        vector<char*> lines = readlines (path, buffer);
-        vector<WORDSEQUENCE::word> wordsequencebuffer;
-        vector<WORDSEQUENCE::aligninfo> alignsequencebuffer;
+        vector<typename WORDSEQUENCE::word> wordsequencebuffer;
+        vector<typename WORDSEQUENCE::aligninfo> alignsequencebuffer;

        if (lines.empty() || strcmp (lines[0], "#!MLF!#")) malformed ("header missing");

        // parse entries
-        fprintf (stderr, "parse the line %d\n", lines.size());
+        fprintf (stderr, "parse the line %zu\n", lines.size());
        size_t line = 1;
-        while (line < lines.size() && (restricttokeys.empty() || size() < restricttokeys.size()))
+        while (line < lines.size() && (restricttokeys.empty() || this->size() < restricttokeys.size()))
            parseentry (lines, line, restricttokeys, wordmap, unitmap, wordsequencebuffer, alignsequencebuffer, htkTimeToFrame);

        curpath.clear();
-        fprintf (stderr, " total %lu entries\n", size());
+        fprintf (stderr, " total %zu entries\n", this->size());   // size() is a size_t count; %zu is the portable specifier
    }

    // read state list, index is from 0
--- a/DataReader/HTKMLFReader_linux/latticearchive.h
+++ b/DataReader/HTKMLFReader_linux/latticearchive.h
@ -12,6 +12,8 @@
 #undef HACK_IN_SILENCE          // [v-hansu] hack to simulate DEL in the lattice
 #define SILENCE_PENALTY          // give penalty to added silence

+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>

 #include "basetypes.h"
 #include "latticestorage.h"
@ -24,7 +26,7 @@
 #include <unordered_map>
 #include <algorithm>        // for find()
 #include "simplesenonehmm.h"
-
+#include <inttypes.h>
 namespace msra { namespace math { class ssematrixbase;  template<class ssematrixbase> class ssematrix; template<class ssematrixbase> class ssematrixstriperef; };};

 namespace msra { namespace lm { class CMGramLM; class CSymbolSet; };};        // for numer-lattice building
@ -188,7 +190,7 @@ public: // TODO: make private again once
                if (ai.size() < 2)  // less than 2--must be /sil/
                    continue;
                spunit = ai[ai.size() - 1].unit;
-                fprintf (stderr, "builduniquealignments: /sp/ unit inferred through heuristics as %d\n", spunit);
+                fprintf (stderr, "builduniquealignments: /sp/ unit inferred through heuristics as %zu\n", spunit);
                break;
            }
        }
@ -234,7 +236,7 @@ public: // TODO: make private again once
                && nodes[edges[prevj].S].t == nodes[edges[j].S].t
                && nodes[edges[prevj].E].t == nodes[edges[j].E].t
                && edges[prevj].l != edges[j].l)   // some diagnostics
-                    fprintf (stderr, "build: merging edges %d and %d despite slightly different LM scores %.8f vs. %.8f, ts/te=%.2f/%.2f\n",
+                    fprintf (stderr, "build: merging edges %zu and %zu despite slightly different LM scores %.8f vs. %.8f, ts/te=%.2f/%.2f\n",
                             prevj, j, edges[prevj].l, edges[j].l, nodes[edges[prevj].S].t * 0.01f, nodes[edges[prevj].E].t * 0.01f);
 #endif
            if (prevj == SIZE_MAX || fabs (edges[prevj].l - edges[j].l) > lmargin || (info.hasacscores && edges[prevj].a != edges[j].a) || comparealign (prevj, j, false) != 0)
@ -286,7 +288,7 @@ public: // TODO: make private again once
        }
        const size_t uniquealigntokens = uniquededgedatatokens.size() - (numuniquealignments * (info.hasacscores ? 2 : 1));
        const size_t nonuniquenonsptokens = align.size() - numimpliedsp;
-        fprintf (stderr, "builduniquealignments: %d edges: %d unique alignments (%.2f%%); %d align tokens - %d implied /sp/ units = %d, uniqued to %d (%.2f%%)\n",
+        fprintf (stderr, "builduniquealignments: %zu edges: %zu unique alignments (%.2f%%); %zu align tokens - %zu implied /sp/ units = %zu, uniqued to %zu (%.2f%%)\n",
                 edges.size(), numuniquealignments, 100.0f * numuniquealignments / edges.size(),
                 align.size(), numimpliedsp, nonuniquenonsptokens, uniquealigntokens, 100.0f * uniquealigntokens / nonuniquenonsptokens);

@ -593,7 +595,7 @@ private:
 #if 1           // multiple /sil/ -> log this (as we are not sure whether this is actually proper--probably it is)
                if (numsilunits > 1)
                {
-                    fprintf (stderr, "backpointers: lattice '%S', edge %d has %d /sil/ phonemes\n", L.getkey(), j, numsilunits);
+                    fprintf (stderr, "backpointers: lattice '%S', edge %d has %zu /sil/ phonemes\n", L.getkey(), j, numsilunits);
                    fprintf (stderr, "alignments: :");
                    foreach_index (a, aligntokens)
                    {
@ -643,7 +645,7 @@ private:
    double bestpathlattice (const std::vector<float> & edgeacscores, std::vector<double> & logpps,
                            const float lmf, const float wp, const float amf) const;

-    static float lattice::alignedge (const_array_ref<aligninfo> units, const msra::asr::simplesenonehmm & hset, 
+    static float alignedge (const_array_ref<aligninfo> units, const msra::asr::simplesenonehmm & hset, 
                                     const msra::math::ssematrixbase & logLLs, msra::math::ssematrixbase & gammas, 
                                     size_t edgeindex, const bool returnsenoneids, array_ref<unsigned short> thisedgealignments);

@ -674,7 +676,7 @@ private:
                                    const std::vector<float> & transcriptunigrams, const msra::math::ssematrixbase & logLLs, 
                                    const msra::asr::simplesenonehmm & hset, const float lmf, const float wp, const float amf);

-    static float lattice::forwardbackwardedge (const_array_ref<aligninfo> units, const msra::asr::simplesenonehmm & hset, 
+    static float forwardbackwardedge (const_array_ref<aligninfo> units, const msra::asr::simplesenonehmm & hset, 
                                               const msra::math::ssematrixbase & logLLs, msra::math::ssematrixbase & gammas, 
                                               size_t edgeindex);

@ -746,7 +748,7 @@ public:
        size_t totaledgeframes = 0;
        for (size_t j = 0; j < info.numedges; j++)
            totaledgeframes += nodes[edges[j].E].t - (size_t) nodes[edges[j].S].t;
-        fprintf (stderr, "lattice: read %d nodes, %d edges, %d units, %d frames, %.1f edges/node, %.1f units/edge, %.1f frames/edge, density %.1f\n",
+        fprintf (stderr, "lattice: read %zu nodes, %zu edges, %zu units, %zu frames, %.1f edges/node, %.1f units/edge, %.1f frames/edge, density %.1f\n",
                 info.numnodes, info.numedges, align.size(), info.numframes,
                 info.numedges / (double) info.numnodes, align.size() / (double) info.numedges, totaledgeframes / (double) info.numedges, totaledgeframes / (double) info.numframes);
    }
@ -895,7 +897,7 @@ public:
 #if 1       // post-bugfix for incorrect inference of spunit
            if (info.impliedspunitid != SIZE_MAX && info.impliedspunitid >= idmap.size())   // we have buggy lattices like that--what do they mean??
            {
-                fprintf (stderr, "fread: detected buggy spunit id %d which is out of range (%d entries in map)\n", info.impliedspunitid, idmap.size());
+                fprintf (stderr, "fread: detected buggy spunit id %zu which is out of range (%zu entries in map)\n", info.impliedspunitid, idmap.size());
                throw std::runtime_error ("fread: out of bounds spunitid");
            }
 #endif
@ -949,7 +951,7 @@ public:
                    k += skipscoretokens;
                    uniquealignments++;
                }
-                fprintf (stderr, "fread: mapped %d unique alignments\n", uniquealignments);
+                fprintf (stderr, "fread: mapped %zu unique alignments\n", uniquealignments);
            }
            if (info.impliedspunitid != spunit)
            {
@ -1091,13 +1093,13 @@ public:
    {
        if (tocpaths.empty())   // nothing to read--keep silent
            return;
-        fprintf (stderr, "archive: opening %d lattice-archive TOC files ('%S' etc.)..", tocpaths.size(), tocpaths[0].c_str());
+        fprintf (stderr, "archive: opening %zu lattice-archive TOC files ('%S' etc.)..", tocpaths.size(), tocpaths[0].c_str());
        foreach_index (i, tocpaths)
        {
            fprintf (stderr, ".");
            open (tocpaths[i]);
        }
-        fprintf (stderr, " %d total lattices referenced in %d archive files\n", toc.size(), archivepaths.size());
+        fprintf (stderr, " %zu total lattices referenced in %zu archive files\n", toc.size(), archivepaths.size());
    }

    // open an archive
@ -1133,7 +1135,7 @@ public:
                throw std::runtime_error ("open: invalid TOC line (empty archive pathname): " + std::string (line));
            char c;
            uint64_t offset;
-            if (sscanf_s (q, "[%I64u]%c", &offset, &c, sizeof (c)) != 1)
+            if (sscanf (q, "[%" SCNu64 "]%c", &offset, &c) != 1)   // SCNu64 is the scanf-family macro for uint64_t (PRIu64 is for printf)
                throw std::runtime_error ("open: invalid TOC line (bad [] expression): " + std::string (line));
            if (!toc.insert (make_pair (key, latticeref (offset, archiveindex))).second)
                throw std::runtime_error ("open: TOC entry leads to duplicate key: " + std::string (line));
--- a/DataReader/HTKMLFReader_linux/latticestorage.h
+++ b/DataReader/HTKMLFReader_linux/latticestorage.h
@ -25,7 +25,7 @@ static void checkoverflow (size_t fieldval, size_t targetval, const char * field
    if (fieldval != targetval)
    {
        char buf[1000];
-        sprintf_s (buf, "lattice: bit field %s too small for value 0x%x (cut from 0x%x)", fieldname, targetval, fieldval);
+        sprintf_s (buf, "lattice: bit field %s too small for value 0x%zx (cut from 0x%zx)", fieldname, targetval, fieldval);   // size_t printed in hex to match the 0x prefix
        throw std::runtime_error (buf);
    }
 }
--- a/DataReader/HTKMLFReader_linux/minibatchiterator.h
+++ b/DataReader/HTKMLFReader_linux/minibatchiterator.h
@ -179,7 +179,7 @@ public:
          timegetbatch (0), timechecklattice (0)
    {
        firstvalidepochstartframe = source.firstvalidglobalts (epochstartframe); // epochstartframe may fall between utterance boundaries; this gets us the first valid boundary
-        fprintf (stderr, "minibatchiterator: epoch %d: frames [%d..%d] (first utterance at frame %d) with %d datapasses\n",
+        fprintf (stderr, "minibatchiterator: epoch %zu: frames [%zu..%zu] (first utterance at frame %zu) with %zu datapasses\n",
                 epoch, epochstartframe, epochendframe, firstvalidepochstartframe, datapasses);
        mbstartframe = firstvalidepochstartframe;
        datapass = 0;
@ -197,7 +197,7 @@ public:
          timegetbatch (0), timechecklattice (0)
    {
        firstvalidepochstartframe = source.firstvalidglobalts (epochstartframe); // epochstartframe may fall between utterance boundaries; this gets us the first valid boundary
-        fprintf (stderr, "minibatchiterator: epoch %d: frames [%d..%d] (first utterance at frame %d) with %d datapasses\n",
+        fprintf (stderr, "minibatchiterator: epoch %zu: frames [%zu..%zu] (first utterance at frame %zu) with %zu datapasses\n",
                 epoch, epochstartframe, epochendframe, firstvalidepochstartframe, datapasses);
        mbstartframe = firstvalidepochstartframe;
        datapass = 0;
@ -223,7 +223,7 @@ public:
        {
            mbstartframe = firstvalidepochstartframe;
            datapass++;
-            fprintf (stderr, "\nminibatchiterator: entering %d-th repeat pass through the data\n", datapass+1);
+            fprintf (stderr, "\nminibatchiterator: entering %zu-th repeat pass through the data\n", datapass+1);
        }
        fillorclear();
    }
--- a/DataReader/HTKMLFReader_linux/minibatchsourcehelpers.h
+++ b/DataReader/HTKMLFReader_linux/minibatchsourcehelpers.h
@ -249,7 +249,7 @@ public:
                        retries++;
                    }
                }
-                fprintf (stderr, "randomordering: %d retries for %d elements (%.1f%%) to ensure window condition\n", retries, map.size(), 100.0 * retries / map.size());
+                fprintf (stderr, "randomordering: %zu retries for %zu elements (%.1f%%) to ensure window condition\n", retries, map.size(), 100.0 * retries / map.size());
                // ensure the window condition
                foreach_index (t, map) assert ((size_t) t <= map[t] + randomizationrange/2 && map[t] < (size_t) t + randomizationrange/2);
    #if 1       // and a live check since I don't trust myself here yet
--- a/DataReader/HTKMLFReader_linux/rollingwindowsource.h
+++ b/DataReader/HTKMLFReader_linux/rollingwindowsource.h
@ -100,7 +100,7 @@ namespace msra { namespace dbn {
            size_t blockid = t0 / elementsperblock;
            assert (blockid * elementsperblock == t0);
            assert (blocks[blockid]);
-            fprintf (stderr, "recoverblock: releasing feature block %d [%d..%d)\n", blockid, t0, t0 + elementsperblock -1);
+            fprintf (stderr, "recoverblock: releasing feature block %zu [%zu..%zu)\n", blockid, t0, t0 + elementsperblock -1);
            blocks[blockid].reset();    // free the memory
        }
        void recoverblock (size_t t0)   // t0=block start time
@ -109,7 +109,7 @@ namespace msra { namespace dbn {
            size_t blockid = t0 / elementsperblock;
            assert (blockid * elementsperblock == t0);
            assert (!blocks[blockid]);
-            fprintf (stderr, "recoverblock: recovering feature block %d [%d..%d)\n", blockid, t0, t0 + elementsperblock -1);
+            fprintf (stderr, "recoverblock: recovering feature block %zu [%zu..%zu)\n", blockid, t0, t0 + elementsperblock -1);
            blocks[blockid].reset (newblock());
            msra::dbn::matrix & block = *blocks[blockid];
            fsetpos (f, blockid * block.sizeinpagefile());
@ -163,7 +163,7 @@ namespace msra { namespace dbn {
            // finish off last block
            flushlastblock();
            fflushOrDie (f);
-            fprintf (stderr, "biggrowablevectorarray: disk backup store created, %d frames, %ull bytes\n", (int) n, fgetpos (f));
+            fprintf (stderr, "biggrowablevectorarray: disk backup store created, %d frames, %zu bytes\n", (int) n, fgetpos (f));
            fclose (f);
            foreach_index (i, blocks) assert (!blocks[i]);   // ensure we flushed
            assert (inmembegin == inmemend);    // nothing in cache
@ -265,7 +265,7 @@ namespace msra { namespace dbn {
            //  - implement block-wise paging directly from HTK feature files through htkfeatreader
            featkind.clear();
            std::vector<float> frame;
-            fprintf (stderr, "minibatchframesource: reading %d utterances..", infiles.size());
+            fprintf (stderr, "minibatchframesource: reading %zu utterances..", infiles.size());
            size_t numclasses = 0;              // number of units found (actually max id +1)
            size_t notfound = 0;                // number of entries missing in MLF
            msra::asr::htkfeatreader reader;    // feature reader
@ -314,7 +314,7 @@ namespace msra { namespace dbn {
                        size_t labframes = labseq.empty() ? 0 : (labseq[labseq.size()-1].firstframe + labseq[labseq.size()-1].numframes);
                        if (abs ((int) labframes - (int) feat.cols()) > 0)
                        {
-                            fprintf (stderr, "\nminibatchframesource: %d-th file has small duration mismatch (%d in label vs. %d in feat file), skipping: %S", i, labframes, feat.cols(), key.c_str());
+                            fprintf (stderr, "\nminibatchframesource: %d-th file has small duration mismatch (%zu in label vs. %zu in feat file), skipping: %S", i, labframes, feat.cols(), key.c_str());
                            notfound++;
                            continue;   // skip this utterance at all
                        }
@ -369,10 +369,10 @@ namespace msra { namespace dbn {
            assert (labels.empty() || numframes == classids.size());
            if ((vdim != 0 && numframes != frames.size()) || (!labels.empty() && numframes != classids.size()))
                throw std::runtime_error ("minibatchframesource: numframes variable screwup");
-            fprintf (stderr, " %d frames read from %d utterances; %d classes\n", numframes, infiles.size(), numclasses);
+            fprintf (stderr, " %zu frames read from %zu utterances; %zu classes\n", numframes, infiles.size(), numclasses);
            if (notfound > 0)
            {
-                fprintf (stderr, "minibatchframesource: %d files out of %d not found in label set\n", notfound, infiles.size());
+                fprintf (stderr, "minibatchframesource: %zu files out of %zu not found in label set\n", notfound, infiles.size());
                if (notfound > infiles.size() / 2)
                    throw std::runtime_error ("minibatchframesource: too many files not found in label set--assuming broken configuration\n");
            }
@ -426,7 +426,7 @@ namespace msra { namespace dbn {
            const size_t te = min (ts + framesrequested, totalframes());    // do not go beyond sweep boundary
            assert (te > ts);
            if (verbosity >= 2)
-                fprintf (stderr, "getbatch: frames [%d..%d] in sweep %d\n", ts, te-1, sweep);
+                fprintf (stderr, "getbatch: frames [%zu..%zu] in sweep %zu\n", ts, te-1, sweep);

            // get random sequence (each time index occurs exactly once)
            // If the sweep changes, this will re-cache the sequence. We optimize for rare, monotonous sweep changes.
@ -548,7 +548,7 @@ namespace msra { namespace dbn {
            }


-            fprintf (stderr, "minibatchframesourcemulti: reading %d feature sets and %d label sets...", infiles.size(),labels.size());
+            fprintf (stderr, "minibatchframesourcemulti: reading %zu feature sets and %zu label sets...", infiles.size(),labels.size());

            foreach_index (m, infiles)
            {
@ -605,7 +605,7 @@ namespace msra { namespace dbn {
                            size_t labframes = labseq.empty() ? 0 : (labseq[labseq.size()-1].firstframe + labseq[labseq.size()-1].numframes);
                            if (abs ((int) labframes - (int) feat.cols()) > 0)
                            {
-                                fprintf (stderr, "\nminibatchframesourcemulti: %d-th file has small duration mismatch (%d in label vs. %d in feat file), skipping: %S", i, labframes, feat.cols(), key.c_str());
+                                fprintf (stderr, "\nminibatchframesourcemulti: %d-th file has small duration mismatch (%zu in label vs. %zu in feat file), skipping: %S", i, labframes, feat.cols(), key.c_str());
                                notfound++;
                                continue;   // skip this utterance at all
                            }
@ -686,12 +686,12 @@ namespace msra { namespace dbn {
                if (m==0)
                {
                    foreach_index (j, numclasses)
-                        fprintf (stderr, "\nminibatchframesourcemulti: read label set %d: %d classes\n", j, numclasses[j]);
+                        fprintf (stderr, "\nminibatchframesourcemulti: read label set %d: %zu classes\n", j, numclasses[j]);
                }
-                fprintf (stderr, "\nminibatchframesourcemulti: feature set %d: %d frames read from %d utterances\n", m, pframes[m]->size(), infiles[m].size());
+                fprintf (stderr, "\nminibatchframesourcemulti: feature set %d: %zu frames read from %zu utterances\n", m, pframes[m]->size(), infiles[m].size());
                if (notfound > 0)
                {
-                    fprintf (stderr, "minibatchframesourcemulti: %d files out of %d not found in label set\n", notfound, infiles[m].size());
+                    fprintf (stderr, "minibatchframesourcemulti: %zu files out of %zu not found in label set\n", notfound, infiles[m].size());
                    if (notfound > infiles[m].size() / 2)
                        throw std::runtime_error ("minibatchframesourcemulti: too many files not found in label set--assuming broken configuration\n");
                }
@ -751,7 +751,7 @@ namespace msra { namespace dbn {
            const size_t te = min (ts + framesrequested, totalframes());    // do not go beyond sweep boundary
            assert (te > ts);
            if (verbosity >= 2)
-                fprintf (stderr, "getbatch: frames [%d..%d] in sweep %d\n", ts, te-1, sweep);
+                fprintf (stderr, "getbatch: frames [%zu..%zu] in sweep %zu\n", ts, te-1, sweep);

            // get random sequence (each time index occurs exactly once)
            // If the sweep changes, this will re-cache the sequence. We optimize for rare, monotonous sweep changes.
--- a/DataReader/HTKMLFReader_linux/simplesenonehmm.h
+++ b/DataReader/HTKMLFReader_linux/simplesenonehmm.h
@ -215,7 +215,7 @@ public:
                symmap.insert (std::make_pair (hmmname, hmmindex)); // insert into hash table
            }
        }
-        fprintf (stderr, "simplesenonehmm: %d units with %d unique HMMs, %d tied states, and %d trans matrices read\n",
+        fprintf (stderr, "simplesenonehmm: %zu units with %zu unique HMMs, %zu tied states, and %zu trans matrices read\n",
                 symmap.size(), hmms.size(), statemap.size(), transPs.size());
    }

--- a/DataReader/HTKMLFReader_linux/ssematrix.h
+++ b/DataReader/HTKMLFReader_linux/ssematrix.h
@ -278,7 +278,7 @@ public:
                         bool addtoresult, const float thisscale, const float weight)
    {
        assert (a.size() == b.size());
-        assert ((15 & (int) &a[0]) == 0); assert ((15 & (int) &b[0]) == 0);   // enforce SSE alignment
+        assert ((15 & (size_t) &a[0]) == 0); assert ((15 & (size_t) &b[0]) == 0);   // enforce SSE alignment (size_t is pointer-width; long is not on LLP64)

        size_t nlong = (a.size() + 3) / 4; // number of SSE elements
        const msra::math::float4 * pa = (const msra::math::float4 *) &a[0];
@ -313,9 +313,9 @@ public:
        // for (size_t k = 0; k < 4; k++)
        //     dotprod (row, const_array_ref<float> (&cols4[k * cols4stride], cols4stride), usij[k * usijstride]);

-        assert ((15 & (int) &row[0]) == 0);
-        assert ((15 & (int) &cols4[0]) == 0);
-        assert ((15 & (int) &cols4[cols4stride]) == 0);
+        assert ((15 & (size_t) &row[0]) == 0);   // 16-byte alignment checks; size_t is pointer-width, long is not on LLP64
+        assert ((15 & (size_t) &cols4[0]) == 0);
+        assert ((15 & (size_t) &cols4[cols4stride]) == 0);
        //assert (cols4stride * 4 == cols4.size());     // (passed in one vector with 4 columns stacked on top of each other)
        //assert (row.size() * 4 == cols4.size());  // this assert is no longer appropriate because of further breaking into blocks

@ -1152,7 +1152,7 @@ public:
        foreach_coord (i, j, us)
            if (std::isnan (us(i,j)))
            {
-                fprintf (stderr, "hasnan: NaN detected at %s (%d,%d)\n", name, i, j);
+                fprintf (stderr, "hasnan: NaN detected at %s (%zu,%zu)\n", name, i, j);
                return true;
            }
 #endif
@ -1203,7 +1203,7 @@ class ssematrixfrombuffer : public ssematrixbase
 {
    void operator= (const ssematrixfrombuffer &); ssematrixfrombuffer (const ssematrixfrombuffer &);  // base cannot be assigned except by move
 public:
-    ssematrixfrombuffer() { clear(); }
+    ssematrixfrombuffer() { this->clear(); }

    // instantiate from a float vector  --buffer must be SSE-aligned
    template<class VECTOR> ssematrixfrombuffer (VECTOR & buffer, size_t n, size_t m) : ssematrixbase (buffer, n, m) {}
@ -1262,7 +1262,7 @@ template<class ssematrixbase> class ssematrix : public ssematrixbase
    static __declspec(noreturn) void failed (size_t nbytes) { static/*not thread-safe--for diagnostics only*/ char buf[80] = { 0 }; sprintf_s (buf, "allocation of SSE vector failed (%d bytes)", nbytes); throw std::bad_exception (buf); }
 #endif
 #ifdef __unix__
-    static void failed (size_t nbytes) { static/*not thread-safe--for diagnostics only*/ char buf[80] = { 0 }; sprintf_s (buf, "allocation of SSE vector failed (%d bytes)", nbytes); throw std::bad_exception (); }
+    static void failed (size_t nbytes) { static/*not thread-safe--for diagnostics only*/ char buf[80] = { 0 }; sprintf_s (buf, "allocation of SSE vector failed (%zu bytes)", nbytes); throw std::bad_exception (); }
 #endif
 #if 0   // TODO: move to separate header file numahelpers.h
    template<typename T> static T * new_sse (size_t nbytes) { T * pv = (T *) msra::numa::malloc (nbytes * sizeof (T), 16); if (pv) return pv; failed (nbytes * sizeof (T)); }
@ -1286,18 +1286,18 @@ template<class ssematrixbase> class ssematrix : public ssematrixbase
    };
 public:
    // construction
-    ssematrix() { clear(); }
-    ssematrix (size_t n, size_t m) { clear(); resize (n, m); }
-    ssematrix (size_t n) { clear(); resize (n, 1); }  // vector
-    ssematrix (const ssematrix & other) { clear(); assign (other); }
-    ssematrix (const ssematrixbase & other) { clear(); assign (other); }
+    ssematrix() { this->clear(); }
+    ssematrix (size_t n, size_t m) { this->clear(); resize (n, m); }
+    ssematrix (size_t n) { this->clear(); resize (n, 1); }  // vector
+    ssematrix (const ssematrix & other) { this->clear(); assign (other); }
+    ssematrix (const ssematrixbase & other) { this->clear(); assign (other); }
    ssematrix (ssematrix && other) { this->move (other); }
-    ssematrix (const std::vector<float> & other) { clear(); resize (other.size(), 1); foreach_index (k, other) (*this)[k] = other[k]; }
+    ssematrix (const std::vector<float> & other) { this->clear(); resize (other.size(), 1); foreach_index (k, other) (*this)[k] = other[k]; }

    // construct elementwise with a function f(i,j)
    template<typename FUNCTION> ssematrix (size_t n, size_t m, const FUNCTION & f)
    {
-        clear();
+        this->clear();
        resize (n, m);
        auto & us = *this;
        foreach_coord (i, j, us)
@ -1349,7 +1349,7 @@ public:
    void resizeonce (size_t n, size_t m)
    {
 #if 1   // BUGBUG: at end of epoch, resizes are OK... so we log but allow them
-        if (!empty() && (n != this->numrows || m != this->numcols))
+        if (!this->empty() && (n != this->numrows || m != this->numcols))
            fprintf (stderr, "resizeonce: undesired resize from %d x %d to %d x %d\n", this->numrows, this->numcols, n, m);
        resize (n, m);
 #else
@ -1431,8 +1431,8 @@ public:
    }

    // paging support (used in feature source)
-    void topagefile (FILE * f) const { if (!empty()) fwriteOrDie (this->p, sizeinpagefile(), 1, f); }
-    void frompagefile (FILE * f) { if (!empty()) freadOrDie (this->p, sizeinpagefile(), 1, f); }
+    void topagefile (FILE * f) const { if (!this->empty()) fwriteOrDie (this->p, sizeinpagefile(), 1, f); }
+    void frompagefile (FILE * f) { if (!this->empty()) freadOrDie (this->p, sizeinpagefile(), 1, f); }
    size_t sizeinpagefile() const { return this->colstride * this->numcols * sizeof (*(this->p)); }

    // getting a one-column sub-view on this
--- a/DataReader/HTKMLFReader_linux/utterancesource.h
+++ b/DataReader/HTKMLFReader_linux/utterancesource.h
@ -113,7 +113,7 @@ class minibatchutterancesource : public minibatchsource
                if (featdim == 0)
                {
                    reader.getinfo (utteranceset[0].parsedpath, featkind, featdim, sampperiod);
-                    fprintf (stderr, "requiredata: determined feature kind as %d-dimensional '%s' with frame shift %.1f ms\n", featdim, featkind.c_str(), sampperiod / 1e4);
+                    fprintf (stderr, "requiredata: determined feature kind as %zu-dimensional '%s' with frame shift %.1f ms\n", featdim, featkind.c_str(), sampperiod / 1e4);
                }
                // read all utterances; if they are in the same archive, htkfeatreader will be efficient in not closing the file
                frames.resize (featdim, totalframes);
@ -130,7 +130,7 @@ class minibatchutterancesource : public minibatchsource
                        latticesource.getlattices (utteranceset[i].key(), lattices[i], uttframes.cols());
                }
                //fprintf (stderr, "\n");
-                fprintf (stderr, "requiredata: %d utterances read\n", utteranceset.size());
+                fprintf (stderr, "requiredata: %zu utterances read\n", utteranceset.size());
            }
            catch (...)
            {
@ -297,7 +297,7 @@ public:
                throw std::runtime_error ("minibatchutterancesource: utterances < 2 frames not supported");
            if (uttframes > frameref::maxframesperutterance)
            {
-                fprintf (stderr, "minibatchutterancesource: skipping %d-th file (%d frames) because it exceeds max. frames (%d) for frameref bit field: %S", i, uttframes, frameref::maxframesperutterance, key.c_str());
+                fprintf (stderr, "minibatchutterancesource: skipping %d-th file (%zu frames) because it exceeds max. frames (%zu) for frameref bit field: %S", i, uttframes, frameref::maxframesperutterance, key.c_str());
                continue;
            }

@ -331,7 +331,7 @@ public:
                size_t labframes = labseq.empty() ? 0 : (labseq[labseq.size()-1].firstframe + labseq[labseq.size()-1].numframes);
                if (labframes != uttframes)
                {
-                    fprintf (stderr, " [duration mismatch (%d in label vs. %d in feat file), skipping %S]", labframes, uttframes, key.c_str());
+                    fprintf (stderr, " [duration mismatch (%zu in label vs. %zu in feat file), skipping %S]", labframes, uttframes, key.c_str());
                    nomlf++;
                    continue;   // skip this utterance at all
                }
@ -360,7 +360,7 @@ public:
                throw std::logic_error (msra::strfun::strprintf ("minibatchutterancesource: label duration inconsistent with feature file in MLF label set: %S", key.c_str()));
            assert (labels.empty() || classids.size() == _totalframes + utteranceset.size());
        }
-        fprintf (stderr, " %d frames in %d out of %d utterances; %d classes\n", _totalframes, utteranceset.size(),infiles.size(), numclasses);
+        fprintf (stderr, " %zu frames in %zu out of %zu utterances; %zu classes\n", _totalframes, utteranceset.size(),infiles.size(), numclasses);
        if (!labels.empty())
            foreach_index (i, utteranceset)
        {
@ -369,7 +369,7 @@ public:
        }
        if (nomlf + nolat > 0)
        {
-            fprintf (stderr, "minibatchutterancesource: out of %d files, %d files not found in label set and %d have no lattice\n", infiles.size(), nomlf, nolat);
+            fprintf (stderr, "minibatchutterancesource: out of %zu files, %zu files not found in label set and %zu have no lattice\n", infiles.size(), nomlf, nolat);
            if (nomlf + nolat > infiles.size() / 2)
                throw std::runtime_error ("minibatchutterancesource: too many files not found in label set--assuming broken configuration\n");
        }
@ -397,7 +397,7 @@ public:
            // TODO: above push_back does not actually 'move' because the internal push_back does not accept that
        }
        numutterances = utteranceset.size();
-        fprintf (stderr, "minibatchutterancesource: %d utterances grouped into %d chunks, av. chunk size: %.1f utterances, %.1f frames\n",
+        fprintf (stderr, "minibatchutterancesource: %zu utterances grouped into %zu chunks, av. chunk size: %.1f utterances, %.1f frames\n",
                 numutterances, allchunks.size(), numutterances / (double) allchunks.size(), _totalframes / (double) allchunks.size());
        // Now utterances are stored exclusively in allchunks[]. They are never referred to by a sequential utterance id at this point, only by chunk/within-chunk index.

@ -462,7 +462,7 @@ private:
            return sweep;

        currentsweep = sweep;
-        fprintf (stderr, "lazyrandomization: re-randomizing for sweep %d in %s mode\n", currentsweep, framemode ? "frame" : "utterance");
+        fprintf (stderr, "lazyrandomization: re-randomizing for sweep %zu in %s mode\n", currentsweep, framemode ? "frame" : "utterance");

        const size_t sweepts = sweep * _totalframes;     // first global frame index for this sweep

@ -749,7 +749,7 @@ private:
        if (!chunkdata.isinram())
            return;       // already out

-        fprintf (stderr, "releaserandomizedchunk: paging out randomized chunk %d (frame range [%d..%d]), %d resident in RAM\n",
+        fprintf (stderr, "releaserandomizedchunk: paging out randomized chunk %zu (frame range [%zu..%zu]), %zu resident in RAM\n",
                 k, randomizedchunks[k].globalts, randomizedchunks[k].globalte()-1, chunksinram-1);
        chunkdata.releasedata();
        chunksinram--;
@ -768,7 +768,7 @@ private:
        if (chunkdata.isinram())
            return false;

-        fprintf (stderr, "requirerandomizedchunk: paging in randomized chunk %d (frame range [%d..%d]), %d resident in RAM\n", chunkindex, chunk.globalts, chunk.globalte()-1, chunksinram+1);
+        fprintf (stderr, "requirerandomizedchunk: paging in randomized chunk %zu (frame range [%zu..%zu]), %zu resident in RAM\n", chunkindex, chunk.globalts, chunk.globalte()-1, chunksinram+1);
        msra::util::attempt (5, [&]()   // (reading from network)
        {
            chunkdata.requiredata (featkind, featdim, sampperiod, this->lattices);
@ -858,7 +858,7 @@ public:
            transcripts.clear();

            // return these utterances
-            fprintf (stderr, "getbatch: getting utterances %d..%d (%d frames out of %d requested) in sweep %d\n", spos, epos -1, mbframes, framesrequested, sweep);
+            fprintf (stderr, "getbatch: getting utterances %zu..%zu (%zu frames out of %zu requested) in sweep %zu\n", spos, epos -1, mbframes, framesrequested, sweep);
            size_t tspos = 0;   // relative start of utterance 'pos' within the returned minibatch
            for (size_t pos = spos; pos < epos; pos++)
            {
@ -922,7 +922,7 @@ public:
            const size_t lastchunk = chunkforframepos (globalte-1);
            const size_t windowbegin = randomizedchunks[firstchunk].windowbegin;
            const size_t windowend = randomizedchunks[lastchunk].windowend;
-            fprintf (stderr, "getbatch: getting randomized frames [%d..%d] (%d frames out of %d requested) in sweep %d; chunks [%d..%d] -> chunk window [%d..%d)\n",
+            fprintf (stderr, "getbatch: getting randomized frames [%zu..%zu] (%zu frames out of %zu requested) in sweep %zu; chunks [%zu..%zu] -> chunk window [%zu..%zu)\n",
                     globalts, globalte, mbframes, framesrequested, sweep, firstchunk, lastchunk, windowbegin, windowend);
            // release all data outside, and page in all data inside
            for (size_t k = 0; k < windowbegin; k++)
--- a/DataReader/HTKMLFReader_linux/utterancesourcemulti.h
+++ b/DataReader/HTKMLFReader_linux/utterancesourcemulti.h
@ -131,7 +131,7 @@ class minibatchutterancesourcemulti : public minibatchsource
                if (featdim == 0)
                {
                    reader.getinfo (utteranceset[0].parsedpath, featkind, featdim, sampperiod);
-                    fprintf (stderr, "requiredata: determined feature kind as %d-dimensional '%s' with frame shift %.1f ms\n", featdim, featkind.c_str(), sampperiod / 1e4);
+                    fprintf (stderr, "requiredata: determined feature kind as %zu-dimensional '%s' with frame shift %.1f ms\n", featdim, featkind.c_str(), sampperiod / 1e4);
                }
                // read all utterances; if they are in the same archive, htkfeatreader will be efficient in not closing the file
                frames.resize (featdim, totalframes);
@ -148,7 +148,7 @@ class minibatchutterancesourcemulti : public minibatchsource
                        latticesource.getlattices (utteranceset[i].key(), lattices[i], uttframes.cols());
                }
                //fprintf (stderr, "\n");
-                fprintf (stderr, "requiredata: %d utterances read\n", utteranceset.size());
+                fprintf (stderr, "requiredata: %zu utterances read\n", utteranceset.size());
            }
            catch (...)
            {
@ -372,7 +372,7 @@ public:
                    throw std::runtime_error ("minibatchutterancesource: utterances < 2 frames not supported");
                if (uttframes > frameref::maxframesperutterance)
                {
-                    fprintf (stderr, "minibatchutterancesource: skipping %d-th file (%d frames) because it exceeds max. frames (%d) for frameref bit field: %S", i, uttframes, frameref::maxframesperutterance, key.c_str());
+                    fprintf (stderr, "minibatchutterancesource: skipping %d-th file (%zu frames) because it exceeds max. frames (%zu) for frameref bit field: %S", i, uttframes, frameref::maxframesperutterance, key.c_str());
                    continue;
                }

@ -419,7 +419,7 @@ public:
                            size_t labframes = labseq.empty() ? 0 : (labseq[labseq.size()-1].firstframe + labseq[labseq.size()-1].numframes);
                            if (labframes != uttframes)
                            {
-                                fprintf (stderr, " [duration mismatch (%d in label vs. %d in feat file), skipping %S]", labframes, uttframes, key.c_str());
+                                fprintf (stderr, " [duration mismatch (%zu in label vs. %zu in feat file), skipping %S]", labframes, uttframes, key.c_str());
                                nomlf++;
                                continue;   // skip this utterance at all
                            }
@ -457,7 +457,7 @@ public:
                    assert(uttframes==framesaccum[i]); //ensure that number of frames is consistent in each input feature "stream"
                }
            }
-            fprintf (stderr, "feature set %d: %d frames in %d out of %d utterances\n", m, _totalframes, utteranceset.size(),infiles[m].size());
+            fprintf (stderr, "feature set %d: %zu frames in %zu out of %zu utterances\n", m, _totalframes, utteranceset.size(),infiles[m].size());

            if (!labels.empty()){
                foreach_index (j, labels){
@ -474,12 +474,12 @@ public:
            }
            if (nomlf + nolat > 0)
            {
-                fprintf (stderr, "minibatchutterancesource: out of %d files, %d files not found in label set and %d have no lattice\n", infiles.size(), nomlf, nolat);
+                fprintf (stderr, "minibatchutterancesource: out of %zu files, %zu files not found in label set and %zu have no lattice\n", infiles.size(), nomlf, nolat);
                if (nomlf + nolat > infiles[m].size() / 2)
                    throw std::runtime_error ("minibatchutterancesource: too many files not found in label set--assuming broken configuration\n");
            }

-            if (m==0) {foreach_index(j, numclasses) { fprintf(stderr,"label set %d: %d classes\n",j, numclasses[j]); } }
+            if (m==0) {foreach_index(j, numclasses) { fprintf(stderr,"label set %d: %zu classes\n",j, numclasses[j]); } }
            // distribute them over chunks
            // We simply count off frames until we reach the chunk size.
            // Note that we first randomize the chunks, i.e. when used, chunks are non-consecutive and thus cause the disk head to seek for each chunk.
@ -511,7 +511,7 @@ public:
            }

            numutterances = utteranceset.size();
-            fprintf (stderr, "minibatchutterancesource: %d utterances grouped into %d chunks, av. chunk size: %.1f utterances, %.1f frames\n",
+            fprintf (stderr, "minibatchutterancesource: %zu utterances grouped into %zu chunks, av. chunk size: %.1f utterances, %.1f frames\n",
                numutterances, thisallchunks.size(), numutterances / (double) thisallchunks.size(), _totalframes / (double) thisallchunks.size());
            // Now utterances are stored exclusively in allchunks[]. They are never referred to by a sequential utterance id at this point, only by chunk/within-chunk index.
        }
@ -600,7 +600,7 @@ private:
            return sweep;

        currentsweep = sweep;
-        fprintf (stderr, "lazyrandomization: re-randomizing for sweep %d in %s mode\n", currentsweep, framemode ? "frame" : "utterance");
+        fprintf (stderr, "lazyrandomization: re-randomizing for sweep %zu in %s mode\n", currentsweep, framemode ? "frame" : "utterance");

        const size_t sweepts = sweep * _totalframes;     // first global frame index for this sweep

@ -912,7 +912,7 @@ private:
            auto & chunkdata = randomizedchunks[m][k].getchunkdata();
            if (chunkdata.isinram())
            {
-                fprintf (stderr, "releaserandomizedchunk: paging out randomized chunk %d (frame range [%d..%d]), %d resident in RAM\n",
+                fprintf (stderr, "releaserandomizedchunk: paging out randomized chunk %zu (frame range [%zu..%zu]), %zu resident in RAM\n",
                     k, randomizedchunks[m][k].globalts, randomizedchunks[m][k].globalte()-1, chunksinram-1);
                chunkdata.releasedata();
                numreleased++;
@ -957,7 +957,7 @@ private:
            {
                auto & chunk = randomizedchunks[m][chunkindex];
                auto & chunkdata = chunk.getchunkdata();
-                fprintf (stderr, "feature set %d: requirerandomizedchunk: paging in randomized chunk %d (frame range [%d..%d]), %d resident in RAM\n", m, chunkindex, chunk.globalts, chunk.globalte()-1, chunksinram+1);
+                fprintf (stderr, "feature set %d: requirerandomizedchunk: paging in randomized chunk %zu (frame range [%zu..%zu]), %zu resident in RAM\n", m, chunkindex, chunk.globalts, chunk.globalte()-1, chunksinram+1);
                msra::util::attempt (5, [&]()   // (reading from network)
                {
                    chunkdata.requiredata (featkind[m], featdim[m], sampperiod[m], this->lattices);
@ -1069,7 +1069,7 @@ public:
                }
            }
            // return these utterances
-            fprintf (stderr, "getbatch: getting utterances %d..%d (%d frames out of %d requested) in sweep %d\n", spos, epos -1, mbframes, framesrequested, sweep);
+            fprintf (stderr, "getbatch: getting utterances %zu..%zu (%zu frames out of %zu requested) in sweep %zu\n", spos, epos -1, mbframes, framesrequested, sweep);
            size_t tspos = 0;   // relative start of utterance 'pos' within the returned minibatch
            for (size_t pos = spos; pos < epos; pos++)
            {
@ -1147,7 +1147,7 @@ public:
            const size_t lastchunk = chunkforframepos (globalte-1);
            const size_t windowbegin = randomizedchunks[0][firstchunk].windowbegin;
            const size_t windowend = randomizedchunks[0][lastchunk].windowend;
-            fprintf (stderr, "getbatch: getting randomized frames [%d..%d] (%d frames out of %d requested) in sweep %d; chunks [%d..%d] -> chunk window [%d..%d)\n",
+            fprintf (stderr, "getbatch: getting randomized frames [%zu..%zu] (%zu frames out of %zu requested) in sweep %zu; chunks [%zu..%zu] -> chunk window [%zu..%zu)\n",
                     globalts, globalte, mbframes, framesrequested, sweep, firstchunk, lastchunk, windowbegin, windowend);
            // release all data outside, and page in all data inside
            for (size_t k = 0; k < windowbegin; k++)
--- a/DataReader/Kaldi2Reader/DOCUMENTATION.txt
+++ b/DataReader/Kaldi2Reader/DOCUMENTATION.txt
@ -0,0 +1,57 @@
+Everything is the same as HTKMLFReader_linux, except:
+
+********** Features **********
+
+The features section is different:
+
+features=[
+    dim=
+    rx=
+    scpFile=
+    featureTransform=
+]
+
+rx is a text file which contains:
+
+    one Kaldi feature rxspecifier readable by RandomAccessBaseFloatMatrixReader.
+    'ark:' specifiers don't work; only 'scp:' specifiers work.
+
+scpFile is a text file generated by running:
+
+    feat-to-len FEATURE_RXSPECIFIER_FROM_ABOVE ark,t:- > TEXT_FILE_NAME
+
+    scpFile should contain one line per utterance.
+
+    If you want to run with fewer utterances, just shorten this file.
+    (It will load the feature rxspecifier but ignore utterances not present in scpFile).
+
+featureTransform is the name of a Kaldi feature transform file:
+    
+    Kaldi feature transform files are used for stacking / applying transforms to features.
+
+    An empty string (if permitted by the config file reader?) or the special string NO_FEATURE_TRANSFORM
+    tells the reader to ignore this option.
+
+********** Labels **********
+
+The labels section is also different.
+
+labels=[
+    mlfFile=
+    labelDim=
+    labelMappingFile=
+]
+
+The only difference is mlfFile, which now has a different format. It is a text file which contains:
+
+    one Kaldi label rxspecifier readable by Kaldi's copy-post binary.
+
+********** Performance **********
+
+# If you have hundreds of thousands of utterances, or if your data exceeds 50 GB, you will need:
+randomize=4320000
+
+You don't need to do anything with UseAllDataForPreComputedNode (it's ok to use all data).
+
+Read language ID DNN (stacked) train set 41728000 frames (130 GB) from scratch-raid in 597 seconds
+Read language ID DNN (stacked) valid set 4350199  frames from scratch-raid in 106 seconds
--- a/DataReader/Kaldi2Reader/DataReader.cpp
+++ b/DataReader/Kaldi2Reader/DataReader.cpp
@ -0,0 +1,181 @@
+//
+// <copyright file="DataReader.cpp" company="Microsoft">
+//     Copyright (c) Microsoft Corporation.  All rights reserved.
+// </copyright>
+//
+// DataReader.cpp : Defines the exported functions for the DLL application.
+//
+
+#include "stdafx.h"
+#include "basetypes.h"
+
+#include "htkfeatio.h"                  // for reading HTK features
+//#include "latticearchive.h"             // for reading HTK phoneme lattices (MMI training)
+#include "simplesenonehmm.h"            // for MMI scoring
+//#include "msra_mgram.h"                 // for unigram scores of ground-truth path in sequence training
+
+#include "rollingwindowsource.h"        // minibatch sources
+//#include "readaheadsource.h"
+#include "chunkevalsource.h"
+#define DATAREADER_EXPORTS
+#include "DataReader.h"
+#include "HTKMLFReader.h"
+
+namespace Microsoft { namespace MSR { namespace CNTK {
+
+template<class ElemType>
+void DATAREADER_API GetReader(IDataReader<ElemType>** preader)
+{
+    *preader = new HTKMLFReader<ElemType>();
+}
+
+extern "C" DATAREADER_API void GetReaderF(IDataReader<float>** preader)
+{
+    GetReader(preader);
+}
+extern "C" DATAREADER_API void GetReaderD(IDataReader<double>** preader)
+{
+    GetReader(preader);
+}
+
+
+// Init - Reader Initialize for multiple data sets
+// config - [in] configuration parameters for the datareader
+template<class ElemType>
+void DataReader<ElemType>::Init(const ConfigParameters& readerConfig)
+{
+    m_dataReader = new HTKMLFReader<ElemType>();
+    m_dataReader->Init(readerConfig);
+}
+
+template<class ElemType>
+void DataReader<ElemType>::GetDataReader(const ConfigParameters& /*config*/)
+{
+    NOT_IMPLEMENTED;
+}
+
+// Destroy - cleanup and remove this class
+// NOTE: this destroys the object, and it can't be used past this point
+template<class ElemType>
+void DataReader<ElemType>::Destroy()
+{
+    delete m_dataReader;
+    m_dataReader = NULL;
+}
+
+// DataReader Constructor
+// config - [in] configuration parameters for the data reader (reader-specific options, e.g. "-windowsize:11 -addenergy")
+template<class ElemType>
+DataReader<ElemType>::DataReader(const ConfigParameters& config)
+{
+    Init(config);
+}
+
+
+// destructor - cleanup temp files, etc. 
+template<class ElemType>
+DataReader<ElemType>::~DataReader()
+{
+    delete m_dataReader;
+    m_dataReader = NULL;
+}
+
+//StartMinibatchLoop - Startup a minibatch loop 
+// mbSize - [in] size of the minibatch (number of frames, etc.)
+// epoch - [in] epoch number for this loop
+// requestedEpochSamples - [in] number of samples to randomize, defaults to requestDataSize which uses the number of samples there are in the dataset
+template<class ElemType>
+void DataReader<ElemType>::StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples)
+{
+    m_dataReader->StartMinibatchLoop(mbSize, epoch, requestedEpochSamples);
+}
+
+// GetMinibatch - Get the next minibatch (features and labels)
+// matrices - [in] a map with named matrix types (e.g. 'features', 'labels') mapped to the corresponding matrix,
+//             [out] each matrix resized if necessary containing data.
+// returns - true if there are more minibatches, false if no more minibatches remain
+template<class ElemType>
+bool DataReader<ElemType>::GetMinibatch(std::map<std::wstring, Matrix<ElemType>*>& matrices)
+{
+    return m_dataReader->GetMinibatch(matrices);
+}
+
+template<class ElemType>
+size_t DataReader<ElemType>::NumberSlicesInEachRecurrentIter()
+{
+    return m_dataReader->NumberSlicesInEachRecurrentIter();
+}
+
+template<class ElemType>
+void DataReader<ElemType>::SetNbrSlicesEachRecurrentIter(const size_t sz)
+{
+    m_dataReader->SetNbrSlicesEachRecurrentIter(sz);
+}
+
+template<class ElemType>
+void DataReader<ElemType>::SetSentenceEndInBatch(std::vector<size_t> &sentenceEnd)
+{
+    m_dataReader->SetSentenceEndInBatch(sentenceEnd);
+}
+
+// GetLabelMapping - Gets the label mapping from integer index to label type 
+// returns - a map from numeric datatype to native label type 
+template<class ElemType>
+const map<typename IDataReader<ElemType>::LabelIdType, typename IDataReader<ElemType>::LabelType>& DataReader<ElemType>::GetLabelMapping(const std::wstring& sectionName)
+{
+    return m_dataReader->GetLabelMapping(sectionName);
+}
+
+// SetLabelMapping - Sets the label mapping from integer index to label 
+// labelMapping - mapping table from label values to IDs (must be 0-n)
+// note: for tasks with labels, the mapping table must be the same between a training run and a testing run 
+template<class ElemType>
+void DataReader<ElemType>::SetLabelMapping(const std::wstring& sectionName, const std::map<typename IDataReader<ElemType>::LabelIdType, typename IDataReader<ElemType>::LabelType>& labelMapping)
+{
+    m_dataReader->SetLabelMapping(sectionName, labelMapping);
+}
+
+// GetData - Gets metadata from the specified section (into CPU memory) 
+// sectionName - section name to retrieve data from
+// numRecords - number of records to read
+// data - pointer to data buffer, if NULL, dataBufferSize will be set to size of required buffer to accommodate the request
+// dataBufferSize - [in] size of the databuffer in bytes
+//                  [out] size of buffer filled with data
+// recordStart - record to start reading from, defaults to zero (start of data)
+// returns: true if data remains to be read, false if the end of data was reached
+template<class ElemType>
+bool DataReader<ElemType>::GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart)
+{
+    return m_dataReader->GetData(sectionName, numRecords, data, dataBufferSize, recordStart);
+}
+
+template<class ElemType>
+bool DataReader<ElemType>::DataEnd(EndDataType endDataType)
+{
+    return m_dataReader->DataEnd(endDataType);
+}
+
+template class DataReader<float>;
+template class DataReader<double>;
+
+// Utility function, in ConfigFile.cpp, but HTKMLFReader doesn't need that code...
+
+// Trim - trim white space off the start and end of the string
+// str - string to trim
+// NOTE: if the string consists entirely of whitespace (or is empty), it will be set to an empty string
+/*  void Trim(std::string& str)
+{
+    auto found = str.find_first_not_of(" \t");
+    if (found == npos)
+    {
+        str.erase(0);
+        return;
+    }
+    str.erase(0, found);
+    found = str.find_last_not_of(" \t");
+    if (found != npos)
+        str.erase(found+1);
+}*/
+
+
+}}}
--- a/DataReader/Kaldi2Reader/DataWriter.cpp
+++ b/DataReader/Kaldi2Reader/DataWriter.cpp
@ -0,0 +1,111 @@
+//
+// <copyright file="DataWriter.cpp" company="Microsoft">
+//     Copyright (c) Microsoft Corporation.  All rights reserved.
+// </copyright>
+//
+// DataWriter.cpp : Defines the exported functions for the DLL application.
+//
+
+#include "stdafx.h"
+#include "basetypes.h"
+
+#include "htkfeatio.h"                  // for reading HTK features
+
+#define DATAWRITER_EXPORTS
+#include "DataWriter.h"
+#include "HTKMLFWriter.h"
+
+namespace Microsoft { namespace MSR { namespace CNTK {
+
+template<class ElemType>
+void DATAWRITER_API GetWriter(IDataWriter<ElemType>** pwriter)
+{
+    *pwriter = new HTKMLFWriter<ElemType>();
+}
+
+extern "C" DATAWRITER_API void GetWriterF(IDataWriter<float>** pwriter)
+{
+    GetWriter(pwriter);
+}
+extern "C" DATAWRITER_API void GetWriterD(IDataWriter<double>** pwriter)
+{
+    GetWriter(pwriter);
+}
+
+
+template<class ElemType>
+void DataWriter<ElemType>::Init(const ConfigParameters& writerConfig)
+{
+    m_dataWriter = new HTKMLFWriter<ElemType>();
+    m_dataWriter->Init(writerConfig);
+}
+
+
+template<class ElemType>
+void DataWriter<ElemType>::GetDataWriter(const ConfigParameters& /*config*/)
+{
+    NOT_IMPLEMENTED;
+}
+
+
+// Destroy - cleanup and remove this class
+// NOTE: this destroys the object, and it can't be used past this point
+template<class ElemType>
+void DataWriter<ElemType>::Destroy()
+{
+    delete m_dataWriter;
+    m_dataWriter = NULL;
+}
+
+
+// DataWriter Constructor
+// config - [in] configuration data for the data writer
+template<class ElemType>
+DataWriter<ElemType>::DataWriter(const ConfigParameters& config)
+{
+    Init(config);
+}
+
+
+// destructor - cleanup temp files, etc. 
+template<class ElemType>
+DataWriter<ElemType>::~DataWriter()
+{
+    delete m_dataWriter;
+    m_dataWriter = NULL;
+}
+
+// GetSections - Get the sections of the file
+// sections - a map of section name to section. Data specifications from config file will be used to determine where and how to save data
+template<class ElemType>
+void DataWriter<ElemType>::GetSections(std::map<std::wstring, SectionType, nocase_compare>& sections)
+{
+    m_dataWriter->GetSections(sections);
+}
+
+// SaveData - save data in the file/files 
+// recordStart - Starting record number
+// matrices - a map of section name (section:subsection) to data pointer. Data specifications from config file will be used to determine where and how to save data
+// numRecords - number of records we are saving, can be zero if not applicable
+// datasetSize - Size of the dataset
+// byteVariableSized - for variable sized data, size of current block to be written, zero when not used, or ignored if not variable sized data
+template<class ElemType>
+bool DataWriter<ElemType>::SaveData(size_t recordStart, const std::map<std::wstring, void*, nocase_compare>& matrices, size_t numRecords, size_t datasetSize, size_t byteVariableSized)
+{
+    return m_dataWriter->SaveData(recordStart, matrices, numRecords, datasetSize, byteVariableSized);
+}
+
+// SaveMapping - save a map into the file
+// saveId - name of the section to save into (section:subsection format)
+// labelMapping - map we are saving to the file
+template<class ElemType>
+void DataWriter<ElemType>::SaveMapping(std::wstring saveId, const std::map<LabelIdType, LabelType>& labelMapping)
+{
+    m_dataWriter->SaveMapping(saveId, labelMapping);
+}
+
+//The explicit instantiation
+template class DataWriter<double>; 
+template class DataWriter<float>;
+
+}}}
--- a/DataReader/Kaldi2Reader/HTKMLFReader.cpp
+++ b/DataReader/Kaldi2Reader/HTKMLFReader.cpp
--- a/DataReader/Kaldi2Reader/HTKMLFReader.h
+++ b/DataReader/Kaldi2Reader/HTKMLFReader.h
@ -0,0 +1,114 @@
+//
+// <copyright file="HTKMLFReader.h" company="Microsoft">
+//     Copyright (c) Microsoft Corporation.  All rights reserved.
+// </copyright>
+//
+// HTKMLFReader.h - Include file for the HTK and MLF format of features and samples
+#pragma once
+#include "DataReader.h"
+
+namespace Microsoft { namespace MSR { namespace CNTK {
+
+template<class ElemType>
+class HTKMLFReader : public IDataReader<ElemType>
+{
+private:
+    msra::dbn::minibatchiterator* m_mbiter;
+    msra::dbn::minibatchsource* m_frameSource;
+    vector<msra::asr::FeatureSection *> m_trainingOrTestingFeatureSections;
+    //msra::dbn::minibatchreadaheadsource* m_readAheadSource;
+    msra::dbn::FileEvalSource* m_fileEvalSource; 
+    vector<msra::asr::FeatureSection *> m_writingFeatureSections;
+    msra::dbn::latticesource* m_lattices;
+    map<wstring,msra::lattices::lattice::htkmlfwordsequence> m_latticeMap;
+    
+    vector<bool> m_sentenceEnd;
+    bool m_readAhead;
+    bool m_truncated;
+    vector<size_t> m_processedFrame;
+    size_t m_numberOfuttsPerMinibatch;
+    size_t m_actualnumberOfuttsPerMinibatch;
+    size_t m_mbSize;
+    vector<size_t> m_toProcess;
+    vector<size_t> m_switchFrame;
+    bool m_noData;
+
+    bool m_trainOrTest; // if false, in file writing mode
+	using LabelType = typename IDataReader<ElemType>::LabelType;
+	using LabelIdType = typename IDataReader<ElemType>::LabelIdType;
+ 
+    std::map<LabelIdType, LabelType> m_idToLabelMap;
+    
+    bool m_partialMinibatch; // allow partial minibatches?
+    
+    std::vector<ElemType*> m_featuresBufferMultiUtt;
+    std::vector<size_t> m_featuresBufferAllocatedMultiUtt;
+    std::vector<ElemType*> m_labelsBufferMultiUtt;
+    std::vector<size_t> m_labelsBufferAllocatedMultiUtt;
+    std::vector<size_t> m_featuresStartIndexMultiUtt;
+    std::vector<size_t> m_labelsStartIndexMultiUtt;
+
+    std::vector<ElemType*> m_featuresBufferMultiIO;
+    std::vector<size_t> m_featuresBufferAllocatedMultiIO;
+    std::vector<ElemType*> m_labelsBufferMultiIO;
+    std::vector<size_t> m_labelsBufferAllocatedMultiIO;
+
+    std::map<std::wstring,size_t> m_featureNameToIdMap;
+    std::map<std::wstring,size_t> m_labelNameToIdMap;
+    std::map<std::wstring,size_t> m_nameToTypeMap;
+    std::map<std::wstring,size_t> m_featureNameToDimMap;
+    std::map<std::wstring,size_t> m_labelNameToDimMap;
+    // for writing outputs to files (standard single input/output network) - deprecate eventually
+    bool m_checkDictionaryKeys;
+    bool m_convertLabelsToTargets;
+    std::vector <bool> m_convertLabelsToTargetsMultiIO;
+    std::vector<std::vector<std::wstring>> m_inputFilesMultiIO;
+ 
+    size_t m_inputFileIndex;
+    std::vector<size_t> m_featDims;
+    std::vector<size_t> m_labelDims;
+
+    std::vector<std::vector<std::vector<ElemType>>>m_labelToTargetMapMultiIO;
+     
+    void PrepareForTrainingOrTesting(const ConfigParameters& config);
+    void PrepareForWriting(const ConfigParameters& config);
+    
+    bool GetMinibatchToTrainOrTest(std::map<std::wstring, Matrix<ElemType>*>&matrices);
+    bool GetMinibatchToWrite(std::map<std::wstring, Matrix<ElemType>*>&matrices);
+    
+    void StartMinibatchLoopToTrainOrTest(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
+    void StartMinibatchLoopToWrite(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
+
+    bool ReNewBufferForMultiIO(size_t i);
+
+    size_t NumberSlicesInEachRecurrentIter() { return m_numberOfuttsPerMinibatch ;} 
+    void SetNbrSlicesEachRecurrentIter(const size_t) { };
+
+     void GetDataNamesFromConfig(const ConfigParameters& readerConfig, std::vector<std::wstring>& features, std::vector<std::wstring>& labels);
+
+    
+    size_t ReadLabelToTargetMappingFile (const std::wstring& labelToTargetMappingFile, const std::wstring& labelListFile, std::vector<std::vector<ElemType>>& labelToTargetMap);
+    enum InputOutputTypes
+    {
+        real,
+        category,
+    };
+
+
+
+public:
+    virtual void Init(const ConfigParameters& config);
+    virtual void Destroy() {delete this;}
+    virtual ~HTKMLFReader();
+    virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples=requestDataSize);
+    virtual bool GetMinibatch(std::map<std::wstring, Matrix<ElemType>*>& matrices);
+    virtual const std::map<LabelIdType, LabelType>& GetLabelMapping(const std::wstring& sectionName);
+    virtual void SetLabelMapping(const std::wstring& sectionName, const std::map<LabelIdType, LabelType>& labelMapping);
+    virtual bool GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart=0);
+
+    virtual bool DataEnd(EndDataType endDataType);
+    void SetSentenceEndInBatch(vector<size_t> &/*sentenceEnd*/);
+    void SetSentenceEnd(int /*actualMbSize*/){};
+};
+
+}}}
--- a/DataReader/Kaldi2Reader/HTKMLFWriter.cpp
+++ b/DataReader/Kaldi2Reader/HTKMLFWriter.cpp
@ -0,0 +1,290 @@
+//
+// <copyright file="HTKMLFWriter.cpp" company="Microsoft">
+//     Copyright (c) Microsoft Corporation.  All rights reserved.
+// </copyright>
+//
+// HTKMLFWriter.cpp : Defines the exported functions for the DLL application.
+//
+
+#include "stdafx.h"
+#include "basetypes.h"
+
+#include "htkfeatio.h"                  // for reading HTK features
+
+//#ifndef __unix__
+#include "ssematrix.h"
+//#endif
+//#include "latticearchive.h"             // for reading HTK phoneme lattices (MMI training)
+//#include "simplesenonehmm.h"            // for MMI scoring
+//#include "msra_mgram.h"                 // for unigram scores of ground-truth path in sequence training
+
+//#include "rollingwindowsource.h"        // minibatch sources
+//#include "utterancesource.h"
+//#include "readaheadsource.h"
+//#include "chunkevalsource.h"
+//#include "minibatchiterator.h"
+
+#define DATAWRITER_EXPORTS  // creating the exports here
+#include "DataWriter.h"
+#include "HTKMLFWriter.h"
+#ifdef LEAKDETECT
+#include <vld.h> // for memory leak detection
+#endif
+
+
+
+namespace Microsoft { namespace MSR { namespace CNTK {
+
+    // Create a Data Writer
+    //DATAWRITER_API IDataWriter* DataWriterFactory(void)
+
+    // Init - configure the writer from its config section.
+    //
+    // Reads "outputNodeNames" and, for every name listed there, the sub-section of the same name:
+    //   dim       (required) recorded in udims / outputNameToDimMap
+    //   file | scpFile (one required) path of a script file listing the output files
+    //   Kaldicmd  (optional) opens a Kaldi table writer with this specifier
+    //   type      (optional, default "Real") anything other than "Real" is rejected
+    // Each script file is then read line by line; the first whitespace-delimited column of every
+    // line is kept as an output path. All script files must list the same number of entries.
+    // Throws (RuntimeError / std::runtime_error) on any missing or inconsistent setting.
+    template<class ElemType>
+    void HTKMLFWriter<ElemType>::Init(const ConfigParameters& writerConfig)
+    {
+        m_tempArray = nullptr;
+        m_tempArraySize = 0;
+
+        vector<wstring> scriptpaths;
+        vector<wstring> filelist;
+        size_t numFiles;
+        size_t firstfilesonly = SIZE_MAX;   // set to a lower value for testing
+
+        ConfigArray outputNames = writerConfig("outputNodeNames","");
+        if (outputNames.size()<1)
+            RuntimeError("writer needs at least one outputNodeName specified in config");
+        foreach_index(i, outputNames) // outputNames should map to node names
+        {
+            ConfigParameters thisOutput = writerConfig(outputNames[i]);
+
+            if (thisOutput.Exists("dim"))
+                udims.push_back(thisOutput("dim"));
+            else
+                RuntimeError("HTKMLFWriter::Init: writer need to specify dim of output");
+            if (thisOutput.Exists("file"))
+                scriptpaths.push_back(thisOutput("file"));
+            else if (thisOutput.Exists("scpFile"))
+                scriptpaths.push_back(thisOutput("scpFile"));
+            else
+                RuntimeError("HTKMLFWriter::Init: writer needs to specify scpFile for output");
+            
+            if (thisOutput.Exists("Kaldicmd"))
+            {
+                kaldicmd.push_back(thisOutput("Kaldicmd"));
+                kaldi::BaseFloatMatrixWriter wfea;
+                feature_writer.push_back(wfea);
+                // kaldicmd and feature_writer only grow for outputs that carry a Kaldicmd, so they
+                // cannot be indexed by the output position; open the pair that was just appended
+                feature_writer.back().Open(msra::strfun::utf8(kaldicmd.back()));
+            }
+ 
+            outputNameToIdMap[outputNames[i]]= i;
+            outputNameToDimMap[outputNames[i]]=udims[i];
+            wstring type = thisOutput("type","Real");
+            if (type == L"Real")
+            {
+                outputNameToTypeMap[outputNames[i]] = OutputTypes::outputReal;
+            }
+            else
+            {
+                throw std::runtime_error ("HTKMLFWriter::Init: output type for writer output expected to be Real");
+            }
+        }
+
+        numFiles=0;
+        foreach_index(i,scriptpaths)
+        {
+            filelist.clear();
+            std::wstring scriptPath = scriptpaths[i];
+            fprintf(stderr, "HTKMLFWriter::Init: reading output script file %S ...", scriptPath.c_str());
+            size_t n = 0;
+            for (msra::files::textreader reader(scriptPath); reader && filelist.size() <= firstfilesonly/*optimization*/; )
+            {
+                // only the first column of each script line is used as the output path
+                std::wstring line = reader.wgetline();
+                wstringstream ss(line);
+                std::wstring first_col;
+                ss >> first_col;
+                filelist.push_back (first_col); //LEOTODO
+                n++;
+            }
+
+            fprintf (stderr, " %zu entries\n", n);
+
+            // the first script file defines the expected count; all others must match it
+            if (i==0)
+                numFiles=n;
+            else
+                if (n!=numFiles)
+                    throw std::runtime_error (msra::strfun::strprintf ("HTKMLFWriter:Init: number of files in each scriptfile inconsistent (%zu vs. %zu)", numFiles,n));   // both counts are size_t
+
+            outputFiles.push_back(filelist);
+        }
+        outputFileIndex=0;
+        sampPeriod=100000;
+
+    }
+
+    // Destroy - free the scratch conversion buffer, then close every Kaldi table writer
+    // that Init opened (one log line per writer).
+    template<class ElemType>
+    void HTKMLFWriter<ElemType>::Destroy()
+    {
+        delete [] m_tempArray;
+        m_tempArraySize = 0;
+        m_tempArray = nullptr;
+
+        for (auto & writer : feature_writer)
+        {
+            writer.Close();
+            fprintf(stderr, "Closed Kaldi writer\n");
+        }
+    }
+
+    // GetSections - intentionally empty: this writer leaves the passed-in section map untouched.
+    template<class ElemType>
+    void HTKMLFWriter<ElemType>::GetSections(std::map<std::wstring, SectionType, nocase_compare>& /*sections*/)
+    {
+    }
+
+    // SaveData - write the given output matrices to the next entry of each script file list.
+    //
+    // matrices maps an output name to a Matrix<ElemType>* (passed as void*). The remaining
+    // parameters are ignored. Every call consumes one entry (outputFileIndex) of the script lists.
+    //  - If no Kaldicmd was configured, each matrix is handed to SaveToKaldiFile().
+    //  - Otherwise each matrix is copied, transposed (one row per matrix column), into a
+    //    kaldi::Matrix and written through the Kaldi table writers in map iteration order;
+    //    the table key is the output path with directory and extension removed.
+    // Always returns true; failures are reported by throwing.
+    template<class ElemType>
+    bool HTKMLFWriter<ElemType>::SaveData(size_t /*recordStart*/, const std::map<std::wstring, void*, nocase_compare>& matrices, size_t /*numRecords*/, size_t /*datasetSize*/, size_t /*byteVariableSized*/)
+    {
+        // Init verifies that all script lists have the same length, so list 0 bounds them all
+        if (outputFileIndex>=outputFiles[0].size())
+            RuntimeError("index for output scp file out of range...");
+
+        if (kaldicmd.size() == 0)
+        {
+            for (auto iter = matrices.begin(); iter != matrices.end(); ++iter)
+            {
+                wstring outputName = iter->first;
+                Matrix<ElemType>& outputData = *(static_cast<Matrix<ElemType>*>(iter->second));
+                size_t id = outputNameToIdMap[outputName];
+                size_t dim = outputNameToDimMap[outputName];
+                wstring outFile = outputFiles[id][outputFileIndex];
+            
+                assert(outputData.GetNumRows()==dim); dim;  // 'dim;' keeps release builds free of unused-variable warnings
+
+                SaveToKaldiFile(outFile,outputData);
+            }
+        }
+        else
+        {
+            // position of the Kaldi writer that receives the current matrix; kept separate from
+            // the row/column loop indices below (the original reused the name 'i' for both)
+            size_t writerIndex = 0;
+            for (auto iter = matrices.begin(); iter != matrices.end(); ++iter)
+            {
+                if (writerIndex >= feature_writer.size())
+                    RuntimeError("HTKMLFWriter::SaveData: more output matrices than Kaldi writers opened in Init");
+
+                wstring outputName = iter->first;
+                Matrix<ElemType>& outputData = *(static_cast<Matrix<ElemType>*>(iter->second));
+                size_t id = outputNameToIdMap[outputName];
+                size_t dim = outputNameToDimMap[outputName];
+                wstring outFile = outputFiles[id][outputFileIndex];
+
+                assert(outputData.GetNumRows()==dim); dim;  // 'dim;' keeps release builds free of unused-variable warnings
+
+                const std::string outputPath = msra::strfun::utf8(outFile);
+                const std::string wfea = "ark:" + outputPath;               // only used in log messages
+                const std::string file_key = removeExtension(basename(outputPath));
+
+                const int numRows = (int) outputData.GetNumRows();
+                const int numCols = (int) outputData.GetNumCols();
+
+                // value substituted for any entry above 50 (loop-invariant, so computed once)
+                const float overflowValue = -(float)log(1.0/outputData.GetNumCols());
+
+                kaldi::Matrix<kaldi::BaseFloat> nnet_out_host;
+                nnet_out_host.Resize(numCols, numRows);
+                outputData.CopyToArray(m_tempArray, m_tempArraySize);
+                ElemType * pValue = m_tempArray;
+
+                // the scratch array is consumed sequentially, column after column; each source
+                // column becomes one row of the Kaldi matrix
+                for (int col = 0; col < numCols; col++)
+                {
+                    for (int row = 0; row < numRows; row++)
+                    {
+                        nnet_out_host(col,row) = (float)*pValue++;
+                        if (nnet_out_host(col,row) > 50)
+                        {
+                            nnet_out_host(col,row) = overflowValue;
+                            fprintf (stderr, "overflowed!! : %d %d frames of %s\n", row, col, wfea.c_str());
+                        }
+                    }
+                }
+
+                fprintf (stderr, "evaluate: writing %zu frames of %s\n", outputData.GetNumCols(), wfea.c_str());
+                feature_writer[writerIndex].Write(file_key, nnet_out_host);
+                writerIndex++;
+            }
+        }
+
+        outputFileIndex++;
+        return true;
+    }
+
+    // SaveToFile - convert outputData to a float msra::dbn::matrix of the same shape and
+    // write it to outputFile through htkfeatwriter::write (feature kind "USER", period
+    // sampPeriod), creating intermediate directories and retrying the write up to 5 times.
+    // NaN/INF entries are only reported on stderr, not altered.
+    template<class ElemType>
+    void HTKMLFWriter<ElemType>::SaveToFile(std::wstring& outputFile, Matrix<ElemType>& outputData)
+    {
+        const size_t numRows = outputData.GetNumRows();
+        const size_t numCols = outputData.GetNumCols();
+
+        msra::dbn::matrix output;
+        output.resize(numRows, numCols);
+        outputData.CopyToArray(m_tempArray, m_tempArraySize);
+
+        // the scratch array is read sequentially, one full column after another
+        const ElemType * src = m_tempArray;
+        for (size_t col = 0; col < numCols; col++)
+            for (size_t row = 0; row < numRows; row++)
+                output(row, col) = (float) *src++;
+
+        // sanity check on the values about to be written
+        const size_t badValues = output.countnaninf();
+        if (badValues > 0)
+            fprintf (stderr, "chunkeval: %d NaNs or INF detected in '%S' (%d frames)\n", (int) badValues, outputFile.c_str(), (int) output.cols());
+
+        // save it
+        msra::files::make_intermediate_dirs (outputFile);
+        msra::util::attempt (5, [&]()
+        {
+            msra::asr::htkfeatwriter::write (outputFile, "USER", sampPeriod, output);
+        });
+
+        fprintf (stderr, "evaluate: writing %zu frames of %S\n", output.cols(), outputFile.c_str());
+    }
+
+    // SaveToKaldiFile - same conversion as SaveToFile, but the matrix is written through
+    // htkfeatwriter::writeKaldi, which additionally receives sizeof(ElemType).
+    // Intermediate directories are created and the write is retried up to 5 times;
+    // NaN/INF entries are only reported on stderr, not altered.
+    template<class ElemType>
+    void HTKMLFWriter<ElemType>::SaveToKaldiFile(std::wstring& outputFile, Matrix<ElemType>& outputData)
+    {
+        const size_t numRows = outputData.GetNumRows();
+        const size_t numCols = outputData.GetNumCols();
+
+        msra::dbn::matrix output;
+        output.resize(numRows, numCols);
+        outputData.CopyToArray(m_tempArray, m_tempArraySize);
+
+        // the scratch array is read sequentially, one full column after another
+        const ElemType * src = m_tempArray;
+        for (size_t col = 0; col < numCols; col++)
+            for (size_t row = 0; row < numRows; row++)
+                output(row, col) = (float) *src++;
+
+        // sanity check on the values about to be written
+        const size_t badValues = output.countnaninf();
+        if (badValues > 0)
+            fprintf (stderr, "chunkeval: %d NaNs or INF detected in '%S' (%d frames)\n", (int) badValues, outputFile.c_str(), (int) output.cols());
+
+        // save it
+        msra::files::make_intermediate_dirs (outputFile);
+        msra::util::attempt (5, [&]()
+        {
+            msra::asr::htkfeatwriter::writeKaldi (outputFile, "USER", sampPeriod, output, sizeof(ElemType));
+        });
+
+        fprintf (stderr, "evaluate: writing %zu frames of %S\n", output.cols(), outputFile.c_str());
+    }
+
+
+
+    // SaveMapping - intentionally empty: this writer does not store label mappings.
+    // Both parameters are unused, so their names are commented out like in the other stubs.
+    template<class ElemType>
+    void HTKMLFWriter<ElemType>::SaveMapping(std::wstring /*saveId*/, const std::map<LabelIdType, LabelType>& /*labelMapping*/)
+    {
+    }
+   
+    template class HTKMLFWriter<float>;
+    template class HTKMLFWriter<double>;
+
+}}}
--- a/DataReader/Kaldi2Reader/HTKMLFWriter.h
+++ b/DataReader/Kaldi2Reader/HTKMLFWriter.h
@ -0,0 +1,49 @@
+//
+// <copyright file="HTKMLFWriter.h" company="Microsoft">
+//     Copyright (c) Microsoft Corporation.  All rights reserved.
+// </copyright>
+//
+// HTKMLFWriter.h - Include file for the HTK and MLF format of features and samples 
+#pragma once
+#include "DataWriter.h"
+#include <map>
+#include <vector>
+
+namespace Microsoft { namespace MSR { namespace CNTK {
+
+// HTKMLFWriter - IDataWriter implementation that writes network output matrices either
+// to per-utterance feature files or through Kaldi table writers (see HTKMLFWriter.cpp).
+// Init() must be called before SaveData(); Destroy() releases the scratch buffer and
+// closes the Kaldi writers.
+template<class ElemType>
+class HTKMLFWriter : public IDataWriter<ElemType>
+{
+private:
+    std::vector<size_t> outputDims;                         // NOTE(review): not referenced by HTKMLFWriter.cpp in this change
+    std::vector<std::vector<std::wstring>> outputFiles;     // [output id][utterance] paths read from the script files
+    std::vector<std::wstring> kaldicmd;                     // one entry per output that has a "Kaldicmd" setting
+    std::vector<kaldi::BaseFloatMatrixWriter> feature_writer;   // one writer per kaldicmd entry
+    std::vector<size_t> udims;                              // [output id] configured "dim"
+    std::map<std::wstring,size_t> outputNameToIdMap;        // output name -> position in outputNodeNames
+    std::map<std::wstring,size_t> outputNameToDimMap;       // output name -> configured "dim"
+    std::map<std::wstring,size_t> outputNameToTypeMap;      // output name -> OutputTypes value
+    unsigned int sampPeriod;                                // sample period passed to the feature writer; set in Init
+    size_t outputFileIndex;                                 // next entry of the script lists to write; set in Init
+    void SaveToFile(std::wstring& outputFile, Matrix<ElemType>& outputData);
+    void SaveToKaldiFile(std::wstring& outputFile, Matrix<ElemType>& outputData);
+    // scratch buffer handed to Matrix::CopyToArray; given a defined initial state so that
+    // Destroy() is safe even if Init() was never called
+    ElemType * m_tempArray = nullptr;
+    size_t m_tempArraySize = 0;
+
+    enum OutputTypes
+    {
+        outputReal,
+        outputCategory,
+    };
+
+public:
+    using LabelType = typename IDataWriter<ElemType>::LabelType;
+    using LabelIdType = typename IDataWriter<ElemType>::LabelIdType;
+    virtual void Init(const ConfigParameters& writerConfig);
+    virtual void Destroy();
+    virtual void GetSections(std::map<std::wstring, SectionType, nocase_compare>& sections);
+    virtual bool SaveData(size_t recordStart, const std::map<std::wstring, void*, nocase_compare>& matrices, size_t numRecords, size_t datasetSize, size_t byteVariableSized);
+    virtual void SaveMapping(std::wstring saveId, const std::map<LabelIdType, LabelType>& labelMapping);
+};
+
+}}}
--- a/DataReader/Kaldi2Reader/basetypes.h
+++ b/DataReader/Kaldi2Reader/basetypes.h
--- a/DataReader/Kaldi2Reader/biggrowablevectors.h
+++ b/DataReader/Kaldi2Reader/biggrowablevectors.h
@ -0,0 +1,122 @@
+//
+// <copyright file="biggrowablevectors.h" company="Microsoft">
+//     Copyright (c) Microsoft Corporation.  All rights reserved.
+// </copyright>
+//
+// biggrowablevectors.h -- big growable vector that uses two layers and optionally a disk backing store for paging
+
+#pragma once
+
+namespace msra { namespace dbn {
+
+// ---------------------------------------------------------------------------
+// growablevectorbase -- helper for two-layer growable random-access array
+// This allows both a fully allocated vector (with push_back()), e.g. for uids,
+// as well as a partially allocated one (content managed by derived class), for features and lattice blocks.
+// Elements are grouped into blocks of 'elementsperblock'; element t lives in
+// block t / elementsperblock at offset t % elementsperblock. A block slot may be
+// empty (null) until a derived class allocates or commits it.
+// TODO:
+//  - test this (make copy of binary first before full compilation; or rebuild the previous version)
+//  - fully move in-mem range here, test again
+//  - then we can move towards paging from archive directly (biggrowablevectorarray gets tossed)
+// ---------------------------------------------------------------------------
+template<class BLOCKTYPE> class growablevectorbase
+{
+protected:  // fix this later
+    const size_t elementsperblock;
+    size_t n;                                           // number of elements
+    std::vector<std::unique_ptr<BLOCKTYPE>> blocks;     // the data blocks
+    void operator= (const growablevectorbase &);        // (non-assignable)
+    void check (size_t t) const { if (t >= n) throw std::logic_error ("growablevectorbase: out of bounds"); }   // bounds check helper
+
+    // resize intermediate level, but do not allocate blocks
+    // (may deallocate if shrinking)
+    void resize_without_commit (size_t T)
+    {
+        blocks.resize ((T + elementsperblock-1) / elementsperblock);
+        n = T;
+        // TODO: update allocated range
+    }
+
+    // commit memory
+    // begin/end must be block boundaries
+    // Takes ownership of blockdata in all cases: it is deleted if the range is rejected.
+    void commit (size_t begin, size_t end, BLOCKTYPE * blockdata)
+    {
+        std::unique_ptr<BLOCKTYPE> incoming (blockdata);   // own it first so a throwing range check cannot leak it
+        getblockptr (begin, end).swap (incoming);          // previous block (if any) is released when 'incoming' dies
+        // TODO: update allocated range  --also enforce consecutiveness
+    }
+
+    // flush a block
+    // begin/end must be block boundaries
+    void flush (size_t begin, size_t end)
+    {
+        getblockptr (begin, end).reset();                   // release it
+        // TODO: update allocated range  --also enforce consecutiveness
+    }
+
+    // helper to get a block pointer for the block that contains element t
+    std::unique_ptr<BLOCKTYPE> & getblockptr (size_t t) // const
+    {
+        check (t);
+        return blocks[t / elementsperblock];
+    }
+
+    // helper to get a block pointer, with block referenced as its entire range
+    // (not const: it hands out a mutable slot and relies on the non-const overload above)
+    std::unique_ptr<BLOCKTYPE> & getblockptr (size_t begin, size_t end)
+    {
+        // BUGBUG: last block may be shorter than elementsperblock
+        if (end - begin != elementsperblock || getblockt (begin) != 0)
+            throw std::logic_error ("growablevectorbase: non-block boundaries passed to block-level function");
+        return getblockptr (begin);
+    }
+public:
+    growablevectorbase (size_t elementsperblock) : elementsperblock (elementsperblock), n (0) { blocks.reserve (1000); }
+    size_t size() const { return n; }       // number of frames
+    bool empty() const { return size() == 0; }
+
+    // to access an element t -> getblock(t)[getblockt(t)]
+    // (the block must have been allocated; a null slot is dereferenced unchecked)
+    BLOCKTYPE & getblock (size_t t) const
+    {
+        check (t);
+        const size_t blockid = t / elementsperblock;
+        return *blocks[blockid].get();
+    }
+
+    // offset of element t inside its block
+    size_t getblockt (size_t t) const
+    {
+        check (t);
+        return t % elementsperblock;
+    }
+};
+
+// ---------------------------------------------------------------------------
+// biggrowablevector -- big vector we can push_back to
+// Storage is a sequence of std::vector<ELEMTYPE> blocks of 65536 elements each,
+// allocated on demand as the vector grows.
+// ---------------------------------------------------------------------------
+template<typename ELEMTYPE> class biggrowablevector : public growablevectorbase<std::vector<ELEMTYPE>>
+{
+    typedef std::vector<ELEMTYPE> blocktype;    // one storage block
+
+public:
+    biggrowablevector() : growablevectorbase<blocktype> (65536) { }
+
+    // append one element, allocating its block if this is the block's first element
+    template<typename VALTYPE> void push_back (VALTYPE e)   // VALTYPE could be an rvalue reference
+    {
+        const size_t pos = this->size();
+        this->resize_without_commit (pos + 1);
+        std::unique_ptr<blocktype> & slot = this->getblockptr (pos);
+        if (!slot)
+            slot.reset (new blocktype (this->elementsperblock));
+        (*slot)[this->getblockt (pos)] = e;
+    }
+
+    // element access (bounds-checked by the base class)
+          ELEMTYPE & operator[] (size_t t)       { return this->getblock(t)[this->getblockt (t)]; }
+    const ELEMTYPE & operator[] (size_t t) const { return this->getblock(t)[this->getblockt (t)]; }
+
+    // set the size to n elements and make sure every block slot is allocated
+    void resize (const size_t n)
+    {
+        this->resize_without_commit (n);
+        for (auto & slot : this->blocks)
+            if (!slot)
+                slot.reset (new blocktype (this->elementsperblock));
+    }
+};
+
+};};
--- a/DataReader/Kaldi2Reader/chunkevalsource.h
+++ b/DataReader/Kaldi2Reader/chunkevalsource.h
@ -0,0 +1,373 @@
+//
+// <copyright file="chunkevalsource.h" company="Microsoft">
+//     Copyright (c) Microsoft Corporation.  All rights reserved.
+// </copyright>
+//
+#pragma once
+
+
+//#include <objbase.h>
+#include "basetypes.h"                  // for attempt()
+#include "htkfeatio.h"                  // for reading HTK features
+#include "minibatchsourcehelpers.h"
+
+#ifndef __unix__
+#include "ssematrix.h"
+#endif
+
+#ifdef LEAKDETECT
+#include <vld.h> // for memory leak detection
+#endif
+
+namespace msra { namespace dbn {
+
+    // chunkevalsource -- FIFO that collects the feature frames of several files, turns them
+    // into one augmented feature matrix (createevalminibatch) and, given the matching
+    // prediction matrix, writes one output file per appended input file (writetofiles).
+    class chunkevalsource // : public numamodelmanager
+    {
+        const size_t chunksize;                 // actual block size to perform computation on
+
+        // data FIFO
+        msra::dbn::matrix feat;
+        std::vector<std::vector<float>> frames; // [t] all feature frames concatenated into a big block
+        std::vector<char> boundaryflags;        // [t] -1 for first and +1 last frame, 0 else (for augmentneighbors())
+        std::vector<size_t> numframes;          // [k] number of frames for all appended files
+        std::vector<std::wstring> outpaths;     // [k] and their pathnames
+        std::vector<unsigned int> sampperiods;  // [k] and sample periods (they should really all be the same...)
+        size_t vdim; // input dimension
+        size_t udim; // output dimension
+        bool minibatchready;                    // true between createevalminibatch() and the next clear()
+        void operator=(const chunkevalsource &);
+    private:
+        void clear()    // empty the FIFO
+        {
+            frames.clear();
+            boundaryflags.clear();
+            numframes.clear();
+            outpaths.clear();
+            sampperiods.clear();
+            minibatchready=false;
+        }
+
+        // write the columns of pred to the output files, numframes[k] columns per file in
+        // the order the files were appended, then empty the FIFO
+        void saveandflush(msra::dbn::matrix &pred)
+        {
+            const size_t framesinblock = frames.size();
+
+            // write out all files
+            size_t firstframe = 0;
+            foreach_index (k, numframes)
+            {
+                const wstring & outfile = outpaths[k];
+                unsigned int sampperiod = sampperiods[k];
+                size_t n = numframes[k];
+                msra::files::make_intermediate_dirs (outfile);
+                fprintf (stderr, "saveandflush: writing %zu frames to %S\n", n, outfile.c_str());
+                msra::dbn::matrixstripe thispred (pred, firstframe, n);
+                // some sanity check for the data we've written
+                const size_t nansinf = thispred.countnaninf();
+                if (nansinf > 0)
+                    fprintf (stderr, "chunkeval: %d NaNs or INF detected in '%S' (%d frames)\n", (int) nansinf, outfile.c_str(), (int) thispred.cols());
+                // save it
+                msra::util::attempt (5, [&]()
+                {
+                    msra::asr::htkfeatwriter::write (outfile, "USER", sampperiod, thispred);
+                });
+                firstframe += n;
+            }
+            assert (firstframe == framesinblock); framesinblock;
+
+            // and we are done --forget the FIFO content & get ready for next chunk
+            clear();
+
+        }
+
+    public:
+        // initializers are listed in declaration order; minibatchready starts out false so that
+        // isminibatchready() is well-defined before the first minibatch is created
+        chunkevalsource (size_t numinput, size_t numoutput, size_t chunksize)
+            :chunksize(chunksize),vdim(numinput),udim(numoutput),minibatchready(false)
+        {         
+            frames.reserve (chunksize * 2);    
+            feat.resize(vdim,chunksize); // initialize to size chunksize
+        }
+
+        // append data to chunk
+        // feat holds one frame per column; files with fewer than 2 frames are rejected
+        template<class MATRIX> void addfile (const MATRIX & feat, const string & featkind, unsigned int sampperiod, const std::wstring & outpath)
+        {
+            // append to frames; also expand neighbor frames
+            if (feat.cols() < 2)
+                throw std::runtime_error ("evaltofile: utterances < 2 frames not supported");
+            foreach_column (t, feat)
+            {
+                std::vector<float> v (&feat(0,t), &feat(0,t) + feat.rows());
+                frames.push_back (v);
+                boundaryflags.push_back ((t == 0) ? -1 : (t == feat.cols() -1) ? +1 : 0);
+            }
+
+            numframes.push_back (feat.cols());
+            outpaths.push_back (outpath);
+            sampperiods.push_back (sampperiod);
+            
+        }
+
+        // build the augmented feature matrix for everything currently in the FIFO
+        void createevalminibatch()
+        {
+            const size_t framesinblock = frames.size();
+            feat.resize(vdim, framesinblock);   // input features for whole utt (col vectors)
+            // augment the features
+            msra::dbn::augmentneighbors (frames, boundaryflags, 0, framesinblock, feat);
+            minibatchready=true;
+        }
+
+        void writetofiles(msra::dbn::matrix &pred){ saveandflush(pred); }
+
+        // returns a copy of the feature matrix; only valid once a minibatch has been created
+        msra::dbn::matrix chunkofframes() { assert(minibatchready); return feat; }
+
+        bool isminibatchready() { return minibatchready; }
+
+        size_t currentchunksize() { return frames.size(); }
+        void flushinput(){createevalminibatch();}
+        void reset() { clear(); }
+
+    };
+
+
+    // chunkevalsourcemulti -- variant of chunkevalsource that keeps one frame FIFO, feature
+    // matrix, path list and sample-period list per input stream (selected by 'index').
+    // boundaryflags and numframes are shared and are recorded from stream 0 only.
+    class chunkevalsourcemulti // : public numamodelmanager
+    {
+        const size_t chunksize;                 // actual block size to perform computation on
+
+        // data FIFO
+        std::vector<msra::dbn::matrix> feat;
+        std::vector<std::vector<std::vector<float>>> framesmulti; // [t] all feature frames concatenated into a big block
+        std::vector<char> boundaryflags;        // [t] -1 for first and +1 last frame, 0 else (for augmentneighbors())
+        std::vector<size_t> numframes;          // [k] number of frames for all appended files
+        std::vector<std::vector<std::wstring>> outpaths;     // [k] and their pathnames
+        std::vector<std::vector<unsigned int>> sampperiods;  // [k] and sample periods (they should really all be the same...)
+        std::vector<size_t> vdims; // input dimension
+        std::vector<size_t> udims; // output dimension
+        bool minibatchready;                    // true between createevalminibatch() and the next clear()
+
+                void operator=(const chunkevalsourcemulti &);
+    private:
+        void clear()    // empty the FIFO
+        {
+            foreach_index(i, vdims)
+            {
+                framesmulti[i].clear();
+                outpaths[i].clear();
+                sampperiods[i].clear();
+            }
+            boundaryflags.clear();
+            numframes.clear();
+            minibatchready=false;
+        }
+
+        // write the columns of pred to the output files of stream 'index', numframes[k]
+        // columns per file; unlike chunkevalsource this does NOT clear the FIFO (use reset())
+        void saveandflush(msra::dbn::matrix &pred, size_t index)
+        {
+            const size_t framesinblock = framesmulti[index].size();
+
+            // write out all files
+            size_t firstframe = 0;
+            foreach_index (k, numframes)
+            {
+                const wstring & outfile = outpaths[index][k];
+                unsigned int sampperiod = sampperiods[index][k];
+                size_t n = numframes[k];
+                msra::files::make_intermediate_dirs (outfile);
+                fprintf (stderr, "saveandflush: writing %zu frames to %S\n", n, outfile.c_str());
+                msra::dbn::matrixstripe thispred (pred, firstframe, n);
+                // some sanity check for the data we've written
+                const size_t nansinf = thispred.countnaninf();
+                if (nansinf > 0)
+                    fprintf (stderr, "chunkeval: %d NaNs or INF detected in '%S' (%d frames)\n", (int) nansinf, outfile.c_str(), (int) thispred.cols());
+                // save it
+                msra::util::attempt (5, [&]()
+                {
+                    msra::asr::htkfeatwriter::write (outfile, "USER", sampperiod, thispred);
+                });
+                firstframe += n;
+            }
+            assert (firstframe == framesinblock); framesinblock;
+
+            // and we are done --forget the FIFO content & get ready for next chunk
+            
+        }
+
+    public:
+        // initializers are listed in declaration order; minibatchready starts out false so that
+        // isminibatchready() is well-defined before the first minibatch is created
+        chunkevalsourcemulti (std::vector<size_t> vdims, std::vector<size_t> udims, size_t chunksize)
+            :chunksize(chunksize),vdims(vdims),udims(udims),minibatchready(false)
+        {     
+
+            // one FIFO / feature matrix / path list / period list per input stream
+            foreach_index(i, vdims)
+            {
+                msra::dbn::matrix thisfeat;
+                std::vector<std::vector<float>> frames; // [t] all feature frames concatenated into a big block
+                
+                frames.reserve(chunksize * 2);
+                framesmulti.push_back(frames);
+                //framesmulti[i].reserve (chunksize * 2);    
+                
+                thisfeat.resize(vdims[i], chunksize);
+                feat.push_back(thisfeat);
+    
+                outpaths.push_back(std::vector<std::wstring>());
+                sampperiods.push_back(std::vector<unsigned int>());
+                //feat[i].resize(vdims[i],chunksize); // initialize to size chunksize
+            }
+        }
+
+        // append data to chunk
+        // feat holds one frame per column; files with fewer than 2 frames are rejected
+        template<class MATRIX> void addfile (const MATRIX & feat, const string & featkind, unsigned int sampperiod, const std::wstring & outpath, size_t index)
+        {
+            // append to frames; also expand neighbor frames
+            if (feat.cols() < 2)
+                throw std::runtime_error ("evaltofile: utterances < 2 frames not supported");
+            foreach_column (t, feat)
+            {
+                std::vector<float> v (&feat(0,t), &feat(0,t) + feat.rows());
+                framesmulti[index].push_back (v);
+                if (index==0)
+                    boundaryflags.push_back ((t == 0) ? -1 : (t == feat.cols() -1) ? +1 : 0);
+            }
+            if (index==0)
+                numframes.push_back (feat.cols());
+
+            outpaths[index].push_back (outpath);
+            sampperiods[index].push_back (sampperiod);
+            
+        }
+
+        // build the augmented feature matrix of every stream from its FIFO
+        void createevalminibatch()
+        {
+            foreach_index(i, framesmulti)
+            {
+                const size_t framesinblock = framesmulti[i].size();
+                feat[i].resize(vdims[i], framesinblock);   // input features for whole utt (col vectors)
+                // augment the features
+                msra::dbn::augmentneighbors (framesmulti[i], boundaryflags, 0, framesinblock, feat[i]);
+            }
+            minibatchready=true;
+        }
+
+        void writetofiles(msra::dbn::matrix &pred, size_t index){ saveandflush(pred, index); }
+
+        // returns a copy of stream 'index's feature matrix; index must be a valid position in feat
+        msra::dbn::matrix chunkofframes(size_t index) { assert(minibatchready); assert(index<feat.size()); return feat[index]; }
+
+        bool isminibatchready() { return minibatchready; }
+
+        size_t currentchunksize() { return framesmulti[0].size(); }
+        void flushinput(){createevalminibatch();}
+        void reset() { clear(); }
+
+    };
+
+    // FileEvalSource -- per-stream frame FIFO like chunkevalsourcemulti, but without output
+    // paths and with a configurable left/right context extent per stream for the
+    // neighbor-frame augmentation. boundaryFlags and numFrames are recorded from stream 0 only.
+    class FileEvalSource // : public numamodelmanager
+    {
+        const size_t chunksize;                 // actual block size to perform computation on
+
+        // data FIFO
+        std::vector<msra::dbn::matrix> feat;
+        std::vector<std::vector<std::vector<float>>> framesMulti; // [t] all feature frames concatenated into a big block
+        std::vector<char> boundaryFlags;        // [t] -1 for first and +1 last frame, 0 else (for augmentneighbors())
+        std::vector<size_t> numFrames;          // [k] number of frames for all appended files
+        std::vector<std::vector<unsigned int>> sampPeriods;  // [k] and sample periods (they should really all be the same...)
+        std::vector<size_t> vdims; // input dimension
+        std::vector<size_t> leftcontext;
+        std::vector<size_t> rightcontext;
+        bool minibatchReady;                    // true between CreateEvalMinibatch() and the next Clear()
+        size_t minibatchSize;                   // stored by SetMinibatchSize(); not read inside this class
+        size_t frameIndex;                      // reset to 0 by Clear(); not otherwise used inside this class
+
+        void operator=(const FileEvalSource &);
+
+    private:
+        void Clear()    // empty the FIFO
+        {
+            foreach_index(i, vdims)
+            {
+                framesMulti[i].clear();
+                sampPeriods[i].clear();
+            }
+            boundaryFlags.clear();
+            numFrames.clear();
+            minibatchReady=false;
+            frameIndex=0;
+        }
+
+    public:
+        // initializers are listed in declaration order; the three state members get defined
+        // start values so that IsMinibatchReady() etc. are well-defined right after construction
+        FileEvalSource(std::vector<size_t> vdims, std::vector<size_t> leftcontext, std::vector<size_t> rightcontext, size_t chunksize) :chunksize(chunksize), vdims(vdims), leftcontext(leftcontext), rightcontext(rightcontext), minibatchReady(false), minibatchSize(0), frameIndex(0)
+        {     
+            // one FIFO / feature matrix / period list per input stream
+            foreach_index(i, vdims)
+            {
+                msra::dbn::matrix thisfeat;
+                std::vector<std::vector<float>> frames; // [t] all feature frames concatenated into a big block
+                
+                frames.reserve(chunksize * 2);
+                framesMulti.push_back(frames);
+                //framesmulti[i].reserve (chunksize * 2);    
+                
+                thisfeat.resize(vdims[i], chunksize);
+                feat.push_back(thisfeat);
+    
+                sampPeriods.push_back(std::vector<unsigned int>());
+                //feat[i].resize(vdims[i],chunksize); // initialize to size chunksize
+            }
+        }
+
+        // append data to chunk
+        // feat holds one frame per column; files with fewer than 2 frames are rejected
+        template<class MATRIX> void AddFile (const MATRIX & feat, const string & /*featkind*/, unsigned int sampPeriod, size_t index)
+        {
+            // append to frames; also expand neighbor frames
+            if (feat.cols() < 2)
+                throw std::runtime_error ("evaltofile: utterances < 2 frames not supported");
+            foreach_column (t, feat)
+            {
+                std::vector<float> v (&feat(0,t), &feat(0,t) + feat.rows());
+                framesMulti[index].push_back (v);
+                if (index==0)
+                    boundaryFlags.push_back ((t == 0) ? -1 : (t == feat.cols() -1) ? +1 : 0);
+            }
+            if (index==0)
+                numFrames.push_back (feat.cols());
+
+            sampPeriods[index].push_back (sampPeriod);
+            
+        }
+
+        // build the augmented feature matrix of every stream from its FIFO
+        // NOTE(review): reads framesMulti[i][0] when both contexts are 0, so it presumably
+        // expects at least one appended file per stream -- confirm against callers
+        void CreateEvalMinibatch()
+        {
+            foreach_index(i, framesMulti)
+            {
+                const size_t framesInBlock = framesMulti[i].size();
+                feat[i].resize(vdims[i], framesInBlock);   // input features for whole utt (col vectors)
+                // augment the features
+                size_t leftextent, rightextent;
+                // page in the needed range of frames
+                if (leftcontext[i] == 0 && rightcontext[i] == 0)
+                {
+                    // no explicit context configured: derive a symmetric extent from the dimensions
+                    leftextent = rightextent = augmentationextent(framesMulti[i][0].size(), vdims[i]);
+                }
+                else
+                {
+                    leftextent = leftcontext[i];
+                    rightextent = rightcontext[i];
+                }
+
+                //msra::dbn::augmentneighbors(framesMulti[i], boundaryFlags, 0, leftcontext[i], rightcontext[i],)
+                msra::dbn::augmentneighbors (framesMulti[i], boundaryFlags, leftextent, rightextent, 0, framesInBlock, feat[i]);
+            }
+            minibatchReady=true;
+        }
+
+        void SetMinibatchSize(size_t mbSize){ minibatchSize=mbSize;}
+        // returns a copy of stream 'index's feature matrix; index must be a valid position in feat
+        msra::dbn::matrix ChunkOfFrames(size_t index) { assert(minibatchReady); assert(index<feat.size()); return feat[index]; }
+
+        bool IsMinibatchReady() { return minibatchReady; }
+
+        size_t CurrentFileSize() { return framesMulti[0].size(); }
+        void FlushInput(){CreateEvalMinibatch();}
+        void Reset() { Clear(); }
+    };
+
+    
+};};
--- a/MachineLearning/PTaskHost/dllmain.cpp
+++ b/MachineLearning/PTaskHost/dllmain.cpp
@ -1,9 +1,14 @@
+//
+// <copyright file="dllmain.cpp" company="Microsoft">
+//     Copyright (c) Microsoft Corporation.  All rights reserved.
+// </copyright>
+//
 // dllmain.cpp : Defines the entry point for the DLL application.
 #include "stdafx.h"

-BOOL APIENTRY DllMain( HMODULE hModule,
+BOOL APIENTRY DllMain( HMODULE /*hModule*/,
                       DWORD  ul_reason_for_call,
-                       LPVOID lpReserved
+                       LPVOID /*lpReserved*/
                     )
 {
    switch (ul_reason_for_call)
--- a/DataReader/Kaldi2Reader/fileutil.cpp
+++ b/DataReader/Kaldi2Reader/fileutil.cpp
@ -1,379 +1,11 @@
 //
-// fileutil.cpp - file I/O with error checking
-//
+// <copyright file="FileUtil.cpp" company="Microsoft">
 //     Copyright (c) Microsoft Corporation.  All rights reserved.
+// </copyright>
 //
-// $Log: /Speech_To_Speech_Translation/dbn/dbn/fileutil.cpp $
-// 
-// 125   1/03/13 8:53p Kaisheny
-// Asynchronous SGD using data pipe.
-// 
-// 124   9/30/12 10:46a Fseide
-// new optional parameter to fuptodate()--caller can now choose whether a
-// missing input file, with target file present, will cause a failure or
-// considers the target up-to-date
-// 
-// 123   8/20/12 12:29p V-hansu
-// fixed a major bug in freadOrDie() for chunks > 15M units (breaking into
-// chunks was broken)
-// 
-// 122   4/01/12 12:02p Fseide
-// (expanded an error message)
-// 
-// 121   11/09/11 10:01 Fseide
-// added a new overload for fgetfilelines() that returns an array of char*
-// instead of strings, to avoid mem alloc
-// 
-// 120   10/27/11 18:52 Fseide
-// updated freadOrDie() to smaller chunk size
-// 
-// 119   10/27/11 13:40 Fseide
-// freadOrDie() now explicitly breaks up large reads because CRT fread()
-// does not handle them (due to a Windows bug)
-// 
-// 118   6/10/11 9:49 Fseide
-// new function fgetfilelines() for reading text files
-// 
-// 117   3/07/11 12:13 Fseide
-// actually implemented unlinkOrDie() (was a dummy)
-// 
-// 116   12/07/10 10:03 Fseide
-// (corrected the buffer size in fsetpos() fro 65336 to 65536)
-// 
-// 115   12/03/10 10:53 Fseide
-// fsetpos() optimization when seeking forward within the current read
-// buffer
-// 
-// 114   11/18/10 4:32p Kit
-// added missing header for errno
-// 
-// 113   11/18/10 9:20 Fseide
-// a basic optimization in fsetpos() to avoid rereading the buffer if
-// fsetpos() does not actually move the file pointer
-// 
-// 112   11/17/10 15:00 Fseide
-// new function fuptodate();
-// make_intermediate_dirs() moved to namespace msra::files (all new
-// functions should be put in there)
-// 
-// 111   11/12/10 16:43 Fseide
-// bug in getfiletime(), totally broken
-// 
-// 110   11/09/10 8:56 Fseide
-// some cleanup of make_intermediate_dirs()
-// 
-// 109   11/08/10 17:07 Fseide
-// new function make_intermediate_dirs()
-// 
-// 108   11/30/09 1:32p Kit
-// 
-// 107   2/05/09 19:05 Fseide
-// fgetline() now returns a non-const pointer, because user may want to
-// post-process the line, and the returned value is a user-specified
-// buffer anyway
-// 
-// 106   1/16/09 8:59 Fseide
-// exported fskipspace()
-// 
-// 105   1/16/09 8:47 Fseide
-// (a comment added)
-// 
-// 104   1/15/09 7:38 Fseide
-// some magic to unify fgetstring() for char and wchar_t to a single
-// template function
-// 
-// 103   1/14/09 19:27 Fseide
-// new functions fsetpos() and fgetpos();
-// added missing read-error checks to fget(w)string()
-// 
-// 102   1/14/09 12:38 Fseide
-// bug fix in fgetline(): missed an error check
-// 
-// 101   1/09/09 7:40 Fseide
-// (fixed a warning)
-// 
-// 100   1/08/09 16:38 Fseide
-// fopenOrDie() now supports "-" as the pathname, referring to stdin or
-// stdout
-// 
-// 99    1/08/09 15:32 Fseide
-// new funtion expand_wildcards()
-// 
-// 98    1/05/09 8:44 Fseide
-// (added comments)
-// 
-// 97    12/24/08 14:44 Fseide
-// added an overflow check to fputwfx()
-// 
-// 96    12/12/08 10:11a Qiluo
-// (change marker of banned APIs)
-// 
-// 95    12/11/08 7:40p Qiluo
-// (change marker of banned APIs)
-// 
-// 94    12/09/08 6:59p Qiluo
-// reverted stringerror => strerror
-// 
-// 93    12/09/08 6:37p Qiluo
-// fixed a few compilation bugs
-// 
-// 92    12/09/08 6:28p Qiluo
-// strerror => stringerror
-// 
-// 91    12/01/08 2:43p Qiluo
-// add markers for banned APIs, and refine the api fixing
-// 
-// 90    11/11/08 7:34p Qiluo
-// fix bug in strnlen
-// 
-// 89    11/11/08 18:27 Fseide
-// no longer disables C4996
-// 
-// 88    11/11/08 6:04p Qiluo
-// recover the old fputstring functions
-// 
-// 87    11/10/08 2:34p Qiluo
-// remove the dependency of header "StringUtil.h"
-// 
-// 86    10/31/08 5:08p Qiluo
-// remove banned APIs
-// 
-// 85    6/24/08 19:03 Fseide
-// added fgetwstring() and fputstring() for wstrings
-// 
-// 84    6/02/08 14:11 Fseide
-// fgetwfx() and wputwfx() now a bit more tolerant
-// 
-// 83    08-05-29 18:18 Llu
-// fix the interface of fputwav
-// 
-// 82    08-05-29 14:53 Llu
-// 
-// 81    08-05-29 13:53 Llu
-// add fputwav revise fgetwav using stl instead of short *
-// 
-// 80    3/19/08 16:13 Fseide
-// (better solution to prev. problem)
-// 
-// 79    3/19/08 16:07 Fseide
-// (#ifdef'ed out fprintfOrDie() in _MANAGED builds)
-// 
-// 78    10/30/07 16:46 Fseide
-// 
-// 77    3/27/07 13:54 Fseide
-// added 'using namespace std;' (was removed from message.h as it does not
-// belong there)
-// 
-// 76    1/30/07 1:59p Kit
-// Undid updates to fgetline error handling
-// 
-// 70    12/20/06 10:48a Kit
-// increased size of line buffer for fgetline because we seem to be
-// getting large strings in some rss feeds
-// 
-// 69    06-12-04 18:30 Llu
-// (fixed an unnecessary "deprecated string function" warning under VS
-// 2005)
-// 
-// 68    11/27/06 11:40 Fseide
-// new methods fgetwfx() and fputwfx() for direct access to simple PCM WAV
-// files
-// 
-// 67    10/14/06 18:31 Fseide
-// added char* version of fexists()
-// 
-// 66    5/14/06 19:58 Fseide
-// new function fsetmode()
-// 
-// 65    3/29/06 16:10 Fseide
-// increased buffer size in fgetfile() to 64k
-// 
-// 64    3/29/06 15:36 Fseide
-// changed to reading entire file instead of line-by-line, not changing
-// newlines anymore
-// 
-// 63    3/24/06 4:40p Rogeryu
-// workaround a VC 2003 header bug (va_start macro for references) in
-// MESSAGE/ERROR functions
-// 
-// 62    3/22/06 3:31p Rogeryu
-// (comments changed)
-// 
-// 61    3/21/06 5:21p Rogeryu
-// review and fix level2_security OACR warnings
-// 
-// 60    3/21/06 9:26a Rogeryu
-// review and fix OACR warnings
-// 
-// 59    06-03-15 15:41 Yushli
-// Suppress C4996 Warning per function
-// 
-// 58    06-03-14 12:11 Yushli
-// Suppress C4996 Warning on strerror per function
-// 
-// 57    06-03-14 10:33 Yushli
-// Suppress C4996 Warning per function.
-// 
-// 56    2/28/06 1:49p Kjchen
-// suppress oacr warning
-// 
-// 55    2/24/06 8:03p Kjchen
-// depress oacr warnings
-// 
-// 54    2/21/06 11:32a Kit
-// aadded filesize64 to support large files
-// 
-// 53    1/10/06 8:23p Rogeryu
-// fix a warning
-// 
-// 52    1/09/06 7:12p Rogeryu
-// wide version of fgetline
-// 
-// 51    12/20/05 21:15 Fseide
-// changed CreateFile() to CreateFileW()
-// 
-// 50    12/19/05 22:50 Fseide
-// setfiletime() fixed, now actually works
-// 
-// 49    12/19/05 21:52 Fseide
-// fputfile() added in 8-bit string version
-// 
-// 48    12/18/05 17:01 Fseide
-// fixed file-handle leaks in error conditions
-// 
-// 47    12/15/05 20:25 Fseide
-// added getfiletime(), setfiletime(), and fputfile() for strings
-// 
-// 46    9/27/05 12:22 Fseide
-// added wstring version of renameOrDie()
-// 
-// 45    9/22/05 12:26 Fseide
-// new method fexists()
-// 
-// 44    9/15/05 11:33 Fseide
-// new version of fgetline() that avoids buffer allocations, since this
-// seems very expensive esp. when reading a file line by line with
-// fgetline()
-// 
-// 43    9/05/05 4:57p F-xyzhao
-// renameOrDie(): changed string to std::string
-// 
-// 42    9/05/05 11:00 Fseide
-// new method renameOrDie()
-// 
-// 41    8/19/05 18:19 Fseide
-// bugfixes in WAVEHEADER::write and prepare
-// 
-// 40    8/19/05 18:02 Fseide
-// WAVEHEADER::write() now flushes
-// 
-// 39    8/19/05 17:56 Fseide
-// extended WAVEHEADER with write() and update()
-// 
-// 38    8/14/05 16:56 Fseide
-// fopenOrDie() now sets large buffer if 'S' option
-// 
-// 37    8/13/05 15:37 Fseide
-// added new version of fgetline that takes a buffer
-// 
-// 36    7/28/05 18:04 Fseide
-// bug fix in fgetin24 and fputint24
-// 
-// 35    7/26/05 18:54 Fseide
-// new functions fgetint24() and fputint24()
-// 
-// 34    5/10/05 14:12 Fseide
-// (level-4 warning fixed)
-// 
-// 33    5/10/05 11:57 Fseide
-// (level-4 warnings removed)
-// 
-// 32    5/09/05 12:07 Fseide
-// fixed for-loop conformance issues
-// 
-// 31    2/27/05 17:41 Fseide
-// recovered v29 that somehow got overwritten
-// 
-// 29    2/12/05 15:21 Fseide
-// fgetdouble() and fputdouble() added
-// 
-// 28    2/05/05 12:38 Fseide
-// new methods fputfile(), fgetfile();
-// new overload for filesize()
-// 
-// 27    2/03/05 22:34 Fseide
-// added new version of fgetline() that returns an STL string
-// 
-// 26    5/31/04 10:06 Fseide
-// new methods fseekOrDie(), ftellOrDie(), unlinkOrDie(), renameOrDie()
-// 
-// 25    3/19/04 4:01p Fseide
-// fwriteOrDie(): first argument changed to const
-// 
-// 24    2/21/04 10:26 Fseide
-// (compiler warnings eliminated)
-// 
-// 23    2/19/04 9:46p V-xlshi
-// 
-// 22    2/19/04 3:44p V-xlshi
-// fgetwavraw and fgetraw function is added, fgetwav is changed but its
-// functionality is the same with the old one.
-// 
-// 21    2/03/04 8:17p V-xlshi
-// 
-// 20    9/08/03 22:55 Fseide
-// fgetwav() can now read stereo PCM files
-// 
-// 19    8/15/03 15:40 Fseide
-// new method filesize()
-// 
-// 18    8/13/03 21:06 Fseide
-// new function fputbyte()
-// 
-// 17    8/13/03 15:37 Fseide
-// an error msg corrected
-// 
-// 16    8/07/03 22:04 Fseide
-// fprintfOrDie() now really dies in case of error
-// 
-// 15    7/30/03 5:09p Fseide
-// (eliminated a compiler warning)
-// 
-// 14    03-07-30 14:17 I-rogery
-// 
-// 13    7/25/03 6:07p Fseide
-// new functions fgetbyte() and fgetwav()
-// 
-// 12    6/03/03 5:23p Fseide
-// (some compiler warnings related to size_t eliminated)
-// 
-// 11    3/27/03 3:42p Fseide
-// fwriteOrDie() rewritten to break huge blocks into chunks of 16 MB
-// because Windows std C lib can't handle fwrite() with e.g. 100 MB in one
-// call
-// 
-// 10    7/23/02 9:00p Jlzhou
-// 
-// 9     7/03/02 9:25p Fseide
-// fcompareTag() now uses STRING type for both of its arguments (before,
-// it used const char * for one of them)
-// 
-// 8     6/10/02 3:14p Fseide
-// new functions fgettoken(), fgetfloat_ascii(), fskipNewline()
-// 
-// 7     6/07/02 7:26p Fseide
-// new functions fcheckTag_ascii() and fgetint_ascii()
-// 
-// 6     6/03/02 10:58a Jlzhou
-// 
-// 5     4/15/02 1:12p Fseide
-// void fputstring (FILE * f, const TSTRING & str) and fpad() added
-// 
-// 4     4/03/02 3:56p Fseide
-// VSS keyword and copyright added
-//
-// F. Seide 5 Mar 2002
-//
+
+
+#include "stdafx.h"

 #ifndef UNDER_CE    // fixed-buffer overloads not available for wince
 #ifdef _CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES  // fixed-buffer overloads for strcpy() etc.
@ -384,17 +16,20 @@

 #include "basetypes.h"
 #include "fileutil.h"
-#include "message.h"
 #include <stdio.h>
 #include <string.h>
 #include <ctype.h>
+#ifndef __unix__
 #include "windows.h"    // for FILETIME
+#endif
 #include <algorithm>    // for std::find

 #ifndef UNDER_CE  // some headers don't exist under winCE - the appropriate definitions seem to be in stdlib.h
 #include <fcntl.h>      // for _O_BINARY/TEXT - not needed for wince
+#ifndef __unix__
 #include <io.h>         // for _setmode()
 #endif
+#endif

 #include <errno.h>

@ -413,13 +48,15 @@ static const wchar_t * strchr (const wchar_t * s, wchar_t v) { return wcschr (s,
 template<class _T> FILE * fopenStdHandle (const _T * mode)
 {
    FILE * f = strchr (mode, 'r') ? stdin : stdout;
-    if (strchr (mode, 'b') || strchr (mode, 't'))   // change binary mode
+#ifndef __unix__ // don't need binary/text distinction on unix
+    if (strchr(mode, 'b') || strchr(mode, 't'))   // change binary mode
    {
        // switch to binary mode if not yet (in case it is stdin)
        int rc = _setmode (_fileno (f), strchr (mode, 'b') ? _O_BINARY : _O_TEXT);
        if (rc == -1)
-            ERROR ("error switching stream to binary mode: %s", strerror (errno));
+            RuntimeError ("error switching stream to binary mode: %s", strerror (errno));
    }
+#endif
    return f;
 }

@ -428,8 +65,7 @@ FILE * fopenOrDie (const STRING & pathname, const char * mode)
    FILE * f = (pathname[0] == '-') ? fopenStdHandle (mode) : fopen (pathname.c_str(), mode);
    if (f == NULL)
    {
-    ERROR ("error opening file '%s': %s", pathname.c_str(), strerror (errno));
-        return NULL;    // keep OACR happy
+        RuntimeError("error opening file '%s': %s", pathname.c_str(), strerror(errno));
    }
    if (strchr (mode, 'S'))
    {   // if optimized for sequential access then use large buffer
@ -443,8 +79,7 @@ FILE * fopenOrDie (const WSTRING & pathname, const wchar_t * mode)
    FILE * f = (pathname[0] == '-') ? fopenStdHandle (mode) : _wfopen (pathname.c_str(), mode);
    if (f == NULL)
    {
-        ERROR ("error opening file '%S': %s", pathname.c_str(), strerror (errno));
-        return NULL;    // keep OACR happy
+        RuntimeError ("error opening file '%S': %s", pathname.c_str(), strerror (errno));
    }
    if (strchr (mode, 'S'))
    {   // if optimized for sequential access then use large buffer
@ -457,11 +92,12 @@ FILE * fopenOrDie (const WSTRING & pathname, const wchar_t * mode)
 // set mode to binary or text (pass 'b' or 't')
 // ----------------------------------------------------------------------------

-void fsetmode (FILE * f, char type)
+#ifndef __unix__ // don't need binary/text distinction on unix
+void fsetmode(FILE * f, char type)
 {
    if (type != 'b' && type != 't')
    {
-        ERROR ("fsetmode: invalid type '%c'");
+        RuntimeError ("fsetmode: invalid type '%c'");
    }
 #ifdef UNDER_CE // winCE and win32 have different return types for _fileno
    FILE *fd = _fileno (f);   // note: no error check possible
@ -472,9 +108,10 @@ void fsetmode (FILE * f, char type)
    int rc = _setmode (fd, mode);
    if (rc == -1)
    {
-    ERROR ("error changing file mode: %s", strerror (errno));
+    RuntimeError ("error changing file mode: %s", strerror (errno));
    }
 }
+#endif

 // ----------------------------------------------------------------------------
 // freadOrDie(): like fread() but terminate with err msg in case of error
@ -488,7 +125,7 @@ void freadOrDie (void * ptr, size_t size, size_t count, FILE * f)
        size_t chunkn = min (count, 15*1024*1024);  // BUGBUG: I surely meant this limit to be bytes, not units of 'size'...
        size_t n = fread (ptr, size, chunkn, f);
        if (n != chunkn)
-            ERROR ("error reading from file: %s", strerror (errno));
+            RuntimeError ("error reading from file: %s", strerror (errno));
        count -= n;
        ptr = n * size + (char*) ptr;
    }
@ -503,7 +140,7 @@ void freadOrDie (void * ptr, size_t size, size_t count, const HANDLE f)
        DWORD n ;
        ReadFile(f, ptr, (DWORD) chunkn, &n, NULL);
        if (n != chunkn)
-            ERROR ("error number for reading from file: %s", GetLastError());
+            RuntimeError ("error number for reading from file: %s", GetLastError());
        count -= (size_t) (n / size);
        ptr = n + (char*) ptr;
    }
@ -530,7 +167,7 @@ void fwriteOrDie (const void * ptr, size_t size, size_t count, FILE * f)
        size_t n = fwrite ((const void *) p1, 1, wantWrite, f);
        if (n != wantWrite)
        {
-            ERROR ("error writing to file (ptr=0x%08lx, size=%d,"
+            RuntimeError ("error writing to file (ptr=0x%08lx, size=%d,"
                " count=%d, writing %d bytes after %d): %s",
                ptr, size, count, (int) wantWrite,
                (int) (size * count - totalBytes),
@ -556,7 +193,7 @@ void fwriteOrDie (const void * ptr, size_t size, size_t count, const HANDLE f)
        DWORD byteWritten = 0 ;
        if (WriteFile(f, (const void *) p1, wantWrite, &byteWritten, NULL) == false)
        {
-            ERROR ("error writing to file (ptr=0x%08lx, size=%d,"
+            RuntimeError ("error writing to file (ptr=0x%08lx, size=%d,"
                " count=%d, writing %d bytes after %d): %s",
                ptr, size, count, (int) wantWrite,
                (int) (size * count - totalBytes),
@ -581,7 +218,7 @@ void fprintfOrDie (FILE * f, const char * fmt, ...)
    int rc = vfprintf (f, fmt, arg_ptr);
    if (rc < 0)
    {
-        ERROR ("error writing to file: %s", strerror (errno));
+        RuntimeError ("error writing to file: %s", strerror (errno));
    }
 }
 #pragma warning(pop)
@ -595,7 +232,7 @@ void fflushOrDie (FILE * f)
    int rc = fflush (f);
    if (rc != 0)
    {
-    ERROR ("error flushing to file: %s", strerror (errno));
+    RuntimeError ("error flushing to file: %s", strerror (errno));
    }
 }

@ -608,22 +245,22 @@ size_t filesize (FILE * f)
    long curPos = ftell (f);
    if (curPos == -1L)
    {
-    ERROR ("error determining file position: %s", strerror (errno));
+    RuntimeError ("error determining file position: %s", strerror (errno));
    }
    int rc = fseek (f, 0, SEEK_END);
    if (rc != 0)
    {
-    ERROR ("error seeking to end of file: %s", strerror (errno));
+    RuntimeError ("error seeking to end of file: %s", strerror (errno));
    }
    long len = ftell (f);
    if (len == -1L)
    {
-    ERROR ("error determining file position: %s", strerror (errno));
+    RuntimeError ("error determining file position: %s", strerror (errno));
    }
    rc = fseek (f, curPos, SEEK_SET);
    if (rc != 0)
    {
-    ERROR ("error resetting file position: %s", strerror (errno));
+    RuntimeError ("error resetting file position: %s", strerror (errno));
    }
    return (size_t) len;
 }
@ -648,7 +285,7 @@ size_t filesize (const wchar_t * pathname)
 #ifndef UNDER_CE    // no 64-bit under winCE

 // filesize64(): determine size of the file in bytes (with pathname)
-__int64 filesize64 (const wchar_t * pathname)
+int64_t filesize64 (const wchar_t * pathname)
 {
    __stat64 fileinfo;
    if (_wstat64 (pathname,&fileinfo) == -1) 
@ -667,33 +304,33 @@ long fseekOrDie (FILE * f, long offset, int mode)
    long curPos = ftell (f);
    if (curPos == -1L)
    {
-    ERROR ("error seeking: %s", strerror (errno));
+    RuntimeError ("error seeking: %s", strerror (errno));
    }
    int rc = fseek (f, offset, mode);
    if (rc != 0)
    {
-    ERROR ("error seeking: %s", strerror (errno));
+    RuntimeError ("error seeking: %s", strerror (errno));
    }
    return curPos;
 }

-unsigned __int64 fgetpos (FILE * f)
+uint64_t fgetpos (FILE * f)
 {
    fpos_t post;
    int rc = ::fgetpos (f, &post);
    if (rc != 0)
-        ERROR ("error getting file position: %s", strerror (errno));
+        RuntimeError ("error getting file position: %s", strerror (errno));
    return post;
 }

-void fsetpos (FILE * f, unsigned __int64 reqpos)
+void fsetpos (FILE * f, uint64_t reqpos)
 {
    // ::fsetpos() flushes the read buffer. This conflicts with a situation where
    // we generally read linearly but skip a few bytes or KB occasionally, as is
    // the case in speech recognition tools. This requires a number of optimizations.

-    unsigned __int64 curpos = fgetpos (f);
-    unsigned __int64 cureob = curpos + f->_cnt; // UGH: we mess with an internal structure here
+    uint64_t curpos = fgetpos (f);
+    uint64_t cureob = curpos + f->_cnt; // UGH: we mess with an internal structure here
    while (reqpos >= curpos && reqpos < cureob)
    {
        // if we made it then do not call fsetpos()
@ -715,7 +352,7 @@ void fsetpos (FILE * f, unsigned __int64 reqpos)
    fpos_t post = reqpos;
    int rc = ::fsetpos (f, &post);
    if (rc != 0)
-        ERROR ("error setting file position: %s", strerror (errno));
+        RuntimeError ("error setting file position: %s", strerror (errno));
 }

 // ----------------------------------------------------------------------------
@ -725,12 +362,12 @@ void fsetpos (FILE * f, unsigned __int64 reqpos)
 void unlinkOrDie (const std::string & pathname)
 {
    if (_unlink (pathname.c_str()) != 0 && errno != ENOENT)     // if file is missing that's what we want
-    ERROR ("error deleting file '%s': %s", pathname.c_str(), strerror (errno));
+    RuntimeError ("error deleting file '%s': %s", pathname.c_str(), strerror (errno));
 }
 void unlinkOrDie (const std::wstring & pathname)
 {
    if (_wunlink (pathname.c_str()) != 0 && errno != ENOENT)    // if file is missing that's what we want
-    ERROR ("error deleting file '%S': %s", pathname.c_str(), strerror (errno));
+    RuntimeError ("error deleting file '%S': %s", pathname.c_str(), strerror (errno));
 }

 // ----------------------------------------------------------------------------
@ -741,14 +378,14 @@ void unlinkOrDie (const std::wstring & pathname)
 void renameOrDie (const std::string & from, const std::string & to)
 {
    if (!MoveFileA (from.c_str(),to.c_str()))
-    ERROR ("error renaming: %s", GetLastError());
+    RuntimeError ("error renaming: %d", (int) GetLastError());   // GetLastError() returns a numeric code, so it must be formatted with %d rather than %s
 }
 #endif

 void renameOrDie (const std::wstring & from, const std::wstring & to)
 {
    if (!MoveFileW (from.c_str(),to.c_str()))
-    ERROR ("error renaming: %s", GetLastError());
+    RuntimeError ("error renaming: %d", (int) GetLastError());   // GetLastError() returns a numeric code, so it must be formatted with %d rather than %s
 }

 // ----------------------------------------------------------------------------
@ -821,12 +458,12 @@ template<class CHAR>
 CHAR * fgetline (FILE * f, CHAR * buf, int size)
 {

-    unsigned __int64 filepos = fgetpos (f); // (for error message only)
+    uint64_t filepos = fgetpos (f); // (for error message only)
    CHAR * p = fgets (buf, size, f);
    if (p == NULL)            // EOF reached: next time feof() = true
    {
        if (ferror (f))
-            ERROR ("error reading line: %s", strerror (errno));
+            RuntimeError ("error reading line: %s", strerror (errno));
        buf[0] = 0;
        return buf;
    }
@ -837,7 +474,7 @@ CHAR * fgetline (FILE * f, CHAR * buf, int size)
    if (n >= (size_t) size -1)
    {
        basic_string<CHAR> example (p, n < 100 ? n : 100);
-        ERROR ("input line too long at file offset %I64d (max. %d characters allowed) [%s ...]",
+        RuntimeError ("input line too long at file offset %I64d (max. %d characters allowed) [%s ...]",
               filepos, size -1, _utf8 (example).c_str());
    }

@ -869,7 +506,7 @@ const wchar_t * fgetline (FILE * f, wchar_t * buf, int size)
    if (p == NULL)            // EOF reached: next time feof() = true
    {
        if (ferror (f))
-            ERROR ("error reading line: %s", strerror (errno));
+            RuntimeError ("error reading line: %s", strerror (errno));
        buf[0] = 0;
        return buf;
    }
@ -880,7 +517,7 @@ const wchar_t * fgetline (FILE * f, wchar_t * buf, int size)
    if (n >= (size_t) size -1)
    {
        wstring example (buf, min (n, 100));
-        ERROR ("input line too long at file offset %U64d (max. %d characters allowed) [%S ...]",
+        RuntimeError ("input line too long at file offset %U64d (max. %d characters allowed) [%S ...]",
               fgetpos (f), size -1, example.c_str());
    }

@ -960,11 +597,11 @@ const char * fgetstring (FILE * f, __out_z_cap(size) char * buf, int size)
    {
    int c = fgetc (f);
    if (c == EOF)
-            ERROR ("error reading string or missing 0: %s", strerror (errno));
+            RuntimeError ("error reading string or missing 0: %s", strerror (errno));
    if (c == 0) break;
    if (i >= size -1)
    {
-        ERROR ("input line too long (max. %d characters allowed)", size -1);
+        RuntimeError ("input line too long (max. %d characters allowed)", size -1);
    }
    buf[i] = (char) c;
    }
@ -983,7 +620,7 @@ const char * fgetstring (const HANDLE f, __out_z_cap(size) char * buf, int size)
        if (c == (char) 0) break;
        if (i >= size -1)
        {
-            ERROR ("input line too long (max. %d characters allowed)", size -1);
+            RuntimeError ("input line too long (max. %d characters allowed)", size -1);
        }
        buf[i] = (char) c;
    }
@ -1000,7 +637,7 @@ wstring fgetwstring (FILE * f)
    {
    int c = fgetwc (f);
    if (c == EOF)
-            ERROR ("error reading string or missing 0: %s", strerror (errno));
+            RuntimeError ("error reading string or missing 0: %s", strerror (errno));
    if (c == 0) break;
        res.push_back ((wchar_t) c);
    }
@ -1015,14 +652,14 @@ void fskipspace (FILE * f)
    if (c == EOF)       // hit the end
        {
            if (ferror (f))
-                ERROR ("error reading from file: %s", strerror (errno));
+                RuntimeError ("error reading from file: %s", strerror (errno));
            break;
        }
    if (!isspace (c))    // end of space: undo getting that character
        {
            int rc = ungetc (c, f);
            if (rc != c)
-                ERROR ("error in ungetc(): %s", strerror (errno));
+                RuntimeError ("error in ungetc(): %s", strerror (errno));
            break;
        }
    }
@ -1047,7 +684,7 @@ void fskipNewline (FILE * f)

    if (c != '\n')
    {
-    ERROR ("unexpected garbage at end of line");
+    RuntimeError ("unexpected garbage at end of line");
    }
 }

@ -1064,7 +701,7 @@ const char * fgettoken (FILE * f, __out_z_cap(size) char * buf, int size)
    if (c == EOF) break;
    if (isspace (c)) break;
    if (i >= size -1)
-        ERROR ("input token too long (max. %d characters allowed)", size -1);
+        RuntimeError ("input token too long (max. %d characters allowed)", size -1);
    buf[i] = (char) c;
    }
    // ... TODO: while (isspace (c)) c = fgetc (f);      // skip trailing space
@ -1072,7 +709,7 @@ const char * fgettoken (FILE * f, __out_z_cap(size) char * buf, int size)
    {
    int rc = ungetc (c, f);
    if (rc != c)
-        ERROR ("error in ungetc(): %s", strerror (errno));
+        RuntimeError ("error in ungetc(): %s", strerror (errno));
    }
    ASSERT (i < size);
    buf[i] = 0;
@ -1157,7 +794,7 @@ void fcheckTag_ascii (FILE * f, const STRING & expectedTag)
    fgettoken (f, buf, sizeof(buf)/sizeof(*buf));
    if (expectedTag != buf)
    {
-        ERROR ("invalid tag '%s' found; expected '%s'", buf, expectedTag.c_str());
+        RuntimeError ("invalid tag '%s' found; expected '%s'", buf, expectedTag.c_str());
    }
 }

@ -1169,7 +806,7 @@ void fcompareTag (const STRING & readTag, const STRING & expectedTag)
 {
    if (readTag != expectedTag)
    {
-        ERROR ("invalid tag '%s' found; expected '%s'", 
+        RuntimeError ("invalid tag '%s' found; expected '%s'", 
               readTag.c_str(), expectedTag.c_str());
    }
 }
@ -1216,7 +853,7 @@ void fpad (FILE * f, int n)
    int pos = ftell (f);
    if (pos == -1)
    {
-    ERROR ("error in ftell(): %s", strerror (errno));
+    RuntimeError ("error in ftell(): %s", strerror (errno));
    }
    // determine how many bytes are needed (at least 1 for the 0-terminator)
    // and create a dummy string of that length incl. terminator
@ -1308,7 +945,7 @@ int fgetint_ascii (FILE * f)
    int rc = ungetc (c, f);
    if (rc != c)
    {
-    ERROR ("error in ungetc(): %s", strerror (errno));
+    RuntimeError ("error in ungetc(): %s", strerror (errno));
    }
    return res;
 }
@ -1336,9 +973,9 @@ float fgetfloat_ascii (FILE * f)
    fskipspace (f);
    int rc = fscanf (f, "%f", &val); // security hint: safe overloads
    if (rc == 0)
-    ERROR ("error reading float value from file (invalid format): %s");
+    RuntimeError ("error reading float value from file (invalid format): %s");
    else if (rc == EOF)
-    ERROR ("error reading from file: %s", strerror (errno));
+    RuntimeError ("error reading from file: %s", strerror (errno));
    ASSERT (rc == 1);
    return val;
 }
@ -1441,7 +1078,7 @@ void WAVEHEADER::write (FILE * f)
    long curPos = ftell (f);
    if (curPos == -1L)
    {
-    ERROR ("error determining file position: %s", strerror (errno));
+    RuntimeError ("error determining file position: %s", strerror (errno));
    }
    unsigned int len = (unsigned int) filesize (f);
    unsigned int RiffLength = len - 8;
@ -1453,6 +1090,7 @@ void WAVEHEADER::write (FILE * f)
    fseekOrDie (f, curPos, SEEK_SET);
 }

+#if 0
 unsigned int WAVEHEADER::read (FILE * f, signed short & wRealFormatTag, int & bytesPerSample)
 {
    // read header
@ -1467,17 +1105,17 @@ unsigned int WAVEHEADER::read (FILE * f, signed short & wRealFormatTag, int & by
        wRealFormatTag = 1;     // Workaround: pretend it is 1 (seems safe)
    }
    (wRealFormatTag == 1 || wRealFormatTag == 7)
-        || ERROR ("WAVEHEADER::read: wFormatTag=%d not supported for now", wRealFormatTag);
+        || RuntimeError ("WAVEHEADER::read: wFormatTag=%d not supported for now", wRealFormatTag);
    unsigned short wChannels = fgetshort (f);
    unsigned long dwSamplesPerSec = fgetint (f);
    unsigned int sampleRate = dwSamplesPerSec;
    /*unsigned long dwAvgBytesPerSec = */ fgetint (f);
    unsigned short wBlockAlign = fgetshort (f);
    unsigned short wBitsPerSample = fgetshort (f);
-    (wBitsPerSample <= 16) || ERROR ("WAVEHEADER::read: invalid wBitsPerSample %d", wBitsPerSample);
+    (wBitsPerSample <= 16) || RuntimeError ("WAVEHEADER::read: invalid wBitsPerSample %d", wBitsPerSample);
    bytesPerSample = wBitsPerSample / 8;
    (wBlockAlign == wChannels * bytesPerSample)
-        || ERROR ("WAVEHEADER::read: wBlockAlign != wChannels*bytesPerSample not supported");
+        || RuntimeError ("WAVEHEADER::read: wBlockAlign != wChannels*bytesPerSample not supported");
    while (fmtLen > 16) // unused extra garbage in header
    {
        fgetbyte (f);
@ -1485,7 +1123,7 @@ unsigned int WAVEHEADER::read (FILE * f, signed short & wRealFormatTag, int & by
    }
    if (wRealFormatTag == 7)
    {
-        (bytesPerSample == 1) || ERROR ("WAVEHEADER::read: invalid wBitsPerSample %d for mulaw", wBitsPerSample);
+        (bytesPerSample == 1) || RuntimeError ("WAVEHEADER::read: invalid wBitsPerSample %d for mulaw", wBitsPerSample);
        fcheckTag (f, "fact");
        unsigned int factLen = fgetint (f);
        while (factLen > 0)
@ -1528,7 +1166,7 @@ static void fgetwavraw(FILE * f, ARRAY<short> & wav, const WAVEHEADER & wavhd)
    wav.resize (wavhd.DataLength / bytesPerSample);
    if (wavhd.wFormatTag == 7)    // mulaw
    {
-        (wavhd.nChannels == 1) || ERROR ("fgetwav: wChannels=%d not supported for mulaw", wavhd.nChannels);
+        (wavhd.nChannels == 1) || RuntimeError ("fgetwav: wChannels=%d not supported for mulaw", wavhd.nChannels);
        ARRAY<unsigned char> data;
        int numSamples = wavhd.DataLength/wavhd.nBlockAlign;
        data.resize (numSamples);
@ -1545,7 +1183,7 @@ static void fgetwavraw(FILE * f, ARRAY<short> & wav, const WAVEHEADER & wavhd)
    // ... TODO: support 8 bit linear PCM samples (implement when needed; samples scaled to 'short')
    else
    {
-        ERROR ("bytesPerSample != 2 is not supported except mulaw format!\n");
+        RuntimeError ("bytesPerSample != 2 is not supported except mulaw format!\n");
    }
 }

@ -1586,7 +1224,7 @@ void fgetwav (FILE * f, ARRAY<short> & wav, int & sampleRate)
    }
    else
    {
-        ERROR ("bytesPerSample/wChannels != 2 needs to be implemented");
+        RuntimeError ("bytesPerSample/wChannels != 2 needs to be implemented");
    }
 }

@ -1644,7 +1282,7 @@ unsigned int fgetwfx (FILE * f, WAVEFORMATEX & wfx)
        wfx.wFormatTag = 1;     // Workaround: pretend it is 1 (seems safe)
    }
    (wfx.wFormatTag == 1 || wfx.wFormatTag == 3 || wfx.wFormatTag == 7)
-        || ERROR ("WAVEHEADER::read: wFormatTag=%d not supported for now", wfx.wFormatTag);
+        || RuntimeError ("WAVEHEADER::read: wFormatTag=%d not supported for now", wfx.wFormatTag);
    wfx.nChannels = fgetshort (f);
    wfx.nSamplesPerSec = fgetint (f);
    wfx.nAvgBytesPerSec = fgetint (f);
@ -1662,7 +1300,7 @@ void fputwfx (FILE *f, const WAVEFORMATEX & wfx, unsigned int numSamples)
 {
    unsigned int DataLength = numSamples * wfx.nBlockAlign;
    (DataLength / wfx.nBlockAlign == numSamples)
-        || ERROR ("fputwfx: data size exceeds WAV header 32-bit range");
+        || RuntimeError ("fputwfx: data size exceeds WAV header 32-bit range");
    unsigned int RiffLength = 36 + DataLength;
    unsigned int FmtLength  = 16; 
    // file header
@ -1713,6 +1351,7 @@ void fputwav (const wstring & fn, const vector<short> & wav, int sampleRate, int
    fputwav (f, wav, sampleRate, nChannels);
    fflushOrDie (f);    // after this, fclose() (in destructor of f) cannot fail
 }
+#endif

 // ----------------------------------------------------------------------------
 // fputbyte(): write a byte value
@ -1859,7 +1498,7 @@ void fgetfile (FILE * f, ARRAY<char> & buffer)
        size_t n = fread (&inbuf[0], sizeof (inbuf[0]), inbuf.size(), f);
        if (ferror (f))
        {
-            ERROR ("fgetfile: error reading from file: %s", strerror (errno));
+            RuntimeError ("fgetfile: error reading from file: %s", strerror (errno));
        }
        buffer.insert (buffer.end(), inbuf.begin(), inbuf.begin() + n);
    }
@ -1934,12 +1573,12 @@ void setfiletime (const wstring & path, const FILETIME & time)
                                OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL));
    if (h == INVALID_HANDLE_VALUE)
    {
-        ERROR ("setfiletime: error opening file: %d", GetLastError());
+        RuntimeError ("setfiletime: error opening file: %d", GetLastError());
    }
    BOOL rc = SetFileTime (h, NULL, NULL, &time);
    if (!rc)
    {
-        ERROR ("setfiletime: error setting file time information: %d", GetLastError());
+        RuntimeError ("setfiletime: error setting file time information: %d", GetLastError());
    }
 }

@ -2017,7 +1656,7 @@ void expand_wildcards (const wstring & path, vector<wstring> & paths)
 {
    BOOL rc = ExpandWildcards (path, paths);
    if (!rc)
-        ERROR ("error in expanding wild cards '%S': %S", path.c_str(), FormatWin32Error (::GetLastError()).c_str());
+        RuntimeError ("error in expanding wild cards '%S': %S", path.c_str(), FormatWin32Error (::GetLastError()).c_str());
 }

 // ----------------------------------------------------------------------------
@ -2036,7 +1675,7 @@ static void mkdir (const wstring & path)
        if (att != INVALID_FILE_ATTRIBUTES || (att & FILE_ATTRIBUTE_DIRECTORY) != 0)
            return; // ok
    }
-    ERROR ("make_intermediate_dirs: error creating intermediate directory %S", path.c_str());
+    RuntimeError ("make_intermediate_dirs: error creating intermediate directory %S", path.c_str());
 }

 // make subdir of a file including parents
--- a/DataReader/Kaldi2Reader/fileutil.h
+++ b/DataReader/Kaldi2Reader/fileutil.h
@ -0,0 +1,620 @@
+//
+// fileutil.h - file I/O with error checking
+//
+//     Copyright (c) Microsoft Corporation.  All rights reserved.
+//
+#pragma once
+#ifndef _FILEUTIL_
+#define _FILEUTIL_
+
+#include "Platform.h"
+#include <stdio.h>
+#ifdef __unix__
+#include <sys/types.h>
+#include <sys/stat.h>
+#endif
+#include <algorithm>    // for std::find
+#include <vector>
+#include <map>
+#include <functional>
+#include <cctype>
+#include <errno.h>
+#include <stdint.h>
+#include <assert.h>
+#include <string.h>     // for strerror()
+
+using namespace std;
+
+#define SAFE_CLOSE(f) (((f) == NULL) || (fcloseOrDie ((f)), (f) = NULL))
+
+// ----------------------------------------------------------------------------
+// fopenOrDie(): like fopen() but terminate with err msg in case of error.
+// A pathname of "-" returns stdout or stdin, depending on mode, and it will
+// change the binary mode if 'b' or 't' are given. If you use this, make sure
+// not to fclose() such a handle.
+// ----------------------------------------------------------------------------
+
+FILE * fopenOrDie (const string & pathname, const char * mode);
+FILE * fopenOrDie (const wstring & pathname, const wchar_t * mode);
+
+#ifndef __unix__
+// ----------------------------------------------------------------------------
+// fsetmode(): set mode to binary or text
+// ----------------------------------------------------------------------------
+
+void fsetmode (FILE * f, char type);
+#endif
+
+// ----------------------------------------------------------------------------
+// freadOrDie(): like fread() but terminate with err msg in case of error
+// ----------------------------------------------------------------------------
+
+void freadOrDie (void * ptr, size_t size, size_t count, FILE * f);
+
+// Container overloads of freadOrDie(): resize 'data' to 'num' elements and fill it
+// from 'f' through the raw-buffer freadOrDie(). Nothing is read when 'num' is 0.
+template<class VECTOR>
+void freadOrDie (VECTOR & data, int num, FILE * f)    // template for vector<>
+{
+    data.resize (num);
+    if (data.size() == 0)
+        return;     // empty container: &data[0] must not be formed
+    freadOrDie (&data[0], sizeof (data[0]), data.size(), f);
+}
+template<class VECTOR>
+void freadOrDie (VECTOR & data, size_t num, FILE * f)    // template for vector<>
+{
+    data.resize (num);
+    if (data.size() == 0)
+        return;     // empty container: &data[0] must not be formed
+    freadOrDie (&data[0], sizeof (data[0]), data.size(), f);
+}
+
+
+// ----------------------------------------------------------------------------
+// fwriteOrDie(): like fwrite() but terminate with err msg in case of error
+// ----------------------------------------------------------------------------
+
+void fwriteOrDie (const void * ptr, size_t size, size_t count, FILE * f);
+
+// Container overload of fwriteOrDie(): writes every element of 'data' to 'f'
+// through the raw-buffer fwriteOrDie(); an empty container writes nothing.
+template<class VECTOR>
+void fwriteOrDie (const VECTOR & data, FILE * f)    // template for vector<>
+{
+    if (data.size() == 0)
+        return;     // empty container: &data[0] must not be formed
+    fwriteOrDie (&data[0], sizeof (data[0]), data.size(), f);
+}
+
+
+// ----------------------------------------------------------------------------
+// fprintfOrDie(): like fprintf() but terminate with err msg in case of error
+// ----------------------------------------------------------------------------
+
+void fprintfOrDie (FILE * f, const char *format, ...);
+
+// ----------------------------------------------------------------------------
+// fcloseOrDie(): like fclose() but terminate with err msg in case of error
+// not yet implemented, but we should
+// ----------------------------------------------------------------------------
+
+#define fcloseOrDie fclose
+
+// ----------------------------------------------------------------------------
+// fflushOrDie(): like fflush() but terminate with err msg in case of error
+// ----------------------------------------------------------------------------
+
+void fflushOrDie (FILE * f);
+
+// ----------------------------------------------------------------------------
+// filesize(): determine size of the file in bytes
+// ----------------------------------------------------------------------------
+
+size_t filesize (const wchar_t * pathname);
+size_t filesize (FILE * f);
+int64_t filesize64 (const wchar_t * pathname);
+
+// ----------------------------------------------------------------------------
+// fseekOrDie(),ftellOrDie(), fget/setpos(): seek functions with error handling
+// ----------------------------------------------------------------------------
+
+// 32-bit offsets only
+long fseekOrDie (FILE * f, long offset, int mode = SEEK_SET);
+#define ftellOrDie ftell
+
+// ----------------------------------------------------------------------------
+// fget/setpos(): seek functions with error handling
+// ----------------------------------------------------------------------------
+
+uint64_t fgetpos (FILE * f);
+void fsetpos (FILE * f, uint64_t pos);
+
+// ----------------------------------------------------------------------------
+// unlinkOrDie(): unlink() with error handling
+// ----------------------------------------------------------------------------
+
+void unlinkOrDie (const std::string & pathname);
+void unlinkOrDie (const std::wstring & pathname);
+
+// ----------------------------------------------------------------------------
+// renameOrDie(): rename() with error handling
+// ----------------------------------------------------------------------------
+
+void renameOrDie (const std::string & from, const std::string & to);
+void renameOrDie (const std::wstring & from, const std::wstring & to);
+
+// ----------------------------------------------------------------------------
+// fexists(): test if a file exists
+// ----------------------------------------------------------------------------
+
+bool fexists (const char * pathname);
+bool fexists (const wchar_t * pathname);
+inline bool fexists (const std::string & pathname) { return fexists (pathname.c_str()); }
+inline bool fexists (const std::wstring & pathname) { return fexists (pathname.c_str()); }
+
+// ----------------------------------------------------------------------------
+// funicode(): test if a file uses unicode
+// ----------------------------------------------------------------------------
+
+bool funicode (FILE * f);
+
+// ----------------------------------------------------------------------------
+// fskipspace(): skip space characters
+// ----------------------------------------------------------------------------
+
+bool fskipspace (FILE * F);
+bool fskipwspace (FILE * F);
+
+// ----------------------------------------------------------------------------
+// fgetline(): like fgets() but terminate with err msg in case of error;
+//  removes the newline character at the end (like gets()), returned buffer is
+//  always 0-terminated; has second version that returns an STL string instead
+// fgetstring(): read a 0-terminated string (terminate if error)
+// fgetword(): read a space-terminated token (terminate if error)
+// fskipNewLine(): skip all white space until end of line incl. the newline
+// ----------------------------------------------------------------------------
+
+// ----------------------------------------------------------------------------
+// fputstring(): write a 0-terminated string (terminate if error)
+// ----------------------------------------------------------------------------
+
+void fputstring (FILE * f, const char *);
+void fputstring (const HANDLE f, const char * str);
+void fputstring (FILE * f, const std::string &);
+void fputstring (FILE * f, const wchar_t *);
+void fputstring (FILE * f, const std::wstring &);
+
+template<class CHAR> CHAR * fgetline (FILE * f, CHAR * buf, int size);
+template<class CHAR, size_t n> CHAR * fgetline (FILE * f, CHAR (& buf)[n]) { return fgetline (f, buf, n); }
+string fgetline (FILE * f);
+wstring fgetlinew (FILE * f);
+void fgetline (FILE * f, std::string & s, std::vector<char> & buf);
+void fgetline (FILE * f, std::wstring & s, std::vector<char> & buf);
+void fgetline (FILE * f, std::vector<char> & buf);
+void fgetline (FILE * f, std::vector<wchar_t> & buf);
+
+const char * fgetstring (FILE * f, char * buf, int size);
+template<size_t n> const char * fgetstring (FILE * f, char (& buf)[n]) { return fgetstring (f, buf, n); }
+const char * fgetstring (const HANDLE f, char * buf, int size);
+template<size_t n> const char * fgetstring (const HANDLE f, char (& buf)[n]) { return fgetstring (f, buf, n); }
+
+const wchar_t * fgetstring (FILE * f, wchar_t * buf, int size);
+wstring fgetwstring (FILE * f);
+string fgetstring (FILE * f);
+
+const char * fgettoken (FILE * f, char * buf, int size);
+template<size_t n> const char * fgettoken (FILE * f, char (& buf)[n]) { return fgettoken (f, buf, n); }
+string fgettoken (FILE * f);
+const wchar_t * fgettoken (FILE * f, wchar_t * buf, int size);
+wstring fgetwtoken (FILE * f);
+
+int fskipNewline (FILE * f, bool skip = true);
+int fskipwNewline (FILE * f, bool skip = true);
+
+// ----------------------------------------------------------------------------
+// fputstring(): write a 0-terminated string (terminate if error)
+// ----------------------------------------------------------------------------
+
+void fputstring (FILE * f, const char *);
+void fputstring (FILE * f, const std::string &);
+void fputstring (FILE * f, const wchar_t *);
+void fputstring (FILE * f, const std::wstring &);
+
+// ----------------------------------------------------------------------------
+// fgetTag(): read a 4-byte tag & return as a string
+// ----------------------------------------------------------------------------
+
+string fgetTag (FILE * f);
+
+// ----------------------------------------------------------------------------
+// fcheckTag(): read a 4-byte tag & verify it; terminate if wrong tag
+// ----------------------------------------------------------------------------
+
+void fcheckTag (FILE * f, const char * expectedTag);
+void fcheckTag_ascii (FILE * f, const string & expectedTag);
+
+// ----------------------------------------------------------------------------
+// fcompareTag(): compare two tags; terminate if wrong tag
+// ----------------------------------------------------------------------------
+
+void fcompareTag (const string & readTag, const string & expectedTag);
+
+// ----------------------------------------------------------------------------
+// fputTag(): write a 4-byte tag
+// ----------------------------------------------------------------------------
+
+void fputTag (FILE * f, const char * tag);
+
+// ----------------------------------------------------------------------------
+// fskipstring(): skip a 0-terminated string, such as a pad string
+// ----------------------------------------------------------------------------
+
+void fskipstring (FILE * f);
+
+// ----------------------------------------------------------------------------
+// fpad(): write a 0-terminated string to pad file to a n-byte boundary
+// ----------------------------------------------------------------------------
+
+void fpad (FILE * f, int n);
+
+// ----------------------------------------------------------------------------
+// fgetbyte(): read a byte value
+// ----------------------------------------------------------------------------
+
+char fgetbyte (FILE * f);
+
+// ----------------------------------------------------------------------------
+// fgetshort(): read a short value
+// ----------------------------------------------------------------------------
+
+short fgetshort (FILE * f);
+short fgetshort_bigendian (FILE * f);
+
+// ----------------------------------------------------------------------------
+// fgetint24(): read a 3-byte (24-bit) int value
+// ----------------------------------------------------------------------------
+
+int fgetint24 (FILE * f);
+
+// ----------------------------------------------------------------------------
+// fgetint(): read an int value
+// ----------------------------------------------------------------------------
+
+int fgetint (FILE * f);
+int fgetint_bigendian (FILE * f);
+int fgetint_ascii (FILE * f);
+
+// ----------------------------------------------------------------------------
+// fgetlong(): read a long value
+// ----------------------------------------------------------------------------
+long fgetlong (FILE * f);
+
+// ----------------------------------------------------------------------------
+// fgetfloat(): read a float value
+// ----------------------------------------------------------------------------
+
+float fgetfloat (FILE * f);
+float fgetfloat_bigendian (FILE * f);
+float fgetfloat_ascii (FILE * f);
+
+// ----------------------------------------------------------------------------
+// fgetdouble(): read a double value
+// ----------------------------------------------------------------------------
+
+double fgetdouble (FILE * f);
+
+// ----------------------------------------------------------------------------
+// fputbyte(): write a byte value
+// ----------------------------------------------------------------------------
+
+void fputbyte (FILE * f, char val);
+
+// ----------------------------------------------------------------------------
+// fputshort(): write a short value
+// ----------------------------------------------------------------------------
+
+void fputshort (FILE * f, short val);
+
+// ----------------------------------------------------------------------------
+// fputint24(): write a 3-byte (24-bit) int value
+// ----------------------------------------------------------------------------
+
+void fputint24 (FILE * f, int v);
+
+// ----------------------------------------------------------------------------
+// fputint(): write an int value
+// ----------------------------------------------------------------------------
+
+void fputint (FILE * f, int val);
+
+// ----------------------------------------------------------------------------
+// fputlong(): write a long value
+// ----------------------------------------------------------------------------
+
+void fputlong (FILE * f, long val);
+
+// ----------------------------------------------------------------------------
+// fputfloat(): write a float value
+// ----------------------------------------------------------------------------
+
+void fputfloat (FILE * f, float val);
+
+// ----------------------------------------------------------------------------
+// fputdouble(): write a double value
+// ----------------------------------------------------------------------------
+
+void fputdouble (FILE * f, double val);
+
+
+// fput(): binary write of a single value -- the object representation of 'v'
+// (sizeof (T) bytes) is handed to fwriteOrDie() as one element
+template <typename T>
+void fput(FILE * f, T v)
+{
+    const T * const src = &v;
+    fwriteOrDie (src, sizeof (T), 1, f);
+}
+
+
+// fget(): binary read of a single value -- sizeof (T) bytes are read by freadOrDie()
+// as one element directly into 'v'
+template <typename T>
+void fget(FILE * f, T& v)
+{
+    void * const dst = (void *)&v;
+    freadOrDie (dst, sizeof (T), 1, f);
+}
+
+
+// GetFormatString - get the format string for a particular type
+// This is the primary (fallback) template: it is only reached for a type that has no
+// explicit specialization. It asserts and returns NULL; the argument is used for type
+// deduction only, which is why its name is commented out.
+template <typename T>
+const wchar_t* GetFormatString(T /*t*/)
+{
+    // if this _ASSERT goes off it means that you are using a type that doesn't have
+    // a read and/or write routine. 
+    // If the type is a user defined class, you need to create some global functions that handles file in/out.
+    // for example: 
+    //File& operator>>(File& stream, MyClass& test);
+    //File& operator<<(File& stream, MyClass& test);
+    //
+    // in your class you will probably want to add these functions as friends so you can access any private members
+    // friend File& operator>>(File& stream, MyClass& test);
+    // friend File& operator<<(File& stream, MyClass& test);
+    //
+    // if you are using wchar_t* or char* types, these use other methods because they require buffers to be passed
+    // either use std::string and std::wstring, or use the WriteString() and ReadString() methods
+    assert(false);  // need a specialization
+    return NULL;    // only reached when assert() is compiled out
+}
+
+// GetFormatString - specializations to get the format string for a particular type
+template <>             const wchar_t* GetFormatString(char);
+template <>          const wchar_t* GetFormatString(wchar_t);
+template <>            const wchar_t* GetFormatString(short);
+template <>              const wchar_t* GetFormatString(int);
+template <>             const wchar_t* GetFormatString(long);
+template <>   const wchar_t* GetFormatString(unsigned short);
+template <>     const wchar_t* GetFormatString(unsigned int);
+template <>    const wchar_t* GetFormatString(unsigned long);
+template <>            const wchar_t* GetFormatString(float);
+template <>           const wchar_t* GetFormatString(double);
+template <>           const wchar_t* GetFormatString(size_t);
+template <>        const wchar_t* GetFormatString(long long);
+template <>      const wchar_t* GetFormatString(const char*);
+template <>   const wchar_t* GetFormatString(const wchar_t*);
+
+// GetScanFormatString - get the format string for a particular type
+// This is the primary (fallback) template: it is only reached for a type that has no
+// explicit specialization. It asserts and returns NULL. The argument is used for type
+// deduction only, so its name is commented out (as in GetFormatString) to avoid an
+// unused-parameter warning.
+template <typename T>
+const wchar_t* GetScanFormatString(T /*t*/)
+{
+    assert(false);  // need a specialization
+    return NULL;    // only reached when assert() is compiled out
+}
+
+// GetScanFormatString - specializations to get the format string for a particular type
+template <>             const wchar_t* GetScanFormatString(char);
+template <>          const wchar_t* GetScanFormatString(wchar_t);
+template <>            const wchar_t* GetScanFormatString(short);
+template <>              const wchar_t* GetScanFormatString(int);
+template <>             const wchar_t* GetScanFormatString(long);
+template <>   const wchar_t* GetScanFormatString(unsigned short);
+template <>     const wchar_t* GetScanFormatString(unsigned int);
+template <>    const wchar_t* GetScanFormatString(unsigned long);
+template <>            const wchar_t* GetScanFormatString(float);
+template <>           const wchar_t* GetScanFormatString(double);
+template <>           const wchar_t* GetScanFormatString(size_t);
+template <>        const wchar_t* GetScanFormatString(long long);
+
+
+// ----------------------------------------------------------------------------
+// fgetText(): get a value from a text file
+// Delegates to ftrygetText() and turns its failure codes into exceptions:
+//  - EOF -> runtime_error carrying strerror(errno)
+//  - 0   -> runtime_error reporting an invalid format
+// ----------------------------------------------------------------------------
+template <typename T>
+void fgetText(FILE * f, T& v)
+{
+    const int status = ftrygetText(f, v);
+    if (status == EOF)
+        throw std::runtime_error(std::string("error reading from file: ") + strerror(errno));
+    if (status == 0)
+        throw std::runtime_error("error reading value from file (invalid format)");
+    assert(status == 1);    // exactly one value was converted
+}
+
+// version to try and get a string, and not throw exceptions if contents don't match
+// Scans one value of type T from 'f' using the format from GetScanFormatString<T>().
+// Returns the fwscanf() result unchanged: 1 if a value was stored in 'v', 0 if the
+// input did not match the format, or EOF if input failed before any conversion.
+template <typename T>
+int ftrygetText(FILE * f, T& v)
+{
+    const wchar_t* formatString = GetScanFormatString<T>(v);
+    int rc = fwscanf (f, formatString, &v);
+    // EOF is a legitimate outcome (e.g. end of file) that fgetText() handles explicitly,
+    // so it must not trip the debug assertion here.
+    assert(rc == 1 || rc == 0 || rc == EOF);
+    return rc;
+}
+
+template <> int ftrygetText<bool>(FILE * f, bool& v);
+// ----------------------------------------------------------------------------
+// fgetText() specializations for fwscanf_s differences: get a value from a text file
+// ----------------------------------------------------------------------------
+void fgetText(FILE * f, char& v);
+void fgetText(FILE * f, wchar_t& v);
+
+
+// ----------------------------------------------------------------------------
+// fputText(): write a value out as text
+// Formats 'v' with the format string from GetFormatString() and throws a
+// runtime_error if fwprintf() reports an error (< 0) or writes nothing (0).
+// ----------------------------------------------------------------------------
+template <typename T>
+void fputText(FILE * f, T v)
+{
+    const wchar_t* const fmt = GetFormatString(v);
+    const int written = fwprintf(f, fmt, v);
+    if (written < 0)
+        throw std::runtime_error(std::string("error writing to file: ") + strerror(errno));
+    if (written == 0)
+        throw std::runtime_error("error writing value to file, no values written");
+}
+
+// ----------------------------------------------------------------------------
+// fputText(): write a bool out as character
+// ----------------------------------------------------------------------------
+template <> void fputText<bool>(FILE * f, bool v);
+
+// ----------------------------------------------------------------------------
+// fputfile(): write a binary block or a string as a file
+// ----------------------------------------------------------------------------
+
+void fputfile (const wstring & pathname, const std::vector<char> & buffer);
+void fputfile (const wstring & pathname, const std::wstring & string);
+void fputfile (const wstring & pathname, const std::string & string);
+
+// ----------------------------------------------------------------------------
+// fgetfile(): load a file as a binary block
+// ----------------------------------------------------------------------------
+
+void fgetfile (const wstring & pathname, std::vector<char> & buffer);
+void fgetfile (FILE * f, std::vector<char> & buffer);
+namespace msra { namespace files {
+    void fgetfilelines (const std::wstring & pathname, vector<char> & readbuffer, std::vector<std::string> & lines);
+    static inline std::vector<std::string> fgetfilelines (const std::wstring & pathname) { vector<char> buffer; std::vector<std::string> lines; fgetfilelines (pathname, buffer, lines); return lines; }
+    vector<char*> fgetfilelines (const wstring & pathname, vector<char> & readbuffer);
+};};
+
+// ----------------------------------------------------------------------------
+// expand_wildcards() -- expand a path with wildcards (also intermediate ones)
+// ----------------------------------------------------------------------------
+
+void expand_wildcards (const wstring & path, vector<wstring> & paths);
+
+// ----------------------------------------------------------------------------
+// make_intermediate_dirs() -- make all intermediate dirs on a path
+// ----------------------------------------------------------------------------
+
+namespace msra { namespace files {
+    void make_intermediate_dirs (const wstring & filepath);
+};};
+
+// ----------------------------------------------------------------------------
+// fuptodate() -- test whether an output file is at least as new as an input file
+// ----------------------------------------------------------------------------
+
+namespace msra { namespace files {
+    bool fuptodate (const wstring & target, const wstring & input, bool inputrequired = true);
+};};
+
+#if 0
+// ----------------------------------------------------------------------------
+// simple support for WAV file I/O
+// ----------------------------------------------------------------------------
+
+// define the header if we haven't seen it yet
+#ifndef _WAVEFORMATEX_
+#define _WAVEFORMATEX_
+
+/*
+ *  extended waveform format structure used for all non-PCM formats. this
+ *  structure is common to all non-PCM formats.
+ *
+ *  NOTE: this whole section sits inside an '#if 0' block and is currently compiled out.
+ *  NOTE(review): only WORD is typedef'd here; DWORD must come from elsewhere -- confirm
+ *  before re-enabling on a platform without the Windows headers.
+ */
+typedef unsigned short WORD;  // in case not defined yet (i.e. linux)
+typedef struct tWAVEFORMATEX
+{
+    WORD        wFormatTag;         /* format type */
+    WORD        nChannels;          /* number of channels (i.e. mono, stereo...) */
+    DWORD       nSamplesPerSec;     /* sample rate */
+    DWORD       nAvgBytesPerSec;    /* for buffer estimation */
+    WORD        nBlockAlign;        /* block size of data */
+    WORD        wBitsPerSample;     /* number of bits per sample of mono data */
+    WORD        cbSize;             /* the count in bytes of the size of */
+                                    /* extra information (after cbSize) */
+} WAVEFORMATEX, *PWAVEFORMATEX;
+
+#endif /* _WAVEFORMATEX_ */
+
+// WAVEHEADER: header fields of a WAV file plus helpers to fill, read and write them.
+// NOTE: this whole section sits inside an '#if 0' block and is currently compiled out.
+// NOTE(review): the field order presumably mirrors the on-disk header (tag, length,
+// format fields, data tag, data length) -- confirm against the implementation of
+// read()/write() before relying on the struct layout.
+typedef struct wavehder{
+    char          riffchar[4];
+    unsigned int  RiffLength;
+    char          wavechar[8];
+    unsigned int  FmtLength; 
+    signed short  wFormatTag; 
+    signed short  nChannels;    
+    unsigned int  nSamplesPerSec; 
+    unsigned int  nAvgBytesPerSec; 
+    signed short  nBlockAlign; 
+    signed short  wBitsPerSample;
+    char          datachar[4];
+    unsigned int  DataLength;
+private:
+    // helper used by the prepare() overloads (presumably fills the length fields -- TODO confirm)
+    void prepareRest (int SampleCount);
+public:
+    // fill the header from explicit parameters or from a WAVEFORMATEX
+    void prepare (unsigned int Fs, int Bits, int Channels, int SampleCount);
+    void prepare (const WAVEFORMATEX & wfx, int SampleCount);
+    // read the header from 'f'; also reports the format tag and bytes per sample found
+    unsigned int read (FILE * f, signed short & wRealFormatTag, int & bytesPerSample);
+    void write (FILE * f);
+    static void update (FILE * f);
+} WAVEHEADER;
+
+// ----------------------------------------------------------------------------
+// fgetwfx(), fputwfx(): I/O of wave file headers only
+// ----------------------------------------------------------------------------
+unsigned int fgetwfx (FILE *f, WAVEFORMATEX & wfx);
+void fputwfx (FILE *f, const WAVEFORMATEX & wfx, unsigned int numSamples);
+
+// ----------------------------------------------------------------------------
+// fgetraw(): read data of .wav file, and separate data of multiple channels. 
+//            For example, data[i][j]: i is channel index, 0 means the first 
+//            channel. j is sample index.
+// ----------------------------------------------------------------------------
+void fgetraw (FILE *f,std::vector< std::vector<short> > & data,const WAVEHEADER & wavhd);
+#endif
+
+// ----------------------------------------------------------------------------
+// temp functions -- clean these up
+// ----------------------------------------------------------------------------
+
+// split a pathname into directory and filename
+// The split happens at the last '\', ':' or '/'; that separator is dropped from both
+// parts. Without any separator, 'dir' becomes empty and 'file' receives the whole path.
+static inline void splitpath (const wstring & path, wstring & dir, wstring & file)
+{
+    const size_t sep = path.find_last_of (L"\\:/");    // DOS drives, UNIX, Windows
+    if (sep != wstring::npos)
+    {
+        dir = path.substr (0, sep);
+        file = path.substr (sep + 1);
+        return;
+    }
+    // no directory found
+    dir.clear();
+    file = path;
+}
+
+// test if a pathname is a relative path
+// A relative path is one that can be appended to a directory.
+// Drive-relative paths, such as D:file, are considered non-relative.
+// An empty path counts as relative.
+static inline bool relpath (const wchar_t * path)
+{   // this is a wild collection of pathname conventions in Windows
+    switch (path[0])
+    {
+    case L'/':
+    case L'\\':                 // e.g. \WINDOWS
+        return false;
+    case 0:                     // empty string: never look at path[1]
+        return true;
+    default:
+        // ... TODO: handle long NT paths
+        return path[1] != L':'; // drive syntax is non-relative, all others are relative
+    }
+}
+// string overload: forwards to the pointer version above
+// NOTE(review): only a const wchar_t * overload is visible here, so this presumably
+// compiles for std::wstring only -- confirm before using it with std::string.
+template<class CHAR>
+static inline bool relpath (const std::basic_string<CHAR> & s)
+{
+    const CHAR * const raw = s.c_str();
+    return relpath (raw);
+}
+
+// trim from start
+// Erases leading whitespace (as classified by std::isspace) from 's' in place and
+// returns a reference to 's' so that calls can be chained.
+static inline std::string &ltrim(std::string &s) {
+    // std::isspace() is undefined for negative arguments other than EOF, so each char is
+    // converted to unsigned char first. The lambda also replaces std::ptr_fun/std::not1,
+    // which are deprecated and removed from later C++ standards.
+    s.erase(s.begin(), std::find_if(s.begin(), s.end(),
+        [](char ch) { return std::isspace(static_cast<unsigned char>(ch)) == 0; }));
+    return s;
+}
+
+// trim from end
+// Erases trailing whitespace (as classified by std::isspace) from 's' in place and
+// returns a reference to 's' so that calls can be chained.
+static inline std::string &rtrim(std::string &s) {
+    // std::isspace() is undefined for negative arguments other than EOF, so each char is
+    // converted to unsigned char first. The lambda also replaces std::ptr_fun/std::not1,
+    // which are deprecated and removed from later C++ standards.
+    // The reverse search finds the last non-space char; base() points just past it.
+    s.erase(std::find_if(s.rbegin(), s.rend(),
+        [](char ch) { return std::isspace(static_cast<unsigned char>(ch)) == 0; }).base(), s.end());
+    return s;
+}
+
+// trim from both ends
+// Strips trailing whitespace first, then leading whitespace; 's' is modified in place
+// and returned by reference.
+static inline std::string &trim(std::string &s) {
+    rtrim(s);
+    return ltrim(s);
+}
+
+vector<string> sep_string(const string & str, const string & sep);
+
+#endif    // _FILEUTIL_
--- a/DataReader/Kaldi2Reader/htkfeatio.h
+++ b/DataReader/Kaldi2Reader/htkfeatio.h
@ -0,0 +1,532 @@
+//
+// <copyright file="htkfeatio.h" company="Microsoft">
+//     Copyright (c) Microsoft Corporation.  All rights reserved.
+// </copyright>
+//
+// htkfeatio.h -- helper for I/O of HTK feature files
+
+#pragma once
+
+#include "basetypes.h"
+#include "fileutil.h"
+#include "simple_checked_arrays.h"
+
+#include <string>
+#include <regex>
+#include <set>
+#include <hash_map>
+#include <stdint.h>
+#include <limits.h>
+#include <wchar.h>
+//#include <iostream>
+
+#include "htkfeatio_utils.h"
+#include "kaldi.h"
+
+namespace msra { namespace asr {
+
+// FeatureSection -- one feature stream read through Kaldi
+// Holds a Kaldi random-access feature table, opened from the rspecifier stored in the
+// 'rx_file' given to the constructor, plus an optional nnet1 feature transform that is
+// applied to every utterance returned by read().
+class FeatureSection {
+public:        
+    wstring scpFile;            // script file name as passed to the constructor
+    string rx;                  // rspecifier: whitespace-trimmed content of the rx file
+    string feature_transform;   // path of the transform to load; empty if none is used
+
+private:
+    kaldi::RandomAccessBaseFloatMatrixReader * feature_reader;  // owned; released in the destructor
+    kaldi::nnet1::Nnet nnet_transf;
+    kaldi::CuMatrix<kaldi::BaseFloat> feats_transf;
+    kaldi::Matrix<kaldi::BaseFloat> buf;                        // result buffer handed out by read()
+
+    // The object owns 'feature_reader' through a raw pointer; a copy would delete it twice.
+    FeatureSection(const FeatureSection &) = delete;
+    FeatureSection & operator= (const FeatureSection &) = delete;
+
+public:
+
+    // scpFile:           kept as is (see member of the same name)
+    // rx_file:           path of a text file whose content is the Kaldi rspecifier
+    // feature_transform: path of a transform to load, or "NO_FEATURE_TRANSFORM"/empty for none
+    FeatureSection(wstring scpFile, wstring rx_file, wstring feature_transform) : feature_reader(nullptr) {
+        this->scpFile = scpFile;
+        this->rx = trimmed(fileToStr(toStr(rx_file)));
+        this->feature_transform = toStr(feature_transform);
+
+        if (this->feature_transform == "NO_FEATURE_TRANSFORM") {
+            this->feature_transform = "";
+        }
+
+        feature_reader = new kaldi::RandomAccessBaseFloatMatrixReader(rx);
+
+        //std::wcout << "Kaldi2Reader: created feature reader " << feature_reader << " [" << rx.c_str() << "]" << std::endl;
+
+        if (!this->feature_transform.empty()) {
+            // the destructor does not run when a constructor throws, so release the reader here
+            try {
+                nnet_transf.Read(this->feature_transform);
+            } catch (...) {
+                delete feature_reader;
+                feature_reader = nullptr;
+                throw;
+            }
+        }
+    }
+
+    // Fetch the features stored under 'wkey', run them through the transform if one is
+    // configured, and return them.
+    // The returned reference points to an internal buffer that is overwritten by the next call.
+    // Throws std::runtime_error if the table has no entry for the key.
+    kaldi::Matrix<kaldi::BaseFloat> & read(wstring wkey) {
+        string key = toStr(wkey);
+
+        if (!feature_reader->HasKey(key)) {
+            fprintf(stderr, "Missing features for: %s\n", key.c_str());
+            throw std::runtime_error(msra::strfun::strprintf ("Missing features for: %s", key.c_str()));
+        }
+
+        const kaldi::Matrix<kaldi::BaseFloat> & value = feature_reader->Value(key);
+
+        if (this->feature_transform.empty()) {
+            buf.Resize(value.NumRows(), value.NumCols());
+            buf.CopyFromMat(value);
+        } else {
+            nnet_transf.Feedforward(kaldi::CuMatrix<kaldi::BaseFloat>(value), &feats_transf);
+            buf.Resize(feats_transf.NumRows(), feats_transf.NumCols());
+            feats_transf.CopyToMat(&buf);
+        }
+
+        return buf;
+    }
+
+    ~FeatureSection() {
+        //std::wcout << "Kaldi2Reader: deleted feature reader " << feature_reader << std::endl;
+
+        delete feature_reader;
+    }
+};
+
+// ===========================================================================
+// htkfeatio -- common base class for reading and writing HTK feature files
+// ===========================================================================
+
+class htkfeatio
+{
+protected:
+    htkfeatio() {}
+
+    /*
+    Kaldi is row major and stores each feature as a row. Cntk is col major, but it stores each feature as a column.
+    This makes it ok to copy one to the other straight-up.
+
+    kaldifeat: source matrix, one frame per row
+    cntkfeat:  destination; must already have kaldifeat.NumCols() rows and kaldifeat.NumRows() columns
+    Throws std::logic_error if the dimensions do not match.
+    */
+    template<class MATRIX>
+    void copyKaldiToCntk(kaldi::Matrix<kaldi::BaseFloat> & kaldifeat, MATRIX & cntkfeat) {
+        const int num_rows = kaldifeat.NumRows();
+        const int num_cols = kaldifeat.NumCols();
+        const int src_stride = kaldifeat.Stride();
+
+        const bool same_size = ((size_t) num_rows == (size_t) cntkfeat.cols()) && ((size_t) num_cols == (size_t) cntkfeat.rows());
+        if (!same_size) {
+            // report through an exception rather than terminating the process from library code
+            throw std::logic_error (msra::strfun::strprintf ("copyKaldiToCntk: not same size kaldifeat row-maj(%d,%d) cntkfeat col-maj(%d,%d)",
+                                                             num_rows, num_cols, (int) cntkfeat.rows(), (int) cntkfeat.cols()));
+        }
+        if (num_cols == 0)
+            return;     // nothing to copy; also avoids addressing element (0,r) of a matrix without rows
+
+        // rows of the source may be padded: advance by the stride, copy only the payload
+        const kaldi::BaseFloat * src = kaldifeat.Data();
+        for (int r=0; r<num_rows; r++) {
+            std::copy(src, src+num_cols, &cntkfeat(0, r));
+            src += src_stride;
+        }
+    }
+
+    // counterpart of copyKaldiToCntk(); placeholder without implementation
+    template<class MATRIX>
+    void copyCntkToKaldi() {
+
+    }
+};
+
+// ===========================================================================
+// htkfeatwriter -- write HTK feature file
+// This is designed to write a single file only (no archive mode support).
+// ===========================================================================
+
+class htkfeatwriter : protected htkfeatio
+{
+public:
+    // placeholder constructor: it neither opens nor writes anything
+    htkfeatwriter (wstring path, string kind, size_t dim, unsigned int period)
+    {
+    }
+
+    // write an entire utterance in HTK format
+    // Not implemented here: prints a diagnostic and terminates the process with exit code 1.
+    template<class MATRIX> static void write (const wstring & path, const string & kindstr, unsigned int period, const MATRIX & feat)
+    {
+        fprintf (stderr, "htkfeatwriter::write: not implemented\n");
+        exit(1);
+    }
+
+    // write one basic-typed value to 'os'
+    //  - binary: one byte holding sizeof(t) (negated for unsigned types), then the raw bytes of 't'
+    //  - text:   the value followed by a blank; one-byte types are printed as numbers
+    // Throws std::runtime_error if the stream is in a failed state afterwards.
+    template<class T> static void WriteBasicType (std::ostream &os, bool binary, T t) {
+        if (binary) {
+            char len_c = (std::numeric_limits<T>::is_signed ? 1 : -1) * static_cast<char>(sizeof(t));
+            os.put(len_c);
+            os.write(reinterpret_cast<const char *>(&t), sizeof(t));
+        } else {
+            if (sizeof(t) == 1)
+                os << static_cast<int16>(t) << " ";
+            else
+                os << t << " ";
+        }
+        if (os.fail()) {
+            throw std::runtime_error("Write failure in WriteBasicType.");
+        }
+    }
+
+    // write an entire utterance as a single binary matrix entry
+    // Layout written: key (base name of 'path' without extension), ' ', "\0B", token "FM " or "DM ",
+    // number of frames, feature dimension, then the frames one after another.
+    //  - feat:      matrix with operator(i,j), rows() = feature dimension, cols() = number of frames
+    //  - precision: bytes per value; 4 writes floats ("FM"), 8 writes doubles ("DM")
+    //  - kindstr, period: not used
+    // Values above 50 are replaced by -log(1/featdim).
+    // Note: the data goes straight to 'path'; no tmp file + rename is used here.
+    // Throws std::runtime_error on unsupported precision, if the file cannot be created, or on write failure.
+    template<class MATRIX> static void writeKaldi (const wstring & path, const string & kindstr, unsigned int period, const MATRIX & feat, const int precision)
+    {
+        if (precision != 4 && precision != 8)
+            throw std::runtime_error (msra::strfun::strprintf ("writeKaldi: unsupported precision %d (must be 4 or 8)", precision));
+
+        std::string path_utf8 =msra::strfun::utf8(path);
+        // binary mode is required: in text mode, Windows would expand every 0x0A byte of the payload
+        std::ofstream os(path_utf8.c_str(), std::ios::out | std::ios::binary);
+
+        if (!os.good())
+        {
+            throw std::runtime_error (msra::strfun::strprintf ("writeKaldi: failed to create file '%s'", path_utf8.c_str()));
+        }
+        const size_t featdim = feat.rows();
+        const size_t numframes = feat.cols();
+        const bool binary = true;
+        os << removeExtension(basename(path_utf8)) << ' ';
+        os.put('\0');
+        os.put('B');
+        const std::string my_token = (precision==4 ? "FM" : "DM");
+        //WriteToken(os, binary, my_token);
+        os << my_token << " ";
+        {
+            int32 rows = (int32) numframes;
+            int32 cols = (int32) featdim;
+            WriteBasicType(os, binary, rows);
+            WriteBasicType(os, binary, cols);
+        }
+        vector<float> v (featdim);
+        vector<double> vd (precision == 8 ? featdim : 0);   // only needed for "DM" output
+        for (size_t i = 0; i < numframes; i++)
+        {
+            foreach_index (k, v)
+            {
+                v[k] = feat(k,i);
+                if (v[k] > 50)
+                {
+                    v[k] = -(float)log(1.0/featdim);
+                }
+            }
+            if (featdim == 0)
+                continue;       // nothing to write for this frame, and &v[0] would be invalid
+            if (precision == 4)
+                os.write(reinterpret_cast<const char*> (&v[0]), sizeof (float) * featdim);
+            else
+            {
+                // widen first: writing 8 bytes per value out of the float buffer would read past its end
+                std::copy (v.begin(), v.end(), vd.begin());
+                os.write(reinterpret_cast<const char*> (&vd[0]), sizeof (double) * featdim);
+            }
+        }
+        os.flush();
+        if (!os.good()) {
+            throw std::runtime_error (msra::strfun::strprintf ("writeKaldi: write failure on file '%s'", path_utf8.c_str()));
+        }
+    }
+};
+
+// ===========================================================================
+// htkfeatreader -- read HTK feature file, with archive support
+//
+// To support archives, one instance of this can (and is supposed to) be used
+// repeatedly. All feat files read on the same instance are validated to have
+// the same feature kind.
+//
+// For archives, this caches the last used file handle, in expectation that most reads
+// are sequential anyway. In conjunction with a big buffer, this makes a huge difference.
+// ===========================================================================
+
+class htkfeatreader : protected htkfeatio
+{
+    // information on current file
+    // File handle and feature type information is stored in the underlying htkfeatio object.
+    //TODO make this nicer
+
+public:
+
+    // parser for the "KEY NUMFRAMES" script line syntax used by this reader
+    struct parsedpath
+    {
+    public:
+        FeatureSection * featuresection;    // feature stream to read this utterance from (not owned)
+
+    private:
+        wstring xpath;          // original full path specification as passed to constructor (for error messages)
+        wstring logicalpath;    // sequence ID
+        size_t num_frames;
+
+        void malformed() const { throw std::runtime_error (msra::strfun::strprintf ("parsedpath: malformed path '%S'", xpath.c_str())); }
+
+        // consume and return up to 'delim'; remove from 'input' (we try to avoid C++0x here for VS 2008 compat)
+        wstring consume (wstring & input, const wchar_t * delim)
+        {
+            vector<wstring> parts = msra::strfun::split (input, delim); // (not very efficient, but does not matter here)
+            if (parts.empty())                      // nothing to consume at all (guards the parts[0] access below)
+            {
+                input.clear();
+                return wstring();
+            }
+            if (parts.size() == 1) input.clear();   // not found: consume to end
+            else input = parts[1];                  // found: break at delimiter
+            return parts[0];
+        }
+
+    public:
+        // constructor parses "KEY NUMFRAMES" and remembers the feature section
+        // Throws std::runtime_error if the frame count is missing or negative.
+        parsedpath (wstring xpath, FeatureSection * featuresection) : featuresection (featuresection), xpath (xpath)
+        {
+            // note: inside this body, 'xpath' is the parameter (a working copy); the member keeps the original text
+            logicalpath = consume (xpath, L" ");
+            if (xpath.empty()) malformed();
+
+            const auto frames = msra::strfun::toint(xpath);
+            if (frames < 0) malformed();    // would otherwise wrap around to a huge size_t
+            num_frames = (size_t) frames;
+        }
+
+        // casting to wstring yields the logical path
+        operator const wstring & () const { return logicalpath; }
+
+        // get duration in frames
+        size_t numframes() const
+        {
+            return num_frames;
+        }
+    };
+
+public:
+
+    htkfeatreader() {}
+
+    // helper to create a parsed-path object
+    // const auto path = parse (xpath)
+    parsedpath parse (const wstring & xpath, FeatureSection * featuresection) { return parsedpath (xpath, featuresection); }
+
+    // get the feature dimension of an utterance
+    // Note: this reads the entire utterance through the feature section.
+    void getinfo (const parsedpath & ppath, size_t & featdim)
+    {
+        kaldi::Matrix<kaldi::BaseFloat> & kaldifeat = ppath.featuresection->read(ppath);
+        featdim = kaldifeat.NumCols();
+    }
+
+    // read an entire utterance into an already allocated matrix
+    // Matrix type needs to have operator(i,j)
+    // 'feat' must have the feature dimension as rows() and the frame count given in the script
+    // line as cols(); std::logic_error is thrown otherwise. 'kindstr' and 'period' are not used.
+    template<class MATRIX> void readNoAlloc (const parsedpath & ppath, const string & kindstr, const unsigned int period, MATRIX & feat)
+    {
+        // expected number of frames, from the script line
+        const size_t numframes = ppath.numframes();
+
+        // read vectors from the feature section and push to our target structure
+        kaldi::Matrix<kaldi::BaseFloat> & kaldifeat = ppath.featuresection->read(ppath);
+        const size_t featdim = kaldifeat.NumCols();
+
+        if (feat.cols() != numframes || feat.rows() != featdim) {
+            throw std::logic_error ("read: stripe read called with wrong dimensions");
+        }
+        copyKaldiToCntk(kaldifeat, feat);
+
+#if 0
+        std::wcout << (wstring)ppath << std::endl;
+        for (int c=0; c<10; c++) {
+            for (int r=0; r<10; r++) {
+                std::wcout << feat(r, c) << " ";
+            }
+            std::wcout << std::endl;
+        }
+        exit(1);
+#endif
+    }
+
+    // read an entire utterance into a virgin, allocatable matrix
+    // Matrix type needs to have operator(i,j) and resize(n,m)
+    // 'feat' is resized to (feature dimension) x (frame count given in the script line).
+    // 'kindstr' and 'period' are left untouched.
+    template<class MATRIX> void readAlloc (const parsedpath & ppath, string & kindstr, unsigned int & period, MATRIX & feat)
+    {
+        // expected number of frames, from the script line
+        const size_t numframes = ppath.numframes();
+
+        // read vectors from the feature section and push to our target structure
+        kaldi::Matrix<kaldi::BaseFloat> & kaldifeat = ppath.featuresection->read(ppath);
+        const size_t featdim = kaldifeat.NumCols();
+
+        feat.resize (featdim, numframes);   // result matrix--columns are features
+        copyKaldiToCntk(kaldifeat, feat);
+    }
+};
+
+// One label segment: 'numframes' frames starting at 'firstframe', all carrying 'classid'.
+struct htkmlfentry
+{
+    unsigned int firstframe;    // range [firstframe,firstframe+numframes)
+    unsigned short numframes;
+    //unsigned short classid;     // numeric state id
+    unsigned int classid;     // numeric state id - mseltzer changed from ushort to uint for untied cd phones > 2^16
+
+public:
+
+    // Store the segment [ts,te) with class 'uid' in the compact member types.
+    // Throws std::runtime_error if te < ts, or if one of the values does not fit its member.
+    void setdata (size_t ts, size_t te, size_t uid)
+    {
+        if (ts > te)
+            throw std::runtime_error ("htkmlfentry: end time below start time??");
+        // narrow into the members first ...
+        firstframe = static_cast<unsigned int> (ts);
+        numframes = static_cast<unsigned short> (te - ts);
+        classid = static_cast<unsigned int> (uid);
+        // ... then read them back to detect values that lost bits
+        const bool startok = (firstframe == ts);
+        const bool endok = (firstframe + numframes == te);
+        const bool idok = (classid == uid);
+        if (!(startok && endok && idok))
+            throw std::runtime_error ("htkmlfentry: not enough bits for one of the values");
+    }
+};
+
+// htkmlfreader -- label reader
+// Maps each utterance key to its sequence of ENTRY label segments; read() produces one
+// segment per frame from a Kaldi posterior table. An optional state list maps state names
+// to indices.
+template<class ENTRY, class WORDSEQUENCE>
+class htkmlfreader : public map<wstring,vector<ENTRY>>   // [key][i] the data
+{
+    wstring curpath;                                    // for error messages
+    hash_map<std::string, size_t> statelistmap;   // for state <=> index
+
+    // split 's' in place at any character of 'delim'; 'toks' receives pointers into 's'
+    void strtok (char * s, const char * delim, vector<char*> & toks)
+    {
+        toks.resize (0);
+        char * context = nullptr;
+        for (char * p = strtok_s (s, delim, &context); p; p = strtok_s (NULL, delim, &context))
+            toks.push_back (p);
+    }
+
+    // throw a std::runtime_error that names the file currently being read
+    void malformed (string what)
+    {
+        throw std::runtime_error (msra::strfun::strprintf ("htkmlfreader: %s in '%S'", what.c_str(), curpath.c_str()));
+    }
+
+    // load a text file into 'buffer' and return pointers to its (non-empty) lines
+    // The returned pointers refer into 'buffer', which must therefore outlive them.
+    vector<char*> readlines (const wstring & path, vector<char> & buffer)
+    {
+        // load it into RAM in one huge chunk
+        auto_file_ptr f  (fopenOrDie (path, L"rb"));
+        size_t len = filesize (f);
+        buffer.reserve (len +1);
+        freadOrDie (buffer, len, f);
+        buffer.push_back (0);           // this makes it a proper C string
+
+        // parse into lines
+        vector<char *> lines;
+        lines.reserve (len / 20);
+        strtok (&buffer[0], "\r\n", lines);
+        return lines;
+    }
+
+public:
+
+    // return if input statename is sil state (hard code to compared first 3 chars with "sil")
+    // Note: the name must be longer than 3 characters, so a bare "sil" does not qualify.
+    bool issilstate (const string & statename) const    // (later use some configuration table)
+    {
+        return (statename.size() > 3 && statename.at(0) == 's' && statename.at(1) == 'i' && statename.at(2) == 'l');
+    }
+
+    vector<bool> issilstatetable;       // [state index] => true if is sil state (cached)
+
+    // return if input stateid represent sil state (by table lookup)
+    bool issilstate (const size_t id) const
+    {
+        assert (id < issilstatetable.size());
+        return issilstatetable[id];
+    }
+
+    // constructor reads multiple MLF files
+    // 'restricttokeys' and 'htkTimeToFrame' are passed on to read(), which does not use them.
+    htkmlfreader (const vector<wstring> & paths, const set<wstring> & restricttokeys, const wstring & stateListPath = L"", const double htkTimeToFrame = 100000.0, int targets_delay = 0)
+    {
+        // read state list
+        if (stateListPath != L"")
+            readstatelist (stateListPath);
+
+        // read MLF(s) --note: there can be multiple, so this is a loop
+        foreach_index (i, paths)
+            read (paths[i], restricttokeys, htkTimeToFrame, targets_delay);
+    }
+
+    // note: this function is not designed to be pretty but to be fast
+    // 'path' names a text file whose content is the Kaldi rspecifier of a posterior table.
+    // Every frame must carry exactly one label with posterior 1. Frame 'row' receives the label
+    // of frame (row - targets_delay), or of frame 0 if that index would be negative.
+    // If a state list was loaded, the label number is looked up there by its decimal string.
+    // Throws std::runtime_error on duplicate keys, malformed posteriors or unknown states.
+    void read (const wstring & path, const set<wstring> & restricttokeys, const double htkTimeToFrame, int targets_delay)
+    {
+        fprintf (stderr, "htkmlfreader: reading MLF file %S ...", path.c_str());
+        curpath = path;         // for error messages only
+
+        std::string targets_rspecifier = trimmed(fileToStr(toStr(path)));
+
+        kaldi::SequentialPosteriorReader targets_reader(targets_rspecifier);
+
+        while (!targets_reader.Done()) {
+
+            std::wstring key = toWStr(targets_reader.Key());
+            const kaldi::Posterior & p = targets_reader.Value();    // no copy; only used before Next() is called
+
+            vector<ENTRY> & entries = (*this)[key];
+            if (!entries.empty()) malformed (msra::strfun::strprintf ("duplicate entry '%S'", key.c_str()));
+
+            const int num_rows = (int) p.size();    // number of labels for this utterance
+
+            entries.resize(num_rows);
+
+            for (int row=0; row<num_rows; row++) {
+                const int num_cols = (int) p.at(row).size();
+                if (num_cols != 1)
+                    malformed (msra::strfun::strprintf ("num_cols != 1: %d", num_cols));
+
+                int delay_row = 0;
+                if (row - targets_delay >= 0)
+                {
+                    delay_row = row - targets_delay;
+                }
+
+                const std::pair<int32, float> label = p.at(delay_row).at(0);
+                if (label.second != 1)
+                    malformed (msra::strfun::strprintf ("pair.second != 1: %f", (double) label.second));
+
+                const size_t ts = row;
+                const size_t te = row + 1;
+                size_t target = label.first;
+                
+                if (statelistmap.size() != 0) {
+                    std::string target_str = std::to_string(target);
+                    auto iter = statelistmap.find (target_str);
+                    if (iter == statelistmap.end()) {
+                        throw std::runtime_error (msra::strfun::strprintf ("kaldi htkmlfentry: state %s not found in statelist", target_str.c_str()));
+                    }
+                    target = iter->second;
+                }
+                entries[row].setdata(ts, te, target); 
+            }
+
+            targets_reader.Next();
+        }
+
+        curpath.clear();
+        fprintf (stderr, " total %lu entries\n", (unsigned long) this->size());
+    }
+
+    // read state list, index is from 0
+    // Each line is one state name; its line number becomes the state index.
+    // Throws std::runtime_error if the map ends up smaller than the line count (e.g. repeated names).
+    void readstatelist (const wstring & stateListPath = L"")
+    {
+        if (stateListPath != L"")
+        {
+            vector<char> buffer;    // buffer owns the characters--don't release until done
+            vector<char*> lines = readlines (stateListPath, buffer);
+            size_t index;
+            issilstatetable.reserve (lines.size());
+            for (index = 0; index < lines.size(); index++)
+            {
+                statelistmap[lines[index]] = index;
+                issilstatetable.push_back (issilstate (lines[index]));
+            }
+            if (index != statelistmap.size())
+                throw std::runtime_error (msra::strfun::strprintf ("readstatelist: lines (%d) not equal to statelistmap size (%d)", (int) index, (int) statelistmap.size()));
+            if (statelistmap.size() != issilstatetable.size())
+                throw std::runtime_error (msra::strfun::strprintf ("readstatelist: size of statelookuparray (%d) not equal to statelistmap size (%d)", (int) issilstatetable.size(), (int) statelistmap.size()));
+            fprintf (stderr, "total %lu state names in state list %S\n", (unsigned long) statelistmap.size(), stateListPath.c_str());
+        }
+    }
+
+    // return state num: verify the finetune layer dim
+    size_t getstatenum () const
+    {
+        return statelistmap.size();
+    }
+
+    // return the index of a state name
+    // Throws std::runtime_error for an unknown name. (A plain operator[] lookup would insert the
+    // name with index 0 and thereby also change the value returned by getstatenum().)
+    size_t getstateid (string statename)        // added by Hang Su adaptation
+    {
+        auto iter = statelistmap.find (statename);
+        if (iter == statelistmap.end())
+            throw std::runtime_error (msra::strfun::strprintf ("htkmlfreader: state %s not found in statelist", statename.c_str()));
+        return iter->second;
+    }
+};
+
+};};    // namespaces
--- a/DataReader/Kaldi2Reader/htkfeatio_utils.h
+++ b/DataReader/Kaldi2Reader/htkfeatio_utils.h
@ -0,0 +1,42 @@
+#pragma once
+
+#include <fstream>
+#include <sstream>
+
+namespace msra { namespace asr {
+
+// Narrow a wide string by converting every character to 'char' individually.
+// No encoding conversion is performed, so characters that do not fit a 'char' are not preserved.
+inline std::string toStr(std::wstring w)
+{
+    std::string narrow;
+    narrow.reserve(w.size());
+    for (std::wstring::const_iterator it = w.begin(); it != w.end(); ++it)
+        narrow.push_back(static_cast<char>(*it));
+    return narrow;
+}
+
+// Widen a narrow string by converting every 'char' to 'wchar_t' individually.
+// No encoding conversion is performed.
+inline std::wstring toWStr(std::string s)
+{
+    std::wstring wide;
+    wide.reserve(s.size());
+    for (std::string::const_iterator it = s.begin(); it != s.end(); ++it)
+        wide.push_back(static_cast<wchar_t>(*it));
+    return wide;
+}
+
+// Return the complete content of the file 'fname' as one string.
+// A file that cannot be opened yields an empty string; no error is reported.
+inline std::string fileToStr(std::string fname)
+{
+    std::ifstream input(fname, std::ifstream::in);
+    std::stringstream contents;
+    contents << input.rdbuf();
+    return contents.str();
+}
+
+// Return 'str' without leading and trailing whitespace (blank, tab, CR, LF).
+// CR is included so that content read from a file with CRLF line ends does not keep a
+// trailing '\r'. A string consisting of whitespace only yields an empty string.
+inline std::string trimmed(std::string str)
+{
+    const char * const whitespace = " \t\r\n";
+    const std::string::size_type first = str.find_first_not_of(whitespace);
+    if (first == std::string::npos)
+        return std::string();
+    // a non-whitespace character exists, so the search from the end cannot fail
+    const std::string::size_type last = str.find_last_not_of(whitespace);
+    return str.substr(first, last - first + 1);
+}
+
+}}
--- a/DataReader/Kaldi2Reader/kaldi.h
+++ b/DataReader/Kaldi2Reader/kaldi.h
@ -0,0 +1,6 @@
+#pragma once
+
+#include "util/table-types.h"
+#include "hmm/posterior.h"
+#include "nnet/nnet-nnet.h"
+#include "cudamatrix/cu-device.h"
--- a/DataReader/Kaldi2Reader/latticearchive.cpp
+++ b/DataReader/Kaldi2Reader/latticearchive.cpp
@ -0,0 +1,743 @@
+//
+// <copyright file="latticearchive.cpp" company="Microsoft">
+//     Copyright (c) Microsoft Corporation.  All rights reserved.
+// </copyright>
+//
+
+
+#pragma once
+
+#include "stdafx.h"
+#include "basetypes.h"
+#include "fileutil.h"
+#include "htkfeatio.h"  // for MLF reading for numer lattices
+#include "latticearchive.h"
+#include "msra_mgram.h" // for MLF reading for numer lattices
+#include <stdio.h>
+#include <stdint.h>
+#include <vector>
+#include <string>
+#include <set>
+#include <hash_map>
+#include <regex>
+
+#pragma warning(disable : 4996)
+namespace msra { namespace lattices {
+
+// helper to write a symbol hash (string -> int) to a file
+// File has two sections:
+//  - physicalunitname     // line number is mapping, starting with 0
+//  - logunitname physicalunitname   // establishes a mapping; logunitname will get the same numeric index as physicalunitname
+// Writes 'unitmap' (unit name -> numeric id) to the text file 'symlistpath'.
+// For every id, the first name encountered while iterating the map becomes its physical
+// name; all further names with the same id are written as "logical physical" pairs after
+// the physical section. Throws std::logic_error if an id has no physical name.
+template<class UNITMAP>
+static void writeunitmap (const wstring & symlistpath, const UNITMAP & unitmap)
+{
+    std::vector<std::string> physnames;     // [unit id] -> physical name
+    std::vector<std::string> aliases;       // names sharing an id with an earlier name
+    physnames.reserve (unitmap.size());
+    aliases.reserve (unitmap.size());
+    for (auto entry = unitmap.cbegin(); entry != unitmap.cend(); ++entry)
+    {
+        const std::string & label = entry->first;
+        const size_t unitid = entry->second;
+        if (unitid >= physnames.size())
+            physnames.resize (unitid + 1);  // grown on demand; compactness (all entries filled) is checked while writing
+        std::string & slot = physnames[unitid];
+        if (slot.empty())
+            slot = label;
+        else                                // many-to-one mapping: remember the name; resolved while writing
+            aliases.push_back (label);
+    }
+
+    auto_file_ptr flist = fopenOrDie (symlistpath, L"wb");
+    // section 1: physical units, one per line in order of id
+    for (size_t k = 0; k < physnames.size(); k++)
+    {
+        const std::string & name = physnames[k];
+        if (name.empty())
+            throw std::logic_error ("build: unitmap has gaps");
+        fprintfOrDie (flist, "%s\n", name.c_str());
+    }
+    // section 2: logical-to-physical mappings
+    for (size_t k = 0; k < aliases.size(); k++)
+    {
+        const std::string & logical = aliases[k];
+        const size_t unitid = unitmap.find (logical)->second;   // its id indexes the physical names
+        const std::string & physical = physnames[unitid];
+        fprintfOrDie (flist, "%s %s\n", logical.c_str(), physical.c_str());
+    }
+    fflushOrDie (flist);
+}
+
+// (little helper to do a map::find() with default value)
+// Returns the value stored under 'key', or 'deflt' if the key is absent; either is
+// converted to size_t.
+template<typename MAPTYPE, typename KEYTYPE, typename VALTYPE>
+static size_t tryfind (const MAPTYPE & map, const KEYTYPE & key, VALTYPE deflt)
+{
+    const auto hit = map.find (key);
+    if (hit != map.end())
+        return hit->second;
+    return deflt;
+}
+
+// archive format:
+//  - output files of build():
+//     - OUTPATH                --the resulting archive (a huge file), simple concatenation of binary blocks
+//     - OUTPATH.toc            --contains keys and offsets; this is how content in archive is found
+//       KEY=ARCHIVE[BYTEOFFSET]        // where ARCHIVE can be empty, meaning same as previous
+//     - OUTPATH.symlist    --list of all unit names encountered, in order of numeric index used in archive (first = index 0)
+//                                This file is suitable as an input to HHEd's AU command.
+//  - in actual use,
+//     - .toc files can be concatenated
+//     - .symlist files must remain paired with the archive file
+//  - for actual training, user also needs to provide, typically from an HHEd AU run:
+//     - OUTPATH.tying          --map from triphone units to senone sequence by name; get full phone set from .symlist above
+//       UNITNAME SENONE[2] SENONE[3] SENONE[4]
+// Build a lattice archive from 'infiles'.
+//  - infiles:        lattice paths; in numerator mode (non-empty 'labels') only the key derived from each path is used
+//  - outpath:        archive file to create; "<outpath>.toc" and "<outpath>.symlist" are written alongside
+//  - modelsymmap:    unit name -> id; used when reading lattices and written out as the symlist
+//  - labels:         non-empty: build numer lattices from these labels instead of reading HTK lattices
+//  - unigram, unigramsymbols: passed to the numerator-lattice construction
+// Lattices that fail to read are reported on stderr and skipped; a failure after a successful
+// read (i.e. while writing) is rethrown. Throws std::runtime_error on duplicate keys.
+/*static*/ void archive::build (const std::vector<std::wstring> & infiles, const std::wstring & outpath,
+                                const std::unordered_map<std::string,size_t> & modelsymmap,
+                                const msra::asr::htkmlfreader<msra::asr::htkmlfentry,msra::lattices::lattice::htkmlfwordsequence> & labels,   // non-empty: build numer lattices
+                                const msra::lm::CMGramLM & unigram, const msra::lm::CSymbolSet & unigramsymbols)  // for numer lattices
+{
+#if 0   // little unit test helper for testing the read function
+    bool test = true;
+    if (test)
+    {
+        archive a;
+        a.open (outpath + L".toc");
+        lattice L;
+        std::hash_map<string,size_t> symmap;
+        a.getlattice (L"sw2001_A_1263622500_1374610000", L, symmap);
+        a.getlattice (L"sw2001_A_1391162500_1409287500", L, symmap);
+        return;
+    }
+#endif
+
+    const bool numermode = !labels.empty(); // if labels are passed then we shall convert the MLFs to lattices, and 'infiles' are regular keys
+
+    const std::wstring tocpath = outpath + L".toc";
+    const std::wstring symlistpath = outpath + L".symlist";
+
+    // process all files
+    std::set<std::wstring> seenkeys;        // (keep track of seen keys; throw error for duplicate keys)
+    msra::files::make_intermediate_dirs (outpath);
+
+    auto_file_ptr f = fopenOrDie (outpath, L"wb");
+    auto_file_ptr ftoc = fopenOrDie (tocpath, L"wb");
+    size_t brokeninputfiles = 0;
+    foreach_index (i, infiles)
+    {
+        const std::wstring & inlatpath = infiles[i];
+        fprintf (stderr, "build: processing lattice '%S'\n", inlatpath.c_str());
+
+        // get key
+        std::wstring key = regex_replace (inlatpath, wregex (L"=.*"), wstring());  // delete mapping
+        key = regex_replace (key, wregex (L".*[\\\\/]"), wstring());                // delete path
+        key = regex_replace (key, wregex (L"\\.[^\\.\\\\/:]*$"), wstring());        // delete extension (or not if none)
+        if (!seenkeys.insert (key).second)
+            throw std::runtime_error (msra::strfun::strprintf ("build: duplicate key for lattice '%S'", inlatpath.c_str()));
+
+        // we fail all the time due to totally broken HDecode/copy process, OK if not too many files are missing
+        bool latticeread = false;
+        try
+        {
+            // fetch lattice
+            lattice L;
+            if (!numermode)
+                L.fromhtklattice (inlatpath, modelsymmap);      // read HTK lattice
+            else
+                L.frommlf (key, modelsymmap, labels, unigram, unigramsymbols);       // read MLF into a numerator lattice
+            latticeread = true;
+
+            // write to archive
+            uint64_t offset = fgetpos (f);
+            L.fwrite (f);
+            fflushOrDie (f);
+
+            // write reference to TOC file   --note: TOC file is a headerless UTF8 file; so don't use fprintf %S format (default code page)
+            // the archive path is only given in the first entry actually written; later entries leave it empty ("same as previous")
+            fprintfOrDie (ftoc, "%s=%s[%llu]\n", msra::strfun::utf8 (key).c_str(), ((i - brokeninputfiles) == 0) ? msra::strfun::utf8 (outpath).c_str() : "", (unsigned long long) offset);
+            fflushOrDie (ftoc);
+
+            fprintf (stderr, "written lattice to offset %llu as '%S'\n", (unsigned long long) offset, key.c_str());
+        }
+        catch (const exception & e)
+        {
+            if (latticeread) throw;        // write failure
+            // we ignore read failures
+            fprintf (stderr, "ERROR: skipping unreadable lattice '%S': %s\n", inlatpath.c_str(), e.what());
+            brokeninputfiles++;
+        }
+    }
+
+    // write out the unit map
+    // TODO: This is sort of redundant now--it gets the symmap from the HMM, i.e. always the same for all archives.
+    writeunitmap (symlistpath, modelsymmap);
+
+    // summary: "completed" is the number of lattices actually written, "out of" the number of inputs
+    const size_t total = infiles.size();
+    const size_t completed = total - brokeninputfiles;
+    const double failurepercent = total > 0 ? 100.0 * brokeninputfiles / total : 0.0;
+    fprintf (stderr, "completed %lu out of %lu lattices (%lu read failures, %.1f%%)\n",
+             (unsigned long) completed, (unsigned long) total, (unsigned long) brokeninputfiles, failurepercent);
+}
+
+// helper to set a context value (left, right) with checking of uniqueness
+// An 'unknown' slot takes over 'val'; a slot already holding a different value becomes 'ambiguous'.
+void lattice::nodecontext::setcontext (int & lr, int val)
+{
+    if (lr == val)      // same value again: nothing changes
+        return;
+    lr = (lr == unknown) ? val : (signed short) ambiguous;
+}
+
+// helper for merge() to determine the unique node contexts
+// For every node, records the transition-model index of the phones entering it (left) and
+// leaving it (right); a side with differing phones is marked ambiguous. A trailing /sp/ of an
+// edge is skipped when determining its last phone. Also copies each node's time into 't'.
+// Throws std::logic_error for an empty lattice or !NULL edges, and std::runtime_error for
+// /sp/ in an invalid position, a missing /sil/ model, or inconsistent contexts.
+vector<lattice::nodecontext> lattice::determinenodecontexts (const msra::asr::simplesenonehmm & hset) const
+{
+    if (nodes.empty())      // front()/back() below require at least one node
+        throw std::logic_error ("determinenodecontexts: lattice has no nodes");
+    const size_t spunit = tryfind (hset.getsymmap(), "sp", SIZE_MAX);
+    const size_t silunit = tryfind (hset.getsymmap(), "sil", SIZE_MAX);
+    vector<lattice::nodecontext> nodecontexts (nodes.size());
+    nodecontexts.front().left = nodecontext::start;
+    nodecontexts.front().right = nodecontext::ambiguous;    // (should not happen, but won't harm either)
+    nodecontexts.back().right = nodecontext::end;
+    nodecontexts.back().left = nodecontext::ambiguous;      // (should not happen--we require !sent_end; but who knows)
+    size_t multispseen = 0;                                 // bad entries with multi-sp
+    foreach_index (j, edges)
+    {
+        const auto & e = edges[j];
+        const size_t S = e.S;
+        const size_t E = e.E;
+        auto a = getaligninfo (j);
+        if (a.size() == 0)  // !NULL edge
+            throw std::logic_error ("determinenodecontexts: !NULL edges not allowed in merging, should be removed before");
+        size_t A = a[0].unit;
+        size_t Z = a[a.size()-1].unit;
+        if (Z == spunit)
+        {
+            if (a.size() < 2)
+                throw std::runtime_error ("determinenodecontexts: context-free unit (/sp/) found as a single-phone word");
+            else
+            {
+                Z = a[a.size()-2].unit;
+                if (Z == spunit)        // a buggy lattice --I got this from HVite, to be tracked down
+                {
+                    // search from end once again, to print a warning
+                    int n;
+                    for (n = (int) a.size() -1; n >= 0; n--)
+                        if (a[n].unit != spunit)
+                            break;
+                    // ends with n = position of furthest non-sp
+                    if (n < 0)  // only sp?
+                        throw std::runtime_error ("determinenodecontexts: word consists only of /sp/");
+                    fprintf (stderr, "determinenodecontexts: word with %lu /sp/ at the end found, edge %d\n", (unsigned long) (a.size() -1 - n), (int) j);
+                    multispseen++;
+                    Z = a[n].unit;
+                }
+            }
+        }
+        if (A == spunit || Z == spunit)
+        {
+#if 0
+            fprintf (stderr, "A=%d   Z=%d   fa=%d   j=%d/N=%d    L=%d  n=%d   totalalign=%d  ts/te=%d/%d\n", (int) A, (int) Z, (int) e.firstalign,(int) j, (int) edges.size(), (int) nodes.size(), (int) a.size(), (int) align.size(),
+                    nodes[S].t, nodes[E].t);
+            foreach_index (kk, a)
+                fprintf (stderr, "a[%d] = %d\n", kk, a[kk].unit);
+            dump (stderr, [&] (size_t i) { return hset.gethmm (i).getname(); });
+#endif
+            throw std::runtime_error ("determinenodecontexts: context-free unit (/sp/) found as a start phone or second last phone");
+        }
+        const auto & Ahmm = hset.gethmm (A);
+        const auto & Zhmm = hset.gethmm (Z);
+        int Aid = (int) Ahmm.gettransPindex();
+        int Zid = (int) Zhmm.gettransPindex();
+        nodecontexts[S].setright (Aid);
+        nodecontexts[E].setleft (Zid);
+    }
+    if (multispseen > 0)
+        fprintf (stderr, "determinenodecontexts: %lu broken edges in %lu with multiple /sp/ at the end seen\n", (unsigned long) multispseen, (unsigned long) edges.size());
+    // check CI conditions and put in 't'
+    // We make the hard assumption that there is only one CI phone, /sil/.
+    if (silunit == SIZE_MAX)    // tryfind() default: looking up the model with this index would be invalid
+        throw std::runtime_error ("determinenodecontexts: model set has no /sil/ unit");
+    const auto & silhmm = hset.gethmm (silunit);
+    int silid = (int) silhmm.gettransPindex();
+    foreach_index (i, nodecontexts)
+    {
+        auto & nc = nodecontexts[i];
+        if ((nc.left == nodecontext::unknown) ^ (nc.right == nodecontext::unknown))
+            throw std::runtime_error ("determinenodecontexts: invalid dead-end node in lattice");
+        if (nc.left == nodecontext::ambiguous && nc.right != silid && nc.right != nodecontext::end)
+            throw std::runtime_error ("determinenodecontexts: invalid ambiguous left context (right context is not CI)");
+        if (nc.right == nodecontext::ambiguous && nc.left != silid && nc.left != nodecontext::start)
+            throw std::runtime_error ("determinenodecontexts: invalid ambiguous right context (left context is not CI)");
+        nc.t = nodes[i].t;
+    }
+    return nodecontexts;    // (will this use a move constructor??)
+}
+
+// comparison function for sorting and merging
+// Keys are compared in the order t, left, right, i.
+// Sorting by i last makes i appear before iother, as assumed in the merge function.
+bool lattice::nodecontext::operator< (const nodecontext & other) const
+{
+    // primary key: time
+    const int tdelta = (int) t - (int) other.t;
+    if (tdelta != 0)
+        return tdelta < 0;
+    // secondary key: left context
+    const int leftdelta = left - other.left;
+    if (leftdelta != 0)
+        return leftdelta < 0;
+    // third key: right context
+    const int rightdelta = right - other.right;
+    if (rightdelta != 0)
+        return rightdelta < 0;
+    // final tie-break: 'i' is unsigned and may be SIZE_MAX, so compare directly instead of subtracting
+    return i < other.i;
+}
+
+// remove that final !NULL edge
+// We have that in HAPI lattices, but there can be only one at the end.
+// A !NULL edge is recognized by having no alignment records (its firstalign points at the end of align[]).
+// Does nothing if the lattice has no edges or if the last edge carries alignment records.
+// Throws std::runtime_error if the final !NULL edge does not connect the last two nodes,
+// or if any remaining edge still refers to the removed end node.
+void lattice::removefinalnull()
+{
+    if (edges.empty())  // nothing to remove; also keeps edges.back() below well-defined
+        return;
+    const auto & lastedge = edges.back();
+    // last edge can be !NULL, recognized as having 0 alignment records
+    if (lastedge.firstalign < align.size()) // has alignment records --not !NULL
+        return;
+    if (lastedge.S != nodes.size() -2 || lastedge.E != nodes.size() -1)
+        throw std::runtime_error ("removefinalnull: malformed final !NULL edge");
+    // note: 'lastedge' must not be used beyond this point, the resize invalidates it
+    edges.resize (edges.size() -1); // remove it
+    nodes.resize (nodes.size() -1); // its start node is now the new end node
+    foreach_index (j, edges)
+        if (edges[j].E >= nodes.size())
+            throw std::runtime_error ("removefinalnull: cannot have final !NULL edge and other edges connecting to end node at the same time");
+}
+
+// merge a secondary lattice into the first
+// With lots of caveats:
+//  - this optimizes lattices to true unigram lattices where the only unique node condition is acoustic context
+//  - no !NULL edge at the end, call removefinalnull() before
+//  - this function returns an unsorted edges[] array, i.e. invalid. We sort in uniq'ed representation, which is easier.
+//  - acoustic scores of all edges are reset to 0 and info.hasacscores is cleared
+// Throws std::logic_error if either lattice is in uniq'ed (V2) format or reports zero frames.
+// This function is not elegant at all, just hard labor!
+void lattice::merge (const lattice & other, const msra::asr::simplesenonehmm & hset)
+{
+    if (!edges2.empty() || !other.edges2.empty())
+        throw std::logic_error ("merge: lattice(s) must be in non-uniq'ed format (V1)");
+    // NOTE(review): this only rejects a zero frame count; it does not compare the two counts,
+    // although the message talks about identical frame numbers --confirm which one is intended
+    if (!info.numframes || !other.info.numframes)
+        throw std::logic_error ("merge: lattice(s) must have identical number of frames");
+
+    // establish node contexts
+    auto contexts = determinenodecontexts (hset);
+    auto othercontexts = other.determinenodecontexts (hset);
+
+    // create joint node space and node mapping
+    // This also collapses non-unique nodes.
+    // Note the edge case sil-sil in one lattice which may be sil-ambiguous or ambiguous-sil on the other.
+    // We ignore this, keeping such nodes unmerged. That's OK since middle /sil/ words have zero LM, and thus it's OK to keep them non-connected.
+    foreach_index (i, contexts) contexts[i].i = i;
+    foreach_index (i, othercontexts) othercontexts[i].iother = i;
+    contexts.insert (contexts.end(), othercontexts.begin(), othercontexts.end());   // append othercontext
+    sort (contexts.begin(), contexts.end());
+    vector<size_t> nodemap (nodes.size(), SIZE_MAX);
+    vector<size_t> othernodemap (other.nodes.size(), SIZE_MAX);
+    int j = 0;                      // number of unique contexts kept so far (write position for in-place compaction)
+    foreach_index (i, contexts)     // merge identical nodes  --this is the critical step
+    {
+        if (j == 0 || contexts[j-1].t != contexts[i].t || contexts[j-1].left != contexts[i].left || contexts[j-1].right != contexts[i].right)
+            contexts[j++] = contexts[i];            // entered a new one
+        // node map: old node index (in either lattice) -> index in the joint node space
+        if (contexts[i].i != SIZE_MAX)
+            nodemap[contexts[i].i] = j-1;
+        if (contexts[i].iother != SIZE_MAX)
+            othernodemap[contexts[i].iother] = j-1;
+    }
+    // (size() is a size_t; cast it so that it matches the %d conversion)
+    fprintf (stderr, "merge: joint node space uniq'ed to %d from %d\n", j, (int) contexts.size());
+    contexts.resize (j);
+
+    // create a new node array (just copy the contexts[].t fields)
+    nodes.resize (contexts.size());
+    foreach_index (inew, nodes)
+        nodes[inew].t = (unsigned short) contexts[inew].t;
+    info.numnodes = nodes.size();
+
+    // incorporate the alignment records
+    const size_t alignoffset = align.size();
+    align.insert (align.end(), other.align.begin(), other.align.end());
+
+    // map existing edges' S and E fields, and also 'firstalign'
+    foreach_index (j, edges)
+    {
+        edges[j].S = nodemap[edges[j].S];
+        edges[j].E = nodemap[edges[j].E];
+    }
+    auto otheredges = other.edges;  // (work on a copy since 'other' is const)
+    foreach_index (j, otheredges)
+    {
+        otheredges[j].S = othernodemap[otheredges[j].S];
+        otheredges[j].E = othernodemap[otheredges[j].E];
+        otheredges[j].firstalign += alignoffset;    // that's where they are now
+    }
+
+    // at this point, a new 'nodes' array exists, and the edges already are w.r.t. the new node space and align space
+
+    // now we are ready to merge 'other' edges into this, simply by concatenation
+    edges.insert (edges.end(), otheredges.begin(), otheredges.end());
+
+    // remove acoustic scores --they are likely not identical if they come from different decoders
+    // If we don't do that, this will break the sorting in builduniquealignments()
+    info.hasacscores = 0;
+    foreach_index (j, edges)
+        edges[j].a = 0.0f;
+
+    // Note: we have NOT sorted or de-duplicated yet. That is best done after conversion to the uniq'ed format.
+}
+
+// remove duplicates
+// This must be called in uniq'ed format.
+// Two consecutive edges2[] entries are duplicates if S, E, and firstalign all match; only the first is kept.
+// Updates info.numedges and clears the V1 edges[] array.
+// Throws std::logic_error if edges2[] is empty or if duplicate edges disagree in their 'implysp' flag.
+void lattice::dedup()
+{
+    if (edges2.empty())
+        throw std::logic_error ("dedup: lattice must be in uniq'ed format (V2)");
+
+    size_t k = 0;   // number of edges kept so far (write position for in-place compaction)
+    foreach_index (j, edges2)
+    {
+        if (k > 0 && edges2[k-1].S == edges2[j].S && edges2[k-1].E == edges2[j].E && edges2[k-1].firstalign == edges2[j].firstalign)
+        {
+            if (edges2[k-1].implysp != edges2[j].implysp)
+                throw std::logic_error ("dedup: inconsistent 'implysp' flag for otherwise identical edges");
+            continue;
+        }
+        edges2[k++] = edges2[j];
+    }
+    // (both counts are size_t; cast them so that they match the %d conversions)
+    fprintf (stderr, "dedup: edges reduced to %d from %d\n", (int) k, (int) edges2.size());
+    edges2.resize (k);
+    info.numedges = edges2.size();
+    edges.clear();  // (should already be, but isn't; make sure we no longer use it)
+}
+
+// load all lattices from a TOC file and write them to a new archive
+// Use this to
+//  - upgrade the file format to latest in case of format changes
+//  - check consistency (read only; don't write out)
+//  - dump to stdout
+//  - merge two lattices (for merging numer into denom lattices)
+// Input path is an actual TOC path, output is the stem (.TOC will be added). --yes, not nice, maybe fix it later
+// Example command:
+// convertlatticearchive --latticetocs dummy c:\smbrdebug\sw20_small.den.lats.toc.10 -w c:\smbrdebug\sw20_small.den.lats.converted --cdphonetying c:\smbrdebug\combined.tying --statelist c:\smbrdebug\swb300h.9304.aligned.statelist --transprobs c:\smbrdebug\MMF.9304.transprobs
+// How to regenerate from my test lattices:
+// buildlatticearchive c:\smbrdebug\sw20_small.den.lats.regenerated c:\smbrdebug\hvitelat\*lat
+// We support two special output path syntaxs:
+//  - empty ("") -> don't output, just check the format
+//  - dash ("-") -> dump lattice to stdout instead
+// Parameters:
+//  - intocpath:  TOC file of the primary archive; its lines also define the processing order
+//  - intocpath2: TOC file of a secondary archive to merge in; empty means no merging
+//  - outpath:    output stem (see special syntaxes above)
+//  - hset:       model; supplies the symbol map and the HMM information needed for merging
+// In merge mode, lattices that are missing in the secondary archive are skipped (and counted).
+/*static*/ void archive::convert (const std::wstring & intocpath, const std::wstring & intocpath2, const std::wstring & outpath,
+                                  const msra::asr::simplesenonehmm & hset)
+{
+    const auto & modelsymmap = hset.getsymmap();
+
+    const std::wstring tocpath = outpath + L".toc";
+    const std::wstring symlistpath = outpath + L".symlist";
+
+    // open input archive
+    // TODO: I find that HVite emits redundant physical triphones, and even HHEd seems so (in .tying file).
+    //  Thus, we should uniq the units before sorting. We can do that here if we have the .tying file.
+    //  And then use the modelsymmap to map them down.
+    //  Do this directly in the hset module (it will be transparent).
+    std::vector<std::wstring> intocpaths (1, intocpath);            // set of paths consisting of 1
+    msra::lattices::archive archive (intocpaths, modelsymmap);
+
+    // secondary archive for optional merging operation
+    const bool mergemode = !intocpath2.empty();                     // true if merging two lattices
+    std::vector<std::wstring> intocpaths2;
+    if (mergemode)
+        intocpaths2.push_back (intocpath2);
+    msra::lattices::archive archive2 (intocpaths2, modelsymmap);    // (if no merging then this archive2 is empty)
+
+    // read the intocpath file once again to get the keys in original order
+    std::vector<char> textbuffer;
+    auto toclines = msra::files::fgetfilelines (intocpath, textbuffer);
+
+    auto_file_ptr f = NULL;
+    auto_file_ptr ftoc = NULL;
+
+    // process all files
+    if (outpath != L"" && outpath != L"-")  // test for special syntaxes that bypass to actually create an output archive
+    {
+        msra::files::make_intermediate_dirs (outpath);
+        f = fopenOrDie (outpath, L"wb");
+        ftoc = fopenOrDie (tocpath, L"wb");
+    }
+    vector<const char *> invmodelsymmap;    // only used for dump() mode
+
+    // we must parse the toc file once again to get the keys in original order
+    size_t skippedmerges = 0;
+    size_t numwritten = 0;                  // number of TOC entries written so far; the first one carries the archive path
+    foreach_index (i, toclines)
+    {
+        const char * line = toclines[i];
+        const char * p = strchr (line, '=');
+        if (p == NULL)
+            throw std::runtime_error ("open: invalid TOC line (no = sign): " + std::string (line));
+        const std::wstring key = msra::strfun::utf16 (std::string (line, p - line));
+
+        fprintf (stderr, "convert: processing lattice '%S'\n", key.c_str());
+
+        // fetch lattice  --this performs any necessary format conversions already
+        lattice L;
+        archive.getlattice (key, L);
+
+        lattice L2;
+        if (mergemode)
+        {
+            if (!archive2.haslattice (key))
+            {
+                fprintf (stderr, "convert: cannot merge because lattice '%S' missing in secondary archive; skipping\n", key.c_str());
+                skippedmerges++;
+                continue;
+            }
+            archive2.getlattice (key, L2);
+
+            // merge it in
+            // This will connect each node with matching 1-phone context conditions; aimed at merging numer lattices.
+            L.removefinalnull();    // get rid of that final !NULL headache
+            L2.removefinalnull();
+            L.merge (L2, hset);
+            // note: we are left with dups due to true unigram merging (HTK lattices cannot represent true unigram lattices since id is on the nodes)
+        }
+        //L.removefinalnull();
+        //L.determinenodecontexts (hset);
+
+        // convert it  --TODO: once we permanently use the new format, do this in fread() for V1
+        // Note: Merging may have left this in unsorted format; we need to be robust against that.
+        const size_t spunit = tryfind (modelsymmap, "sp", SIZE_MAX);
+        L.builduniquealignments (spunit);
+
+        if (mergemode)
+            L.dedup();
+
+        if (f && ftoc)
+        {
+            // write to archive
+            uint64_t offset = fgetpos (f);
+            L.fwrite (f);
+            fflushOrDie (f);
+
+            // write reference to TOC file   --note: TOC file is a headerless UTF8 file; so don't use fprintf %S format (default code page)
+            // The archive path goes into the first entry that is actually written. This is not necessarily
+            // the entry for i == 0, since in merge mode leading lattices may have been skipped above.
+            fprintfOrDie (ftoc, "%s=%s[%llu]\n", msra::strfun::utf8 (key).c_str(), (numwritten == 0) ? msra::strfun::utf8 (outpath).c_str() : "", (unsigned long long) offset);
+            fflushOrDie (ftoc);
+            numwritten++;
+
+            fprintf (stderr, "written converted lattice to offset %llu as '%S'\n", (unsigned long long) offset, key.c_str());
+        }
+        else if (outpath == L"-")
+        {
+            if (invmodelsymmap.empty()) // build this lazily
+            {
+                invmodelsymmap.resize (modelsymmap.size());
+                for (auto iter = modelsymmap.begin(); iter != modelsymmap.end(); iter++)
+                    invmodelsymmap[iter->second] = iter->first.c_str();
+            }
+            L.rebuildedges (false);
+            L.dump (stdout, [&] (size_t i) { return invmodelsymmap[i]; } );
+        }
+    }   // end for (toclines)
+    // (counts are size_t; cast them so that they match the %d conversions)
+    if (skippedmerges > 0)
+        fprintf (stderr, "convert: %d out of %d merge operations skipped due to secondary lattice missing\n", (int) skippedmerges, (int) toclines.size());
+
+    // write out the updated unit map
+    if (f && ftoc)
+        writeunitmap (symlistpath, modelsymmap);
+
+    fprintf (stderr, "converted %d lattices\n", (int) toclines.size());
+}
+
+// ---------------------------------------------------------------------------
+// reading lattices from external formats (HTK lat, MLF)
+// ---------------------------------------------------------------------------
+
+// read an HTK lattice
+// The lattice is expected to be freshly constructed (I did not bother to check).
+// Parameters:
+//  - path:    lattice file in HTK text format
+//  - unitmap: maps unit (triphone) labels to unit ids; every label in an alignment must be present
+// Parses the header (lmscale/wdpenalty, N=/L=), then the node lines (I=), then the edge lines (J=)
+// including their alignments (d=), and finally converts the alignments into the uniq'ed storage.
+// Throws std::runtime_error on any malformed, out-of-sequence, or inconsistent input.
+void lattice::fromhtklattice (const wstring & path, const std::unordered_map<std::string,size_t> & unitmap)
+{
+    vector<char> textbuffer;
+    auto lines = msra::files::fgetfilelines (path, textbuffer);
+    if (lines.empty())
+        throw std::runtime_error ("lattice: mal-formed lattice--empty input file (or all-zeroes)");
+    auto iter = lines.begin();
+    // parse out LMF and WP
+    char dummychar = 0;     // dummy for sscanf() end checking
+    for ( ; iter != lines.end() && strncmp (*iter, "N=", 2); iter++)
+    {
+        if (strncmp (*iter, "lmscale=", 8) == 0)    // note: HTK sometimes generates extra garbage space at the end of this line
+            if (sscanf_s (*iter, "lmscale=%f wdpenalty=%f%c", &info.lmf, &info.wp, &dummychar, sizeof (dummychar)) != 2 && dummychar != ' ')
+                throw std::runtime_error ("lattice: mal-formed lmscale/wdpenalty line in lattice: " + string (*iter));
+    }
+
+    // parse N and L
+    if (iter != lines.end())
+    {
+        unsigned long N, L;
+        if (sscanf_s (*iter, "N=%lu L=%lu %c", &N, &L, &dummychar, sizeof (dummychar)) != 2)
+            throw std::runtime_error ("lattice: mal-formed N=/L= line in lattice: " + string (*iter));
+        info.numnodes = N;
+        info.numedges = L;
+        iter++;
+    }
+    else
+        throw std::runtime_error ("lattice: mal-formed before parse N=/L= line in lattice.");
+
+    ASSERT(info.numnodes > 0);
+    nodes.reserve (info.numnodes);
+    // parse the nodes
+    for (size_t i = 0; i < info.numnodes; i++, iter++)
+    {
+        if (iter == lines.end())
+            throw std::runtime_error ("lattice: not enough I lines in lattice");
+        unsigned long itest;
+        float t;
+        if (sscanf_s (*iter, "I=%lu t=%f%c", &itest, &t, &dummychar, sizeof (dummychar)) < 2)
+            throw std::runtime_error ("lattice: mal-formed node line in lattice: " + string (*iter));
+        if (i != (size_t) itest)
+            throw std::runtime_error ("lattice: out-of-sequence node line in lattice: " + string (*iter));
+        nodes.push_back (nodeinfo ((unsigned int) (t / info.frameduration + 0.5)));    // time in seconds -> frame index (rounded)
+        info.numframes = max (info.numframes, (size_t) nodes.back().t);
+    }
+    // parse the edges
+    ASSERT(info.numedges > 0);
+    edges.reserve (info.numedges);
+    align.reserve (info.numedges * 10);  // 10 phones per word on av. should be enough
+    std::string label;
+    for (size_t j = 0; j < info.numedges; j++, iter++)
+    {
+        if (iter == lines.end())
+            throw std::runtime_error ("lattice: not enough J lines in lattice");
+        unsigned long jtest;
+        unsigned long S, E;
+        float a, l;
+        char d[1024];
+        // example:
+        // J=12    S=1    E=13   a=-326.81   l=-5.090  d=:sil-t:s+k:e,0.03:dh:m-ax:m+sil,0.03:sil,0.02:
+        int nvals = sscanf_s (*iter, "J=%lu S=%lu E=%lu a=%f l=%f d=%s", &jtest, &S, &E, &a, &l, d, sizeof (d));
+        if (nvals == 5 && j == info.numedges - 1)    // special case: last edge is a !NULL and thus may have the d= record missing
+            strcpy (d, ":");
+        else if (nvals != 6)
+            throw std::runtime_error ("lattice: mal-formed edge line in lattice: " + string (*iter));
+        if (j != (size_t) jtest)
+            throw std::runtime_error ("lattice: out-of-sequence edge line in lattice: " + string (*iter));
+        // S and E come straight from the file and are used to index nodes[] below, so validate them first
+        if (S >= nodes.size() || E >= nodes.size())
+            throw std::runtime_error ("lattice: edge refers to a non-existent node: " + string (*iter));
+        edges.push_back (edgeinfowithscores (S, E, a, l, align.size()));
+        // build align array
+        size_t edgeframes = 0;      // (for checking whether the alignment sums up right)
+        const char * p = d;
+        if (p[0] != ':' || (p[1] == 0 && j < info.numedges-1))    // last edge may be empty
+            throw std::runtime_error ("lattice: alignment info must start with a colon and must have at least one entry: " + string (*iter));
+        p++;
+        while (*p)
+        {
+            // p points to an entry of the form TRIPHONE,DURATION
+            const char * q = strchr (p, ',');
+            if (q == NULL)
+                throw std::runtime_error ("lattice: alignment entry lacking a comma: " + string (*iter));
+            if (q == p)
+                throw std::runtime_error ("lattice: alignment entry label empty: " + string (*iter));
+            label.assign (p, q-p);  // the triphone label
+            q++;
+            char * ep;
+            double duration = strtod (q, &ep); // (weird--returns a non-const ptr in ep to a const object)
+            p = ep;
+            if (*p != ':')
+                throw std::runtime_error ("lattice: alignment entry not ending with a colon: " + string (*iter));
+            p++;
+            // create the alignment entry
+            const size_t frames = (unsigned int) (duration / info.frameduration + 0.5);
+            auto it = unitmap.find (label);
+            if (it == unitmap.end())
+                throw std::runtime_error ("lattice: unit in alignment that is not in model: " + label);
+            const size_t unitid = it->second;
+            //const size_t unitid = unitmap.insert (make_pair (label, unitmap.size())).first->second;  // may create a new entry with index = #entries
+            align.push_back (aligninfo (unitid, frames));
+            edgeframes += frames;
+        }
+        if (edgeframes != nodes[E].t - (size_t) nodes[S].t)
+        {
+            char msg[128];
+            // (the operands are size_t; cast them to int so that they match the %d conversions and print as signed values)
+            sprintf (msg, "\n-- where edgeframes=%d != (nodes[E].t - nodes[S].t=%d), the gap is %d.", (int) edgeframes, (int) (nodes[E].t - (size_t) nodes[S].t), (int) (edgeframes + nodes[S].t - nodes[E].t));
+            throw std::runtime_error ("lattice: alignment info duration mismatches edge duration: " + string (*iter) + msg);
+        }
+    }
+    if (iter != lines.end())
+        throw std::runtime_error ("lattice: unexpected garbage at end of lattice: " + string (*iter));
+    checklattice();
+
+    // create more efficient storage for alignments
+    const size_t spunit = tryfind (unitmap, "sp", SIZE_MAX);
+    builduniquealignments (spunit);
+
+    showstats();
+}
+
+// construct a numerator lattice from an MLF entry
+// The lattice is expected to be freshly constructed (I did not bother to check).
+// The result is a linear lattice: one node per word boundary and one edge per word of the transcript.
+// Parameters:
+//  - key:            identifies the transcript inside 'labels'
+//  - unitmap:        only used to look up the id of the "sp" unit
+//  - labels:         MLF reader holding the word transcripts (with alignments)
+//  - unigram:        LM used to look up the LM score of each word
+//  - unigramsymbols: symbol table used to find the ids of the special tokens
+// Throws std::runtime_error if there is no or an empty transcript for 'key', or if a value
+// does not fit its bit field; throws std::logic_error for misplaced !sent_start/!sent_end tokens.
+void lattice::frommlf (const wstring & key, const std::unordered_map<std::string,size_t> & unitmap,
+                       const msra::asr::htkmlfreader<msra::asr::htkmlfentry,lattice::htkmlfwordsequence> & labels,
+                       const msra::lm::CMGramLM & unigram, const msra::lm::CSymbolSet & unigramsymbols)
+{
+    const auto & transcripts = labels.allwordtranscripts(); // (TODO: we could just pass the transcripts map--does not really matter)
+
+    // get the labels (state and word)
+    auto iter = transcripts.find (key);
+    if (iter == transcripts.end())
+        throw std::runtime_error ("frommlf: no reference word sequence in MLF for lattice with key " + strfun::utf8 (key));
+    const auto & transcript = iter->second;
+    if (transcript.words.size() == 0)
+        throw std::runtime_error ("frommlf: empty reference word sequence for lattice with key " + strfun::utf8 (key));
+
+    // ids of the special tokens, needed for determining the unigram scores
+    size_t silence = unigramsymbols["!silence"];
+    size_t lmend = unigramsymbols["</s>"];
+    size_t sentstart = unigramsymbols["!sent_start"];
+    size_t sentend = unigramsymbols["!sent_end"];
+
+    // create the lattice
+    nodes.resize (transcript.words.size() +1);
+    edges.resize (transcript.words.size());
+    align.reserve (transcript.align.size());
+    size_t numframes = 0;
+    foreach_index (j, transcript.words)
+    {
+        const auto & w = transcript.words[j];
+        nodes[j].t = w.firstframe;
+        auto & e = edges[j];
+        e.unused = 0;
+        e.S = j;
+        e.E = j+1;
+        if (e.E != j+1)     // E is a bit field; a mismatch means the value got truncated
+            throw std::runtime_error (msra::strfun::strprintf ("frommlf: too many tokens to be represented as edgeinfo::E in label set: %S", key.c_str()));
+        e.a = 0.0f; // no ac score
+
+        // LM score
+        // !sent_start and !silence are patched to LM score 0
+        size_t wid = w.wordindex;
+        if (wid == sentstart)
+        {
+            if (j != 0)
+                throw std::logic_error ("frommlf: found an !sent_start token not at the first position");
+        }
+        else if (wid == sentend)
+        {
+            if (j != (int) transcript.words.size()-1)
+                throw std::logic_error ("frommlf: found an !sent_end token not at the end position");
+            wid = lmend;    // use </s> for score lookup
+        }
+        const int iwid = (int) wid;
+        e.l = (wid != sentstart && wid != silence) ? (float) unigram.score (&iwid, 1) : 0.0f;
+
+        // alignment
+        e.implysp = 0;
+        e.firstalign = align.size();
+        if (e.firstalign != align.size())   // firstalign is a bit field as well; catch truncation just like for E above
+            throw std::runtime_error (msra::strfun::strprintf ("frommlf: too many alignment records to be represented as edgeinfo::firstalign in label set: %S", key.c_str()));
+        auto a = transcript.getaligninfo (j);
+        align.insert (align.end(), a.begin(), a.end());
+        foreach_index (k, a)
+            numframes += a[k].frames;
+    }
+    // the end node sits at the total number of frames
+    nodes[transcript.words.size()].t = (unsigned short) numframes;
+    if (nodes[transcript.words.size()].t != numframes)
+        throw std::runtime_error (msra::strfun::strprintf ("frommlf: too many frames to be represented as nodeinfo::t in label set: %S", key.c_str()));
+    info.lmf = -1.0f;       // indicates not set
+    info.wp = 0.0f;         // not set indicated by lmf < 0
+    info.numedges = edges.size();
+    info.numnodes = nodes.size();
+    info.numframes = numframes;
+    checklattice();
+
+    // create more efficient storage for alignments
+    const size_t spunit = tryfind (unitmap, "sp", SIZE_MAX);
+    builduniquealignments (spunit);
+
+    showstats();
+}
+
+};};
--- a/DataReader/Kaldi2Reader/latticearchive.h
+++ b/DataReader/Kaldi2Reader/latticearchive.h
--- a/DataReader/Kaldi2Reader/latticestorage.h
+++ b/DataReader/Kaldi2Reader/latticestorage.h
@ -0,0 +1,119 @@
+//
+// <copyright file="latticestorage.h" company="Microsoft">
+//     Copyright (c) Microsoft Corporation.  All rights reserved.
+// </copyright>
+//
+// latticestorage.h -- basic data structures for storing lattices
+
+
+#if 0       // [v-hansu]  separate code with history
+#endif
+
+#pragma once
+#include <string>       // for the error message in checkoverflow() only
+#include <stdexcept>
+#include <stdint.h>
+
+#undef INITIAL_STRANGE              // [v-hansu] intialize structs to strange values
+#define PARALLEL_SIL                // [v-hansu] process sil on CUDA, used in other files, please search this
+#define LOGZERO -1e30f
+
+namespace msra { namespace lattices {
+
+// Verify that a value survived being stored in a (bit) field.
+//  - fieldval:  the value read back from the field after the assignment
+//  - targetval: the value that was meant to be stored
+//  - fieldname: name of the field, only used for the error message
+// Throws std::runtime_error if both differ, i.e. if the field was too narrow and the value got truncated.
+static void checkoverflow (size_t fieldval, size_t targetval, const char * fieldname)
+{
+    if (fieldval != targetval)
+    {
+        char buf[1000];
+        // print both values in hex so that the numbers match the "0x" prefix
+        sprintf_s (buf, "lattice: bit field %s too small for value 0x%zx (cut to 0x%zx)", fieldname, targetval, fieldval);
+        throw std::runtime_error (buf);
+    }
+}
+
+// lattice node; currently only stores the time associated with the node
+struct nodeinfo
+{
+    //uint64_t firstinedge : 24;  // index of first incoming edge
+    //uint64_t firstoutedge : 24; // index of first outgoing edge
+    //uint64_t t : 16;            // time associated with this
+    unsigned short t;            // time associated with this
+    // construct from a time value; throws std::runtime_error (via checkoverflow) if it does not fit into 't'
+    nodeinfo (size_t pt) : t ((unsigned short) pt)   //, firstinedge (NOEDGE), firstoutedge (NOEDGE)
+    {
+        checkoverflow (t, pt, "nodeinfo::t");
+        //checkoverflow (firstinedge, NOEDGE, "nodeinfo::firstinedge");
+        //checkoverflow (firstoutedge, NOEDGE, "nodeinfo::firstoutedge");
+    }
+    // default constructor: leaves 't' uninitialized unless INITIAL_STRANGE is defined
+    nodeinfo()   // [v-hansu] initialize to impossible values
+    {
+#ifdef INITIAL_STRANGE
+        t = (unsigned short) -1;    // (C-style cast: a functional cast cannot name a multi-word type in standard C++)
+#endif
+    }
+};
+// V2 format: a and l are stored in separate vectors
+// The bit-field widths add up to 64 bits (19 + 1 + 19 + 1 + 24).
+struct edgeinfo
+{
+    uint64_t S : 19;            // start node
+    uint64_t unused : 1;        // (for future use)
+    uint64_t E : 19;            // end node
+    uint64_t implysp : 1;       // 1--alignment ends with a /sp/ that is not stored
+    uint64_t firstalign : 24;   // index into align for first entry; end is firstalign of next edge
+    // Construct from full-width values; throws std::runtime_error (via checkoverflow) if a value does not fit its bit field.
+    // The initializers are listed in declaration order, which is the order in which members get initialized anyway.
+    // Note that the labels passed to checkoverflow still carry the 'edgeinfowithscores::' prefix.
+    edgeinfo (size_t pS, size_t pE, size_t pfirstalign) : S (pS), unused (0), E (pE), implysp (0), firstalign (pfirstalign)
+    {
+        checkoverflow (S, pS, "edgeinfowithscores::S");
+        checkoverflow (E, pE, "edgeinfowithscores::E");
+        checkoverflow (firstalign, pfirstalign, "edgeinfowithscores::firstalign");
+    }
+    // default constructor: leaves all fields uninitialized unless INITIAL_STRANGE is defined
+    edgeinfo()  // [v-hansu] initialize to impossible values
+    {
+#ifdef INITIAL_STRANGE
+        S = uint64_t (-1);
+        unused = uint64_t (-1);
+        E = uint64_t (-1);
+        implysp = uint64_t (-1);
+        firstalign = uint64_t (-1);
+#endif
+    }
+};
+// V1 format: a and l are included in the edge itself
+struct edgeinfowithscores : edgeinfo
+{
+    float a;
+    float l;
+    // passes the node and alignment indices on to edgeinfo and stores the two scores as given
+    edgeinfowithscores (size_t pS, size_t pE, float a, float l, size_t pfirstalign)
+        : edgeinfo (pS, pE, pfirstalign),
+          a (a),
+          l (l)
+    {
+    }
+    // default constructor: scores are only set if INITIAL_STRANGE is defined
+    edgeinfowithscores()   // [v-hansu] initialize to impossible values
+    {
+#ifdef INITIAL_STRANGE
+        a = l = LOGZERO;
+#endif
+    }
+};
+// The bit-field widths add up to 32 bits (19 + 11 + 1 + 1).
+struct aligninfo                // phonetic alignment
+{
+    unsigned int unit : 19;     // triphone index
+    unsigned int frames : 11;   // duration in frames
+    // note: V1 did not have the following, which were instead the top 2 bits of 'frames'
+    unsigned int unused : 1;    // (for future use)
+    unsigned int last : 1;      // set for last entry
+    // construct from full-width values; throws std::runtime_error (via checkoverflow) if a value does not fit its bit field
+    aligninfo (size_t punit, size_t pframes) : unit ((unsigned int) punit), frames ((unsigned int) pframes), unused (0), last (0)
+    {
+        checkoverflow (unit, punit, "aligninfo::unit");
+        checkoverflow (frames, pframes, "aligninfo::frames");
+    }
+    // default constructor: leaves all fields uninitialized unless INITIAL_STRANGE is defined
+    aligninfo()    // [v-hansu] initialize to impossible values
+    {
+#ifdef INITIAL_STRANGE
+        // (C-style casts: a functional cast cannot name a multi-word type in standard C++)
+        unit = (unsigned int) -1;
+        frames = (unsigned int) -1;
+        unused = (unsigned int) -1;
+        last = (unsigned int) -1;
+#endif
+    }
+    template<class IDMAP> void updateunit (const IDMAP & idmap/*[unit] -> new unit*/)   // update 'unit' w.r.t. a different mapping, with bit-field overflow check
+    {
+        const size_t mappedunit = idmap[unit];
+        unit = (unsigned int) mappedunit;
+        checkoverflow (unit, mappedunit, "aligninfo::unit");
+    }
+};
+};};
--- a/DataReader/Kaldi2Reader/minibatchiterator.h
+++ b/DataReader/Kaldi2Reader/minibatchiterator.h
@ -0,0 +1,268 @@
+//
+// <copyright file="minibatchiterator.h" company="Microsoft">
+//     Copyright (c) Microsoft Corporation.  All rights reserved.
+// </copyright>
+//
+// minibatchiterator.h -- iterator for minibatches
+
+
+#pragma once
+#define NONUMLATTICEMMI     // [v-hansu] move from main.cpp, no numerator lattice for mmi training
+
+#include <vector>
+#include <unordered_map>
+#include "ssematrix.h"
+#include "latticearchive.h"         // for reading HTK phoneme lattices (MMI training)
+#include "simple_checked_arrays.h"  // for const_array_ref
+
+namespace msra { namespace dbn {
+
+// ---------------------------------------------------------------------------
+// latticesource -- manages loading of lattices for MMI (in pairs for numer and denom)
+// ---------------------------------------------------------------------------
+class latticesource
+{
+    // the two lattice archives: numerator lattices and denominator lattices
+    const msra::lattices::archive numlattices, denlattices;
+public:
+    // latticetocs: pair of path lists; 'first' is handed to the numerator archive, 'second' to the denominator archive
+    // modelsymmap: handed to both archives
+    latticesource (std::pair<std::vector<wstring>,std::vector<wstring>> latticetocs, const std::unordered_map<std::string,size_t> & modelsymmap)
+        : numlattices (latticetocs.first, modelsymmap), denlattices (latticetocs.second, modelsymmap) {}
+
+    // true if there are no denominator lattices
+    // Unless NONUMLATTICEMMI is defined, this also verifies that the numerator archive agrees
+    // (both empty or both non-empty) and throws std::runtime_error if it does not.
+    bool empty() const
+    {
+        const bool nodenominators = denlattices.empty();
+#if !defined (NONUMLATTICEMMI)  // TODO:set NUM lattice to null so as to save memory
+        if (numlattices.empty() != nodenominators)
+            throw std::runtime_error("latticesource: numerator and denominator lattices must be either both empty or both not empty");
+#endif
+        return nodenominators;
+    }
+
+    // true if a lattice is available for 'key'
+    // With NONUMLATTICEMMI only the denominator archive is consulted; otherwise both archives must have it.
+    bool haslattice (wstring key) const
+    {
+#ifndef NONUMLATTICEMMI
+        if (!numlattices.haslattice (key))
+            return false;
+#endif
+        return denlattices.haslattice (key);
+    }
+
+    // a (numerator, denominator) pair of lattices; all accessors below report on the denominator ('second') only
+    class latticepair : public pair<msra::lattices::lattice,msra::lattices::lattice>
+    {
+    public:
+        // NOTE: we don't check numerator lattice now
+        size_t getnumframes () const
+        {
+            return second.getnumframes();
+        }
+        size_t getnumnodes () const
+        {
+            return second.getnumnodes();
+        }
+        size_t getnumedges () const
+        {
+            return second.getnumedges();
+        }
+        wstring getkey () const
+        {
+            return second.getkey();
+        }
+    };
+
+    // fetch the lattice pair for 'key' and return it through 'L'
+    // Only the denominator lattice ('second') of a freshly allocated pair is filled in; the numerator stays default-constructed.
+    // 'expectedframes' is handed on to the archive's getlattice().
+    void getlattices (const std::wstring & key, shared_ptr<const latticesource::latticepair> & L, size_t expectedframes) const
+    {
+        shared_ptr<latticepair> fresh (new latticepair);
+        denlattices.getlattice (key, fresh->second, expectedframes);  // this loads the lattice from disk into fresh->second
+        L = fresh;
+    }
+};
+
+
+// ---------------------------------------------------------------------------
+// minibatchsource -- abstracted interface into frame sources
+// There are three implementations:
+//  - the old minibatchframesource to randomize across frames and page to disk
+//  - minibatchutterancesource that randomizes in chunks and pages from input files directly
+//  - a wrapper that uses a thread to read ahead in parallel to CPU/GPU processing
+// ---------------------------------------------------------------------------
+class minibatchsource
+{
+public:
+    // read a minibatch
+    // This function returns all values in a "caller can keep them" fashion:
+    //  - uids are stored in a huge 'const' array, and will never go away
+    //  - transcripts are copied by value
+    //  - lattices are returned as a shared_ptr
+    // Thus, getbatch() can be called in a thread-safe fashion, allowing for a 'minibatchsource' implementation that wraps another with a read-ahead thread.
+    // Return value is 'true' if it did read anything from disk, and 'false' if data came only from RAM cache. This is used for controlling the read-ahead thread.
+    // Parameters:
+    //  - globalts:        position on the global time line to read from (minibatchiterator passes its current minibatch start frame)
+    //  - framesrequested: number of frames asked for
+    //  - feat, uids, transcripts, lattices: output buffers to be filled by the implementation
+    // This overload is the single-input/single-output form (one feature matrix, one label sequence).
+    virtual bool getbatch (const size_t globalts,
+                           const size_t framesrequested, msra::dbn::matrix & feat, std::vector<size_t> & uids,
+                           std::vector<const_array_ref<msra::lattices::lattice::htkmlfwordsequence::word>> & transcripts,
+                           std::vector<shared_ptr<const latticesource::latticepair>> & lattices) = 0;
+    // alternate (updated) definition for multiple inputs/outputs - read as a vector of feature matrixes or a vector of label strings
+    // (this is the overload that minibatchiterator calls)
+    virtual bool getbatch (const size_t globalts,
+                           const size_t framesrequested, std::vector<msra::dbn::matrix> & feat, std::vector<std::vector<size_t>> & uids,
+                           std::vector<const_array_ref<msra::lattices::lattice::htkmlfwordsequence::word>> & transcripts,
+                           std::vector<shared_ptr<const latticesource::latticepair>> & lattices) = 0;
+    // presumably the total number of frames this source can deliver -- confirm against the implementations
+    virtual size_t totalframes() const = 0;
+
+    virtual double gettimegetbatch () = 0;                          // used to report runtime
+    virtual size_t firstvalidglobalts (const size_t globalts) = 0;  // get first valid epoch start from intended 'globalts'
+    virtual const std::vector<size_t> & unitcounts() const = 0;     // report number of senones
+    virtual void setverbosity(int newverbosity) = 0;    
+    // virtual destructor so that implementations can be destroyed through a base-class pointer
+    virtual ~minibatchsource() { }
+};
+
+
+// ---------------------------------------------------------------------------
+// minibatchiterator -- class to iterate over one epoch, minibatch by minibatch
+// This iterator supports both random frames and random utterances through the minibatchsource interface which is common to both.
+// This supports multiple data passes with identical randomization; which is intended to be used for utterance-based training.
+// ---------------------------------------------------------------------------
+class minibatchiterator
+{
+    void operator= (const minibatchiterator &); // (non-copyable)
+
+    const size_t epochstartframe;           // start of the epoch on the global time line
+    const size_t epochendframe;             // end of the epoch (exclusive): iteration ends once mbstartframe reaches it
+    size_t firstvalidepochstartframe;       // epoch start frame rounded up to first utterance boundary after epoch boundary
+    const size_t requestedmbframes;         // requested mb size; actual minibatches can be smaller (or even larger for lattices)
+    const size_t datapasses;                // we return the data this many times; caller must sub-sample with 'datapass'
+
+    msra::dbn::minibatchsource & source;    // feature source to read from
+
+    std::vector<msra::dbn::matrix> featbuf;              // buffer for holding current minibatch's frames
+    std::vector<std::vector<size_t>> uids;               // buffer for storing current minibatch's frame-level label sequence
+    std::vector<const_array_ref<msra::lattices::lattice::htkmlfwordsequence::word>> transcripts;    // buffer for storing current minibatch's word-level label sequences (if available and used; empty otherwise)
+    std::vector<shared_ptr<const latticesource::latticepair>> lattices;     // lattices of the utterances in current minibatch (empty in frame mode)
+
+    size_t mbstartframe;                    // current start frame into generalized time line (used for frame-wise mode and for diagnostic messages)
+    size_t actualmbframes;                  // actual number of frames in current minibatch
+    size_t datapass;                        // current datapass = pass through the data
+    double timegetbatch;                    // [v-hansu] for time measurement
+    double timechecklattice;
+private:
+    // fetch the next mb
+    // This updates featbuf, uids[], transcripts, lattices, and actualmbframes (mbstartframe is advanced by the caller).
+    // Throws std::logic_error if lattices were returned whose total frame count differs from the minibatch size.
+    void fillorclear()
+    {
+        if (!hasdata()) // we hit the end of the epoch: just cleanly clear out everything (not really needed, can't be requested ever)
+        {
+            foreach_index(i, featbuf)
+                featbuf[i].resize (0, 0);
+
+            foreach_index(i,uids)
+                uids[i].clear();
+
+            transcripts.clear();
+            lattices.clear();       // also drop the lattices, so that currentmblattices() does not keep reporting the previous minibatch
+            actualmbframes = 0;
+            return;
+        }
+        // process one mini-batch (accumulation and update)
+        assert (requestedmbframes > 0);
+        const size_t requestedframes = min (requestedmbframes, epochendframe - mbstartframe);    // (< mbsize at end)
+        assert (requestedframes > 0);
+        source.getbatch (mbstartframe, requestedframes, featbuf, uids, transcripts, lattices);
+        timegetbatch = source.gettimegetbatch();
+        actualmbframes = featbuf[0].cols(); // for single i/o, featbuf has length 1
+        // note:
+        //  - in frame mode, actualmbframes may still return less if at end of sweep
+        //  - in utterance mode, it likely returns less than requested, and
+        //    it may also be > epochendframe (!) for the last utterance, which, most likely, crosses the epoch boundary
+        auto_timer timerchecklattice;
+        if (!lattices.empty())
+        {
+            size_t totalframes = 0;
+            foreach_index (i, lattices)
+                totalframes += lattices[i]->getnumframes();
+            if (totalframes != actualmbframes)
+                throw std::logic_error ("fillorclear: frames in lattices do not match minibatch size");
+        }
+        timechecklattice = timerchecklattice;
+    }
+    bool hasdata() const { return mbstartframe < epochendframe; } // true if we can access and/or advance
+    void checkhasdata() const { if (!hasdata()) throw std::logic_error ("minibatchiterator: access beyond end of epoch"); }
+
+    // common tail of both constructors: find the first valid start frame, log the epoch range, and fetch the first minibatch
+    // 'epoch' is only used for the log message.
+    void startepoch (size_t epoch)
+    {
+        firstvalidepochstartframe = source.firstvalidglobalts (epochstartframe); // epochstartframe may fall between utterance boundaries; this gets us the first valid boundary
+        fprintf (stderr, "minibatchiterator: epoch %zu: frames [%zu..%zu] (first utterance at frame %zu) with %zu datapasses\n",
+                 epoch, epochstartframe, epochendframe, firstvalidepochstartframe, datapasses);
+        mbstartframe = firstvalidepochstartframe;
+        datapass = 0;
+        fillorclear(); // get the first batch
+    }
+public:
+    // interface: for (minibatchiterator i (...), i, i++) { ... }
+    // The epoch covers the frames [epoch * epochframes, (epoch + 1) * epochframes).
+    // (initializers are listed in member declaration order, which is the order in which they execute)
+    minibatchiterator (msra::dbn::minibatchsource & source, size_t epoch, size_t epochframes, size_t requestedmbframes, size_t datapasses)
+        : epochstartframe (epoch * epochframes),
+          epochendframe (epochstartframe + epochframes),
+          requestedmbframes (requestedmbframes),
+          datapasses (datapasses),
+          source (source),
+          timegetbatch (0), timechecklattice (0)
+    {
+        startepoch (epoch);
+    }
+
+    // TODO not nice, but don't know how to access these frames otherwise
+    // mbiterator constructor, set epochstart and -endframe explicitly
+    // ('epoch' is only used for the log message here)
+    minibatchiterator (msra::dbn::minibatchsource & source, size_t epoch, size_t epochstart, size_t epochend, size_t requestedmbframes, size_t datapasses)
+        : epochstartframe (epochstart),
+          epochendframe (epochend),
+          requestedmbframes (requestedmbframes),
+          datapasses (datapasses),
+          source (source),
+          timegetbatch (0), timechecklattice (0)
+    {
+        startepoch (epoch);
+    }
+
+    // need virtual destructor to ensure proper destruction
+    virtual ~minibatchiterator()
+    {}
+
+    // returns true if we still have data
+    operator bool() const { return hasdata(); }
+
+    // advance to the next minibatch
+    // Throws std::logic_error if called when there is no data left.
+    void operator++(int/*denotes postfix version*/)
+    {
+        checkhasdata();
+        mbstartframe += actualmbframes;
+        // if we hit the end, we will get mbstartframe >= epochendframe <=> !hasdata()
+        // (most likely actually mbstartframe > epochendframe since the last utterance likely crosses the epoch boundary)
+        // in case of multiple datapasses, reset to start when hitting the end
+        if (!hasdata() && datapass + 1 < datapasses)
+        {
+            mbstartframe = firstvalidepochstartframe;
+            datapass++;
+            fprintf (stderr, "\nminibatchiterator: entering %zu-th repeat pass through the data\n", datapass+1);
+        }
+        fillorclear();
+    }
+
+    // accessors to current minibatch
+    size_t currentmbstartframe() const { return mbstartframe; }
+    size_t currentmbframes() const { return actualmbframes; }
+    size_t currentmblattices() const { return lattices.size(); }
+    size_t currentdatapass() const { return datapass; } // 0..datapasses-1; use this for sub-sampling
+    size_t requestedframes() const {return requestedmbframes; }
+    double gettimegetbatch () {return timegetbatch;}
+    double gettimechecklattice () {return timechecklattice;}
+    bool isfirst() const { return mbstartframe == firstvalidepochstartframe && datapass == 0; }
+    // fraction of the work done after the current minibatch, counted over all data passes
+    float progress() const  // (note: 100%+eps possible for last utterance)
+    {
+        const float epochframes = (float) (epochendframe - epochstartframe);
+        return (mbstartframe + actualmbframes - epochstartframe + datapass * epochframes) / (datapasses * epochframes);
+    }
+    std::pair<size_t,size_t> range() const { return make_pair (epochstartframe, epochendframe); }
+
+    // return the current minibatch frames as a matrix ref into the feature buffer
+    // Number of frames is frames().cols() == currentmbframes().
+    // For frame-based randomization, this is 'requestedmbframes' most of the times, while for utterance randomization,
+    // this depends highly on the utterance lengths.
+    // User is allowed to manipulate the frames... for now--TODO: move silence filtering here as well
+
+    // multi-input form: frames of the i-th feature stream
+    msra::dbn::matrixstripe frames(size_t i) { checkhasdata(); assert(featbuf.size()>=i+1); return msra::dbn::matrixstripe (featbuf[i], 0, actualmbframes); }
+
+    // single-input form: only valid if there is exactly one feature stream
+    msra::dbn::matrixstripe frames() { checkhasdata(); assert(featbuf.size()==1); return msra::dbn::matrixstripe (featbuf[0], 0, actualmbframes); }
+
+    // return the reference transcript labels (state alignment) for current minibatch
+    /*const*/ std::vector<size_t> & labels() { checkhasdata(); assert(uids.size()==1);return uids[0]; }
+    /*const*/ std::vector<size_t> & labels(size_t i) { checkhasdata(); assert(uids.size()>=i+1); return uids[i]; }
+
+    // return a lattice for an utterance (caller should first get total through currentmblattices())
+    // Note: 'uttindex' is not range-checked here.
+    shared_ptr<const msra::dbn::latticesource::latticepair> lattice (size_t uttindex) const { return lattices[uttindex]; }    // lattices making up the current minibatch
+
+    // return the reference transcript labels (words with alignments) for current minibatch (or empty if no transcripts requested)
+    const_array_ref<msra::lattices::lattice::htkmlfwordsequence::word> transcript (size_t uttindex) { return transcripts.empty() ? const_array_ref<msra::lattices::lattice::htkmlfwordsequence::word>() : transcripts[uttindex]; }
+};
+
+};};
--- a/DataReader/Kaldi2Reader/minibatchsourcehelpers.h
+++ b/DataReader/Kaldi2Reader/minibatchsourcehelpers.h
@ -0,0 +1,279 @@
+//
+// <copyright file="minibatchsourcehelpers.h" company="Microsoft">
+//     Copyright (c) Microsoft Corporation.  All rights reserved.
+// </copyright>
+//
+// minibatchsourcehelpers.h -- helper classes for minibatch sources
+//
+
+#pragma once
+
+#include "basetypes.h"
+#include <stdio.h>
+#include <vector>
+#include <algorithm>
+
+#ifndef __unix__
+#include "ssematrix.h"      // for matrix type
+#endif
+
+namespace msra { namespace dbn {
+
+// ---------------------------------------------------------------------------
+// augmentneighbors() -- augmenting features with their neighbor frames
+// ---------------------------------------------------------------------------
+    
+// implant a sub-vector into a vector, for use in augmentneighbors
+// 'outv' is treated as a sequence of slots of inv.size() elements each; 'inv' is copied into slot number 'subvecindex'.
+template<class INV, class OUTV> static void copytosubvector (const INV & inv, size_t subvecindex, OUTV & outv)
+{
+    const size_t slotsize = inv.size();
+    assert (outv.size() % slotsize == 0);       // the output must consist of whole slots
+    const size_t offset = subvecindex * slotsize;   // first element of the target slot
+    for (size_t i = 0; i < slotsize; i++)
+        outv[offset + i] = inv[i];
+}
+
+// compute the augmentation extent (how many frames added on each side)
+//  - featdim:  dimension of one input frame (augment from); must not be 0
+//  - modeldim: dimension of the augmented vector (augment to); must be an odd multiple of featdim
+// Returns (modeldim / featdim) / 2, i.e. the number of neighbor frames on each side of the center frame.
+// Throws runtime_error if featdim is 0, if modeldim is not a multiple of featdim, or if the resulting
+// window has an even number of frames (which could not be centered).
+static size_t augmentationextent (size_t featdim/*augment from*/, size_t modeldim/*to*/)
+{
+    if (featdim == 0)   // guard the division and modulo below
+        throw runtime_error ("augmentationextent: input feature dimension must not be 0");
+
+    const size_t windowframes = modeldim / featdim;   // total number of frames to generate
+    const size_t extent = windowframes / 2;           // extend each side by this
+
+    if (modeldim % featdim != 0)
+        throw runtime_error ("augmentationextent: model vector size not multiple of input features");
+    if (windowframes % 2 == 0)  // (cast: the format string expects an int, not a size_t)
+        throw runtime_error (msra::strfun::strprintf ("augmentationextent: neighbor expansion of input features to %d not symmetrical", (int) windowframes));
+
+    return extent;
+}
+
+// augment one frame with its neighbor frames (correctly not expanding across utterance boundaries)
+// frames[t] goes into the center sub-vector of 'v'; the sub-vectors before and after it receive the frames
+// preceding and following t. The number of neighbors per side is augmentationextent (frames[t].size(), v.size()).
+// The boundaryflags[] array, if not empty, flags first (-1) and last (+1) frame, i.e. frames to not expand across.
+// If it is empty, 'frames' is taken to be one full utterance and indices stay within [0, frames.size()).
+// Once an index cannot move any further, the frame it rests on is repeated.
+template<class MATRIX, class VECTOR> static void augmentneighbors (const MATRIX & frames, const std::vector<char> & boundaryflags, size_t t,
+                                                                   VECTOR & v)
+{
+    // distance between two neighboring context frames
+#ifdef SAMPLING_EXPERIMENT
+    const size_t stride = SAMPLING_EXPERIMENT;
+#else
+    const size_t stride = 1;
+#endif
+
+    // how many frames are we adding on each side
+    const size_t extent = augmentationextent (frames[t].size(), v.size());
+    const bool fullutterance = boundaryflags.empty();   // boundary flags not given: 'frames' is full utterance
+
+    // the frame itself sits right in the middle
+    copytosubvector (frames[t], extent, v);
+
+    // walk outwards in both directions, one neighbor per side and iteration
+    size_t left = t;
+    size_t right = t;
+    for (size_t dist = 1; dist <= extent; dist++)
+    {
+        if (fullutterance)
+        {
+            // index does not move beyond either end of 'frames'
+            if (left >= stride)
+                left -= stride;
+            if (right + stride < frames.size())
+                right += stride;
+        }
+        else
+        {
+            // index does not move beyond a set boundaryflag, because that's the start/end of the utterance
+            if (boundaryflags[left] != -1)
+                left -= stride;
+            if (boundaryflags[right] != 1)
+                right += stride;
+        }
+        copytosubvector (frames[left], extent - dist, v);
+        copytosubvector (frames[right], extent + dist, v);
+    }
+}
+
+// augment one frame with a (possibly asymmetric) context of neighbor frames (correctly not expanding across utterance boundaries)
+// Layout of 'v', in sub-vectors of frames[t].size() elements each:
+//   [0 .. leftextent-1]                       left neighbors (farthest first)
+//   [leftextent]                              frame t itself
+//   [leftextent+1 .. leftextent+rightextent]  right neighbors (nearest first)
+// i.e. 'v' must hold leftextent + 1 + rightextent sub-vectors.
+// The boundaryflags[] array, if not empty, flags first (-1) and last (+1) frame, i.e. frames to not expand across.
+// If it is empty, 'frames' is taken to be one full utterance and indices stay within [0, frames.size()).
+// Once an index cannot move any further, the frame it rests on is repeated.
+template<class MATRIX, class VECTOR> static void augmentneighbors(const MATRIX & frames, const std::vector<char> & boundaryflags, size_t t, const size_t leftextent, const size_t rightextent,
+    VECTOR & v)
+{
+
+    // copy the frame and its neighbors
+    // Once we hit a boundaryflag in either direction, do not move index beyond.
+    copytosubvector(frames[t], leftextent, v);     // frame[t] sits right after the 'leftextent' left neighbors
+    size_t t1 = t;  // index for frames on to the left
+    size_t t2 = t;  // and right
+
+    for (size_t n = 1; n <= leftextent; n++)
+    {
+        if (boundaryflags.empty())  // boundary flags not given: 'frames' is full utterance
+        {
+            if (t1 > 0) t1--;                   // index does not move beyond boundary
+        }
+        else
+        {
+            if (boundaryflags[t1] != -1) t1--;  // index does not move beyond a set boundaryflag,
+        }
+        copytosubvector(frames[t1], leftextent - n, v);
+    }
+    for (size_t n = 1; n <= rightextent; n++)
+    {
+        if (boundaryflags.empty())  // boundary flags not given: 'frames' is full utterance
+        {
+            if (t2 + 1 < frames.size()) t2++;
+        }
+        else
+        {
+            if (boundaryflags[t2] != 1) t2++;   // because that's the start/end of the utterance
+        }
+        // right neighbors are placed relative to the center slot, which is at 'leftextent'
+        // (using 'rightextent' here would misplace them whenever the two extents differ)
+        copytosubvector(frames[t2], leftextent + n, v);
+    }
+}
+
+// matrix form of the symmetric single-frame augmentation:
+// frame t of frames[] is augmented according to boundaryflags[], and the result is written to column j of v
+template<class INMATRIX, class OUTMATRIX> static void augmentneighbors (const INMATRIX & frames, const std::vector<char> & boundaryflags, size_t t,
+                                                                        OUTMATRIX & v, size_t j)
+{
+    auto column = v.col(j);     // target column, filled in by the vector overload
+    augmentneighbors (frames, boundaryflags, t, column);
+}
+
+// matrix form of the single-frame augmentation with explicit left/right extents:
+// frame t of frames[] is augmented according to boundaryflags[], and the result is written to column j of v
+template<class INMATRIX, class OUTMATRIX> static void augmentneighbors(const INMATRIX & frames, const std::vector<char> & boundaryflags, size_t t, size_t leftextent, size_t rightextent,
+    OUTMATRIX & v, size_t j)
+{
+    auto column = v.col(j);     // target column, filled in by the vector overload
+    augmentneighbors(frames, boundaryflags, t, leftextent, rightextent, column);
+}
+
+// augment neighbor frames for a sequence of frames (part of an utterance, possibly spanning across boundaries)
+// Frame ts+j is augmented into column j of v, for all frames in [ts,te); v therefore needs te-ts columns.
+template<class MATRIX> static void augmentneighbors (const std::vector<std::vector<float>> & frames, const std::vector<char> & boundaryflags,
+                                                     size_t ts, size_t te,  // range [ts,te)
+                                                     MATRIX & v)
+{
+    for (size_t j = 0; ts + j < te; j++)
+    {
+        auto column = v.col(j);     // the vector to fill in
+        augmentneighbors (frames, boundaryflags, ts + j, column);
+    }
+}
+
+
+// augment neighbor frames for a sequence of frames (part of an utterance, possibly spanning across boundaries),
+// using explicit left/right extents
+// Frame ts+j is augmented into column j of v, for all frames in [ts,te); v therefore needs te-ts columns.
+template<class MATRIX> static void augmentneighbors(const std::vector<std::vector<float>> & frames, const std::vector<char> & boundaryflags, size_t leftextent, size_t rightextent,
+    size_t ts, size_t te,  // range [ts,te)
+    MATRIX & v)
+{
+    for (size_t j = 0; ts + j < te; j++)
+    {
+        auto column = v.col(j);     // the vector to fill in
+        augmentneighbors(frames, boundaryflags, ts + j, leftextent, rightextent, column);
+    }
+}
+
+// ---------------------------------------------------------------------------
+// randomordering -- class to help manage randomization of input data
+// ---------------------------------------------------------------------------
+
+// draw a pseudo-random number in [begin, end), built from two consecutive ::rand() calls
+// The caller must ensure end > begin (end == begin would be a modulo by zero).
+static inline size_t rand (const size_t begin, const size_t end)
+{
+    // widen to size_t BEFORE multiplying: where RAND_MAX equals INT_MAX, the product computed in 'int'
+    // would overflow, which is undefined behavior. For small RAND_MAX the value is unchanged.
+    const size_t randno = (size_t) ::rand() * RAND_MAX + (size_t) ::rand();   // BUGBUG: still only covers 32-bit range
+    return begin + randno % (end - begin);
+}
+
+// Maintains a randomized permutation map[t] -> t' over 'len' positions, in which every element stays
+// within a window of 'randomizationrange' around its original position. The permutation is generated
+// lazily per seed (see operator()) and cached until a different seed is requested or resize() is called.
+class randomordering                // note: NOT thread-safe at all
+{
+    // constants for randomization
+    const static size_t randomizeAuto=0;
+    const static size_t randomizeDisable=(size_t)-1;
+
+    typedef unsigned int INDEXTYPE; // don't use size_t, as this saves HUGE amounts of RAM
+    std::vector<INDEXTYPE> map;          // [t] -> t' indices in randomized order
+    size_t currentseed;             // seed for current sequence
+    size_t randomizationrange;      // t - randomizationrange/2 <= t' < t + randomizationrange/2 (we support this to enable swapping)
+                                    // special values (randomizeAuto, randomizeDisable)
+    // mark the cached sequence as stale
+    // NOTE: (size_t) -1 doubles as the "no sequence cached" marker, so a sequence for that particular seed value is never generated.
+    void invalidate() { currentseed = (size_t) -1; }
+public:
+    // starts out empty; randomizationrange gets a defined value here so that it is never read uninitialized before resize()
+    randomordering() : randomizationrange (randomizeAuto) { invalidate(); }
+
+    // set the number of positions and the window size; a window size of 0 (randomizeAuto) selects the full length
+    void resize (size_t len, size_t p_randomizationrange) { randomizationrange = p_randomizationrange>0?p_randomizationrange:len; map.resize (len); invalidate(); }
+
+    // return the randomized feature bounds for a time range
+    // i.e. the range [ts,te) widened by half a window on either side and clipped to [0, map.size())
+    std::pair<size_t,size_t> bounds (size_t ts, size_t te) const
+    {
+        size_t tbegin = max (ts, randomizationrange/2) - randomizationrange/2;
+        size_t tend = min (te + randomizationrange/2, map.size());
+        return std::make_pair<size_t,size_t> (move(tbegin), move(tend));
+    }
+
+    // this returns the map directly (read-only) and will lazily initialize it for a given seed
+    // Throws std::runtime_error if the corpus is too large for INDEXTYPE or for the random generator,
+    // and std::logic_error if the generated sequence violates the window condition.
+    const std::vector<INDEXTYPE> & operator() (size_t seed) //throw()
+    {
+        // if wrong seed then lazily recache the sequence
+        if (seed != currentseed)
+        {
+            // test for numeric overflow
+            // (an empty map trivially fits; checking it first also avoids the wrap-around of size()-1)
+            if (!map.empty() && map.size()-1 != (INDEXTYPE) (map.size()-1))
+                throw std::runtime_error ("randomordering: INDEXTYPE has too few bits for this corpus");
+            // 0, 1, 2...
+            foreach_index (t, map) map[t] = (INDEXTYPE) t;
+            // now randomize them
+            if (randomizationrange != randomizeDisable)
+            {
+    #if 1       // change to 0 to disable randomizing
+                if (map.size() > RAND_MAX * (size_t) RAND_MAX)
+                    throw std::runtime_error ("randomordering: too large training set: need to change to different random generator!");
+                srand ((unsigned int) seed);
+                size_t retries = 0;
+                foreach_index (t, map)
+                {
+                    for (int tries = 0; tries < 5; tries++)
+                    {
+                        // swap current pos with a random position
+                        // Random positions are limited to t+randomizationrange.
+                        // This ensures some locality suitable for paging with a sliding window.
+                        const size_t tbegin = max ((size_t) t, randomizationrange/2) - randomizationrange/2; // range of window  --TODO: use bounds() function above
+                        const size_t tend = min (t + randomizationrange/2, map.size());
+                        assert (tend >= tbegin);                    // (guard against potential numeric-wraparound bug)
+                        const size_t trand = rand (tbegin, tend);   // random number within windows
+                        assert ((size_t) t <= trand + randomizationrange/2 && trand < (size_t) t + randomizationrange/2);
+                        // if range condition is fulfilled then swap
+                        if (trand <= map[t] + randomizationrange/2 && map[t] < trand + randomizationrange/2
+                            && (size_t) t <= map[trand] + randomizationrange/2 && map[trand] < (size_t) t + randomizationrange/2)
+                        {
+                            ::swap (map[t], map[trand]);
+                            break;
+                        }
+                        // but don't multi-swap stuff out of its range (for swapping positions that have been swapped before)
+                        // instead, try again with a different random number
+                        retries++;
+                    }
+                }
+                fprintf (stderr, "randomordering: %zu retries for %zu elements (%.1f%%) to ensure window condition\n", retries, map.size(), 100.0 * retries / map.size());
+                // ensure the window condition
+                foreach_index (t, map) assert ((size_t) t <= map[t] + randomizationrange/2 && map[t] < (size_t) t + randomizationrange/2);
+    #if 1       // and a live check since I don't trust myself here yet
+                foreach_index (t, map) if (!((size_t) t <= map[t] + randomizationrange/2 && map[t] < (size_t) t + randomizationrange/2))
+                {
+                    fprintf (stderr, "randomordering: windowing condition violated %d -> %d\n", t, map[t]);
+                    throw std::logic_error ("randomordering: windowing condition violated");
+                }
+    #endif
+    #endif
+    #if 1       // test whether it is indeed a unique complete sequence
+                auto map2 = map;
+                ::sort (map2.begin(), map2.end());
+                foreach_index (t, map2) assert (map2[t] == (size_t) t);
+    #endif
+                // diagnostic: show the first two entries (only if they exist, to avoid reading beyond the end of a tiny map)
+                if (map.size() >= 2)
+                    fprintf (stderr, "randomordering: recached sequence for seed %d: %d, %d, ...\n", (int) seed, (int) map[0], (int) map[1]);
+                else
+                    fprintf (stderr, "randomordering: recached sequence for seed %d (%d elements)\n", (int) seed, (int) map.size());
+            }
+            currentseed = seed;
+        }
+        return map; // caller can now access it through operator[]
+    }
+};
+
+//typedef unsigned short CLASSIDTYPE; // type to store state ids; don't use size_t --saves HUGE amounts of RAM
+typedef unsigned int CLASSIDTYPE; //mseltzer - change to unsigned int for untied context-dependent phones 
+
+};};
--- a/DataReader/Kaldi2Reader/msra_mgram.h
+++ b/DataReader/Kaldi2Reader/msra_mgram.h
--- a/DataReader/Kaldi2Reader/notes.txt
+++ b/DataReader/Kaldi2Reader/notes.txt
@ -0,0 +1,115 @@
+*** TODO ***
+
+0) why is the getbatch: getting utterances so slow...?
+1) Timit feedforward
+2) Timit LSTM
+3) get writer working
+
+scp
+counts
+feature_transform
+
+readMethod
+  rollingWindow - cache file; randomize frames
+  blockRandomize (default) - directly from feature files; randomize frames (if frameMode == false, randomize utterances)
+randomize
+  Auto
+  None
+minibatchMode
+  Partial (default) - smaller last minibatch
+  Full - ignore last minibatch
+scpFile:
+dim:
+  frame dimension
+mlfFile:
+  HTK file containing labels
+labelDim:
+  number of possible labels
+labelMappingFile:
+  all labels seen in MLF file
+frameMode:
+  true - randomize at frame level
+  false - randomize at utterance level
+nbruttsineachrecurrentiter:
+  number of utterances processed in parallel (default 1)
+truncated
+  true
+  false
+
+=== Add ===
+countsFile: to get counts and names...
+feature_transform
+
+scp:/data/sls/scratch/leoliu/data/telugu_debug/train/merged.feats.scp
+scp:/data/sls/scratch/leoliu/data/telugu_debug/train/merged.labels.scp
+
+=== READ ===
+
+PrepareForTrainingOrTesting
+  get labels
+  create map of <uttid, list of htkmlfentry>
+    using msra::asr::htkmlfreader <--- change this (htkfeatio.h)
+
+      const vector<wstring> & paths,          file
+      const set<wstring> & restricttokeys,    set of uttids
+      const wstring & stateListPath,          mapping file
+      const WORDSYMBOLTABLE * wordmap,        X
+      const UNITSYMBOLTABLE * unitmap,        X
+      const double htkTimeToFrame             X
+
+  blockRandomize
+    minibatchutterancesourcemulti - utterancesourcemulti.h <--- change this
+
+      const std::vector<std::vector<wstring>> & infiles                               2D-array (make 1D?)
+      const std::vector<map<wstring,std::vector<msra::asr::htkmlfentry>>> & labels    labels from above
+      std::vector<size_t> vdim                                                        feature dimension
+      std::vector<size_t> udim                                                        label dimension
+      std::vector<size_t> leftcontext                                                 left context
+      std::vector<size_t> rightcontext                                                right context
+      size_t randomizationrange                                                       how much to randomize
+      const latticesource & lattice                                                   empty
+      const map<wstring,msra::lattices::lattice::htkmlfwordsequence> & allwordtranscripts  empty
+      const bool framemode                                                            shuffle by frame or utterance
+
+  rollingWindow
+    minibatchframesourcemulti - rollingwindowsource.h <--- change this
+
+      const std::vector<std::vector<wstring>> & infiles
+      const std::vector<map<std::wstring,std::vector<msra::asr::htkmlfentry>>> & labels
+      std::vector<size_t> vdim
+      std::vector<size_t> udim
+      std::vector<size_t> leftcontext
+      std::vector<size_t> rightcontext
+      size_t randomizationrange
+      const std::vector<wstring> & pagepath                                               cache file names
+      const bool mayhavenoframe=false                                                     false
+      int addEnergy=0                                                                     0
+
+  They both use htkfeatio.h <--- change this!
+  create kaldifeatio.h
+
+StartMinibatchLoopToTrainOrTest
+
+  source = that source from above
+  m_mbiter = new msra::dbn::minibatchiterator(*source, epoch, requestedEpochSamples, mbSize, datapasses);
+
+      source.getbatch
+      source.gettimegetbatch
+      source.firstvalidglobalts
+  
+  feat = m_mbiter->frames(id);
+
+=== WRITE ===
+
+PrepareForWriting
+  m_inputFilesMultiIO - either the scp list or a 2D list or HTK files <--- change this (make 1D?)
+  m_fileEvalSource = new msra::dbn::FileEvalSource(realDims, numContextLeft, numContextRight, evalchunksize);
+
+GetMinibatchToWrite
+  msra::asr::htkfeatreader reader; <--- change this
+  const auto path = reader.parse(m_inputFilesMultiIO[i][m_inputFileIndex]);
+  reader.read (path, featkind, sampperiod, feat);
+
+  m_fileEvalSource->AddFile(feat, featkind, sampperiod, i);   //sampPeriods not used, featkind not used
+  m_fileEvalSource->CreateEvalMinibatch();
+  m_fileEvalSource->ChunkOfFrames(id);
--- a/DataReader/Kaldi2Reader/numahelpers.h
+++ b/DataReader/Kaldi2Reader/numahelpers.h
@ -0,0 +1,254 @@
+//
+// <copyright file="numahelpers.h" company="Microsoft">
+//     Copyright (c) Microsoft Corporation.  All rights reserved.
+// </copyright>
+//
+// numahelpers.h -- some helpers with NUMA
+
+#pragma once
+
+#ifndef __unix__
+#include <Windows.h>
+#include "pplhelpers.h"
+
+#endif
+#include <stdexcept>
+#include "simple_checked_arrays.h"
+#include "basetypes.h"  // for FormatWin32Error
+
+namespace msra { namespace numa {
+
+// ... TODO: this can be a 'static', as it should only be set during foreach_node but not outside
+extern int node_override;   // -1 = normal operation; >= 0: force a specific NUMA node
+
+// force a specific NUMA node (only do this during single-threading!)
+// Stores 'n' in the global 'node_override', which getcurrentnode() consults first:
+// a value >= 0 makes getcurrentnode() return that node; calling without an argument
+// stores -1, which restores the regular lookup.
+static inline void overridenode (int n = -1)
+{
+    node_override = n;
+}
+
+// get the number of NUMA nodes we would like to distinguish
+// This is the highest NUMA node number reported by GetNumaHighestNodeNumber() plus one;
+// if that query fails, a single node is reported.
+static inline size_t getnumnodes()
+{
+    ULONG highestnode;
+    if (GetNumaHighestNodeNumber (&highestnode))
+        return highestnode + 1;
+    return 1;   // query failed
+}
+
+// execute body (node, i, n), i in [0,n) on all NUMA nodes in small chunks
+// 'body' is invoked as body (node, step, steps) once for every combination of
+// node in [0,nodes) and step in [0,steps), where nodes = getnumnodes() and
+// steps = ceil (ppl_cores / nodes) * (multistep ? 16 : 1).
+// Each of the nodes * steps parallel iterations grabs one not-yet-taken step, trying the
+// NUMA node it currently runs on first and then the other nodes in round-robin order.
+// Throws std::logic_error if there are more nodes than the hard-coded counter array holds.
+// NOTE(review): getcurrentnode() is declared further down in this header; confirm that the
+// toolchain in use resolves it when this template is instantiated.
+template <typename FUNCTION> void parallel_for_on_each_numa_node (bool multistep, const FUNCTION & body)
+{
+    // get our configuration
+    const size_t cores = ppl_cores;
+    assert (cores > 0);
+    const size_t nodes = getnumnodes();
+    const size_t corespernode = (cores -1) / nodes + 1;    // rounded up
+    // break into 16 steps per core if 'multistep' is set, else one step per core
+    const size_t stepspernode = multistep ? 16 : 1;
+    const size_t steps = corespernode * stepspernode;
+    // now run on many threads, hoping to hit all NUMA nodes, until we are done
+    hardcoded_array<LONG/*unsigned int*/,256> nextstepcounters;    // next block to run for a NUMA node
+    if (nodes > nextstepcounters.size())
+        throw std::logic_error ("parallel_for_on_each_numa_node: nextstepcounters buffer too small, need to increase hard-coded size");
+    for (size_t k = 0; k < nodes; k++) nextstepcounters[k] = 0;
+    overridenode();     // clear any forced node so that getcurrentnode() below does the real lookup
+    //unsigned int totalloops = 0;    // for debugging only, can be removed later
+    msra::parallel::parallel_for (0, nodes * steps /*execute each step on each NUMA node*/, 1, [&](size_t /*dummy*/)
+    {
+        const size_t numanodeid = getcurrentnode();
+        // find a node that still has work left, preferring our own node
+        // Towards the end we will run on wrong nodes, but what can we do.
+        for (size_t node1 = numanodeid; node1 < numanodeid + nodes; node1++)
+        {
+            const size_t node = node1 % nodes;
+            // the counter is incremented atomically; the value before the increment is the step we own
+            const unsigned int step = InterlockedIncrement (&nextstepcounters[node]) -1;  // grab this step
+            if (step >= steps)  // if done then counter has exceeded the required number of steps
+                continue;       // so try next NUMA node
+            // found one: execute and terminate loop
+            body (node, step, steps);
+            //InterlockedIncrement (&totalloops);
+            return; // done
+        }
+        // oops??
+        // (there are exactly nodes * steps iterations and nodes * steps steps to hand out)
+        throw std::logic_error ("parallel_for_on_each_numa_node: no left-over block found--should not get here!!");
+    });
+    //assert (totalloops == nodes * steps);
+}
+
+// execute a passed function once for each NUMA node
+// This must be run from the main thread only.
+// While f() runs for node i, the NUMA node override is set to i, so getcurrentnode()
+// returns i during that call. The override is cleared again (set to -1) when this
+// function is left, including when f() throws.
+// ... TODO: honor ppl_cores == 1 for comparative measurements against single threads.
+template<typename FUNCTION>
+static void foreach_node_single_threaded (const FUNCTION & f)
+{
+    // clears the override on every exit path; without this, an exception thrown by f()
+    // would leave a node forced for all later getcurrentnode() calls
+    struct overrideguard { ~overrideguard() { overridenode (-1); } } clearonexit;
+    const size_t n = getnumnodes();
+    for (size_t i = 0; i < n; i++)
+    {
+        overridenode ((int) i);
+        f();
+    }
+}
+
+// get the current NUMA node
+// A forced node (see overridenode()) takes precedence; otherwise the node is looked up
+// for the processor the calling thread currently runs on. Returns 0 if that lookup fails.
+static inline size_t getcurrentnode()
+{
+    // a forced node wins; this is used in initializations
+    if (node_override >= 0)
+        return (size_t) node_override;
+    // no override: look up the node of the current processor
+    const DWORD processor = GetCurrentProcessorNumber();    // note: need to change for >63 processors
+    UCHAR numanode;
+    const BOOL found = GetNumaProcessorNode ((UCHAR) processor, &numanode);
+    if (!found)
+        return 0;
+    if (numanode == 0xff)
+        throw std::logic_error ("GetNumaProcessorNode() failed to determine NUMA node for GetCurrentProcessorNumber()??");
+    return numanode;
+}
+
+// allocate memory
+// Allocation seems to be at least on a 512-byte boundary. We nevertheless verify alignment requirements.
+typedef LPVOID (WINAPI *VirtualAllocExNuma_t) (HANDLE,LPVOID,SIZE_T,DWORD,DWORD,DWORD);
+// (VirtualAllocExNuma_t)-1 means "not looked up yet"; after the lookup in malloc() this is
+// either the function's address or NULL if kernel32.dll does not export it
+static VirtualAllocExNuma_t VirtualAllocExNuma = (VirtualAllocExNuma_t)-1;
+
+// Allocates 'n' bytes of committed read/write memory: on the node returned by getcurrentnode()
+// if VirtualAllocExNuma() is available, else through VirtualAllocEx() without NUMA placement.
+// On the NUMA path, requests below 1 MB are rounded up to 1 MB.
+// Returns NULL (after logging to stderr) if the allocation fails.
+// Throws std::logic_error if the returned address is not a multiple of 'align'.
+static inline void * malloc (size_t n, size_t align)
+{
+    // VirtualAllocExNuma() only exists on Vista+, so go through an explicit function pointer
+    if (VirtualAllocExNuma == (VirtualAllocExNuma_t)-1)
+    {
+        VirtualAllocExNuma = (VirtualAllocExNuma_t) GetProcAddress (GetModuleHandle ( TEXT ("kernel32.dll")), "VirtualAllocExNuma");
+    }
+
+    // if we have the function then do a NUMA-aware allocation
+    void * p;
+    if (VirtualAllocExNuma != NULL)
+    {
+        size_t node = getcurrentnode();
+        // "all Win32 heap allocations that are 1 MB or greater are forwarded directly to NtAllocateVirtualMemory
+        // when they are allocated and passed directly to NtFreeVirtualMemory when they are freed" Greg Colombo, 2010/11/17
+        if (n < 1024*1024)
+            n = 1024*1024;    // -> brings NUMA-optimized code back to Node Interleave level (slightly faster)
+        p = VirtualAllocExNuma (GetCurrentProcess(), NULL, n, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE, (DWORD) node);
+    }
+    else    // on old OS call no-NUMA version
+    {
+        p = VirtualAllocEx (GetCurrentProcess(), NULL, n, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE);
+    }
+    // size_t does not match %d on 64-bit builds, so print through an explicit 64-bit conversion
+    if (p == NULL)
+        fprintf (stderr, "numa::malloc: failed allocating %llu bytes with alignment %llu\n", (unsigned long long) n, (unsigned long long) align);
+    // note: a NULL result passes this check (0 % align == 0) and is returned to the caller
+    if (((size_t) p) % align != 0)
+        throw std::logic_error ("VirtualAllocExNuma() returned an address that does not match the alignment requirement");
+    return p;
+}
+
+// free memory allocated with numa::malloc()
+// 'p' must not be NULL. Throws std::logic_error if the release fails.
+static inline void free (void * p)
+{
+    assert (p != NULL);
+    const BOOL released = VirtualFree (p, 0, MEM_RELEASE);
+    if (!released)
+        throw std::logic_error ("VirtualFreeEx failure");
+}
+
+// dump memory allocation
+// For each NUMA node, prints to stderr the available memory in MB as reported by
+// GetNumaAvailableMemoryNode(), or the Win32 error text if that query fails.
+// 'what' is a caller-supplied prefix printed at the start of each line.
+static inline void showavailablememory (const char * what)
+{
+    size_t n = getnumnodes();
+    for (size_t i = 0; i < n; i++)
+    {
+        ULONGLONG availbytes = 0;
+        BOOL rc = GetNumaAvailableMemoryNode ((UCHAR) i, &availbytes);
+        const double availmb = availbytes / (1024.0*1024.0);
+        // 'i' is a size_t; convert it explicitly so that it matches the %d conversion
+        if (rc)
+            fprintf (stderr, "%s: %8.2f MB available on NUMA node %d\n", what, availmb, (int) i);
+        else
+            fprintf (stderr, "%s: error '%S' for getting available memory on NUMA node %d\n", what, FormatWin32Error (::GetLastError()).c_str(), (int) i);
+    }
+}
+
+// determine NUMA node with most memory available
+// Returns node 0 if no node reports more than 0 available bytes.
+static inline size_t getmostspaciousnumanode()
+{
+    const size_t numnodes = getnumnodes();
+    size_t winner = 0;
+    ULONGLONG winnerbytes = 0;
+    for (size_t node = 0; node < numnodes; node++)
+    {
+        ULONGLONG nodebytes = 0;    // pre-zeroed; the result of the query is not checked
+        GetNumaAvailableMemoryNode ((UCHAR) node, &nodebytes);
+        if (nodebytes <= winnerbytes)
+            continue;   // not better than what we have
+        winnerbytes = nodebytes;
+        winner = node;
+    }
+    return winner;
+}
+
+#if 0   // this is no longer used (we now parallelize the big matrix products directly)
+// class to manage multiple copies of data on local NUMA nodes
+// Holds one CACHEDTYPE object per NUMA node (indexed by node id) plus a reference to the
+// DATATYPE object they are created/updated from.
+// NOTE(review): this code is compiled out. Before re-enabling, check at least:
+//  - getclone() const calls getcloneptr(), which is not declared const
+//  - the second constructor initializes 'data' from a dereferenced null pointer
+template<class DATATYPE,class CACHEDTYPE> class numalocaldatacache
+{
+    // not copyable: copy constructor and assignment are declared private and not defined here
+    numalocaldatacache (const numalocaldatacache&); numalocaldatacache & operator= (const numalocaldatacache&);
+
+    // the data set we associate to
+    const DATATYPE & data;
+
+    // cached copies of the models for NUMA
+    vector<unique_ptr<CACHEDTYPE>> cache;
+
+    // get the pointer to the clone for the NUMA node of the current thread (must exist)
+    CACHEDTYPE * getcloneptr()
+    {
+        return cache[getcurrentnode()].get();
+    }
+public:
+    // creates one copy of 'data' per NUMA node; each copy is constructed while the
+    // node override is set to that node (see foreach_node_single_threaded())
+    numalocaldatacache (const DATATYPE & data) : data (data), cache (getnumnodes())
+    {
+        foreach_node_single_threaded ([&]()
+        {
+            cache[getcurrentnode()].reset (new CACHEDTYPE (data));
+        });
+    }
+
+    // this takes the cached versions of the parent model
+    // Each per-node object is constructed from the parent cache's clone for that node and the three extra arguments.
+    template<typename ARGTYPE1,typename ARGTYPE2,typename ARGTYPE3>
+    numalocaldatacache (numalocaldatacache<DATATYPE,DATATYPE> & parentcache, const ARGTYPE1 & arg1, const ARGTYPE2 & arg2, const ARGTYPE3 & arg3) : data (*(DATATYPE*)nullptr), cache (getnumnodes())
+    {
+        foreach_node_single_threaded ([&]()
+        {
+            const DATATYPE & parent = parentcache.getclone();
+            size_t numanodeid = getcurrentnode();
+            cache[numanodeid].reset (new CACHEDTYPE (parent, arg1, arg2, arg3));
+        });
+    }
+
+    // re-clone --update clones from the cached 'data' reference
+    // This is only valid if CACHEDTYPE==DATATYPE.
+    // ... parallelize this!
+    void reclone()
+    {
+        parallel_for_on_each_numa_node (true, [&] (size_t numanodeid, size_t step, size_t steps)
+        {
+            // only the first step of each node does the copy; all other steps are no-ops
+            if (step != 0)
+                return;     // ... TODO: tell parallel_for_on_each_numa_node() to only have one step, or parallelize
+            cache[numanodeid].get()->copyfrom (data);    // copy it all over
+        });
+    }
+
+    // post-process all clones
+    // 'numanodeid' is ideally the current NUMA node most of the time, but not required.
+    // 'postprocess' is called as postprocess (clone, step, steps) for every node and step.
+    template<typename POSTPROCFUNC>
+    void process (const POSTPROCFUNC & postprocess)
+    {
+        parallel_for_on_each_numa_node (true, [&] (size_t numanodeid, size_t step, size_t steps)
+        {
+            postprocess (*cache[numanodeid].get(), step, steps);
+        });
+    }
+
+    // a thread calls this to get the data pre-cloned for its optimal NUMA node
+    // (only works for memory allocated through msra::numa::malloc())
+    const CACHEDTYPE & getclone() const { return *getcloneptr(); }
+    CACHEDTYPE & getclone()             { return *getcloneptr(); }
+};
+#endif
+};};
--- a/DataReader/Kaldi2Reader/pplhelpers.h
+++ b/DataReader/Kaldi2Reader/pplhelpers.h
@ -0,0 +1,99 @@
+//
+// <copyright file="pplhelpers.h" company="Microsoft">
+//     Copyright (c) Microsoft Corporation.  All rights reserved.
+// </copyright>
+//
+// pplhelpers.h -- some helpers for PPL library
+//
+
+#pragma once
+
+#ifndef __unix__
+#include <ppl.h>
+#endif
+namespace msra { namespace parallel {
+
+// ===========================================================================
+// helpers related to multiprocessing and NUMA
+// ===========================================================================
+
+// determine number of CPU cores on this machine
+// (the dwNumberOfProcessors value that GetSystemInfo() reports)
+static inline size_t determine_num_cores()
+{
+    SYSTEM_INFO info;
+    GetSystemInfo (&info);
+    const size_t numprocessors = info.dwNumberOfProcessors;
+    return numprocessors;
+}
+
+extern size_t ppl_cores;    // number of cores to run on as requested by user
+
+// set the number of cores to run on (stored in the global 'ppl_cores')
+// parallel_for() below only parallelizes if this value is > 1.
+static inline void set_cores (size_t cores)
+{
+    ppl_cores = cores;
+}
+
+// get the number of cores to run on (the current value of the global 'ppl_cores')
+static inline size_t get_cores()    // if returns 1 then no parallelization will be done
+{
+    return ppl_cores;
+}
+
+#if 0
+// (this helper is compiled out)
+// execute body() a bunch of times for hopefully each core
+// This is not precise. Cores will be hit multiple times, and some cores may not be touched.
+// With ppl_cores > 1, body() is called ppl_cores * 2 times through parallel_for(); otherwise exactly once.
+template <typename FUNCTION> void for_all_numa_nodes_approximately (const FUNCTION & body)
+{
+    if (ppl_cores > 1)  // parallel computation (regular)
+        parallel_for ((size_t) 0, ppl_cores * 2, (size_t) 1, [&](size_t) { body(); });
+    else            // for comparison: single-threaded (this also documents what the above means)
+        body();
+}
+#endif
+
+// wrapper around Concurrency::parallel_for() to allow disabling parallelization altogether
+// Calls f (j) for j = begin, begin + step, ... while j < end. If ppl_cores is 0 or 1 this is
+// a plain sequential loop on the calling thread; otherwise it is handed to Concurrency::parallel_for().
+template <typename FUNCTION> void parallel_for (size_t begin, size_t end, size_t step, const FUNCTION & f)
+{
+    const bool singlethreaded = ppl_cores <= 1;
+    if (singlethreaded)
+    {
+        // for comparison: single-threaded (this also documents what the parallel version means)
+        for (size_t j = begin; j < end; j += step)
+            f (j);
+        return;
+    }
+    // parallel computation (regular)
+    Concurrency::parallel_for (begin, end, step, f);
+}
+
+// execute a function 'body (j0, j1)' for j = [0..n) in chunks of ~targetstep in 'cores' cores
+// Very similar to parallel_for() except that body function also takes end index,
+// and the 'targetsteps' gets rounded a little to better map to 'cores.'
+// The chunk size actually used is a multiple of 'targetalignment'; the last chunk is clipped to 'n'.
+// 'targetstep', 'targetalignment' and ppl_cores are used as divisors below.
+// NOTE(review): presumably all three are always >= 1 -- confirm against the callers.
+// ... TODO: Currently, 'cores' does not limit the number of threads in parallel_for() (not so critical, fix later or never)
+template <typename FUNCTION> void foreach_index_block (size_t n, size_t targetstep, size_t targetalignment, const FUNCTION & body)
+{
+    const size_t cores = ppl_cores;
+    const size_t maxnfwd = 2 * targetstep;
+    size_t nblocks = (n + targetstep / 2) / targetstep;     // n / targetstep, rounded to nearest
+    if (nblocks == 0) nblocks = 1;
+    // round to a multiple of the number of cores
+    if (nblocks < cores)    // less than # cores -> round up
+        nblocks = (1+(nblocks-1)/cores) * cores;            // (evaluates to 'cores' since 1 <= nblocks < cores here)
+    else                    // more: round down (reduce overhead)
+        nblocks = nblocks / cores * cores;
+    size_t nfwd = 1 + (n - 1) / nblocks;                    // chunk size: n / nblocks, rounded up (for n > 0)
+    assert (nfwd * nblocks >= n);
+    if (nfwd > maxnfwd) nfwd = maxnfwd; // limit to allocated memory just in case
+    // ... TODO: does the above actually do anything/significant? nfwd != targetstep?
+
+    // enforce alignment
+    // NOTE(review): this rounds up after the cap above, so nfwd can end up larger than maxnfwd
+    nfwd = (1 + (nfwd -1) / targetalignment) * targetalignment;
+
+    // execute it!
+    parallel_for (0, n, nfwd, [&](size_t j0)
+    {
+        size_t j1 = min (j0 + nfwd, n);     // clip the last chunk
+        body (j0, j1);
+    });
+}
+
+};};
--- a/DataReader/Kaldi2Reader/readaheadsource.h
+++ b/DataReader/Kaldi2Reader/readaheadsource.h
@ -0,0 +1,249 @@
+//
+// <copyright file="readaheadsource.h" company="Microsoft">
+//     Copyright (c) Microsoft Corporation.  All rights reserved.
+// </copyright>
+//
+// readaheadsource.h -- wrapper ('minibatchreadaheadsource') of a read-ahead thread that pre-rolls feature and lattice data
+//
+
+
+#pragma once
+
+#include "basetypes.h"
+#include "minibatchiterator.h"
+#include "latticearchive.h"
+#ifdef _WIN32
+#include "simplethread.h"
+#endif
+#include <deque>
+#include <stdexcept>
+
+namespace msra { namespace dbn {
+
+// ---------------------------------------------------------------------------
+// minibatchreadaheadsource -- read-ahead thread that pre-rolls feature and lattice data
+// Wraps another minibatchsource: a background thread calls source.getbatch() ahead of time and
+// queues the results in a FIFO; getbatch() on this object pops from that FIFO and waits if it is empty.
+// The FIFO and the epoch hand-over parameters are accessed under the CCritSec base (via CAutoLock);
+// the two signallingevents are used to wake up the other side after a state change.
+// ---------------------------------------------------------------------------
+class minibatchreadaheadsource : public minibatchsource/*the interface we implement*/,
+                                        noncopyable/*assignment operator needed somewhere*/,
+                                        CCritSec/*for multi-threaded access*/
+{
+    minibatchsource & source;       // the underlying source we read from
+    const size_t epochframes;       // epoch size
+    unique_ptr<msra::util::simplethread> thread;
+    int verbosity;
+    // the FIFO
+    struct batchdata // all arguments to/from getbatch
+    {
+        size_t globalts;            // time for which we get the data
+        // return values
+        msra::dbn::matrix feat;
+        std::vector<size_t> uids;
+        std::vector<const_array_ref<msra::lattices::lattice::htkmlfwordsequence::word>> transcripts;
+        std::vector<shared_ptr<const latticesource::latticepair>> lattices;
+        batchdata (size_t globalts) : globalts (globalts) { }
+    };
+    deque<batchdata> fifo;          // this is guarded by the CCritSec
+    size_t epoch;                   // which epoch we are in currently
+    // parameters for the thread proc (set by caller; taken over once newglobalts is set to non-SIZE_MAX (cleared back by thread))
+    volatile size_t newglobalts;        // reset request
+    volatile size_t currentepochreqframes;  // minibatch size for this epoch (taken from the first getbatch() call)
+    volatile size_t currentepochendframe;   // we cannot request beyond
+    // signalling
+    // 'callerchangedsignal' is flagged by the caller side (getbatch(), cancelthread()) and waited on by the thread;
+    // 'threadchangedsignal' is flagged by the thread and waited on by getbatch()
+    mutable msra::util::signallingevent callerchangedsignal, threadchangedsignal;
+    void waitcallerchanged() const { callerchangedsignal.wait(); }
+    void flagcallerchanged() const { callerchangedsignal.flag(); }
+    void waitthreadchanged() const { threadchangedsignal.wait(); }
+    void flagthreadchanged() const { threadchangedsignal.flag(); }
+    // the thread proc
+    volatile bool terminaterequest; // threadproc must respond to this
+    // NOTE(review): threadproc() declares a local 'globalts' and getbatch() has a parameter of that name,
+    // so within this class this member is only ever set by the constructor (to SIZE_MAX)
+    size_t globalts;                // read cursor, owned by thread only
+    // Body of the read-ahead thread. Loops until 'terminaterequest' is set:
+    //  - takes over a new epoch position from the caller if one was posted (newglobalts != SIZE_MAX)
+    //  - waits for the caller while the FIFO still has data or the epoch end is reached
+    //  - otherwise pulls batches from 'source' into the FIFO until one of them reports a disk read,
+    //    the epoch end is reached, or 360000 frames were pre-read
+    // Exceptions are recorded through thread->fail(), signalled to the caller, and rethrown.
+    void threadproc()
+    {
+        // note on signaling:
+        // This thread will always flag 'threadchangedsignal' if there is a state change,
+        // e.g. a new batch is available, or we have successfully initialized.
+        // The main ('caller') thread would check whether it finds a state it can make use of, and if not,
+        // it will wait for the 'threadchangedsignal' and then check again the state etc.
+        fprintf (stderr, "minibatchreadaheadsource: read-ahead thread entered\n");
+        try
+        {
+            size_t epochreqframes = 0;  // minibatch size for this epoch (taken from the first getbatch() call)
+            size_t epochendframe = 0;   // we cannot request beyond
+            // note: this local shadows the member 'globalts'; it is the thread's read cursor
+            size_t globalts = 0;        // reset request
+            while (!terminaterequest)
+            {
+                bool stillhasdata;
+                {
+                    CAutoLock lock (*this);
+                    // if reset request then do it
+                    if (newglobalts != SIZE_MAX)
+                    {
+                        // take over parameters from caller
+                        globalts = newglobalts;
+                        epochreqframes = currentepochreqframes;
+                        epochendframe = currentepochendframe;
+                        newglobalts = SIZE_MAX;     // remember we got it
+                        // reset the FIFO
+                        fifo.clear();
+                        flagthreadchanged();        // signal state change (needed?)
+                        fprintf (stderr, "minibatchreadaheadsource: thread entered new epoch, frame pos reset to %d\n", (int) globalts);
+                        continue;
+                    }
+                    // did we run out of data to give to the caller?
+                    stillhasdata = !fifo.empty();
+                }
+                // we kick in once the FIFO is empty (and only once we know the mbsize)
+                // Note that the underlying source will be able to fulfill many more minibatches at no cost
+                // since we stopped pulling minibatches from it once it told us it read something from the disk.
+                // Thus it is OK (efficient) to run the FIFO empty before we continue asking the underlying source
+                // for more data--it will give us quite some more data for free--which the caller can go and process--
+                // before an expensive read operation is needed again.
+                if (globalts >= epochendframe || stillhasdata)
+                {
+                    waitcallerchanged();    // nothing to do: wait for caller state change and check again
+                    continue;
+                }
+                // we will bring in data from the current 'globalts' until the sub-getbatch() tells us
+                // that we loaded new data (which means subsequent getbatch() will be free until the next load).
+                // We assume the access pattern that
+                //  - we start at or closely after the epoch boundary
+                //  - we never go across an epoch boundary
+                //  - the number of requested frames within an epoch is always the same except for the last MB
+                // This pattern is implemented by the minibatchiterator. We require it.
+                // (but it is possible that less is returned, i.e. at a sweep boundary or epoch end).
+                bool readfromdisk = false;
+                // we stop once data was read (the subsequent fetches will be cheap until the next data read)
+                // For small setups, all data may be in RAM and thus no reading will happen anymore.
+                // To guard against that, we limit the number of frames we pre-read.
+                fprintf (stderr, "minibatchreadaheadsource: thread entering reading loop, frame read pos %d\n", (int) globalts);
+                size_t batchesread = 0;
+                const size_t prerollendframe = globalts + 360000;    // read max. 1 hour --to guard against setups that fit to RAM entirely (no disk reading after startup)
+                while (!terminaterequest && !readfromdisk && globalts < epochendframe && globalts < prerollendframe)
+                {
+                    // get batch and append to FIFO (outside the lock)
+                    batchdata batch (globalts);
+                    const size_t requestedframes = min (epochreqframes, epochendframe - globalts);    // we must not request beyond the epoch
+                    readfromdisk = source.getbatch (globalts, requestedframes, batch.feat, batch.uids, batch.transcripts, batch.lattices);
+                    batchesread++;
+                    // Note: We may still get data beyond the end of the epoch, in utterance mode, since the epoch boundary likely falls within an utterance.
+                    // from here to the end of this iteration we hold the lock
+                    CAutoLock lock (*this);
+                    // the new batch must start where the last queued batch ended
+                    if (!fifo.empty() && globalts != fifo.back().globalts + fifo.back().feat.cols())
+                        throw std::logic_error ("minibatchreadaheadsource: FIFO got out of order while pre-reading new batch");
+                    if (newglobalts != SIZE_MAX)
+                        throw std::logic_error ("minibatchreadaheadsource: main thread reset to new epoch while current epoch not yet finished");
+                    globalts += batch.feat.cols();  // advance the cursor by the number of columns (frames) actually returned
+                    fifo.push_back (std::move (batch));
+                    flagthreadchanged();        // signal state change so caller can pick up the new batch
+                }
+                fprintf (stderr, "minibatchreadaheadsource: thread exited reading loop, %d batches read up to frame position %d-1\n", (int) batchesread, (int) globalts);
+            }
+            fprintf (stderr, "minibatchreadaheadsource: reading loop was terminated at frame position %d-1\n", (int) globalts);
+        }
+        catch (const exception & e)
+        {
+            fprintf (stderr, "minibatchreadaheadsource: exception caught in read-ahead thread: %s\n", e.what());
+            thread->fail (e);       // set the error first before we signal the caller
+            flagthreadchanged();
+            throw;                  // (this will set the error a second time; OK)
+        }
+        fprintf (stderr, "minibatchreadaheadsource: read-ahead thread exited normally\n");
+    }
+    // Requests termination, wakes the thread in case it is waiting for the caller, and waits for it.
+    void cancelthread() // this is only ever called by the destructor
+    {
+        fprintf (stderr, "minibatchreadaheadsource: requesting thread termination\n");
+        terminaterequest = true;
+        flagcallerchanged();
+        thread->wait();
+    }
+public:
+    // 'source' is the underlying source to read from (kept by reference); 'epochframes' is the epoch size,
+    // which getbatch() uses to map a 'globalts' to an epoch. The read-ahead thread is started right away.
+    // note: members are initialized in their declaration order, not in the order listed below
+    minibatchreadaheadsource (minibatchsource & source, size_t epochframes)
+      : source (source), epochframes (epochframes),
+        terminaterequest (false), globalts (SIZE_MAX),
+        epoch (SIZE_MAX), currentepochreqframes (0), currentepochendframe (0), newglobalts (SIZE_MAX), verbosity(2)
+    {
+        // kick off the thread
+        fprintf (stderr, "minibatchreadaheadsource: kicking off read-ahead thread\n");
+        thread.reset (new msra::util::simplethread ([this] () { threadproc(); }));
+    }
+    // stops the read-ahead thread (see cancelthread())
+    ~minibatchreadaheadsource()
+    {
+        fprintf (stderr, "~minibatchreadaheadsource: destructing read-ahead thread\n");
+        cancelthread();
+    }
+    void setverbosity(int newverbosity){ verbosity = newverbosity; }
+    // Returns the batch that starts at 'globalts' from the FIFO, waiting for the thread if none is queued yet.
+    // On the first call for a new epoch (globalts / epochframes differs from the remembered epoch), the
+    // thread is told to restart at 'globalts' with 'framesrequested' as the minibatch size for the epoch.
+    // Returns true if we had to wait for the thread at least once, false if the batch was already queued.
+    // Throws std::logic_error if the FIFO is not empty at an epoch change, if the minibatch size changes
+    // mid-epoch (except near the epoch end), or if the front batch does not start at 'globalts'.
+    // NOTE(review): thread->check() presumably rethrows an error recorded through thread->fail() -- confirm in simplethread.h.
+    bool getbatch (const size_t globalts,
+                   const size_t framesrequested, msra::dbn::matrix & feat, std::vector<size_t> & uids,
+                   std::vector<const_array_ref<msra::lattices::lattice::htkmlfwordsequence::word>> & transcripts,
+                   std::vector<shared_ptr<const latticesource::latticepair>> & lattices)
+    {
+#if 1
+        // first check whether the thread is still alive
+        thread->check();
+        // in case of epoch change, we signal the thread
+        size_t thisepoch = globalts / epochframes;
+        if (thisepoch != epoch)
+        {
+            fprintf (stderr, "minibatchreadaheadsource: signalling thread to enter new epoch\n");
+            epoch = thisepoch;                      // remember for next check --we have officially changed epochs
+            CAutoLock lock (*this);
+            if (!fifo.empty())
+                throw std::logic_error ("getbatch: FIFO not cleared at end of epoch");
+            newglobalts = globalts;
+            currentepochreqframes = framesrequested;    // it is assumed that these won't change
+            currentepochendframe = (epoch + 1) * epochframes;
+            flagcallerchanged();
+        }
+        else if (globalts + framesrequested < currentepochendframe && currentepochreqframes != framesrequested)
+            throw std::logic_error ("getbatch: cannot change minibatch size mid-epoch");
+        // loop
+        bool readfromdisk = false;
+        for(;;) // wait for batch to appear
+        {
+            thread->check();
+            {
+                CAutoLock lock (*this);
+                if (!fifo.empty())
+                {
+                    // get the first batch from the FIFO
+                    batchdata front = std::move (fifo.front());
+                    fifo.pop_front();
+                    flagcallerchanged();    // wake the thread: it refills only once the FIFO has run empty
+                    // it must be the correct one
+                    if (front.globalts != globalts)
+                        throw std::logic_error ("getbatch: data in FIFO out of sequence");
+                    // return it
+                    feat = std::move (front.feat);
+                    uids = std::move (front.uids);
+                    transcripts = std::move (front.transcripts);
+                    lattices = std::move (front.lattices);
+                    return readfromdisk;
+                }
+            }
+            // batch not there --keep looping
+            waitthreadchanged();
+            readfromdisk = true;    // we had to wait --use to indicate that we needed to read data (does not really matter...)
+        }
+#else
+        // (disabled) bypass the read-ahead thread and read directly from the underlying source
+        return source.getbatch (globalts, framesrequested, feat, uids, transcripts, lattices);
+#endif
+    }
+    // Overload with vectors of feature matrices and uid vectors: resizes both to one entry
+    // and forwards to the getbatch() above with feat[0] and uids[0].
+    bool getbatch (const size_t globalts,
+                const size_t framesrequested, std::vector<msra::dbn::matrix> & feat, std::vector<std::vector<size_t>> & uids,
+                std::vector<const_array_ref<msra::lattices::lattice::htkmlfwordsequence::word>> & transcripts,
+                std::vector<shared_ptr<const latticesource::latticepair>> & lattices) 
+    {
+
+        feat.resize(1);
+        uids.resize(1);
+        //transcripts.resize(1);
+        //lattices.resize(1);
+        return getbatch(globalts, framesrequested, feat[0], uids[0], transcripts, lattices);
+    }
+
+    // the remaining members forward to the underlying source, except epochsize() which returns our own epoch size
+    size_t totalframes() const { return source.totalframes(); }
+    // (note: epochsize() and gettimegetbatch() are two separate members that share the next source line)
+    size_t epochsize() const {return epochframes;}double gettimegetbatch() { return source.gettimegetbatch(); }   // TODO: no, use our own time measurement
+    size_t firstvalidglobalts (const size_t globalts) { return source.firstvalidglobalts (globalts); }
+    const std::vector<size_t> & unitcounts() const { return source.unitcounts(); }
+};
+
+};};
--- a/DataReader/Kaldi2Reader/rollingwindowsource.h
+++ b/DataReader/Kaldi2Reader/rollingwindowsource.h
@ -0,0 +1,576 @@
+//
+// <copyright file="rollingwindowsource.h" company="Microsoft">
+//     Copyright (c) Microsoft Corporation.  All rights reserved.
+// </copyright>
+//
+// rollingwindowsource.h -- implementation of a rolling-window minibatch source ('minibatchframesource') with a disk page file
+//
+
+#pragma once
+
+#include "basetypes.h"                  // for attempt()
+//#include "numahelpers.h"                // for NUMA allocation
+#include "minibatchsourcehelpers.h"
+#include "minibatchiterator.h"
+#include "biggrowablevectors.h"
+#include "ssematrix.h"
+
+namespace msra { namespace dbn {
+
+    // ---------------------------------------------------------------------------
+    // biggrowablevectorarray -- a big array of vectors for features, growable (push_back)
+    // Data was meant to be striped across NUMA nodes, as to not clog them up (the NUMA calls in newblock() are currently commented out).
+    // This also supports paging to disk, which is used for the old minibatchframesource.
+    // ---------------------------------------------------------------------------
+    // Frames are stored column-wise in fixed-size blocks (one matrix of 'm' rows by
+    // 'elementsperblock' columns per block). The object is used in two phases:
+    //  - population: push_back() appends frames; when paging is enabled, every
+    //    completed block is written to the page file and freed again
+    //  - consumption: after no_more_push_back(), require() pages the needed range
+    //    of blocks in, and operator[] gives access to frames inside that range
+    // 'blocks', 'n', 'elementsperblock', getblock() and getblockt() come from the
+    // base class growablevectorbase, which is not defined in this file.
+    class biggrowablevectorarray : public growablevectorbase<msra::dbn::matrix>
+    {
+        size_t m;           // dim (taken from the size of the vectors passed to push_back())
+
+        size_t inmembegin;  // range we have in memory, rounded to enclosing blocks (not rounded at end)
+        size_t inmemend;
+
+        wstring pagepath;   // path for paging, empty if no paging
+        auto_file_ptr f;    // file handle for paging
+        bool reading;       // have we begun reading?
+
+        // allocate a block
+        // The caller takes ownership of the returned matrix.
+        msra::dbn::matrix * newblock() const
+        {
+            // we stripe the data across NUMA nodes as to not fill up one node with the feature data
+            //msra::numa::overridenode ((int) msra::numa::getmostspaciousnumanode());
+            msra::dbn::matrix * res = new msra::dbn::matrix (m, elementsperblock);
+            //msra::numa::overridenode (-1);  // note: we really should reset it also in case of failure
+            return res;
+        }
+
+        // handling of page file
+        bool paging() const { return !pagepath.empty(); }
+        // (re-)open the page file, for reading if 'wantread', else for writing; no-op if not paging
+        void openpagefile (bool wantread)
+        {
+            if (!paging()) return;
+            msra::files::make_intermediate_dirs (pagepath);
+
+            if (!wantread)
+            {
+                // trial open for writing; a failure here is ignored, fopenOrDie() below reports it
+                FILE *ftry = NULL;
+                wstring pathname (pagepath);
+                ftry = _wfopen (pathname.c_str(), L"wbS");
+                if (ftry) fclose (ftry);
+            }
+
+            /* 
+                code below to cycle through a-z appended to file name is no longer necessary 
+                since caller guarantees unique file names via HTKMLFReader 
+                and we want the pagepath logged to the user to be the actual one used by the code
+
+            // try to open the pagepath from a to z
+            if (!wantread)
+            {
+                FILE *ftry = NULL;
+                char trynum = 'a';
+                while (!ftry && trynum <= 'z')
+                {
+                    wstring pathname (pagepath);
+                    pathname += trynum++;
+                    ftry = _wfopen (pathname.c_str(), L"wbS");
+                }
+                if (ftry) fclose (ftry);
+                pagepath += --trynum;
+            }
+            */
+            f = fopenOrDie (pagepath, wantread ? L"rbS" : L"wbS");
+            reading = wantread;
+        }
+        // write the most recently added block to the page file and free its memory
+        void flushlastblock()   // during population phase, must be called once per block in sequence
+        {
+            if (!paging()) return;
+            assert (!reading);
+            if (blocks.empty()) return;
+            const size_t blockid = blocks.size() -1;
+            msra::dbn::matrix & block = *blocks[blockid];
+            assert (fgetpos (f) == blockid * block.sizeinpagefile());   // blocks are written back to back
+            block.topagefile (f);
+            blocks[blockid].reset();    // free the memory
+            assert (blockid * elementsperblock == inmembegin);
+            inmembegin = inmemend;      // empty range
+        }
+        // free the in-memory copy of the block that starts at frame t0
+        void releaseblock (size_t t0)   // t0=block start time
+        {
+            assert (paging() && reading);
+            size_t blockid = t0 / elementsperblock;
+            assert (blockid * elementsperblock == t0);
+            assert (blocks[blockid]);
+            // (the printed frame range is inclusive at both ends)
+            fprintf (stderr, "releaseblock: releasing feature block %zu [%zu..%zu]\n", blockid, t0, t0 + elementsperblock -1);
+            blocks[blockid].reset();    // free the memory
+        }
+        // allocate the block that starts at frame t0 and fill it from the page file
+        void recoverblock (size_t t0)   // t0=block start time
+        {
+            assert (paging() && reading);
+            size_t blockid = t0 / elementsperblock;
+            assert (blockid * elementsperblock == t0);
+            assert (!blocks[blockid]);
+            // (the printed frame range is inclusive at both ends)
+            fprintf (stderr, "recoverblock: recovering feature block %zu [%zu..%zu]\n", blockid, t0, t0 + elementsperblock -1);
+            blocks[blockid].reset (newblock());
+            msra::dbn::matrix & block = *blocks[blockid];
+            fsetpos (f, blockid * block.sizeinpagefile());  // block offset = block index * size of one block on disk
+            block.frompagefile (f);
+        }
+        
+    public:
+        // 'pagepath' is the path of the page file; pass an empty string to disable paging
+        biggrowablevectorarray (const wstring & pagepath)
+            : growablevectorbase (65536), m (0), 
+            inmembegin (0), inmemend (0), pagepath (pagepath), reading (false)
+        {
+            openpagefile (false);
+            if (paging())
+                fprintf (stderr, "biggrowablevectorarray: creating disk backup store at '%S'\n", pagepath.c_str());
+        }
+        ~biggrowablevectorarray() { // clean up the big temp file 
+            if (paging()) {
+                fclose (f); 
+                if (_wunlink (pagepath.c_str())==0)
+                    fprintf (stderr, "biggrowablevectorarray: deleted disk backup store at '%S'\n", pagepath.c_str());
+                else
+                    fprintf (stderr, "biggrowablevectorarray: unable to delete disk backup store at '%S'\n", pagepath.c_str());
+            }
+        }            
+        
+        size_t dim() const { return m; }    // dimension of a frame
+
+        // population phase
+        // Append one frame. All frames must have the same, non-zero dimension.
+        void push_back (const std::vector<float> & in)
+        {
+            assert (!in.empty());
+            assert (m == 0 || m == in.size());
+            m = in.size();
+            const size_t blockid = n / elementsperblock;
+            assert (blockid <= blocks.size());
+            if (blockid == blocks.size())   // a new block is needed
+            {
+                flushlastblock();           // (pages out the block just completed, if paging)
+                blocks.push_back (std::unique_ptr<msra::dbn::matrix> (newblock()));
+            }
+            const size_t blockn = n % elementsperblock;     // column inside the block
+            msra::dbn::matrix & block = *blocks[blockid].get();
+            foreach_index (k, in)
+                block(k,blockn) = in[k];
+            n++;
+            inmemend = n;
+        }
+        void no_more_push_back()    // done pushing --switch to consumption mode
+        {
+            if (!paging()) return;
+            // finish off last block
+            flushlastblock();
+            fflushOrDie (f);
+            fprintf (stderr, "biggrowablevectorarray: disk backup store created, %d frames, %zu bytes\n", (int) n, fgetpos (f));
+            fclose (f);
+            foreach_index (i, blocks) assert (!blocks[i]);   // ensure we flushed
+            assert (inmembegin == inmemend);    // nothing in cache
+            // switch to reading mode
+            openpagefile (true);
+        }
+
+        // access phase
+        // Make the frame range [bounds.first, bounds.second) available in memory: blocks
+        // currently in memory but outside the range are released, missing blocks are read
+        // from the page file. Asserts that paging is enabled.
+        // Returns 'true' if data was actually read from disk.
+        bool require (pair<size_t,size_t> bounds) // we require this range of frames
+        {
+            bool readfromdisk = false;
+
+            // get bounds rounded to block boundaries
+            const size_t ts = bounds.first / elementsperblock * elementsperblock;
+            const size_t te = min (n, (bounds.second + elementsperblock -1) / elementsperblock * elementsperblock);
+            assert (paging());
+            // free all the memory that is no longer wanted
+            for (size_t t = inmembegin; t < inmemend; t += elementsperblock)
+            {
+                if (t >= ts && t < te)  // if in wanted range then skip to end of it
+                    t = te - elementsperblock;
+                else
+                    releaseblock (t);
+            }
+            // page in all required blocks
+            for (size_t t = ts; t < te; t += elementsperblock)
+            {
+                if (t >= inmembegin && t < inmemend)  // if in memory already then skip to end of it
+                    t = inmemend - elementsperblock;
+                else
+                {
+                    recoverblock (t);
+                    readfromdisk = true;            // tell caller we did something expensive
+                }
+            }
+            // got it
+            inmembegin = ts;
+            inmemend = te;
+            return readfromdisk;
+        }
+        // Throws std::logic_error if frame t is outside the range currently held in memory.
+        const msra::dbn::matrixstripe operator[] (size_t t) const   // get a feature vector
+        {
+            if (t < inmembegin || t >= inmemend)
+                throw std::logic_error ("biggrowablevectorarray: attempt to access vector without requesting to page it in first");
+            const size_t blockt = getblockt (t);
+            /*const*/ msra::dbn::matrix & block = getblock (t);
+            return msra::dbn::matrixstripe (block, blockt, 1);
+        }
+        // path of the page file (empty if not paging)
+        wstring pagepathname() const { return pagepath; }
+        // Close and delete the page file (no-op if not paging).
+        // NOTE(review): the destructor performs the same close + unlink again; whether a
+        // second fclose (f) is safe depends on auto_file_ptr, which is not defined in this
+        // file -- confirm before calling this on an object that is destroyed normally.
+        void cleanuppagefile()
+        {
+            if (paging()) {
+                fclose (f); 
+                if (_wunlink (pagepath.c_str())==0){
+                    fprintf (stderr, "biggrowablevectorarray: deleted disk backup store at '%S'\n", pagepath.c_str());
+                }
+                else{
+                    fprintf (stderr, "biggrowablevectorarray: could NOT delete disk backup store at '%S'\n", pagepath.c_str());
+                }
+            }
+        }
+    };
+
+    // ---------------------------------------------------------------------------
+    // minibatchframesourcemulti -- feature source to provide randomized frames in minibatches
+    // this is derived from minibatchframesource but worked with multiple inputs and/or outputs
+    // by making "frames" and "classids" a vector of vectors
+    // ---------------------------------------------------------------------------
+    // All frames of all feature sets are read in the constructor (one
+    // biggrowablevectorarray per feature set, backed by its own page file);
+    // getbatch() then serves randomized frames from that store.
+    class minibatchframesourcemulti : public minibatchsource
+    {
+        std::vector<size_t> vdim;                       // feature dimension after augmenting neighhors (0: don't read features)
+        std::vector<size_t> leftcontext;                // number of frames to the left of the target frame in the context window
+        std::vector<size_t> rightcontext;               // number of frames to the right of the target frame in the context window
+        unsigned int sampperiod;            // (for reference and to check against model)
+        string featkind;
+        size_t featdim;
+        size_t maxvdim;                     // largest entry of vdim[] over all feature sets
+        // cache
+        //std::vector<biggrowablevectorarray> frames;
+        std::vector<unique_ptr<biggrowablevectorarray>> pframes;      // [t][i] all features concatenated
+        std::vector<char> boundaryflags;    // [t] -1 for first and +1 for last frame, 0 else (for augmentneighbors())
+        std::vector<std::vector<CLASSIDTYPE>> classids;  // [t] the state that the frame belongs to
+        size_t numframes;                   // total frames (==frames.size()==boundaryflags.size()==classids.size()) unless special modes vdim == 0 and/or no labels
+        msra::dbn::randomordering randomordering;  // [t] -> t'
+        double timegetbatch;
+        int verbosity;
+
+    public:
+        // constructor
+        // Pass empty labels to denote unsupervised training (so getbatch() will not return uids).
+        // featuresections, vdim, leftcontext, rightcontext and pagepath are indexed by feature set
+        // and need one entry per entry of infiles; udim is indexed by label set and needs one
+        // entry per entry of labels. A std::runtime_error is thrown if that does not hold, if no
+        // feature set is given, or if the data read is inconsistent (see the checks below).
+        minibatchframesourcemulti (std::vector<msra::asr::FeatureSection *> & featuresections, const std::vector<std::vector<wstring>> & infiles, const std::vector<map<std::wstring,std::vector<msra::asr::htkmlfentry>>> & labels,
+            std::vector<size_t> vdim, std::vector<size_t> udim, std::vector<size_t> leftcontext, std::vector<size_t> rightcontext, size_t randomizationrange, const std::vector<wstring> & pagepath, const bool mayhavenoframe=false, int addEnergy=0)
+            : vdim (vdim), leftcontext(leftcontext), rightcontext(rightcontext), sampperiod (0), featdim (0), maxvdim(0), numframes (0), timegetbatch (0), verbosity(2)
+        {
+            // validate the argument shapes first: everything below indexes these vectors
+            if (infiles.size()==0)
+                throw runtime_error("minibatchframesourcemulti: need at least one network input specified with features");
+            const size_t numinputs = infiles.size();
+            if (featuresections.size() < numinputs || vdim.size() < numinputs || leftcontext.size() < numinputs
+                || rightcontext.size() < numinputs || pagepath.size() < numinputs)
+                throw runtime_error ("minibatchframesourcemulti: featuresections, vdim, leftcontext, rightcontext and pagepath need one entry per feature set");
+            if (udim.size() < labels.size())
+                throw runtime_error ("minibatchframesourcemulti: udim needs one entry per label set");
+
+            if (vdim[0] == 0 && labels.empty())
+                throw runtime_error ("minibatchframesourcemulti: when running without features, labels are needed");
+            // at this stage, we simply page in the entire training set at once and work off RAM
+            // We will benefit from feature archives indirectly through htkfeatio.
+            // TODO:
+            //  - infiles must specify time range
+            //  - at this stage only reserve() (we know the time range; allocate second-layer structure)
+            //  - implement block-wise paging directly from HTK feature files through htkfeatreader
+            featkind.clear();
+            std::vector<float> frame;
+            std::vector<size_t>numclasses;              // number of units found (actually max id +1)
+            // number of entries missing in MLF (also counts duration mismatches)
+            // NOTE(review): this counter is not reset per feature set, so for m > 0 it still
+            // contains the files skipped for earlier sets while it is compared against
+            // infiles[m].size() below -- confirm that this is intended.
+            size_t notfound = 0;
+
+            std::vector<size_t>framesaccum;             // [k] accumulated frame count after the k-th accepted file of feature set 0
+
+            if (labels.size()==0)
+                fprintf(stderr,"no MLF label files detected\n");
+
+            // one frame store (with its own page file) per feature set
+            foreach_index (i, infiles)
+            {
+                pframes.push_back(unique_ptr<biggrowablevectorarray>(new biggrowablevectorarray(pagepath[i])));
+
+                if (vdim[i]>maxvdim)
+                    maxvdim=vdim[i];
+            }
+
+            // one (initially empty) class-id sequence and class counter per label set
+            classids.resize (labels.size());
+            numclasses.resize (labels.size(), 0);
+
+            fprintf (stderr, "minibatchframesourcemulti: reading %zu feature sets and %zu label sets...", infiles.size(),labels.size());
+
+            foreach_index (m, infiles)
+            {
+                featdim=0;
+                numframes=0;
+                featkind.clear();
+                msra::asr::htkfeatreader reader;    // feature reader
+                size_t numaccepted = 0;             // files of this set whose frames were appended so far
+
+                foreach_index (i, infiles[m]) // read each feature file in set m
+                {
+                    if (i % (infiles[m].size() / 100 + 1) == 0) { fprintf (stderr, "."); fflush (stderr); }
+                    msra::basetypes::matrix<float> feat;
+                    msra::asr::htkfeatreader::parsedpath ppath (infiles[m][i], featuresections[m]);
+
+                    // skip files for which labels don't exist (assuming bad alignment)
+                    wstring key;
+                    if (!labels.empty())
+                    {
+                        if (!labels[0].empty())    // empty means unsupervised mode (don't load any)
+                        {
+#ifdef _WIN32
+                            key = regex_replace ((wstring)ppath, wregex (L"\\.[^\\.\\\\/:]*$"), wstring());  // delete extension (or not if none)
+#endif
+#ifdef __unix__
+                            key = removeExtension(basename(ppath));
+#endif
+                            if (labels[0].find (key) == labels[0].end())
+                            {
+                                if (notfound < 5)
+                                    fprintf (stderr, "\nminibatchframesourcemulti: %d-th file not found in MLF label set: %S", i, key.c_str());
+                                notfound++;
+                                continue;   // skip this utterance at all
+                            }
+                        }
+                    }
+                    // get feature frames
+                    if (vdim[m] != 0)  // (vdim == special mode to not read features at all)
+                    {
+                        msra::util::attempt (5, [&]()
+                        {
+                            reader.readNoAlloc (ppath, featkind, sampperiod, feat);   // whole file read as columns of feature vectors
+                        });
+                        if (featdim == 0)   // first time
+                            featdim = feat.rows();
+                        else if (featdim != feat.rows())
+                            throw std::runtime_error ("minibatchframesourcemulti: inconsistent feature dimension across files");
+                        // HVite occasionally generates mismatching output --skip such files
+                        if (!key.empty())   // (we have a key if supervised mode)
+                        {
+                            const auto & labseq = labels[0].find (key)->second;    // (we already checked above that it exists)
+                            size_t labframes = labseq.empty() ? 0 : (labseq[labseq.size()-1].firstframe + labseq[labseq.size()-1].numframes);
+                            if (abs ((int) labframes - (int) feat.cols()) > 0)
+                            {
+                                fprintf (stderr, "\nminibatchframesourcemulti: %d-th file has small duration mismatch (%zu in label vs. %zu in feat file), skipping: %S", i, labframes, feat.cols(), key.c_str());
+                                notfound++;
+                                continue;   // skip this utterance at all
+                            }
+                        }
+                        // append to cache
+                        frame.resize (featdim);
+                        if (feat.cols() < 2)    // (2 frames needed for boundary markers)
+                            throw std::runtime_error ("minibatchframesourcemulti: utterances < 2 frames not supported");
+                        foreach_column (t, feat)
+                        {
+                            foreach_index (k, frame)
+                                frame[k] = feat(k,t);
+
+                            pframes[m]->push_back (frame);
+                            numframes++;
+                            if (m==0)
+                                boundaryflags.push_back ((t == 0) ? -1 : (t == feat.cols() -1) ? +1 : 0);
+                        }
+                        // all feature sets must accumulate identical frame counts file by file.
+                        // Skipped files add no entry, hence the table is indexed by the number of
+                        // accepted files and not by the file index i.
+                        if (m==0)
+                            framesaccum.push_back(numframes);
+                        else
+                            assert(numaccepted < framesaccum.size() && numframes == framesaccum[numaccepted]);
+                        numaccepted++;
+
+                        assert (numframes == pframes[m]->size());
+                    }
+                    if (m==0)
+                        assert (numframes == boundaryflags.size());
+
+                    if (m==0) // after we get the key for this file, read all labels (only done for first feature)
+                    { 
+                        if (!key.empty())
+                        {
+                            foreach_index (j, labels)
+                            {
+                                // only labels[0] was checked for this key above, so look it up safely here
+                                const auto labiter = labels[j].find (key);
+                                if (labiter == labels[j].end())
+                                    throw std::runtime_error (msra::strfun::strprintf ("minibatchframesourcemulti: file not found in MLF label set %d: %S", j, key.c_str()));
+                                const auto & labseq = labiter->second;
+                                foreach_index (s, labseq)
+                                {
+                                    const auto & e = labseq[s];
+                                    // segments must be contiguous and start at frame 0
+                                    if ((s > 0 && labseq[s-1].firstframe + labseq[s-1].numframes != e.firstframe) || (s == 0 && e.firstframe != 0))
+                                        throw std::runtime_error (msra::strfun::strprintf ("minibatchframesourcemulti: labels not in consecutive order MLF in label set: %S", key.c_str()));
+                                    for (size_t t = e.firstframe; t < e.firstframe + e.numframes; t++)
+                                    {
+                                        if (e.classid >= udim[j])
+                                            throw std::runtime_error (msra::strfun::strprintf ("minibatchframesourcemulti: class id exceeds model dimension in file %S", key.c_str()));
+                                        if (e.classid != (CLASSIDTYPE) e.classid)
+                                            throw std::runtime_error ("CLASSIDTYPE has too few bits");
+                                        classids[j].push_back ((CLASSIDTYPE) e.classid);
+                                        // (cast to size_t so that both arguments of max() have the same type on every platform)
+                                        numclasses[j] = max (numclasses[j], (size_t)(1u + e.classid));
+                                    }
+                                }
+                                if (vdim[m] == 0)
+                                    numframes = classids[j].size();
+                                if (numframes != classids[j].size())   // TODO: remove this once we are confident
+                                    throw std::runtime_error (msra::strfun::strprintf ("minibatchframesourcemulti: label duration inconsistent with feature file in MLF label set: %S", key.c_str()));
+                                assert (numframes == classids[j].size());
+
+                            }
+                        }
+                        else
+                        {
+                            assert(classids.empty());
+                        }
+
+                    }
+
+                }
+
+                assert (vdim[m] == 0 || numframes == pframes[m]->size());
+
+                foreach_index(j, labels)
+                    assert (labels[j].empty() || numframes == classids[j].size());
+
+                if (vdim[m] != 0 && numframes != pframes[m]->size()) // || (!labels.empty() && numframes != classids.size()))
+                    throw std::runtime_error ("\nminibatchframesource: numframes variable screwup");
+                if (m==0)
+                {
+                    foreach_index (j, numclasses)
+                        fprintf (stderr, "\nminibatchframesourcemulti: read label set %d: %zu classes\n", j, numclasses[j]);
+                }
+                fprintf (stderr, "\nminibatchframesourcemulti: feature set %d: %zu frames read from %zu utterances\n", m, pframes[m]->size(), infiles[m].size());
+                if (notfound > 0)
+                {
+                    fprintf (stderr, "minibatchframesourcemulti: %zu files out of %zu not found in label set\n", notfound, infiles[m].size());
+                    if (notfound > infiles[m].size() / 2)
+                        throw std::runtime_error ("minibatchframesourcemulti: too many files not found in label set--assuming broken configuration\n");
+                }
+                // notify frames source to switch from population to consumption mode
+                pframes[m]->no_more_push_back();
+
+            }
+
+            if (numframes == 0 && !mayhavenoframe)
+                throw std::runtime_error ("minibatchframesource: no input features given!");
+
+            // initialize randomizer
+            if (numframes > 0) 
+                randomordering.resize (numframes, randomizationrange);
+
+        }
+        virtual ~minibatchframesourcemulti() {}
+        // total number of frames in one sweep
+        size_t totalframes() const { 
+            assert (maxvdim == 0 || numframes == pframes[0]->size()); assert (!issupervised() || numframes == classids[0].size()); return numframes; }
+
+        // 'true' if at least one label set was passed to the constructor
+        bool issupervised() const { return !classids.empty(); }
+
+        void setverbosity(int newverbosity) { verbosity = newverbosity; }
+
+        // retrieve one minibatch
+        // Minibatches are deterministic pseudo-random samples. The entire corpus
+        // is repeated infinitely, but each repetition (a 'sweep') is randomized
+        // differently.
+        // This function allows to retrieve a mini-batch starting from any frame
+        // within this infinitely extended repetition. To the end, mini-batches are
+        // specified by start frame and #frames.
+        // This function returns the same data independent on #frames, i.e. the concept
+        // of the mini-batch is not defined in here, but on the caller side. The caller
+        // can retrieve the frames of a mini-batch in chunks that do not match the
+        // caller's definition of "mini-batch," e.g. bigger or smaller chunks.
+        // If a requested mini-batch spans a sweep boundary, then this function will
+        // not return samples after the sweep boundary. Instead, the returned frame
+        // set is shortened to not exceed the end of the sweep. The caller must make
+        // a separate second call to get the rest. In trainlayer(), the one
+        // sweep-boundary-spanning mini-batch will simply be shortened.
+        // This function is NOT thread-safe (due to caching of random sequence).
+        // On return, feat[i] holds the frames of feature set i and uids[j] the class ids
+        // of label set j (uids is empty in unsupervised mode); transcripts and
+        // latticepairs are always returned empty.
+        // Returns 'true' only if every feature set had to read data from its page file.
+        bool getbatch (const size_t globalts, const size_t framesrequested, std::vector<msra::dbn::matrix> & feat, std::vector<std::vector<size_t>> & uids,
+            std::vector<const_array_ref<msra::lattices::lattice::htkmlfwordsequence::word>> & transcripts, 
+            std::vector<shared_ptr<const latticesource::latticepair>> & latticepairs)
+        {
+            auto_timer timergetbatch;
+            size_t nreadfromdisk=0;     // number of feature sets that had to page in data
+            transcripts.clear();    // word-level transcripts not supported by frame source (aimed at MMI)
+            latticepairs.clear();   // neither are lattices
+
+            assert (totalframes() > 0);
+            const size_t sweep = globalts / totalframes();  // which sweep (this determines randomization)
+            const size_t ts = globalts % totalframes();     // start frame within the sweep
+            const size_t te = min (ts + framesrequested, totalframes());    // do not go beyond sweep boundary
+            assert (te > ts);
+            if (verbosity >= 2)
+                fprintf (stderr, "getbatch: frames [%zu..%zu] in sweep %zu\n", ts, te-1, sweep);
+
+            // get random sequence (each time index occurs exactly once)
+            // If the sweep changes, this will re-cache the sequence. We optimize for rare, monotonous sweep changes.
+            const auto & tmap = randomordering (sweep);
+
+            feat.resize(pframes.size());
+            // one uid vector per label set; no label sets (unsupervised training) -> empty uids
+            uids.resize(classids.size());
+            foreach_index(j, uids)
+                uids[j].resize (te - ts);
+
+            foreach_index(i, feat)
+            {
+                size_t leftextent, rightextent;
+                // context window: derived from the dimensions unless given explicitly
+                if (leftcontext[i] == 0 && rightcontext[i] == 0)
+                {
+                    leftextent = rightextent = augmentationextent(pframes[i]->dim(), vdim[i]);
+                }
+                else
+                {
+                    leftextent = leftcontext[i];
+                    rightextent = rightcontext[i];
+                }
+                // page in the needed range of frames
+                const bool pagedin = pframes[i]->require (randomordering.bounds (max (ts, leftextent) - leftextent, te + 1 + rightextent));
+                // generate features and uids
+                feat[i].resize (vdim[i], te - ts);    // note: special mode vdim == 0 means no features to be loaded
+
+                for (size_t t = ts; t < te; t++)
+                {
+                    size_t trand = tmap[t];     // the random-sequence sample point for this point in time
+                    if (vdim[i] != 0)
+                    {
+                        auto v_t = feat[i].col(t-ts); // the vector to fill in
+                        augmentneighbors (*pframes[i], boundaryflags, trand, leftextent, rightextent, v_t);
+                    }
+                    if (i==0 && issupervised()) // read labels for all outputs on first pass thru features. this guarantees they will be read if only one feature set but > 1 label set
+                    {
+                        foreach_index(j, uids)
+                            uids[j][t-ts] = classids[j][trand];
+                    }
+                }
+                timegetbatch = timergetbatch;
+                if (pagedin)
+                    nreadfromdisk++;
+            }
+
+            return nreadfromdisk == feat.size();
+        }
+
+        // Single-input/single-output overload: not supported by this class, always throws.
+        bool getbatch (const size_t /*globalts*/, const size_t /*framesrequested*/, msra::dbn::matrix & /*feat*/, std::vector<size_t> & /*uids*/,
+            std::vector<const_array_ref<msra::lattices::lattice::htkmlfwordsequence::word>> & /*transcripts*/, 
+            std::vector<shared_ptr<const latticesource::latticepair>> & /*latticepairs*/)
+        {
+            // should never get here
+            throw runtime_error("minibatchframesourcemulti: getbatch() being called for single input feature and single output feature, should use minibatchframesource instead\n");
+        }
+
+        // value of the timer taken during the most recent multi-stream getbatch() call
+        double gettimegetbatch () { return timegetbatch;}
+
+        // return first valid globalts to ask getbatch() for
+        // In frame mode, there is no constraint, i.e. it is 'globalts' itself.
+        /*implement*/ size_t firstvalidglobalts (const size_t globalts) { return globalts; }
+
+        /*implement*/ const std::vector<size_t> & unitcounts() const { throw logic_error ("unitcounts: not implemented for this feature source"); }
+
+    };
+};};
--- a/DataReader/Kaldi2Reader/simple_checked_arrays.h
+++ b/DataReader/Kaldi2Reader/simple_checked_arrays.h
@ -0,0 +1,89 @@
+//
+// <copyright file="simple_checked_arrays.h" company="Microsoft">
+//     Copyright (c) Microsoft Corporation.  All rights reserved.
+// </copyright>
+//
+// simple_checked_arrays.h -- a simple wrapper around pointers used as arrays to allow bounds checking
+//
+
+#pragma once
+
+#include <stddef.h>     // for size_t
+#include <assert.h>
+
+// ---------------------------------------------------------------------------
+// array_ref -- wraps a C pointer to an array together with its size.
+//
+// Called _ref because this is a reference to the array rather than the array
+// itself (since it wraps a pointer). No need to pass an array_ref by reference.
+//
+// operator[] checks index bounds in Debug builds. size() is provided such
+// that this class can be substituted for STL vector in many cases.
+// ---------------------------------------------------------------------------
+
+// Non-owning view of 'n' elements starting at 'data'. The object does not manage
+// the memory it points to; copying it copies the pointer and the size only.
+template<class _T> class array_ref
+{
+    _T * data;
+    size_t n;
+    // bounds check (Debug builds only); the cast keeps 'i' referenced when assert() compiles to nothing
+    inline void check_index (size_t i) const { (void) i; assert (i < n); }
+    // consistency check (Debug builds only): a non-empty view needs a non-NULL pointer
+    inline void check_ptr() const { assert (n == 0 || data != NULL); }
+public:
+    // view of 'size' elements starting at 'ptr'
+    inline array_ref (_T * ptr, size_t size) throw() : data (ptr), n (size) { }
+    inline array_ref() throw() : data (NULL), n (0) { }   // in case we have a vector of this
+    inline       _T & operator[] (size_t i)       throw() { check_index (i); return data[i]; }
+    inline const _T & operator[] (size_t i) const throw() { check_index (i); return data[i]; }
+    inline size_t size() const throw() { return n; }
+    inline _T * begin() { return data; }
+    inline _T * end() { return data + n; }
+    // read-only iteration, so that a const array_ref can be traversed as well (mirrors the const operator[])
+    inline const _T * begin() const { return data; }
+    inline const _T * end() const { return data + n; }
+    inline void resize (size_t sz) { (void) sz; assert (n == sz); }    // allow compatibility with some functions; cannot actually change the size
+    // construct from other vector types (anything with size() and operator[]); an empty vector gives a NULL view
+    template<class _V> inline array_ref (_V & v) : data (v.size() > 0 ? &v[0] : NULL), n ((size_t) v.size()) { }
+};
+
+
+// ---------------------------------------------------------------------------
+// const_array_ref -- same as array_ref for 'const' (read-only) pointers
+// ---------------------------------------------------------------------------
+
+// Non-owning read-only view of 'n' elements starting at 'data'. The object does not
+// manage the memory it points to; copying it copies the pointer and the size only.
+template<class _T> class const_array_ref
+{
+    const _T * data;
+    size_t n;
+    // bounds check (Debug builds only); the cast keeps 'i' referenced when assert() compiles to nothing
+    inline void check_index (size_t i) const { (void) i; assert (i < n); }
+    // consistency check (Debug builds only): a non-empty view needs a non-NULL pointer
+    inline void check_ptr() const { assert (n == 0 || data != NULL); }
+public:
+    // view of 'size' elements starting at 'ptr'
+    inline const_array_ref (const _T * ptr, size_t size) throw() : data (ptr), n (size) { }
+    inline const_array_ref() throw() : data (NULL), n (0) { }   // in case we have a vector of this
+    inline const _T & operator[] (size_t i) const throw() { check_index (i); return data[i]; }
+    inline size_t size() const throw() { return n; }
+    // iteration never modifies the view, hence const (usable on const objects and references)
+    inline const _T * begin() const { return data; }
+    inline const _T * end() const { return data + n; }
+    // first and last element; both require a non-empty view (checked in Debug builds)
+    inline const _T & front() const throw() { check_index (0); return data[0];}
+    inline const _T & back() const throw() {check_index (0); return data[n-1];}
+    // construct from other vector types (anything with size() and operator[]); an empty vector gives a NULL view
+    template<class _V> inline const_array_ref (const _V & v) : data (v.size() > 0 ? &v[0] : NULL), n ((size_t) v.size()) { }
+};
+
+// ---------------------------------------------------------------------------
+// hardcoded_array -- wraps a fixed-size C array together with its size.
+//
+// operator[] checks index bounds in Debug builds. size() is provided such
+// that this class can be substituted for STL vector in many cases.
+// Can be constructed with a size parameter--it will be checked against the
+// hard-coded size.
+// Can also be constructed with an initialization parameter (typ. 0).
+// ---------------------------------------------------------------------------
+
+template<class _T, int _N> class hardcoded_array
+{
+    _T data[_N];
+
+    // Debug-only check that 'pos' addresses one of the _N elements
+    inline void verify_index (size_t pos) const
+    {
+        (void) pos;
+        assert (pos < _N);
+    }
+    // Debug-only check that a requested size equals the hard-coded size
+    inline void verify_size (size_t requested) const
+    {
+        (void) requested;
+        assert (requested == _N);
+    }
+public:
+    // elements are left default-initialized
+    inline hardcoded_array() throw()
+    {
+    }
+    // takes a size for vector compatibility; the size is only checked, not used
+    inline hardcoded_array (size_t n) throw()
+    {
+        verify_size (n);
+    }
+    // checks the size, then sets the first 'n' elements to 'val'
+    inline hardcoded_array (size_t n, const _T & val) throw()
+    {
+        verify_size (n);
+        size_t pos = 0;
+        while (pos < n)
+        {
+            data[pos] = val;
+            ++pos;
+        }
+    }
+    // element access, bounds-checked in Debug builds
+    inline _T & operator[] (size_t i) throw()
+    {
+        verify_index (i);
+        return data[i];
+    }
+    inline const _T & operator[] (size_t i) const throw()
+    {
+        verify_index (i);
+        return data[i];
+    }
+    // always the hard-coded size
+    inline size_t size() const throw()
+    {
+        return _N;
+    }
+};
--- a/DataReader/Kaldi2Reader/simplesenonehmm.h
+++ b/DataReader/Kaldi2Reader/simplesenonehmm.h
@ -0,0 +1,241 @@
+//
+// <copyright file="simplesenonehmm.h" company="Microsoft">
+//     Copyright (c) Microsoft Corporation.  All rights reserved.
+// </copyright>
+//
+// simplesenonehmm.h -- simple senone-based CD-HMM model definition
+//
+
+#pragma once
+
+#include "basetypes.h"
+#include "fileutil.h"
+#include <vector>
+#include <string>
+#include <unordered_map>
+#include <algorithm>    // for find()
+#include "simple_checked_arrays.h"
+
+namespace msra { namespace asr {
+
+// ===========================================================================
+// simplesenonehmm -- simple senone-based CD-HMM
+// ===========================================================================
+
+class simplesenonehmm
+{
+public: // (TODO: better encapsulation)
+    static const size_t MAXSTATES = 3;              // we use a fixed memory allocation since it's almost always 3 anyway
+    struct transP;
+    struct hmm
+    {
+        const char * name;                              // (this points into the key in the hash table to save memory)
+        struct transP * transP;                         // underlying transition matrix
+        unsigned char transPindex;                      // index of transP in struct transP
+        unsigned char numstates;                        // number of states
+        unsigned short senoneids[MAXSTATES];            // [0..numstates-1] senone indices
+
+        const char * getname() const { return name; }   // (should be used for diagnostics only)
+        size_t getsenoneid (size_t i) const { if (i < numstates) return (size_t) senoneids[i]; throw std::logic_error ("getsenoneid: out of bounds access"); }
+        size_t getnumstates() const { return (size_t) numstates; }
+        unsigned char gettransPindex() const { return transPindex;}
+        const struct transP & gettransP() const { return *transP; }
+
+        bool operator< (const hmm & other) const    // raw byte-wise ordering, used for uniq'ing HMMs in a std::map (the constructor sets every field, incl. unused senoneids[], before comparing)
+        {
+            return memcmp (this, &other, sizeof (other)) < 0;
+        }
+    };
+    std::vector<hmm> hmms;                          // the set of HMMs
+    std::unordered_map<std::string,size_t> symmap; // [name] -> index into hmms[]
+    struct transP
+    {
+    private:
+        size_t numstates;
+        float loga[MAXSTATES+1][MAXSTATES+1];       // log transition probs, indexed [from+1][to]
+        void check (int from, size_t to) const { if (from < -1 || from >= (int) numstates || to > numstates) throw std::logic_error ("transP: index out of bounds"); }
+    public:
+        void resize (size_t n) { if (n > MAXSTATES) throw std::runtime_error ("resize: requested transP that exceeds MAXSTATES"); numstates = n; }
+        size_t getnumstates() const { return numstates; }
+        // from = -1 and to = numstates are allowed, but we also allow 'from' to be size_t to avoid silly typecasts
+        float &       operator() (int from,    size_t to)       { check (from, to);      return loga[from+1][to]; } // from >= -1
+        const float & operator() (int from,    size_t to) const { check (from, to);      return loga[from+1][to]; } // from >= -1
+        const float & operator() (size_t from, size_t to) const { check ((int)from, to); return loga[from+1][to]; } // from >= 0
+        transP() : numstates (0) {}
+    };
+    std::vector<transP> transPs;                       // the transition matrices  --TODO: finish this
+    std::unordered_map<std::string,size_t> transPmap;  // [transPname] -> index into transPs[]  (standard container, same as symmap; std::hash_map is non-standard)
+public:
+    // get an hmm by index
+    const hmm & gethmm (size_t i) const { return hmms[i]; }
+
+    // get an hmm by name
+    size_t gethmmid (const string & name) const
+    {
+        auto iter = symmap.find (name);
+        if (iter == symmap.end())
+            throw std::logic_error ("gethmm: unknown unit name: " + name);
+        return iter->second;
+    }
+
+    // diagnostics: map state id to senone name
+    std::vector<std::string> statenames;
+    const char * getsenonename (size_t senoneid) const { return statenames[senoneid].c_str(); }
+
+    // inverse lookup, for re-scoring the ground-truth path for sequence training
+    // This may be ambiguous, but we know that for current setup, that's only the case for /sil/ and /sp/.
+    std::vector<int> senoneid2transPindex;      // or -1 if ambiguous
+    std::vector<int> senoneid2stateindex;       // 0..2, or -1 if ambiguous
+
+    // construct from model files
+    simplesenonehmm (const std::wstring & cdphonetyingpath, const std::wstring & statelistpath, const std::wstring & transPpath)
+    {
+        if (cdphonetyingpath.empty())   // no tying info specified --just leave an empty object
+            return;
+        fprintf (stderr, "simplesenonehmm: reading '%S', '%S', '%S'\n", cdphonetyingpath.c_str(), statelistpath.c_str(), transPpath.c_str());
+        // read the state list
+        vector<char> textbuffer;
+        auto readstatenames = msra::files::fgetfilelines (statelistpath, textbuffer);
+        foreach_index (s, readstatenames)
+            statenames.push_back (readstatenames[s]);
+        std::unordered_map<std::string,size_t> statemap; // [name] -> index
+        statemap.rehash (readstatenames.size());
+        foreach_index (i, readstatenames)
+            statemap[readstatenames[i]] = i;
+        // TRANSPNAME NUMSTATES (ROW_from[to])+
+        msra::strfun::tokenizer toks (" \t", 5);
+        auto transPlines = msra::files::fgetfilelines (transPpath, textbuffer);
+        transPs.resize (transPlines.size());
+        string key; key.reserve (100);
+        foreach_index (i, transPlines)
+        {
+            toks = transPlines[i];
+            if (toks.size() < 3)
+                throw std::runtime_error ("simplesenonehmm: too few tokens in transP line: " + string (transPlines[i]));
+            key = toks[0];  // transPname --using existing object to avoid malloc
+            transPmap[key] = i;
+            size_t numstates = msra::strfun::toint (toks[1]);
+            if (numstates == 0)
+                throw std::runtime_error ("simplesenonehmm: invalid numstates: " + string (transPlines[i]));
+            auto & transP = transPs[i];
+            transP.resize (numstates);
+            size_t k = 2;   // index into tokens; transP values start at toks[2]
+            for (int from = -1; from < (int) numstates; from++) for (size_t to = 0; to <= numstates; to++)
+            {
+                if (k >= toks.size())
+                    throw std::runtime_error ("simplesenonehmm: not enough tokens on transP line: " + string (transPlines[i]));
+                const char * sval = toks[k++];
+                const double aij = msra::strfun::todouble (sval);
+                if (aij > 1e-10)    // non-0
+                    transP(from,to) = logf ((float) aij);   // we store log probs
+                else
+                    transP(from,to) = -1e30f;               // (near-)zero probability: store a large negative value in place of log(0)
+            }
+            if (toks.size() > k)
+                throw std::runtime_error ("simplesenonehmm: unexpected garbage at endof transP line: " + string (transPlines[i]));
+        }
+        // allocate inverse lookup
+        senoneid2transPindex.resize (readstatenames.size(), -2);    // -2 means 'no value yet'
+        senoneid2stateindex.resize (readstatenames.size(), -2);
+        // read the cd-phone tying info
+        // HMMNAME TRANSPNAME SENONENAME+
+        auto lines = msra::files::fgetfilelines (cdphonetyingpath, textbuffer);
+        hmms.reserve (lines.size());
+        symmap.rehash (lines.size());
+        // two tables: (1) name -> HMM; (2) HMM -> HMM index (uniq'ed)
+        map<string,hmm> name2hmm;  // [name] -> unique HMM struct (without name)
+        map<hmm,size_t> hmm2index; // [unique HMM struct] -> hmm index, hmms[i] contains full hmm
+        foreach_index (i, lines)
+        {
+            toks = lines[i];
+            if (toks.size() < 3)
+                throw std::runtime_error ("simplesenonehmm: too few tokens in line: " + string (lines[i]));
+            const char * hmmname = toks[0];
+            const char * transPname = toks[1];
+            // build the HMM structure
+            hmm hmm;
+            hmm.name = NULL;    // for use as key in hash tables, we keep this NULL
+            // get the transP pointer
+            // TODO: this becomes a hard lookup with failure
+            key = transPname;   // (reuse existing memory)
+            auto iter = transPmap.find (key);
+            if (iter == transPmap.end())
+                throw std::runtime_error ("simplesenonehmm: unknown transP name: " + string (lines[i]));
+            size_t transPindex = iter->second;
+            hmm.transPindex = (unsigned char) transPindex;
+            hmm.transP = &transPs[transPindex];
+            if (hmm.transPindex != transPindex)
+                throw std::runtime_error ("simplesenonehmm: numeric overflow for transPindex field");
+            // get the senones
+            hmm.numstates = (unsigned char) (toks.size() - 2);    // remaining tokens
+            if (hmm.numstates != transPs[transPindex].getnumstates())
+                throw std::runtime_error ("simplesenonehmm: number of states mismatches that of transP: " + string (lines[i]));
+            if (toks.size() - 2 > _countof (hmm.senoneids))     // test the untruncated count: the (unsigned char) cast above could wrap a huge token count back into range
+                throw std::runtime_error ("simplesenonehmm: hmm.senoneids[MAXSTATES] is too small in line: " + string (lines[i]));
+            for (size_t s = 0; s < hmm.numstates; s++)
+            {
+                const char * senonename = toks[s+2];
+                key = senonename;   // (reuse existing memory)
+                auto iter = statemap.find (key);
+                if (iter == statemap.end())
+                    throw std::runtime_error ("simplesenonehmm: unrecognized senone name in line: " + string (lines[i]));
+                hmm.senoneids[s] = (unsigned short) iter->second;
+                if (hmm.getsenoneid(s) != iter->second)
+                    throw std::runtime_error ("simplesenonehmm: not enough bits to store senone index in line: " + string (lines[i]));
+                // inverse lookup
+                if (senoneid2transPindex[hmm.senoneids[s]] == -2)   // no value yet
+                    senoneid2transPindex[hmm.senoneids[s]] = hmm.transPindex;
+                else if (senoneid2transPindex[hmm.senoneids[s]] != hmm.transPindex)
+                    senoneid2transPindex[hmm.senoneids[s]] = -1;    // multiple inconsistent values
+                if (senoneid2stateindex[hmm.senoneids[s]] == -2)
+                    senoneid2stateindex[hmm.senoneids[s]] = (int) s;
+                else if (senoneid2stateindex[hmm.senoneids[s]] != (int) s)
+                    senoneid2stateindex[hmm.senoneids[s]] = -1;
+            }
+            for (size_t s = hmm.numstates; s < _countof (hmm.senoneids); s++)   // clear out the rest if needed
+                hmm.senoneids[s] = USHRT_MAX;
+            // add to name-to-HMM hash
+            auto ir = name2hmm.insert (std::make_pair (hmmname, hmm));    // insert into hash table
+            if (!ir.second) // not inserted
+                throw std::runtime_error ("simplesenonehmm: duplicate unit name in line: " + string (lines[i]));
+            // add to hmm-to-index hash
+            // and update the actual lookup table
+            size_t hmmindex = hmms.size();      // (assume it's a new entry)
+            auto is = hmm2index.insert (std::make_pair (hmm, hmmindex));
+            if (is.second)                      // was indeed inserted: add to hmms[]
+            {
+                // insert first, as this copies the name; we can then point to it
+                auto it = symmap.insert (std::make_pair (hmmname, hmmindex)); // insert into hash table
+                hmm.name = it.first->first.c_str(); // only use first name if multiple (the name is informative only anyway)
+                hmms.push_back (hmm);
+            }
+            else                                // not inserted
+            {
+                hmmindex = is.first->second;    // use existing value
+                symmap.insert (std::make_pair (hmmname, hmmindex)); // insert into hash table
+            }
+        }
+        fprintf (stderr, "simplesenonehmm: %zu units with %zu unique HMMs, %zu tied states, and %zu trans matrices read\n",
+                 symmap.size(), hmms.size(), statemap.size(), transPs.size());
+    }
+
+    // exposed so we can pass it to the lattice reader, which maps the symbol ids for us
+    const std::unordered_map<std::string,size_t> & getsymmap() const { return symmap; }
+
+    // inverse lookup --for scoring the ground-truth
+    // Note: /sil/ and /sp/ will be ambiguous, so need to handle them as a special case.
+    int senonetransP (size_t senoneid) const { return senoneid2transPindex[senoneid]; }
+    int senonestate (size_t senoneid) const { return senoneid2stateindex[senoneid]; }
+    const size_t getnumsenone () const {return senoneid2stateindex.size(); }
+    const bool statebelongstohmm (const size_t senoneid, const hmm & hmm) const       // return true if one of the states of this hmm == senoneid
+    {
+        size_t numstates = hmm.getnumstates();
+        for (size_t i = 0; i < numstates; i++)
+            if (hmm.senoneids[i] == senoneid)
+                return true;
+        return false;
+    }
+
+};
+
+};};
--- a/DataReader/Kaldi2Reader/simplethread.h
+++ b/DataReader/Kaldi2Reader/simplethread.h
@ -0,0 +1,152 @@
+//
+// <copyright file="simplethread.h" company="Microsoft">
+//     Copyright (c) Microsoft Corporation.  All rights reserved.
+// </copyright>
+//
+// simplethread.h -- a simple thread implementation
+//
+
+#pragma once
+
+#include "basetypes.h"
+#ifdef _WIN32
+#include <process.h>        // for _beginthread()
+#endif
+
+namespace msra { namespace util {
+
+// ---------------------------------------------------------------------------
+// signallingevent  -- wrapper around Windows events
+// ---------------------------------------------------------------------------
+class signallingevent   // TODO: should this go into basetypes.h?
+{
+    HANDLE h;   // the wrapped Win32 event handle; closed in the destructor
+public:
+    // create the event (bManualReset is passed as FALSE); 'initialstate' selects whether it starts out signalled
+    signallingevent (bool initialstate = true) : h (::CreateEvent (NULL, FALSE/*manual reset*/, initialstate ? TRUE : FALSE, NULL))
+    {
+        if (!h) // CreateEvent() hands back NULL on failure
+            throw std::runtime_error ("signallingevent: CreateEvent() failed");
+    }
+    ~signallingevent() { ::CloseHandle (h); }
+    void wait() { const DWORD rc = ::WaitForSingleObject (h, INFINITE); if (rc != WAIT_OBJECT_0) throw std::runtime_error ("wait: WaitForSingleObject() unexpectedly failed"); }
+    void flag() { const BOOL ok = ::SetEvent (h); if (!ok) throw std::runtime_error ("flag: SetEvent() unexpectedly failed"); }
+};
+
+
+// ---------------------------------------------------------------------------
+// simplethread  -- simple thread wrapper
+// ---------------------------------------------------------------------------
+class simplethread : CCritSec
+{
+    std::shared_ptr<std::exception> badallocexceptionptr;   // in case we fail to copy the exception
+    std::shared_ptr<std::exception> exceptionptr;           // if non-NULL, then thread failed with exception
+    // wrapper around passing the functor
+    signallingevent startsignal;                            // flagged by the new thread once it has copied the functor
+    const void * functorptr;                                // points to the caller's functor; only read before startsignal is flagged
+    template<typename FUNCTION> static unsigned int __stdcall staticthreadproc (void * usv)
+    {
+        simplethread * us = (simplethread*) usv;
+        const FUNCTION body = *(const FUNCTION *) us->functorptr;   // take a private copy: the caller's functor may go away after the constructor returns
+        us->startsignal.flag();
+        us->threadproc (body);
+        return 0;
+    }
+    template<typename FUNCTION> void threadproc (const FUNCTION & body)
+    {
+        try
+        {
+            body();                 // execute the function
+        }
+        catch (const std::exception & e)
+        {
+            fail (e);
+        }
+        catch (...)                 // we do not catch anything that is not based on std::exception
+        {
+            fprintf (stderr, "simplethread: thread proc failed with unexpected unknown exception, which is not allowed. Terminating\n");
+            fflush (stderr);        // (needed?)
+            abort();                // should never happen
+        }
+    }
+    HANDLE threadhandle;
+public:
+    template<typename FUNCTION> simplethread (const FUNCTION & body) : badallocexceptionptr (new std::bad_alloc()), startsignal (false), functorptr (&body)
+    {
+        unsigned int threadid;
+        uintptr_t rc = _beginthreadex (NULL/*security*/, 0/*stack*/, staticthreadproc<FUNCTION>, this, CREATE_SUSPENDED, &threadid);
+        if (rc == 0)
+            throw std::runtime_error ("simplethread: _beginthreadex() failed");
+        // _beginthreadex() returns a handle to the new thread that the caller owns and must close.
+        // We keep that handle directly; opening a second one via OpenThread() left the returned one unclosed (leak).
+        threadhandle = (HANDLE) rc;
+        DWORD rc1 = ::ResumeThread (threadhandle);
+        if (rc1 == (DWORD) -1)
+        {
+            ::TerminateThread (threadhandle, 0);
+            ::CloseHandle (threadhandle);
+            throw std::logic_error ("simplethread: ResumeThread() failed unexpectedly");
+        }
+        try
+        {
+            startsignal.wait(); // wait until functor has been copied
+        }
+        catch (...)
+        {
+            ::TerminateThread (threadhandle, 0);
+            ::CloseHandle (threadhandle);
+            throw;
+        }
+    }
+    // check if the thread is still alive and without error
+    void check()
+    {
+        CAutoLock lock (*this);
+        // pass on a pending exception
+        if (exceptionptr)   // re-create it with its stored type: 'throw *exceptionptr' would slice it down to a plain std::exception
+            { if (exceptionptr == badallocexceptionptr) throw std::bad_alloc(); else throw std::runtime_error (exceptionptr->what()); }
+        // the thread going away without error is also unexpected at this point
+        if (wait (0))   // (0 means don't block, so OK to call inside lock)
+            throw std::runtime_error ("check: thread terminated unexpectedly");
+    }
+    bool wait (DWORD dwMilliseconds = INFINITE)     // returns true if the thread has terminated, false on timeout
+    {
+        DWORD rc = ::WaitForSingleObject (threadhandle, dwMilliseconds);
+        if (rc == WAIT_TIMEOUT)
+            return false;
+        else if (rc == WAIT_OBJECT_0)
+            return true;
+        else
+            throw std::runtime_error ("wait: WaitForSingleObject() failed unexpectedly");
+    }
+    // thread itself can set the failure condition, e.g. before it signals some other thread to pick it up
+    void fail (const std::exception & e)
+    {
+        // exception: remember it  --this will remove the type info :(
+        CAutoLock lock (*this);
+        try // copy the exception--this may fail if we are out of memory
+        {
+            exceptionptr.reset (new std::runtime_error (e.what()));
+        }
+        catch (...) // failed to alloc: fall back to bad_alloc, which is most likely the cause in such situation
+        {
+            exceptionptr = badallocexceptionptr;
+        }
+    }
+    //void join()
+    //{
+    //    check();
+    //    wait();
+    //    check_for_exception();    // (check() not sufficient because it would fail since thread is gone)
+    //}
+    ~simplethread() throw()
+    {
+        // wait until it shuts down
+        try { wait(); }
+        catch (...) { ::TerminateThread (threadhandle, 0); }
+        // close the handle
+        ::CloseHandle (threadhandle);
+    }
+};
+
+};};
--- a/DataReader/Kaldi2Reader/ssefloat4.h
+++ b/DataReader/Kaldi2Reader/ssefloat4.h
@ -0,0 +1,123 @@
+//
+// <copyright file="ssefloat4.h" company="Microsoft">
+//     Copyright (c) Microsoft Corporation.  All rights reserved.
+// </copyright>
+//
+// ssefloat4.h -- float4 wrapper class around the SSE intrinsics for float[4]
+//
+
+#pragma once
+
+#ifdef _WIN32
+#include <intrin.h>         // for intrinsics
+#endif
+#ifdef __unix__
+#include <x86intrin.h>
+#endif
+
+namespace msra { namespace math {
+
+// ===========================================================================
+// float4 -- wrapper around the rather ugly SSE intrinsics for float[4]
+//
+// Do not use the intrinsics outside anymore; instead add all you need into this class.
+//
+// MSDN links:
+// basic: http://msdn.microsoft.com/en-us/library/x5c07e2a%28v=VS.80%29.aspx
+// load/store: (add this)
+// newer ones: (seems no single list available)
+// ===========================================================================
+
+class float4
+{
+    __m128 v;   // value
+private:
+    // extract the lowest of the four components as a scalar
+    float f0() const { float lo; _mm_store_ss (&lo, v); return lo; }
+    // wrap a raw __m128, taken to hold four f32 values (lets the operators below return intrinsic results directly)
+    float4 (const __m128 & v) : v (v) {}
+    // unwrap to the raw __m128 --should this be a reference?
+    operator __m128() const { return v; }
+    // overwrite with a raw __m128 (used where float4 objects are handled inside this class)
+    float4 & operator= (const __m128 & other) { v = other; return *this; }
+public:
+    float4() {} // uninitialized
+    float4 (const float4 & f4) : v (f4.v) {}
+    float4 & operator= (const float4 & other) { v = other.v; return *this; }
+
+    // broadcast one scalar into all four components
+    float4 (float f) : v (_mm_load1_ps (&f)) {}
+    //float4 (float f) : v (_mm_set_ss (f)) {}  // code seems more complex than _mm_load1_ps()
+
+    // unary minus, computed as 0 - v
+    float4 operator-() const { return float4 (_mm_sub_ps (_mm_setzero_ps(), v)); }  // UNTESTED; setzero is a composite
+
+    float4 operator& (const float4 & rhs) const { return float4 (_mm_and_ps (v, rhs.v)); }
+    float4 operator| (const float4 & rhs) const { return float4 (_mm_or_ps (v, rhs.v)); }
+    float4 operator+ (const float4 & rhs) const { return float4 (_mm_add_ps (v, rhs.v)); }
+    float4 operator- (const float4 & rhs) const { return float4 (_mm_sub_ps (v, rhs.v)); }
+    float4 operator* (const float4 & rhs) const { return float4 (_mm_mul_ps (v, rhs.v)); }
+    float4 operator/ (const float4 & rhs) const { return float4 (_mm_div_ps (v, rhs.v)); }
+
+    float4 & operator&= (const float4 & rhs) { v = _mm_and_ps (v, rhs.v); return *this; }
+    float4 & operator|= (const float4 & rhs) { v = _mm_or_ps (v, rhs.v); return *this; }
+    float4 & operator+= (const float4 & rhs) { v = _mm_add_ps (v, rhs.v); return *this; }
+    float4 & operator-= (const float4 & rhs) { v = _mm_sub_ps (v, rhs.v); return *this; }
+    float4 & operator*= (const float4 & rhs) { v = _mm_mul_ps (v, rhs.v); return *this; }
+    float4 & operator/= (const float4 & rhs) { v = _mm_div_ps (v, rhs.v); return *this; }
+
+    float4 operator>= (const float4 & rhs) const { return float4 (_mm_cmpge_ps (v, rhs.v)); }
+    float4 operator<= (const float4 & rhs) const { return float4 (_mm_cmple_ps (v, rhs.v)); }
+
+    // not yet implemented binary arithmetic ops: sqrt, rcp (reciprocal), rqsrt, min, max
+
+    // other goodies I came across (intrin.h):
+    //  - _mm_prefetch
+    //  - _mm_stream_ps --store without polluting cache
+    //  - unknown: _mm_addsub_ps, _mm_hsub_ps, _mm_movehdup_ps, _mm_moveldup_ps, _mm_blend_ps, _mm_blendv_ps, _mm_insert_ps, _mm_extract_ps, _mm_round_ps
+    //  - _mm_dp_ps dot product! http://msdn.microsoft.com/en-us/library/bb514054.aspx
+    //    Not so interesting for long vectors, we get better numerical precision with parallel adds and hadd at the end
+
+    // issue a prefetch (hint T0) for the float4 stored at 'p'
+    static void prefetch (const float4 * p) { _mm_prefetch (reinterpret_cast<const char *> (p), _MM_HINT_T0); }
+
+    // transpose a 4x4 matrix given as four columns into four rows
+    // Passing input as const ref to ensure aligned-ness
+    static void transpose (const float4 & col0, const float4 & col1, const float4 & col2, const float4 & col3,
+                           float4 & row0, float4 & row1, float4 & row2, float4 & row3)
+    {   // note: the local copies here get completely eliminated by optimization
+        __m128 m0 = col0.v; __m128 m1 = col1.v; __m128 m2 = col2.v; __m128 m3 = col3.v;
+        _MM_TRANSPOSE4_PS (m0, m1, m2, m3); // 8 instructions for 16 elements
+        row0.v = m0; row1.v = m1; row2.v = m2; row3.v = m3;
+    }
+
+    // store into 'r4'; intended to bypass the cache, but the streaming store is disabled, so this is a plain assignment
+    void storewithoutcache (float4 & r4) const
+    {
+        //_mm_stream_ps ((float*) &r4, v);
+        r4.v = v;
+    }
+
+#if 0
+    // save a float4 to RAM bypassing the cache ('without polluting the cache')
+    void storewithoutcache (float4 * p4) const
+    {
+        //_mm_stream_ps ((float*) p4, v);
+        *p4 = v;
+    }
+
+    // save a float to RAM bypassing the cache ('without polluting the cache')
+    void storewithoutcache (float & r) const
+    {
+        _mm_stream_ss (&r, v);
+    }
+#endif
+
+    // horizontal sum of all 4 components: two rounds of pairwise horizontal adds, then read the low component
+    // ... return float4, use another mechanism to store the low word
+    float sum() const { const __m128 pairs = _mm_hadd_ps (v, v); return float4 (_mm_hadd_ps (pairs, pairs)).f0(); }
+
+    // please add anything else you might need HERE
+};
+
+};};
--- a/DataReader/Kaldi2Reader/ssematrix.h
+++ b/DataReader/Kaldi2Reader/ssematrix.h
--- a/MachineLearning/PTaskHost/stdafx.cpp
+++ b/MachineLearning/PTaskHost/stdafx.cpp
@ -1,5 +1,10 @@
+//
+// <copyright file="stdafx.cpp" company="Microsoft">
+//     Copyright (c) Microsoft Corporation.  All rights reserved.
+// </copyright>
+//
 // stdafx.cpp : source file that includes just the standard includes
-// PTaskHost.pch will be the pre-compiled header
+// HTKMLFReader.pch will be the pre-compiled header
 // stdafx.obj will contain the pre-compiled type information

 #include "stdafx.h"
--- a/MachineLearning/PTaskHost/stdafx.h
+++ b/MachineLearning/PTaskHost/stdafx.h
@ -1,3 +1,8 @@
+//
+// <copyright file="stdafx.h" company="Microsoft">
+//     Copyright (c) Microsoft Corporation.  All rights reserved.
+// </copyright>
+//
 // stdafx.h : include file for standard system include files,
 // or project specific include files that are used frequently, but
 // are changed infrequently
@ -5,11 +10,15 @@

 #pragma once

-#include "targetver.h"
+#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms

+#ifndef __unix__
 #define WIN32_LEAN_AND_MEAN             // Exclude rarely-used stuff from Windows headers
 // Windows Header Files:
 #include <windows.h>
+#include <objbase.h>
+#include "targetver.h"
+#endif



--- a/DataReader/Kaldi2Reader/targetver.h
+++ b/DataReader/Kaldi2Reader/targetver.h
--- a/DataReader/Kaldi2Reader/utterancesourcemulti.h
+++ b/DataReader/Kaldi2Reader/utterancesourcemulti.h
--- a/DataReader/KaldiReader/HTKMLFReader.cpp
+++ b/DataReader/KaldiReader/HTKMLFReader.cpp
@ -1185,7 +1185,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
            {
                if (matrices.find(iter->first)==matrices.end())
                {
-                    fprintf(stderr,"GetMinibatchToWrite: feature node %ws specified in reader not found in the network\n",iter->first.c_str());
+                    fprintf(stderr,"GetMinibatchToWrite: feature node %S specified in reader not found in the network\n",iter->first.c_str());
                    throw std::runtime_error("GetMinibatchToWrite: feature node specified in reader not found in the network.");
                }
            }
--- a/DataReader/LMSequenceReader/LMSequenceReader.vcxproj
+++ b/DataReader/LMSequenceReader/LMSequenceReader.vcxproj
@ -52,11 +52,13 @@
    <LinkIncremental>true</LinkIncremental>
    <IncludePath>..\..\common\include;..\..\math\math;$(VCInstallDir)include;$(VCInstallDir)atlmfc\include;$(WindowsSDK_IncludePath);</IncludePath>
    <LibraryPath>$(SolutionDir)$(Platform)\$(Configuration);$(VCInstallDir)lib\amd64;$(VCInstallDir)atlmfc\lib\amd64;$(WindowsSDK_LibraryPath_x64);</LibraryPath>
+    <IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
    <IncludePath>..\..\common\include;..\..\math\math;$(VCInstallDir)include;$(VCInstallDir)atlmfc\include;$(WindowsSDK_IncludePath);</IncludePath>
    <LibraryPath>$(SolutionDir)$(Platform)\$(Configuration);$(VCInstallDir)lib\amd64;$(VCInstallDir)atlmfc\lib\amd64;$(WindowsSDK_LibraryPath_x64);</LibraryPath>
+    <IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <ClCompile>
--- a/DataReader/LMSequenceReader/SequenceParser.cpp
+++ b/DataReader/LMSequenceReader/SequenceParser.cpp
@ -7,7 +7,7 @@
 //

 #include "stdafx.h"
-#include "basetypes.h"
+#include "Basics.h"
 #include "SequenceParser.h"
 #include <stdexcept>
 #include <stdint.h>
--- a/DataReader/LMSequenceReader/SequenceParser.h
+++ b/DataReader/LMSequenceReader/SequenceParser.h
@ -442,7 +442,7 @@ public:
        long TickDelta = TickStop - TickStart;

        if (m_traceLevel > 2)
-            fprintf(stderr, "\n%d ms, %d numbers parsed\n\n", TickDelta, m_totalNumbersConverted );
+            fprintf(stderr, "\n%ld ms, %d numbers parsed\n\n", TickDelta, m_totalNumbersConverted );   // TickDelta is a long, so %ld (not %zu, which expects size_t)
        return lineCount;
    }

@ -602,7 +602,7 @@ public:
        long TickDelta = TickStop - TickStart;

        if (m_traceLevel > 2)
-            fprintf(stderr, "\n%d ms, %d numbers parsed\n\n", TickDelta, m_totalNumbersConverted );
+            fprintf(stderr, "\n%ld ms, %d numbers parsed\n\n", TickDelta, m_totalNumbersConverted );   // TickDelta is a long, so %ld (not %zu, which expects size_t)
        return lineCount;
    }

--- a/DataReader/LMSequenceReader/SequenceReader.cpp
+++ b/DataReader/LMSequenceReader/SequenceReader.cpp
@ -340,7 +340,7 @@ void SequenceReader<ElemType>::WriteLabelFile()
            }
            else if (!m_cachingWriter)
            {
-                //fprintf(stderr, "WARNING: file %ws NOT written to disk, label files only written when starting at epoch zero!", labelInfo.fileToWrite.c_str());
+                //fprintf(stderr, "WARNING: file %ls NOT written to disk, label files only written when starting at epoch zero!", labelInfo.fileToWrite.c_str());
                std::wcerr << "WARNING: file " << labelInfo.fileToWrite.c_str() << " NOT written to disk, label files only written when starting at epoch zero!" << endl;
            }
        }
@ -565,7 +565,7 @@ void SequenceReader<ElemType>::Init(const ConfigParameters& readerConfig)
    std::wstring m_file = readerConfig("file");
    if (m_traceLevel > 0)
    {
-        //fprintf(stderr, "reading sequence file %ws\n", m_file.c_str());
+        //fprintf(stderr, "reading sequence file %ls\n", m_file.c_str());
        std::wcerr << "reading sequence file" << m_file.c_str() << endl;
    }

@ -646,6 +646,11 @@ void SequenceReader<ElemType>::ReadClassInfo(const wstring & vocfile, bool /*fla
    }
    fin.close();
    class_size++;
+ 
+    std::vector<double> counts(idx4cnt.size());
+    for (auto p : idx4cnt)
+        counts[p.first] = (double)p.second;
+    m = noiseSampler<long>(counts);
 }

 // InitCache - Initialize the caching reader if cache files exist, otherwise the writer
@ -970,9 +975,11 @@ bool SequenceReader<ElemType>::SentenceEnd()
    return false; 
 }

-/// the output label is a [2 x T] matrix.
+/// the output label is a [4 x T] matrix, where T is the number of words observed
 /// the first row is the word index
 /// the second row is the class id of this word
+/// the third row is the beginning index of the class for this word
+/// the fourth row is the ending index + 1 of the class for this word
 template<class ElemType>
 void SequenceReader<ElemType>::GetLabelOutput(std::map<std::wstring, Matrix<ElemType>*>& matrices, 
                                              size_t m_mbStartSample, size_t actualmbsize)
@ -981,27 +988,67 @@ void SequenceReader<ElemType>::GetLabelOutput(std::map<std::wstring, Matrix<Elem
    Matrix<ElemType>* labels = matrices[m_labelsName[labelInfoOut]];
    if (labels == nullptr) return;
    
+    if (readerMode == ReaderMode::NCE)
+        labels->Resize(2 * (this->noise_sample_size + 1), actualmbsize);
+    else if (readerMode == ReaderMode::Class)
        labels->Resize(4, actualmbsize);
+    else if (readerMode == ReaderMode::Softmax)
+        labels->Resize(1, actualmbsize);
        
    for (size_t jSample = m_mbStartSample; j < actualmbsize; ++j, ++jSample)
    {
        // pick the right sample with randomization if desired
        size_t jRand = jSample;         
-         
        int    wrd = m_labelIdData[jRand];        
-        int    clsidx = idx4class[wrd]; 
-        
        labels->SetValue(0, j, (ElemType)wrd); 

+        if (readerMode == ReaderMode::NCE)
+        {
+            labels->SetValue(1, j, (ElemType)m.logprob(wrd));
+            for (size_t noiseid = 0; noiseid < this->noise_sample_size; noiseid++)
+            {
+                int wid = m.sample();
+                labels->SetValue(2 * (noiseid + 1), j, (ElemType)wid);
+                labels->SetValue(2 * (noiseid + 1) + 1, j, -(ElemType)m.logprob(wid));
+            }
+        }
+        else if (readerMode == ReaderMode::Class)
+        {
+            int clsidx = idx4class[wrd];
            if (class_size > 0){
                labels->SetValue(1, j, (ElemType)clsidx);
-            
                /// save the [begining ending_indx) of the class 
                labels->SetValue(2, j, (*m_classInfoLocal)(0, clsidx)); /// begining index of the class
                labels->SetValue(3, j, (*m_classInfoLocal)(1, clsidx)); /// end index of the class
            }
        }
+    }
+}
+/// Fills the matrix registered under STRIDX2PROB with the probability the noise
+/// sampler `m` assigns to each word id, as an [nwords x 1] dense column.
+/// Returns without doing anything when no such matrix is supplied, or when the
+/// table has already been filled once (tracked by m_idx2probRead).
+/// @param matrices  name -> matrix map supplied by the caller; note that
+///                  operator[] inserts a null entry if STRIDX2PROB is absent.
+template<class ElemType>
+void SequenceReader<ElemType>::GetInputProb(std::map<std::wstring, Matrix<ElemType>*>& matrices)
+{
+    Matrix<ElemType>* idx2prob = matrices[STRIDX2PROB];
+    if (idx2prob == nullptr) return;
 
+    // the probabilities do not change between minibatches, so build them only once
+    if (m_idx2probRead) return;
+
+    // populate local CPU matrix
+    m_id2Prob->SwitchToMatrixType(MatrixType::DENSE, matrixFormatDense, false);
+    m_id2Prob->Resize(nwords, 1, false);
+
+    //move to CPU since element-wise operation is expensive and can go wrong in GPU
+    int curDevId = m_id2Prob->GetDeviceId();
+    m_id2Prob->TransferFromDeviceToDevice(curDevId, CPUDEVICE, true, false, false);
+    // cast to ElemType rather than float so a double-precision reader keeps full precision
+    for (size_t j = 0; j < nwords; j++)
+        (*m_id2Prob)((int)j, 0) = (ElemType)m.prob((int)j);
+    m_id2Prob->TransferFromDeviceToDevice(CPUDEVICE, curDevId, true, false, false);
+
+    int oldDeviceId = idx2prob->GetDeviceId();
+    // caution (original author's note): SetValue may move idx2prob from GPU to CPU,
+    // so it is transferred back to the device it lived on before the copy
+    idx2prob->SetValue(*m_id2Prob);
+    idx2prob->TransferFromDeviceToDevice(idx2prob->GetDeviceId(), oldDeviceId, true);
+
+    m_idx2probRead = true;
 }

 template<class ElemType>
@ -1340,6 +1387,18 @@ void BatchSequenceReader<ElemType>::Init(const ConfigParameters& readerConfig)

    ConfigParameters featureConfig = readerConfig(m_featuresName,"");
    ConfigParameters labelConfig[2] = {readerConfig(m_labelsName[0],""),readerConfig(m_labelsName[1],"")};
+    string mode = featureConfig("mode","class");//class, softmax, nce
+
+    if (mode == "nce")
+    {
+        readerMode = ReaderMode::NCE;
+    
+        this->noise_sample_size = featureConfig("noise_number", "0");
+    }
+    else if (mode == "softmax")
+        readerMode = ReaderMode::Softmax;
+    else if (mode == "class")
+        readerMode = ReaderMode::Class;

    class_size = 0;
    m_featureDim = featureConfig("dim");
@ -1808,8 +1867,11 @@ bool BatchSequenceReader<ElemType>::GetMinibatch(std::map<std::wstring, Matrix<E
        features.TransferFromDeviceToDevice(CPUDEVICE, featureDeviceId, false,false, false);
                
        // TODO: move these two methods to startMiniBatchLoop()
+        if (readerMode == ReaderMode::Class)
+        {
            GetInputToClass(matrices);
            GetClassInfo();
+        }
        GetLabelOutput(matrices, 0, actualmbsize);

        // go to the next sequence
@ -1916,6 +1978,17 @@ bool BatchSequenceReader<ElemType>::DataEnd(EndDataType endDataType)

 }

+/// labels are stored in an [L x T] matrix,
+/// where L depends on the reader mode:
+///     4             in Class mode         [wid, class-id, beg-class, end-class]
+///     2*(noise + 1) in NCE mode           [wid, log-prob, (noise-id, negated noise log-prob)+]
+///     1             otherwise             [wid]
+/// the remaining comments describe the Class-mode layout only:
+/// 1st row is the word id
+/// 2nd row is the class id of this word
+/// 3rd and 4th rows are the beginning and ending indices of this class
+/// notice that the indices form the half-open range [beginning, ending_index) of the class,
+/// i.e., the ending_index is one past the true ending index
 template<class ElemType>
 void BatchSequenceReader<ElemType>::GetLabelOutput(std::map<std::wstring,
    Matrix<ElemType>*>& matrices,
@ -1925,35 +1998,42 @@ void BatchSequenceReader<ElemType>::GetLabelOutput(std::map<std::wstring,
    Matrix<ElemType>* labels = matrices[m_labelsName[labelInfoOut]];
    if (labels == nullptr) return;

-    if(labels->GetMatrixType() == MatrixType::DENSE) 
-    {
+    if (readerMode == ReaderMode::NCE)
+        labels->Resize(2 * (this->noise_sample_size + 1), actualmbsize);
+    else if (readerMode == ReaderMode::Class)
        labels->Resize(4, actualmbsize, false);
-    }
    else
-    {
-        RuntimeError("GetLabelOutput::should use dense matrix for labels which only save index of words"); 
-    }
+        labels->Resize(1, actualmbsize, false);
+

    //move to CPU since element-wise operation is expensive and can go wrong in GPU
    int curDevId = labels->GetDeviceId();
    labels->TransferFromDeviceToDevice(curDevId, CPUDEVICE, true, false, false);

-
-    if(labels->GetCurrentMatrixLocation() == CPU) {
+    if (labels->GetCurrentMatrixLocation() == CPU)
    for (size_t jSample = m_mbStartSample; j < actualmbsize; ++j, ++jSample)
    {
        // pick the right sample with randomization if desired
        size_t jRand = jSample;
-
        int    wrd = m_labelIdData[jRand];
-            int    clsidx = idx4class[wrd]; 
-
        labels->SetValue(0, j, (ElemType)wrd);
-
        SetSentenceEnd(wrd, j, actualmbsize);

-            if (class_size > 0)
+        if (readerMode == ReaderMode::NCE)
        {
+            labels->SetValue(1, j, (ElemType)m.logprob(wrd));
+            for (size_t noiseid = 0; noiseid < this->noise_sample_size; noiseid++)
+            {
+                int wid = m.sample();
+                labels->SetValue(2 * (noiseid + 1), j, (ElemType)wid);
+                labels->SetValue(2 * (noiseid + 1) + 1, j, -(ElemType)m.logprob(wid));
+            }
+        }
+        else if (readerMode == ReaderMode::Class)
+        {
+            int clsidx = idx4class[wrd];
+            if (class_size > 0){
+
                labels->SetValue(1, j, (ElemType)clsidx);

                /// save the [begining ending_indx) of the class 
--- a/DataReader/LMSequenceReader/SequenceReader.h
+++ b/DataReader/LMSequenceReader/SequenceReader.h
@ -9,12 +9,13 @@

 #include "DataReader.h"
 #include "DataWriter.h"
+#include "commandArgUtil.h"
 #include "SequenceParser.h"
 #include <string>
 #include <map>
 #include <vector>
 #include "minibatchsourcehelpers.h"
-
+#include <random>

 namespace Microsoft { namespace MSR { namespace CNTK {

@ -23,6 +24,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
 #define STRIDX2CLS L"idx2cls"
 #define CLASSINFO  L"classinfo"

+#define STRIDX2PROB L"idx2prob"
 #define MAX_STRING  2048

 enum LabelKind
@ -33,6 +35,60 @@ enum LabelKind
    labelOther = 3, // some other type of label
 };
            
+enum ReaderMode
+{
+    Softmax = 0,  // labels carry only the word id (one row per sample)
+    Class = 1, // class-based labels: word id, class id, and the class's begin/end indices (four rows)
+    NCE = 2,  // noise-contrastive estimation: word id and its log-probability, followed by noise samples
+    None = 3, // not referenced in the code visible here -- purpose TODO confirm
+};
+
+/// Draws "noise" indices, either from the distribution implied by a vector of
+/// per-index counts or uniformly over all indices, and reports the probability
+/// (and log-probability) the sampler assigns to a given index.
+/// Count is the integer type used by the uniform index distribution.
+template <typename Count>
+class noiseSampler {
+    std::vector<double> m_prob, m_log_prob;
+    // The distributions are mutable because the standard declares their
+    // operator() non-const, while sample(Engine&) is a const member.
+    mutable std::uniform_int_distribution<Count> unif_int;
+    bool uniform_sampling;
+    double uniform_prob;
+    double uniform_log_prob;
+    mutable std::piecewise_constant_distribution<double> d;
+    std::mt19937 rng;
+public:
+    /// Creates an empty sampler (size() == 0) with every scalar member defined.
+    noiseSampler()
+        : uniform_sampling(false), uniform_prob(0.0), uniform_log_prob(0.0)
+    { }
+
+    /// Builds a sampler over the indices 0 .. counts.size()-1.
+    /// @param counts             weight of each index; index i is drawn with
+    ///                           probability proportional to counts[i].
+    /// @param xuniform_sampling  when true, sample() and prob()/logprob() use the
+    ///                           uniform distribution instead of the counts.
+    noiseSampler(const std::vector<double> &counts, bool xuniform_sampling = false)
+        : uniform_sampling(xuniform_sampling), uniform_prob(0.0), uniform_log_prob(0.0), rng(1234)
+    {
+        const size_t k = counts.size();
+        // nothing to sample from: uniform_int_distribution(0, -1) would violate
+        // its precondition, so leave the sampler empty
+        if (k == 0)
+            return;
+        uniform_prob = 1.0 / k;
+        uniform_log_prob = std::log(uniform_prob);
+        // interval boundaries 0, 1, ..., k: bucket i is [i, i+1), weighted by counts[i]
+        std::vector<double> vn(k + 1);
+        for (size_t i = 0; i < vn.size(); i++)
+            vn[i] = (double)i;
+        d = std::piecewise_constant_distribution<double>(vn.begin(), vn.end(), counts.begin());
+        unif_int = std::uniform_int_distribution<Count>(0, (Count)(k - 1));
+        // every bucket has width 1, so a bucket's density equals its probability
+        m_prob = d.densities();
+        m_log_prob.resize(m_prob.size());
+        for (size_t i = 0; i < k; i++)
+            m_log_prob[i] = std::log(m_prob[i]);
+    }
+    /// Number of indices the sampler knows about.
+    int size() const { return (int)m_prob.size(); }
+    /// Probability assigned to index i under the active sampling mode.
+    double prob(int i) const { return uniform_sampling ? uniform_prob : m_prob[i]; }
+    /// Natural log of prob(i).
+    double logprob(int i) const { return uniform_sampling ? uniform_log_prob : m_log_prob[i]; }
+
+    /// Draws one index using the caller-supplied random engine.
+    template <typename Engine>
+    int sample(Engine &eng) const
+    {
+        // The uniform draw is always made, even in count-based mode; it is kept
+        // so the sequence of samples produced from a given seed does not change.
+        int m = (int)unif_int(eng);
+        if (uniform_sampling)
+            return m;
+        // d yields a real value in [i, i+1) for bucket i; truncation recovers i
+        return (int)d(eng);
+    }
+    
+    /// Draws one index using the sampler's own engine.
+    int sample()
+    {
+        return sample(this->rng);
+    }
+};
+
 template<class ElemType>
 class SequenceReader : public IDataReader<ElemType>
 {
@ -40,6 +96,7 @@ protected:
    bool   m_idx2clsRead; 
    bool   m_clsinfoRead;

+    bool   m_idx2probRead;
    std::wstring m_file; 
 public:
 	using LabelType = typename IDataReader<ElemType>::LabelType;
@ -52,10 +109,15 @@ public:
    Matrix<ElemType>* m_id2classLocal; // CPU version
    Matrix<ElemType>* m_classInfoLocal; // CPU version

+    Matrix<ElemType>* m_id2Prob; // CPU version
    int class_size;
    map<int, vector<int>> class_words;
    vector<int>class_cn;

+    int noise_sample_size;
+    noiseSampler<long> m;
+
+    ReaderMode readerMode;
    int eos_idx, unk_idx;
 public:
 //    typedef std::string LabelType;
@ -158,12 +220,15 @@ public:
    void GetLabelOutput(std::map<std::wstring, Matrix<ElemType>*>& matrices, 
                       size_t m_mbStartSample, size_t actualmbsize);
    void GetInputToClass(std::map<std::wstring, Matrix<ElemType>*>& matrices);
+
+    void GetInputProb(std::map<std::wstring, Matrix<ElemType>*>& matrices);
    void GetClassInfo();

    virtual void Destroy();
    SequenceReader() {
        m_featuresBuffer=NULL; m_labelsBuffer=NULL; m_clsinfoRead = false; m_idx2clsRead = false;             
        m_cachingReader=NULL; m_cachingWriter=NULL; m_labelsIdBuffer = NULL;
+        readerMode = ReaderMode::Class;
 		/*
        delete m_featuresBufferRow;
        delete m_featuresBufferRowIdx;
@ -249,6 +314,8 @@ public:
 	using SequenceReader<ElemType>::m_sequence;
 	using SequenceReader<ElemType>::idx4class;
 	using SequenceReader<ElemType>::m_indexer;
+	using SequenceReader<ElemType>::m;
+	using SequenceReader<ElemType>::readerMode;
 	using SequenceReader<ElemType>::GetIdFromLabel;
 	using SequenceReader<ElemType>::GetInputToClass;
 	using SequenceReader<ElemType>::GetClassInfo;
--- a/DataReader/LMSequenceReader/minibatchsourcehelpers.h
+++ b/DataReader/LMSequenceReader/minibatchsourcehelpers.h
@ -1,117 +0,0 @@
-//
-// <copyright file="minibatchsourcehelpers.h" company="Microsoft">
-//     Copyright (c) Microsoft Corporation.  All rights reserved.
-// </copyright>
-//
-// minibatchsourcehelpers.h -- helper classes for minibatch sources
-//
-
-#pragma once
-
-#include "basetypes.h"
-#include <stdio.h>
-#include <vector>
-#include <algorithm>
-
-namespace msra { namespace dbn {
-
-// ---------------------------------------------------------------------------
-// randomordering -- class to help manage randomization of input data
-// ---------------------------------------------------------------------------
-
-static inline size_t rand (const size_t begin, const size_t end)
-{
-    const size_t randno = ::rand() * RAND_MAX + ::rand();   // BUGBUG: still only covers 32-bit range
-    return begin + randno % (end - begin);
-}
-
-class randomordering                // note: NOT thread-safe at all
-{
-    // constants for randomization
-    const static size_t randomizeDisable=0;
-
-    typedef unsigned int INDEXTYPE; // don't use size_t, as this saves HUGE amounts of RAM
-    std::vector<INDEXTYPE> map;          // [t] -> t' indices in randomized order
-    size_t currentseed;             // seed for current sequence
-    size_t randomizationrange;      // t - randomizationrange/2 <= t' < t + randomizationrange/2 (we support this to enable swapping)
-                                    // special values (randomizeDisable)
-    void invalidate() { currentseed = (size_t) -1; }
-public:
-    randomordering() { invalidate(); randomizationrange = randomizeDisable;}
-
-    void resize (size_t len, size_t p_randomizationrange) { randomizationrange = p_randomizationrange; if (len > 0) map.resize (len); invalidate(); }
-
-    // return the randomized feature bounds for a time range
-    std::pair<size_t,size_t> bounds (size_t ts, size_t te) const
-    {
-        size_t tbegin = max (ts, randomizationrange/2) - randomizationrange/2;
-        size_t tend = min (te + randomizationrange/2, map.size());
-        return std::make_pair<size_t,size_t> (move(tbegin), move(tend));
-    }
-
-    // this returns the map directly (read-only) and will lazily initialize it for a given seed
-    const std::vector<INDEXTYPE> & operator() (size_t seed) //throw()
-    {
-        // if wrong seed then lazily recache the sequence
-        if (seed != currentseed && randomizationrange != randomizeDisable)
-        {
-            // test for numeric overflow
-            if (map.size()-1 != (INDEXTYPE) (map.size()-1))
-                throw std::runtime_error ("randomordering: INDEXTYPE has too few bits for this corpus");
-            // 0, 1, 2...
-            foreach_index (t, map) map[t] = (INDEXTYPE) t;
-
-            if (map.size() > RAND_MAX * (size_t) RAND_MAX)
-                throw std::runtime_error ("randomordering: too large training set: need to change to different random generator!");
-            srand ((unsigned int) seed);
-            size_t retries = 0;
-            foreach_index (t, map)
-            {
-                for (int tries = 0; tries < 5; tries++)
-                {
-                    // swap current pos with a random position
-                    // Random positions are limited to t+randomizationrange.
-                    // This ensures some locality suitable for paging with a sliding window.
-                    const size_t tbegin = max ((size_t) t, randomizationrange/2) - randomizationrange/2; // range of window  --TODO: use bounds() function above
-                    const size_t tend = min (t + randomizationrange/2, map.size());
-                    assert (tend >= tbegin);                    // (guard against potential numeric-wraparound bug)
-                    const size_t trand = rand (tbegin, tend);   // random number within windows
-                    assert ((size_t) t <= trand + randomizationrange/2 && trand < (size_t) t + randomizationrange/2);
-                    // if range condition is fulfilled then swap
-                    if (trand <= map[t] + randomizationrange/2 && map[t] < trand + randomizationrange/2
-                        && (size_t) t <= map[trand] + randomizationrange/2 && map[trand] < (size_t) t + randomizationrange/2)
-                    {
-                        ::swap (map[t], map[trand]);
-                        break;
-                    }
-                    // but don't multi-swap stuff out of its range (for swapping positions that have been swapped before)
-                    // instead, try again with a different random number
-                    retries++;
-                }
-            }
-            fprintf (stderr, "randomordering: %d retries for %d elements (%.1f%%) to ensure window condition\n", retries, map.size(), 100.0 * retries / map.size());
-            // ensure the window condition
-            foreach_index (t, map) assert ((size_t) t <= map[t] + randomizationrange/2 && map[t] < (size_t) t + randomizationrange/2);
-#if 0       // and a live check since I don't trust myself here yet
-            foreach_index (t, map) if (!((size_t) t <= map[t] + randomizationrange/2 && map[t] < (size_t) t + randomizationrange/2))
-            {
-                fprintf (stderr, "randomordering: windowing condition violated %d -> %d\n", t, map[t]);
-                throw std::logic_error ("randomordering: windowing condition violated");
-            }
-#endif
-#if 0       // test whether it is indeed a unique complete sequence
-            auto map2 = map;
-            ::sort (map2.begin(), map2.end());
-            foreach_index (t, map2) assert (map2[t] == (size_t) t);
-#endif
-            fprintf (stderr, "randomordering: recached sequence for seed %d: %d, %d, ...\n", (int) seed, (int) map[0], (int) map[1]);
-            currentseed = seed;
-        }
-        return map; // caller can now access it through operator[]
-    }
-    size_t CurrentSeed() {return currentseed;}
-};
-
-typedef unsigned short CLASSIDTYPE; // type to store state ids; don't use size_t --saves HUGE amounts of RAM
-
-};};
--- a/DataReader/LUSequenceReader/DataWriter.cpp
+++ b/DataReader/LUSequenceReader/DataWriter.cpp
@ -7,7 +7,7 @@
 //

 #include "stdafx.h"
-#include "basetypes.h"
+#include "Basics.h"

 #define DATAWRITER_EXPORTS
 #include "DataWriter.h"
--- a/DataReader/LUSequenceReader/LUSequenceReader.h
+++ b/DataReader/LUSequenceReader/LUSequenceReader.h
@ -10,6 +10,7 @@
 #include "DataReader.h"
 #include "DataWriter.h"
 #include "LUSequenceParser.h"
+#include "commandArgUtil.h" // for intargvector
 #include <string>
 #include <map>
 #include <vector>
--- a/DataReader/LUSequenceReader/LUSequenceReader.vcxproj
+++ b/DataReader/LUSequenceReader/LUSequenceReader.vcxproj
@ -52,11 +52,13 @@
    <LinkIncremental>true</LinkIncremental>
    <IncludePath>..\..\common\include;..\..\math\math;$(VCInstallDir)include;$(VCInstallDir)atlmfc\include;$(WindowsSDK_IncludePath);</IncludePath>
    <LibraryPath>$(SolutionDir)$(Platform)\$(Configuration);$(VCInstallDir)lib\amd64;$(VCInstallDir)atlmfc\lib\amd64;$(WindowsSDK_LibraryPath_x64);</LibraryPath>
+    <IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
  </PropertyGroup>
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
    <IncludePath>..\..\common\include;..\..\math\math;$(VCInstallDir)include;$(VCInstallDir)atlmfc\include;$(WindowsSDK_IncludePath);</IncludePath>
    <LibraryPath>$(SolutionDir)$(Platform)\$(Configuration);$(VCInstallDir)lib\amd64;$(VCInstallDir)atlmfc\lib\amd64;$(WindowsSDK_LibraryPath_x64);</LibraryPath>
+    <IntDir>$(Platform)\$(Configuration)\$(ProjectName)\</IntDir>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
    <ClCompile>
--- a/DataReader/LUSequenceReader/LUSequenceWriter.cpp
+++ b/DataReader/LUSequenceReader/LUSequenceWriter.cpp
@ -8,13 +8,14 @@

 #include "stdafx.h"
 #include <objbase.h>
-#include "basetypes.h"
+#include "Basics.h"
 #include <fstream>
 #include <algorithm>

 #define DATAWRITER_EXPORTS  // creating the exports here
 #include "DataWriter.h"
 #include "LUSequenceWriter.h"
+#include "commandArgUtil.h"
 #ifdef LEAKDETECT
 #include <vld.h> // for memory leak detection
 #endif
--- a/DataReader/LUSequenceReader/minibatchsourcehelpers.h
+++ b/DataReader/LUSequenceReader/minibatchsourcehelpers.h
@ -1,118 +0,0 @@
-//
-// <copyright file="minibatchsourcehelpers.h" company="Microsoft">
-//     Copyright (c) Microsoft Corporation.  All rights reserved.
-// </copyright>
-//
-// minibatchsourcehelpers.h -- helper classes for minibatch sources
-//
-
-
-#pragma once
-
-#include "basetypes.h"
-#include <stdio.h>
-#include <vector>
-#include <algorithm>
-
-namespace msra { namespace dbn {
-
-// ---------------------------------------------------------------------------
-// randomordering -- class to help manage randomization of input data
-// ---------------------------------------------------------------------------
-
-static inline size_t rand (const size_t begin, const size_t end)
-{
-    const size_t randno = ::rand() * RAND_MAX + ::rand();   // BUGBUG: still only covers 32-bit range
-    return begin + randno % (end - begin);
-}
-
-class randomordering                // note: NOT thread-safe at all
-{
-    // constants for randomization
-    const static size_t randomizeDisable=0;
-
-    typedef unsigned int INDEXTYPE; // don't use size_t, as this saves HUGE amounts of RAM
-    std::vector<INDEXTYPE> map;          // [t] -> t' indices in randomized order
-    size_t currentseed;             // seed for current sequence
-    size_t randomizationrange;      // t - randomizationrange/2 <= t' < t + randomizationrange/2 (we support this to enable swapping)
-                                    // special values (randomizeDisable)
-    void invalidate() { currentseed = (size_t) -1; }
-public:
-    randomordering() { invalidate(); randomizationrange = randomizeDisable;}
-
-    void resize (size_t len, size_t p_randomizationrange) { randomizationrange = p_randomizationrange; if (len > 0) map.resize (len); invalidate(); }
-
-    // return the randomized feature bounds for a time range
-    std::pair<size_t,size_t> bounds (size_t ts, size_t te) const
-    {
-        size_t tbegin = max (ts, randomizationrange/2) - randomizationrange/2;
-        size_t tend = min (te + randomizationrange/2, map.size());
-        return std::make_pair<size_t,size_t> (move(tbegin), move(tend));
-    }
-
-    // this returns the map directly (read-only) and will lazily initialize it for a given seed
-    const std::vector<INDEXTYPE> & operator() (size_t seed) //throw()
-    {
-        // if wrong seed then lazily recache the sequence
-        if (seed != currentseed && randomizationrange != randomizeDisable)
-        {
-            // test for numeric overflow
-            if (map.size()-1 != (INDEXTYPE) (map.size()-1))
-                throw std::runtime_error ("randomordering: INDEXTYPE has too few bits for this corpus");
-            // 0, 1, 2...
-            foreach_index (t, map) map[t] = (INDEXTYPE) t;
-
-            if (map.size() > RAND_MAX * (size_t) RAND_MAX)
-                throw std::runtime_error ("randomordering: too large training set: need to change to different random generator!");
-            srand ((unsigned int) seed);
-            size_t retries = 0;
-            foreach_index (t, map)
-            {
-                for (int tries = 0; tries < 5; tries++)
-                {
-                    // swap current pos with a random position
-                    // Random positions are limited to t+randomizationrange.
-                    // This ensures some locality suitable for paging with a sliding window.
-                    const size_t tbegin = max ((size_t) t, randomizationrange/2) - randomizationrange/2; // range of window  --TODO: use bounds() function above
-                    const size_t tend = min (t + randomizationrange/2, map.size());
-                    assert (tend >= tbegin);                    // (guard against potential numeric-wraparound bug)
-                    const size_t trand = rand (tbegin, tend);   // random number within windows
-                    assert ((size_t) t <= trand + randomizationrange/2 && trand < (size_t) t + randomizationrange/2);
-                    // if range condition is fulfilled then swap
-                    if (trand <= map[t] + randomizationrange/2 && map[t] < trand + randomizationrange/2
-                        && (size_t) t <= map[trand] + randomizationrange/2 && map[trand] < (size_t) t + randomizationrange/2)
-                    {
-                        ::swap (map[t], map[trand]);
-                        break;
-                    }
-                    // but don't multi-swap stuff out of its range (for swapping positions that have been swapped before)
-                    // instead, try again with a different random number
-                    retries++;
-                }
-            }
-            fprintf (stderr, "randomordering: %d retries for %d elements (%.1f%%) to ensure window condition\n", retries, map.size(), 100.0 * retries / map.size());
-            // ensure the window condition
-            foreach_index (t, map) assert ((size_t) t <= map[t] + randomizationrange/2 && map[t] < (size_t) t + randomizationrange/2);
-#if 0       // and a live check since I don't trust myself here yet
-            foreach_index (t, map) if (!((size_t) t <= map[t] + randomizationrange/2 && map[t] < (size_t) t + randomizationrange/2))
-            {
-                fprintf (stderr, "randomordering: windowing condition violated %d -> %d\n", t, map[t]);
-                throw std::logic_error ("randomordering: windowing condition violated");
-            }
-#endif
-#if 0       // test whether it is indeed a unique complete sequence
-            auto map2 = map;
-            ::sort (map2.begin(), map2.end());
-            foreach_index (t, map2) assert (map2[t] == (size_t) t);
-#endif
-            fprintf (stderr, "randomordering: recached sequence for seed %d: %d, %d, ...\n", (int) seed, (int) map[0], (int) map[1]);
-            currentseed = seed;
-        }
-        return map; // caller can now access it through operator[]
-    }
-    size_t CurrentSeed() {return currentseed;}
-};
-
-typedef unsigned short CLASSIDTYPE; // type to store state ids; don't use size_t --saves HUGE amounts of RAM
-
-};};
--- a/DataReader/LibSVMBinaryReader/LibSVMBinaryReader.cpp
+++ b/DataReader/LibSVMBinaryReader/LibSVMBinaryReader.cpp
@ -94,12 +94,12 @@ void LibSVMBinaryReader<ElemType>::WriteLabelFile()
            {
                labelFile << m_mapIdToLabel[i] << '\n';
            }
-            fprintf(stderr, "label file %ws written to disk\n", m_labelFileToWrite.c_str());
+            fprintf(stderr, "label file %ls written to disk\n", m_labelFileToWrite.c_str());
            m_labelFileToWrite.clear();
        }
        else if (!m_cachingWriter)
        {
-            fprintf(stderr, "WARNING: file %ws NOT written to disk yet, will be written the first time the end of the entire dataset is found.\n", m_labelFileToWrite.c_str());
+            fprintf(stderr, "WARNING: file %ls NOT written to disk yet, will be written the first time the end of the entire dataset is found.\n", m_labelFileToWrite.c_str());
        }
    }
 }
--- a/DataReader/LibSVMBinaryReader/LibSVMBinaryReader.h
+++ b/DataReader/LibSVMBinaryReader/LibSVMBinaryReader.h
@ -8,6 +8,7 @@
 #include "DataReader.h"
 #include "DataWriter.h"
 #include <string>
+#include "commandArgUtil.h"
 #include <map>
 #include <vector>
 #include "minibatchsourcehelpers.h"
--- a/Показать больше
+++ b/Показать больше