merged from master and resolved conflicts

This commit is contained in:
Frank Seide 2016-04-16 14:29:11 -07:00
Parent 0f1ce6cd98 e87b4d6efd
Commit 4c9f91868e
69 changed files with 12572 additions and 346 deletions

.gitignore (vendored, 4 changes)
View file

@@ -150,6 +150,10 @@ GeneratedArtifacts/
_Pvt_Extensions/
ModelManifest.xml
# Python
*.pyc
__pycache__/
# =========================
# Windows detritus
# =========================

View file

@@ -913,6 +913,58 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "NetworkTests", "Tests\UnitT
{EAD17188-072C-4726-B840-A769C36DAD1B} = {EAD17188-072C-4726-B840-A769C36DAD1B}
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Text", "Text", "{8656B71D-E24C-4AC2-8BE4-C07B415A3E15}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "SequenceClassification", "SequenceClassification", "{E53E63A0-FAA9-4416-9AD1-08A8FB87FEE1}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Miscellaneous", "Miscellaneous", "{8629430A-821E-43BA-AEC5-8B2CF31A2A7A}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "CIFAR-10", "CIFAR-10", "{0141526B-F257-4574-8CBE-99634726FFCE}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "01_Convolution", "01_Convolution", "{58286327-6742-44C4-A34E-D2583419E55E}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\01_Convolution\baseline.linux.cpu.txt = Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\01_Convolution\baseline.linux.cpu.txt
Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\01_Convolution\baseline.linux.gpu.txt = Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\01_Convolution\baseline.linux.gpu.txt
Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\01_Convolution\baseline.windows.txt = Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\01_Convolution\baseline.windows.txt
Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\01_Convolution\run-test = Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\01_Convolution\run-test
Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\01_Convolution\testcases.yml = Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\01_Convolution\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "02_BatchNormConv", "02_BatchNormConv", "{AB9207B9-B134-4C57-B7ED-F3DCF7B0DC5F}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\02_BatchNormConv\baseline.linux.gpu.txt = Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\02_BatchNormConv\baseline.linux.gpu.txt
Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\02_BatchNormConv\baseline.windows.txt = Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\02_BatchNormConv\baseline.windows.txt
Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\02_BatchNormConv\run-test = Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\02_BatchNormConv\run-test
Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\02_BatchNormConv\testcases.yml = Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\02_BatchNormConv\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "03_ResNet", "03_ResNet", "{12FB912C-43F8-40FE-BD7F-B52F589A1EBC}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\03_ResNet\baseline.linux.gpu.txt = Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\03_ResNet\baseline.linux.gpu.txt
Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\03_ResNet\baseline.windows.txt = Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\03_ResNet\baseline.windows.txt
Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\03_ResNet\run-test = Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\03_ResNet\run-test
Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\03_ResNet\testcases.yml = Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\03_ResNet\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "04_ResNet", "04_ResNet", "{2BFE4D88-6F32-4701-887A-1DE3D7626DBB}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\04_ResNet_56\baseline.linux.gpu.txt = Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\04_ResNet_56\baseline.linux.gpu.txt
Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\04_ResNet_56\baseline.windows.txt = Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\04_ResNet_56\baseline.windows.txt
Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\04_ResNet_56\run-test = Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\04_ResNet_56\run-test
Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\04_ResNet_56\testcases.yml = Tests\EndToEndTests\Examples\Image\Miscellaneous\CIFAR-10\04_ResNet_56\testcases.yml
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Config", "Config", "{EC780385-7580-4D15-914B-1D878A295CBC}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\Text\SequenceClassification\Config\seqcla.cntk = Tests\EndToEndTests\Text\SequenceClassification\Config\seqcla.cntk
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Data", "Data", "{D11F76CC-DB6D-4CB4-B3B7-AB139DE2F5FA}"
ProjectSection(SolutionItems) = preProject
Tests\EndToEndTests\Text\SequenceClassification\Data\embeddingmatrix.txt = Tests\EndToEndTests\Text\SequenceClassification\Data\embeddingmatrix.txt
Tests\EndToEndTests\Text\SequenceClassification\Data\Train.txt = Tests\EndToEndTests\Text\SequenceClassification\Data\Train.txt
EndProjectSection
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug_CpuOnly|x64 = Debug_CpuOnly|x64
@@ -1254,5 +1306,15 @@ Global
{48C2A9DE-FB2C-4724-9ADC-744216D79BCF} = {08A05A9A-4E45-42D5-83FA-719E99C04A30}
{2B1046A1-0140-43B7-B3DC-CF7DEEE1009E} = {8071EF60-30F7-4A77-81AA-ADCA0E18B1E3}
{CDA96AA3-3252-4978-A0BF-2ACD670823CB} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE}
{8656B71D-E24C-4AC2-8BE4-C07B415A3E15} = {6E565B48-1923-49CE-9787-9BBB9D96F4C5}
{E53E63A0-FAA9-4416-9AD1-08A8FB87FEE1} = {8656B71D-E24C-4AC2-8BE4-C07B415A3E15}
{8629430A-821E-43BA-AEC5-8B2CF31A2A7A} = {FC7E7EC7-6E6A-4518-81C6-DA60451C657A}
{0141526B-F257-4574-8CBE-99634726FFCE} = {8629430A-821E-43BA-AEC5-8B2CF31A2A7A}
{58286327-6742-44C4-A34E-D2583419E55E} = {0141526B-F257-4574-8CBE-99634726FFCE}
{AB9207B9-B134-4C57-B7ED-F3DCF7B0DC5F} = {0141526B-F257-4574-8CBE-99634726FFCE}
{12FB912C-43F8-40FE-BD7F-B52F589A1EBC} = {0141526B-F257-4574-8CBE-99634726FFCE}
{2BFE4D88-6F32-4701-887A-1DE3D7626DBB} = {0141526B-F257-4574-8CBE-99634726FFCE}
{EC780385-7580-4D15-914B-1D878A295CBC} = {E53E63A0-FAA9-4416-9AD1-08A8FB87FEE1}
{D11F76CC-DB6D-4CB4-B3B7-AB139DE2F5FA} = {E53E63A0-FAA9-4416-9AD1-08A8FB87FEE1}
EndGlobalSection
EndGlobal

View file

@@ -586,7 +586,6 @@ CNTK_SRC =\
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptEvaluator.cpp \
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptParser.cpp \
$(SOURCEDIR)/CNTK/BrainScript/BrainScriptTest.cpp \
$(SOURCEDIR)/CNTK/BrainScript/ExperimentalNetworkBuilder.cpp \
$(SOURCEDIR)/Common/BestGpu.cpp \
$(SOURCEDIR)/Common/MPIWrapper.cpp \

View file

@@ -209,7 +209,8 @@ template <typename ElemType>
void DoWriteOutput(const ConfigParameters& config)
{
ConfigParameters readerConfig(config(L"reader"));
readerConfig.Insert("traceLevel", config(L"traceLevel", "0"));
// Why?
//readerConfig.Insert("traceLevel", config(L"traceLevel", "0"));
readerConfig.Insert("randomize", "None"); // we don't want randomization when output results
DataReader testDataReader(readerConfig);

View file

@@ -73,14 +73,16 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
size_t i = 0;
auto tensorShape = ProcessTensorShapeParameters(node, params, i, /*isImage=*/false, cnNodeType);
wstring dynamicAxis = node->GetOptionalParameter("dynamicAxis", "");
// TODO: Map dynamicAxis from name to node at this point, where that node is memoized inside NDL.
// first look for this node already existing in the network
// BUGBUG: How does this set the dimensions then?
if (m_net->NodeNameExists(name))
nodePtr = dynamic_pointer_cast<ComputationNode<ElemType>>(m_net->GetNodeFromName(name));
else if (isSparse)
nodePtr = builder.CreateSparseInputNode(name, tensorShape);
nodePtr = builder.CreateSparseInputNode(name, tensorShape, dynamicAxis);
else
nodePtr = builder.CreateInputNode(name, tensorShape);
nodePtr = builder.CreateInputNode(name, tensorShape, dynamicAxis);
}
}
else if (cnNodeType == L"ImageInput" || cnNodeType == L"SparseImageInput")
@@ -97,11 +99,12 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
size_t imageHeight = ((NDLNode<ElemType>*) params[1])->GetScalar();
size_t imageChannels = ((NDLNode<ElemType>*) params[2])->GetScalar();
ImageLayoutKind imageLayoutKind = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "HWC"));
wstring dynamicAxis = node->GetOptionalParameter("dynamicAxis", "");
if (isSparse)
nodePtr = builder.CreateSparseInputNode(name, ImageDimensions::AsTensorShape(imageWidth, imageHeight, imageChannels, imageLayoutKind));
nodePtr = builder.CreateSparseInputNode(name, ImageDimensions::AsTensorShape(imageWidth, imageHeight, imageChannels, imageLayoutKind), dynamicAxis);
else
nodePtr = builder.CreateInputNode(name, ImageDimensions::AsTensorShape(imageWidth, imageHeight, imageChannels, imageLayoutKind));
nodePtr = builder.CreateInputNode(name, ImageDimensions::AsTensorShape(imageWidth, imageHeight, imageChannels, imageLayoutKind), dynamicAxis);
}
}
else if (OperationNameOf(LearnableParameter) == cnNodeType || cnNodeType == L"ImageParameter")

View file

@@ -34,10 +34,12 @@ Parameter = LearnableParameter // deprecated
# TODO: make Parameter take tensor dims?
ParameterTensor(dims, learningRateMultiplier = 1.0, init = 'uniform'/*|fixedValue|gaussian|fromFile*/, initValueScale = 1, value = 0, initFromFilePath = '', initFromLiteral = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]
ConstantFromString(literal, tag='') = ParameterTensor((0)/*dim, will be inferred*/, init = 'fromLiteral', initFromLiteral = literal, learningRateMultiplier = 0.0)
Input(dims, tag='feature') = new ComputationNode [ operation = 'InputValue' ; shape = new TensorShape [ /*dims*/ ] ; isImage = false /*plus the function args*/ ]
SparseInput(dims, tag='feature') = new ComputationNode [ operation = 'SparseInputValue' ; shape = new TensorShape [ /*dims*/ ] ; isImage = false /*plus the function args*/ ]
ImageInput(imageWidth, imageHeight, imageChannels, imageLayout='CHW', tag='feature') = new ComputationNode [ operation = 'InputValue' ; isImage = true /*plus the function args*/ ]
SparseImageInput(imageWidth, imageHeight, imageChannels, imageLayout='CHW', tag='feature') = new ComputationNode [ operation = 'SparseInputValue' ; isImage = true /*plus the function args*/ ]
DynamicAxis(tag='') = new ComputationNode [ operation = 'DynamicAxis' ; /*plus the function args*/ ]
Input(dims, dynamicAxis='', tag='feature') = new ComputationNode [ operation = 'InputValue' ; shape = new TensorShape [ /*dims*/ ] ; isImage = false /*plus the function args*/ ]
# TODO: change from dynamicAxis by name to dynamicAxis being an actual object
SparseInput(dims, dynamicAxis='', tag='feature') = new ComputationNode [ operation = 'SparseInputValue' ; shape = new TensorShape [ /*dims*/ ] ; isImage = false /*plus the function args*/ ]
ImageInput(imageWidth, imageHeight, imageChannels, imageLayout='CHW', dynamicAxis='', tag='feature') = new ComputationNode [ operation = 'InputValue' ; isImage = true /*plus the function args*/ ]
SparseImageInput(imageWidth, imageHeight, imageChannels, imageLayout='CHW', dynamicAxis='', tag='feature') = new ComputationNode [ operation = 'SparseInputValue' ; isImage = true /*plus the function args*/ ]
EnvironmentInput(propertyName, tag='') = new ComputationNode [ operation = 'EnvironmentInput' /*plus the function args*/ ]
ConstantTensor(val, dims, tag='') = ParameterTensor(dims, learningRateMultiplier = 0, init = 'fixedValue', value = val)
Constant(val, rows = 1, cols = 1, tag='') = Parameter(rows, cols, learningRateMultiplier = 0, init = 'fixedValue', value = val)
@@ -79,8 +81,9 @@ Transpose(x) = TransposeDimensions(x, 1, 2)
Times(A, B, outputRank=1, tag='') = new ComputationNode [ operation = 'Times' ; inputs = ( A : B ) /*plus the function args*/ ]
Logistic(label, probability, tag='') = new ComputationNode [ operation = 'Logistic' ; inputs = (label : probability) /*plus the function args*/ ]
WeightedLogistic(label, probability, instanceWeight, tag='') = new ComputationNode [ operation = 'Logistic' ; inputs = (label : probability : instanceWeight) /*plus the function args*/ ]
ReconcileMBLayout(dataInput, layoutInput, tag='') = new ComputationNode [ operation = 'ReconcileMBLayout' ; inputs = (dataInput : layoutInput) /*plus the function args*/ ]
CastAs (type, data) = ReconcileMBLayout (data, type) # read as CastAs<type>(data) where the cast may consist of rearranging the data w.r.t. MBLayout or broadcasting across sequence items
ReconcileDynamicAxis(dataInput, layoutInput, tag='') = new ComputationNode [ operation = 'ReconcileDynamicAxis' ; inputs = (dataInput : layoutInput) /*plus the function args*/ ]
ReconcileMBLayout = ReconcileDynamicAxis # back compat
CastAs (type, data) = ReconcileDynamicAxis (data, type) # read as CastAs<type>(data) where the cast may consist of rearranging the data w.r.t. MBLayout or broadcasting across sequence items
Convolution(weightNode, inputValueNode, kernelDims, mapDims = 1, stride = 1, sharing = true, autoPadding = true, lowerPad = 0, upperPad = 0, imageLayout='CHW', maxTempMemSizeInSamples = 0, tag='') = new ComputationNode [ operation = 'Convolution' ; inputs = (weightNode : inputValueNode); kernelShape = new TensorShape [ dims = kernelDims ] ; mapCount = new TensorShape [ dims = mapDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimSharing = new BoolVector [ items = sharing ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
Pooling(input, poolKind/*'max'|'average'*/, kernelDims, stride=1, autoPadding = true, lowerPad = 0, upperPad = 0, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'Pooling' ; inputs = (input); pool = poolKind ; kernelShape = new TensorShape [ dims = kernelDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
MaxPooling(input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'MaxPooling' ; inputs = input /*plus the function args*/ ]
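Editor's note: a hedged sketch of how the new dynamicAxis parameter above reaches C++. The axis name L"q", the builder variable, and the net it wraps are illustrative assumptions, not from this commit; see the ComputationNetworkBuilder changes further down for the actual signatures.

// BS:  q = DynamicAxis() ; features = Input(512, dynamicAxis='q')
// maps, via the NDL/BS evaluators, onto the extended input factory. An empty
// axis name (the default) keeps the legacy single network-wide layout.
ComputationNetworkBuilder<float> builder(net); // assumed pre-existing network 'net'
auto features = builder.CreateInputNode(L"features", TensorShape(512), /*dynamicAxisName=*/L"q");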

View file

@@ -1,134 +0,0 @@
#if 0 // this entire file can be removed once CNTK.core.bs works
// ExperimentalNetworkBuilder.cpp -- interface to new version of NDL (and config) parser --fseide
#define _CRT_NONSTDC_NO_DEPRECATE // make VS accept POSIX functions without _
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
#include <string>
using namespace std;
// TODO: move to actual text files to be included
wstring standardFunctions =
L"Print(value, format='') = new PrintAction [ what = value /*; how = format*/ ] \n"
L"Debug(value, say = '', enabled = true) = new Debug [ /*macro arg values*/ ] \n"
L"Format(value, format) = new StringFunction [ what = 'Format' ; arg = value ; how = format ] \n"
L"Replace(s, from, to) = new StringFunction [ what = 'Replace' ; arg = s ; replacewhat = from ; withwhat = to ] \n"
L"Substr(s, begin, num) = new StringFunction [ what = 'Substr' ; arg = s ; pos = begin ; chars = num ] \n"
L"Chr(c) = new StringFunction [ what = 'Chr' ; arg = c ] \n"
L"Floor(x) = new NumericFunction [ what = 'Floor' ; arg = x ] \n"
L"Length(x) = new NumericFunction [ what = 'Length' ; arg = x ] \n"
L"Ceil(x) = -Floor(-x) \n"
L"Round(x) = Floor(x+0.5) \n"
L"Sign(x) = if x > 0 then 1 else if x < 0 then -1 else 0 \n"
L"Min(a,b) = if a < b then a else b \n"
L"Max(a,b) = if a > b then a else b \n"
L"Fac(n) = if n > 1 then Fac(n-1) * n else 1 \n";
wstring commonMacros =
L"BFF(in, rows, cols) = [ B = Parameter(rows, 1, init = 'fixedValue', value = 0) ; W = Parameter(rows, cols) ; z = W*in+B ] \n"
L"SBFF(in, rows, cols) = [ Eh = Sigmoid(BFF(in, rows, cols).z) ] \n "
L"MeanVarNorm(feat) = PerDimMeanVarNormalization(feat, Mean(feat), InvStdDev(feat)) \n"
L"LogPrior(labels) = Log(Mean(labels)) \n";
wstring computationNodes = // TODO: use actual TypeName() here? would first need to make it a wide string; we should also extract those two methods into the base macro
L"LearnableParameter(rows, cols, learningRateMultiplier = 1.0, init = 'uniform'/*|fixedValue|gaussian|fromFile*/, initValueScale = 1, value = 0, initFromFilePath = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ dims = (rows : cols) ] /*plus the function args*/ ]\n"
L"Parameter = LearnableParameter // deprecated \n"
L"ParameterTensor(dims, learningRateMultiplier = 1.0, init = 'uniform'/*|fixedValue|gaussian|fromFile*/, initValueScale = 1, value = 0, initFromFilePath = '', initOnCPUOnly=true, randomSeed=-1, tag='') = new ComputationNode [ operation = 'LearnableParameter' ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]\n"
// TODO: ImageParameter?
// ^^ already works; vv untested
L"Input(dims, tag='feature') = new ComputationNode [ operation = 'InputValue' ; shape = new TensorShape [ /*dims*/ ] ; isImage = false /*plus the function args*/ ]\n" // note: naming a little inconsistent // TODO: re-test after flag change
L"SparseInput(dims, tag='feature') = new ComputationNode [ operation = 'SparseInputValue' ; shape = new TensorShape [ /*dims*/ ] ; isImage = false /*plus the function args*/ ]\n"
L"ImageInput(imageWidth, imageHeight, imageChannels, imageLayout='CHW', tag='feature') = new ComputationNode [ operation = 'InputValue' ; isImage = true /*plus the function args*/ ]\n"
L"SparseImageInput(imageWidth, imageHeight, imageChannels, imageLayout='CHW', tag='feature') = new ComputationNode [ operation = 'SparseInputValue' ; isImage = true /*plus the function args*/ ]\n"
L"Constant(val, rows = 1, cols = 1, tag='') = Parameter(rows, cols, learningRateMultiplier = 0, init = 'fixedValue', value = val) \n"
L"PastValue(dims, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'PastValue' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]\n"
L"FutureValue(dims, input, timeStep = 1, defaultHiddenActivation = 0.1, tag='') = new ComputationNode [ operation = 'FutureValue' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]\n"
// TODO: ^^ DelayedValues no longer need to know their dimension. That is inferred in Validation.
L"Shift(input, fromOffset, boundaryValue, boundaryMode=-1/*context*/, dim=-1, tag='') = new ComputationNode [ operation = 'Shift' ; inputs = (input : boundaryValue) /*plus the function args*/ ]\n"
L"RowSlice(startIndex, numRows, input, tag='') = new ComputationNode [ operation = 'RowSlice' ; inputs = input /*plus the function args*/ ]\n"
L"RowRepeat(input, numRepeats, tag='') = new ComputationNode [ operation = 'RowRepeat' ; inputs = input /*plus the function args*/ ]\n"
L"RowStack(inputs, tag='') = new ComputationNode [ operation = 'RowStack' /*plus the function args*/ ]\n"
L"Reshape(input, numRows, imageWidth = 0, imageHeight = 0, imageChannels = 0, tag='') = new ComputationNode [ operation = 'LegacyReshape' ; inputs = input /*plus the function args*/ ]\n"
L"NewReshape(input, dims, beginDim=0, endDim=0, tag='') = new ComputationNode [ operation = 'Reshape' ; inputs = input ; shape = new TensorShape [ /*dims*/ ] /*plus the function args*/ ]\n"
L"ReshapeDimension(x, dim, tensorShape) = NewReshape(x, tensorShape, beginDim=dim, endDim=dim + 1) \n"
L"FlattenDimensions(x, dim, num) = NewReshape(x, 0, beginDim=dim, endDim=dim + num) \n"
L"SplitDimension(x, dim, N) = ReshapeDimension(x, dim, 0:N) \n"
L"TransposeDimensions(input, dim1, dim2, tag='') = new ComputationNode [ operation = 'TransposeDimensions' ; inputs = input /*plus the function args*/ ]\n"
L"Transpose(x) = TransposeDimensions(x, 1, 2)\n"
L"Times(A, B, outputRank=1, tag='') = new ComputationNode [ operation = 'Times' ; inputs = ( A : B ) /*plus the function args*/ ]\n"
// TODO: Logistic should be generated with with BinaryStandardNode macro below.
L"Logistic(label, probability, tag='') = new ComputationNode [ operation = 'Logistic' ; inputs = (label : probability) /*plus the function args*/ ]\n"
L"WeightedLogistic(label, probability, instanceWeight, tag='') = new ComputationNode [ operation = 'Logistic' ; inputs = (label : probability : instanceWeight) /*plus the function args*/ ]\n"
L"ReconcileMBLayout(dataInput, layoutInput, tag='') = new ComputationNode [ operation = 'ReconcileMBLayout' ; inputs = (dataInput : layoutInput) /*plus the function args*/ ]\n"
L"Convolution(weightNode, inputValueNode, kernelWidth, kernelHeight, outputChannels, horizontalSubsample, verticalSubsample, zeroPadding = false, maxTempMemSizeInSamples = 0, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'Convolution' ; inputs = (weightNode : inputValueNode) /*plus the function args*/ ]\n"
L"MaxPooling(input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'MaxPooling' ; inputs = input /*plus the function args*/ ]\n"
L"AveragePooling(input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'AveragePooling' ; inputs = input /*plus the function args*/ ]\n"
// TODO: define DelayedValue, with negative delay for future; cannot do this yet, need to be able to say something like delay = -(^.delay)
// aliases
L"ColumnwiseCrossProduct = KhatriRaoProduct // deprecated \n" // TODO: should it be deprecated? It is described as easier to understand in the CNTKBook.
L"ClassificationError = ErrorPrediction \n"
L"Delay = PastValue \n" // TODO: should it allow negative offsets and an if test here?
L"BatchNormalization(input, scale, bias, runMean, runInvStdDev, eval, spatial, normalizationTimeConstant = 0, epsilon = 0.00001, useCntkEngine = true, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'BatchNormalization' ; inputs = (input : scale : bias : runMean : runInvStdDev) /*plus the function args*/ ]\n"
// standard nodes. We use macros to define these strings.
#define UnaryStandardNode(Op, a) L## #Op L"(" L## #a L", tag='') = new ComputationNode [ operation = '" L## #Op L"' ; inputs = " L## #a L" /*plus the function args*/ ]\n"
#define BinaryStandardNode(Op, a, b) L## #Op L"(" L## #a L", " L## #b L", tag='') = new ComputationNode [ operation = '" L## #Op L"' ; inputs = (" L## #a L" : " L## #b L") /*plus the function args*/ ]\n"
#define TernaryStandardNode(Op, a, b, c) L## #Op L"(" L## #a L", " L## #b L", " L## #c L", tag='') = new ComputationNode [ operation = '" L## #Op L"' ; inputs = (" L## #a L" : " L## #b L" : " L## #c L") /*plus the function args*/ ]\n"
#define QuaternaryStandardNode(Op, a, b, c, d) L## #Op L"(" L## #a L", " L## #b L", " L## #c L", " L## #d L", tag='') = new ComputationNode [ operation = '" L## #Op L"' ; inputs = (" L## #a L" : " L## #b L" : " L## #c L" : " L## #d L") /*plus the function args*/ ]\n"
#ifdef COMING_SOON
TernaryStandardNode(CRF, labelVectorSequence, positionDependenScoreVectorSequence, transitionScores) // TODO: better names
#endif
UnaryStandardNode(Abs, x)
QuaternaryStandardNode(ClassBasedCrossEntropyWithSoftmax, labelClassDescriptorVectorSequence, mainInputInfo, mainWeight, classLogProbsBeforeSoftmax)
// BUGBUG: the commented-out ones are not mentioned in the CNTK book, nor are their parameters documented in the source code
BinaryStandardNode(ColumnElementTimes, aVectorSequence, anotherVectorSequence)
BinaryStandardNode(CosDistance, aVectorSequence, anotherVectorSequence)
QuaternaryStandardNode(CosDistanceWithNegativeSamples, aVectorSequence, anotherVectorSequence, numShifts, numNegSamples)
//BinaryStandardNode(CosDistanceWithNegativeSamplesNode)
UnaryStandardNode(Cosine, x)
BinaryStandardNode(CrossEntropy, refProbVectorSequence, outProbVectorSequence)
BinaryStandardNode(CrossEntropyWithSoftmax, labelVectorSequence, outProbVectorSequence)
BinaryStandardNode(DiagTimes, diagonalMatrixAsColumnVector, matrix)
UnaryStandardNode(Dropout, activationVectorSequence)
//BinaryStandardNode(DummyCriterionNode)
BinaryStandardNode(ElementTimes, aMatrix, anotherMatrix)
BinaryStandardNode(ErrorPrediction, labelVectorSequence, outVectorSequence) // CNTKBook: ClassificationError?
UnaryStandardNode(Exp, x)
QuaternaryStandardNode(GMMLogLikelihood, unnormalizedPriorVector, meansAsRows, logStdDevAsRows, dataVectorSequence)
UnaryStandardNode(InvStdDev, dataVectorSequence)
BinaryStandardNode(KhatriRaoProduct, leftMatrix, rightMatrix)
//BinaryStandardNode(LSTMNode)
UnaryStandardNode(Log, x)
UnaryStandardNode(LogSoftmax, z)
//BinaryStandardNode(LookupTableNode)
UnaryStandardNode(MatrixL1Reg, matrix)
UnaryStandardNode(MatrixL2Reg, matrix)
// BUGBUG: CNTKBook also mentions L1Norm and L2Norm
UnaryStandardNode(Mean, dataVectorSequence)
BinaryStandardNode(Minus, leftMatrix, rightMatrix)
UnaryStandardNode(Negate, input)
TernaryStandardNode(PerDimMeanVarDeNormalization, dataVectorSequence, meanVector, invStdDevVector) // TODO: correct?
TernaryStandardNode(PerDimMeanVarNormalization, dataVectorSequence, meanVector, invStdDevVector)
BinaryStandardNode(Plus, leftMatrix, rightMatrix)
UnaryStandardNode(RectifiedLinear, z)
//BinaryStandardNode(RowElementTimesNode)
BinaryStandardNode(Scale, scalarScalingFactor, matrix)
#ifdef COMING_SOON
//BinaryStandardNode(SequenceDecoderNode)
#endif
UnaryStandardNode(Sigmoid, z)
UnaryStandardNode(Softmax, z)
UnaryStandardNode(Hardmax, z)
BinaryStandardNode(SquareError, aMatrix, anotherMatrix)
UnaryStandardNode(SumColumnElements, z)
UnaryStandardNode(SumElements, matrix)
UnaryStandardNode(Tanh, z)
UnaryStandardNode(TimeReverse, vectorSequence)
BinaryStandardNode(TransposeTimes, leftMatrix, rightMatrix)
// those nodes are deprecated, we won't implement them in BS:
//BinaryStandardNode(NoiseContrastiveEstimationNode)
//BinaryStandardNode(ParallelNode)
//BinaryStandardNode(StrideTimesNode)
;
#endif
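Editor's note: the UnaryStandardNode/BinaryStandardNode macros in the deleted file build BrainScript source text out of C preprocessor stringizing (#) and token pasting (##). A minimal self-contained sketch of that mechanism, independent of CNTK:

#include <iostream>
#include <string>
// #Op stringizes the argument ("Abs"); L ## "Abs" pastes into the wide literal L"Abs";
// adjacent wide literals then concatenate into one line of BrainScript text.
#define UNARY(Op, a) L## #Op L"(" L## #a L", tag='') = new ComputationNode [ operation = '" L## #Op L"' ; inputs = " L## #a L" ]\n"
int main()
{
    std::wstring line = UNARY(Abs, x);
    std::wcout << line; // prints: Abs(x, tag='') = new ComputationNode [ operation = 'Abs' ; inputs = x ]
}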

View file

@@ -205,7 +205,6 @@
<ClCompile Include="BrainScript\BrainScriptEvaluator.cpp" />
<ClCompile Include="BrainScript\BrainScriptParser.cpp" />
<ClCompile Include="BrainScript\BrainScriptTest.cpp" />
<ClCompile Include="BrainScript\ExperimentalNetworkBuilder.cpp" />
<ClCompile Include="CNTK.cpp" />
<ClCompile Include="ModelEditLanguage.cpp" />
<ClCompile Include="stdafx.cpp" />

View file

@@ -44,9 +44,6 @@
<ClCompile Include="BrainScript\BrainScriptTest.cpp">
<Filter>BrainScript</Filter>
</ClCompile>
<ClCompile Include="BrainScript\ExperimentalNetworkBuilder.cpp">
<Filter>BrainScript</Filter>
</ClCompile>
<ClCompile Include="..\Common\ExceptionWithCallStack.cpp">
<Filter>Common</Filter>
</ClCompile>

View file

@@ -31,6 +31,34 @@ static const char* GetReaderName(const string& precision)
InvalidArgument("DataReader: The 'precision' parameter must be 'float' or 'double'.");
}
void DataReaderBase::SetMinibatchLayout(StreamMinibatchInputs& minibatch)
{
assert(minibatch.begin() != minibatch.end());
auto& pMBLayout = minibatch.begin()->second.pMBLayout;
// This is only allowed for old readers, which support a single layout for all inputs.
for (const auto& iter : minibatch)
{
assert(iter.second.pMBLayout == pMBLayout);
// TODO: This should be a runtime check, not an assert() that only runs in Debug.
UNUSED(iter);
}
CopyMBLayoutTo(pMBLayout);
}
bool DataReaderBase::GetMinibatch(StreamMinibatchInputs& minibatch)
{
if (TryGetMinibatch(minibatch))
{
SetMinibatchLayout(minibatch);
return true;
}
return false;
}
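Editor's note: GetMinibatch/TryGetMinibatch is a textbook template-method split. Below is a self-contained model of the contract; Layout, Input, Inputs are simplified stand-ins for CNTK's MBLayout/StreamMinibatchInputs, and LegacyReader is hypothetical.

#include <cassert>
#include <map>
#include <memory>
#include <string>

struct Layout { };                                    // stand-in for MBLayout
struct Input  { std::shared_ptr<Layout> pMBLayout; }; // stand-in for one input stream
using Inputs = std::map<std::wstring, Input>;

class ReaderBase // models DataReaderBase
{
protected:
    virtual bool TryGetMinibatch(Inputs& mb) = 0; // old readers only fill data
    void SetMinibatchLayout(Inputs& mb)           // the base class handles the layout
    {
        assert(!mb.empty());
        auto& pMBLayout = mb.begin()->second.pMBLayout;
        for (const auto& iter : mb)               // old readers: one layout for all inputs
            assert(iter.second.pMBLayout == pMBLayout);
        (void)pMBLayout;                          // the real code copies it via CopyMBLayoutTo()
    }
public:
    bool GetMinibatch(Inputs& mb)
    {
        if (!TryGetMinibatch(mb))
            return false;
        SetMinibatchLayout(mb);
        return true;
    }
};

struct LegacyReader : ReaderBase // hypothetical old-style reader
{
    bool TryGetMinibatch(Inputs& mb) override
    {
        auto shared = std::make_shared<Layout>();
        for (auto& kv : mb)
            kv.second.pMBLayout = shared;         // single shared layout, as required
        return true;
    }
};

int main()
{
    LegacyReader r;
    Inputs mb = { { L"features", Input{} }, { L"labels", Input{} } };
    return r.GetMinibatch(mb) ? 0 : 1;
}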
template <class ConfigRecordType>
void DataReader::InitFromConfig(const ConfigRecordType& /*config*/)
{

View file

@@ -59,28 +59,29 @@ public:
/*const*/ TensorShape sampleLayout;
// constructor
Input(MatrixBasePtr matrix, MBLayoutPtr pMBLayout, TensorShape sampleLayout) : matrix(matrix), pMBLayout(pMBLayout), sampleLayout(sampleLayout)
Input(MatrixBasePtr matrix, MBLayoutPtr pMBLayout, TensorShape sampleLayout) :
matrix(matrix), pMBLayout(pMBLayout), sampleLayout(sampleLayout)
{
assert(matrix);
}
Input(){} // some STL classes need this for general happiness
Input() {} // some STL classes need this for general happiness
// helper for typecasting the matrix pointer
template<class ElemType>
template<class ElemType>
Matrix<ElemType>& GetMatrix(const wchar_t* name/*for debugging only*/ = L"(unknown)") const
{
{
assert(matrix);
auto* matrixp = dynamic_cast<Matrix<ElemType>*>(matrix.get());
if (!matrixp)
{
// print a rather rich error to track down a regression failure
if (!matrixp)
{
// print a rather rich error to track down a regression failure
auto isFloat = !!dynamic_cast<Matrix<float>*> (matrix.get());
auto isDouble = !!dynamic_cast<Matrix<double>*>(matrix.get());
LogicError("GetMatrix<%s>: Attempted to access input stream '%ls' with wrong precision, got %s {%d,%d} instead of %s.",
typeid(ElemType).name(), name, typeid(matrix.get()).name(), (int)isFloat, (int)isDouble, typeid(Matrix<ElemType>*).name());
}
return *matrixp;
}
return *matrixp;
}
};
private:
@@ -246,6 +247,21 @@ typedef std::shared_ptr<IDataReader> IDataReaderPtr;
extern "C" DATAREADER_API void GetReaderF(IDataReader** preader);
extern "C" DATAREADER_API void GetReaderD(IDataReader** preader);
// The sole purpose of this base class is to provide backwards compatibility for (old)
// readers that do not support multiple mb layouts.
class DataReaderBase : public IDataReader
{
protected:
// Verifies that all inputs share the same layout (have the same layout pointer)
// and copies the provided layout into the minibatch layout.
// This method is needed for backwards-compatibility and only meant to be used by old readers!
void SetMinibatchLayout(StreamMinibatchInputs& minibatch);
virtual bool TryGetMinibatch(StreamMinibatchInputs& matrices) = 0;
public:
virtual bool GetMinibatch(StreamMinibatchInputs& matrices) override;
};
// Data Reader class
// interface for clients of the Data Reader
// mirrors the IDataReader interface, except the Init method is private (use the constructor)
@@ -292,7 +308,6 @@ class DataReader : public IDataReader, protected Plugin, public ScriptableObject
// NOTE: this destroys the object, and it can't be used past this point.
// The reason why this is not just a destructor is that it goes across a DLL boundary.
virtual void Destroy() override;
public:
// DataReader Constructor
// config - [in] configuration parameters for the datareader

View file

@@ -100,29 +100,29 @@ struct MBLayout
{
return seqId == other.seqId && s == other.s && tBegin == other.tBegin && tEnd == other.tEnd;
}
size_t GetNumTimeSteps() const
{
return (size_t)(tEnd - tBegin);
}
size_t GetNumTimeSteps() const { return (size_t)(tEnd - tBegin); }
};
// -------------------------------------------------------------------
// construction
// -------------------------------------------------------------------
MBLayout(size_t numParallelSequences, size_t numTimeSteps)
MBLayout(size_t numParallelSequences, size_t numTimeSteps, const std::wstring &name)
: m_distanceToStart(CPUDEVICE), m_distanceToEnd(CPUDEVICE), m_columnsValidityMask(CPUDEVICE)
{
Init(numParallelSequences, numTimeSteps);
SetUniqueAxisName(name != L"" ? name : L"DynamicAxis");
}
MBLayout()
: MBLayout(1, 0)
: MBLayout(1, 0, L"")
{
}
// copy the content of another MBLayoutPtr over
// Use this instead of actual assignment to make it super-obvious that this is not copying the pointer but actual content. The pointer is kept fixed.
void CopyFrom(const MBLayoutPtr& other)
// Use "keepName" if the "identity" of the target is to be preserved, e.g.
// while copying from reader space to network space.
void CopyFrom(const MBLayoutPtr& other, bool keepName=false)
{
m_numTimeSteps = other->m_numTimeSteps;
m_numParallelSequences = other->m_numParallelSequences;
@@ -141,7 +141,8 @@ struct MBLayout
m_columnsValidityMask.SetValue(other->m_columnsValidityMask);
m_writable = other->m_writable;
m_axisName = other->m_axisName;
if (!keepName)
m_axisName = other->m_axisName;
}
// Destructive copy that steals ownership of the content, like std::move()
@@ -275,7 +276,7 @@ public:
}
// return all sequences stored in this minibatch
const vector<SequenceInfo> &GetAllSequences() const
const vector<SequenceInfo>& GetAllSequences() const
{
return m_sequences;
}
@@ -287,7 +288,7 @@ public:
const Matrix<char>& GetColumnsValidityMask(DEVICEID_TYPE deviceId) const;
// compare whether two layouts are the same
bool operator==(const MBLayout &other) const
bool operator==(const MBLayout& other) const
{
if (this == &other)
return true;
@@ -441,8 +442,8 @@ public:
bool HasGaps(const FrameRange &fr) const;
// test boundary flags for a specific condition
bool IsBeyondStartOrEnd(const FrameRange &fr) const;
bool IsGap(const FrameRange &fr) const;
bool IsBeyondStartOrEnd(const FrameRange& fr) const;
bool IsGap(const FrameRange& fr) const;
// test whether at least one sequence crosses the bounds of this minibatch
bool HasSequenceBeyondBegin() const
@@ -555,7 +556,7 @@ private:
// Meant to guard in lazy creation of m_columnsValidityMask.
mutable bool m_writable;
// the axis
// The axis this MBLayout represents.
// For now only a string meant for debugging.
std::wstring m_axisName;
@@ -751,6 +752,7 @@ inline bool MBLayout::HasGaps() const
{
return m_numGapFrames > 0; /*HasGaps(FrameRange());*/
}
inline bool MBLayout::HasGaps(const FrameRange &fr) const
{
CheckIsValid();
@@ -828,7 +830,7 @@ inline size_t MBLayout::GetActualNumSamples() const { return m_numFramesDeclared
// only called from MaskMissingColumnsTo()
// TODO: Can probably be faster by using the sequence array directly.
// TODO: Or should we just blast m_distanceToStart to GPU, and mask based on that? It is small compared to features.
inline const Matrix<char> &MBLayout::GetColumnsValidityMask(DEVICEID_TYPE deviceId) const
inline const Matrix<char>& MBLayout::GetColumnsValidityMask(DEVICEID_TYPE deviceId) const
{
CheckIsValid();
// lazily compute the validity mask
@@ -947,7 +949,7 @@ static inline std::pair<size_t, size_t> ColumnRangeWithMBLayoutFor(size_t numCol
// MBLayout of data and of FrameRange must be identical pointers,
// or in case of broadcasting, respective parent pointers.
// MBLayouts that are identical in content but not object identity (pointer) are not admissible.
// For those cases, use a ReconcileMBLayout node.
// For those cases, use a ReconcileDynamicAxis node.
if (fr.m_pMBLayout != pMBLayout)
{
// if broadcast allowed then it is allowed to broadcast from an outer-loop value
@@ -955,9 +957,9 @@ static inline std::pair<size_t, size_t> ColumnRangeWithMBLayoutFor(size_t numCol
if (fr.m_broadcastAllowed && !pMBLayout && numCols == 1)
return std::pair<size_t, size_t>(0, numCols);
if (fr.m_pMBLayout && pMBLayout && *fr.m_pMBLayout == *pMBLayout)
LogicError("DataFor: FrameRange's MBLayout inconsistent with matrix. They are compatible though--are you missing a ReconcileMBLayout operation?");
LogicError("DataFor: FrameRange's dynamic axis is inconsistent with matrix. They are compatible though--are you missing a ReconcileDynamicAxis operation?");
else
LogicError("DataFor: FrameRange's MBLayout inconsistent with matrix.");
LogicError("DataFor: FrameRange's dynamic axis is inconsistent with matrix.");
}
// if FrameRange refers to whole minibatch (map mode)
// or if we don't even have a layout
@@ -1040,7 +1042,7 @@ static inline std::pair<DimensionVector, DimensionVector> TensorSliceWithMBLayou
// MBLayout of data and of FrameRange must be identical pointers,
// or in case of broadcasting, respective parent pointers.
// MBLayouts that are identical in content but not object identity (pointer) are not admissible.
// For those cases, use a ReconcileMBLayout node.
// For those cases, use a ReconcileDynamicAxis node.
if (isTimeIteration && fr.m_pMBLayout != pMBLayout)
{
// if broadcast allowed then it is allowed to broadcast from an outer-loop value
@@ -1048,10 +1050,10 @@ static inline std::pair<DimensionVector, DimensionVector> TensorSliceWithMBLayou
if (fr.m_pMBLayout /*get data for a loop*/ && !pMBLayout /*'data' is not samples*/ && fr.m_broadcastAllowed /*we're OK with that*/)
; // the time dimension is broadcasting--leave it as is
else if (fr.m_pMBLayout && pMBLayout && *fr.m_pMBLayout == *pMBLayout)
LogicError("DataFor: FrameRange's MBLayout inconsistent with matrix. They are compatible though--are you missing a ReconcileMBLayout operation? %s vs. %s",
LogicError("DataFor: FrameRange's dynamic axis is inconsistent with matrix. They are compatible though--are you missing a ReconcileDynamicAxis operation? %s vs. %s",
static_cast<string>(*(fr.m_pMBLayout)).c_str(), static_cast<string>(*(pMBLayout)).c_str());
else
LogicError("DataFor: FrameRange's MBLayout inconsistent with matrix: %s vs. %s",
LogicError("DataFor: FrameRange's dynamic axis is inconsistent with matrix: %s vs. %s",
static_cast<string>(*(fr.m_pMBLayout)).c_str(), static_cast<string>(*(pMBLayout)).c_str());
}
// if FrameRange refers to whole minibatch (map mode)
@@ -1123,8 +1125,10 @@ static inline void MaskMissingColumnsTo(Matrix<ElemType>& matrixToMask, const MB
TensorView<ElemType>(matrixSliceToMask).DoMaskNegativeOf(0, TensorView<ElemType>(matrixSliceToMask), TensorView<ElemType>(maskSlice), 1); val;
#else
const auto& maskMatrix = pMBLayout->GetColumnsValidityMask(matrixToMask.GetDeviceId());
maskMatrix.TransferToDeviceIfNotThere(matrixToMask.GetDeviceId(), /*ismoved=*/ false, /*emptyTransfer=*/ false, /*updatePreferredDevice=*/ false);
auto maskSlice = DataWithMBLayoutFor(maskMatrix, fr, pMBLayout);
auto matrixSliceToMask = DataWithMBLayoutFor(matrixToMask, fr, pMBLayout);
matrixSliceToMask.MaskColumnsValue(maskSlice, val);
#endif
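Editor's note: a fragment illustrating the two additions in this file, the named constructor and the keepName copy. It assumes the MBLayout declaration above and is not standalone.

auto readerLayout  = make_shared<MBLayout>(1, 0, L"myAxis"); // explicit axis name
auto networkLayout = make_shared<MBLayout>(1, 0, L"");       // L"" falls back to L"DynamicAxis"
// copy frames/sequences from the reader side, but keep the target's own name:
networkLayout->CopyFrom(readerLayout, /*keepName=*/true);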

View file

@@ -79,7 +79,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// - special case: swapping between sample and MBLayout, e.g. turn a sample dimension to a time dimension
// - Validate() stage will automatically infer tensor dimensions from inputs, and also infer downwards into LearnableParameters where requested
//
// Interfacing to and inplementation in Matrix lib:
// Interfacing to and implementation in Matrix lib:
// - a Tensor is realized as a type TensorView = { Matrix&, TensorShape& } (i.e. tensors don't own their memory)
// - Matrix lib will contain overloads for relevant operations that take Tensor& instead of Matrix&.
// - elementwise ops will go through a single bottleneck function that deals with matching dimensions (extend, broadcast) and flattening

View file

@@ -50,13 +50,14 @@ public:
ComputationNetwork() :
m_randomSeedOffset(0),
m_isCompiled(false),
m_areMatricesAllocated(false),
m_pMBLayoutOfNetwork(make_shared<MBLayout>()),
m_isCompiled(false),
m_areMatricesAllocated(false),
m_pMBLayoutOfNetwork(make_shared<MBLayout>(1, 0, L"*")),
m_environment(make_shared<ComputationEnvironment>())
{
m_pMBLayoutOfNetwork->SetAxisName(L"T");
//m_pMBLayoutOfNetwork->SetAxisName(L"T");
}
ComputationNetwork(DEVICEID_TYPE deviceId)
: ComputationNetwork()
{
@@ -289,6 +290,8 @@ public:
// This returns max number of columns over the feature nodes.
// Note that if we have multiple slices, MB size != #frames.
// BUGBUG: This will break once we have inconsistent layouts.
// BUGBUG: The number computed here is completely off (if the layout has gaps
// they will also be counted towards the actualMBSize)
size_t DetermineActualMBSizeFromFeatures() const
{
size_t actualMBSize = 0;

View file

@@ -48,6 +48,7 @@ static shared_ptr<ComputationNode<ElemType>> CreateStandardNode(const std::wstri
else if (nodeType == OperationNameOf(DiagTimesNode)) return New<DiagTimesNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(DropoutNode)) return New<DropoutNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(DummyCriterionNode)) return New<DummyCriterionNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(DynamicAxisNode)) return New<DynamicAxisNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ElementTimesNode)) return New<ElementTimesNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(EnvironmentInputNode)) return New<EnvironmentInputNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ErrorPredictionNode)) return New<ErrorPredictionNode<ElemType>>(forward<_Types>(_Args)...);
@@ -75,7 +76,7 @@ static shared_ptr<ComputationNode<ElemType>> CreateStandardNode(const std::wstri
else if (nodeType == OperationNameOf(PerDimMeanVarDeNormalizationNode)) return New<PerDimMeanVarDeNormalizationNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(PassNode)) return New<PassNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(PlusNode)) return New<PlusNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ReconcileMBLayoutNode)) return New<ReconcileMBLayoutNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ReconcileDynamicAxisNode)) return New<ReconcileDynamicAxisNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ReciprocalNode)) return New<ReciprocalNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(RectifiedLinearNode)) return New<RectifiedLinearNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ReshapeNode)) return New<ReshapeNode<ElemType>>(forward<_Types>(_Args)...);
@@ -110,6 +111,7 @@ static shared_ptr<ComputationNode<ElemType>> CreateStandardNode(const std::wstri
// TODO: DiagTimes is also an alias of ElementTimes; current separate implementation is unnecessary.
else if (nodeType == L"PerDimMeanVarNormalizationNode") return New<PerDimMeanVarNormalizationNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == L"PerDimMeanVarDeNormalizationNode") return New<PerDimMeanVarDeNormalizationNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == L"ReconcileMBLayout") return New<ReconcileDynamicAxisNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == L"RowElementTimes") return New<ElementTimesNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == L"RowSlice") return New<SliceNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == L"Scale") return New<ElementTimesNode<ElemType>>(forward<_Types>(_Args)...);
@@ -193,28 +195,29 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Creat
return net.AddNodeToNetWithElemType(New<LearnableParameter<ElemType>>(net.GetDeviceId(), paramName, tensorShape));
}
// TODO: change these to take an actual object instead of a name for dynamicAxis
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateInputNode(const std::wstring& inputName, const size_t rows)
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateInputNode(const std::wstring& inputName, const size_t rows, const wstring& dynamicAxisName)
{
return net.AddNodeToNetWithElemType(New<InputValue<ElemType>>(net.GetDeviceId(), inputName, rows));
return net.AddNodeToNetWithElemType(New<InputValue<ElemType>>(net.GetDeviceId(), inputName, rows, dynamicAxisName));
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateSparseInputNode(const std::wstring& inputName, const size_t rows)
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateSparseInputNode(const std::wstring& inputName, const size_t rows, const wstring& dynamicAxisName)
{
return net.AddNodeToNetWithElemType(New<SparseInputValue<ElemType>>(net.GetDeviceId(), inputName, rows));
return net.AddNodeToNetWithElemType(New<SparseInputValue<ElemType>>(net.GetDeviceId(), inputName, rows, dynamicAxisName));
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout)
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName)
{
return net.AddNodeToNetWithElemType(New<InputValue<ElemType>>(net.GetDeviceId(), inputName, sampleLayout));
return net.AddNodeToNetWithElemType(New<InputValue<ElemType>>(net.GetDeviceId(), inputName, sampleLayout, dynamicAxisName));
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateSparseInputNode(const std::wstring& inputName, const TensorShape& imageLayout)
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateSparseInputNode(const std::wstring& inputName, const TensorShape& imageLayout, const wstring& dynamicAxisName)
{
return net.AddNodeToNetWithElemType(New<SparseInputValue<ElemType>>(net.GetDeviceId(), inputName, imageLayout));
return net.AddNodeToNetWithElemType(New<SparseInputValue<ElemType>>(net.GetDeviceId(), inputName, imageLayout, dynamicAxisName));
}
template <class ElemType>
@@ -461,6 +464,12 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Pass(
return net.AddNodeToNetAndAttachInputs(New<PassNode<ElemType>>(net.GetDeviceId(), nodeName), { a });
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::DynamicAxis(const ComputationNodePtr a, const std::wstring& nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<DynamicAxisNode<ElemType>>(net.GetDeviceId(), nodeName), { a });
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::InvStdDev(const ComputationNodePtr a, const std::wstring nodeName)
{

View file

@@ -48,10 +48,10 @@ public:
ComputationNodePtr CreateLearnableParameter(const std::wstring& paramName, const TensorShape& tensorShape);
// sparse matrix size is optionally specified
// ComputationNodePtr CreateSparseLearnableParameter(const std::wstring & paramName, const size_t rows, const size_t cols, const size_t size = 0);
ComputationNodePtr CreateInputNode(const std::wstring& inputName, const size_t rows);
ComputationNodePtr CreateSparseInputNode(const std::wstring& inputName, const size_t rows);
ComputationNodePtr CreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout);
ComputationNodePtr CreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout);
ComputationNodePtr CreateInputNode(const std::wstring& inputName, const size_t rows, const wstring& dynamicAxisName = L"");
ComputationNodePtr CreateSparseInputNode(const std::wstring& inputName, const size_t rows, const wstring& dynamicAxisName = L"");
ComputationNodePtr CreateInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName = L"");
ComputationNodePtr CreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName = L"");
ComputationNodePtr CreateConvolutionNode(const std::wstring& nodeName, const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& strideShape,
const std::vector<bool>& sharing, const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples);
@@ -108,6 +108,7 @@ public:
ComputationNodePtr Dropout(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr DummyCriterion(const ComputationNodePtr objectives, const ComputationNodePtr derivatives, const ComputationNodePtr prediction, const std::wstring nodeName = L"");
ComputationNodePtr ElementTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr DynamicAxis(const ComputationNodePtr a, const std::wstring& nodeName = L"");
ComputationNodePtr ErrorPrediction(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr Exp(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr FutureValue(const ComputationNodePtr a, const float initHiddenActivity, const size_t row_size, size_t timeStep, const std::wstring nodeName = L"");

View file

@@ -517,9 +517,8 @@ void ComputationNetwork::DetermineSetOfAllRoots()
}
// initial setup of MBLayout pointers
// - link all input nodes to one or more MBLayouts --TODO: Currently only one
// - link all input nodes to one or more MBLayouts
// - reset all others to nullptr, in expectation of a ValidateNetwork() pass
// BUGBUG (Issue #95): Change this to use different MBLayouts for different inputs if so configured.
void ComputationNetwork::ResetMBLayouts()
{
// reset to a well-defined MBLayout (any meaningful layout should do here)
@@ -530,10 +529,42 @@ void ComputationNetwork::ResetMBLayouts()
for (const auto& node : GetAllNodesForRoot(nullptr))
node->LinkToMBLayout(nullptr);
// then fix up inputs (all others get propagated upwards through Validate())
// BUGBUG (Issue #95): Once we support mismatching layouts, this will be more involved. For now, everything shares the one layout that the Network knows about.
// DynamicAxis nodes are (apart from the soon-to-be-deprecated network-wide MBLayout) the main holders of MBLayouts. Initialize them.
// The only other instances are nodes that change the MBLayout, like WhereNode.
for (auto node : GetNodesWithType(L"DynamicAxis"))
node->LinkToMBLayout(make_shared<MBLayout>(1, 0, node->GetName()));
// This is now initialized inside of the Input nodes, with the proper connections.
for (auto node : InputNodes(nullptr))
node->LinkToMBLayout(m_pMBLayoutOfNetwork);
{
// TODO: use if (!Is<ITakesDynamicAxis>(node))...
auto n = dynamic_pointer_cast<ITakesDynamicAxis>(node);
if (!n)
LogicError("Expected %ls to implement ITakesDynamicAxis, but it doesn't.", node->NodeDescription().c_str());
std::wstring axisName = n->GetRequestedDynamicAxis();
if (axisName == L"")
{
// Legacy behavior: One shared MBLayout
// TODO Remove m_pMBLayoutOfNetwork altogether. See issue 358.
node->LinkToMBLayout(m_pMBLayoutOfNetwork);
}
else
{
auto axisNode = GetNodeFromName(axisName);
if (!axisNode)
RuntimeError("%ls: Can't find node '%ls' for retrieving dynamic axis.", axisNode->NodeDescription().c_str(), axisName.c_str());
// For now we require the node to be a DynamicAxisNode, though we could derive the same from other nodes. This would involve
// more dependencies on the order in which things are evaluated, though.
if (axisNode->OperationName() != L"DynamicAxis")
RuntimeError("%ls: dynamicAxis argument must be of type DynamicAxis(), but got %ls.", node->NodeDescription().c_str(), axisNode->NodeDescription().c_str());
if (!axisNode->HasMBLayout())
LogicError("%ls: Expected %ls to have MBLayout, but it doesn't.", node->NodeDescription().c_str(), axisNode->NodeDescription().c_str());
node->LinkToMBLayout(axisNode->GetMBLayout());
}
}
}
// -----------------------------------------------------------------------
@@ -661,6 +692,11 @@ size_t ComputationNetwork::ValidateNodes(list<ComputationNodeBasePtr> nodes, boo
{
hasVisitedChild |= child->m_visited; // if not a single visited child then no point in validating
allChildrenVisited &= child->m_visited;
// Make sure we don't use DynamicAxis in places where it was not designed for.
// This is a stop-gap. We need a more coherent concept for passing of shapes.
if (child->OperationName() == L"DynamicAxis")
RuntimeError("%ls: Cannot be used as input to another node. It can only be used on the 'dynamicAxis' property of an Input node.", child->NodeDescription().c_str());
}
// if there is not at least one visited child

View file

@@ -100,7 +100,7 @@ void ComputationNodeBase::InferMBLayoutFromInputsForStandardCase(bool isFinalVal
else if (!pMBLayout) // first non-NULL layout: just copy it
pMBLayout = child->m_pMBLayout;
else if (pMBLayout != child->m_pMBLayout && isFinalValidationPass) // got a layout--compare whether it is the same
RuntimeError("%ls: InferMBLayoutFromInputsForStandardCase: Expected minibatch layouts to be the same between all children. Child '%ls' (%ls) uses a different layout than previously checked children and might get out of sync during runtime. If this is by design, use ReconcileMBLayout() to forward layouts between nodes.",
RuntimeError("%ls: InferMBLayoutFromInputsForStandardCase: Expected minibatch layouts to be the same between all children. Child '%ls' (%ls) uses a different layout than previously checked children and might get out of sync during runtime. If this is by design, use ReconcileDynamicAxis() to forward layouts between nodes.",
NodeDescription().c_str(), child->NodeName().c_str(), child->OperationName().c_str());
}
// all are consistent: install it
@@ -130,7 +130,7 @@ void ComputationNodeBase::ValidateBinaryZip(bool isFinalValidationPass, bool all
if (isFinalValidationPass &&
Input(0)->GetMBLayout() != Input(1)->GetMBLayout() && Input(0)->HasMBLayout() && Input(1)->HasMBLayout())
{
LogicError("%ls: Minibatch layouts are not the same between arguments and might get out of sync during runtime. If this is by design, use ReconcileMBLayout() to forward layouts between nodes.", NodeDescription().c_str());
LogicError("%ls: Minibatch layouts are not the same between arguments and might get out of sync during runtime. If this is by design, use ReconcileDynamicAxis() to forward layouts between nodes.", NodeDescription().c_str());
}
// result has tensor shape with dimensions being the max over both
@@ -176,6 +176,7 @@ void ComputationNodeBase::ValidateBinaryReduce(bool isFinalValidationPass)
ComputationNodeBase::Validate(isFinalValidationPass);
m_pMBLayout = nullptr; // this node does not hold mini-batch data
ValidateInferBinaryInputDims();
if (isFinalValidationPass)
{
if (!(Input(0)->GetSampleLayout().IsElementwiseCompatibleWith(Input(1)->GetSampleLayout())))
@@ -338,18 +339,25 @@ TensorShape ComputationNodeBase::GetOneSampleTensorSliceFor(size_t rank, const F
prototype += "NULL";
continue;
}
prototype += msra::strfun::strprintf("[%s%ls]", string(child->m_sampleLayout).c_str(), child->GetMBLayoutAxisString().c_str());
prototype += child->ShapeDescription().c_str();
}
prototype += extraArgs;
//prototype += ")";
}
prototype += msra::strfun::strprintf(" -> [%s%ls]", string(GetSampleLayout()).c_str(), GetMBLayoutAxisString().c_str());
prototype += msra::strfun::strprintf(" -> %s", ShapeDescription().c_str());
return prototype;
}
const std::string ComputationNodeBase::ShapeDescription() const
{
return msra::strfun::strprintf("[%s%s%ls]",
string(m_sampleLayout).c_str(),
HasMBLayout() ? " x " : "",
HasMBLayout() ? GetMBLayout()->GetAxisName() : L"");
}
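Editor's note: a hypothetical trace line under the new format, assuming the network default axis name "*" set in the ComputationNetwork constructor earlier in this diff:

// ShapeDescription() renders "[512]" for a node without an MBLayout (no " x " part)
// and "[512 x *]" for one bound to the axis named "*", so a prototype now reads:
//   Plus(a[512 x *], b[512]) -> [512 x *]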
template <class ElemType>
/*virtual*/ void ComputationNode<ElemType>::DumpNodeInfo(const bool /*printValues*/, const bool printMetadata, File& fstream) const
{

View file

@@ -36,7 +36,8 @@
#define CNTK_MODEL_VERSION_5 5 // ND convolution and pooling
#define CNTK_MODEL_VERSION_6 6 // Batch norm blending
#define CNTK_MODEL_VERSION_7 7 // ElemType tag in model file
#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_7
#define CNTK_MODEL_VERSION_8 8 // DynamicAxis for inputs
#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_8
extern bool g_shareNodeValueMatrices;
@@ -553,9 +554,14 @@ public:
// helper for the factory function for ComputationNodes
static vector<ComputationNodeBasePtr> GetInputsFromConfig(const ScriptableObjects::IConfigRecordPtr configp)
{
return GetInputsFromConfig(configp, L"inputs");
}
static vector<ComputationNodeBasePtr> GetInputsFromConfig(const ScriptableObjects::IConfigRecordPtr configp, const std::wstring& property)
{
vector<ComputationNodeBasePtr> inputs;
const auto* inputsArg = configp->Find(L"inputs");
const auto* inputsArg = configp->Find(property);
if (inputsArg)
{
if (inputsArg->Is<ComputationNodeBase>()) // single arg
@ -817,6 +823,9 @@ public:
return std::wstring(L"Node '") + NodeName().c_str() + L"' (" + OperationName().c_str() + L" operation)";
};
// Helper that returns [a x b x c], including dynamic axes.
const std::string ShapeDescription() const;
protected:
// -----------------------------------------------------------------------
@ -851,7 +860,8 @@ protected:
typedef ComputationNodeBase::ComputationNodeBasePtr ComputationNodeBasePtr;
// =======================================================================
// NumInputs -- little helper interface to allow derived Node classes to specify how many inputs they expect
// NumInputs -- little helper interface to allow derived Node classes to
// specify how many inputs they expect
// =======================================================================
struct INumInputs { virtual size_t GetExpectedNumInputs() const = 0; };
@ -864,6 +874,14 @@ struct NumInputs : public INumInputs // e.g. derive from NumInputs<2>
}
};
// =======================================================================
// Nodes that can take a dynamic axis need to implement this.
// =======================================================================
struct ITakesDynamicAxis
{
virtual const std::wstring GetRequestedDynamicAxis() const = 0;
};
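A node opts into the mechanism by reporting the axis name it was configured with. A minimal hypothetical implementer (illustration only; the one real implementer in this commit is InputValueBase below, and the other required ComputationNode overrides are elided here):

template <class ElemType>
class MyAxisAwareNode : public ComputationNode<ElemType>, public ITakesDynamicAxis
{
    std::wstring m_axisName; // hypothetical: filled from config
public:
    virtual const std::wstring GetRequestedDynamicAxis() const override { return m_axisName; }
    // (ForwardProp/BackpropTo and other pure-virtual members elided)
};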
// =======================================================================
// ComputationNode -- abstract base class for computation nodes, deriving
// from ComputationNodeBase, parameterized by float vs. double
@ -1004,7 +1022,7 @@ public:
if (inputs[i])
m_inputs[i] = DownCast(inputs[i]); // (DownCast() checks the type; the assignment then downcasts it again)
else
m_inputs[i] = nullptr; // during network creation, nullpts are possible
m_inputs[i] = nullptr; // during network creation, nullptrs are possible
}
protected:
@ -1406,7 +1424,7 @@ public:
virtual void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override
{
if (IsValueSharable())
RequestMatrixFromPool(m_value, matrixPool);
RequestMatrixFromPool(m_value, matrixPool);
else
CreateMatrixIfNull(m_value);
}

View File

@ -108,6 +108,47 @@ public:
virtual void DumpNodeInfo(const bool printValues, const bool printMetadata, File& fstream) const override;
};
// -----------------------------------------------------------------------
// DynamicAxisNode (/*no input*/)
// This is a holder for MBLayout objects shared across inputs.
// -----------------------------------------------------------------------
template <class ElemType>
class DynamicAxisNode : public ComputationNode<ElemType>, public NumInputs<0>
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName() { return L"DynamicAxis"; }
public:
DynamicAxisNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name)
{
// BUGBUG: In BS, the node name is not known during node instantiation.
// This may require passing the display name as a separate parameter.
// This is the whole point of this class: Introduce a new MBLayout that others can use.
LinkToMBLayout(make_shared<MBLayout>(1, 0, name));
// We need some shape, or validation fails.
SetDims(TensorShape(1,1), true);
}
DynamicAxisNode(const ScriptableObjects::IConfigRecordPtr configp)
: DynamicAxisNode(configp->Get(L"deviceId"), L"<placeholder>")
{
}
virtual void /*ComputationNode::*/ ForwardProp(const FrameRange&) override
{
RuntimeError("%ls is a special node only to be used as input to the Input() node.", NodeDescription().c_str());
}
virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange&)
{
LogicError("%ls is a leaf node. BackpropTo() should never be called.", NodeDescription().c_str());
}
};
template class DynamicAxisNode<float>;
template class DynamicAxisNode<double>;
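Putting the pieces together, a hedged sketch of the wiring this enables (deviceId is assumed to be in scope; in practice the wiring happens through BrainScript, as in the seqcla.cntk example later in this commit):

// Roughly what  t = DynamicAxis(); features = Input(1, dynamicAxis=t)  expands to:
auto axis = make_shared<DynamicAxisNode<float>>(deviceId, L"t");  // owns a fresh MBLayout named "t"
auto feat = make_shared<InputValue<float>>(deviceId, L"features",
                                           TensorShape(1), L"t"); // requests that axis by name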
// -----------------------------------------------------------------------
// InputValueBase (/*no input*/)
// Base class for InputValue and SparseInputValue (typically fed by a DataReader)
@ -116,12 +157,12 @@ public:
// -----------------------------------------------------------------------
template <class ElemType>
class InputValueBase : public ComputationNode<ElemType>, public NumInputs<0>
class InputValueBase : public ComputationNode<ElemType>, public NumInputs<0>, public ITakesDynamicAxis
{
typedef ComputationNode<ElemType> Base;
UsingComputationNodeMembers;
void Init(const TensorShape& sampleLayout, bool isSparse)
void Init(const TensorShape& sampleLayout, bool isSparse, const std::wstring axisName)
{
m_isSparse = isSparse;
MarkValueNonSharable();
@ -131,33 +172,61 @@ class InputValueBase : public ComputationNode<ElemType>, public NumInputs<0>
SetDims(sampleLayout, HasMBLayout()); // also called when reloading a file. Then we have an MBLayout, otherwise not yet
UpdateFunctionValuesSize(); // we must allocate the matrix so that the readers get objects with valid row dimensions (some readers expect that)
SetLearningRateMultiplier(0);
m_dynamicAxisNodeName = axisName;
}
protected:
InputValueBase(DEVICEID_TYPE deviceId, const wstring& name, const TensorShape& sampleLayout, bool isSparse)
InputValueBase(DEVICEID_TYPE deviceId, const wstring& name, const TensorShape& sampleLayout, bool isSparse, const std::wstring axisName)
: Base(deviceId, name)
{
Init(sampleLayout, isSparse);
Init(sampleLayout, isSparse, axisName);
}
InputValueBase(DEVICEID_TYPE deviceId, const wstring& name, size_t rows, bool isSparse)
: InputValueBase(deviceId, name, TensorShape(rows), isSparse)
InputValueBase(DEVICEID_TYPE deviceId, const wstring& name, size_t rows, bool isSparse, const std::wstring axisName)
: InputValueBase(deviceId, name, TensorShape(rows), isSparse, axisName)
{
}
InputValueBase(DEVICEID_TYPE deviceId, const wstring& name, bool isSparse)
: InputValueBase(deviceId, name, TensorShape(), isSparse)
InputValueBase(DEVICEID_TYPE deviceId, const wstring& name, bool isSparse, const std::wstring axisName)
: InputValueBase(deviceId, name, TensorShape(), isSparse, axisName)
{
}
InputValueBase(const ScriptableObjects::IConfigRecordPtr configp, bool isSparse)
: Base(configp->Get(L"deviceId"), L"<placeholder>")
{
AttachInputsFromConfig(configp, this->GetExpectedNumInputs());
wstring axisName = L"";
// TODO This currently reads a ComputationNode object from a property, thereby bypassing "normal" input handling.
// The passing of shapes represents a second graph that is "overlaid" on (and previously identical to) the data
// flow network. This needs to be solved on a more fundamental level.
// The proposed future change from fseide is as follows:
// (2) On BS level, dynamicAxis is an optional parameter that takes a DynamicAxis object--the alternative,
// passing a string, will be removed.
// (3) The dynamicAxis argument will become an actual m_inputs[] to the InputValue. I.e., InputValues are no
// longer leaves from the ComputationNetwork viewpoint. But they ARE leaves from the user / BS / NDL view, as
// the axis is not passed as a regular input. This way, the current special-casing can and will be removed;
// instead, the MBLayout propagation will happen automagically as part of regular ValidateNetwork().
if (configp->Exists(L"dynamicAxis"))
{
auto axisConfig = configp->Find(L"dynamicAxis");
if (axisConfig->Is<ComputationNodeBase>())
{
ComputationNodeBasePtr axis = configp->Get(L"dynamicAxis");
axisName = axis->GetName();
}
else
{
axisName = (const std::wstring&)*axisConfig;
}
}
bool isImage = configp->Get(L"isImage");
if (!isImage)
Init(configp->Get(L"shape"), isSparse);
Init(configp->Get(L"shape"), isSparse, axisName);
else
Init(ImageDimensions::AsTensorShape(configp->Get(L"imageWidth"), configp->Get(L"imageHeight"), configp->Get(L"imageChannels"), ImageLayoutKindFrom(configp->Get(L"imageLayout"))), isSparse);
Init(ImageDimensions::AsTensorShape(configp->Get(L"imageWidth"), configp->Get(L"imageHeight"), configp->Get(L"imageChannels"), ImageLayoutKindFrom(configp->Get(L"imageLayout"))), isSparse, axisName);
}
virtual const std::wstring GetRequestedDynamicAxis() const { return m_dynamicAxisNodeName; }
public:
virtual void Save(File& fstream) const override
{
@ -166,6 +235,10 @@ public:
size_t colsDummy = 0;
fstream << rowsDummy << colsDummy;
m_sampleLayout.Save(fstream);
unsigned int nrAxes = 1;
fstream << nrAxes;
fstream << m_dynamicAxisNodeName;
}
virtual void Load(File& fstream, size_t modelVersion) override
@ -180,10 +253,22 @@ public:
if (rows != 0 /*old file*/ && rows != sampleLayout.GetNumElements() /*even older file*/)
{
fprintf(stderr, "WARNING: %ls InputValue has inconsistent serialized sample layout %s vs. number of rows %d. Resetting sample layout to vector.\n",
NodeName().c_str(), string(sampleLayout).c_str(), (int) rows);
NodeName().c_str(), string(sampleLayout).c_str(), (int)rows);
sampleLayout = TensorShape(rows);
}
Init(sampleLayout, m_isSparse);
if (modelVersion >= CNTK_MODEL_VERSION_8)
{
unsigned int nrAxes;
fstream >> nrAxes;
if (nrAxes == 1)
fstream >> m_dynamicAxisNodeName;
else if (nrAxes > 1)
RuntimeError("Input node: This version only supports a single dynamic axis. Please update your bits.");
}
else
m_dynamicAxisNodeName = L""; // Use default
Init(sampleLayout, m_isSparse, m_dynamicAxisNodeName);
}
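Read together with Save() above, the version-8 serialized record for an Input node ends with the following fields (a sketch inferred from this code, not a format specification):

// size_t       rowsDummy             -- legacy, always 0
// size_t       colsDummy             -- legacy, always 0
// TensorShape  sampleLayout          -- via m_sampleLayout.Save()
// unsigned int nrAxes                -- new in CNTK_MODEL_VERSION_8; only 1 is supported
// wstring      m_dynamicAxisNodeName -- empty string selects the default axis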
// InputValue must not resize its inputs because that might destroy it. It should already have the correct size.
@ -216,6 +301,9 @@ public:
private:
bool m_isSparse = false;
std::wstring m_dynamicAxisNodeName;
ComputationNodeBase* m_dynamicAxisNode;
void ConvertToSparseMatrix()
{
m_value->SwitchToMatrixType(MatrixType::SPARSE, matrixFormatSparseCSC, false);
@ -237,15 +325,19 @@ class InputValue : public InputValueBase<ElemType>
public:
InputValue(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name, false)
: Base(deviceId, name, false, L"")
{
}
InputValue(DEVICEID_TYPE deviceId, const wstring& name, size_t rows)
: Base(deviceId, name, rows, false)
InputValue(DEVICEID_TYPE deviceId, const wstring& name, const wstring& dynamicAxisName)
: Base(deviceId, name, false, dynamicAxisName)
{
}
InputValue(DEVICEID_TYPE deviceId, const wstring& name, const TensorShape& sampleLayout)
: Base(deviceId, name, sampleLayout, false)
InputValue(DEVICEID_TYPE deviceId, const wstring& name, size_t rows, const wstring& dynamicAxisName)
: Base(deviceId, name, rows, false, dynamicAxisName)
{
}
InputValue(DEVICEID_TYPE deviceId, const wstring& name, const TensorShape& sampleLayout, const wstring& dynamicAxisName)
: Base(deviceId, name, sampleLayout, false, dynamicAxisName)
{
}
InputValue(const ScriptableObjects::IConfigRecordPtr configp)
@ -275,15 +367,19 @@ class SparseInputValue : public InputValueBase<ElemType>
public:
SparseInputValue(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name, true)
: Base(deviceId, name, true, L"")
{
}
SparseInputValue(DEVICEID_TYPE deviceId, const wstring& name, size_t rows)
: Base(deviceId, name, rows, true)
SparseInputValue(DEVICEID_TYPE deviceId, const wstring& name, const wstring& dynamicAxisName)
: Base(deviceId, name, true, dynamicAxisName)
{
}
SparseInputValue(DEVICEID_TYPE deviceId, const wstring& name, const TensorShape& imageLayout)
: Base(deviceId, name, imageLayout, true)
SparseInputValue(DEVICEID_TYPE deviceId, const wstring& name, size_t rows, const wstring& dynamicAxisName)
: Base(deviceId, name, rows, true, dynamicAxisName)
{
}
SparseInputValue(DEVICEID_TYPE deviceId, const wstring& name, const TensorShape& imageLayout, const wstring& dynamicAxisName)
: Base(deviceId, name, imageLayout, true, dynamicAxisName)
{
}
SparseInputValue(const ScriptableObjects::IConfigRecordPtr configp)

View File

@ -117,7 +117,7 @@ template <class ElemType>
if (!m_pMBLayout)
{
m_pMBLayout = make_shared<MBLayout>(); // this generates a new layout
m_pMBLayout->SetUniqueAxisName(NodeName());
m_pMBLayout->SetUniqueAxisName(L"WhereNodeAxis");
}
// we map scalars to scalars
if (isFinalValidationPass && Input(0)->GetSampleLayout().GetNumElements() != 1)
@ -157,6 +157,7 @@ template <class ElemType>
result(0, jIndex) = (ElemType)jSource;
}
}
// Note: maybe this is no longer needed, now that we do the same inside UpdateFunctionValueSize() for all nodes.
result.CollapseDataLocationAfterWriting(); // BUGBUG: Move back, since BOTH state is broken at present.
}

View File

@ -171,7 +171,7 @@ template class ReshapeNode<float>;
template class ReshapeNode<double>;
// -----------------------------------------------------------------------
// ReconcileMBLayout (dataInput, layoutInput)
// ReconcileDynamicAxis (dataInput, layoutInput)
// This node copies data from 'dataInput' while it propagates the minibatch-layout information from 'layoutInput'.
// It does perform a runtime check to enforce that the layout of 'dataInput' is compatible (identical content) to that of 'layoutInput'.
// This node is meant to be used from BrainScript macros that bracket expand/reduce pairs of nodes. It is not meant to really be used directly.
@ -179,14 +179,14 @@ template class ReshapeNode<double>;
// -----------------------------------------------------------------------
template <class ElemType>
class ReconcileMBLayoutNode : public ComputationNode<ElemType>, public NumInputs<2>
class ReconcileDynamicAxisNode : public ComputationNode<ElemType>, public NumInputs<2>
{
typedef ComputationNode<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName() { return L"ReconcileMBLayout"; }
static const std::wstring TypeName() { return L"ReconcileDynamicAxis"; }
public:
DeclareConstructorFromConfigWithNumInputs(ReconcileMBLayoutNode);
ReconcileMBLayoutNode(DEVICEID_TYPE deviceId, const wstring& name)
DeclareConstructorFromConfigWithNumInputs(ReconcileDynamicAxisNode);
ReconcileDynamicAxisNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name)
{
}
@ -228,8 +228,8 @@ public:
}
};
template class ReconcileMBLayoutNode<float>;
template class ReconcileMBLayoutNode<double>;
template class ReconcileDynamicAxisNode<float>;
template class ReconcileDynamicAxisNode<double>;
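The node's Validate() lies outside the changed hunks; a hedged reconstruction of the semantics described in the comment above (sample shape from dataInput, dynamic axis from layoutInput):

// Hedged reconstruction, not code from this commit:
virtual void Validate(bool isFinalValidationPass) override
{
    Base::Validate(isFinalValidationPass);
    m_pMBLayout = Input(1)->GetMBLayout();               // adopt layoutInput's dynamic axis
    SetDims(Input(0)->GetSampleLayout(), HasMBLayout()); // keep dataInput's sample shape
}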
// -----------------------------------------------------------------------
// SliceNode (input)

View File

@ -141,6 +141,7 @@
<ClCompile Include="..\CNTK\BrainScript\BrainScriptEvaluator.cpp" />
<ClCompile Include="..\CNTK\BrainScript\BrainScriptParser.cpp" />
<ClCompile Include="..\Common\Config.cpp" />
<ClCompile Include="..\Common\DataReader.cpp" />
<ClCompile Include="..\Common\Eval.cpp">
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug_CpuOnly|x64'">true</ExcludedFromBuild>

View File

@ -32,6 +32,9 @@
<ClCompile Include="..\CNTK\BrainScript\BrainScriptParser.cpp">
<Filter>BrainScript</Filter>
</ClCompile>
<ClCompile Include="..\Common\DataReader.cpp">
<Filter>Common</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="EvalReader.h" />

View File

@ -12,7 +12,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// Evaluation Reader class
// interface to pass to evaluation DLL
template <class ElemType>
class EvalReader : public IDataReader
class EvalReader : public DataReaderBase
{
std::map<std::wstring, std::vector<ElemType>*>* m_inputs; // our input data
std::map<std::wstring, size_t>* m_dimensions; // the number of rows for the input data
@ -109,11 +109,11 @@ public:
m_mbSize = min(mbSize, m_recordCount);
}
// GetMinibatch - Get the next minibatch (features and labels)
// TryGetMinibatch - Get the next minibatch (features and labels)
// matrices - [in] a map with named matrix types (i.e. 'features', 'labels') mapped to the corresponding matrix,
// [out] each matrix resized if necessary containing data.
// returns - true if there are more minibatches, false if no more minibatches remain
virtual bool GetMinibatch(StreamMinibatchInputs& matrices)
virtual bool TryGetMinibatch(StreamMinibatchInputs& matrices)
{
// how many records are we reading this time
size_t recordCount = min(m_mbSize, m_recordCount - m_currentRecord);
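This reader, like most readers below, moves from IDataReader to DataReaderBase and renames GetMinibatch to TryGetMinibatch. DataReaderBase itself is not among the shown hunks; a hedged sketch of the presumed template-method split, inferred from the DataReaderHelpers comment later in this commit (GetMinibatch fills the input matrices and copies the MBLayout into them):

// Hedged reconstruction, not code from this commit:
class DataReaderBase : public IDataReader
{
public:
    virtual bool GetMinibatch(StreamMinibatchInputs& matrices) override
    {
        if (!TryGetMinibatch(matrices))   // derived readers implement only this
            return false;
        // Presumed common post-step: propagate the reader's layout into each
        // input so that callers no longer call CopyMBLayoutTo() themselves.
        for (auto& input : matrices)
            CopyMBLayoutTo(input.second.pMBLayout);
        return true;
    }
protected:
    virtual bool TryGetMinibatch(StreamMinibatchInputs& matrices) = 0;
};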

View File

@ -664,7 +664,7 @@ CPUMatrix<ElemType>& CPUMatrix<ElemType>::DoGatherColumnsOf(ElemType beta, const
continue;
size_t jIn = (size_t)jInF;
if (jIn >= a.GetNumCols())
InvalidArgument("DoGatherColumnsOf: Map out of bounds.");
InvalidArgument("DoGatherColumnsOf: Map out of bounds. %ld >= %ld", (long int)jIn, (long int)a.GetNumCols());
ScaleAndAddColumn(beta, &us(0,jOut), &a(0,jIn), us.GetNumRows(), alpha);
}
@ -6091,7 +6091,7 @@ void CPUMatrix<ElemType>::TensorOp(ElemType beta, const CPUMatrix<ElemType>& a,
if (reductionOp != ElementWiseOperator::opSum) // TODO: enable the reduction ops
InvalidArgument("TensorOp: Unary reduction operations other than opSum not yet implemented.");
// TODO: Change the lambda to take a pointer and a number of elements, so that we can pass it 1 or 4 elements, in order for it to SSE-vectorize.
// TODO: Change the lambda to take a pointer and a number of elements, so that we can pass it 1 or 4 elements, in order for it to SSE-vectorize.
#define CaseUnaryTensorOp(oper) \
case ElementWiseOperator::op##oper: \
return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 2>& pp) \

View File

@ -893,7 +893,7 @@ __global__ void _doGatherColumnsOf(ElemType* us, size_t usStride, const ElemType
const ElemType& ra = a[ i + jIn * aStride ];
ElemType& rus = us[id/*i + jOut * usStride*/];
ElemType res = ra * alpha;
if (beta != 0)
res += rus * beta;

View File

@ -245,7 +245,7 @@ bool BinaryReader<ElemType>::CheckEndDataset(size_t actualmbsize)
// [out] each matrix resized if necessary containing data.
// returns - true if there are more minibatches, false if no more minibatches remain
template <class ElemType>
bool BinaryReader<ElemType>::GetMinibatch(StreamMinibatchInputs& matrices)
bool BinaryReader<ElemType>::TryGetMinibatch(StreamMinibatchInputs& matrices)
{
// get out if they didn't call StartMinibatchLoop() first
if (m_mbSize == 0)

View File

@ -541,7 +541,7 @@ public:
};
template <class ElemType>
class BinaryReader : public IDataReader
class BinaryReader : public DataReaderBase
{
size_t m_mbSize; // size of minibatch requested
size_t m_mbStartSample; // starting sample # of the next minibatch
@ -587,7 +587,7 @@ public:
}
virtual ~BinaryReader();
virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples = requestDataSize);
virtual bool GetMinibatch(StreamMinibatchInputs& matrices);
virtual bool TryGetMinibatch(StreamMinibatchInputs& matrices);
size_t GetNumParallelSequences()
{

View File

@ -140,7 +140,7 @@ void Indexer::Build()
size_t id = 0;
int64_t offset = GetFileOffset();
// read the very first sequence id
if (!GetNextSequenceId(id))
if (!TryGetSequenceId(id))
{
RuntimeError("Expected a sequence id at the offset %" PRIi64 ", none was found.", offset);
}
@ -156,7 +156,7 @@ void Indexer::Build()
offset = GetFileOffset(); // a new line starts at this offset;
sd.m_numberOfSamples++;
if (!m_done && GetNextSequenceId(id) && id != sd.m_id)
if (!m_done && TryGetSequenceId(id) && id != sd.m_id)
{
// found a new sequence, which starts at the [offset] bytes into the file
sd.m_byteSize = offset - sd.m_fileOffsetBytes;
@ -192,7 +192,7 @@ void Indexer::SkipLine()
}
}
bool Indexer::GetNextSequenceId(size_t& id)
bool Indexer::TryGetSequenceId(size_t& id)
{
bool found = false;
id = 0;

View File

@ -71,7 +71,7 @@ private:
// EOF is reached without hitting the pipe character.
// Returns false if no numerical characters are found preceding the pipe.
// Otherwise, writes sequence id value to the provided reference, returns true.
bool GetNextSequenceId(size_t& id);
bool TryGetSequenceId(size_t& id);
// Build a chunk/sequence index, treating each line as an individual sequence.
// Does not do any sequence parsing, instead uses line number as

View File

@ -332,7 +332,7 @@ void TextParser<ElemType>::IncrementNumberOfErrorsOrDie()
}
template <class ElemType>
bool TextParser<ElemType>::RefillBuffer()
bool TextParser<ElemType>::TryRefillBuffer()
{
size_t bytesRead = fread(m_buffer.get(), 1, BUFFER_SIZE, m_file);
@ -364,7 +364,7 @@ void TextParser<ElemType>::SetFileOffset(int64_t offset)
m_fileOffsetStart = offset;
m_fileOffsetEnd = offset;
RefillBuffer();
TryRefillBuffer();
}
template <class ElemType>
@ -384,7 +384,7 @@ typename TextParser<ElemType>::SequenceBuffer TextParser<ElemType>::LoadSequence
if (verifyId)
{
size_t id;
if (!ReadUint64(id, bytesToRead) || id != sequenceDsc.m_id)
if (!TryReadUint64(id, bytesToRead) || id != sequenceDsc.m_id)
{
RuntimeError("Did not find the expected sequence id ( %" PRIu64 ") "
" at the file offset = %" PRId64 "\n", sequenceDsc.m_id, GetFileOffset());
@ -410,7 +410,7 @@ typename TextParser<ElemType>::SequenceBuffer TextParser<ElemType>::LoadSequence
size_t numRowsRead = 0, expectedRowCount = sequenceDsc.m_numberOfSamples;
for (size_t i = 0; i < expectedRowCount; i++)
{
if ((ReadRow(sequence, bytesToRead)))
if ((TryReadRow(sequence, bytesToRead)))
{
++numRowsRead;
}
@ -472,7 +472,7 @@ typename TextParser<ElemType>::SequenceBuffer TextParser<ElemType>::LoadSequence
}
template <class ElemType>
bool TextParser<ElemType>::ReadRow(SequenceBuffer& sequence, size_t& bytesToRead)
bool TextParser<ElemType>::TryReadRow(SequenceBuffer& sequence, size_t& bytesToRead)
{
bool found = false;
while (bytesToRead && CanRead())
@ -496,7 +496,7 @@ bool TextParser<ElemType>::ReadRow(SequenceBuffer& sequence, size_t& bytesToRead
}
size_t id;
if (!GetInputId(id, bytesToRead))
if (!TryGetInputId(id, bytesToRead))
{
IncrementNumberOfErrorsOrDie();
SkipToNextInput(bytesToRead);
@ -511,7 +511,7 @@ bool TextParser<ElemType>::ReadRow(SequenceBuffer& sequence, size_t& bytesToRead
vector<ElemType>& values = data->m_buffer;
size_t size = values.size();
assert(size % stream.m_sampleDimension == 0);
if (!ReadDenseSample(values, stream.m_sampleDimension, bytesToRead))
if (!TryReadDenseSample(values, stream.m_sampleDimension, bytesToRead))
{
// expected a dense sample, but was not able to fully read it, ignore it.
if (values.size() != size)
@ -533,7 +533,7 @@ bool TextParser<ElemType>::ReadRow(SequenceBuffer& sequence, size_t& bytesToRead
vector<IndexType>& indices = data->m_indices;
assert(values.size() == indices.size());
size_t size = values.size();
if (!ReadSparseSample(values, indices, bytesToRead))
if (!TryReadSparseSample(values, indices, bytesToRead))
{
// expected a sparse sample, but something went south, ignore it.
if (values.size() != size)
@ -572,7 +572,7 @@ bool TextParser<ElemType>::ReadRow(SequenceBuffer& sequence, size_t& bytesToRead
}
template <class ElemType>
bool TextParser<ElemType>::GetInputId(size_t& id, size_t& bytesToRead)
bool TextParser<ElemType>::TryGetInputId(size_t& id, size_t& bytesToRead)
{
char* scratchIndex = m_scratch.get();
@ -664,7 +664,7 @@ bool TextParser<ElemType>::GetInputId(size_t& id, size_t& bytesToRead)
}
template <class ElemType>
bool TextParser<ElemType>::ReadDenseSample(vector<ElemType>& values, size_t sampleSize, size_t& bytesToRead)
bool TextParser<ElemType>::TryReadDenseSample(vector<ElemType>& values, size_t sampleSize, size_t& bytesToRead)
{
size_t counter = 0;
ElemType value;
@ -708,7 +708,7 @@ bool TextParser<ElemType>::ReadDenseSample(vector<ElemType>& values, size_t samp
continue;
}
if (!ReadRealNumber(value, bytesToRead))
if (!TryReadRealNumber(value, bytesToRead))
{
// bail out.
return false;
@ -730,7 +730,7 @@ bool TextParser<ElemType>::ReadDenseSample(vector<ElemType>& values, size_t samp
}
template <class ElemType>
bool TextParser<ElemType>::ReadSparseSample(std::vector<ElemType>& values, std::vector<IndexType>& indices, size_t& bytesToRead)
bool TextParser<ElemType>::TryReadSparseSample(std::vector<ElemType>& values, std::vector<IndexType>& indices, size_t& bytesToRead)
{
size_t index;
ElemType value;
@ -755,7 +755,7 @@ bool TextParser<ElemType>::ReadSparseSample(std::vector<ElemType>& values, std::
}
// read next sparse index
if (!ReadUint64(index, bytesToRead))
if (!TryReadUint64(index, bytesToRead))
{
// bail out.
return false;
@ -771,6 +771,17 @@ bool TextParser<ElemType>::ReadSparseSample(std::vector<ElemType>& values, std::
// bail out.
return false;
}
if (index > numeric_limits<IndexType>::max())
{
if (m_traceLevel >= Warning)
{
fprintf(stderr,
"WARNING: sparse index value(%" PRIu64 ") exceeds the maximum allowed "
" value (%" PRIu64 ")\n", index, (size_t)numeric_limits<IndexType>::max());
}
// bail out.
return false;
}
// an index must be followed by a delimiter
c = *m_pos;
@ -792,7 +803,7 @@ bool TextParser<ElemType>::ReadSparseSample(std::vector<ElemType>& values, std::
}
// read the corresponding value
if (!ReadRealNumber(value, bytesToRead))
if (!TryReadRealNumber(value, bytesToRead))
{
// bail out.
return false;
@ -847,7 +858,7 @@ void TextParser<ElemType>::SkipToNextInput(size_t& bytesToRead)
}
template <class ElemType>
bool TextParser<ElemType>::ReadUint64(size_t& value, size_t& bytesToRead)
bool TextParser<ElemType>::TryReadUint64(size_t& value, size_t& bytesToRead)
{
value = 0;
bool found = false;
@ -900,7 +911,7 @@ bool TextParser<ElemType>::ReadUint64(size_t& value, size_t& bytesToRead)
// cannot be parsed as part of a floating point number.
// Returns true if parsing was successful.
template <class ElemType>
bool TextParser<ElemType>::ReadRealNumber(ElemType& value, size_t& bytesToRead)
bool TextParser<ElemType>::TryReadRealNumber(ElemType& value, size_t& bytesToRead)
{
State state = State::Init;
double coefficient = .0, number = .0, divider = .0;

View File

@ -124,28 +124,28 @@ private:
void SkipToNextValue(size_t& bytesToRead);
void SkipToNextInput(size_t& bytesToRead);
bool RefillBuffer();
bool TryRefillBuffer();
int64_t GetFileOffset() const { return m_fileOffsetStart + (m_pos - m_bufferStart); }
// Reads an alias/name and converts it to an internal stream id (= stream index).
bool GetInputId(size_t& id, size_t& bytesToRead);
bool TryGetInputId(size_t& id, size_t& bytesToRead);
bool ReadRealNumber(ElemType& value, size_t& bytesToRead);
bool TryReadRealNumber(ElemType& value, size_t& bytesToRead);
bool ReadUint64(size_t& value, size_t& bytesToRead);
bool TryReadUint64(size_t& value, size_t& bytesToRead);
// Reads dense sample values into the provided vector.
bool ReadDenseSample(std::vector<ElemType>& values, size_t sampleSize, size_t& bytesToRead);
bool TryReadDenseSample(std::vector<ElemType>& values, size_t sampleSize, size_t& bytesToRead);
// Reads sparse sample values and corresponding indices into the provided vectors.
bool ReadSparseSample(std::vector<ElemType>& values, std::vector<IndexType>& indices, size_t& bytesToRead);
bool TryReadSparseSample(std::vector<ElemType>& values, std::vector<IndexType>& indices, size_t& bytesToRead);
// Reads one whole row (terminated by a row delimiter) of samples
bool ReadRow(SequenceBuffer& sequence, size_t& bytesToRead);
bool TryReadRow(SequenceBuffer& sequence, size_t& bytesToRead);
// Returns true if there's still data available.
bool inline CanRead() { return m_pos != m_bufferEnd || RefillBuffer(); }
bool inline CanRead() { return m_pos != m_bufferEnd || TryRefillBuffer(); }
// Given a descriptor, retrieves the data for the corresponding sequence from the file.
SequenceBuffer LoadSequence(bool verifyId, const SequenceDescriptor& descriptor);

View File

@ -320,7 +320,7 @@ void DSSMReader<ElemType>::StoreLabel(ElemType& labelStore, const LabelType& lab
// [out] each matrix resized if necessary containing data.
// returns - true if there are more minibatches, false if no more minibatches remain
template <class ElemType>
bool DSSMReader<ElemType>::GetMinibatch(StreamMinibatchInputs& matrices)
bool DSSMReader<ElemType>::TryGetMinibatch(StreamMinibatchInputs& matrices)
{
if (m_readNextSample >= m_totalSamples)
{

View File

@ -64,7 +64,7 @@ public:
};
template <class ElemType>
class DSSMReader : public IDataReader
class DSSMReader : public DataReaderBase
{
// public:
// typedef std::string LabelType;
@ -159,7 +159,7 @@ public:
}
virtual ~DSSMReader();
virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples = requestDataSize);
virtual bool GetMinibatch(StreamMinibatchInputs& matrices);
virtual bool TryGetMinibatch(StreamMinibatchInputs& matrices);
size_t GetNumParallelSequences()
{

View File

@ -931,7 +931,7 @@ bool HTKMLFReader<ElemType>::GetHmmData(msra::asr::simplesenonehmm* hmm)
// returns - true if there are more minibatches, false if no more minibatches remain
// TODO: Why do we have two read functions? Is one not a superset of the other?
template <class ElemType>
bool HTKMLFReader<ElemType>::GetMinibatch(StreamMinibatchInputs& matrices)
bool HTKMLFReader<ElemType>::TryGetMinibatch(StreamMinibatchInputs& matrices)
{
if (m_trainOrTest)
{

View File

@ -21,7 +21,7 @@
namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
class HTKMLFReader : public IDataReader
class HTKMLFReader : public DataReaderBase
{
private:
const static size_t m_htkRandomizeAuto = 0;
@ -184,7 +184,7 @@ public:
virtual void StartDistributedMinibatchLoop(size_t mbSize, size_t epoch, size_t subsetNum, size_t numSubsets, size_t requestedEpochSamples = requestDataSize) override;
virtual bool GetMinibatch(StreamMinibatchInputs& matrices);
virtual bool TryGetMinibatch(StreamMinibatchInputs& matrices);
virtual const std::map<LabelIdType, LabelType>& GetLabelMapping(const std::wstring& sectionName);
virtual void SetLabelMapping(const std::wstring& sectionName, const std::map<LabelIdType, LabelType>& labelMapping);
virtual bool GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart = 0);

View File

@ -64,11 +64,15 @@
<PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;_USRDLL;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>true</SDLCheck>
<TreatWarningAsError>true</TreatWarningAsError>
<AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">..\..\common\include;..\..\Math</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Debug_CpuOnly|x64'">..\..\common\include;..\..\Math</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalLibraryDirectories Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">$(SolutionDir)$(Platform)\$(Configuration)\</AdditionalLibraryDirectories>
<AdditionalLibraryDirectories Condition="'$(Configuration)|$(Platform)'=='Debug_CpuOnly|x64'">$(SolutionDir)$(Platform)\$(Configuration)\</AdditionalLibraryDirectories>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(ReleaseBuild)">
@ -82,6 +86,8 @@
<SDLCheck>true</SDLCheck>
<AdditionalOptions>/d2Zi+ %(AdditionalOptions)</AdditionalOptions>
<TreatWarningAsError>true</TreatWarningAsError>
<AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release|x64'">..\..\common\include;..\..\Math</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories Condition="'$(Configuration)|$(Platform)'=='Release_CpuOnly|x64'">..\..\common\include;..\..\Math</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
@ -90,6 +96,8 @@
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>Math.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<Profile>true</Profile>
<AdditionalLibraryDirectories Condition="'$(Configuration)|$(Platform)'=='Release|x64'">$(SolutionDir)$(Platform)\$(Configuration)\</AdditionalLibraryDirectories>
<AdditionalLibraryDirectories Condition="'$(Configuration)|$(Platform)'=='Release_CpuOnly|x64'">$(SolutionDir)$(Platform)\$(Configuration)\</AdditionalLibraryDirectories>
</Link>
</ItemDefinitionGroup>
<ItemGroup>
@ -115,6 +123,7 @@
<ClInclude Include="utterancesourcemulti.h" />
</ItemGroup>
<ItemGroup>
<ClCompile Include="..\..\Common\DataReader.cpp" />
<ClCompile Include="..\..\Common\ExceptionWithCallStack.cpp" />
<ClCompile Include="..\..\Common\TimerUtility.cpp">
<PrecompiledHeader>NotUsing</PrecompiledHeader>

View File

@ -15,6 +15,9 @@
<ClCompile Include="..\..\Common\ExceptionWithCallStack.cpp" />
<ClCompile Include="Exports.cpp" />
<ClCompile Include="DataWriterLocal.cpp" />
<ClCompile Include="..\..\Common\DataReader.cpp">
<Filter>Common\Include</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="biggrowablevectors.h" />

View File

@ -846,7 +846,7 @@ void HTKMLFReader<ElemType>::StartMinibatchLoopToWrite(size_t mbSize, size_t /*e
// [out] each matrix resized if necessary containing data.
// returns - true if there are more minibatches, false if no more minibatches remain
template <class ElemType>
bool HTKMLFReader<ElemType>::GetMinibatch(StreamMinibatchInputs& matrices)
bool HTKMLFReader<ElemType>::TryGetMinibatch(StreamMinibatchInputs& matrices)
{
if (m_trainOrTest)
{

View File

@ -13,7 +13,7 @@
namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
class HTKMLFReader : public IDataReader
class HTKMLFReader : public DataReaderBase
{
private:
msra::dbn::minibatchiterator* m_mbiter;
@ -186,7 +186,7 @@ public:
}
virtual ~HTKMLFReader();
virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples = requestDataSize);
virtual bool GetMinibatch(StreamMinibatchInputs& matrices);
virtual bool TryGetMinibatch(StreamMinibatchInputs& matrices);
virtual const std::map<LabelIdType, LabelType>& GetLabelMapping(const std::wstring& sectionName);
virtual void SetLabelMapping(const std::wstring& sectionName, const std::map<LabelIdType, LabelType>& labelMapping);
virtual bool GetData(const std::wstring& sectionName, size_t numRecords, void* data, size_t& dataBufferSize, size_t recordStart = 0);

View File

@ -1138,7 +1138,7 @@ void SequenceReader<ElemType>::GetClassInfo()
}
template <class ElemType>
bool SequenceReader<ElemType>::GetMinibatch(StreamMinibatchInputs& matrices)
bool SequenceReader<ElemType>::TryGetMinibatch(StreamMinibatchInputs& matrices)
{
FailBecauseDeprecated(__FUNCTION__); // DEPRECATED CLASS, SHOULD NOT BE USED ANYMORE
@ -1889,7 +1889,7 @@ bool BatchSequenceReader<ElemType>::GetMinibatchData(size_t& /*out*/ firstPosInS
// - up to N sequences of the same length are returned in each MB
// - minibatches consist of sequences of the same length only (no gaps)
template <class ElemType>
bool BatchSequenceReader<ElemType>::GetMinibatch(StreamMinibatchInputs& matrices)
bool BatchSequenceReader<ElemType>::TryGetMinibatch(StreamMinibatchInputs& matrices)
{
// get out if they didn't call StartMinibatchLoop() first
// TODO: Why not fail here?
@ -2023,7 +2023,7 @@ bool BatchSequenceReader<ElemType>::GetMinibatch(StreamMinibatchInputs& matrices
timePos: the time position. For example, for a 100-column actual minibatch with 10 streams,
timePosition = [0,..,9] for each actual time step
*/
// This function was only called from BatchSequenceReader::GetMinibatch(), but no longer.
// This function was only called from BatchSequenceReader::TryGetMinibatch(), but no longer.
template <class ElemType>
void BatchSequenceReader<ElemType>::SetSentenceBegin(int wrd, int uttPos, int timePos)
{

View File

@ -109,7 +109,7 @@ public:
// Note: This class is deprecated for standalone use, only used as a base for BatchSequenceReader which overrides most of the functions.
template <class ElemType>
class SequenceReader : public IDataReader
class SequenceReader : public DataReaderBase
{
protected:
bool m_idx2clsRead;
@ -276,7 +276,7 @@ public:
}
virtual ~SequenceReader();
virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples = requestDataSize);
virtual bool GetMinibatch(StreamMinibatchInputs& matrices);
virtual bool TryGetMinibatch(StreamMinibatchInputs& matrices);
// void SetSentenceSegBatch(std::vector<size_t> &/*sentenceEnd*/) {};
// TODO: ^^ should this be void CopyMBLayoutTo(MBLayoutPtr pMBLayout);
@ -407,7 +407,7 @@ private:
public:
void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples = requestDataSize) override;
bool GetMinibatch(StreamMinibatchInputs& matrices) override;
bool TryGetMinibatch(StreamMinibatchInputs& matrices) override;
bool DataEnd() override;
void CopyMBLayoutTo(MBLayoutPtr pMBLayout) { assert(mToProcess.size() == m_pMBLayout->GetNumParallelSequences()); pMBLayout->CopyFrom(m_pMBLayout); }

View File

@ -817,7 +817,7 @@ void BatchLUSequenceReader<ElemType>::SetNumParallelSequences(const size_t mz)
}
template <class ElemType>
bool BatchLUSequenceReader<ElemType>::GetMinibatch(StreamMinibatchInputs& matrices)
bool BatchLUSequenceReader<ElemType>::TryGetMinibatch(StreamMinibatchInputs& matrices)
{
// get out if they didn't call StartMinibatchLoop() first
// TODO: Why is this allowed? Why not terminate?
@ -881,12 +881,12 @@ bool BatchLUSequenceReader<ElemType>::GetMinibatch(StreamMinibatchInputs& matric
{
assert(idx == (LabelIdType) NULLLABEL); // TODO: what other conditions?
// if (!m_pMBLayout->IsGap(s, t)) // verify that these are marked as NoInput
// LogicError("BatchLUSequenceReader::GetMinibatch observation is larger than its dimension but no_labels sign is not used to indicate that this observation has no labels. Possible reason is a bug in EnsureDataAvailable or a bug here.");
// LogicError("BatchLUSequenceReader::TryGetMinibatch observation is larger than its dimension but no_labels sign is not used to indicate that this observation has no labels. Possible reason is a bug in EnsureDataAvailable or a bug here.");
continue;
}
// if (m_pMBLayout->IsGap(s, t)) // verify that these are marked as NoInput
// LogicError("BatchLUSequenceReader::GetMinibatch: Inconsistent NoInput flag");
// LogicError("BatchLUSequenceReader::TryGetMinibatch: Inconsistent NoInput flag");
locObs.SetValue(idx + jj * featInfo.dim, j, (ElemType) 1);
}
@ -1171,7 +1171,7 @@ template class BatchLUSequenceReader<double>;
template class BatchLUSequenceReader<float>;
template <class ElemType>
bool MultiIOBatchLUSequenceReader<ElemType>::GetMinibatch(StreamMinibatchInputs& matrices)
bool MultiIOBatchLUSequenceReader<ElemType>::TryGetMinibatch(StreamMinibatchInputs& matrices)
{
// on first iteration, need to check if all requested data matrices are available
std::map<std::wstring, size_t>::iterator iter;

View File

@ -47,7 +47,7 @@ enum ReaderMode
};
template <class ElemType>
class LUSequenceReader : public IDataReader
class LUSequenceReader : public DataReaderBase
{
protected:
bool m_idx2clsRead;
@ -319,7 +319,7 @@ public:
size_t GetLabelOutput(StreamMinibatchInputs& matrices, LabelInfo& labelInfo, size_t actualmbsize);
void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples = requestDataSize);
bool GetMinibatch(StreamMinibatchInputs& matrices);
bool TryGetMinibatch(StreamMinibatchInputs& matrices);
bool EnsureDataAvailable(size_t mbStartSample);
size_t GetNumParallelSequences();
@ -411,7 +411,7 @@ public:
}
};
bool GetMinibatch(StreamMinibatchInputs& matrices);
bool TryGetMinibatch(StreamMinibatchInputs& matrices);
void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples);

View File

@ -780,7 +780,7 @@ void LibSVMBinaryReader<ElemType>::DoDSSMMatrix(Matrix<ElemType>& mat, size_t ac
}
template <class ElemType>
bool LibSVMBinaryReader<ElemType>::GetMinibatch(StreamMinibatchInputs& matrices)
bool LibSVMBinaryReader<ElemType>::TryGetMinibatch(StreamMinibatchInputs& matrices)
{
//timer = clock();
#if DEBUG

View File

@ -226,7 +226,7 @@ private:
};
template <class ElemType>
class LibSVMBinaryReader : public IDataReader
class LibSVMBinaryReader : public DataReaderBase
{
public:
virtual void Init(const ConfigParameters& config) override
@ -254,7 +254,7 @@ public:
virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples = requestDataSize);
virtual void StartDistributedMinibatchLoop(size_t mbSize, size_t epoch, size_t subsetNum, size_t numSubsets, size_t requestedEpochSamples) override;
virtual bool GetMinibatch(StreamMinibatchInputs& matrices);
virtual bool TryGetMinibatch(StreamMinibatchInputs& matrices);
virtual bool SupportsDistributedMBRead() const override
{

View File

@ -158,9 +158,10 @@ Minibatch BpttPacker::ReadMinibatch()
for (size_t streamIndex = 0; streamIndex < m_outputStreamDescriptions.size(); ++streamIndex)
{
m_currentLayouts[streamIndex]->Init(m_numParallelSequences, m_truncationSize);
size_t sequenceId = 0;
for (size_t slotIndex = 0; slotIndex < m_numParallelSequences; ++slotIndex)
{
PackSlot(streamIndex, slotIndex);
PackSlot(streamIndex, slotIndex, sequenceId);
}
StreamMinibatchPtr m = make_shared<StreamMinibatch>();
@ -173,7 +174,7 @@ Minibatch BpttPacker::ReadMinibatch()
}
// Packs a slot of sequences into the minibatch.
void BpttPacker::PackSlot(size_t streamIndex, size_t slotIndex)
void BpttPacker::PackSlot(size_t streamIndex, size_t slotIndex, size_t& sequenceId)
{
auto& slot = m_sequenceBufferPerStream[streamIndex]->m_slots[slotIndex];
@ -204,7 +205,7 @@ void BpttPacker::PackSlot(size_t streamIndex, size_t slotIndex)
// Add current sequence to the minibatch layout.
m_currentLayouts[streamIndex]->AddSequence(
NEW_SEQUENCE_ID,
sequenceId++,
slotIndex,
-(int)slot.m_sampleCursor,
slot.FrontSequence()->m_numberOfSamples - slot.m_sampleCursor);
@ -220,7 +221,7 @@ void BpttPacker::PackSlot(size_t streamIndex, size_t slotIndex)
// Adding the next sequence to the minibatch.
m_currentLayouts[streamIndex]->AddSequence(
NEW_SEQUENCE_ID,
sequenceId++,
slotIndex,
currentTimestep,
currentTimestep + slot.FrontSequence()->m_numberOfSamples);

View File

@ -36,7 +36,11 @@ private:
void ReadSequencesToSlot(size_t slotIndex);
// Packs a slot into the data buffer.
void PackSlot(size_t streamIndex, size_t slotIndex);
// SequenceId specifies the starting value to be used as sequence identifier.
// For each new input, sequence id is reset to 0, and incremented each time
// a sequence is added to the layout. This allows layouts corresponding to different
// inputs to have consistent sequence ids.
void PackSlot(size_t streamIndex, size_t slotIndex, size_t& sequenceId);
virtual MBLayoutPtr CreateMBLayout(const StreamBatch& batch)
{

View File

@ -34,9 +34,9 @@ protected:
};
PackerBase(MemoryProviderPtr memoryProvider,
TransformerPtr transformer,
size_t minibatchSize,
const std::vector<StreamDescriptionPtr>& streams);
TransformerPtr transformer,
size_t minibatchSize,
const std::vector<StreamDescriptionPtr>& streams);
typedef std::vector<SequenceDataPtr> StreamBatch;

View File

@ -22,9 +22,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
ReaderShim<ElemType>::ReaderShim(ReaderFactory factory)
: m_layout(make_shared<MBLayout>()), m_factory(factory)
: m_factory(factory)
{
m_layout->SetUniqueAxisName(L"ReaderShim");
}
template <class ElemType>
@ -38,8 +37,7 @@ void ReaderShim<ElemType>::Init(const ConfigParameters& config)
// otherwise deferring - synchronous execution during .get() call
m_launchType = prefetch ? launch::async : launch::deferred;
auto numSeqsPerMBForAllEpochs = numberOfuttsPerMinibatchForAllEpochs;
m_layout->Init(numSeqsPerMBForAllEpochs[0], 0);
m_numParallelSequences = numberOfuttsPerMinibatchForAllEpochs[0];
m_reader = m_factory(config);
m_streams = m_reader->GetStreamDescriptions();
@ -105,7 +103,6 @@ string EnumerateInputs(const map<wstring, size_t> &nameToStreamId)
template <class ElemType>
bool ReaderShim<ElemType>::GetMinibatch(StreamMinibatchInputs& matrices)
{
// TODO: verify that the set of matrix names is identical
// to the set of reader input names. Warn if it's a subset, throw
// if it's a superset.
@ -133,6 +130,15 @@ bool ReaderShim<ElemType>::GetMinibatch(StreamMinibatchInputs& matrices)
}
}
// Reset stale mb layouts.
// BUGBUG: This seems incorrect. (1) layouts should all be updated below, and (2) some of these layouts are the same, so we are resetting them twice.
for (const auto& iter : matrices)
{
iter.second.pMBLayout->Init(1, 0);
}
// a map to generate error messages when checking layout constraints.
map<wstring, wstring> layoutToInputMap;
if (!minibatch.m_data.empty())
{
// TODO: Use alternating pinned buffer in the packer, do not copy anything, but pack into the pinned memory.
@ -147,9 +153,31 @@ bool ReaderShim<ElemType>::GetMinibatch(StreamMinibatchInputs& matrices)
}
size_t streamId = m_nameToStreamId[mx.first];
const auto& stream = minibatch.m_data[streamId];
m_layout = stream->m_layout;
m_numParallelSequences = stream->m_layout->GetNumParallelSequences();
// This assert no longer holds - different inputs have different sequence lengths, resulting in a different number
// of parallel samples.
// assert(m_numParallelSequences == minibatch.m_data.front()->m_layout->GetNumParallelSequences());
auto& layout = mx.second.pMBLayout;
if (layout->GetNumCols() == 0)
{
// layout is empty, copy layout info from the reader
layout->CopyFrom(stream->m_layout, /*keepName*/ true);
layoutToInputMap[layout->GetAxisName()] = mx.first;
}
else if (*layout != *stream->m_layout) // this does a deep value-level comparison
{
RuntimeError("Dynamic axis layout '%ls' is shared between inputs '%ls' and '%ls', but layouts generated "
"from the input data are incompatible on this axis. Are you using different sequence lengths? "
"Did you consider adding a DynamicAxis() to the Input nodes?",
layout->GetAxisName(), layoutToInputMap[layout->GetAxisName()].c_str(), mx.first.c_str());
}
size_t sampleSize = m_streams[streamId]->m_sampleLayout->GetNumElements();
auto& matrix = matrices.GetInputMatrix<ElemType>(mx.first);
FillMatrixFromStream(m_streams[streamId]->m_storageType, &matrix, sampleSize, stream);
@ -200,13 +228,21 @@ bool ReaderShim<ElemType>::DataEnd() { return false; } // Note: Return value nev
template <class ElemType>
void ReaderShim<ElemType>::CopyMBLayoutTo(MBLayoutPtr layout)
{
layout->CopyFrom(m_layout);
// This method is inherited from IDataReader and should be removed in the near future.
NOT_IMPLEMENTED;
}
template <class ElemType>
size_t ReaderShim<ElemType>::GetNumParallelSequences()
{
return m_layout->GetNumParallelSequences();
// BUGBUG This is a property of the stream, of which this reader might produce several, with different nr. of
// parallel sequences. Thus this property doesn't make sense anymore.
// This method is called by
// * DataReaderHelpers::GetNumSubminibatchesNeeded to estimate mb size
// * ComputationNetwork::SetBatchNormalizationTimeConstants to compute learning rate per sample
// * ComputationNetwork::SetBatchNormalizationTimeConstants to compute actual mb size and momentum per sample
// * SGD::AdaptiveMinibatchSizing to compute learning rate per sample
return m_numParallelSequences;
}
template class ReaderShim<float>;
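A hedged walk-through of the per-input layout handling added above (sequence lengths hypothetical):

// Two inputs declared on the same dynamic axis "t":
//   features -> stream layout: 4 sequences of lengths {7, 7, 7, 7}
//   labels   -> stream layout: 4 sequences of lengths {7, 7, 7, 7}
// The first input encountered copies its stream layout into the shared axis
// object (CopyFrom, keepName=true); the second is compared deeply and passes.
// Had 'labels' produced lengths {1, 1, 1, 1} on that same axis, the deep
// comparison would fail and the RuntimeError above fires, suggesting that one
// of the inputs be given its own DynamicAxis().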

View File

@ -58,7 +58,7 @@ private:
ReaderFactory m_factory;
bool m_endOfEpoch;
MBLayoutPtr m_layout;
size_t m_numParallelSequences;
std::map<std::wstring, size_t> m_nameToStreamId;
std::vector<StreamDescriptionPtr> m_streams;

View File

@ -207,7 +207,7 @@ void SparsePCReader<ElemType>::StartMinibatchLoop(size_t mbSize, size_t /*epoch*
// [out] each matrix resized if necessary containing data.
// returns - true if there are more minibatches, false if no more minibatchs remain
template <class ElemType>
bool SparsePCReader<ElemType>::GetMinibatch(StreamMinibatchInputs& matrices)
bool SparsePCReader<ElemType>::TryGetMinibatch(StreamMinibatchInputs& matrices)
{
// get out if they didn't call StartMinibatchLoop() first
if (m_miniBatchSize == 0)

View File

@ -21,7 +21,7 @@
namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
class SparsePCReader : public IDataReader
class SparsePCReader : public DataReaderBase
{
ConfigParameters m_readerConfig;
std::wstring m_file;
@ -76,7 +76,7 @@ public:
InitFromConfig(config);
}
virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples = requestDataSize);
virtual bool GetMinibatch(StreamMinibatchInputs& matrices);
virtual bool TryGetMinibatch(StreamMinibatchInputs& matrices);
size_t GetNumParallelSequences()
{

View File

@ -765,7 +765,7 @@ void UCIFastReader<ElemType>::StoreLabel(ElemType& labelStore, const LabelType&
// [out] each matrix resized if necessary containing data.
// returns - true if there are more minibatches, false if no more minibatches remain
template <class ElemType>
bool UCIFastReader<ElemType>::GetMinibatch(StreamMinibatchInputs& matrices)
bool UCIFastReader<ElemType>::TryGetMinibatch(StreamMinibatchInputs& matrices)
{
bool minibatchesRemaining = true;
if (m_pendingAsyncGetMinibatch.valid())

View File

@ -36,7 +36,7 @@ enum LabelKind
};
template <class ElemType>
class UCIFastReader : public IDataReader
class UCIFastReader : public DataReaderBase
{
shared_ptr<UCIParser<ElemType, LabelType>> m_parser;
size_t m_mbSize; // size of minibatch requested
@ -151,7 +151,7 @@ public:
virtual void StartDistributedMinibatchLoop(size_t mbSize, size_t epoch, size_t subsetNum, size_t numSubsets, size_t requestedEpochSamples = requestDataSize) override;
virtual bool GetMinibatch(StreamMinibatchInputs& matrices);
virtual bool TryGetMinibatch(StreamMinibatchInputs& matrices);
bool GetMinibatchImpl(StreamMinibatchInputs& matrices);

View File

@ -34,9 +34,8 @@ namespace Microsoft { namespace MSR { namespace CNTK {
const MPIWrapperPtr& mpi)
{
// Reading consists of a sequence of Reader API calls:
// - GetMinibatch() --fills the inputMatrices
// - GetMinibatch() --fills the inputMatrices and copies the MBLayout from Reader into inputMatrices
// - SetActualMiniBatchSizeFromFeatures() --tells Network to resize the nodes' buffers
// - CopyMBLayoutTo() --copies the MBLayout from Reader to Network
// with the special twist that in presence of parallelization, there is some decimation involved.
bool wasDataRead = trainSetDataReader.GetMinibatch(inputMatrices); // fill in the minibatch data into the Input nodes' buffers directly
@ -61,13 +60,23 @@ namespace Microsoft { namespace MSR { namespace CNTK {
trainSetDataReader.GetMinibatch4SE(*latticeinput, *uids, *boundaries, *extrauttmap);
}
// get layout meta-data
// BUGBUG (Issue #95): must be adapted for multiple MBLayouts
trainSetDataReader.CopyMBLayoutTo(net->GetMBLayoutPtrOfNetwork());
// TODO: move this into shim for the old readers.
// decimate if needed. Decimation happens in-place.
// This is only allowed for old readers, which support a single layout for all inputs.
if (!useDistributedMBReading && useParallelTrain)
DecimateMinibatchInPlace<ElemType>(inputMatrices, mpi->NumNodesInUse(), mpi->CurrentNodeRank(), net->GetMBLayoutPtrOfNetwork());
{
auto& pMBLayout = net->GetMBLayoutPtrOfNetwork();
// Verify that there's indeed a single layout
for (const auto& iter : inputMatrices)
{
assert(iter.second.pMBLayout == pMBLayout);
// TODO: This must be a runtime check, not an assert().
UNUSED(iter);
}
DecimateMinibatchInPlace<ElemType>(inputMatrices, mpi->NumNodesInUse(), mpi->CurrentNodeRank(), pMBLayout);
}
// reader will have resized input node's m_value directly. Nodes must be notified to do necessary internal state updates from that.
// TODO: This is a stopgap. SGD will at some point change from sets of matrices to sets of nodes. Then this will become much simpler.
@ -139,7 +148,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// decimatedMB[name]->SetValue(mat.Reshaped(nRows*nSequence, nT).RowSlice( st*nRows , (en-st)*nRows).Reshaped(nRows, nNewParallelSequence*nT));
}
// decimate MBLayout as well
pDecimateMBLayout = make_shared<MBLayout>(numNewParallelSequence, nT);
pDecimateMBLayout = make_shared<MBLayout>(numNewParallelSequence, nT, L"");
pDecimateMBLayout->SetAxisName(pMBLayout->GetAxisName());
#if 1
// now copy over all sequence info records that are inside the range, with adjusted 's'

View File

@ -0,0 +1,129 @@
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE file in the project root for full license information.
RootDir = ".."
ConfigDir = "$RootDir$/Config"
DataDir = "$RootDir$/Data"
OutputDir = "$RootDir$/Output"
ModelDir = "$OutputDir$/Models"
command=Train #:Write
deviceId = $DeviceId$
modelPath="$ModelDir$/seqcla.dnn"
Train=[
action="train"
run=BrainScriptNetworkBuilder
BrainScriptNetworkBuilder=[
Macros = [
// define "last hidden state of sequence" in the LSTM (really for any sequence though)
TakeRight (N, x) = BS.Sequences._Take(FutureValue, N, x)
Last(x) = TakeRight(1, x)
]
Layers = [
EmbeddingLayer(input, vocabSize, embeddingDim, embeddingPath) = [
embedding = Transpose(LearnableParameter(vocabSize, embeddingDim, learningRateMultiplier = 0.0, init = 'fromFile', initFromFilePath = embeddingPath))
lookup = GatherPacked(input, embedding)
].lookup
DenseLayer(input, inputSize, outputSize, activation) = [
z = BFF(input, outputSize, inputSize).z
act = activation(z)
].act
LSTMLayer(input, inputSize, outputSize, cellSize, selector) = [
lstm = BS.RNNs.RecurrentLSTMP(inputSize, outputSize, cellSize, input)
result = selector(lstm)
].result
]
// LSTM params
lstmDim = 25
cellDim = 25
// model
numLabels = 5
vocab = 2000
embedDim = 50
// set up features and labels
t = DynamicAxis()
features = Input(1, dynamicAxis=t) # Input has shape (1,t)
labels = Input(numLabels) # Input has shape (numLabels,*), where all sequences have length 1 along *
// load the pre-learned word embedding matrix
l1 = Layers.EmbeddingLayer(features, vocab, embedDim, 'embeddingmatrix.txt')
l2 = Layers.LSTMLayer(l1, embedDim, lstmDim, cellDim, Macros.Last)
l3 = Layers.DenseLayer(l2, lstmDim, numLabels, Pass)
out = Pass(l3, tag='output')
// Make sure the trainer understands that the time dimension of l3 is actually the same as that of labels.
l3p = ReconcileDynamicAxis(l3, labels)
// training criteria
ce = CrossEntropyWithSoftmax(labels, l3p, tag='criterion') // this is the training objective
wer = ErrorPrediction (labels, l3p, tag='evaluation') // this also gets tracked
]
SGD = [
epochSize = 0
minibatchSize = 200
maxEpochs = 5
momentumPerMB = 0.9
learningRatesPerMB = 0.1
]
reader = [
readerType = "CNTKTextFormatReader"
file = "$DataDir$/Train.txt"
input = [
features=[
alias = "x"
dim = 1
format = "dense"
]
labels=[
alias = "y"
dim = 5
format = "dense"
]
]
]
outputPath = "$OutputDir$/output.txt" # dump the output as text?
]
Write=[
action="test"
run=BrainScriptNetworkBuilder
format = [
# %n = minibatch, %x = shape, %d = sequenceId
sequencePrologue=%d\t|w.shape %x\n%d\t|w\s
sampleSeparator=\n%d\t|w\s
elementSeparator=\s
]
modelFile = "$ModelDir$/seqcla.dnn"
reader = [
readerType = "CNTKTextFormatReader"
file = "$DataDir$/Train.txt"
input = [
features=[
alias = "x"
dim = 1
format = "dense"
]
labels=[
alias = "y"
dim = 5
format = "dense"
]
]
]
outputPath = "$OutputDir$/output.txt" # dump the output as text?
]
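For context, CNTKTextFormatReader expects one sample per line, each prefixed with a sequence id and |alias markers. A hypothetical Train.txt fragment matching the declarations above (the real data file is not part of this commit):

0 |x 23 |y 0 1 0 0 0
0 |x 187
0 |x 9
1 |x 456 |y 0 0 0 1 0
1 |x 12

Rows of one sequence share an id; x carries the one-dimensional word index per time step, and the five-dimensional one-hot label y appears once per sequence.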

Diff not shown because of its large size. Load Diff

Diff not shown because of its large size. Load Diff

Diff not shown because of its large size. Load Diff

Diff not shown because of its large size. Load Diff

Diff not shown because of its large size. Load Diff

Diff not shown because of its large size. Load Diff

View File

@ -0,0 +1,16 @@
#!/bin/bash
. $TEST_ROOT_DIR/run-test-common
ConfigDir=$TEST_DIR/Config
# cntkrun <CNTK config file name> <additional CNTK args>
DeleteModelsAfterTest=0
cntkrun seqcla.cntk || exit $?
echo === Deleting last epoch data
rm $TEST_RUN_DIR/Models/*.dnn
echo ==== Re-running from checkpoint
DeleteExistingModels=0
DeleteModelsAfterTest=1
# cntkrun <CNTK config file name> <additional CNTK args>
cntkrun seqcla.cntk 'makeMode=true' || exit $?

View File

@ -0,0 +1,33 @@
dataDir: ./Data
tags:
# running on every BVT job in 'S' (Speech) leg in Debug-GPU and Release-CPU configurations:
- bvt-s (build_sku == 'gpu') and ((flavor=='debug') ^ (device=='cpu'))
# running unconditionally on every Nightly job in 'S' leg
- nightly-s (build_sku == 'gpu')
testCases:
CNTK Run must be completed:
patterns:
- __COMPLETED__
Must train epochs in exactly the same order and with the same parameters:
patterns:
- Starting Epoch {{integer}}
- learning rate per sample = {{float}}
- momentum = {{float}}
Epochs must be finished with expected results:
patterns:
- Finished Epoch[{{integer}} of {{integer}}]
- TrainLossPerSample = {{float,tolerance=.5%}}
# TODO: On GPU, the initial EvalErr rate is larger than on CPU and in other configurations.
# In later epochs the results are aligned. Why?
- EvalErrPerSample = {{float,tolerance=13%}}
- AvgLearningRatePerSample = {{float,tolerance=0.001%}}
# Per-minibatch training results must match:
# patterns:
# - Epoch[{{integer}} of {{integer}}]-Minibatch[{{integer}}-{{integer}}
# - SamplesSeen = {{integer}}
# - TrainLossPerSample = {{float,tolerance=.5%}}
# - EvalErr[0]PerSample = {{float,tolerance=.5%}}

View File

@ -110,7 +110,6 @@
<ClCompile Include="..\..\..\Source\CNTK\BrainScript\BrainScriptEvaluator.cpp" />
<ClCompile Include="..\..\..\Source\CNTK\BrainScript\BrainScriptParser.cpp" />
<ClCompile Include="..\..\..\Source\CNTK\BrainScript\BrainScriptTest.cpp" />
<ClCompile Include="..\..\..\Source\CNTK\BrainScript\ExperimentalNetworkBuilder.cpp" />
<ClCompile Include="..\..\..\Source\Common\Config.cpp" />
<ClCompile Include="..\..\..\Source\Common\DataReader.cpp" />
<ClCompile Include="..\..\..\Source\Common\DataWriter.cpp" />

View File

@ -34,9 +34,6 @@
<ClCompile Include="..\..\..\Source\CNTK\BrainScript\BrainScriptTest.cpp">
<Filter>From BrainScript</Filter>
</ClCompile>
<ClCompile Include="..\..\..\Source\CNTK\BrainScript\ExperimentalNetworkBuilder.cpp">
<Filter>From BrainScript</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<Filter Include="Config">

View File

@ -189,20 +189,20 @@ struct ReaderFixture
for (auto cnt = 0; dataReader.GetMinibatch(map) && cnt < m_maxMiniBatchCount; cnt++)
{
MBLayoutPtr pMBlayoutPtr = make_shared<MBLayout>();
dataReader.CopyMBLayoutTo(pMBlayoutPtr);
// Process the Feature Matri(x|ces)
for (auto i = 0; i < numFeatureFiles; i++)
{
wstring name = numFeatureFiles > 1 ? L"features" + std::to_wstring(i + 1) : L"features";
OutputMatrix(map.GetInputMatrix<ElemType>(name), *pMBlayoutPtr, outputFile);
auto& layoutPtr = map.GetInput(name).pMBLayout;
OutputMatrix(map.GetInputMatrix<ElemType>(name), *layoutPtr, outputFile);
}
// Process the Label Matri(x|ces)
for (auto i = 0; i < numLabelFiles; i++)
{
wstring name = numLabelFiles > 1 ? L"labels" + std::to_wstring(i + 1) : L"labels";
OutputMatrix(map.GetInputMatrix<ElemType>(name), *pMBlayoutPtr, outputFile);
auto& layoutPtr = map.GetInput(name).pMBLayout;
OutputMatrix(map.GetInputMatrix<ElemType>(name), *layoutPtr, outputFile);
}
}
}
@ -255,7 +255,10 @@ struct ReaderFixture
std::vector<shared_ptr<Matrix<ElemType>>> features;
std::vector<shared_ptr<Matrix<ElemType>>> labels;
MBLayoutPtr pMBLayout = make_shared<MBLayout>();
// For the time being, use the same layout across all inputs.
// TODO: add an option to create per-input layouts (once we have test-cases with different layouts)
MBLayoutPtr pMBLayout = make_shared<MBLayout>(1, 0, L"X");
for (auto i = 0; i < numFeatureFiles; i++)
{
features.push_back(make_shared<Matrix<ElemType>>(0));