Merge branch 'qiwye/asgd-dev' of https://github.com/Microsoft/CNTK into qiwye/asgd-dev

2016-10-10 13:04:06 +08:00 · 2016-10-10 13:04:06 +08:00 · ebefc5ade5
--- a/.gitignore
+++ b/.gitignore
@ -199,3 +199,12 @@ Tests/EndToEndTests/UnitTests/MathTests/MS.txt
 Dependencies/CNTKCustomMKL/Publish
 Dependencies/CNTKCustomMKL/CNTKCustomMKL-Linux-*.tgz
 Dependencies/CNTKCustomMKL/CNTKCustomMKL-Windows-*.zip
+
+# Python bindings
+bindings/python/_cntk_py.pyd
+bindings/python/cntk.egg-info/
+bindings/python/cntk/cntk_py.py
+bindings/python/cntk/libs/
+bindings/python/cntk/cntk_py_wrap.cpp
+bindings/python/cntk/cntk_py_wrap.h
+bindings/python/dist/
--- a/CNTK.sln
+++ b/CNTK.sln
@ -174,7 +174,6 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "DoublePrecision", "DoublePr
 EndProject
 Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Kaldi2Reader", "Kaldi2Reader", "{C70E1572-20FF-496C-A0A9-10AA6755A07C}"
 	ProjectSection(SolutionItems) = preProject
-		Source\Readers\Kaldi2Reader\basetypes.h = Source\Readers\Kaldi2Reader\basetypes.h
 		Source\Readers\Kaldi2Reader\biggrowablevectors.h = Source\Readers\Kaldi2Reader\biggrowablevectors.h
 		Source\Readers\Kaldi2Reader\chunkevalsource.h = Source\Readers\Kaldi2Reader\chunkevalsource.h
 		Source\Readers\Kaldi2Reader\DataReader.cpp = Source\Readers\Kaldi2Reader\DataReader.cpp
--- a/Examples/Evaluation/CPPEvalClient/CPPEvalClient.cpp
+++ b/Examples/Evaluation/CPPEvalClient/CPPEvalClient.cpp
@ -46,22 +46,22 @@ int main(int argc, char* argv[])
    path = (pos == std::string::npos) ? "." : app.substr(0, pos);

    // This relative path assumes launching from CNTK's binary folder, e.g. x64\Release
-    const std::string modelWorkingDirectory = path + "/../../Examples/Image/MNIST/Data/";
+    const std::string modelWorkingDirectory = path + "/../../Examples/Image/GettingStarted";
 #else // on Linux
    pos = app.rfind("/");
    path = (pos == std::string::npos) ? "." : app.substr(0, pos);

    // This relative path assumes launching from CNTK's binary folder, e.g. build/cpu/release/bin/
-    const std::string modelWorkingDirectory = path + "/../../../../Examples/Image/MNIST/Data/";
+    const std::string modelWorkingDirectory = path + "/../../../../Examples/Image/GettingStarted";
 #endif
-    const std::string modelFilePath = modelWorkingDirectory + "../Output/Models/01_OneHidden";
+    const std::string modelFilePath = modelWorkingDirectory + "/Output/Models/01_OneHidden";

    try
    {
        struct stat statBuf;
        if (stat(modelFilePath.c_str(), &statBuf) != 0)
        {
-            fprintf(stderr, "Error: The model %s does not exist. Please follow instructions in README.md in <CNTK>/Examples/Image/MNIST to create the model.\n", modelFilePath.c_str());
+            fprintf(stderr, "Error: The model %s does not exist. Please follow instructions in README.md in <CNTK>/Examples/Image/GettingStarted to create the model.\n", modelFilePath.c_str());
            return(1);
        }

--- a/Examples/Evaluation/CPPEvalV2Client/EvalMultithreads.cpp
+++ b/Examples/Evaluation/CPPEvalV2Client/EvalMultithreads.cpp
@ -1,3 +1,9 @@
+//
+// Copyright (c) Microsoft. All rights reserved.
+// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
+//
+// EvalMultithreads.cpp : Sample application showing how to evaluate a model in a multi-threaded environment.
+//
 #include <functional>
 #include <thread>
 #include <iostream>
@ -5,108 +11,23 @@

 using namespace CNTK;

-FunctionPtr FullyConnectedDNNLayerWithSharedParameters(Variable input,
-                                                       const Parameter& timesParam,
-                                                       const Parameter& plusParam,
-                                                       const std::function<FunctionPtr(const FunctionPtr&)>& nonLinearity)
-{
-    assert(input.Shape().Rank() == 1);
+void OutputFunctionInfo(FunctionPtr);
+FunctionPtr FullyConnectedDNNLayerWithSharedParameters(Variable, const Parameter&, const Parameter&, const std::function<FunctionPtr(const FunctionPtr&)>&);
+void CreateFunctionAndEvaluateWithSharedParameters(size_t, size_t, size_t, const Parameter&, const Parameter&, const Parameter[], const Parameter[], const Parameter&, const DeviceDescriptor&);
+FunctionPtr SetupFullyConnectedLinearLayer(Variable, size_t, const DeviceDescriptor&, const std::wstring&);
+FunctionPtr SetupFullyConnectedDNNLayer(Variable, size_t, const DeviceDescriptor& device, const std::function<FunctionPtr(const FunctionPtr&)>& nonLinearity);
+void RunEvaluationClassifier(FunctionPtr, const DeviceDescriptor&);
+void RunEvaluationOneHidden(FunctionPtr, const DeviceDescriptor&);

-    // Todo: assume that timesParam has matched outputDim and inputDim 
-    auto timesFunction = Times(timesParam, input);
-
-    // Todo: assume that timesParam has matched outputDim 
-    auto plusFunction = Plus(plusParam, timesFunction);
-
-    return nonLinearity(plusFunction);
-}
-
-FunctionPtr FullyConnectedFeedForwardClassifierNetWithSharedParameters(Variable input,
-                                                                       size_t numHiddenLayers,
-                                                                       const Parameter& inputTimesParam,
-                                                                       const Parameter& inputPlusParam,
-                                                                       const Parameter hiddenLayerTimesParam[],
-                                                                       const Parameter hiddenLayerPlusParam[],
-                                                                       const Parameter& outputTimesParam,
-                                                                       const std::function<FunctionPtr(const FunctionPtr&)>& nonLinearity)
-{
-    assert(numHiddenLayers >= 1);
-    auto classifierRoot = FullyConnectedDNNLayerWithSharedParameters(input, inputTimesParam, inputPlusParam, nonLinearity);
-
-    for (size_t i = 1; i < numHiddenLayers; ++i)
-        classifierRoot = FullyConnectedDNNLayerWithSharedParameters(classifierRoot, hiddenLayerTimesParam[i - 1], hiddenLayerPlusParam[i - 1], nonLinearity);
-
-    // Todo: assume that outputTimesParam has matched output dim and hiddenLayerDim
-    classifierRoot = Times(outputTimesParam, classifierRoot);
-    return classifierRoot;
-}
-
-void EvaluationNewNetworkWithSharedParameters(size_t inputDim,
-                                              size_t numOutputClasses,
-                                              size_t numHiddenLayers,
-                                              const Parameter& inputTimesParam,
-                                              const Parameter& inputPlusParam,
-                                              const Parameter hiddenLayerTimesParam[],
-                                              const Parameter hiddenLayerPlusParam[],
-                                              const Parameter& outputTimesParam,
-                                              const DeviceDescriptor& computeDevice)
-{
-    using namespace std::placeholders;
-
-    // Create network using shared parameters
-    auto inputVar = InputVariable({inputDim}, DataType::Float, L"Features");
-    auto classifierOutputFunction = FullyConnectedFeedForwardClassifierNetWithSharedParameters(inputVar,
-                                                                                               numHiddenLayers,
-                                                                                               inputTimesParam,
-                                                                                               inputPlusParam,
-                                                                                               hiddenLayerTimesParam,
-                                                                                               hiddenLayerPlusParam,
-                                                                                               outputTimesParam,
-                                                                                               std::bind(Sigmoid, _1, L""));
-
-    auto labelsVar = InputVariable({numOutputClasses}, DataType::Float, L"Labels");
-    auto trainingLossFunction = CNTK::CrossEntropyWithSoftmax(classifierOutputFunction, labelsVar, L"LossFunction");
-    auto predictionFunction = CNTK::ClassificationError(classifierOutputFunction, labelsVar, L"ClassificationError");
-
-    auto ffNet = CNTK::Combine({trainingLossFunction, predictionFunction, classifierOutputFunction}, L"ClassifierModel");
-
-    if (ffNet->Parameters().size() != ((numHiddenLayers * 2) + 1))
-        throw std::runtime_error("EvaluationNewNetworkWithSharedParameters: Function does not have expected Parameter count");
-
-    if (ffNet->Arguments().size() != 2)
-        throw std::runtime_error("EvaluationNewNetworkWithSharedParameters: Function does not have expected Argument count");
-
-    if (ffNet->Outputs().size() != 3)
-        throw std::runtime_error("EvaluationNewNetworkWithSharedParameters: Function does not have expected Output count");
-
-    // Evaluate the network in several runs 
-    size_t iterationCount = 4;
-    unsigned int randSeed = 2;
-    srand(randSeed);
-    size_t numSamples = 3;
-    for (size_t t = 0; t < iterationCount; ++t)
-    {
-        std::vector<float> inputData(inputDim * numSamples);
-        for (size_t i = 0; i < inputData.size(); ++i)
-            inputData[i] = ((float)rand()) / RAND_MAX;
-
-        NDShape inputShape = {inputDim, 1, numSamples};
-        ValuePtr inputValue = MakeSharedObject<Value>(MakeSharedObject<NDArrayView>(inputShape, inputData.data(), inputData.size(), DeviceDescriptor::CPUDevice(), true));
-
-        std::vector<float> labelData(numOutputClasses * numSamples, 0);
-        for (size_t i = 0; i < numSamples; ++i)
-            labelData[(i*numOutputClasses) + (rand() % numOutputClasses)] = 1;
-
-        NDShape labelShape = {numOutputClasses, 1, numSamples};
-        ValuePtr labelValue = MakeSharedObject<Value>(MakeSharedObject<NDArrayView>(labelShape, labelData.data(), labelData.size(), DeviceDescriptor::CPUDevice(), true));
-
-        ValuePtr outputValue, predictionErrorValue;
-        std::unordered_map<Variable, ValuePtr> outputs = {{classifierOutputFunction->Output(), outputValue}, {predictionFunction->Output(), predictionErrorValue}};
-        ffNet->Forward({{inputVar, inputValue}, {labelsVar, labelValue}}, outputs, computeDevice);
-    }
-}
-
-void EvalMultiThreadsWithNewNetwork(const DeviceDescriptor& device, const int threadCount)
+/// <summary>
+/// Shows how to create a Function whose parameters can be shared by multiple evaluation threads.
+/// </summary>
+/// <description>
+/// It first creates all parameters needed for the Function, and then spawns multiple threads.
+/// Although each thread creates a new instance of the function, all threads share the same parameters.
+/// After that, each thread runs evaluation independently.
+/// </description>
+void MultiThreadsEvaluationWithNewFunction(const DeviceDescriptor& device, const int threadCount)
 {
    const size_t inputDim = 937;
    const size_t numOutputClasses = 9304;
@ -136,7 +57,7 @@ void EvalMultiThreadsWithNewNetwork(const DeviceDescriptor& device, const int th
    std::vector<std::thread> threadList(threadCount);
    for (int th = 0; th < threadCount; ++th)
    {
-        threadList[th] = std::thread(EvaluationNewNetworkWithSharedParameters, inputDim, numOutputClasses, numHiddenLayers, inputTimesParam, inputPlusParam, hiddenLayerTimesParam, hiddenLayerPlusParam, outputTimesParam, device);
+        threadList[th] = std::thread(CreateFunctionAndEvaluateWithSharedParameters, inputDim, numOutputClasses, numHiddenLayers, inputTimesParam, inputPlusParam, hiddenLayerTimesParam, hiddenLayerPlusParam, outputTimesParam, device);
    }

    for (int th = 0; th < threadCount; ++th)
@ -146,3 +67,433 @@ void EvalMultiThreadsWithNewNetwork(const DeviceDescriptor& device, const int th
        fflush(stderr);
    }
 }
+
+/// <summary>
+/// Shows how to use Clone() to share function parameters among multiple evaluation threads.
+/// </summary>
+/// <description>
+/// It first creates a new function with parameters, then spawns multiple threads. Each thread receives its own Clone() of the
+/// function and uses that instance to do evaluation.
+/// All cloned functions share the same parameters.
+/// </description>
+void MultiThreadsEvaluationWithClone(const DeviceDescriptor& device, const int threadCount)
+{
+    using namespace std::placeholders;
+
+    const size_t inputDim = 937;
+    const size_t numOutputClasses = 9304;
+    const size_t numHiddenLayers = 6;
+    const size_t hiddenLayersDim = 2048;
+
+    auto inputVar = InputVariable({inputDim}, DataType::Float, L"features");
+
+    assert(numHiddenLayers >= 1);
+    auto classifierRoot = SetupFullyConnectedDNNLayer(inputVar, hiddenLayersDim, device, std::bind(Sigmoid, _1, L""));
+    for (size_t i = 1; i < numHiddenLayers; ++i)
+    {
+        classifierRoot = SetupFullyConnectedDNNLayer(classifierRoot, hiddenLayersDim, device, std::bind(Sigmoid, _1, L""));
+    }
+
+    auto outputTimesParam = Parameter(NDArrayView::RandomUniform<float>({numOutputClasses, hiddenLayersDim}, -0.5, 0.5, 1, device));
+    auto classifierFunc = Times(outputTimesParam, classifierRoot, 1, L"classifierOutput");
+
+    // Now test the structure
+    if (classifierFunc->Parameters().size() != ((numHiddenLayers * 2) + 1))
+    {
+        throw std::runtime_error("MultiThreadsEvaluationWithClone: Function does not have expected Parameter count");
+    }
+
+    OutputFunctionInfo(classifierFunc);
+    fprintf(stderr, "MultiThreadsEvaluationWithClone on device=%d\n", device.Id());
+
+    // Run evaluation in parallel
+    std::vector<std::thread> threadList(threadCount);
+    for (int th = 0; th < threadCount; ++th)
+    {
+        threadList[th] = std::thread(RunEvaluationClassifier, classifierFunc->Clone(), device);
+    }
+
+    for (int th = 0; th < threadCount; ++th)
+    {
+        threadList[th].join();
+        fprintf(stderr, "thread %d joined.\n", th);
+        fflush(stderr);
+    }
+}
+
+/// <summary>
+/// Shows how to use LoadLegacyModel() and Clone() to share function parameters among multiple evaluation threads.
+/// </summary>
+/// <description>
+/// It first loads a model, then spawns multiple threads. Each thread receives its own Clone() of the loaded
+/// function and uses that instance to do evaluation.
+/// All cloned functions share the same parameters.
+/// </description>
+void MultiThreadsEvaluationWithLoadModel(const DeviceDescriptor& device, const int threadCount)
+{
+    // The model file will be trained and copied to the current runtime directory first.
+    auto modelFuncPtr = CNTK::LoadLegacyModel(DataType::Float, L"01_OneHidden", device);
+
+
+    OutputFunctionInfo(modelFuncPtr);
+    fprintf(stderr, "MultiThreadsEvaluationWithLoadModel on device=%d\n", device.Id());
+
+    // Run evaluation in parallel.
+    std::vector<std::thread> threadList(threadCount);
+    for (int th = 0; th < threadCount; ++th)
+    {
+        threadList[th] = std::thread(RunEvaluationOneHidden, modelFuncPtr->Clone(), device);
+    }
+
+    for (int th = 0; th < threadCount; ++th)
+    {
+        threadList[th].join();
+        fprintf(stderr, "thread %d joined.\n", th);
+        fflush(stderr);
+    }
+}
+
+inline FunctionPtr FullyConnectedDNNLayerWithSharedParameters(Variable input,
+                                                              const Parameter& timesParam,
+                                                              const Parameter& plusParam,
+                                                              const std::function<FunctionPtr(const FunctionPtr&)>& nonLinearity)
+{
+    assert(input.Shape().Rank() == 1);
+
+    // Todo: assume that timesParam has matched outputDim and inputDim 
+    auto timesFunction = Times(timesParam, input);
+
+    // Todo: assume that plusParam has matched outputDim
+    auto plusFunction = Plus(plusParam, timesFunction);
+
+    return nonLinearity(plusFunction);
+}
+
+inline FunctionPtr FullyConnectedFeedForwardClassifierNetWithSharedParameters(Variable input,
+                                                                              size_t numHiddenLayers,
+                                                                              const Parameter& inputTimesParam,
+                                                                              const Parameter& inputPlusParam,
+                                                                              const Parameter hiddenLayerTimesParam[],
+                                                                              const Parameter hiddenLayerPlusParam[],
+                                                                              const Parameter& outputTimesParam,
+                                                                              const std::function<FunctionPtr(const FunctionPtr&)>& nonLinearity)
+{
+    assert(numHiddenLayers >= 1);
+    auto classifierRoot = FullyConnectedDNNLayerWithSharedParameters(input, inputTimesParam, inputPlusParam, nonLinearity);
+
+    for (size_t i = 1; i < numHiddenLayers; ++i)
+    {
+        classifierRoot = FullyConnectedDNNLayerWithSharedParameters(classifierRoot, hiddenLayerTimesParam[i - 1], hiddenLayerPlusParam[i - 1], nonLinearity);
+    }
+
+    // Todo: assume that outputTimesParam has matched output dim and hiddenLayerDim
+    classifierRoot = Times(outputTimesParam, classifierRoot);
+    return classifierRoot;
+}
+
+void CreateFunctionAndEvaluateWithSharedParameters(size_t inputDim,
+                                                   size_t numOutputClasses,
+                                                   size_t numHiddenLayers,
+                                                   const Parameter& inputTimesParam,
+                                                   const Parameter& inputPlusParam,
+                                                   const Parameter hiddenLayerTimesParam[],
+                                                   const Parameter hiddenLayerPlusParam[],
+                                                   const Parameter& outputTimesParam,
+                                                   const DeviceDescriptor& computeDevice)
+{
+    using namespace std::placeholders;
+
+    // Create network using shared parameters
+    auto inputVar = InputVariable({inputDim}, DataType::Float, L"Features");
+    auto classifierOutputFunction = FullyConnectedFeedForwardClassifierNetWithSharedParameters(inputVar,
+                                                                                               numHiddenLayers,
+                                                                                               inputTimesParam,
+                                                                                               inputPlusParam,
+                                                                                               hiddenLayerTimesParam,
+                                                                                               hiddenLayerPlusParam,
+                                                                                               outputTimesParam,
+                                                                                               std::bind(Sigmoid, _1, L""));
+
+    auto labelsVar = InputVariable({numOutputClasses}, DataType::Float, L"Labels");
+    auto trainingLossFunction = CNTK::CrossEntropyWithSoftmax(classifierOutputFunction, labelsVar, L"LossFunction");
+    auto predictionFunction = CNTK::ClassificationError(classifierOutputFunction, labelsVar, L"ClassificationError");
+
+    auto ffNet = CNTK::Combine({trainingLossFunction, predictionFunction, classifierOutputFunction}, L"ClassifierModel");
+
+    if (ffNet->Parameters().size() != ((numHiddenLayers * 2) + 1))
+    {
+        throw std::runtime_error("CreateFunctionAndEvaluateWithSharedParameters: Function does not have expected Parameter count");
+    }
+
+    if (ffNet->Arguments().size() != 2)
+    {
+        throw std::runtime_error("CreateFunctionAndEvaluateWithSharedParameters: Function does not have expected Argument count");
+    }
+
+    if (ffNet->Outputs().size() != 3)
+    {
+        throw std::runtime_error("CreateFunctionAndEvaluateWithSharedParameters: Function does not have expected Output count");
+    }
+
+    // Evaluate the network in several runs 
+    size_t iterationCount = 4;
+    unsigned int randSeed = 2;
+    srand(randSeed);
+    size_t numSamples = 3;
+    for (size_t t = 0; t < iterationCount; ++t)
+    {
+        std::vector<float> inputData(inputDim * numSamples);
+        for (size_t i = 0; i < inputData.size(); ++i)
+        {
+            inputData[i] = ((float)rand()) / RAND_MAX;
+        }
+
+        NDShape inputShape = {inputDim, 1, numSamples};
+        ValuePtr inputValue = MakeSharedObject<Value>(MakeSharedObject<NDArrayView>(inputShape, inputData.data(), inputData.size(), DeviceDescriptor::CPUDevice(), true));
+
+        std::vector<float> labelData(numOutputClasses * numSamples, 0);
+        for (size_t i = 0; i < numSamples; ++i)
+        {
+            labelData[(i*numOutputClasses) + (rand() % numOutputClasses)] = 1;
+        }
+
+        NDShape labelShape = {numOutputClasses, 1, numSamples};
+        ValuePtr labelValue = MakeSharedObject<Value>(MakeSharedObject<NDArrayView>(labelShape, labelData.data(), labelData.size(), DeviceDescriptor::CPUDevice(), true));
+
+        ValuePtr outputValue, predictionErrorValue;
+        std::unordered_map<Variable, ValuePtr> outputs = {{classifierOutputFunction->Output(), outputValue}, {predictionFunction->Output(), predictionErrorValue}};
+        ffNet->Forward({{inputVar, inputValue}, {labelsVar, labelValue}}, outputs, computeDevice);
+    }
+}
+
+
+inline FunctionPtr SetupFullyConnectedLinearLayer(Variable input, size_t outputDim, const DeviceDescriptor& device, const std::wstring& outputName = L"")
+{
+    assert(input.Shape().Rank() == 1);
+    size_t inputDim = input.Shape()[0];
+
+    auto timesParam = CNTK::Parameter(CNTK::NDArrayView::RandomUniform<float>({outputDim, inputDim}, -0.05, 0.05, 1, device));
+    auto timesFunction = CNTK::Times(timesParam, input);
+
+    auto plusParam = CNTK::Parameter(CNTK::NDArrayView::RandomUniform<float>({outputDim}, -0.05, 0.05, 1, device));
+    return CNTK::Plus(plusParam, timesFunction, outputName);
+}
+
+inline FunctionPtr SetupFullyConnectedDNNLayer(Variable input, size_t outputDim, const DeviceDescriptor& device, const std::function<FunctionPtr(const FunctionPtr&)>& nonLinearity)
+{
+    return nonLinearity(SetupFullyConnectedLinearLayer(input, outputDim, device));
+}
+
+void OutputFunctionInfo(FunctionPtr func)
+{
+    auto inputVariables = func->Arguments();
+    fprintf(stderr, "Function %S: Input Variables (count=%lu)\n", func->Name().c_str(), inputVariables.size());
+    for_each(inputVariables.begin(), inputVariables.end(), [](const Variable v) {
+        fprintf(stderr, "    name=%S, kind=%d\n", v.Name().c_str(), v.Kind());
+    });
+
+    auto outputVariables = func->Outputs();
+    fprintf(stderr, "Function %S: Output Variables (count=%lu)\n", func->Name().c_str(), outputVariables.size());
+    for_each(outputVariables.begin(), outputVariables.end(), [](const Variable v) {
+        fprintf(stderr, "    name=%S, kind=%d\n", v.Name().c_str(), v.Kind());
+    });
+}
+
+bool GetVariableByName(std::vector<Variable> variableLists, std::wstring varName, Variable& var)
+{
+    for (std::vector<Variable>::iterator it = variableLists.begin(); it != variableLists.end(); ++it)
+    {
+        if (it->Name().compare(varName) == 0)
+        {
+            var = *it;
+            return true;
+        }
+    }
+    return false;
+}
+
+inline bool GetInputVariableByName(FunctionPtr evalFunc, std::wstring varName, Variable& var)
+{
+    return GetVariableByName(evalFunc->Arguments(), varName, var);
+}
+
+inline bool GetOutputVaraiableByName(FunctionPtr evalFunc, std::wstring varName, Variable& var)
+{
+    return GetVariableByName(evalFunc->Outputs(), varName, var);
+}
+
+void RunEvaluationClassifier(FunctionPtr evalFunc, const DeviceDescriptor& device)
+{
+    const std::wstring inputNodeName = L"features";
+
+    Variable inputVar;
+    if (!GetInputVariableByName(evalFunc, inputNodeName, inputVar))
+    {
+        fprintf(stderr, "Input variable %S is not available.\n", inputNodeName.c_str());
+        throw("Input variable not found error.");
+    }
+
+    // Evaluate the network in several runs 
+    size_t iterationCount = 4;
+    unsigned int randSeed = 2;
+    srand(randSeed);
+    size_t numSamples = 3;
+    std::vector<float> inputData(inputVar.Shape().TotalSize() * numSamples);
+    for (size_t t = 0; t < iterationCount; ++t)
+    {
+        for (size_t i = 0; i < inputData.size(); ++i)
+        {
+            inputData[i] = ((float)rand()) / RAND_MAX;
+        }
+
+        // Create input data shape. Adding sequence length and numSamples as axes.
+        // Todo: remove sequence length when only numSamples is supported.
+        // Todo: add convenience APIs to simplify data preparation here.
+        NDShape inputShape = inputVar.Shape().AppendShape({1, numSamples});
+        ValuePtr inputValue = MakeSharedObject<Value>(MakeSharedObject<NDArrayView>(inputShape, inputData, true));
+
+        // Define output.
+        ValuePtr outputValue;
+        auto outputVar = evalFunc->Output();
+        std::unordered_map<Variable, ValuePtr> outputs = {{outputVar, outputValue}};
+
+        // Evaluate the model
+        evalFunc->Forward({{inputVar, inputValue}}, outputs, device);
+
+        // Get output value
+        outputValue = outputs[outputVar];
+
+        // Todo: remove sequence length when only numSamples is supported.
+        // Todo: add convenience APIs to simplify retrieval of output results.
+        NDShape outputShape = outputVar.Shape().AppendShape({1, numSamples});
+        std::vector<float> outputData(outputShape.TotalSize());
+        NDArrayViewPtr cpuArrayOutput = MakeSharedObject<NDArrayView>(outputShape, outputData, false);
+        cpuArrayOutput->CopyFrom(*outputValue->Data());
+
+        assert(outputData.size() == outputVar.Shape()[0] * numSamples);
+        fprintf(stderr, "Evaluation result:\n");
+        size_t dataIndex = 0;
+        auto outputDim = outputVar.Shape()[0];
+        for (size_t i = 0; i < numSamples; i++)
+        {
+            fprintf(stderr, "Iteration:%lu, Sample %lu:\n", t, i);
+            fprintf(stderr, "    ");
+            dataIndex = i * outputDim;
+            for (size_t j = 0; j < std::min((size_t)10, outputDim); j++)
+            {
+                fprintf(stderr, "%f ", outputData[dataIndex++]);
+            }
+            if (outputDim > 10)
+            {
+                fprintf(stderr, "...");
+            }
+            fprintf(stderr, "\n");
+        }
+    }
+}
+
+void RunEvaluationOneHidden(FunctionPtr evalFunc, const DeviceDescriptor& device)
+{
+    const std::wstring inputNodeName = L"features";
+    const std::wstring outputNodeName = L"out.z_output";
+
+    Variable inputVar;
+    if (!GetInputVariableByName(evalFunc, inputNodeName, inputVar))
+    {
+        fprintf(stderr, "Input variable %S is not available.\n", inputNodeName.c_str());
+        throw("Input variable not found error.");
+    }
+
+    Variable outputVar;
+    if (!GetOutputVaraiableByName(evalFunc, outputNodeName, outputVar))
+    {
+        fprintf(stderr, "Output variable %S is not available.\n", outputNodeName.c_str());
+        throw("Output variable not found error.");
+    }
+
+    // Evaluate the network in several runs 
+    size_t iterationCount = 4;   
+    size_t numSamples = 3;
+    for (size_t t = 0; t < iterationCount; ++t)
+    {
+        std::vector<float> inputData(inputVar.Shape().TotalSize() * numSamples);
+        for (size_t i = 0; i < inputData.size(); ++i)
+        {
+            inputData[i] = static_cast<float>(i % 255);
+        }
+
+        NDShape inputShape = inputVar.Shape().AppendShape({1, numSamples});
+        ValuePtr inputValue = MakeSharedObject<Value>(MakeSharedObject<NDArrayView>(inputShape, inputData, true));
+
+        ValuePtr outputValue;
+        std::unordered_map<Variable, ValuePtr> outputs = {{outputVar, outputValue}};
+        evalFunc->Forward({{inputVar, inputValue}}, outputs, device);
+
+        outputValue = outputs[outputVar];        
+        NDShape outputShape = outputVar.Shape().AppendShape({1, numSamples});
+        std::vector<float> outputData(outputShape.TotalSize());
+        NDArrayViewPtr cpuArrayOutput = MakeSharedObject<NDArrayView>(outputShape, outputData, false);
+        cpuArrayOutput->CopyFrom(*outputValue->Data());
+
+        assert(outputData.size() == outputVar.Shape()[0] * numSamples);
+        fprintf(stderr, "Evaluation result:\n");
+        size_t dataIndex = 0;
+        auto outputDim = outputVar.Shape()[0];
+        for (size_t i = 0; i < numSamples; i++)
+        {
+            fprintf(stderr, "Iteration:%lu, Sample %lu:\n", t, i);
+            fprintf(stderr, "Ouput:");
+            for (size_t j = 0; j < outputDim; j++)
+            {
+                fprintf(stderr, "%f ", outputData[dataIndex++]);
+            }
+            fprintf(stderr, "\n");
+        }
+    }
+}
+
+void MultiThreadsEvaluation(bool isGPUAvailable)
+{
+#ifndef CPUONLY
+    if (isGPUAvailable)
+    {
+        fprintf(stderr, "Run evaluation on GPU device using GPU build.\n");
+    }
+    else
+    {
+        fprintf(stderr, "Run evaluation on CPU device using GPU build.\n");
+    }
+#else
+    fprintf(stderr, "Run evaluation using CPU-only build.\n");
+#endif
+
+    // Test multi-threads evaluation with new function
+    fprintf(stderr, "Test multi-threaded evaluation with new function on CPU.\n");
+    MultiThreadsEvaluationWithNewFunction(DeviceDescriptor::CPUDevice(), 2);
+    if (isGPUAvailable)
+    {
+        fprintf(stderr, "Test multi-threaded evaluation with new function on GPU\n");
+        MultiThreadsEvaluationWithNewFunction(DeviceDescriptor::GPUDevice(0), 2);
+    }
+
+    // Test multi-threads evaluation using clone.
+    fprintf(stderr, "Test multi-threaded evaluation using clone on CPU.\n");
+    MultiThreadsEvaluationWithClone(DeviceDescriptor::CPUDevice(), 2);
+    if (isGPUAvailable)
+    {
+        fprintf(stderr, "Test multi-threaded evaluation using clone on GPU.\n");
+        MultiThreadsEvaluationWithClone(DeviceDescriptor::GPUDevice(0), 2);
+    }
+
+    // test multi-threads evaluation with loading existing models
+    fprintf(stderr, "Test multi-threaded evaluation with loading existing models on CPU.\n");
+    MultiThreadsEvaluationWithLoadModel(DeviceDescriptor::CPUDevice(), 2);
+    if (isGPUAvailable)
+    {
+        fprintf(stderr, "Test multi-threaded evaluation with loading existing models on GPU.\n");
+        MultiThreadsEvaluationWithLoadModel(DeviceDescriptor::GPUDevice(0), 2);
+    }
+
+    fflush(stderr);
+
+}
--- a/Examples/Evaluation/CSEvalClient/CSEvalClient.csproj
+++ b/Examples/Evaluation/CSEvalClient/CSEvalClient.csproj
@ -49,7 +49,7 @@
  </PropertyGroup>
  <ItemGroup>
    <Reference Include="EvalWrapper, Version=0.0.0.0, Culture=neutral, processorArchitecture=AMD64">
-      <HintPath>..\packages\Microsoft.Research.CNTK.CpuEval-mkl.1.7.1\lib\net45\x64\EvalWrapper.dll</HintPath>
+      <HintPath>..\packages\Microsoft.Research.CNTK.CpuEval-mkl.1.7.2\lib\net45\x64\EvalWrapper.dll</HintPath>
      <Private>True</Private>
    </Reference>
    <Reference Include="System" />
@ -85,11 +85,11 @@
    </BootstrapperPackage>
  </ItemGroup>
  <Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
-  <Import Project="..\packages\Microsoft.Research.CNTK.CpuEval-mkl.1.7.1\build\net45\Microsoft.Research.CNTK.CpuEval-mkl.targets" Condition="Exists('..\packages\Microsoft.Research.CNTK.CpuEval-mkl.1.7.1\build\net45\Microsoft.Research.CNTK.CpuEval-mkl.targets')" />
+  <Import Project="..\packages\Microsoft.Research.CNTK.CpuEval-mkl.1.7.2\build\net45\Microsoft.Research.CNTK.CpuEval-mkl.targets" Condition="Exists('..\packages\Microsoft.Research.CNTK.CpuEval-mkl.1.7.2\build\net45\Microsoft.Research.CNTK.CpuEval-mkl.targets')" />
  <Target Name="EnsureNuGetPackageBuildImports" BeforeTargets="PrepareForBuild">
    <PropertyGroup>
      <ErrorText>This project references NuGet package(s) that are missing on this computer. Enable NuGet Package Restore to download them.  For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}.</ErrorText>
    </PropertyGroup>
-    <Error Condition="!Exists('..\packages\Microsoft.Research.CNTK.CpuEval-mkl.1.7.1\build\net45\Microsoft.Research.CNTK.CpuEval-mkl.targets')" Text="$([System.String]::Format('$(ErrorText)', '..\packages\Microsoft.Research.CNTK.CpuEval-mkl.1.7.1\build\net45\Microsoft.Research.CNTK.CpuEval-mkl.targets'))" />
+    <Error Condition="!Exists('..\packages\Microsoft.Research.CNTK.CpuEval-mkl.1.7.2\build\net45\Microsoft.Research.CNTK.CpuEval-mkl.targets')" Text="$([System.String]::Format('$(ErrorText)', '..\packages\Microsoft.Research.CNTK.CpuEval-mkl.1.7.2\build\net45\Microsoft.Research.CNTK.CpuEval-mkl.targets'))" />
  </Target> 
 </Project>
--- a/Examples/Evaluation/CSEvalClient/Program.cs
+++ b/Examples/Evaluation/CSEvalClient/Program.cs
@ -30,8 +30,8 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
    /// 
    /// EvaluateModelSingleLayer and EvaluateModelMultipleLayers
    /// --------------------------------------------------------
-    /// These two cases require the 01_OneHidden model which is part of the <CNTK>/Examples/Image/MNIST example.
-    /// Refer to <see cref="https://github.com/Microsoft/CNTK/blob/master/Examples/Image/MNIST/README.md"/> for how to train
+    /// These two cases require the 01_OneHidden model which is part of the <CNTK>/Examples/Image/GettingStarted example.
+    /// Refer to <see cref="https://github.com/Microsoft/CNTK/blob/master/Examples/Image/GettingStarted/README.md"/> for how to train
    /// the model used in these examples.
    /// 
    /// EvaluateNetworkSingleLayer and EvaluateNetworkSingleLayerNoInput
@ -41,8 +41,8 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
    /// 
    /// EvaluateMultipleModels
    /// ----------------------
-    /// This case requires the 02_Convolution model and the Test-28x28_cntk_text.txt test file which are part of the <CNTK>/Examples/Image/MNIST example.
-    /// Refer to <see cref="https://github.com/Microsoft/CNTK/blob/master/Examples/Image/MNIST/README.md"/> for how to train
+    /// This case requires the 02_Convolution model and the Test-28x28_cntk_text.txt test file which are part of the <CNTK>/Examples/Image/GettingStarted example.
+    /// Refer to <see cref="https://github.com/Microsoft/CNTK/blob/master/Examples/Image/GettingStarted/README.md"/> for how to train
    /// the model used in this example.
    /// 
    /// EvaluateImageClassificationModel
@ -142,15 +142,15 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient

                // The examples assume the executable is running from the data folder
                // We switch the current directory to the data folder (assuming the executable is in the <CNTK>/x64/Debug|Release folder
-                Environment.CurrentDirectory = Path.Combine(initialDirectory, @"..\..\Examples\Image\MNIST\Data\");
+                Environment.CurrentDirectory = Path.Combine(initialDirectory, @"..\..\Examples\Image\GettingStarted");
                List<float> outputs;

                using (var model = new IEvaluateModelManagedF())
                {
                    // Load model
-                    string modelFilePath = Path.Combine(Environment.CurrentDirectory, @"..\Output\Models\01_OneHidden");
+                    string modelFilePath = Path.Combine(Environment.CurrentDirectory, @".\Output\Models\01_OneHidden");
                    ThrowIfFileNotExist(modelFilePath, 
-                        string.Format("Error: The model '{0}' does not exist. Please follow instructions in README.md in <CNTK>/Examples/Image/MNIST to create the model.", modelFilePath));
+                        string.Format("Error: The model '{0}' does not exist. Please follow instructions in README.md in <CNTK>/Examples/Image/GettingStarted to create the model.", modelFilePath));

                    model.CreateNetwork(string.Format("modelPath=\"{0}\"", modelFilePath), deviceId: -1);

@ -189,7 +189,7 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
            {
                // The examples assume the executable is running from the data folder
                // We switch the current directory to the data folder (assuming the executable is in the <CNTK>/x64/Debug|Release folder
-                Environment.CurrentDirectory = Path.Combine(initialDirectory, @"..\..\Examples\Image\MNIST\Data\");
+                Environment.CurrentDirectory = Path.Combine(initialDirectory, @"..\..\Examples\Image\GettingStarted");

                Dictionary<string, List<float>> outputs;

@ -200,9 +200,9 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
                    const string outputLayerName = "out.z";

                    // Load model
-                    string modelFilePath = Path.Combine(Environment.CurrentDirectory, @"..\Output\Models\01_OneHidden");
+                    string modelFilePath = Path.Combine(Environment.CurrentDirectory, @".\Output\Models\01_OneHidden");
                    ThrowIfFileNotExist(modelFilePath,
-                        string.Format("Error: The model '{0}' does not exist. Please follow instructions in README.md in <CNTK>/Examples/Image/MNIST to create the model.", modelFilePath));
+                        string.Format("Error: The model '{0}' does not exist. Please follow instructions in README.md in <CNTK>/Examples/Image/GettingStarted to create the model.", modelFilePath));

                    var desiredOutputLayers = new List<string>() { hiddenLayerName, outputLayerName };
                    model.CreateNetwork(string.Format("modelPath=\"{0}\"", modelFilePath), deviceId: -1, outputNodeNames: desiredOutputLayers);
@ -395,19 +395,19 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient

            // The examples assume the executable is running from the data folder
            // We switch the current directory to the data folder (assuming the executable is in the <CNTK>/x64/Debug|Release folder
-            Environment.CurrentDirectory = Path.Combine(initialDirectory, @"..\..\Examples\Image\MNIST\Data\");
+            Environment.CurrentDirectory = Path.Combine(initialDirectory, @"..\..\Examples\Image\GettingStarted");

            // Load model
-            string modelFilePath = Path.Combine(Environment.CurrentDirectory, @"..\Output\Models\02_Convolution");
+            string modelFilePath = Path.Combine(Environment.CurrentDirectory, @".\Output\Models\02_OneConv");
            ThrowIfFileNotExist(modelFilePath, 
-                string.Format("Error: The model '{0}' does not exist. Please follow instructions in README.md in <CNTK>/Examples/Image/MNIST to create the model.", modelFilePath));
+                string.Format("Error: The model '{0}' does not exist. Please follow instructions in README.md in <CNTK>/Examples/Image/GettingStarted to create the model.", modelFilePath));

            // Initializes the model instances
            ModelEvaluator.Initialize(numConcurrentModels, modelFilePath);

-            string testfile = Path.Combine(Environment.CurrentDirectory, @"Test-28x28_cntk_text.txt");
+            string testfile = Path.Combine(Environment.CurrentDirectory, @"..\DataSets\MNIST\Test-28x28_cntk_text.txt");
            ThrowIfFileNotExist(testfile, 
-                string.Format("Error: The test file '{0}' does not exist. Please follow instructions in README.md in <CNTK>/Examples/Image/MNIST to download the data.", testfile));
+                string.Format("Error: The test file '{0}' does not exist. Please follow instructions in README.md in <CNTK>/Examples/Image/GettingStarted to download the data.", testfile));

            Stopwatch sw = new Stopwatch();
            sw.Start();
@ -475,9 +475,9 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
            {
                // This example requires the RestNet_18 model.
                // The model can be downloaded from <see cref="https://www.cntk.ai/resnet/ResNet_18.model"/>
-                // The model is assumed to be located at: <CNTK>\Examples\Image\Miscellaneous\ImageNet\ResNet 
+                // The model is assumed to be located at: <CNTK>\Examples\Image\Classification\ResNet 
                // along with a sample image file named "zebra.jpg".
-                string workingDirectory = Path.Combine(initialDirectory, @"..\..\Examples\Image\Miscellaneous\ImageNet\ResNet");
+                string workingDirectory = Path.Combine(initialDirectory, @"..\..\Examples\Image\Classification\ResNet");
                Environment.CurrentDirectory = initialDirectory;

                List<float> outputs;
@ -486,7 +486,7 @@ namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
                {
                    string modelFilePath = Path.Combine(workingDirectory, "ResNet_18.model");
                    ThrowIfFileNotExist(modelFilePath, 
-                        string.Format("Error: The model '{0}' does not exist. Please download the model from https://www.cntk.ai/resnet/ResNet_18.model and save it under ..\\..\\Examples\\Image\\Miscellaneous\\ImageNet\\ResNet.", modelFilePath));
+                        string.Format("Error: The model '{0}' does not exist. Please download the model from https://www.cntk.ai/resnet/ResNet_18.model and save it under ..\\..\\Examples\\Image\\Classification\\ResNet.", modelFilePath));
                        
                    model.CreateNetwork(string.Format("modelPath=\"{0}\"", modelFilePath), deviceId: -1);

--- a/Examples/Evaluation/CSEvalClient/packages.config
+++ b/Examples/Evaluation/CSEvalClient/packages.config
@ -1,4 +1,4 @@
 <?xml version="1.0" encoding="utf-8"?>
 <packages>
-  <package id="Microsoft.Research.CNTK.CpuEval-mkl" version="1.7.1" targetFramework="net45" />
+  <package id="Microsoft.Research.CNTK.CpuEval-mkl" version="1.7.2" targetFramework="net45" />
 </packages>
--- a/Examples/Image/Miscellaneous/ImageNet/AlexNet/AddTop5Layer.mel
+++ b/Examples/Image/Miscellaneous/ImageNet/AlexNet/AddTop5Layer.mel
--- a/Examples/Image/Miscellaneous/ImageNet/AlexNet/AlexNet.ndl
+++ b/Examples/Image/Miscellaneous/ImageNet/AlexNet/AlexNet.ndl
--- a/Examples/Image/Miscellaneous/ImageNet/AlexNet/AlexNet_ndl_deprecated.cntk
+++ b/Examples/Image/Miscellaneous/ImageNet/AlexNet/AlexNet_ndl_deprecated.cntk
--- a/Examples/Image/Miscellaneous/ImageNet/AlexNet/Macros.ndl
+++ b/Examples/Image/Miscellaneous/ImageNet/AlexNet/Macros.ndl
--- a/Examples/Image/Classification/ConvNet/ConvNet_CIFAR10.cntk
+++ b/Examples/Image/Classification/ConvNet/ConvNet_CIFAR10.cntk
@ -1,14 +1,15 @@
-# Simple CIFAR-10 convnet, without and with BatchNormalization.
+# ConvNet applied on CIFAR-10 dataset, with no data augmentation.

 command = TrainConvNet:Eval

-makeMode = false ; traceLevel = 1 ; deviceId = 0
+precision = "float"; traceLevel = 1 ; deviceId = "auto"

-RootDir = "." ; DataDir  = "$RootDir$" ; ModelDir = "$RootDir$/Output/Models"
+rootDir = "../.." ; dataDir = "$rootDir$/DataSets/CIFAR-10" ;
+outputDir = "./Output" ;

-modelPath = "$ModelDir$/ConvNet"
+modelPath = "$outputDir$/Models/ConvNet_CIFAR10"
+#stderr = "$outputDir$/ConvNet_CIFAR10_bs_out"

-# Training without BN
 TrainConvNet = {
    action = "train"

@ -16,19 +17,21 @@ TrainConvNet = {
        imageShape = 32:32:3
        labelDim = 10

-        Subtract128 (x) = x - Constant (128)
+        featMean = 128
+        featScale = 1/256
+        Normalize{m,f} = x => f .* (x - m)

        model = Sequential (
-            Subtract128 :
-            ConvolutionalLayer {32, (5:5), pad = true, activation = ReLU, init = 'glorotUniform', initValueScale=0.00390625} :
+            Normalize {featMean, featScale} :
+            ConvolutionalLayer {64, (3:3), pad = true} : ReLU : 
+            ConvolutionalLayer {64, (3:3), pad = true} : ReLU : 
              MaxPoolingLayer {(3:3), stride = (2:2)} :
-            ConvolutionalLayer {32, (5:5), pad = true, activation = ReLU, init = 'glorotUniform'} :
+            ConvolutionalLayer {64, (3:3), pad = true} : ReLU : 
+            ConvolutionalLayer {64, (3:3), pad = true} : ReLU : 
              MaxPoolingLayer {(3:3), stride = (2:2)} :
-            ConvolutionalLayer {64, (5:5), pad = true, activation = ReLU, init = 'glorotUniform'} :
-              MaxPoolingLayer {(3:3), stride = (2:2)} :
-            DenseLayer {64, activation = ReLU, init = 'glorotUniform', initValueScale=0.1} :
-              Dropout :
-            LinearLayer {labelDim, init = 'glorotUniform', initValueScale=0.1}
+            DenseLayer {256} : Dropout : ReLU : 
+            DenseLayer {128} : Dropout : ReLU : 
+            LinearLayer {labelDim}
        )

        # inputs
@ -51,20 +54,23 @@ TrainConvNet = {
    }

    SGD = {
-        epochSize = 49984 ; minibatchSize = 64
+        epochSize = 0
+        minibatchSize = 64

-        learningRatesPerSample = 0.00015625*10:0.000046875*10:0.000015625
-        momentumAsTimeConstant = 600*20:6400
+        learningRatesPerSample = 0.0015625*10:0.00046875*10:0.00015625
+        momentumAsTimeConstant = 0*20:6400
        maxEpochs = 30
-        L2RegWeight = 0.03
+        L2RegWeight = 0.002
        dropoutRate = 0*5:0.5

-        firstMBsToShowResult = 10 ; numMBsToShowResult = 500
+        numMBsToShowResult = 100
    }

    reader = {
        readerType = "CNTKTextFormatReader"
        file = "$DataDir$/Train_cntk_text.txt"
+		randomize = true
+		keepDataInMemory = true     # cache all data in memory 	 
        input = {
            features = { dim = 3072 ; format = "dense" }
            labels   = { dim = 10 ;   format = "dense" }
--- a/Examples/Image/Classification/ConvNet/ConvNet_CIFAR10_DataAug.cntk
+++ b/Examples/Image/Classification/ConvNet/ConvNet_CIFAR10_DataAug.cntk
@ -0,0 +1,109 @@
+# ConvNet applied on CIFAR-10 dataset, with data augmentation (translation and flipping).
+
+command = TrainConvNet:Eval
+
+precision = "float"; traceLevel = 1 ; deviceId = "auto"
+
+rootDir = "../.." ; dataDir = "$rootDir$/DataSets/CIFAR-10" ;
+outputDir = "./Output" ;
+
+modelPath = "$outputDir$/Models/ConvNet_CIFAR10_DataAug"
+#stderr = "$outputDir$/ConvNet_CIFAR10_DataAug_bs_out"
+
+TrainConvNet = {
+    action = "train"
+
+    BrainScriptNetworkBuilder = {
+        imageShape = 32:32:3
+        labelDim = 10
+
+		featMean = 128
+        featScale = 1/256
+        Normalize{m,f} = x => Constant(f) .* (x - Constant(m))
+
+        model = Sequential (
+            Normalize {featMean, featScale} :
+            ConvolutionalLayer {64, (3:3), pad = true} : ReLU : 
+            ConvolutionalLayer {64, (3:3), pad = true} : ReLU : 
+              MaxPoolingLayer {(3:3), stride = (2:2)} :
+            ConvolutionalLayer {64, (3:3), pad = true} : ReLU : 
+            ConvolutionalLayer {64, (3:3), pad = true} : ReLU : 
+              MaxPoolingLayer {(3:3), stride = (2:2)} :
+            DenseLayer {256} : Dropout : ReLU : 
+            DenseLayer {128} : Dropout : ReLU : 
+            LinearLayer {labelDim}
+        )
+
+        # inputs
+        features = Input {imageShape}
+        labels   = Input {labelDim}
+
+        # apply model to features
+        z = model (features)
+
+        # connect to system
+        ce       = CrossEntropyWithSoftmax     (labels, z)
+        errs     = ClassificationError         (labels, z)
+        top5Errs = ClassificationError         (labels, z, topN=5)  # only used in Eval action
+
+        featureNodes    = (features)
+        labelNodes      = (labels)
+        criterionNodes  = (ce)
+        evaluationNodes = (errs)  # top5Errs only used in Eval
+        outputNodes     = (z)
+    }
+
+    SGD = {
+        epochSize = 0
+        minibatchSize = 64
+
+        learningRatesPerSample = 0.0015625*20:0.00046875*20:0.00015625*20:0.000046875*10:0.000015625
+        momentumAsTimeConstant = 0*20:600*20:6400
+        maxEpochs = 80
+        L2RegWeight = 0.002
+        dropoutRate = 0*5:0.5
+
+        numMBsToShowResult = 100
+    }
+
+    reader = {
+        verbosity = 0 ; randomize = true
+        deserializers = ({
+            type = "ImageDeserializer" ; module = "ImageReader"
+            file = "$dataDir$/train_map.txt"
+            input = {
+                features = { transforms = (
+                    { type = "Crop" ; cropType = "random" ; cropRatio = 0.8 ; jitterType = "uniRatio" } :
+                    { type = "Scale" ; width = 32 ; height = 32 ; channels = 3 ; interpolations = "linear" } :
+                    { type = "Mean" ; meanFile = "$dataDir$/CIFAR-10_mean.xml" } : 
+                    { type = "Transpose" }
+                )}
+                labels = { labelDim = 10 }
+            }
+        })
+    }
+}
+
+# Eval action
+Eval = {
+    action = "eval"
+    evalNodeNames = errs:top5Errs  # also test top-5 error rate
+    # Set minibatch size for testing.
+    minibatchSize = 512
+
+    reader = {
+        verbosity = 0 ; randomize = false
+        deserializers = ({
+            type = "ImageDeserializer" ; module = "ImageReader"
+            file = "$dataDir$/test_map.txt"
+            input = {
+                features = { transforms = (
+                   { type = "Scale" ; width = 32 ; height = 32 ; channels = 3 ; interpolations = "linear" } :
+                   { type = "Mean"; meanFile = "$dataDir$/CIFAR-10_mean.xml" } : 
+                   { type = "Transpose" }
+                )}
+                labels = { labelDim = 10 }
+            }
+        })
+    }
+}
--- a/Examples/Image/Classification/ConvNet/ConvNet_MNIST.cntk
+++ b/Examples/Image/Classification/ConvNet/ConvNet_MNIST.cntk
@ -0,0 +1,90 @@
+# ConvNet on MNIST dataset. 
+
+command = trainNetwork:testNetwork
+
+precision = "float"; traceLevel = 1 ; deviceId = "auto"
+
+rootDir = "../.." ; dataDir = "$rootDir$/DataSets/MNIST" ;
+outputDir = "./Output" ;
+
+modelPath = "$outputDir$/Models/ConvNet_MNIST"
+#stderr = "$outputDir$/ConvNet_MNIST_bs_out"
+
+# TRAINING CONFIG
+trainNetwork = {
+    action = "train"
+
+    BrainScriptNetworkBuilder = {
+        imageShape = 28:28:1                        # image dimensions, 1 channel only
+        labelDim = 10                               # number of distinct labels
+        featScale = 1/256
+        Scale{f} = x => Constant(f) .* x
+        
+        model = Sequential (
+            Scale {featScale} :
+            ConvolutionalLayer {32, (5:5), pad = true} : ReLU : 
+            MaxPoolingLayer    {(3:3), stride=(2:2)} :
+            ConvolutionalLayer {48, (3:3), pad = false} : ReLU : 
+            MaxPoolingLayer    {(3:3), stride=(2:2)} :
+            ConvolutionalLayer {64, (3:3), pad = false} : ReLU : 
+            DenseLayer         {96} : Dropout : ReLU :  
+            LinearLayer        {labelDim}
+        )
+
+        # inputs
+        features = Input {imageShape}
+        labels = Input {labelDim}
+
+        # apply model to features
+        ol = model (features)
+
+        # loss and error computation
+        ce   = CrossEntropyWithSoftmax (labels, ol)
+        errs = ClassificationError (labels, ol)
+
+        # declare special nodes
+        featureNodes    = (features)
+        labelNodes      = (labels)
+        criterionNodes  = (ce)
+        evaluationNodes = (errs)
+        outputNodes     = (ol)
+    }
+
+    SGD = {
+        epochSize = 60000
+        minibatchSize = 64
+        maxEpochs = 40
+        learningRatesPerSample = 0.001*10:0.0005*10:0.0001
+		dropoutRate = 0.5
+        momentumAsTimeConstant = 0*5:1024
+        
+        numMBsToShowResult = 500
+    }
+
+    reader = {
+        readerType = "CNTKTextFormatReader"
+        # See ../README.md for details on getting the data (Train-28x28_cntk_text.txt).
+        file = "$DataDir$/Train-28x28_cntk_text.txt"
+        randomize = true
+        keepDataInMemory = true
+        input = {
+            features = { dim = 784 ; format = "dense" }
+            labels =   { dim = 10  ; format = "dense" }
+        }
+    }    
+}
+
+# TEST CONFIG
+testNetwork = {
+    action = test
+    minibatchSize = 1024    # reduce this if you run out of memory
+
+    reader = {
+        readerType = "CNTKTextFormatReader"
+        file = "$DataDir$/Test-28x28_cntk_text.txt"
+        input = {
+            features = { dim = 784 ; format = "dense" }
+            labels =   { dim = 10  ; format = "dense" }
+        }
+    }
+}
--- a/Examples/Image/Classification/ConvNet/README.md
+++ b/Examples/Image/Classification/ConvNet/README.md
@ -0,0 +1,51 @@
+# CNTK Examples: Image/Classification/ConvNet
+
+## Overview
+
+|Data:     |The MNIST dataset (http://yann.lecun.com/exdb/mnist/) of handwritten digits and the CIFAR-10 dataset (http://www.cs.toronto.edu/~kriz/cifar.html) for image classification.
+|:---------|:---
+|Purpose   |This folder contains a number of examples that demonstrate the usage of BrainScript to define convolutional neural networks for image classification.
+|Network   |convolutional neural networks.
+|Training  |Stochastic gradient descent with momentum.
+|Comments  |See below.
+
+## Running the example
+
+### Getting the data
+
+We use the MNIST and CIFAR-10 datasets to demonstrate how to train a `convolutional neural network (CNN)`. CNN has been one of the most popular neural networks for image-related tasks. A very well-known early work on CNN is the [LeNet](http://yann.lecun.com/exdb/publis/pdf/lecun-01a.pdf). In 2012 Alex Krizhevsky, Ilya Sutskever, and Geoffrey Hinton won the ILSVRC-2012 competition using a [CNN architecture](https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf). Most state-of-the-art neural networks for image classification tasks today adopt a modified CNN architecture, such as [VGG](../VGG), [GoogLeNet](../GoogLeNet), [ResNet](../ResNet), etc.
+
+The MNIST and CIFAR-10 datasets are not included in the CNTK distribution but can be easily downloaded and converted by following the instructions in [DataSets/MNIST](../../DataSets/MNIST) and [DataSets/CIFAR-10](../../DataSets/CIFAR-10). We recommend that you keep the downloaded data in the respective folder while downloading, as the configuration files in this folder assume that by default.
+
+## Details
+
+### ConvNet_MNIST.cntk
+
+Our first example applies CNN on the MNIST dataset. The network we use contains three convolution layers and two dense layers. Dropout is applied after the first dense layer. No data augmentation is used in this example. We start the training with no momentum, and add momentum after training for 5 epochs. Please refer to the cntk configuration file [ConvNet_MNIST.cntk](./ConvNet_MNIST.cntk) for more details.
+
+Run the example from the current folder using:
+
+`cntk configFile=ConvNet_MNIST.cntk`
+
+The network achieves an error rate of `0.5%`, which is very good considering no data augmentation is used. This accuracy is comparable to, if not better than, many other vanilla CNN implementations (http://yann.lecun.com/exdb/mnist/).
+
+### ConvNet_CIFAR10.cntk
+
+The second example applies CNN on the CIFAR-10 dataset. The network contains four convolution layers and three dense layers. Max pooling is conducted for every two convolution layers. Dropout is applied after the first two dense layers. No data augmentation is used. Please refer to the cntk configuration file [ConvNet_CIFAR10.cntk](./ConvNet_CIFAR10.cntk) for more details.
+
+Run the example from the current folder using:
+
+`cntk configFile=ConvNet_CIFAR10.cntk`
+
+The network achieves an error rate of `18.51%` after 30 epochs. This is comparable to the network published by [cuda-convnet](https://code.google.com/p/cuda-convnet/), which has 18% error with no data augmentation. One difference is that we do not use a `local response normalization layer`. This layer type is now rarely used in most state-of-the-art deep learning networks.
+
+### ConvNet_CIFAR10_DataAug.cntk
+
+The third example uses the same CNN as the previous example, but it improves by adding data augmentation to training. For this purpose, we use the `ImageReader` instead of the `CNTKTextFormatReader` to load the data. The ImageReader currently supports crop, flip, scale, color jittering, and mean subtraction.
+For a reference on image reader and transforms, please check [here](https://github.com/Microsoft/CNTK/wiki/Image-reader).
+
+Run the example from the current folder using:
+
+`cntk configFile=ConvNet_CIFAR10_DataAug.cntk`
+
+As seen in the cntk configuration file [ConvNet_CIFAR10_DataAug.cntk](./ConvNet_CIFAR10_DataAug.cntk), we use a fixed crop ratio of `0.8` and scale the image to `32x32` pixels for training. Since all training images are pre-padded to `40x40` pixels, effectively we only perform a translation transform without scaling. The error rate of the network on test data is `14.21%`, which is a lot better than the previous model.
--- a/Examples/Image/Classification/MLP/MLP_MNIST.cntk
+++ b/Examples/Image/Classification/MLP/MLP_MNIST.cntk
@ -0,0 +1,85 @@
+# Multi-layer perceptron (MLP) on MNIST dataset. 
+
+command = trainNetwork:testNetwork
+
+precision = "float"; traceLevel = 1 ; deviceId = "auto"
+
+rootDir = "../.." ; dataDir = "$rootDir$/DataSets/MNIST" ;
+outputDir = "./Output" ;
+
+modelPath = "$outputDir$/Models/MLP_MNIST"
+#stderr = "$outputDir$/MLP_MNIST_bs_out"
+
+# TRAINING CONFIG
+trainNetwork = {
+    action = "train"
+
+    BrainScriptNetworkBuilder = {
+        imageShape = 28:28:1                        # image dimensions, 1 channel only
+        labelDim = 10                               # number of distinct labels
+        featScale = 1/256
+        Scale{f} = x => Constant(f) .* x
+        
+        model = Sequential (
+            Scale {featScale} :
+            DenseLayer  {768, init="gaussian", initValueScale=1.5} : Dropout: ReLU : 
+            DenseLayer  {512, init="gaussian", initValueScale=1.5} : Dropout: ReLU : 
+            DenseLayer  {256, init="gaussian", initValueScale=1.5} : Dropout: ReLU :  
+            LinearLayer {labelDim}
+        )
+
+        # inputs
+        features = Input {imageShape}
+        labels = Input {labelDim}
+
+        # apply model to features
+        z = model (features)
+
+        # loss and error computation
+        ce   = CrossEntropyWithSoftmax (labels, z)
+        errs = ClassificationError (labels, z)
+
+        # declare special nodes
+        featureNodes    = (features)
+        labelNodes      = (labels)
+        criterionNodes  = (ce)
+        evaluationNodes = (errs)
+        outputNodes     = (z)
+    }
+
+    SGD = {
+        epochSize = 60000
+        minibatchSize = 64
+        maxEpochs = 40
+        learningRatesPerSample = 0.001*10:0.0005*10:0.0001
+		dropoutRate = 0.5
+        momentumAsTimeConstant = 600*10:4096
+        
+        numMBsToShowResult = 500
+    }
+
+    reader = {
+        readerType = "CNTKTextFormatReader"
+        # See ../README.md for details on getting the data (Train-28x28_cntk_text.txt).
+        file = "$DataDir$/Train-28x28_cntk_text.txt"
+        input = {
+            features = { dim = 784 ; format = "dense" }
+            labels =   { dim = 10  ; format = "dense" }
+        }
+    }    
+}
+
+# TEST CONFIG
+testNetwork = {
+    action = test
+    minibatchSize = 1024    # reduce this if you run out of memory
+
+    reader = {
+        readerType = "CNTKTextFormatReader"
+        file = "$DataDir$/Test-28x28_cntk_text.txt"
+        input = {
+            features = { dim = 784 ; format = "dense" }
+            labels =   { dim = 10  ; format = "dense" }
+        }
+    }
+}
--- a/Examples/Image/Classification/MLP/README.md
+++ b/Examples/Image/Classification/MLP/README.md
@ -0,0 +1,30 @@
+# CNTK Examples: Image/Classification/MLP
+
+## Overview
+
+|Data:     |The MNIST dataset (http://yann.lecun.com/exdb/mnist/) of handwritten digits.
+|:---------|:---
+|Purpose   |This folder contains a number of examples that demonstrate the usage of BrainScript to define multi-layer perceptron (MLP) networks for image classification.
+|Network   |Multi-layer perceptron.
+|Training  |Stochastic gradient descent with momentum.
+|Comments  |See below.
+
+## Running the example
+
+### Getting the data
+
+We use the MNIST dataset to demonstrate how to train a `multi-layer perceptron (MLP)` network. MLP is a feed-forward neural network that consists of multiple layers of nodes in a directed graph, where each layer is fully connected to the next one. This is arguably one of the simplest neural networks.
+
+The MNIST dataset is not included in the CNTK distribution but can be easily downloaded and converted by following the instructions in [DataSets/MNIST](../../DataSets/MNIST). We recommend that you keep the downloaded data in the respective folder while downloading, as the configuration files in this folder assume that by default.
+
+## Details
+
+### MLP_MNIST.cntk
+
+Similar to the `01_OneHidden.cntk` network in [GettingStarted](../../GettingStarted), MLP is "permutation invariant". In this particular example, we use 3 hidden layers containing `768`, `512` and `256` nodes, respectively. Dropout is applied after each hidden layer, with `dropoutRate=0.5`. The learning rate is gradually adjusted from `0.001` per sample to `0.0001`, and momentum as time constant is adjusted from `600` (effective momentum = `0.898824`) to `4096` (effective momentum = `0.984495`).
+
+Run the example from the current folder using:
+
+`cntk configFile=MLP_MNIST.cntk`
+
+The network achieves an error rate of `1.45%`, which is about as good as one can have with MLP and no data augmentation (http://yann.lecun.com/exdb/mnist/).
--- a/Examples/Image/Miscellaneous/ImageNet/ResNet/CreateEvalModel.mel
+++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/CreateEvalModel.mel
--- a/Examples/Image/Miscellaneous/ImageNet/ResNet/ImageNet1K_intensity.xml
+++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/ImageNet1K_intensity.xml
--- a/Examples/Image/Miscellaneous/ImageNet/ResNet/ImageNet1K_mean.xml
+++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/ImageNet1K_mean.xml
--- a/Examples/Image/Miscellaneous/ImageNet/ResNet/Macros.ndl
+++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/Macros.ndl
--- a/Examples/Image/Miscellaneous/ImageNet/ResNet/ProjWeightsGen.py
+++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/ProjWeightsGen.py
--- a/Examples/Image/Miscellaneous/ImageNet/ResNet/README.md
+++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/README.md
--- a/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_152.ndl
+++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_152.ndl
--- a/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_152_ndl_deprecated.cntk
+++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_152_ndl_deprecated.cntk
--- a/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_18.ndl
+++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_18.ndl
--- a/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_18_ndl_deprecated.cntk
+++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_18_ndl_deprecated.cntk
--- a/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_34.ndl
+++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_34.ndl
--- a/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_34_ndl_deprecated.cntk
+++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_34_ndl_deprecated.cntk
--- a/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_50.ndl
+++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_50.ndl
--- a/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_50_ndl_deprecated.cntk
+++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/ResNet_50_ndl_deprecated.cntk
--- a/Examples/Image/Miscellaneous/ImageNet/ResNet/zebra.jpg
+++ b/Examples/Image/Miscellaneous/ImageNet/ResNet/zebra.jpg
--- a/Examples/Image/Miscellaneous/ImageNet/VGG/CreateEvalModel.mel
+++ b/Examples/Image/Miscellaneous/ImageNet/VGG/CreateEvalModel.mel
--- a/Examples/Image/Miscellaneous/ImageNet/VGG/ImageNet1K_mean.xml
+++ b/Examples/Image/Miscellaneous/ImageNet/VGG/ImageNet1K_mean.xml
--- a/Examples/Image/Miscellaneous/ImageNet/VGG/Macros.ndl
+++ b/Examples/Image/Miscellaneous/ImageNet/VGG/Macros.ndl
--- a/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_A.ndl
+++ b/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_A.ndl
--- a/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_A_ndl_deprecated.cntk
+++ b/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_A_ndl_deprecated.cntk
--- a/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E.ndl
+++ b/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E.ndl
--- a/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E_BN.ndl
+++ b/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E_BN.ndl
--- a/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E_BN_ndl_deprecated.cntk
+++ b/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E_BN_ndl_deprecated.cntk
--- a/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E_ndl_deprecated.cntk
+++ b/Examples/Image/Miscellaneous/ImageNet/VGG/VGG_E_ndl_deprecated.cntk
--- a/Examples/Image/DataSets/CIFAR-10/README.md
+++ b/Examples/Image/DataSets/CIFAR-10/README.md
@ -0,0 +1,21 @@
+# CIFAR-10 Dataset
+
+The CIFAR-10 dataset (http://www.cs.toronto.edu/~kriz/cifar.html) is a popular dataset for image classification, collected by Alex Krizhevsky, Vinod Nair, and Geoffrey Hinton. It is a labeled subset of the [80 million tiny images](http://people.csail.mit.edu/torralba/tinyimages/) dataset.
+
+The CIFAR-10 dataset consists of 60,000 32x32 color images in 10 classes, with 6,000 images per class. There are 50,000 training images and 10,000 test images. The 10 classes are: airplane, automobile, bird, cat, deer, dog, frog, horse, ship, and truck.
+
+The CIFAR-10 dataset is not included in the CNTK distribution but can be easily downloaded and converted to CNTK-supported format by running the following Python command:
+
+```
+python install_cifar10.py
+```
+
+After running `install_cifar10.py`, you will find the original CIFAR-10 data extracted into a folder named `cifar-10-batches-py`. In addition, two text files, `Train_cntk_text.txt` and `Test_cntk_text.txt`, are created in the current folder. These text files can be read directly by CNTK.
+
+In addition, the script will create a `train` and a `test` folder that store the train and test images in png format. It will also create the appropriate mapping files (`train_map.txt` and `test_map.txt`) for the CNTK `ImageReader`, as well as a mean file `CIFAR-10_mean.xml`.
+
+The total amount of disk space required for both the text version and the png version for CIFAR-10 is around `950`MB. 
+
+We provide multiple examples in the [Classification](../../Classification) folder to train classifiers for CIFAR-10 with CNTK. Please refer there for more details.
+
+If you are curious about how well computers can perform on CIFAR-10 today, Rodrigo Benenson maintains a [blog](http://rodrigob.github.io/are_we_there_yet/build/classification_datasets_results.html#43494641522d3130) on the state-of-the-art performance of various algorithms.
--- a/Examples/Image/DataSets/CIFAR-10/cifar_utils.py
+++ b/Examples/Image/DataSets/CIFAR-10/cifar_utils.py
@ -0,0 +1,132 @@
+from __future__ import print_function
+try: 
+    from urllib.request import urlretrieve 
+except ImportError: 
+    from urllib import urlretrieve
+import sys
+import tarfile
+import shutil
+import os
+import struct
+import numpy as np
+import pickle as cp
+from PIL import Image
+import xml.etree.cElementTree as et
+import xml.dom.minidom
+import getopt
+
+ImgSize = 32
+NumFeat = ImgSize * ImgSize * 3
+
+def readBatch(src):
+    with open(src, 'rb') as f:
+        if sys.version_info[0] < 3: 
+            d = cp.load(f) 
+        else:
+            d = cp.load(f, encoding='latin1')
+        data = d['data']
+        feat = data
+    res = np.hstack((feat, np.reshape(d['labels'], (len(d['labels']), 1))))
+    return res.astype(np.int)
+
+def loadData(src):
+    print ('Downloading ' + src)
+    fname, h = urlretrieve(src, './delete.me')
+    print ('Done.')
+    try:
+        print ('Extracting files...')
+        with tarfile.open(fname) as tar:
+            tar.extractall()
+        print ('Done.')
+        print ('Preparing train set...')
+        trn = np.empty((0, NumFeat + 1), dtype=np.int)
+        for i in range(5):
+            batchName = './cifar-10-batches-py/data_batch_{0}'.format(i + 1)
+            trn = np.vstack((trn, readBatch(batchName)))
+        print ('Done.')
+        print ('Preparing test set...')
+        tst = readBatch('./cifar-10-batches-py/test_batch')
+        print ('Done.')
+    finally:
+        os.remove(fname)
+    return (trn, tst)
+
+def saveTxt(filename, ndarray):
+    with open(filename, 'w') as f:
+        labels = list(map(' '.join, np.eye(10, dtype=np.uint).astype(str)))
+        for row in ndarray:
+            row_str = row.astype(str)
+            label_str = labels[row[-1]]
+            feature_str = ' '.join(row_str[:-1])
+            f.write('|labels {} |features {}\n'.format(label_str, feature_str))
+
+def saveImage(fname, data, label, mapFile, regrFile, pad, **key_parms):  # Save one image file plus its map-file line and regression-label line.
+    # The flat CIFAR-10 record is in CHW order: reshape it to (channel, row, column).
+    pixData = data.reshape((3, ImgSize, ImgSize))
+    if ('mean' in key_parms):
+        key_parms['mean'] += pixData  # in-place add of the unpadded pixels into the caller's accumulator
+
+    if pad > 0:
+        pixData = np.pad(pixData, ((0, 0), (pad, pad), (pad, pad)), mode='constant', constant_values=128) # pads rows and columns only, with value 128; can also use mode='edge'
+
+    img = Image.new('RGB', (ImgSize + 2 * pad, ImgSize + 2 * pad))
+    pixels = img.load()
+    for x in range(img.size[0]):
+        for y in range(img.size[1]):
+            pixels[x, y] = (pixData[0][y][x], pixData[1][y][x], pixData[2][y][x])  # pixData is indexed [channel][row][column]
+    img.save(fname)
+    mapFile.write("%s\t%d\n" % (fname, label))  # one "<image path><TAB><label>" line per image
+    
+    # Per-channel mean of the (possibly padded) pixels, divided by 255, stored for the regression example.
+    channelMean = np.mean(pixData, axis=(1,2))
+    regrFile.write("|regrLabels\t%f\t%f\t%f\n" % (channelMean[0]/255.0, channelMean[1]/255.0, channelMean[2]/255.0))
+    
+def saveMean(fname, data):
+    root = et.Element('opencv_storage')
+    et.SubElement(root, 'Channel').text = '3'
+    et.SubElement(root, 'Row').text = str(ImgSize)
+    et.SubElement(root, 'Col').text = str(ImgSize)
+    meanImg = et.SubElement(root, 'MeanImg', type_id='opencv-matrix')
+    et.SubElement(meanImg, 'rows').text = '1'
+    et.SubElement(meanImg, 'cols').text = str(ImgSize * ImgSize * 3)
+    et.SubElement(meanImg, 'dt').text = 'f'
+    et.SubElement(meanImg, 'data').text = ' '.join(['%e' % n for n in np.reshape(data, (ImgSize * ImgSize * 3))])
+
+    tree = et.ElementTree(root)
+    tree.write(fname)
+    x = xml.dom.minidom.parse(fname)
+    with open(fname, 'w') as f:
+        f.write(x.toprettyxml(indent = '  '))
+
+def saveTrainImages(filename, foldername):
+    if not os.path.exists(foldername):
+        os.makedirs(foldername)
+    data = {}
+    dataMean = np.zeros((3, ImgSize, ImgSize)) # mean is in CHW format.
+    with open('train_map.txt', 'w') as mapFile:
+        with open('train_regrLabels.txt', 'w') as regrFile:
+            for ifile in range(1, 6):
+                with open(os.path.join('./cifar-10-batches-py', 'data_batch_' + str(ifile)), 'rb') as f:
+                    if sys.version_info[0] < 3: 
+                        data = cp.load(f)
+                    else: 
+                        data = cp.load(f, encoding='latin1')
+                    for i in range(10000):
+                        fname = os.path.join(os.path.abspath(foldername), ('%05d.png' % (i + (ifile - 1) * 10000)))
+                        saveImage(fname, data['data'][i, :], data['labels'][i], mapFile, regrFile, 4, mean=dataMean)
+    dataMean = dataMean / (50 * 1000)
+    saveMean('CIFAR-10_mean.xml', dataMean)
+
+def saveTestImages(filename, foldername):
+    if not os.path.exists(foldername):
+      os.makedirs(foldername)
+    with open('test_map.txt', 'w') as mapFile:
+        with open('test_regrLabels.txt', 'w') as regrFile:
+            with open(os.path.join('./cifar-10-batches-py', 'test_batch'), 'rb') as f:
+                if sys.version_info[0] < 3: 
+                    data = cp.load(f)
+                else: 
+                    data = cp.load(f, encoding='latin1')
+                for i in range(10000):
+                    fname = os.path.join(os.path.abspath(foldername), ('%05d.png' % i))
+                    saveImage(fname, data['data'][i, :], data['labels'][i], mapFile, regrFile, 0)
--- a/Examples/Image/DataSets/CIFAR-10/install_cifar10.py
+++ b/Examples/Image/DataSets/CIFAR-10/install_cifar10.py
@ -0,0 +1,18 @@
+from __future__ import print_function
+import cifar_utils as ut
+
+if __name__ == "__main__":  # Download CIFAR-10, then write the CNTK text files and the png images with their map files.
+    trn, tst= ut.loadData('http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz')  # (train, test) integer arrays, label in the last column
+    print ('Writing train text file...')
+    ut.saveTxt(r'./Train_cntk_text.txt', trn)
+    print ('Done.')
+    print ('Writing test text file...')
+    ut.saveTxt(r'./Test_cntk_text.txt', tst)
+    print ('Done.')
+
+    print ('Converting train data to png images...')
+    ut.saveTrainImages(r'./Train_cntk_text.txt', 'train')  # first argument is not used by saveTrainImages; it re-reads the extracted batch files
+    print ('Done.')
+    print ('Converting test data to png images...')
+    ut.saveTestImages(r'./Test_cntk_text.txt', 'test')  # first argument is not used by saveTestImages either
+    print ('Done.')
--- a/Examples/Image/DataSets/MNIST/README.md
+++ b/Examples/Image/DataSets/MNIST/README.md
@ -0,0 +1,14 @@
+# MNIST Dataset
+
+The MNIST dataset (http://yann.lecun.com/exdb/mnist/) for handwritten digit recognition is one of the most widely used image datasets for experimenting with different classification algorithms. MNIST has a training set of 60,000 examples, and a test set of 10,000 examples. Each example contains one digit that has been size-normalized and centered in a grayscale image at 28x28 pixel resolution.
+
+The MNIST dataset is not included in the CNTK distribution but can be easily
+downloaded and converted to CNTK-supported format by running the following Python command:
+
+`python install_mnist.py`
+
+After running the script, you will see two output files in the current folder: Train-28x28_cntk_text.txt and Test-28x28_cntk_text.txt. The total amount of disk space required is around `124`MB. You may now proceed to the [`GettingStarted`](../../GettingStarted) folder to play with this dataset. 
+
+Further, we provide two advanced examples with MNIST. The first one is a [`Multi-Layer Perceptron network (MLP)`](../../Classification/MLP), which achieves about 1.5% error rate. The second one is a [`Convolutional Neural Network (ConvNet)`](../../Classification/ConvNet), which achieves about 0.5% error rate. These results are comparable to the best published results using these types of networks.
+
+If you are curious about how well computers can perform on MNIST today, Rodrigo Benenson maintains a [blog](http://rodrigob.github.io/are_we_there_yet/build/classification_datasets_results.html#4d4e495354) on the state-of-the-art performance of various algorithms.  
--- a/Examples/Image/DataSets/MNIST/install_mnist.py
+++ b/Examples/Image/DataSets/MNIST/install_mnist.py
@ -0,0 +1,14 @@
+from __future__ import print_function
+import mnist_utils as ut
+
+if __name__ == "__main__":  # Fetch MNIST and write the train and test text files into the current folder.
+    train = ut.load('http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
+        'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz', 60000)  # presumably (images URL, labels URL, example count) -- mnist_utils is not shown here; confirm
+    print ('Writing train text file...')
+    ut.savetxt(r'./Train-28x28_cntk_text.txt', train)
+    print ('Done.')
+    test = ut.load('http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
+        'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz', 10000)  # presumably the test-set example count -- confirm against mnist_utils
+    print ('Writing test text file...')
+    ut.savetxt(r'./Test-28x28_cntk_text.txt', test)
+    print ('Done.')
--- a/Examples/Image/MNIST/AdditionalFiles/mnist_convert_python3.py
+++ b/Examples/Image/MNIST/AdditionalFiles/mnist_convert_python3.py
@ -1,13 +1,18 @@
+from __future__ import print_function
+try: 
+    from urllib.request import urlretrieve 
+except ImportError: 
+    from urllib import urlretrieve
 import sys
-import urllib.request
 import gzip
+import shutil
 import os
 import struct
 import numpy as np

 def loadData(src, cimg):
    print ('Downloading ' + src)
-    gzfname, h = urllib.request.urlretrieve(src, './delete.me')
+    gzfname, h = urlretrieve(src, './delete.me')
    print ('Done.')
    try:
        with gzip.open(gzfname) as gz:
@ -31,7 +36,7 @@ def loadData(src, cimg):

 def loadLabels(src, cimg):
    print ('Downloading ' + src)
-    gzfname, h = urllib.request.urlretrieve(src, './delete.me')
+    gzfname, h = urlretrieve(src, './delete.me')
    print ('Done.')
    try:
        with gzip.open(gzfname) as gz:
@ -49,29 +54,16 @@ def loadLabels(src, cimg):
        os.remove(gzfname)
    return res.reshape((cimg, 1))

-
 def load(dataSrc, labelsSrc, cimg):
    data = loadData(dataSrc, cimg)
    labels = loadLabels(labelsSrc, cimg)
    return np.hstack((data, labels))

 def savetxt(filename, ndarray):
-    with open(filename, 'w', encoding="ascii") as f:
+    with open(filename, 'w') as f:
        labels = list(map(' '.join, np.eye(10, dtype=np.uint).astype(str)))
        for row in ndarray:
            row_str = row.astype(str)
            label_str = labels[row[-1]]
            feature_str = ' '.join(row_str[:-1])
            f.write('|labels {} |features {}\n'.format(label_str, feature_str))
-
-if __name__ == "__main__":
-    train = load('http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
-        'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz', 60000)
-    print ('Writing train text file...')
-    savetxt(r'./../Data/Train-28x28_cntk_text.txt', train)  
-    print ('Done.')
-    test = load('http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
-        'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz', 10000)  
-    print ('Writing test text file...')
-    savetxt(r'./../Data/Test-28x28_cntk_text.txt', test)
-    print ('Done.')
--- a/Examples/Image/MNIST/Config/Ndl_deprecated/04_DeConv.ndl
+++ b/Examples/Image/MNIST/Config/Ndl_deprecated/04_DeConv.ndl
--- a/Examples/Image/MNIST/Config/Ndl_deprecated/04_DeConv_ndl_deprecated.cntk
+++ b/Examples/Image/MNIST/Config/Ndl_deprecated/04_DeConv_ndl_deprecated.cntk
--- a/Examples/Image/Detection/FastRCNN/fastrcnn.cntk
+++ b/Examples/Image/Detection/FastRCNN/fastrcnn.cntk
@ -0,0 +1,188 @@
+# Fast-RCNN configuration for CNTK
+# For algorithm and details see http://arxiv.org/abs/1504.08083
+# Overview:
+# The Fast-RCNN algorithm uses a DNN that takes as inputs a set of images 
+# and for each image a set of ROIs (Regions of interest). It first computes
+# a convolutional feature map for the entire image using a series
+# of convolutional layers (usually from a pretrained network). Then it 
+# employs ROI pooling to crop out the part of the conv feature map 
+# that corresponds to an ROI and resizes it to the input size expected
+# by the following layer (usually a set of pretrained fully connected layers).
+# Classification error and evaluation criterion are computed for each ROI.
+
+command = Train:Test
+#command = Write
+
+deviceId = "Auto"
+precision = "float"
+parallelTrain = "false"
+traceLevel = 1
+
+rootDir = "." 
+dataDir = "$rootDir$/data/"
+outputDir = "$rootDir$/Output"
+
+modelPath = "$outputDir$/Fast-RCNN"
+stderr = "$outputDir$/Fast-RCNN.log"
+
+ImageH = 1000
+ImageW = 1000
+ImageC = 3
+
+NumLabels = 21
+
+NumTrainROIs = 64
+TrainROIDim = 256               # $NumTrainROIs$ * 4 
+TrainROILabelDim = 1344         # $NumTrainROIs$ * $NumLabels$
+
+NumTestROIs = 200
+TestROIDim = 800
+TestROILabelDim = 4200
+
+# For training we load a pretrained AlexNet model (AlexNet.89) and clone three parts of it.
+# For the first part (up to pool1) we keep the weights fixed. The middle part contains the
+# remaining convolutional and pooling layers and the last part are the FC layers. 
+# In the model we apply the first two cloned parts, then an ROI pooling layer and 
+# finally the pretrained FC layers followed by a new FC layer that maps to the new 
+# label dimensionality of 21 classes. 
+# The inputs are images (1000 x 1000 x 3), ROIs (64 ROIs x 4 coordinates (x, y, w, h))
+# and ground truth labels per ROI (64 ROIs x 21 classes).
+Train = {
+    action = "train"
+    
+    BrainScriptNetworkBuilder = {
+        imageShape = $ImageH$:$ImageW$:$ImageC$         # 1000:1000:3
+        labelShape = $NumLabels$:$NumTrainROIs$         # 21:64
+        ROIShape   = 4:$NumTrainROIs$                   # 4:64
+
+        network     = BS.Network.Load ("AlexNet.89")
+        pool1       = BS.Network.CloneFunction(network.features, network.pool1, parameters = "constant")
+        convLayers  = BS.Network.CloneFunction(network.pool1, network.conv5_y)
+        fcLayers    = BS.Network.CloneFunction(network.pool3, network.h2_d)
+
+        model (features, rois) = {
+            featNorm = features - 114
+            pool1Out = pool1 (featNorm)
+            conv5Out = convLayers (pool1Out)
+            roiOut   = ROIPooling (conv5Out, rois, (6:6))
+            fcOut    = fcLayers (roiOut)
+            W        = ParameterTensor{(21:4096)}
+            b        = ParameterTensor{21, init = 'zero'}
+            z        = W * fcOut + b
+        }.z
+
+        features = Input {imageShape}
+        roiLabels = Input {labelShape}
+        rois = Input {ROIShape}
+
+        z = model (features, rois)
+        
+        ce = CrossEntropyWithSoftmax(roiLabels, z, axis = 1)
+        errs = ClassificationError(roiLabels, z, axis = 1)
+        
+        featureNodes    = (features:rois)
+        labelNodes      = (roiLabels)
+        criterionNodes  = (ce)
+        evaluationNodes = (errs)
+        outputNodes     = (z)
+    }
+
+    SGD = {
+        epochSize = 0
+        minibatchSize = 2
+        maxEpochs = 15
+        
+        learningRatesPerSample = 0.00005
+        momentumAsTimeConstant = 0*5:1024 # was: 0.9 per MB 
+        L2RegWeight = 0.0001
+        dropoutRate = 0.5
+        
+        numMBsToShowResult = 50
+    }
+
+    # Three deserializers are combined: ROI coordinates, ROI labels, and the images themselves.
+    # Images are scaled with padding to ImageW x ImageH and then transposed.
+    # The Scale height now uses ImageH; it previously reused ImageW, which only worked because both are 1000.
+    reader = {
+        randomize = false
+        verbosity = 2
+        deserializers = ({
+            type = "CNTKTextFormatDeserializer" ; module = "CNTKTextFormatReader"
+            file = "$dataDir$/tv2012pad.rois.txt"
+            input = { rois = { dim = $TrainROIDim$ ; format = "dense" } }
+        }:{
+            type = "CNTKTextFormatDeserializer" ; module = "CNTKTextFormatReader"
+            file = "$dataDir$/tv2012pad.roilabels.txt"
+            input = { roiLabels = { dim = $TrainROILabelDim$ ; format = "dense" } }
+        }:{
+            type = "ImageDeserializer" ; module = "ImageReader"
+            file = "$dataDir$/tv2012pad.txt"
+            input = {
+                features = { transforms = (
+                    { type = "Scale" ; width = $ImageW$ ; height = $ImageH$ ; channels = $ImageC$ ; scaleMode = "pad" ; padValue = 114 }:
+                    { type = "Transpose" }
+                )}
+                ignored = {labelDim = 1000}
+            }
+        })
+    }
+}
+
+# For testing we load the trained Fast-RCNN model and modify the input size,
+# such that the network accepts 200 ROIs per image. To this end we load and 
+# clone the entire network and define new inputs with the desired size 
+# corresponding to 200 ROIs. 
+Test = {
+    action = "test"
+    minibatchSize = 1
+
+    # use this for write action
+    # action = "write"
+    # outputPath = "$OutputDir$/fastrcnnNetOutput"
+    
+    # Wraps the trained model in new inputs sized for $NumTestROIs$ ROIs per image.
+    # The cloned network's parameters are kept constant.
+    BrainScriptNetworkBuilder = {
+        imageShape = $ImageH$:$ImageW$:$ImageC$        # 1000:1000:3
+        labelShape = $NumLabels$:$NumTestROIs$         # 21:200
+        ROIShape   = 4:$NumTestROIs$                   # 4:200
+
+        # load network
+        network = BS.Network.Load ("$modelPath$")
+        clonedNet = BS.Network.CloneFunction ((network.features:network.rois), { z = network.z }, parameters = "constant")
+
+        features = Input {imageShape}
+        roiLabels = Input {labelShape}
+        rois = Input {ROIShape}
+
+        z = clonedNet(features, rois).z
+
+        # labels first, prediction second -- same argument order as in the Train section
+        ce = CrossEntropyWithSoftmax (roiLabels, z, axis = 1)
+        errs = ClassificationError (roiLabels, z, axis = 1)
+
+        featureNodes    = (features:rois)
+        labelNodes      = (roiLabels)
+        criterionNodes  = (ce)
+        evaluationNodes = (errs)
+        outputNodes     = (z)
+    }
+    
+    # Three deserializers are combined: ROI coordinates, ROI labels, and the images themselves.
+    # Images are scaled with padding to ImageW x ImageH and then transposed.
+    # The Scale height now uses ImageH; it previously reused ImageW, which only worked because both are 1000.
+    reader = {
+        randomize = false
+        verbosity = 2
+        deserializers = ({
+            type = "CNTKTextFormatDeserializer" ; module = "CNTKTextFormatReader"
+            file = "$dataDir$/test2007pad_all.rois.txt"
+            input = { rois = { dim = $TestROIDim$ ; format = "dense" } }
+        }:{
+            type = "CNTKTextFormatDeserializer" ; module = "CNTKTextFormatReader"
+            file = "$dataDir$/test2007pad_all.roilabels.txt"
+            input = { roiLabels = { dim = $TestROILabelDim$ ; format = "dense" } }
+        }:{
+            type = "ImageDeserializer" ; module = "ImageReader"
+            file = "$dataDir$/test2007pad_all.txt"
+            input = {
+                features = { transforms = (
+                    { type = "Scale" ; width = $ImageW$ ; height = $ImageH$ ; channels = $ImageC$ ; scaleMode = "pad" ; padValue = 114 }:
+                    { type = "Transpose" }
+                )}
+                ignored = {labelDim = 1000}
+            }
+        })
+    }
+}
--- a/Examples/Image/GettingStarted/01_OneHidden.cntk
+++ b/Examples/Image/GettingStarted/01_OneHidden.cntk
@ -0,0 +1,115 @@
+# Parameters can be overwritten on the command line
+# for example: cntk configFile=myConfigFile RootDir=../.. 
+# For running from Visual Studio add
+# currentDirectory=$(SolutionDir)/<path to corresponding data folder> 
+
+command = trainNetwork:testNetwork
+
+precision = "float"; traceLevel = 1 ; deviceId = "auto"
+
+rootDir = ".." ; dataDir = "$rootDir$/DataSets/MNIST" ;
+outputDir = "./Output" ;
+
+modelPath = "$outputDir$/Models/01_OneHidden"
+#stderr = "$outputDir$/01_OneHidden_bs_out"
+
+# TRAINING CONFIG
+trainNetwork = {
+    action = "train"
+    
+    BrainScriptNetworkBuilder = {
+        imageShape = 28:28:1                        # image dimensions, 1 channel only
+        labelDim = 10                               # number of distinct labels
+        featScale = 1/256
+
+        # This model returns multiple nodes as a record, which
+        # can be accessed using .x syntax.
+        model(x) = {
+            s1 = x * featScale
+            h1 = DenseLayer {200, activation=ReLU} (s1) 
+            z = LinearLayer {labelDim} (h1)
+        }
+        
+        # inputs
+        features = Input {imageShape}
+        labels = Input {labelDim}
+
+        # apply model to features
+        out = model (features)
+
+        # loss and error computation
+        ce   = CrossEntropyWithSoftmax (labels, out.z)
+        errs = ClassificationError (labels, out.z)
+
+        # declare special nodes
+        featureNodes    = (features)
+        labelNodes      = (labels)
+        criterionNodes  = (ce)
+        evaluationNodes = (errs)
+        outputNodes     = (out.z)
+        
+        # Alternatively, you can use the Sequential keyword and write the model
+        # as follows. We keep the previous format because EvalClientTest needs
+        # to access the internal nodes, which is not yet possible with Sequential.
+        #
+        # Scale{f} = x => Constant(f) .* x
+        # model = Sequential (
+            # Scale {featScale} :
+            # DenseLayer {200} : ReLU : 
+            # LinearLayer {labelDim}
+        # )
+
+        # # inputs
+        # features = Input {imageShape}
+        # labels = Input (labelDim)
+
+        # # apply model to features
+        # ol = model (features)
+
+        # # loss and error computation
+        # ce   = CrossEntropyWithSoftmax (labels, ol)
+        # errs = ClassificationError (labels, ol)
+
+        # # declare special nodes
+        # featureNodes    = (features)
+        # labelNodes      = (labels)
+        # criterionNodes  = (ce)
+        # evaluationNodes = (errs)
+        # outputNodes     = (ol)
+    }
+
+    SGD = {
+        epochSize = 60000
+        minibatchSize = 64
+        maxEpochs = 10
+        learningRatesPerSample = 0.01*5:0.005
+        momentumAsTimeConstant = 0
+        
+        numMBsToShowResult = 500
+    }
+
+    reader = {
+        readerType = "CNTKTextFormatReader"
+        # See ../README.md for details on getting the data (Train-28x28_cntk_text.txt).
+        file = "$DataDir$/Train-28x28_cntk_text.txt"
+        input = {
+            features = { dim = 784 ; format = "dense" }
+            labels =   { dim = 10  ; format = "dense" }
+        }
+    }   
+}
+
+# TEST CONFIG
+testNetwork = {
+    action = "test"
+    minibatchSize = 1024    # reduce this if you run out of memory
+
+    reader = {
+        readerType = "CNTKTextFormatReader"
+        file = "$DataDir$/Test-28x28_cntk_text.txt"
+        input = {
+            features = { dim = 784 ; format = "dense" }
+            labels =   { dim = 10  ; format = "dense" }
+        }
+    }
+}
--- a/Examples/Image/MNIST/Config/02_Convolution.cntk
+++ b/Examples/Image/MNIST/Config/02_Convolution.cntk
@ -1,5 +1,5 @@
 # Parameters can be overwritten on the command line
-# for example: cntk configFile=myConfigFile rootDir=../.. 
+# for example: cntk configFile=myConfigFile RootDir=../.. 
 # For running from Visual Studio add
 # currentDirectory=$(SolutionDir)/<path to corresponding data folder> 

@ -7,11 +7,11 @@ command = trainNetwork:testNetwork

 precision = "float"; traceLevel = 1 ; deviceId = "auto"

-rootDir = ".." ; configDir = "$rootDir$/Config" ; dataDir = "$rootDir$/Data" ;
-outputDir = "$rootDir$/Output" ;
+rootDir = ".." ; dataDir = "$rootDir$/DataSets/MNIST" ;
+outputDir = "./Output" ;

-modelPath = "$outputDir$/Models/02_Convolution"
-stderr = "$outputDir$/02_Convolution_bs_out"
+modelPath = "$outputDir$/Models/02_OneConv"
+#stderr = "$outputDir$/02_OneConv_bs_out"

 # TRAINING CONFIG
 trainNetwork = {
@ -27,15 +27,13 @@ trainNetwork = {
            Scale {featScale} :
            ConvolutionalLayer {16, (5:5), pad = true} : ReLU : 
            MaxPoolingLayer    {(2:2), stride=(2:2)} :
-            ConvolutionalLayer {32, (5:5), pad = true} : ReLU : 
-            MaxPoolingLayer    {(2:2), stride=(2:2)} :
-            DenseLayer         {128, activation=Sigmoid} :
+            DenseLayer {64} : ReLU : 
            LinearLayer {labelDim}
        )
        
        # inputs
        features = Input {imageShape}
-        labels = Input {labelDim}
+        labels = Input (labelDim)

        # apply model to features
        ol = model (features)
@ -57,8 +55,7 @@ trainNetwork = {
        minibatchSize = 64
        maxEpochs = 15
        learningRatesPerSample = 0.001*5:0.0005
-        momentumAsTimeConstant = 0*5:1024
-        
+        momentumAsTimeConstant = 0
        numMBsToShowResult = 500
    }

@ -75,7 +72,7 @@ trainNetwork = {

 # TEST CONFIG
 testNetwork = {
-    action = test
+    action = "test"
    minibatchSize = 1024    # reduce this if you run out of memory

    reader = {
--- a/Examples/Image/GettingStarted/03_OneConvDropout.cntk
+++ b/Examples/Image/GettingStarted/03_OneConvDropout.cntk
@ -7,11 +7,11 @@ command = trainNetwork:testNetwork

 precision = "float"; traceLevel = 1 ; deviceId = "auto"

-rootDir = ".." ; configDir = "$rootDir$/Config" ; dataDir = "$rootDir$/Data" ;
-outputDir = "$rootDir$/Output" ;
+rootDir = ".." ; dataDir = "$rootDir$/DataSets/MNIST" ;
+outputDir = "./Output" ;

-modelPath = "$outputDir$/Models/01_OneHidden"
-stderr = "$outputDir$/01_OneHidden_bs_out"
+modelPath = "$outputDir$/Models/03_OneConvDropout"
+#stderr = "$outputDir$/03_OneConvDropout_bs_out"

 # TRAINING CONFIG
 trainNetwork = {
@ -21,41 +21,42 @@ trainNetwork = {
        imageShape = 28:28:1                        # image dimensions, 1 channel only
        labelDim = 10                               # number of distinct labels
        featScale = 1/256
+        Scale{f} = x => Constant(f) .* x
        
-        # This model returns multiple nodes as a record, which
-        # can be accessed using .x syntax.
-        model(x) = {
-            s1 = x * featScale
-            h1 = DenseLayer {200, activation=Sigmoid} (s1)
-            z = LinearLayer {labelDim} (h1)
-        }
+        model = Sequential (
+            Scale {featScale} :
+            ConvolutionalLayer {16, (5:5), pad = true} : ReLU : 
+            MaxPoolingLayer    {(2:2), stride=(2:2)} : Dropout : 
+            DenseLayer         {64} : ReLU : 
+            LinearLayer        {labelDim}
+        )

        # inputs
        features = Input {imageShape}
        labels = Input (labelDim)

        # apply model to features
-        out = model (features)
+        ol = model (features)

        # loss and error computation
-        ce   = CrossEntropyWithSoftmax (labels, out.z)
-        errs = ClassificationError (labels, out.z)
+        ce   = CrossEntropyWithSoftmax (labels, ol)
+        errs = ClassificationError (labels, ol)

        # declare special nodes
        featureNodes    = (features)
        labelNodes      = (labels)
        criterionNodes  = (ce)
        evaluationNodes = (errs)
-        outputNodes     = (out.z)
+        outputNodes     = (ol)
    }

    SGD = {
        epochSize = 60000
        minibatchSize = 64
-        maxEpochs = 30
-        learningRatesPerSample = 0.01*5:0.005
+        maxEpochs = 15
+        learningRatesPerSample = 0.001*5:0.0005
        momentumAsTimeConstant = 0
-        
+        dropoutRate = 0.5
        numMBsToShowResult = 500
    }

--- a/Examples/Image/MNIST/Config/03_ConvBatchNorm.cntk
+++ b/Examples/Image/MNIST/Config/03_ConvBatchNorm.cntk
@ -1,5 +1,5 @@
 # Parameters can be overwritten on the command line
-# for example: cntk configFile=myConfigFile rootDir=../.. 
+# for example: cntk configFile=myConfigFile RootDir=../.. 
 # For running from Visual Studio add
 # currentDirectory=$(SolutionDir)/<path to corresponding data folder> 

@ -7,11 +7,11 @@ command = trainNetwork:testNetwork

 precision = "float"; traceLevel = 1 ; deviceId = "auto"

-rootDir = ".." ; configDir = "$rootDir$/Config" ; dataDir = "$rootDir$/Data" ;
-outputDir = "$rootDir$/Output" ;
+rootDir = ".." ; dataDir = "$rootDir$/DataSets/MNIST" ;
+outputDir = "./Output" ;

-modelPath = "$outputDir$/Models/03_ConvBatchNorm"
-stderr = "$outputDir$/03_ConvBatchNorm_bs_out"
+modelPath = "$outputDir$/Models/04_OneConvBN"
+#stderr = "$outputDir$/04_OneConvBN_bs_out"

 # TRAINING CONFIG
 trainNetwork = {
@ -24,20 +24,22 @@ trainNetwork = {
        Scale{f} = x => Constant(f) .* x
        
        # define a custom layer with 5x5 convolution, batch norm, relu and 2x2 max pooling
-        ConvBnReluPoolLayer {outChannels} = Sequential (
-            ConvolutionalLayer      {outChannels, (5:5), pad=true, bias=false} :
+        ConvBnReluPoolLayer {outChannels, filterShape} = Sequential (
+            ConvolutionalLayer      {outChannels, filterShape, pad=true, bias=false} :
            BatchNormalizationLayer {spatialRank = 2} :
            ReLU :
            MaxPoolingLayer         {(2:2), stride = (2:2)} 
        )

+        DenseBnReluLayer {outDim} = Sequential (
+            LinearLayer             {outDim} :   
+            BatchNormalizationLayer {spatialRank = 1} : ReLU
+        )
+		
        model = Sequential (
            Scale {featScale} : 
-            ConvBnReluPoolLayer     {16} :
-            ConvBnReluPoolLayer     {32} :
-            LinearLayer             {128} :
-            BatchNormalizationLayer {} :
-            ReLU :
+            ConvBnReluPoolLayer {16, (5:5)} : 
+            DenseBnReluLayer {64} : 
            LinearLayer {labelDim}
        )
        
@ -63,10 +65,9 @@ trainNetwork = {
    SGD = {
        epochSize = 60000
        minibatchSize = 64
-        maxEpochs = 3
-        learningRatesPerSample = 0.02:0.005
+        maxEpochs = 10
+        learningRatesPerSample = 0.01*5:0.001
        momentumAsTimeConstant = 0
-        
        numMBsToShowResult = 500
    }

@ -83,7 +84,7 @@ trainNetwork = {

 # TEST CONFIG
 testNetwork = {
-    action = test
+    action = "test"
    minibatchSize = 1024    # reduce this if you run out of memory

    reader = {
--- a/Examples/Image/GettingStarted/05_OneConvRegr.cntk
+++ b/Examples/Image/GettingStarted/05_OneConvRegr.cntk
@ -0,0 +1,86 @@
+# Parameters can be overwritten on the command line
+# for example: cntk configFile=myConfigFile RootDir=../.. 
+# For running from Visual Studio add
+# currentDirectory=$(SolutionDir)/<path to corresponding data folder> 
+
+command = trainNetwork:testNetwork
+
+precision = "float"; traceLevel = 1 ; deviceId = "auto"
+
+rootDir = ".." ; dataDir = "$rootDir$/DataSets/MNIST" ;
+outputDir = "./Output" ;
+
+modelPath = "$outputDir$/Models/05_OneConvRegr"
+#stderr = "$outputDir$/05_OneConvRegr_bs_out"
+
+# TRAINING CONFIG
+trainNetwork = {
+    action = "train"
+    
+    BrainScriptNetworkBuilder = {
+        imageShape = 28:28:1                        # image dimensions, 1 channel only
+        labelDim = 10                               # number of distinct labels
+        featScale = 1/256
+        Scale{f} = x => Constant(f) .* x
+        
+        model = Sequential (
+            Scale {featScale} :
+            ConvolutionalLayer {16, (5:5), pad = true} : ReLU : 
+            MaxPoolingLayer    {(2:2), stride=(2:2)} :
+            DenseLayer {64} : ReLU : 
+            LinearLayer {labelDim}
+        )
+        
+        # inputs
+        features = Input {imageShape}
+        labels = Input {labelDim}
+
+        # apply model to features
+        z = model (features)
+
+        # loss and error computation
+        sqErr = SquareError (labels, z)
+        rmse = Sqrt (sqErr / labelDim)
+
+        # declare special nodes
+        featureNodes    = (features)
+        labelNodes      = (labels)
+        criterionNodes  = (rmse)
+        evaluationNodes = (rmse)
+        outputNodes     = (z)
+    }
+
+    SGD = {
+        epochSize = 0
+        minibatchSize = 64
+        maxEpochs = 15
+        learningRatesPerSample = 0.001*5:0.0005
+        momentumAsTimeConstant = 1024
+        numMBsToShowResult = 500
+    }
+
+    reader = {
+        readerType = "CNTKTextFormatReader"
+        # See ../README.md for details on getting the data (Train-28x28_cntk_text.txt).
+        file = "$DataDir$/Train-28x28_cntk_text.txt"
+        input = {
+            features   = { dim = 784 ; format = "dense" }
+            labels =   { dim = 10  ; format = "dense" }
+        }
+    }   
+}
+
+# TEST CONFIG
+testNetwork = {
+    action = "test"
+    minibatchSize = 1024    # reduce this if you run out of memory
+
+    reader = {
+        readerType = "CNTKTextFormatReader"
+        file = "$DataDir$/Test-28x28_cntk_text.txt"
+        input = {
+            features = { dim = 784 ; format = "dense" }
+            labels =   { dim = 10  ; format = "dense" }
+        }
+    }
+}
--- a/Examples/Image/GettingStarted/README.md
+++ b/Examples/Image/GettingStarted/README.md
@ -0,0 +1,103 @@
+# CNTK Examples: Image/Getting Started
+
+## Overview
+
+|Data:     |The MNIST dataset (http://yann.lecun.com/exdb/mnist/) of handwritten digits.
+|:---------|:---
+|Purpose   |This folder contains a number of examples that demonstrate the usage of BrainScript to define basic networks for deep learning on image tasks.
+|Network   |Simple feed-forward networks including dense layers, convolution layers, drop out and batch normalization for classification and regression tasks.
+|Training  |Stochastic gradient descent both with and without momentum.
+|Comments  |There are five configuration files; details are provided below.
+
+## Running the example
+
+### Getting the data
+
+These examples use the MNIST dataset to demonstrate various network configurations. The MNIST dataset is not included in the CNTK distribution but can be easily downloaded and converted by following the instructions in [DataSets/MNIST](../DataSets/MNIST). We recommend that you keep the downloaded data in that folder, as the configuration files in this folder assume that location by default.
+
+### Setup
+
+Compile the sources to generate the cntk executable (not required if you downloaded the binaries).
+
+__Windows:__ Add the folder of the cntk executable to your path
+(e.g. `set PATH=%PATH%;c:/src/cntk/x64/Release/;`)
+or prefix the call to the cntk executable with the corresponding folder.
+
+__Linux:__ Add the folder of the cntk executable to your path
+(e.g. `export PATH=$PATH:$HOME/src/cntk/build/Release/bin/`)
+or prefix the call to the cntk executable with the corresponding folder.
+
+### Run
+
+Run the example from the current folder (recommended) using:
+
+`cntk configFile=01_OneHidden.cntk`
+
+or run from any folder and specify the `GettingStarted` folder as the `currentDirectory`,
+e.g. running from the `Image` folder using:
+
+`cntk configFile=GettingStarted/01_OneHidden.cntk currentDirectory=GettingStarted`
+
+An Output folder will be created in the `Image/GettingStarted` folder, which is used to store intermediate results and trained models.
+
+## Details
+
+There are five cntk configuration files in the current folder. These cntk configuration files use BrainScript, a custom script language for CNTK. To learn more about BrainScript, please follow the introduction of [BrainScript Basic Concepts](https://github.com/Microsoft/CNTK/wiki/BS-Basic-concepts).
+
+### 01_OneHidden.cntk
+
+This is a simple network with one hidden layer that achieves a `1.76%` error rate. Since this model does not assume any spatial relationships between the pixels, it is often referred to as "permutation invariant".
+
+To run this example, use the following command:
+
+`cntk configFile=01_OneHidden.cntk`
+
+In this example, the MNIST images are first normalized to the range `[0,1)`, followed by a single dense hidden layer with 200 nodes. A [rectified linear unit (ReLU)](http://machinelearning.wustl.edu/mlpapers/paper_files/icml2010_NairH10.pdf) activation function is added for nonlinearity. Afterwards, another dense linear layer is added to generate the output label. The training adopts cross entropy as the cost function after softmax.
+
+In the `SGD` block, `learningRatesPerSample = 0.01*5:0.005` indicates using 0.01 as learning rate per sample for 5 epochs and then 0.005 for the rest. More details about the SGD block are explained [here](https://github.com/Microsoft/CNTK/wiki/SGD-Block).
+
+The MNIST data is loaded with a simple CNTK text format reader. The train and test datasets are converted by running the Python script in [DataSets/MNIST](../DataSets/MNIST). For more information on the reader block, please refer to [this page](https://github.com/Microsoft/CNTK/wiki/Reader-block).
+
+### 02_OneConv.cntk
+
+In the second example, we add a convolution layer to the network. Convolution layers were inspired by biological processes and have been extremely popular in image-related tasks, where neighboring pixels have high correlation. One of the earliest papers on convolution neural networks can be found [here](http://yann.lecun.com/exdb/publis/pdf/lecun-01a.pdf).
+
+To run this example, use the following command:
+
+`cntk configFile=02_OneConv.cntk`
+
+After normalization, a convolution layer with `16` kernels at size `(5,5)` is added, followed by a ReLU nonlinearity. Then, we perform max pooling on the output feature map, with size `(2,2)` and stride `(2,2)`. A dense layer of 64 hidden nodes is then added, followed by another ReLU, and another dense layer to generate the output. This network achieves `1.22%` error rate, which is better than the previous network.
+
+In practice, one would stack multiple convolution layers to improve classification accuracy. State-of-the-art convolution neural networks can achieve lower than 0.5% error rate on MNIST. Interested readers can find more examples in [Classification/ConvNet](../Classification/ConvNet).
+
+### 03_OneConvDropout.cntk
+
+In the third example, we demonstrate the use of dropout layers. Dropout is a network regularization technique that helps combat overfitting, in particular when the network contains many parameters. Dropout and ReLU activation are the two key techniques that enabled Alex Krizhevsky, Ilya Sutskever, and Geoffrey Hinton to win the ILSVRC-2012 competition, which has arguably changed the course of computer vision research. Their paper can be found [here](https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf).
+
+To run this example, use the following command:
+
+`cntk configFile=03_OneConvDropout.cntk`
+
+Compared with the previous example, we added a dropout layer after max pooling. Dropout can also be added after a dense layer if needed. The dropout rate is specified in the SGD block, as `dropoutRate = 0.5`.
+
+With dropout, the accuracy of the network improves slightly to `1.10%` error rate.
+
+### 04_OneConvBN.cntk
+
+In the fourth example, we add [batch normalization](https://arxiv.org/abs/1502.03167) to the network. Batch normalization was designed to address the internal covariate shift problem caused by input and parameter changes during training. The technique has been proven to be very useful in training very deep and complicated networks.
+
+In this example, we simply added a batch normalization layer to the `02_OneConv.cntk` network. To run this example, use the following command:
+
+`cntk configFile=04_OneConvBN.cntk`
+
+The network achieves around `0.96%` error rate, which is better than the previous examples. Due to the small training dataset and the extremely simple network, we have to stop the training early (10 epochs) in order to avoid overfitting.
+
+This cntk configuration file also demonstrates the use of custom layer definitions in BrainScript. Note that `ConvBnReluPoolLayer` and `DenseBnReluLayer` are both custom layers that contain different basic layer types.
+
+### 05_OneConvRegr.cntk
+
+In the fifth example, we show how CNTK can be used to perform a regression task. To simplify our task and not introduce any new datasets, we assume the digit labels of MNIST are regression targets rather than classification targets. We then reuse the same network architecture as in `02_OneConv`, replacing only the cost function with squared error. To run this example, use the following command:
+
+`cntk configFile=05_OneConvRegr.cntk`
+
+The trained network achieves a root-mean-square error (RMSE) of 0.0039. To see more sophisticated examples on regression tasks, please refer to [Regression](../Regression).
--- a/Examples/Image/MNIST/AdditionalFiles/mnist_convert.py
+++ b/Examples/Image/MNIST/AdditionalFiles/mnist_convert.py
@ -1,77 +0,0 @@
-import sys
-import urllib
-import gzip
-import shutil
-import os
-import struct
-import numpy as np
-
-def loadData(src, cimg):
-    print ('Downloading ' + src)
-    gzfname, h = urllib.urlretrieve(src, './delete.me')
-    print ('Done.')
-    try:
-        with gzip.open(gzfname) as gz:
-            n = struct.unpack('I', gz.read(4))
-            # Read magic number.
-            if n[0] != 0x3080000:
-                raise Exception('Invalid file: unexpected magic number.')
-            # Read number of entries.
-            n = struct.unpack('>I', gz.read(4))[0]
-            if n != cimg:
-                raise Exception('Invalid file: expected {0} entries.'.format(cimg))
-            crow = struct.unpack('>I', gz.read(4))[0]
-            ccol = struct.unpack('>I', gz.read(4))[0]
-            if crow != 28 or ccol != 28:
-                raise Exception('Invalid file: expected 28 rows/cols per image.')
-            # Read data.
-            res = np.fromstring(gz.read(cimg * crow * ccol), dtype = np.uint8)
-    finally:
-        os.remove(gzfname)
-    return res.reshape((cimg, crow * ccol))
-
-def loadLabels(src, cimg):
-    print 'Downloading ' + src
-    gzfname, h = urllib.urlretrieve(src, './delete.me')
-    print 'Done.'
-    try:
-        with gzip.open(gzfname) as gz:
-            n = struct.unpack('I', gz.read(4))
-            # Read magic number.
-            if n[0] != 0x1080000:
-                raise Exception('Invalid file: unexpected magic number.')
-            # Read number of entries.
-            n = struct.unpack('>I', gz.read(4))
-            if n[0] != cimg:
-                raise Exception('Invalid file: expected {0} rows.'.format(cimg))
-            # Read labels.
-            res = np.fromstring(gz.read(cimg), dtype = np.uint8)
-    finally:
-        os.remove(gzfname)
-    return res.reshape((cimg, 1))
-
-def load(dataSrc, labelsSrc, cimg):
-    data = loadData(dataSrc, cimg)
-    labels = loadLabels(labelsSrc, cimg)
-    return np.hstack((data, labels))
-
-def savetxt(filename, ndarray):
-    with open(filename, 'w') as f:
-        labels = map(' '.join, np.eye(10, dtype=np.uint).astype(str))
-        for row in ndarray:
-            row_str = row.astype(str)
-            label_str = labels[row[-1]]
-            feature_str = ' '.join(row_str[:-1])
-            f.write('|labels {} |features {}\n'.format(label_str, feature_str))
-
-if __name__ == "__main__":
-    train = load('http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz',
-        'http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz', 60000)
-    print 'Writing train text file...'
-    savetxt(r'./../Data/Train-28x28_cntk_text.txt', train)
-    print 'Done.'
-    test = load('http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz',
-        'http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz', 10000)
-    print 'Writing test text file...'
-    savetxt(r'./../Data/Test-28x28_cntk_text.txt', test)
-    print 'Done.'
--- a/Examples/Image/MNIST/Data/labelsmap.txt
+++ b/Examples/Image/MNIST/Data/labelsmap.txt
@ -1,10 +0,0 @@
-0
-1
-2
-3
-4
-5
-6
-7
-8
-9
--- a/Examples/Image/MNIST/README.md
+++ b/Examples/Image/MNIST/README.md
@ -1,85 +0,0 @@
-# CNTK example: MNIST 
-
-## Overview
-
-|Data:     |The MNIST database (http://yann.lecun.com/exdb/mnist/) of handwritten digits.
-|:---------|:---
-|Purpose   |This example demonstrates usage of the NDL (Network Description Language) to define networks.
-|Network   |NDLNetworkBuilder, simple feed forward and convolutional networks, cross entropy with softmax.
-|Training  |Stochastic gradient descent both with and without momentum.
-|Comments  |There are four config files, details are provided below.
-
-## Running the example
-
-### Getting the data
-
-The MNIST dataset is not included in the CNTK distribution but can be easily 
-downloaded and converted by running the following command from the 'AdditionalFiles' folder:
-
-`python mnist_convert.py`
-
-The script will download all required files and convert them to CNTK-supported format. 
-The resulting files (Train-28x28_cntk_text.txt and Test-28x28_cntk_text.txt) will be stored in the 'Data' folder.
-In case you don't have Python installed, there are 2 options:
-
-1. Download and install latest version of Python 2.7 from: https://www.python.org/downloads/ 
-Then install the numpy package by following instruction from: http://www.scipy.org/install.html#individual-packages
-
-2. Alternatively install the Python Anaconda distribution which contains most of the 
-popular Python packages including numpy: http://continuum.io/downloads
-
-### Setup
-
-Compile the sources to generate the cntk executable (not required if you downloaded the binaries).
-
-__Windows:__ Add the folder of the cntk executable to your path 
-(e.g. `set PATH=%PATH%;c:/src/cntk/x64/Debug/;`) 
-or prefix the call to the cntk executable with the corresponding folder. 
-
-__Linux:__ Add the folder of the cntk executable to your path 
-(e.g. `export PATH=$PATH:$HOME/src/cntk/build/debug/bin/`) 
-or prefix the call to the cntk executable with the corresponding folder. 
-
-### Run
-
-Run the example from the Image/MNIST/Data folder using:
-
-`cntk configFile=../Config/01_OneHidden_ndl_deprecated.cntk`
-
-or run from any folder and specify the Data folder as the `currentDirectory`, 
-e.g. running from the Image/MNIST folder using:
-
-`cntk configFile=Config/01_OneHidden_ndl_deprecated.cntk currentDirectory=Data`
-
-The output folder will be created inside Image/MNIST/.
-
-## Details
-
-### Config files
-
-There are four config files and the corresponding network description files in the 'Config' folder:
-
-1. 01_OneHidden.ndl is a simple, one hidden layer network that produces 2.3% of error.
-To run the sample, navigate to the Data folder and run the following command:  
-`cntk configFile=../Config/01_OneHidden_ndl_deprecated.cntk`
-
-2. 02_Convolution.ndl is more interesting, convolutional network which has 2 convolutional and 2 max pooling layers. 
-The network produces 0.87% of error after training for about 2 minutes on GPU.
-To run the sample, navigate to the Data folder and run the following command:  
-`cntk configFile=../Config/02_Convolution_ndl_deprecated.cntk`
-
-3. 03_ConvBatchNorm.ndl is almost identical to 02_Convolution.ndl 
-except that it uses batch normalization for the convolutional and fully connected layers.
-As a result, it achieves around 0.8% of error after training for just 2 epochs (and less than 30 seconds).
-To run the sample, navigate to the Data folder and run the following command:  
-`cntk configFile=../Config/03_ConvBatchNorm_ndl_deprecated.cntk`
-
-4. 04_DeConv.ndl illustrates the usage of Deconvolution and Unpooling. It is a network with one Convolution, one Pooling, one Unpooling and one Deconvolution layer. In fact it is an auto-encoder network where Rectified Linear Unit (ReLU) or Sigmoid layer is now replaced with Convolutional ReLU (for encoding) and Deconvolutional ReLU (for decoding) layers. The network goal is to reconstruct the original signal, with Mean Squared Error (MSE) used to minimize the reconstruction error. Generally such networks are used in semantic segmentation.  
-To run the sample, navigate to the Data folder and run the following command:  
-`cntk configFile=../Config/04_DeConv_ndl_deprecated.cntk` 
-
-For more details, refer to .ndl and the corresponding .cntk files.
-
-### Additional files
-
-The 'AdditionalFiles' folder contains the python script to download and convert the data. 
--- a/Examples/Image/Miscellaneous/CIFAR-10/CifarConverter.py
+++ b/Examples/Image/Miscellaneous/CIFAR-10/CifarConverter.py
@ -1,80 +0,0 @@
-import os
-import sys
-import struct
-import cPickle as cp
-from PIL import Image
-import numpy as np
-import xml.etree.cElementTree as et
-import xml.dom.minidom
-
-imgSize = 32
-
-def saveImage(fname, data, label, mapFile, regrFile, pad, **key_parms):
-    # data in CIFAR-10 dataset is in CHW format.
-    pixData = data.reshape((3, imgSize, imgSize))
-    if ('mean' in key_parms):
-        key_parms['mean'] += pixData
-
-    if pad > 0:
-        pixData = np.pad(pixData, ((0, 0), (pad, pad), (pad, pad)), mode='constant', constant_values=128) # can also use mode='edge'
-
-    img = Image.new('RGB', (imgSize + 2 * pad, imgSize + 2 * pad))
-    pixels = img.load()
-    for x in range(img.size[0]):
-        for y in range(img.size[1]):
-            pixels[x, y] = (pixData[0][y][x], pixData[1][y][x], pixData[2][y][x])
-    img.save(fname)
-    mapFile.write("%s\t%d\n" % (fname, label))
-    
-    # compute per channel mean and store for regression example
-    channelMean = np.mean(pixData, axis=(1,2))
-    regrFile.write("|regrLabels\t%f\t%f\t%f\n" % (channelMean[0]/255.0, channelMean[1]/255.0, channelMean[2]/255.0))
-    
-
-def saveMean(fname, data):
-    root = et.Element('opencv_storage')
-    et.SubElement(root, 'Channel').text = '3'
-    et.SubElement(root, 'Row').text = str(imgSize)
-    et.SubElement(root, 'Col').text = str(imgSize)
-    meanImg = et.SubElement(root, 'MeanImg', type_id='opencv-matrix')
-    et.SubElement(meanImg, 'rows').text = '1'
-    et.SubElement(meanImg, 'cols').text = str(imgSize * imgSize * 3)
-    et.SubElement(meanImg, 'dt').text = 'f'
-    et.SubElement(meanImg, 'data').text = ' '.join(['%e' % n for n in np.reshape(data, (imgSize * imgSize * 3))])
-
-    tree = et.ElementTree(root)
-    tree.write(fname)
-    x = xml.dom.minidom.parse(fname)
-    with open(fname, 'w') as f:
-        f.write(x.toprettyxml(indent = '  '))
-
-if __name__ == "__main__":
-    if len(sys.argv) != 2:
-        print "Usage: CifarConverter.py <path to CIFAR-10 dataset directory>\nCIFAR-10 dataset (Python version) can be downloaded from http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz"
-        sys.exit(1)
-    rootDir = sys.argv[1]
-    trainDir = os.path.join(rootDir, os.path.join('data', 'train'))
-    if not os.path.exists(trainDir):
-        os.makedirs(trainDir)
-    testDir = os.path.join(rootDir, os.path.join('data', 'test'))
-    if not os.path.exists(testDir):
-      os.makedirs(testDir)
-    data = {}
-    dataMean = np.zeros((3, imgSize, imgSize)) # mean is in CHW format.
-    with open(os.path.join(rootDir, 'train_map.txt'), 'w') as mapFile:
-        with open(os.path.join(rootDir, 'train_regrLabels.txt'), 'w') as regrFile:
-            for ifile in range(1, 6):
-                with open(os.path.join(rootDir, 'data_batch_' + str(ifile)), 'rb') as f:
-                    data = cp.load(f)
-                    for i in range(10000):
-                        fname = os.path.join(trainDir, ('%05d.png' % (i + (ifile - 1) * 10000)))
-                        saveImage(fname, data['data'][i, :], data['labels'][i], mapFile, regrFile, 4, mean=dataMean)
-    dataMean = dataMean / (50 * 1000)
-    saveMean(os.path.join(rootDir, 'CIFAR-10_mean.xml'), dataMean)
-    with open(os.path.join(rootDir, 'test_map.txt'), 'w') as mapFile:
-        with open(os.path.join(rootDir, 'test_regrLabels.txt'), 'w') as regrFile:
-            with open(os.path.join(rootDir, 'test_batch'), 'rb') as f:
-                data = cp.load(f)
-                for i in range(10000):
-                    fname = os.path.join(testDir, ('%05d.png' % i))
-                    saveImage(fname, data['data'][i, :], data['labels'][i], mapFile, regrFile, 0)
--- a/Examples/Image/Miscellaneous/CIFAR-10/CifarConverter_py3.py
+++ b/Examples/Image/Miscellaneous/CIFAR-10/CifarConverter_py3.py
@ -1,73 +0,0 @@
-import os
-import sys
-import struct
-import pickle as cp
-from PIL import Image
-import numpy as np
-import xml.etree.cElementTree as et
-import xml.dom.minidom
-
-imgSize = 32
-
-def saveImage(fname, data, label, mapFile, pad, **key_parms):
-    # data in CIFAR-10 dataset is in CHW format.
-    pixData = data.reshape((3, imgSize, imgSize))
-    if ('mean' in key_parms):
-        key_parms['mean'] += pixData
-
-    if pad > 0:
-        pixData = np.pad(pixData, ((0, 0), (pad, pad), (pad, pad)), mode='constant', constant_values=128) # can also use mode='edge'
-
-    img = Image.new('RGB', (imgSize + 2 * pad, imgSize + 2 * pad))
-    pixels = img.load()
-    for x in range(img.size[0]):
-        for y in range(img.size[1]):
-            pixels[x, y] = (pixData[0][y][x], pixData[1][y][x], pixData[2][y][x])
-    img.save(fname)
-    mapFile.write("%s\t%d\n" % (fname, label))
-
-def saveMean(fname, data):
-    root = et.Element('opencv_storage')
-    et.SubElement(root, 'Channel').text = '3'
-    et.SubElement(root, 'Row').text = str(imgSize)
-    et.SubElement(root, 'Col').text = str(imgSize)
-    meanImg = et.SubElement(root, 'MeanImg', type_id='opencv-matrix')
-    et.SubElement(meanImg, 'rows').text = '1'
-    et.SubElement(meanImg, 'cols').text = str(imgSize * imgSize * 3)
-    et.SubElement(meanImg, 'dt').text = 'f'
-    et.SubElement(meanImg, 'data').text = ' '.join(['%e' % n for n in np.reshape(data, (imgSize * imgSize * 3))])
-
-    tree = et.ElementTree(root)
-    tree.write(fname)
-    x = xml.dom.minidom.parse(fname)
-    with open(fname, 'w') as f:
-        f.write(x.toprettyxml(indent = '  '))
-
-if __name__ == "__main__":
-    if len(sys.argv) != 2:
-        print ("Usage: CifarConverter.py <path to CIFAR-10 dataset directory>\nCIFAR-10 dataset (Python version) can be downloaded from http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz")
-        sys.exit(1)
-    rootDir = sys.argv[1]
-    trainDir = os.path.join(rootDir, os.path.join('data', 'train'))
-    if not os.path.exists(trainDir):
-        os.makedirs(trainDir)
-    testDir = os.path.join(rootDir, os.path.join('data', 'test'))
-    if not os.path.exists(testDir):
-      os.makedirs(testDir)
-    data = {}
-    dataMean = np.zeros((3, imgSize, imgSize)) # mean is in CHW format.
-    with open(os.path.join(rootDir, 'train_map.txt'), 'w') as mapFile:
-        for ifile in range(1, 6):
-            with open(os.path.join(rootDir, 'data_batch_' + str(ifile)), 'rb') as f:
-                data = cp.load(f, encoding='latin1')
-                for i in range(10000):
-                    fname = os.path.join(trainDir, ('%05d.png' % (i + (ifile - 1) * 10000)))
-                    saveImage(fname, data['data'][i, :], data['labels'][i], mapFile, 4, mean=dataMean)
-    dataMean = dataMean / (50 * 1000)
-    saveMean(os.path.join(rootDir, 'CIFAR-10_mean.xml'), dataMean)
-    with open(os.path.join(rootDir, 'test_map.txt'), 'w') as mapFile:
-        with open(os.path.join(rootDir, 'test_batch'), 'rb') as f:
-            data = cp.load(f, encoding='latin1')
-            for i in range(10000):
-                fname = os.path.join(testDir, ('%05d.png' % i))
-                saveImage(fname, data['data'][i, :], data['labels'][i], mapFile, 0)
--- a/Examples/Image/Miscellaneous/CIFAR-10/CifarDownload.py
+++ b/Examples/Image/Miscellaneous/CIFAR-10/CifarDownload.py
@ -1,105 +0,0 @@
-import sys
-import urllib
-import tarfile
-import shutil
-import os
-import struct
-import numpy as np
-import cPickle as cp
-import getopt
-
-ImgSize = 32
-NumFeat = ImgSize * ImgSize * 3
-
-def readBatch(src, outFmt):
-    with open(src, 'rb') as f:
-        d = cp.load(f)
-        # Note: most of the frameworks use spatial-major (aka NCHW) input format:
-        # R0..RN,G0..GN,B0..BN
-        # There are 2 possible options in CNTK:
-        # 1. If CNTK is built with cuDNN then 'cudnn' (i.e. NCHW format) should be used.
-        # 2. Otherwise, legacy CNTK 'NHWC' format should be used. As CIFAR-10 dataset comes in 
-        #   NCHW format, it has to be converted to CNTK legacy format first.
-        data = d['data']
-        if outFmt == 'cudnn':
-            feat = data
-        elif outFmt == 'legacy':
-            r = data[:, : ImgSize * ImgSize]
-            g = data[:, ImgSize * ImgSize : 2 * ImgSize * ImgSize]
-            b = data[:, 2 * ImgSize * ImgSize : 3 * ImgSize * ImgSize]
-            feat = np.empty_like(data)
-            feat[:, ::3] = r
-            feat[:, 1::3] = g
-            feat[:, 2::3] = b
-        else:
-            print ('Format not supported: ' + outFmt)
-            usage()
-            sys.exit(1)
-    res = np.hstack((feat, np.reshape(d['labels'], (len(d['labels']), 1))))
-    return res.astype(np.int)
-
-def loadData(src, outFmt):
-    print ('Downloading ' + src)
-    fname, h = urllib.urlretrieve(src, './delete.me')
-    print ('Done.')
-    try:
-        print ('Extracting files...')
-        with tarfile.open(fname) as tar:
-            tar.extractall()
-        print ('Done.')
-        print ('Preparing train set...')
-        trn = np.empty((0, NumFeat + 1), dtype=np.int)
-        for i in range(5):
-            batchName = './cifar-10-batches-py/data_batch_{0}'.format(i + 1)
-            trn = np.vstack((trn, readBatch(batchName, outFmt)))
-        print ('Done.')
-        print ('Preparing test set...')
-        tst = readBatch('./cifar-10-batches-py/test_batch', outFmt)
-        print ('Done.')
-    finally:
-        os.remove(fname)
-    return (trn, tst)
-
-def usage():
-    print ('Usage: CifarDownload.py [-f <format>] \n  where format can be either cudnn or legacy. Default is cudnn.')
-
-def parseCmdOpt(argv):
-    if len(argv) == 0:
-        print ("Using cudnn output format.")
-        return "cudnn"
-    try:
-        opts, args = getopt.getopt(argv, 'hf:', ['help', 'outFormat='])
-    except getopt.GetoptError:
-        usage()
-        sys.exit(1)
-    for opt, arg in opts:
-        if opt in ('-h', '--help'):
-            usage()
-            sys.exit()
-        elif opt in ('-f', '--outFormat'):
-            fmt = arg
-            if fmt != 'cudnn' and fmt != 'legacy':
-                print ('Invalid output format option.')
-                usage()
-                sys.exit(1)
-            return fmt
-
-def savetxt(filename, ndarray):
-    with open(filename, 'w') as f:
-        labels = map(' '.join, np.eye(10, dtype=np.uint).astype(str))
-        for row in ndarray:
-            row_str = row.astype(str)
-            label_str = labels[row[-1]]
-            feature_str = ' '.join(row_str[:-1])
-            f.write('|labels {} |features {}\n'.format(label_str, feature_str))
-
-
-if __name__ == "__main__":
-    fmt = parseCmdOpt(sys.argv[1:])
-    trn, tst = loadData('http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz', fmt)
-    print ('Writing train text file...')
-    savetxt(r'./Train_cntk_text.txt', trn)
-    print ('Done.')
-    print ('Writing test text file...')
-    savetxt(r'./Test_cntk_text.txt', tst)
-    print ('Done.')
--- a/Examples/Image/Miscellaneous/CIFAR-10/CifarDownload_py3.py
+++ b/Examples/Image/Miscellaneous/CIFAR-10/CifarDownload_py3.py
@ -1,105 +0,0 @@
-import sys
-import urllib.request as ul
-import pickle as cp
-import tarfile
-import shutil
-import os
-import struct
-import numpy as np
-import getopt
-
-ImgSize = 32
-NumFeat = ImgSize * ImgSize * 3
-
-def readBatch(src, outFmt):
-    with open(src, 'rb') as f:
-        d = cp.load(f,  encoding="latin1")
-        # Note: most of the frameworks use spatial-major (aka NCHW) input format:
-        # R0..RN,G0..GN,B0..BN
-        # There are 2 possible options in CNTK:
-        # 1. If CNTK is built with cuDNN then 'cudnn' (i.e. NCHW format) should be used.
-        # 2. Otherwise, legacy CNTK 'NHWC' format should be used. As CIFAR-10 dataset comes in 
-        #   NCHW format, it has to be converted to CNTK legacy format first.
-        data = d['data']
-        if outFmt == 'cudnn':
-            feat = data
-        elif outFmt == 'legacy':
-            r = data[:, : ImgSize * ImgSize]
-            g = data[:, ImgSize * ImgSize : 2 * ImgSize * ImgSize]
-            b = data[:, 2 * ImgSize * ImgSize : 3 * ImgSize * ImgSize]
-            feat = np.empty_like(data)
-            feat[:, ::3] = r
-            feat[:, 1::3] = g
-            feat[:, 2::3] = b
-        else:
-            print ('Format not supported: ' + outFmt)
-            usage()
-            sys.exit(1)
-    res = np.hstack((feat, np.reshape(d['labels'], (len(d['labels']), 1))))
-    return res.astype(np.int)
-
-def loadData(src, outFmt):
-    print ('Downloading ' + src)
-    fname, h = ul.urlretrieve(src, './delete.me')
-    print ('Done.')
-    try:
-        print ('Extracting files...')
-        with tarfile.open(fname) as tar:
-            tar.extractall()
-        print ('Done.')
-        print ('Preparing train set...')
-        trn = np.empty((0, NumFeat + 1), dtype=np.int)
-        for i in range(5):
-            batchName = './cifar-10-batches-py/data_batch_{0}'.format(i + 1)
-            trn = np.vstack((trn, readBatch(batchName, outFmt)))
-        print ('Done.')
-        print ('Preparing test set...')
-        tst = readBatch('./cifar-10-batches-py/test_batch', outFmt)
-        print ('Done.')
-    finally:
-        os.remove(fname)
-    return (trn, tst)
-
-def usage():
-    print ('Usage: CifarDownload_py3.py [-f <format>] \n  where format can be either cudnn or legacy. Default is cudnn.')
-
-def parseCmdOpt(argv):
-    if len(argv) == 0:
-        print ("Using cudnn output format.")
-        return "cudnn"
-    try:
-        opts, args = getopt.getopt(argv, 'hf:', ['help', 'outFormat='])
-    except getopt.GetoptError:
-        usage()
-        sys.exit(1)
-    for opt, arg in opts:
-        if opt in ('-h', '--help'):
-            usage()
-            sys.exit()
-        elif opt in ('-f', '--outFormat'):
-            fmt = arg
-            if fmt != 'cudnn' and fmt != 'legacy':
-                print ('Invalid output format option.')
-                usage()
-                sys.exit(1)
-            return fmt
-
-def savetxt(filename, ndarray):
-    with open(filename, 'w') as f:
-        labels = list(map(' '.join, np.eye(10, dtype=np.uint).astype(str)))
-        for row in ndarray:
-            row_str = row.astype(str)
-            label_str = labels[row[-1]]
-            feature_str = ' '.join(row_str[:-1])
-            f.write('|labels {} |features {}\n'.format(label_str, feature_str))
-
-
-if __name__ == "__main__":
-    fmt = parseCmdOpt(sys.argv[1:])
-    trn, tst = loadData('http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz', fmt)
-    print ('Writing train text file...')
-    savetxt(r'./Train_cntk_text.txt', trn)
-    print ('Done.')
-    print ('Writing test text file...')
-    savetxt(r'./Test_cntk_text.txt', tst)
-    print ('Done.')
--- a/Examples/Image/Miscellaneous/CIFAR-10/ConvNetBN.cntk
+++ b/Examples/Image/Miscellaneous/CIFAR-10/ConvNetBN.cntk
@ -1,91 +0,0 @@
-# Simple CIFAR-10 convnet, without and with BatchNormalization.
-
-command = TrainConvNetWithBN:Eval
-
-makeMode = false ; traceLevel = 1 ; deviceId = 0
-
-RootDir = "." ; DataDir  = "$RootDir$" ; ModelDir = "$RootDir$/Output/Models"
-
-modelPath = "$ModelDir$/ConvNetBN"
-
-# Training with BN
-TrainConvNetWithBN = {
-    action = "train"
-
-    BrainScriptNetworkBuilder = {
-        imageShape = 32:32:3
-        labelDim = 10
-
-        Subtract128 (x) = x - Constant (128)
-
-        model = Sequential (
-            Subtract128 :
-            ConvolutionalLayer {32, (5:5), pad = true, bias = false, init = "heNormal", initValueScale=0.00390625} :
-              BatchNormalizationLayer {spatialRank = 2, normalizationTimeConstant = 4096} : ReLU :
-                MaxPoolingLayer {(3:3), stride = (2:2)} :
-            ConvolutionalLayer {32, (5:5), pad = true, bias = false, init = "heNormal"} :
-              BatchNormalizationLayer {spatialRank = 2, normalizationTimeConstant = 4096} : ReLU :
-                MaxPoolingLayer {(3:3), stride = (2:2)} :
-            ConvolutionalLayer {64, (5:5), pad = true, bias = false, init = "heNormal"} :
-              BatchNormalizationLayer {spatialRank = 2, normalizationTimeConstant = 4096} : ReLU :
-                MaxPoolingLayer {(3:3), stride = (2:2)} :
-            LinearLayer {64, bias = false, init = "heNormal", initValueScale=0.1} :
-              BatchNormalizationLayer {normalizationTimeConstant = 4096} : ReLU :
-            LinearLayer {labelDim, init = "heNormal", initValueScale=0.1}
-        )
-
-        # inputs
-        features = Input {imageShape}
-        labels   = Input {labelDim}
-
-        # apply model to features
-        z = model (features)
-
-        # connect to system
-        ce       = CrossEntropyWithSoftmax (labels, z)
-        errs     = ClassificationError         (labels, z)
-        top5Errs = ClassificationError         (labels, z, topN=5)
-
-        featureNodes    = (features)
-        labelNodes      = (labels)
-        criterionNodes  = (ce)
-        evaluationNodes = (errs)
-        outputNodes     = (z)
-    }
-
-    SGD = {
-        epochSize = 49984 ; minibatchSize = 64
-
-        learningRatesPerSample = 0.00046875*5:0.00015625
-        momentumAsTimeConstant = 0
-        maxEpochs = 10
-        L2RegWeight = 0.003
-        dropoutRate = 0
-
-        firstMBsToShowResult = 10 ; numMBsToShowResult = 500
-    }
-
-    reader = {
-        readerType = "CNTKTextFormatReader"
-        file = "$DataDir$/Train_cntk_text.txt"
-        input = {
-            features = { dim = 3072 ; format = "dense" }
-            labels   = { dim = 10 ;   format = "dense" }
-        }
-    }
-}
-
-# Eval action
-Eval = {
-    action = "eval"
-    minibatchSize = 16
-    evalNodeNames = errs:top5Errs  # also test top-5 error rate
-    reader = {
-        readerType = "CNTKTextFormatReader"
-        file = "$DataDir$/Test_cntk_text.txt"
-        input = {
-            features = { dim = 3072 ; format = "dense" }
-            labels   = { dim = 10 ;   format = "dense" }
-        }
-    }
-}
--- a/Examples/Image/Miscellaneous/CIFAR-10/README.md
+++ b/Examples/Image/Miscellaneous/CIFAR-10/README.md
@ -1,57 +0,0 @@
-# CNTK example: CIFAR-10
-
-## Overview
-
-|Data:     |The CIFAR-10 dataset (http://www.cs.toronto.edu/~kriz/cifar.html) of small images.
-|:---------|:---
-|Purpose   |This example demonstrates usage of the NDL (Network Description Language) to define networks.
-|Network   |NDLNetworkBuilder, convolutional networks with batch normalization (including ResNet), cross entropy with softmax.
-|Training  |Stochastic gradient descent with momentum.
-|Comments  |See below.
-
-## Running the example
-
-### Getting the data
-
-CIFAR-10 dataset is not included in CNTK distribution but can be easily downloaded and converted by running the following commands from this folder:
-
-```
-python CifarDownload.py [-f <format: cudnn|legacy>]
-python CifarConverter.py <path to CIFAR-10 dataset>
-```
-
-The scripts will download all required files and convert them to CNTK-supported format.
-In case you don't have Python installed (you require Python 2.7 and numpy), we recommend to install the Python Anaconda distribution which contains most of the popular Python packages including numpy:
-http://continuum.io/downloads
-
-The download script has an optional `-f` parameter which specifies output format of the datasets. `cudnn` option (default) saves dataset in a spatial-major format used by cuDNN, while `legacy` - in CNTK legacy format. Use `cudnn` if CNTK is compiled with cuDNN and `legacy` otherwise.
-
-The converter script takes a full path to the original CIFAR-10 dataset (in Python pickle format). The script will create `data` folder inside of provided path where it will store both train and test images (in `train` and `test` folders). It will also create appropriate mapping files for the CNTK ImageReader as well as mean file.
-
-## Details
-
-### Config files
-
-1. 01_Convolution.ndl is a convolutional network which has 3 convolutional and 3 max pooling layers and resembles the network described here:
-https://code.google.com/p/cuda-convnet/source/browse/trunk/example-layers/layers-80sec.cfg 
-(main differences are usage of max pooling layers everywhere rather than mix of max and average pooling, as well as dropout in fully-connected layer).
-The network produces 20.5% of error after training for about 3 minutes on GPU.
-To run the sample, navigate to the sample folder and run the following command:
-```
-cntk configFile=01_Conv_ndl_deprecated.cntk
-```
-2. 02_BatchNormConv.ndl is a convolutional network which uses batch normalization technique (http://arxiv.org/abs/1502.03167).
-To run the sample, navigate to the sample folder and run the following command:
-```
-cntk configFile=02_BatchNormConv_ndl_deprecated.cntk
-```
-
-3. 03_ResNet.ndl and 04_ResNet_56.ndl are very deep convolutional networks that use ResNet architecture and have 20 and 56 layers respectively (http://arxiv.org/abs/1512.03385).
-With 03_ResNet_ndl_deprecated.cntk you should get around 8.2% of error after training for about 50 minutes. 04_ResNet_56_ndl_deprecated.cntk should produce around 6.4% of error after training for about 3 hours (see log files in the Output directory).
-
-4. 05_ConvLocal_ndl_deprecated.cntk uses locally-connected convolution layers (see `conv_local3` and `conv_local4` in `05_ConvLocal_ndl_deprecated.cntk`) and resembles a network described here: https://code.google.com/p/cuda-convnet/source/browse/trunk/example-layers/layers-conv-local-11pct.cfg
-
-5. 06_RegressionSimple.cntk shows how to train a regression model on image data. It uses a very simple network and a composite reader using both the ImageReader and CNTKTextFormatReader and defines a the RMSE (root mean square error) as the loss function. The value that the network learns to predict are simply the average rgb values of an image normalized to [0, 1]. To generate the ground truth labels for regression you need to run the CifarConverter.py script (since this example was added later you might need to rerun it to generate the regression files). See also here: https://github.com/Microsoft/CNTK/wiki/Train-a-regression-model-on-images
-
-For more details, refer to .ndl and corresponding .cntk files.
-
--- a/Examples/Image/Miscellaneous/CIFAR-10/labelsmap.txt
+++ b/Examples/Image/Miscellaneous/CIFAR-10/labelsmap.txt
@ -1,12 +0,0 @@
-0
-1
-2
-3
-4
-5
-6
-7
-8
-9
-
-
--- a/Examples/Image/Regression/README.md
+++ b/Examples/Image/Regression/README.md
@ -0,0 +1,30 @@
+# CNTK Examples: Image/Regression
+
+## Overview
+
+|Data:     |The CIFAR-10 dataset (http://www.cs.toronto.edu/~kriz/cifar.html) of small images.
+|:---------|:---
+|Purpose   |This folder contains a number of examples that demonstrate the usage of BrainScript to define deep learning networks for image regression tasks.
+|Network   |Convolutional neural networks.
+|Training  |Stochastic gradient descent with momentum.
+|Comments  |See below.
+
+## Running the example
+
+### Getting the data
+
+We use the CIFAR-10 dataset to demonstrate how to perform regression on images. The CIFAR-10 dataset is not included in the CNTK distribution but can be easily downloaded and converted by following the instructions in [DataSets/CIFAR-10](../DataSets/CIFAR-10). We recommend that you keep the downloaded data in that folder, as the configuration files in this folder assume that location by default.
+
+## Details
+
+### RegrSimple_CIFAR10.cntk
+
+In this example, we set up a very simple task to have a neural network predict the average RGB values of images normalized to [0,1). To generate the ground truth labels for this regression task, the CIFAR-10 installation script in [DataSets/CIFAR-10](../DataSets/CIFAR-10) will generate two additional files, `train_regrLabels.txt` and `test_regrLabels.txt`, for train and test respectively.
+
+Run the example from the current folder using:
+
+`cntk configFile=RegrSimple_CIFAR10.cntk`
+
+The network produces a root-mean-square error (rmse) of around 0.00098, which indicates that the regression accuracy is very high for this simple task.
+
+You may examine the cntk configuration file [RegrSimple_CIFAR10.cntk](./RegrSimple_CIFAR10.cntk) for more details. Note that the network is purely linear, with no nonlinearity. This is intentional, since computing the average RGB values of an image is a linear operation. The reader is a composite reader that uses the `ImageReader` to read images and the `CNTKTextFormatReader` to read the regression ground truth labels. The configuration file also demonstrates how to write the network prediction for the test data into an output file.
--- a/Examples/Image/Miscellaneous/CIFAR-10/06_RegressionSimple.cntk
+++ b/Examples/Image/Miscellaneous/CIFAR-10/06_RegressionSimple.cntk
@ -1,21 +1,17 @@
-# 06_RegressionSimple.cntk shows how to train a regression model on image data.
+# RegrSimple_CIFAR10.cntk shows how to train a regression model on CIFAR-10 image data.
 # It uses a very simple network and a composite reader using both the ImageReader
 # and CNTKTextFormatReader and defines the RMSE (root mean square error) as the
 # loss function. The value that the network learns to predict are simply the
 # average rgb values of an image normalized to [0, 1].
-# The network consists simply of two linear layers, i.e. two fully connected layers
-# with no non-linear activation function, simply LinearLayer{...} (v) = W * v + b
-# See https://github.com/Microsoft/CNTK/wiki/Layers-Reference for details on CNTK layers.
-# See README.md for instructions on how to generate data and regression labels for this example.

-command = TrainConvNet:Write
+command = TrainConvNet:Write:Test

 makeMode = false ; traceLevel = 1 ; deviceId = "auto"

-rootDir = "." ; configDir = "$rootDir$" ; dataDir  = "$rootDir$" ;
-outputDir = "$rootDir$/Output" ; modelDir = "$outputDir$/Models"
+rootDir = ".." ; dataDir  = "$rootDir$/DataSets/CIFAR-10" ;
+outputDir = "Output" ; modelDir = "$outputDir$/Models"

-modelPath = "$ModelDir$/06_RegressionSimple.cmf"
+modelPath = "$ModelDir$/RegrSimple_CIFAR10.cmf"

 # Training action for a convolutional network
 TrainConvNet = {
@ -70,7 +66,7 @@ TrainConvNet = {
        verbosity = 0 ; randomize = true
        deserializers = ({
            type = "ImageDeserializer" ; module = "ImageReader"
-            file = "$dataDir$/cifar-10-batches-py/train_map.txt"
+            file = "$dataDir$/train_map.txt"
            input = {
                features = { transforms = (
                    { type = "Scale" ; width = 32 ; height = 32 ; channels = 3 ; interpolations = "linear" } :
@ -80,7 +76,7 @@ TrainConvNet = {
            }
        } : {
            type = "CNTKTextFormatDeserializer" ; module = "CNTKTextFormatReader"
-            file = "$dataDir$/cifar-10-batches-py/train_regrLabels.txt"
+            file = "$dataDir$/train_regrLabels.txt"
            input = {
                regrLabels = { dim = 3 ; format = "dense" }
            }
@ -93,13 +89,13 @@ Write = {
    action = "write"
    minibatchSize = 1
    outputNodeNames = (ol, regrLabels, rmse)
-    outputPath = "$OutputDir$/06_RegressionSimple"
+    outputPath = "$OutputDir$/RegrSimple_CIFAR10"
    
    reader = {
        verbosity = 0 ; randomize = false
        deserializers = ({
            type = "ImageDeserializer" ; module = "ImageReader"
-            file = "$dataDir$/cifar-10-batches-py/test_map.txt"
+            file = "$dataDir$/test_map.txt"
            input = {
                features = { transforms = (
                    { type = "Scale" ; width = 32 ; height = 32 ; channels = 3 ; interpolations = "linear" } :
@ -109,7 +105,7 @@ Write = {
            }
        } : {
            type = "CNTKTextFormatDeserializer" ; module = "CNTKTextFormatReader"
-            file = "$dataDir$/cifar-10-batches-py/test_regrLabels.txt"
+            file = "$dataDir$/test_regrLabels.txt"
            input = {
                regrLabels = { dim = 3 ; format = "dense" }
            }
@ -122,13 +118,13 @@ Test = {
    action = "test"
    minibatchSize = 512
    outputNodeNames = (ol, regrLabels, rmse)
-    outputPath = "$OutputDir$/06_RegressionSimple"
+    outputPath = "$OutputDir$/RegrSimple_CIFAR10"
    
    reader = {
        verbosity = 0 ; randomize = false
        deserializers = ({
            type = "ImageDeserializer" ; module = "ImageReader"
-            file = "$dataDir$/cifar-10-batches-py/test_map.txt"
+            file = "$dataDir$/test_map.txt"
            input = {
                features = { transforms = (
                    { type = "Scale" ; width = 32 ; height = 32 ; channels = 3 ; interpolations = "linear" } :
@ -138,7 +134,7 @@ Test = {
            }
        } : {
            type = "CNTKTextFormatDeserializer" ; module = "CNTKTextFormatReader"
-            file = "$dataDir$/cifar-10-batches-py/test_regrLabels.txt"
+            file = "$dataDir$/test_regrLabels.txt"
            input = {
                regrLabels = { dim = 3 ; format = "dense" }
            }
--- a/Examples/README.md
+++ b/Examples/README.md
@ -16,5 +16,5 @@ Please refer to the Readme file in the corresponding folder for further details.
 |:------------------------|:-------------------------------------------------|:----------------|
 |Other/Simple2d           | Synthetic 2d data                                | FF (CPU and GPU)
 |Speech/AN4               | Speech data (CMU AN4)                            | FF and LSTM
-|Image/MNIST              | Image data (MNIST handwritten digit recognition) | CNN 
+|Image/GettingStarted     | Image data (MNIST handwritten digit recognition) | CNN 
 |Text/PennTreebank        | Text data (penn treebank)                        | RNN
--- a/59
+++ b/59
@ -412,6 +412,7 @@ $(CNTKLIBRARY_LIB): $(CNTKLIBRARY_OBJ) | $(CNTKMATH_LIB)
 CNTKLIBRARY_TESTS_SRC =\
 	Tests/UnitTests/V2LibraryTests/FeedForwardTests.cpp \
 	Tests/UnitTests/V2LibraryTests/Main.cpp \
+	Tests/UnitTests/V2LibraryTests/Common.cpp \
 	Tests/UnitTests/V2LibraryTests/NDArrayViewTests.cpp \
 	Tests/UnitTests/V2LibraryTests/RecurrentFunctionTests.cpp \
 	Tests/UnitTests/V2LibraryTests/TensorTests.cpp \
@ -422,6 +423,8 @@ CNTKLIBRARY_TESTS_SRC =\
 	Tests/UnitTests/V2LibraryTests/FunctionTests.cpp \
 	Tests/UnitTests/V2LibraryTests/SequenceClassification.cpp \
 	Tests/UnitTests/V2LibraryTests/Seq2Seq.cpp \
+	Tests/UnitTests/V2LibraryTests/TruncatedLSTMAcousticModel.cpp \
+	Tests/UnitTests/V2LibraryTests/DeviceSelectionTests.cpp \
 	Examples/Evaluation/CPPEvalV2Client/EvalMultithreads.cpp \

 CNTKLIBRARY_TESTS:=$(BINDIR)/v2librarytests
@ -474,7 +477,7 @@ $(EVAL_LIB): $(EVAL_OBJ) | $(CNTKMATH_LIB) $(MULTIVERSO_LIB)
 	@echo $(SEPARATOR)
 	@mkdir -p $(dir $@)
 	@echo Building $(EVAL_LIB) for $(ARCH) with build type $(BUILDTYPE)
-	$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH) -l$(MULTIVERSO)
+	$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH) $(lMULTIVERSO)

 ########################################
 # Eval Sample client
@ -493,7 +496,7 @@ $(EVAL_SAMPLE_CLIENT): $(EVAL_SAMPLE_CLIENT_OBJ) | $(EVAL_LIB) $(MULTIVERSO_LIB)
 	@echo $(SEPARATOR)
 	@mkdir -p $(dir $@)
 	@echo building $(EVAL_SAMPLE_CLIENT) for $(ARCH) with build type $(BUILDTYPE)
-	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(EVAL) -l$(CNTKMATH) -l$(MULTIVERSO)
+	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(EVAL) -l$(CNTKMATH) $(lMULTIVERSO)

 ########################################
 # BinaryReader plugin
@ -733,8 +736,11 @@ IMAGE_READER_LIBS += -lopencv_core -lopencv_imgproc -lopencv_imgcodecs

 ifdef LIBZIP_PATH
  CPPFLAGS += -DUSE_ZIP
+  # both include directories are needed for building with libzip
+  INCLUDEPATH += $(LIBZIP_PATH)/include
  INCLUDEPATH += $(LIBZIP_PATH)/lib/libzip/include
  IMAGE_READER_LIBS += -lzip
+  LIBPATH += $(LIBZIP_PATH)/lib
 endif

 IMAGEREADER_SRC =\
@ -788,7 +794,7 @@ ifeq (,$(wildcard Source/Multiverso/include/multiverso/*.h))
  $(error Build with Multiverso was requested but cannot find the code. Please check https://github.com/Microsoft/DMTK to learn more.)
 endif

-MULTIVERSO:=multiverso
+lMULTIVERSO:=-lmultiverso

 INCLUDEPATH += $(SOURCEDIR)/Multiverso/include
 COMMON_FLAGS += -DMULTIVERSO_SUPPORT
@ -813,6 +819,26 @@ $(MULTIVERSO_LIB):
            -B./Source/Multiverso/build -H./Source/Multiverso
 	@make VERBOSE=1 -C ./Source/Multiverso/build/ -j multiverso

+UNITTEST_MULTIVERSO_SRC = \
+	$(SOURCEDIR)/Multiverso/Test/unittests/test_array.cpp \
+	$(SOURCEDIR)/Multiverso/Test/unittests/test_blob.cpp \
+	$(SOURCEDIR)/Multiverso/Test/unittests/test_kv.cpp \
+	$(SOURCEDIR)/Multiverso/Test/unittests/test_message.cpp \
+	$(SOURCEDIR)/Multiverso/Test/unittests/test_multiverso.cpp \
+	$(SOURCEDIR)/Multiverso/Test/unittests/test_node.cpp \
+	$(SOURCEDIR)/Multiverso/Test/unittests/test_sync.cpp \
+
+UNITTEST_MULTIVERSO_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_MULTIVERSO_SRC))
+
+UNITTEST_MULTIVERSO := $(BINDIR)/multiversotests
+
+ALL += $(UNITTEST_MULTIVERSO)
+
+$(UNITTEST_MULTIVERSO): $(UNITTEST_MULTIVERSO_OBJ) | $(MULTIVERSO_LIB)
+	@echo $(SEPARATOR)
+	@mkdir -p $(dir $@)
+	@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
+	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(lMULTIVERSO) -ldl
 endif

 ########################################
@ -849,7 +875,7 @@ $(CNTK): $(CNTK_OBJ) | $(CNTKMATH_LIB) $(MULTIVERSO_LIB)
 	@echo $(SEPARATOR)
 	@mkdir -p $(dir $@)
 	@echo building output for $(ARCH) with build type $(BUILDTYPE)
-	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH) -l$(MULTIVERSO) -fopenmp
+	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH) $(lMULTIVERSO) -fopenmp

 # deployable resources: standard library of BS
 CNTK_CORE_BS:=$(BINDIR)/cntk.core.bs
@ -886,7 +912,7 @@ $(UNITTEST_EVAL) : $(UNITTEST_EVAL_OBJ) | $(EVAL_LIB) $(CNTKMATH_LIB)
 	@echo $(SEPARATOR)
 	@mkdir -p $(dir $@)
 	@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
-	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(LIBS) -l$(EVAL) -l$(CNTKMATH) -l$(MULTIVERSO)
+	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(LIBS) -l$(EVAL) -l$(CNTKMATH) $(lMULTIVERSO)

 #TODO: create project specific makefile or rules to avoid adding project specific path to the global path
 INCLUDEPATH += $(SOURCEDIR)/Readers/CNTKTextFormatReader
@ -944,7 +970,7 @@ $(UNITTEST_NETWORK): $(UNITTEST_NETWORK_OBJ) | $(CNTKMATH_LIB) $(CNTKTEXTFORMATR
 	@echo $(SEPARATOR)
 	@mkdir -p $(dir $@)
 	@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
-	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(LIBS) -l$(MULTIVERSO) -l$(CNTKMATH) -fopenmp
+	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(LIBS) $(lMULTIVERSO) -l$(CNTKMATH) -fopenmp

 UNITTEST_MATH_SRC = \
 	$(SOURCEDIR)/../Tests/UnitTests/MathTests/BatchNormalizationEngineTests.cpp \
@ -1003,27 +1029,6 @@ $(UNITTEST_BRAINSCRIPT): $(UNITTEST_BRAINSCRIPT_OBJ) | $(CNTKMATH_LIB)
 	@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
 	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) $(LIBS) -ldl -l$(CNTKMATH)

-UNITTEST_MULTIVERSO_SRC = \
-	$(SOURCEDIR)/Multiverso/Test/unittests/test_array.cpp \
-	$(SOURCEDIR)/Multiverso/Test/unittests/test_blob.cpp \
-	$(SOURCEDIR)/Multiverso/Test/unittests/test_kv.cpp \
-	$(SOURCEDIR)/Multiverso/Test/unittests/test_message.cpp \
-	$(SOURCEDIR)/Multiverso/Test/unittests/test_multiverso.cpp \
-	$(SOURCEDIR)/Multiverso/Test/unittests/test_node.cpp \
-	$(SOURCEDIR)/Multiverso/Test/unittests/test_sync.cpp \
-
-UNITTEST_MULTIVERSO_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_MULTIVERSO_SRC))
-
-UNITTEST_MULTIVERSO := $(BINDIR)/multiversotests
-
-ALL += $(UNITTEST_MULTIVERSO)
-
-$(UNITTEST_MULTIVERSO): $(UNITTEST_MULTIVERSO_OBJ) | $(MULTIVERSO_LIB)
-	@echo $(SEPARATOR)
-	@mkdir -p $(dir $@)
-	@echo building $@ for $(ARCH) with build type $(BUILDTYPE)
-	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(BOOSTLIB_PATH)) $(patsubst %, $(RPATH)%, $(ORIGINLIBDIR) $(BOOSTLIB_PATH)) -o $@ $^ $(BOOSTLIBS) -l$(MULTIVERSO) -ldl
-
 unittests: $(UNITTEST_EVAL) $(UNITTEST_READER) $(UNITTEST_NETWORK) $(UNITTEST_MATH) $(UNITTEST_BRAINSCRIPT)

 endif
--- a/README.md
+++ b/README.md
@ -1,6 +1,14 @@
 # CNTK

 ## Latest news
+*2016-10-03.* V 1.7.2 Binary release  
+**This is a Hot Fix Release. It affects all users of Model Evaluation Library**
+
+If you are NOT using Model Evaluation Library you may skip this release.  
+If you ARE using Model Evaluation Library we **strongly recommend** installing version 1.7.2 instead of **any** previous version you might be using.
+
+See [Release Notes](https://github.com/Microsoft/CNTK/wiki/CNTK_1_7_2_Release_Notes) for details.
+
 *2016-09-28.* V 1.7.1 Binary release  
 Highlights of this Release:
 * Two Breaking Changes related to Layers library default initialization and ```fsAdagrad``` gradient-normalization scheme
@ -31,9 +39,6 @@ Get the Release from the [CNTK Releases page](https://github.com/Microsoft/CNTK/

 *2016-08-10.* We have significantly simplified handling of **Gated Recurrent Units (GRU)**. Read more in the [corresponding article](https://github.com/Microsoft/CNTK/wiki/GRUs-on-CNTK-with-BrainScript).

-*2016-07-15.* V 1.6 Binary release  
-CNTK v.1.6 binaries are on the [CNTK Releases page](https://github.com/Microsoft/CNTK/releases)
-
 See [all news](https://github.com/Microsoft/CNTK/wiki/News).

 ## What is CNTK
--- a/Scripts/txt2ctf.py
+++ b/Scripts/txt2ctf.py
@ -90,22 +90,24 @@ if __name__ == "__main__":
 #####################################################################################################
 try:
    import StringIO
+    stringio = StringIO.StringIO
 except ImportError:
    from io import StringIO
+    stringio = StringIO
 try:
    import pytest
 except ImportError:
    pass

 def test_simpleSanityCheck():
-    dictionary1 = StringIO.StringIO("hello\nmy\nworld\nof\nnothing\n")
-    dictionary2 = StringIO.StringIO("let\nme\nbe\nclear\nabout\nit\n")
-    input = StringIO.StringIO("hello my\tclear about\nworld of\tit let clear\n")
-    output = StringIO.StringIO()
+    dictionary1 = stringio("hello\nmy\nworld\nof\nnothing\n")
+    dictionary2 = stringio("let\nme\nbe\nclear\nabout\nit\n")
+    input = stringio("hello my\tclear about\nworld of\tit let clear\n")
+    output = stringio()

    convert([dictionary1, dictionary2], [input], output, None, False)

-    expectedOutput = StringIO.StringIO()
+    expectedOutput = stringio()
    expectedOutput.write("0\t|S0 0:1\t|S1 3:1\n")
    expectedOutput.write("0\t|S0 1:1\t|S1 4:1\n")
    expectedOutput.write("1\t|S0 2:1\t|S1 5:1\n")
@ -115,10 +117,10 @@ def test_simpleSanityCheck():
    assert expectedOutput.getvalue() == output.getvalue()

 def test_nonExistingWord():
-    dictionary1 = StringIO.StringIO("hello\nmy\nworld\nof\nnothing\n")
-    input = StringIO.StringIO("hello my\nworld of nonexistent\n")
-    output = StringIO.StringIO()
+    dictionary1 = stringio("hello\nmy\nworld\nof\nnothing\n")
+    input = stringio("hello my\nworld of nonexistent\n")
+    output = stringio()

    with pytest.raises(Exception) as info:
        convert([dictionary1], [input], output, None, False)
-    assert info.value.message == "Token 'nonexistent' cannot be found in the dictionary for stream 0"
+    assert str(info.value) == "Token 'nonexistent' cannot be found in the dictionary for stream 0"
--- a/Source/ActionsLib/EvalActions.cpp
+++ b/Source/ActionsLib/EvalActions.cpp
@ -39,6 +39,12 @@ using namespace std;
 using namespace Microsoft::MSR;
 using namespace Microsoft::MSR::CNTK;

+bool GetDistributedMBReadingDefaultValue(const ConfigParameters& config, const IDataReader& reader)
+{
+    // Default for the 'distributedMBReading' option: 'true' only when an MPIWrapper instance exists (parallel run) and the reader is not a legacy reader (i.e. it is a v2 reader), 'false' otherwise. 'config' is not consulted here.
+    return (MPIWrapper::GetInstance() != nullptr && !reader.IsLegacyReader());
+}
+
 // ===========================================================================
 // DoEvalBase() - implements CNTK "eval" command
 // ===========================================================================
@ -62,7 +68,7 @@ static void DoEvalBase(const ConfigParameters& config, IDataReader& reader)
    size_t maxSamplesInRAM = config(L"maxSamplesInRAM", (size_t)SIZE_MAX);
    size_t numSubminiBatches = config(L"numSubminibatches", (size_t)1);

-    bool enableDistributedMBReading = config(L"distributedMBReading", false);
+    bool enableDistributedMBReading = config(L"distributedMBReading", GetDistributedMBReadingDefaultValue(config, reader));

    vector<wstring> evalNodeNamesVector;

@ -104,7 +110,7 @@ static void DoEvalBNBase(const ConfigParameters& config, IDataReader& reader)
    size_t maxSamplesInRAM = config(L"maxSamplesInRAM", (size_t)SIZE_MAX);
    size_t numSubminiBatches = config(L"numSubminibatches", (size_t)1);

-    bool enableDistributedMBReading = config(L"distributedMBReading", false);
+    bool enableDistributedMBReading = config(L"distributedMBReading", GetDistributedMBReadingDefaultValue(config, reader));

    vector<wstring> evalNodeNamesVector;

@ -189,8 +195,6 @@ void DoCrossValidate(const ConfigParameters& config)
    size_t maxSamplesInRAM    = config(L"maxSamplesInRAM", (size_t)SIZE_MAX);
    size_t numSubminiBatches  = config(L"numSubminibatches", (size_t)1);

-    bool enableDistributedMBReading = config(L"distributedMBReading", false);
-
    ConfigArray evalNodeNames = config(L"evalNodeNames", "");
    vector<wstring> evalNodeNamesVector;
    for (int i = 0; i < evalNodeNames.size(); ++i)
@ -203,6 +207,8 @@ void DoCrossValidate(const ConfigParameters& config)

    DataReader cvDataReader(readerConfig);

+    bool enableDistributedMBReading = config(L"distributedMBReading", GetDistributedMBReadingDefaultValue(config, cvDataReader));
+
    bool finalModelEvaluated = false;
    for (size_t i = cvInterval[0]; i <= cvInterval[2]; i += cvInterval[1])
    {
--- a/Source/ActionsLib/NDLNetworkBuilder.cpp
+++ b/Source/ActionsLib/NDLNetworkBuilder.cpp
@ -269,6 +269,16 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
            nodePtr = builder.LegacyReshape(NULL, num_rows, ImageDimensions::AsTensorShape(img_width, img_height, img_channels, imageLayoutKind), name);
        }
    }
+    else if (cnNodeType == OperationNameOf(ReconcileDynamicAxisNode))
+    {
+        nodeParamCount = 2;
+        nodeParamStart = 0;
+
+        if (pass == ndlPassInitial)
+        {
+            nodePtr = builder.ReconcileDynamicAxis(NULL, NULL, name);
+        }
+    }
    else if (cnNodeType == OperationNameOf(PastValueNode) ||
             cnNodeType == OperationNameOf(FutureValueNode))
    {
--- a/Source/ActionsLib/NetworkDescriptionLanguage.cpp
+++ b/Source/ActionsLib/NetworkDescriptionLanguage.cpp
@ -208,8 +208,10 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable)
    else if (EqualInsensitive(nodeType, OperationNameOf(PerDimMeanVarNormalizationNode), L"PerDimMVNorm")) ret = true;
    else if (EqualInsensitive(nodeType, OperationNameOf(PlusNode))) ret = true;
    else if (EqualInsensitive(nodeType, OperationNameOf(ReciprocalNode))) ret = true;
+    else if (EqualInsensitive(nodeType, OperationNameOf(ReconcileDynamicAxisNode))) ret = true;
    else if (EqualInsensitive(nodeType, OperationNameOf(RectifiedLinearNode), L"ReLU")) ret = true;
    else if (EqualInsensitive(nodeType, OperationNameOf(ReshapeNode))) ret = true;
+    else if (EqualInsensitive(nodeType, OperationNameOf(ROIPoolingNode))) ret = true;
    else if (EqualInsensitive(nodeType, OperationNameOf(RowRepeatNode))) ret = true;
    else if (EqualInsensitive(nodeType, OperationNameOf(RowStackNode))) ret = true;
 #ifdef COMING_SOON
--- a/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs
+++ b/Source/CNTK/BrainScript/CNTKCoreLib/CNTK.core.bs
@ -455,8 +455,22 @@ CNTK2 = [
    // 11. Criterion nodes
    // No changes here - we said the default input would be the label sequence here, against which the 
    // empirical sequence is compared to. Keeping this for now.
-    CrossEntropyWithSoftmax(_, outProbVectorSequence, tag='') = new ComputationNode [ operation = 'CrossEntropyWithSoftmax' ; inputs = _AsNodes (_ : outProbVectorSequence) /*plus the function args*/ ]
-    ClassificationError(_, outVectorSequence, topN=1, tag='') = new ComputationNode [ operation = 'ClassificationError' ; inputs = _AsNodes (if topN == 1 then (_ : outVectorSequence) else  (_ : outVectorSequence : Constant (topN))) /*plus the function args*/ ]
+    CrossEntropyWithSoftmax(labelSequence, outProbVectorSequence, axis=0, tag='') = 
+        if axis==0 then new ComputationNode [ operation = 'CrossEntropyWithSoftmax' ; inputs = _AsNodes (labelSequence : outProbVectorSequence) /*plus the function args*/ ]
+        else [ tag1 = tag; out = Minus (ReduceLogSum (outProbVectorSequence, axis=axis), ReduceSum (labelSequence .* outProbVectorSequence, axis=axis), tag=tag1) ].out
+    # Classification error along a specific axis: account only for missed labels, i.e.
+    # strictly check whether at the one “1” location in labels we find a value equal to the max
+    ClassificationError(labelSequence, outVectorSequence, topN=1, axis=0, tag='') =
+        if axis==0 then new ComputationNode [ operation = 'ClassificationError' ; inputs = _AsNodes (if topN == 1 then (labelSequence : outVectorSequence) else  (labelSequence : outVectorSequence : Constant (topN))) /*plus the function args*/ ]
+        else if topN != 1 then Fail ("ClassificationError() along a specific axis does not support topN.")
+        else {
+            axMax     = ReduceMax (outVectorSequence, axis=axis)    # max value along competition axis
+            pred      = outVectorSequence == axMax                  # 1 for all values that are max
+            wrongPred = labelSequence != pred                       # look up all wrong predictions {label index}
+            axErr     = ReduceSum (wrongPred, axis=axis)            # sum up wrong predictions  along competition axis
+            capErr    = axErr >= 1                                  # only count maximally one error per prediction
+            err       = ReduceMean (capErr, tag=tag)                # average into a single number per sample
+        }.err
    ErrorPrediction = ClassificationError  # legacy
    # TODO: replace with this (need to deal with topN thing):
    # (_new will be removed once the change is made)
@ -547,6 +561,7 @@ MaxUnpooling(unpoolInput, poolInput, kernelDims, stride=1, autoPadding = true, l
 # 2D pooling
 MaxPooling(input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'MaxPooling' ; inputs = _AsNodes (input) /*plus the function args*/ ]
 AveragePooling(input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'AveragePooling' ; inputs = _AsNodes (input) /*plus the function args*/ ]
+ROIPooling (input, ROIs, shape) = new ComputationNode { operation = 'ROIPooling' ; inputs = _AsNodes (input : ROIs) ; outputShape = new TensorShape [ dims = shape ] ; tag='' /*plus the function args*/ } 
 ColumnwiseCrossProduct = KhatriRaoProduct // deprecated 
 ErrorPrediction = ClassificationError   # legacy name
 Delay = PastValue 
@ -560,8 +575,6 @@ CosDistance(aVectorSequence, anotherVectorSequence, tag='') = new ComputationNod
 CosDistanceWithNegativeSamples(aVectorSequence, anotherVectorSequence, numShifts, numNegSamples, tag='') = new ComputationNode [ operation = 'CosDistanceWithNegativeSamples' ; inputs = _AsNodes (aVectorSequence : anotherVectorSequence : numShifts : numNegSamples) /*plus the function args*/ ]
 Cosine(x, tag='') = new ComputationNode [ operation = 'Cosine' ; inputs = _AsNodes (x) /*plus the function args*/ ]
 CrossEntropy(refProbVectorSequence, outProbVectorSequence, tag='') = new ComputationNode [ operation = 'CrossEntropy' ; inputs = _AsNodes (refProbVectorSequence : outProbVectorSequence) /*plus the function args*/ ]
-# once ReduceLogSum becomes proper C++, CrossEntropyWithSoftmax() will become this:
-NewCrossEntropyWithSoftmax (labelSequence, z, tag='') = [ tag1 = tag; out = Minus (ReduceLogSum (z), ReduceSum (labelSequence .* z), tag=tag1) ].out
 DiagTimes(diagonalMatrixAsColumnVector, matrix, tag='') = new ComputationNode [ operation = 'DiagTimes' ; inputs = _AsNodes (diagonalMatrixAsColumnVector : matrix) /*plus the function args*/ ]
 // TODO: DiagTimes = ElementTimes
 GatherPacked(indexSequence, sourceData, tag='') = new ComputationNode [ operation = 'GatherPacked' ; inputs = _AsNodes (indexSequence : sourceData) /*plus the function args*/ ]
--- a/Source/CNTK/CNTK.cpp
+++ b/Source/CNTK/CNTK.cpp
@ -553,9 +553,14 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
    // parallel training
    shared_ptr<Microsoft::MSR::CNTK::MPIWrapper> mpi;
    auto ensureMPIWrapperCleanup = MakeScopeExit(&MPIWrapper::DeleteInstance);
-    bool paralleltrain = config(L"parallelTrain", false);
+    // when running under MPI with more than one node, use 'true' as the default value for parallelTrain,
+    // 'false' otherwise.
+    bool paralleltrain = config(L"parallelTrain", (MPIWrapper::GetTotalNumberOfMPINodes() > 1));
+
    if (paralleltrain)
+    {
        mpi = MPIWrapper::GetInstance(true /*create*/);
+    }  

    g_shareNodeValueMatrices = config(L"shareNodeValueMatrices", false);

@ -638,7 +643,7 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp

 static void PrintBanner(int argc, wchar_t* argv[], const string& timestamp)
 {
-    fprintf(stderr, "CNTK 1.7.1+ (");
+    fprintf(stderr, "CNTK 1.7.2+ (");
 #ifdef _GIT_EXIST
    fprintf(stderr, "%s %.6s, ", _BUILDBRANCH_, _BUILDSHA1_);
 #endif
@ -687,9 +692,15 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])
    // The top-level 'parallelTrain' is a bool, not to be confused with the parallelTrain block inside SGD.
    shared_ptr<Microsoft::MSR::CNTK::MPIWrapper> mpi;
    auto ensureMPIWrapperCleanup = MakeScopeExit(&MPIWrapper::DeleteInstance);
-    bool paralleltrain = config(L"parallelTrain", "false");
+    
+    // when running under MPI with more than one node, use 'true' as the default value for parallelTrain,
+    // 'false' otherwise.
+    bool paralleltrain = config(L"parallelTrain", (MPIWrapper::GetTotalNumberOfMPINodes() > 1));
+
    if (paralleltrain)
+    {
       mpi = MPIWrapper::GetInstance(true /*create*/);
+    } 

    g_shareNodeValueMatrices = config(L"shareNodeValueMatrices", false);

--- a/Source/CNTKv2LibraryDll/API/CNTKLibrary.h
+++ b/Source/CNTKv2LibraryDll/API/CNTKLibrary.h
@ -208,21 +208,21 @@ namespace CNTK
        NDShape() {}

        ///
-        /// Contruct a NDShape instance with the specified number of axes and dimensionality in each axis.
+        /// Construct a NDShape instance with the specified rank and dimensionality in each axis.
        ///
        explicit NDShape(size_t numAxes, size_t dimension = InferredDimension)
            : m_shapeDims(numAxes, dimension)
        {}

        ///
-        /// Contruct a NDShape instance with specified dimensions.
+        /// Construct a NDShape instance with specified dimensions.
        ///
        NDShape(const std::vector<size_t>& dimensions)
            : m_shapeDims(dimensions)
        {}

        ///
-        /// Contruct a NDShape instance with specified dimensions.
+        /// Construct a NDShape instance with specified dimensions.
        ///
        NDShape(const std::initializer_list<size_t>& dimensions)
            : m_shapeDims(dimensions)
@ -234,7 +234,7 @@ namespace CNTK
        const std::vector<size_t>& Dimensions() const { return m_shapeDims; }

        ///
-        /// Returns the number of axes of 'this' shape.
+        /// Returns the rank of 'this' shape.
        ///
        size_t Rank() const { return m_shapeDims.size(); }

@ -255,7 +255,7 @@ namespace CNTK
        {
            endAxisId = (endAxisId == SIZE_MAX) ? Rank() : endAxisId;
            if ((endAxisId < beginAxisId) || (endAxisId > Rank()))
-                InvalidArgument("NDShape::SubShape : The specified endAxisId (%d) cannot exceed the number of axes (%d) of 'this' NDShape and must be >= than the specified beginAxisId (%d)", (int)endAxisId, (int)Rank(), (int)beginAxisId);
+                InvalidArgument("NDShape::SubShape : The specified endAxisId (%d) cannot exceed the rank (%d) of 'this' NDShape and must be >= than the specified beginAxisId (%d)", (int)endAxisId, (int)Rank(), (int)beginAxisId);

            std::vector<size_t> subShapeDims(m_shapeDims.begin() + beginAxisId, m_shapeDims.begin() + endAxisId);
            return subShapeDims;
@ -343,6 +343,7 @@ namespace CNTK
        friend class CompositeFunction;
        friend class LearnerBase;
        friend class Variable;
+        friend class PackedValue;

        template <typename T, typename ...CtorArgTypes>
        friend inline std::shared_ptr<T> MakeSharedObject(CtorArgTypes&& ...ctorArgs);
@ -593,6 +594,13 @@ namespace CNTK
        std::shared_ptr<void> m_tensorView; // Microsoft::MSR::CNTK::TensorView<ElemType>*
    };

+    enum class MaskKind : char
+    {
+        Invalid = 0,
+        Valid = 1,
+        SequenceBegin = 2,
+    };
+
    ///
    /// Denotes a multi-dimensional mask used for specifying specific sections of a NDArrayView object as masked/invalid.
    /// This type denotes a view and there may be multiple simultaneous views of the data underlying a NDMask instance.
@ -603,6 +611,7 @@ namespace CNTK

        template <typename T, typename ...CtorArgTypes>
        friend inline std::shared_ptr<T> MakeSharedObject(CtorArgTypes&& ...ctorArgs);
+
    public:
        ///
        /// Construct a new Mask object of specified shape
@ -615,12 +624,32 @@ namespace CNTK
        CNTK_API ~NDMask();

        ///
-        /// Mask out the specified sub-section of 'this' mask
+        /// Mask out (i.e. mark Invalid) the specified sub-section of 'this' mask
        ///
-        CNTK_API void MaskSection(const std::vector<size_t>& sectionOffset, const NDShape& sectionShape);
+        void InvalidateSection(const std::vector<size_t>& sectionOffset, const NDShape& sectionShape)
+        {
+            MarkSectionAs(sectionOffset, sectionShape, MaskKind::Invalid);
+        }

        ///
-        /// Clear the mask; i.e. unmask all currently masked values
+        /// Mark the specified position in 'this' mask as sequence begin 
+        ///
+        void MarkSequenceBegin(const std::vector<size_t>& offset)
+        {
+            NDShape sectionShape = NDShape(Shape().Rank(), 1);
+            MarkSectionAs(offset, sectionShape, MaskKind::SequenceBegin);
+        }
+
+        ///
+        /// Mark the specified sub-section of 'this' mask as sequence begin 
+        ///
+        void MarkSequenceBegin(const std::vector<size_t>& offset, const NDShape& sectionShape)
+        {
+            MarkSectionAs(offset, sectionShape, MaskKind::SequenceBegin);
+        }
+
+        ///
+        /// Clear the mask; i.e. unmask or mark Valid all currently masked (i.e. Invalid) values
        ///
        CNTK_API void Clear();

@ -642,12 +671,20 @@ namespace CNTK
        ///
        /// Returns a read-only pointer to the data buffer underlying 'this' Mask object
        /// 
-        CNTK_API const char* DataBuffer() const;
+        CNTK_API const MaskKind* DataBuffer() const;
+
+        ///
+        /// Creates a new NDMask with newly allocated storage on the specified device and copies 'this' mask's contents into the newly allocated mask.
+        ///
+        CNTK_API NDMaskPtr DeepClone(const DeviceDescriptor& device) const;

        ///
        /// Creates a new NDMask with newly allocated storage on the same device as 'this' mask and copies 'this' mask's contents into the newly allocated mask.
        ///
-        CNTK_API NDMaskPtr DeepClone() const;
+        NDMaskPtr DeepClone() const
+        {
+            return DeepClone(this->Device());
+        }

        ///
        /// Creates a new NDMask which is an alias of 'this' mask.
@ -662,6 +699,9 @@ namespace CNTK

    private:
        NDMask(const NDShape& shape, Microsoft::MSR::CNTK::Matrix<char>* matrix);
+
+        CNTK_API void MarkSectionAs(const std::vector<size_t>& sectionOffset, const NDShape& sectionShape, MaskKind maskKind);
+
        Microsoft::MSR::CNTK::Matrix<char>* GetMatrix() const;

        // Disallow copy and move construction and assignment
@ -710,41 +750,82 @@ namespace CNTK
        ///
        /// Destruct 'this' Value object.
        ///
-        CNTK_API virtual ~Value();
+        virtual ~Value();
+
+        ///
+        /// Returns the descriptor of the device that 'this' Value resides on
+        ///
+        virtual DeviceDescriptor Device() const { return m_data->Device(); }
+
+        ///
+        /// Returns the data type of 'this' Value's contents.
+        ///
+        virtual DataType GetDataType() const { return m_data->GetDataType(); }
+
+        ///
+        /// Returns the storage format of 'this' Value.
+        ///
+        virtual StorageFormat GetStorageFormat() const { return m_data->GetStorageFormat(); }
+
+        ///
+        /// Returns the shape of 'this' Value.
+        ///
+        virtual const NDShape& Shape() const { return m_data->Shape(); }
+
+        ///
+        /// Returns a boolean indicating if 'this' Value contains data in sparse storage format.
+        ///
+        bool IsSparse() const
+        {
+            return (GetStorageFormat() != StorageFormat::Dense);
+        }
+
+        ///
+        /// Returns a boolean indicating if 'this' Value is read-only.
+        ///
+        virtual bool IsReadOnly() const { return m_data->IsReadOnly(); }
+
+        ///
+        /// Returns the number of masked/invalid values
+        ///
+        virtual size_t MaskedCount() const 
+        {
+            return m_mask ? m_mask->MaskedCount() : 0;
+        }

        ///
        /// Returns the NDArrayView object corresponding to the data contents of 'this value object.
        ///
-        CNTK_API virtual NDArrayViewPtr Data() const;
+        virtual NDArrayViewPtr Data() const;

        ///
        /// Returns the NDMask object corresponding to the mask associated with 'this value object.
        ///
-        CNTK_API virtual NDMaskPtr Mask() const;
+        virtual NDMaskPtr Mask() const;

        ///
        /// Creates a new Value with newly allocated storage on the same device as 'this' Value and copies 'this' Value's contents into the newly allocated Value.
        ///
-        CNTK_API virtual ValuePtr DeepClone(bool readOnly = false) const;
+        virtual ValuePtr DeepClone(bool readOnly = false) const;

        ///
        /// Creates a new Value which is an alias of 'this' Value.
        ///
-        CNTK_API virtual ValuePtr Alias(bool readOnly = false) const;
+        virtual ValuePtr Alias(bool readOnly = false) const;

        ///
        /// Copies the contents of the 'source' Value to 'this' Value.
        /// The shapes of the 'source' Value's data and mask must be identical to 'this' Value's data and mask.
        ///
-        CNTK_API virtual void CopyFrom(const Value& source);
+        virtual void CopyFrom(const Value& source);

    private:
        // Disallow copy and move construction and assignment
        Value(const Value&) = delete; Value& operator=(const Value&) = delete; Value(Value&&) = delete; Value& operator=(Value&&) = delete;

-    private:
-        NDArrayViewPtr m_data;
-        NDMaskPtr m_mask;
+    protected:
+        mutable NDArrayViewPtr m_data;
+        mutable NDMaskPtr m_mask;
    };

    ///
@ -758,6 +839,7 @@ namespace CNTK
    {
        CNTK_API static const std::wstring StaticAxisNamePrefix;
        static const size_t SentinelStaticAxisIndexValueForDynamicAxes = SIZE_MAX;
+        static const size_t SentinelStaticAxisIndexValueForAllStaticAxes = SIZE_MAX - 1;

        class UniqueDynamicAxesNames
        {
@ -839,15 +921,20 @@ namespace CNTK
        }

        ///
-        /// Static Axis object representing the default dynamic axis.
+        /// Axis object representing the default dynamic axis.
        ///
        CNTK_API static const Axis& DefaultDynamicAxis();

        ///
-        /// Static Axis object representing the batch axis.
+        /// Axis object representing the batch axis.
        ///
        CNTK_API static const Axis& DefaultBatchAxis();

+        ///
+        /// Axis object representing all the static axes of an operand
+        ///
+        CNTK_API static const Axis& AllStaticAxes();
+
        ///
        /// Returns a new unique Dynamic axis
        ///
@ -1282,6 +1369,8 @@ namespace CNTK
            return Contains(key.c_str());
        }

+        CNTK_API void Add(const Dictionary& other);
+
        CNTK_API bool operator==(const Dictionary& other) const;
        CNTK_API bool operator!=(const Dictionary& other) const;

@ -1335,7 +1424,7 @@ namespace CNTK
    typedef Dictionary ParameterInitializer;

    // Forward declarations
-    inline Variable PlaceholderVariable(const NDShape& shape, const std::vector<Axis>& dynamicAxes = Axis::DefaultInputVariableDynamicAxes);
+    inline Variable PlaceholderVariable(const NDShape& shape, const std::wstring& name, const std::vector<Axis>& dynamicAxes = Axis::DefaultInputVariableDynamicAxes);
    inline Variable InputVariable(const NDShape& shape, bool isSparse, CNTK::DataType dataType, bool needsGradient, const std::wstring& name, const std::vector<Axis>& dynamicAxes = Axis::DefaultInputVariableDynamicAxes);
    inline Variable OutputVariable(const NDShape& shape, CNTK::DataType dataType, Function* ownerFunction, const std::vector<Axis>& dynamicAxes, const std::wstring& name = L"");

@ -1362,7 +1451,7 @@ namespace CNTK

 #ifndef SWIG
    private:
-        friend inline Variable PlaceholderVariable(const NDShape& shape, const std::vector<Axis>& dynamicAxes /*= Axis::DefaultInputVariableDynamicAxes*/);
+        friend inline Variable PlaceholderVariable(const NDShape& shape, const std::wstring& name, const std::vector<Axis>& dynamicAxes /*= Axis::DefaultInputVariableDynamicAxes*/);
        friend inline Variable InputVariable(const NDShape& shape, bool isSparse, CNTK::DataType dataType, bool needsGradient, const std::wstring& name, const std::vector<Axis>& dynamicAxes /*= Axis::DefaultInputVariableDynamicAxes*/);
        friend inline Variable OutputVariable(const NDShape& shape, CNTK::DataType dataType, Function* ownerFunction, const std::vector<Axis>& dynamicAxes, const std::wstring& name /*= L""*/);
 #endif
@ -1481,6 +1570,7 @@ namespace CNTK
            : m_dataFields(MakeSharedObject<VariableFields>(shape, varType, dataType, ownerFunction, value, needsGradient, dynamicAxes, isSparse, name, uid))
        {}

+private:
        Variable Clone() const
        {
            Variable clonedVariable;
@ -1544,17 +1634,7 @@ namespace CNTK
                                                        Internal::GenerateUid(m_varKind));
            }

-            void SetValueInitialization(const ParameterInitializer& initializationConfig, const DeviceDescriptor& device)
-            {
-                if (m_value != nullptr)
-                    LogicError("Value initialization config cannot be set if a value already exists");
-
-                assert(!m_valueInitializer);
-                assert(!m_valueInitializationDevice);
-
-                m_valueInitializer.reset(new ParameterInitializer(initializationConfig));
-                m_valueInitializationDevice.reset(new DeviceDescriptor(device));
-            }
+            CNTK_API void SetValueInitialization(const ParameterInitializer& initializationConfig, const DeviceDescriptor& device);

        private:
            // Disallow copy and move construction and assignment
@ -1580,10 +1660,19 @@ namespace CNTK
    /// Create a Placeholder variable to be used as a temporary/placeholder input to a Function.
    /// All placeholder inputs of a Function must be replaced with non-placeholder Variables before Forward evaluation of the Function.
    ///
-    inline Variable PlaceholderVariable(const NDShape& shape, const std::vector<Axis>& dynamicAxes /*= Axis::DefaultInputVariableDynamicAxes*/)
+    inline Variable PlaceholderVariable(const NDShape& shape, const std::wstring& name, const std::vector<Axis>& dynamicAxes /*= Axis::DefaultInputVariableDynamicAxes*/)
    {
        auto varKind = VariableKind::Placeholder;
-        return Variable(shape, varKind, DataType::Unknown, nullptr, false, dynamicAxes, L"", Internal::GenerateUid(varKind));
+        return Variable(shape, varKind, DataType::Unknown, nullptr, false, dynamicAxes, name, Internal::GenerateUid(varKind));
+    }
+
+    ///
+    /// Create a Placeholder variable to be used as a temporary/placeholder input to a Function.
+    /// All placeholder inputs of a Function must be replaced with non-placeholder Variables before Forward evaluation of the Function.
+    ///
+    inline Variable PlaceholderVariable(const NDShape& shape, const std::vector<Axis>& dynamicAxes = Axis::DefaultInputVariableDynamicAxes)
+    {
+        return PlaceholderVariable(shape, L"", dynamicAxes);
    }

    ///
@ -1765,7 +1854,7 @@ namespace CNTK

    public:
        ///
-        /// Contruct a Constant whose initial contents are a copy of the specified value
+        /// Construct a Constant whose initial contents are a copy of the specified value
        ///
        Constant(const NDArrayViewPtr& value, const std::wstring& name = L"")
            : Constant(value, name, Internal::GenerateUid(VariableKind::Constant))
@ -1946,7 +2035,7 @@ namespace CNTK
        /// and the user is responsible for ensuring that the contents of the inputs and outputs are unchanged until after any uses of the BackPropState instance
        /// for backpropagating gradients through this function.
        ///
-        CNTK_API virtual BackPropStatePtr Forward(const std::unordered_map<Variable, ValuePtr>& arguments,
+        virtual BackPropStatePtr Forward(const std::unordered_map<Variable, ValuePtr>& arguments,
                                         std::unordered_map<Variable, ValuePtr>& outputs,
                                         const DeviceDescriptor& computeDevice = DeviceDescriptor::UseDefaultDevice(),
                                         const std::unordered_set<Variable>& outputsToRetainBackwardStateFor = {}) = 0;
@ -1960,10 +2049,15 @@ namespace CNTK
        /// The 'state' parameter is an instance of an BackPropState instance obtained from a previous call to the Forward method on 'this; Function for the 
        /// computation that this gradient backpropagation corresponds to.
        ///
-        CNTK_API virtual void Backward(const BackPropStatePtr& state,
+        virtual void Backward(const BackPropStatePtr& state,
                              const std::unordered_map<Variable, ValuePtr>& rootGradientValues,
                              std::unordered_map<Variable, ValuePtr>& backPropagatedGradientValuesForInputs) = 0;

+        ///
+        /// Returns the name of the operation that this Function denotes
+        ///
+        virtual const std::wstring& OpName() = 0;
+
    public:

        // Optional overrides
@ -2074,6 +2168,11 @@ namespace CNTK
        ///
        CNTK_API FunctionPtr ReplacePlaceholder(const Variable& placeholderReplacement);

+        ///
+        /// Restore the model's parameters from a saved model file
+        ///
+        CNTK_API void RestoreFromLegacyModel(const std::wstring& modelFilePath);
+
    private:

        template <typename VariableType, typename FilterFunction>
@ -2144,9 +2243,6 @@ namespace CNTK
            }
        }

-    private:
-        void RestoreFromLegacyModel(const std::wstring& modelFilePath);
-
    private:

        std::vector<Variable> m_inputs;
@ -2501,7 +2597,7 @@ namespace CNTK
    /// E.g. When creating a classification model, typically the CrossEntropy loss Function and the ClassificationError Function comprise the two roots
    /// of the computation graph which can be "Combine"d to create a single Function with 2 outputs; viz. CrossEntropy loss and ClassificationError output.
    ///
-    CNTK_API FunctionPtr Combine(const std::vector<FunctionPtr>& operands, const std::wstring& name = L"");
+    CNTK_API FunctionPtr Combine(const std::vector<Variable>& operands, const std::wstring& name = L"");

    namespace Sequence
    {
@ -2535,12 +2631,14 @@ namespace CNTK
    ///
    class Learner : public std::enable_shared_from_this<Learner>
    {
+        static const std::wstring LearningRateAttributeName;
+
    public:
        //
        // Method to update the parameters associated with this learner. By returning false, this method indicates that
        // learning has stopped for all of the parameters associated with this learner
        //
-        CNTK_API virtual bool Update(const std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, size_t trainingSampleCount) = 0;
+        virtual bool Update(const std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, size_t trainingSampleCount) = 0;

        ///
        /// Returns the set of parameters associated with this learner.
@ -2552,32 +2650,50 @@ namespace CNTK
        ///
        // TODO: move the following two methods into ISerializable interface, make 
        // Learner (and all other entities that need checkpointing capability) implement it.
-        CNTK_API virtual Dictionary GetCheckpointState() const { return Dictionary(); }
+        virtual Dictionary GetCheckpointState() const 
+        {
+            Dictionary baseCheckpointState;
+            baseCheckpointState[LearningRateAttributeName] = m_learningRate;
+
+            return baseCheckpointState;
+        }

        ///
        /// Optionally overridable method to restore the learner's state from a previous checkpoint.
        ///
-        CNTK_API virtual void RestoreFromCheckpoint(const Dictionary& /*checkpoint*/) {}
+        virtual void RestoreFromCheckpoint(const Dictionary& checkpoint) 
+        {
+            if (checkpoint.Contains(LearningRateAttributeName))
+                m_learningRate = checkpoint[LearningRateAttributeName].Value<double>();
+        }

        ///
        /// Destruct this Learner.
        ///
        virtual ~Learner() {}

+        virtual void ResetLearningRate(double learningRate) { m_learningRate = learningRate; }
+        virtual double LearningRate() const { return m_learningRate; }
+
    protected:
-        Learner(const std::vector<Parameter>& parameters)
-            : m_parameters(parameters.begin(), parameters.end())
+        Learner(const std::vector<Parameter>& parameters, double learningRate)
+            : m_parameters(parameters.begin(), parameters.end()), m_learningRate(learningRate)
        {}

        std::unordered_set<Parameter> m_parameters;
-
+        double m_learningRate;
    };

    ///
-    /// A collection of key-value pairs that represents training parameter schedule in 
-    /// terms of the number of processed samples. 
-    /// This class provides a number of convenience constructors to allow easy conversion 
-    /// from a single value, a vector of values and a list of pairs to the training schedule.
+    /// A collection of key-value pairs that represents a training parameter schedule in 
+    /// terms of the number of processed samples (e.g., learning rate and momentum schedules). 
+    /// This class is designed to simplify Learner's factory methods and provides a number of 
+    /// convenience constructors to allow easy conversion from a single value, a vector of values 
+    /// and a list of pairs to the training schedule. For example, a learning rate schedule 
+    /// { { 10, 0.5 }, { 100, 0.3 }, { 20, 0.2 } } indicates that the rate of 0.5 should be
+    /// used for the first 10 units (equivalently, samples if the default unit = 1 is used)
+    /// followed by 0.3 for the next 100 units, and then 0.2 for the remaining 20 units or 
+    /// until the end of training if it takes longer.
    ///
    template <typename T>
    class TrainingParameterSchedule
@ -2586,31 +2702,14 @@ namespace CNTK
        ///
        /// Create a schedule with a constant parameter value.
        ///
-        TrainingParameterSchedule(T value)
-            : m_schedule({ std::make_pair(0, value) }), m_unit(1)
-        {}
+        CNTK_API TrainingParameterSchedule(T value);

        ///
        /// Create a schedule where the parameter changes its value every 'unit' samples:
        /// schedule[0] is used for the first 'unit' samples, schedule[1] -- for the second,
        /// and so on. The last value is then used repeatedly until the end of training.
        ///
-        TrainingParameterSchedule(const std::vector<T>& schedule, size_t unit = 1) 
-            : m_unit(unit)
-        {
-            // TODO: 0 will be used to mean "the entire sweep"
-            if (unit == 0)
-                RuntimeError("TrainingParameterSchedule::constructor : 'unit' cannot be 0.");
-
-            if (schedule.size() == 0)
-                RuntimeError("TrainingParameterSchedule::constructor : schedule is empty.");
-
-            size_t i = 1;
-            for (const auto& value : schedule)
-            {
-                m_schedule[m_unit * i++] = value;
-            }
-        }
+        CNTK_API TrainingParameterSchedule(const std::vector<T>& schedule, size_t unit = 1);

        ///
        /// Create a schedule using the list of key-value pairs, where the key specifies 
@ -2621,74 +2720,104 @@ namespace CNTK
        /// '0.1' is used for the second 200 samples, after which the values is switched
        /// to '0.005'.
        ///
-        TrainingParameterSchedule(const std::initializer_list<std::pair<const size_t, T>>& schedule, size_t unit = 1)
-            : m_unit(unit)
-        {
-            // TODO: 0 will be used to mean "the entire sweep"
-            if (unit == 0)
-                RuntimeError("TrainingParameterSchedule::constructor : 'unit' cannot be 0.");
-
-            if (schedule.size() == 0)
-                RuntimeError("TrainingParameterSchedule::constructor : schedule is empty.");
-
-            size_t i = 0;
-            for (const auto& it : schedule)
-            {
-                if (it.first == 0)
-                    RuntimeError("TrainingParameterSchedule::constructor : unit count cannot be 0.");
-
-                i += it.first;
-                m_schedule[m_unit * i] = it.second;
-            }
-        }
+        CNTK_API TrainingParameterSchedule(const std::vector<std::pair<size_t, T>>& schedule, size_t unit = 1);

        ///
        /// Returns a value corresponding to the absolute sample count from the beginning of training.
        ///
-        CNTK_API const T& operator[](size_t samleCount) const;
+        CNTK_API virtual const T& operator[](size_t sampleCount) const;
+
+        CNTK_API virtual ~TrainingParameterSchedule();
+
+        CNTK_API TrainingParameterSchedule(const TrainingParameterSchedule<T>&); 
+        CNTK_API TrainingParameterSchedule(TrainingParameterSchedule<T>&&); 
+        CNTK_API TrainingParameterSchedule<T>& operator=(const TrainingParameterSchedule<T>&); 
+        CNTK_API TrainingParameterSchedule<T>& operator=(TrainingParameterSchedule<T>&&);

    private:
+        CNTK_API void ConstructSchedule(const std::vector<std::pair<size_t, T>>& schedule);
+
+    protected:           
        std::map<size_t, T> m_schedule;
        size_t m_unit;
    };

    typedef TrainingParameterSchedule<double> LearningRatesPerSample;
-    typedef TrainingParameterSchedule<double> MomentumsPerSample;
+    typedef TrainingParameterSchedule<double> MomentumValuesPerSample;
+
+    ///
+    /// This class allows specifying momentum as a time constant in place of momentum per sample in 
+    /// all of the Learner factory methods. The specified values are then automatically converted into 
+    /// per-sample values.
+    /// 
+    class MomentumValuesAsTimeConstants: public MomentumValuesPerSample
+    {
+    public:
+        MomentumValuesAsTimeConstants(double value) 
+            : MomentumValuesPerSample(value)
+        { 
+            ConvertToPerSampleValues();
+        }
+        
+        MomentumValuesAsTimeConstants(const std::vector<double>& schedule, size_t unit = 1) 
+            : MomentumValuesPerSample(schedule, unit)
+        { 
+            ConvertToPerSampleValues();
+        }
+        
+        MomentumValuesAsTimeConstants(const std::vector<std::pair<size_t, double>>& schedule, size_t unit = 1) 
+            : MomentumValuesPerSample(schedule, unit)
+        { 
+            ConvertToPerSampleValues();
+        }
+
+    private:
+        CNTK_API void ConvertToPerSampleValues();
+    };
+
+    /// A collection of additional options that affect parameter updates and 
+    /// are applicable for all standard learners 
+    struct AdditionalLearningOptions
+    {
+        double l1RegularizationWeight = 0.0;
+        double l2RegularizationWeight = 0.0;
+        double gaussianNoiseInjectionStdDev = 0.0;
+        double gradientClippingThresholdPerSample = std::numeric_limits<double>::infinity();
+        bool gradientClippingWithTruncation = true;
+    };

    ///
    /// Create an instance of the CNTK built-in SGD learner.
    ///
    CNTK_API LearnerPtr SGDLearner(const std::vector<Parameter>& parameters,
                                   const LearningRatesPerSample& learningRates,
-                                   double clippingThresholdPerSample = std::numeric_limits<double>::infinity(),
-                                   bool gradientClippingWithTruncation = true);
+                                   AdditionalLearningOptions additionalOptions = AdditionalLearningOptions());

    ///
    /// Create an instance of the CNTK built-in Momentum SGD learner.
    ///
    CNTK_API LearnerPtr MomentumSGDLearner(const std::vector<Parameter>& parameters,
                                           const LearningRatesPerSample& learningRates,
-                                           const MomentumsPerSample& momentums,
-                                           double clippingThresholdPerSample = std::numeric_limits<double>::infinity(),
-                                           bool gradientClippingWithTruncation = true);
+                                           const MomentumValuesPerSample& momentumValues,
+                                           AdditionalLearningOptions additionalOptions = AdditionalLearningOptions());

    ///
    /// Create an instance of the CNTK built-in Nesterov's accelerated SGD learner.
    ///
    CNTK_API LearnerPtr NesterovLearner(const std::vector<Parameter>& parameters,
                                        const LearningRatesPerSample& learningRates,
-                                        const MomentumsPerSample& momentums,
-                                        double clippingThresholdPerSample = std::numeric_limits<double>::infinity(),
-                                        bool gradientClippingWithTruncation = true);
+                                        const MomentumValuesPerSample& momentumValues,
+                                        AdditionalLearningOptions additionalOptions = AdditionalLearningOptions());

    ///
    /// Create an instance of the CNTK built-in FSAdaGrad (improved AdaGrad) learner.
    ///
    CNTK_API LearnerPtr FSAdaGradLearner(const std::vector<Parameter>& parameters,
                                         const LearningRatesPerSample& learningRates,
-                                         const MomentumsPerSample& momentums,
-                                         double clippingThresholdPerSample = std::numeric_limits<double>::infinity(),
-                                         bool gradientClippingWithTruncation = true);
+                                         const MomentumValuesPerSample& momentumValues,
+                                         const double targetAdagradAvDenom = 0.0025, // 1/400 magic constant 
+                                         const size_t adagradT = 2 * 3600 * 100,
+                                         AdditionalLearningOptions additionalOptions = AdditionalLearningOptions());

    ///
    /// Create an instance of the CNTK built-in AdaGrad learner.
@ -2696,8 +2825,7 @@ namespace CNTK
    CNTK_API LearnerPtr AdaGradLearner(const std::vector<Parameter>& parameters,
                                       const LearningRatesPerSample& learningRates,
                                       bool needAveMultiplier = true,
-                                       double clippingThresholdPerSample = std::numeric_limits<double>::infinity(),
-                                       bool gradientClippingWithTruncation = true);
+                                       AdditionalLearningOptions additionalOptions = AdditionalLearningOptions());

    ///
    /// Create an instance of the CNTK built-in RMSProp learner.
@ -2710,8 +2838,7 @@ namespace CNTK
                                       double max,
                                       double min,
                                       bool needAveMultiplier = true,
-                                       double clippingThresholdPerSample = std::numeric_limits<double>::infinity(),
-                                       bool gradientClippingWithTruncation = true);
+                                       AdditionalLearningOptions additionalOptions = AdditionalLearningOptions());

    ///
    /// Trainer is the top-level abstraction responsible for the orchestration of the training of a model
@ -2805,7 +2932,9 @@ namespace CNTK
        FunctionPtr m_combinedTrainingFunction;
        FunctionPtr m_model;
        FunctionPtr m_lossFunction;
+        FunctionPtr m_aggregatedLossFunction;
        FunctionPtr m_evaluationFunction;
+        FunctionPtr m_aggregatedEvaluationFunction;

        std::unordered_set<LearnerPtr> m_parameterLearners;

@ -2930,11 +3059,14 @@ namespace CNTK
    /// 
    /// Instantiate the CNTK built-in test format minibatch source
    ///
-    inline MinibatchSourcePtr TextFormatMinibatchSource(const std::wstring& dataFilePath, const std::vector<StreamConfiguration>& streamConfigs, size_t epochSize = SIZE_MAX)
+    inline MinibatchSourcePtr TextFormatMinibatchSource(const std::wstring& dataFilePath, const std::vector<StreamConfiguration>& streamConfigs, size_t epochSize = SIZE_MAX, bool randomize = true)
    {
        CNTK::Dictionary minibatchSourceConfiguration;
        minibatchSourceConfiguration[L"epochSize"] = epochSize;

+        if (randomize)
+            minibatchSourceConfiguration[L"randomize"] = true;
+
        CNTK::Dictionary deserializerConfiguration;
        deserializerConfiguration[L"type"] = L"CNTKTextFormatDeserializer";
        deserializerConfiguration[L"file"] = dataFilePath;
@ -2968,4 +3100,17 @@ namespace CNTK
    CNTK_API void ComputeInputPerDimMeansAndInvStdDevs(const MinibatchSourcePtr& minibatchSource,
                                                       std::unordered_map<StreamInformation, std::pair<NDArrayViewPtr, NDArrayViewPtr>>& computedMeanAndVariances,
                                                       const DeviceDescriptor& device = DeviceDescriptor::CPUDevice());
+
+    ///
+    /// Set the process-wide setting for the maximum number of CPU threads to be used by any individual compute operation.
+    /// Note that this is a per-compute-operation limit: if the user performs multiple compute operations concurrently
+    /// by launching multiple threads and performing a compute operation inside each, every one of those concurrently
+    /// executing operations may use up to the specified number of CPU threads.
+    ///
+    CNTK_API void SetMaxNumCPUThreads(size_t numCPUThreads);
+
+    ///
+    /// Returns the current process-wide setting for maximum number of CPU threads to be used by any individual compute operation
+    ///
+    CNTK_API size_t GetMaxNumCPUThreads();
 }
--- a/Source/CNTKv2LibraryDll/API/CNTKLibraryInternals.h
+++ b/Source/CNTKv2LibraryDll/API/CNTKLibraryInternals.h
@ -186,9 +186,6 @@ namespace CNTK

    namespace Internal
    {
-        // Create a new Function instance which just passes through specified list of 'operands'.
-        CNTK_API FunctionPtr Combine(const std::vector<Variable>& operands, const std::wstring& name = L"");
-
        CNTK_API FunctionPtr IsWithin(const Variable& operand, int offset, const std::wstring& name = L"");
        CNTK_API FunctionPtr PackedIndex(const Variable& operand, const Variable& index, const std::wstring& name = L"");
        CNTK_API FunctionPtr GatherPacked(const Variable& operand, const Variable& packedIndex, const std::wstring& name = L"");
@ -202,10 +199,15 @@ namespace CNTK

        CNTK_API size_t NewUniqueId();

+        // Internal hooks for testing and higher-level bindings
+        // These should not be directly called by C++ API users
        CNTK_API void EnableReversingTensorShapesInErrorMessages();
        bool IsReversingTensorShapesInErrorMessagesEnabled();

        CNTK_API void AlwaysAllowSettingDefaultDevice();
        bool IsSettingDefaultDeviceAlwaysAllowed();
+
+        CNTK_API void DisableAutomaticUnpackingOfPackedValues();
+        bool IsAutomaticUnpackingOfPackedValuesDisabled();
    }
 }
--- a/Source/CNTKv2LibraryDll/BackCompat.cpp
+++ b/Source/CNTKv2LibraryDll/BackCompat.cpp
@ -36,8 +36,11 @@ namespace CNTK

        if (node->IsLeaf())
        {
+            std::wstring varUid, varName;
            if (node->Is<InputValueBase<ElementType>>())
            {
+                std::tie(varUid, varName) = UidAndNameFromCNTKInternalNodeName(node->NodeName(), VariableKind::Input);
+
                bool isSparse = node->Is<SparseInputValue<ElementType>>();
                if (node->HasMBLayout())
                {
@ -45,12 +48,12 @@ namespace CNTK
                    auto inputNodeInternalDynamicAxisName = node->GetMBLayout()->GetAxisName();
                    std::vector<Axis> inputVarDynamicAxes = DynamicAxesFromInternalDynamicAxisName(inputNodeInternalDynamicAxisName);

-                    var = Variable(varShape, isSparse, AsDataType<ElementType>(), node->GetLearningRateMultiplier() != 0, node->NodeName(), inputVarDynamicAxes, node->NodeName());
+                    var = Variable(varShape, isSparse, AsDataType<ElementType>(), node->GetLearningRateMultiplier() != 0, varName, inputVarDynamicAxes, varUid);
                }
                else
                {
                    // TODO: Allow creating inputs without a dynamic axis
-                    LogicError("Found InputNode with no dynamic axis which is currently unsupported");
+                    LogicError("Found InputNode with no dynamic axes which is currently unsupported");
                }
            }
            else if (node->Is<LearnableParameter<ElementType>>())
@ -60,9 +63,15 @@ namespace CNTK
                auto tensorView = new TensorView<ElementType>(std::make_shared<Matrix<ElementType>>(matrix.AsReference()), AsTensorViewShape(node->GetSampleLayout()));
                NDArrayViewPtr value = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), AsStorageFormat(matrix.GetFormat()), varShape, false, tensorView);
                if (isConstant)
-                    var = Constant(value, node->NodeName(), node->NodeName());
+                {
+                    std::tie(varUid, varName) = UidAndNameFromCNTKInternalNodeName(node->NodeName(), VariableKind::Constant);
+                    var = Constant(value, varName, varUid);
+                }
                else
-                    var = Parameter(value, node->NodeName(), node->NodeName());
+                {
+                    std::tie(varUid, varName) = UidAndNameFromCNTKInternalNodeName(node->NodeName(), VariableKind::Parameter);
+                    var = Parameter(value, varName, varUid);
+                }
            }
            else
                LogicError("CNTK::LoadLegacyModel: Unsupported legacy CNTK node named '%S'", node->NodeName().c_str());
@ -299,17 +308,17 @@ namespace CNTK
        std::unordered_map<ComputationNodeBasePtr, Variable> nodeToVariableMap;
        std::unordered_map<Variable, Variable> placeholderReplacements;
        std::unordered_set<FunctionPtr> allPrimitiveFunctions;
-        std::vector<FunctionPtr> rootFunctions;
+        std::vector<Variable> rootVariables;
        auto& networkRoots = net->RootNodes();
        for (auto& rootNode : networkRoots)
        {
            if (rootNode->IsLeaf())
                continue;

-            rootFunctions.push_back(GetVariable<ElementType>(rootNode, nodeToVariableMap, placeholderReplacements, allPrimitiveFunctions).Owner());
+            rootVariables.push_back(GetVariable<ElementType>(rootNode, nodeToVariableMap, placeholderReplacements, allPrimitiveFunctions).Owner());
        }

-        auto rootComposite = Combine(rootFunctions);
+        auto rootComposite = Combine(rootVariables);
        rootComposite->ReplacePlaceholders(placeholderReplacements);

        return rootComposite;
@ -350,8 +359,5 @@ namespace CNTK
        }

        computationNetwork->Save(modelFile);
-
-        if (!compositeFunction->NetworkMatricesAllocated())
-            compositeFunction->PurgeComputationNetwork();
    }
 }
--- a/Source/CNTKv2LibraryDll/Common.cpp
+++ b/Source/CNTKv2LibraryDll/Common.cpp
@ -5,9 +5,12 @@

 #include "stdafx.h"
 #include "CNTKLibrary.h"
+#include "Utils.h"
 #include "BestGpu.h"
 #include <mutex>
 #include <algorithm>
+#include <CPUMatrix.h> // For CPUMatrix::SetNumThreads
+#include <thread>

 namespace CNTK
 {
@ -40,6 +43,17 @@ namespace CNTK
        {
            return s_alwaysAllowSettingDefaultDevice.load();
        }
+
+        std::atomic<bool> s_disableAutomaticUnpackingOfPackedValues(false);
+        void DisableAutomaticUnpackingOfPackedValues()
+        {
+            s_disableAutomaticUnpackingOfPackedValues.store(true);
+        }
+
+        bool IsAutomaticUnpackingOfPackedValuesDisabled()
+        {
+            return s_disableAutomaticUnpackingOfPackedValues.load();
+        }
    }

    /*static*/ std::atomic<bool> DeviceDescriptor::s_defaultDeviceFrozen(false);
@ -62,7 +76,7 @@ namespace CNTK
        auto selectedDevice = DefaultDevice();
        if (!alreadyFrozen)
        {
-            Microsoft::MSR::CNTK::OnDeviceSelected(selectedDevice.Id());
+            Microsoft::MSR::CNTK::OnDeviceSelected(AsCNTKImplDeviceId(selectedDevice));
        }
        return selectedDevice;
    }
@ -74,7 +88,7 @@ namespace CNTK
            RuntimeError("Process wide default device cannot be changed since it has been frozen by being implicitly used as the default device in a CNTK API call");

        std::call_once(s_initDefaultDeviceFlag, []{
-            // do nothing. This will set the flag above, in case the DefaultDevice() was never called before.
+            // Intentionally empty: this only sets the flag above in case DefaultDevice() was never called before.
        });

        s_defaultDevice.reset(new DeviceDescriptor(newDefaultDevice));
@ -82,7 +96,9 @@ namespace CNTK
    
    /*static*/ DeviceDescriptor DeviceDescriptor::BestDevice()
    {
-        // TODO: add unit tests for this.
+        //TODO: BestDevice remains locked if UseDefaultDevice is never executed
+        // or if BestDevice() is invoked after UseDefaultDevice(). 
+        // Should we do anything about it?
        auto id = Microsoft::MSR::CNTK::GetBestDevice();
        return id >= 0 ? DeviceDescriptor::GPUDevice(id) : DeviceDescriptor::CPUDevice();
    }
@ -140,6 +156,12 @@ namespace CNTK
        return s_defaultBatchAxis;
    }

+    /*static*/ const Axis& Axis::AllStaticAxes()
+    {
+        static const Axis s_allStaticAxes(SentinelStaticAxisIndexValueForAllStaticAxes);
+        return s_allStaticAxes;
+    }
+
    /*static*/ Axis Axis::NewUniqueDynamicAxis(const std::wstring& axisNamePrefix, bool isOrderedDynamicAxis /*= true*/)
    {
        return Axis(s_uniqueDynamicAxisNames.NewUniqueDynamicAxisName(axisNamePrefix), isOrderedDynamicAxis);
@ -149,4 +171,16 @@ namespace CNTK
    {
        s_uniqueDynamicAxisNames.RegisterAxisName(axisName);
    }
+
+    std::atomic<size_t> s_maxNumCPUThreads(std::thread::hardware_concurrency());
+    void SetMaxNumCPUThreads(size_t numCPUThreads)
+    {
+        s_maxNumCPUThreads.store(numCPUThreads);
+        Microsoft::MSR::CNTK::CPUMatrix<float>::SetNumThreads((int)numCPUThreads);
+    }
+
+    size_t GetMaxNumCPUThreads()
+    {
+        return s_maxNumCPUThreads.load();
+    }
 }
--- a/Source/CNTKv2LibraryDll/Function.cpp
+++ b/Source/CNTKv2LibraryDll/Function.cpp
@ -16,6 +16,7 @@
 #include "InputAndParamNodes.h"
 #include "NonlinearityNodes.h"
 #include "RecurrentNodes.h"
+#include "Value.h"

 using namespace Microsoft::MSR::CNTK;

@ -81,7 +82,7 @@ namespace CNTK
            }
        }

-        auto outputsUsingNewInputs = PrimitiveFunction::GetOutputVariables(primitiveFunction->OpType(), m_inputs, this, primitiveFunction->Attributes());
+        auto outputsUsingNewInputs = PrimitiveFunction::GetOutputVariables(primitiveFunction->OpType(), m_inputs, this, primitiveFunction->Attributes(), primitiveFunction->Name());
        auto currentOutputs = Outputs();
        for (size_t i = 0; i < currentOutputs.size(); ++i)
        {
@ -197,7 +198,7 @@ namespace CNTK
    {
        auto placeholders = Placeholders();
        if (placeholders.size() != 1)
-            InvalidArgument("Function::ReplacePlaceholders called with a single replacement variable but this Function has none or more than 1 placeholders");
+            InvalidArgument("Function::ReplacePlaceholders called with a single replacement variable but this Function has %d placeholders", (int)placeholders.size());

        return ReplacePlaceholders({ { *(placeholders.begin()), placeholderReplacement } });
    }
@ -413,26 +414,52 @@ namespace CNTK
    /*static*/ const std::wstring PrimitiveFunction::AttributeNameEndIndex = L"endIndex";
    /*static*/ const std::wstring PrimitiveFunction::AttributeNameReductionOpName = L"reductionOpName";

-    /*static*/ std::vector<Variable> PrimitiveFunction::GetOutputVariables(PrimitiveOpType op, const std::vector<Variable>& inputs, Function* owner, const Dictionary& functionConfig)
+    /*static*/ std::vector<Variable> PrimitiveFunction::GetOutputVariables(PrimitiveOpType op, const std::vector<Variable>& inputs, Function* owner, const Dictionary& functionConfig, const std::wstring& functionName)
    {
        if (op == PrimitiveOpType::Combine)
            return inputs;

-        // TODO: We are just using the primary operand's DataType as output node's DataType. Is this always correct?
+        // We use the first non-constant input operand's DataType as the output DataType
+        // In case there are no non-constant known DataTypes, we just pick the first known operand DataType
+        // Also, all the known DataTypes of operands should match except for constants where coercion is allowed
+        DataType firstKnownInputDataType = DataType::Unknown;
        DataType outputDataType = DataType::Unknown;
        NDShape outputShape;
        size_t i = 0;
-        while ((outputDataType == DataType::Unknown) && (i < inputs.size()))
-            outputDataType = inputs[i++].GetDataType();
+        while (i < inputs.size())
+        {
+            auto input = inputs[i++];
+            auto inputDataType = input.GetDataType();
+            if (inputDataType != DataType::Unknown)
+            {
+                if (firstKnownInputDataType == DataType::Unknown)
+                    firstKnownInputDataType = inputDataType;

                if (outputDataType == DataType::Unknown)
-            InvalidArgument("The DataType of all the input operands of primitive function with op type %s are unknown", PrimitiveOpTypeName(op));
+                {
+                    if (!input.IsConstant())
+                        outputDataType = inputDataType;
+                }
+                else
+                {
+                    // The DataType of all operands should match except for Constants where we allow coercion
+                    if ((inputDataType != DataType::Unknown) && (inputDataType != outputDataType) && !input.IsConstant())
+                        InvalidArgument("Primitive function with op type %S has operands with different DataTypes %s and %s", PrimitiveOpTypeName(op).c_str(), DataTypeName(outputDataType), DataTypeName(inputDataType));
+                }
+            }
+        }
+
+        if (outputDataType == DataType::Unknown)
+            outputDataType = firstKnownInputDataType;
+
+        if (outputDataType == DataType::Unknown)
+            InvalidArgument("The DataType of all the input operands of primitive function with op type %S are unknown", PrimitiveOpTypeName(op).c_str());

        // We currently require that the inputs' dynamic axes if any match
        std::vector<Axis> outputDynamicAxes;
        if ((op == PrimitiveOpType::SumAll) || (op == PrimitiveOpType::SquaredError) || (op == PrimitiveOpType::CrossEntropyWithSoftmax) || (op == PrimitiveOpType::ClassificationError))
            outputDynamicAxes = std::vector<Axis>({});
-        if (op == PrimitiveOpType::Where)
+        else if (op == PrimitiveOpType::Where)
            outputDynamicAxes = AsVector<Axis>(functionConfig[PrimitiveFunction::AttributeNameNewDynamicAxes].Value<std::vector<DictionaryValue>>());
        else if (op == PrimitiveOpType::ScatterPacked)
            outputDynamicAxes = inputs[2].DynamicAxes();
@ -598,18 +625,18 @@ namespace CNTK
            assert(inputs.size() == 2);

            if ((inputs[0].Shape().Rank() > 2) || ((inputs[0].Shape().Rank() > 1) && (inputs[0].Shape()[1] != 1)))
-                InvalidArgument("The shape of input operands for the %s operation should have at most one axis", PrimitiveOpTypeName(op));
+                InvalidArgument("The shape of input operands for the %S operation should have at most one axis", PrimitiveOpTypeName(op).c_str());

            auto predictionShape = inputs[0].Shape();
            auto labelsShape = inputs[1].Shape();
            if (predictionShape != labelsShape)
-                RuntimeError("Prediction output operand's shape %S is incompatible with label operand's shape %S for the %s operation", AsStringForErrorReporting(predictionShape).c_str(), AsStringForErrorReporting(labelsShape).c_str(), PrimitiveOpTypeName(op));
+                RuntimeError("Prediction output operand's shape %S is incompatible with label operand's shape %S for the %S operation", AsStringForErrorReporting(predictionShape).c_str(), AsStringForErrorReporting(labelsShape).c_str(), PrimitiveOpTypeName(op).c_str());

            std::vector<size_t> reductionAxes;
            for (size_t i = 0; i < inputs[0].Shape().Rank(); ++i)
                reductionAxes.push_back(i);

-            outputShape = ReductionOpOutputShape(op, predictionShape, reductionAxes);
+            outputShape = ReductionOpOutputShape(op, predictionShape, reductionAxes, /*preserveReductionAxes =*/ false);
            break;
        }
        case PrimitiveOpType::PastValue:
@ -630,9 +657,13 @@ namespace CNTK
        {
            assert(inputs.size() == 1);
            auto reductionAxis = functionConfig[PrimitiveFunction::AttributeNameAxis].Value<Axis>();
+            if (reductionAxis == Axis::AllStaticAxes())
+                outputShape = {};
+            else
+            {
                std::vector<size_t> reductionAxes = { reductionAxis.StaticAxisIndex() };
-
-            outputShape = ReductionOpOutputShape(op, inputs[0].Shape(), reductionAxes);
+                outputShape = ReductionOpOutputShape(op, inputs[0].Shape(), reductionAxes, /*preserveReductionAxes =*/ true);
+            }
            break;
        }
        case PrimitiveOpType::BatchNormalization:
@ -664,9 +695,6 @@ namespace CNTK
            if (inputs[0].DynamicAxes().empty() || inputs[1].DynamicAxes().empty() || inputs[2].DynamicAxes().empty())
                InvalidArgument("ScatterPacked requires all its operands to have dynamic axes");

-            if (inputs[1].Shape().Rank() != 1)
-                InvalidArgument("ScatterPacked requires the packedIndex operand to be a scalar sequence");
-
            outputShape = inputs[0].Shape();
            break;
        }
@ -686,13 +714,14 @@ namespace CNTK
            break;
        }
        default:
-            LogicError("Specified op %s not yet supported", PrimitiveOpTypeName(op));
+            LogicError("Specified op %S not yet supported", PrimitiveOpTypeName(op).c_str());
            break;
        }

-        return{ OutputVariable(outputShape, outputDataType, owner, outputDynamicAxes) };
+        return{ OutputVariable(outputShape, outputDataType, owner, outputDynamicAxes, functionName.empty() ? L"" : functionName + L"_output") };
    }

+    /*static*/ const std::wstring CompositeFunction::CompositeFunctionOpName = L"CompositeFunctionOpName";
    /*static*/ std::atomic<unsigned int> CompositeFunction::s_nextAutoGeneratedDynamicAxis(0);

    // Names of the dynamic axes in the CNTK engine for some special sets of dynamic axes values
@ -746,9 +775,10 @@ namespace CNTK
        variableToNodeMap[variable] = nullptr;

        std::shared_ptr<ComputationNode<ElementType>> computationNodePtr;
+        auto internalNodeName = CNTKInternalNodeNameFromUidAndName(variable.Uid(), variable.Name());
        if (variable.IsParameter() || variable.IsConstant())
        {
-            computationNodePtr = builder.CreateLearnableParameter(variable.Uid(), AsTensorShape(variable.Shape()));
+            computationNodePtr = builder.CreateLearnableParameter(internalNodeName, AsTensorShape(variable.Shape()));
            network->InitLearnableParameters(computationNodePtr, L"fixedValue", 0); // must call this to follow protocol; can overwrite later
            if (!variable.NeedsGradient())
                computationNodePtr->SetLearningRateMultiplier(0.0);
@ -786,9 +816,9 @@ namespace CNTK
                network->AddNodeToNetAndAttachInputs(New<DynamicAxisNode<ElementType>>(network->GetDeviceId(), internalDynamicAxisName), {});

            if (IsSparseInput(variable))
-                computationNodePtr = builder.CreateSparseInputNode(variable.Uid(), AsTensorShape(variable.Shape()), internalDynamicAxisName);
+                computationNodePtr = builder.CreateSparseInputNode(internalNodeName, AsTensorShape(variable.Shape()), internalDynamicAxisName);
            else
-                computationNodePtr = builder.CreateInputNode(variable.Uid(), AsTensorShape(variable.Shape()), internalDynamicAxisName);
+                computationNodePtr = builder.CreateInputNode(internalNodeName, AsTensorShape(variable.Shape()), internalDynamicAxisName);

            if (variable.NeedsGradient())
            {
@ -1033,7 +1063,7 @@ namespace CNTK
            break;
        }
        default:
-            LogicError("Specified op %s not yet supported", PrimitiveOpTypeName(op));
+            LogicError("Specified op %S not yet supported", PrimitiveOpTypeName(op).c_str());
            break;
        }

@ -1047,8 +1077,8 @@ namespace CNTK
        {
            auto computationNodeExpectedInputCount = computationNodePtr->As<INumInputs>()->GetExpectedNumInputs();
            if (computationNodeExpectedInputCount != inputNodesBasePtrs.size())
-                LogicError("Input count mismatch: The Primitive function for op %s has %d inputs while the corresponding ComputationNode has %d inputs",
-                           PrimitiveOpTypeName(op),
+                LogicError("Input count mismatch: The Primitive function for op %S has %d inputs while the corresponding ComputationNode has %d inputs",
+                           PrimitiveOpTypeName(op).c_str(),
                           (int)inputNodesBasePtrs.size(),
                           (int)computationNodeExpectedInputCount);
        }
@ -1128,8 +1158,8 @@ namespace CNTK
            // TODO: Support changing the device across different invocations of the forward method on a Function instance
            if (AsDeviceDescriptor(m_computationNetwork->GetDeviceId()) != device)
                LogicError("Changing device across different Forward calls on a CNTK composite Function is currently unsupported");
-        }

+        }
        else
        {
            m_computationNetwork = std::make_shared<ComputationNetwork>(AsCNTKImplDeviceId(device));
@ -1140,20 +1170,11 @@ namespace CNTK
            if (backpropRoots.size() > 1)
                LogicError("More than one backprop roots is currently unsupported");

-            ComputationNodeBasePtr backpropRootNode;
-
            // Now recursively create the network in a top-down fashion
            auto rootFunction = RootFunction();
            auto rootFunctionOutputs = rootFunction->Outputs();
-            std::vector<ComputationNodeBasePtr> forwardRootNodes;
            for (auto rootOutput : rootFunctionOutputs)
-            {
-                auto currentRootNode = GetNode(rootOutput, m_computationNetwork, builder, m_variableToNodeMap, m_isVariableRootMap);
-                forwardRootNodes.push_back(currentRootNode);
-
-                if (backpropRoots.find(rootOutput) != backpropRoots.end())
-                    backpropRootNode = m_variableToNodeMap[rootOutput];
-            }
+                GetNode(rootOutput, m_computationNetwork, builder, m_variableToNodeMap, m_isVariableRootMap);

            // If any of the function outputs is not a root node, we need to explicitly add it to the 'output' group of the ComputationNetwork
            for (auto rootOutput : rootFunctionOutputs)
@ -1212,8 +1233,26 @@ namespace CNTK
                    }
                }
            }
+        }
+
+
+        if (!m_networkMatricesAllocated && allocateNetworkMatrices)
+        {
+            ComputationNodeBasePtr backpropRootNode;
+
+            // Look up the already-created nodes of the root Function's outputs; they are the forward roots for matrix allocation
+            auto rootFunction = RootFunction();
+            auto rootFunctionOutputs = rootFunction->Outputs();
+            std::vector<ComputationNodeBasePtr> forwardRootNodes;
+            for (auto rootOutput : rootFunctionOutputs)
+            {
+                auto currentRootNode = m_variableToNodeMap[rootOutput];
+                forwardRootNodes.push_back(currentRootNode);
+
+                if (m_currentBackpropRoots.find(rootOutput) != m_currentBackpropRoots.end())
+                    backpropRootNode = currentRootNode;
+            }

-            if (allocateNetworkMatrices)
            m_computationNetwork->AllocateAllMatrices(forwardRootNodes, {}, backpropRootNode);
            m_networkMatricesAllocated = allocateNetworkMatrices;
        }
@ -1224,107 +1263,180 @@ namespace CNTK
    template <typename ElementType>
    /*static*/ std::pair<std::shared_ptr<const Matrix<ElementType>>, MBLayoutPtr> CompositeFunction::GetCNTKImplMatrixAndMBLayoutFromValueObject(Variable var, const ValuePtr& value)
    {
-        if (var.GetDataType() != value->Data()->GetDataType())
-            LogicError("The Variable's DataType %s does not match the corresponding Value's DataType %s", DataTypeName(var.GetDataType()), DataTypeName(value->Data()->GetDataType()));
+        if (var.GetDataType() != value->GetDataType())
+            LogicError("The Variable's DataType %s does not match the corresponding Value's DataType %s", DataTypeName(var.GetDataType()), DataTypeName(value->GetDataType()));

-        if (AsDataType<ElementType>() != value->Data()->GetDataType())
-            LogicError("The specified ElementType %s does not match the DataType %s", typeid(ElementType).name(), DataTypeName(value->Data()->GetDataType()));
+        if (AsDataType<ElementType>() != value->GetDataType())
+            LogicError("The specified ElementType %s does not match the DataType %s", typeid(ElementType).name(), DataTypeName(value->GetDataType()));

        // TODO: Is supplying dense data for an Input variable tagged as sparse, a fatal error?
-        if (IsSparseInput(var) && !value->Data()->IsSparse())
+        if (IsSparseInput(var) && !value->IsSparse())
            InvalidArgument("Dense input data supplied for a sparse input Variable");

-        if (IsSparseInput(var) && (value->Data()->GetStorageFormat() != StorageFormat::SparseCSC))
+        if (IsSparseInput(var) && (value->GetStorageFormat() != StorageFormat::SparseCSC))
            InvalidArgument("Sparse Input data must be in SparseCSC format");

-        if (value->Data()->Shape().Rank() == var.Shape().Rank())
-            return{ value->Data()->GetMatrix<ElementType>(), nullptr };
+        auto varShape = var.Shape();
+        auto valueShape = value->Shape();
+        if (valueShape.Rank() < varShape.Rank())
+            InvalidArgument("Value's rank should be >= the Variable's rank");

-        if (value->Data()->Shape().Rank() < (var.Shape().Rank() + var.DynamicAxes().size()))
-            InvalidArgument("Value's number of axes should be larger than the Variable's number of axes by number of dynamic axes");
+        size_t maxAddionalValueAxes = std::max<size_t>(2, var.DynamicAxes().size());
+        if (valueShape.Rank() > (varShape.Rank() + maxAddionalValueAxes))
+            InvalidArgument("Value rank should be larger than the Variable%S rank at most by number of dynamic axes", ParanthesizedName(var.Name()).c_str());
+
+        if (valueShape.SubShape(0, varShape.Rank()) != varShape)
+        {
+            InvalidArgument("The %s dimensions of the Value shape %S do not match the shape of the variable %S that it corresponds to!",
+                            Internal::IsReversingTensorShapesInErrorMessagesEnabled() ? "trailing" : "leading",
+                            AsStringForErrorReporting(valueShape).c_str(),
+                            AsStringForErrorReporting(varShape).c_str());
+        }
+
+        if (var.DynamicAxes().empty())
+            return{ value->Data()->GetMatrix<ElementType>(), nullptr };

        if (var.DynamicAxes().size() > 2)
            LogicError("More than 2 dynamic axis for a variable is currently unsupported");

-        if (value->Data()->Shape().SubShape(0, var.Shape().Rank()) != var.Shape())
-        {
-            InvalidArgument("The %s dimensions of the Value shape %S do not match the shape of the variable %S that it corresponds to!", 
-                            Internal::IsReversingTensorShapesInErrorMessagesEnabled() ? "trailing" : "leading",
-                            AsStringForErrorReporting(value->Data()->Shape()).c_str(),
-                            AsStringForErrorReporting(var.Shape()).c_str());
-        }
-
-        size_t maxNumTimeSteps = value->Data()->Shape()[var.Shape().Rank()];
-        size_t numSequences = value->Data()->Shape()[var.Shape().Rank() + 1];
-
        auto mask = value->Mask();
-        if ((mask != nullptr) && ((var.Shape().Rank() + mask->Shape().Rank()) != value->Data()->Shape().Rank()))
+        if ((mask != nullptr) && ((varShape.Rank() + mask->Shape().Rank()) != valueShape.Rank()))
            InvalidArgument("Invalid Value object; the sum of the rank of the mask and data does not equal the Variable's rank + number of dynamic axes");

-        if ((numSequences == 1) || (maxNumTimeSteps == 1))
-        {
-            // The data need not be shuffled
-            std::shared_ptr<const Matrix<ElementType>> matrixData = value->Data()->GetMatrix<ElementType>(var.Shape().Rank());
-            auto layout = std::make_shared<MBLayout>();
-            if (maxNumTimeSteps == 1)
-                layout->InitAsFrameMode(numSequences);
-            else
-            {
-                layout->Init(1, maxNumTimeSteps);
-                layout->AddSequence(0, 0, 0, maxNumTimeSteps);
-            }
+        auto getNumTimeStepsAndSequencesFunc = [](const NDShape& maskShape) {
+            size_t maxNumTimeSteps = 1;
+            size_t numSequences = 1;
+            if (maskShape.Rank() > 0)
+                maxNumTimeSteps = maskShape[0];
+
+            if (maskShape.Rank() > 1)
+                numSequences = maskShape[1];
+
+            return std::pair<size_t, size_t>(maxNumTimeSteps, numSequences);
+        };
+
+        size_t maxNumTimeSteps, numSequences;
+        std::tie(maxNumTimeSteps, numSequences) = getNumTimeStepsAndSequencesFunc(valueShape.SubShape(varShape.Rank()));
+
+        auto getSequenceStartsAndLengthsFunc = [&getNumTimeStepsAndSequencesFunc](const NDMaskPtr& mask, std::vector<ptrdiff_t>& sequenceBeginIndices, std::vector<size_t>& sequenceLengths) {
+            auto cpuMask = mask;
+            if (mask->Device() != DeviceDescriptor::CPUDevice())
+                cpuMask = mask->DeepClone(DeviceDescriptor::CPUDevice());
+
+            const MaskKind* maskBuffer = cpuMask->DataBuffer();
+            size_t maxNumTimeSteps, numSequences;
+            std::tie(maxNumTimeSteps, numSequences) = getNumTimeStepsAndSequencesFunc(mask->Shape());

-            return{ matrixData , layout};
-        }
-        else
-        {
-            std::vector<size_t> sequenceLengths(numSequences, maxNumTimeSteps);
-            if (mask != nullptr)
-            {
-                // Determine the sequence lengths from the mask
-                std::unique_ptr<char[]> maskData(mask->GetMatrix()->CopyToArray());
            for (size_t i = 0; i < numSequences; ++i)
            {
-                    size_t currentSequenceLength = 0;
+                MaskKind firstMaskEntry = maskBuffer[i * maxNumTimeSteps];
+                if (firstMaskEntry == MaskKind::SequenceBegin)
+                    sequenceBeginIndices[i] = 0;
+                else if (firstMaskEntry == MaskKind::Valid)
+                    sequenceBeginIndices[i] = Microsoft::MSR::CNTK::SentinelValueIndicatingUnspecifedSequenceBeginIdx;
+                else
+                    LogicError("The first entry of a mask should be Valid or SequenceBegin");
+
+                size_t currentSequenceLength = 1;
                bool currentSequenceEndAlreadyFound = false;
-                    for (size_t j = 0; j < maxNumTimeSteps; ++j)
+                for (size_t j = 1; j < maxNumTimeSteps; ++j)
                {
-                        if (maskData[(i * maxNumTimeSteps) + j] == 1)
+                    if (maskBuffer[(i * maxNumTimeSteps) + j] == MaskKind::Invalid)
+                        currentSequenceEndAlreadyFound = true;
+                    else
                    {
                        if (currentSequenceEndAlreadyFound)
                            InvalidArgument("Invalid Value object; only trailing steps of a sequence can be masked");

                        currentSequenceLength++;
                    }
-                        else
-                            currentSequenceEndAlreadyFound = true;
                }

                sequenceLengths[i] = currentSequenceLength;
            }
+        };
+
+        if ((numSequences == 1) || (maxNumTimeSteps == 1))
+        {
+            // The data need not be shuffled
+            std::shared_ptr<const Matrix<ElementType>> matrixData = value->Data()->GetMatrix<ElementType>(varShape.Rank());
+            auto layout = std::make_shared<MBLayout>();
+            if (!mask)
+            {
+                if (maxNumTimeSteps == 1)
+                    layout->InitAsFrameMode(numSequences);
+                else
+                {
+                    layout->Init(numSequences, maxNumTimeSteps);
+                    layout->AddSequence(0, 0, 0, maxNumTimeSteps);
+                }
+            }
+            else
+            {
+                layout->Init(numSequences, maxNumTimeSteps);
+
+                std::vector<ptrdiff_t> sequenceBeginIndices(numSequences, 0);
+                std::vector<size_t> sequenceLengths(numSequences, maxNumTimeSteps);
+                getSequenceStartsAndLengthsFunc(mask, sequenceBeginIndices, sequenceLengths);
+
+                for (size_t i = 0; i < numSequences; ++i)
+                    layout->AddSequence(i, i, sequenceBeginIndices[i], sequenceLengths[i]);
            }

-            // The data needs to be rearranged since CNTK requires sequences to be interleaved across timesteps
-            std::vector<MBLayout::SequenceInfo> sequences;
-            for (size_t i = 0; i < numSequences; ++i)
-                sequences.push_back({ i, SIZE_MAX, 0, sequenceLengths[i]});
+            return{ matrixData , layout};
+        }
+        else
+        {
+            std::vector<ptrdiff_t> sequenceBeginIndices(numSequences, 0);
+            std::vector<size_t> sequenceLengths(numSequences, maxNumTimeSteps);
+            if (mask != nullptr)
+                getSequenceStartsAndLengthsFunc(mask, sequenceBeginIndices, sequenceLengths);
+
+            bool hasTruncatedSequences = std::find_if(sequenceBeginIndices.begin(), sequenceBeginIndices.end(), [](const ptrdiff_t& val) { return (val < 0); }) != sequenceBeginIndices.end(); // compare as ptrdiff_t: narrowing to int could turn a very negative begin index into a non-negative value

            auto layout = std::make_shared<MBLayout>();
            std::vector<std::pair<size_t, size_t>> placement;
+            if (!hasTruncatedSequences)
+            {
+                std::vector<MBLayout::SequenceInfo> sequences;
+                for (size_t i = 0; i < numSequences; ++i)
+                    sequences.push_back({ i, SIZE_MAX, sequenceBeginIndices[i], sequenceLengths[i] });
+
                std::vector<size_t> rowAllocations;
                layout->InitAsPackedSequences(sequences, placement, rowAllocations);
+            }
+            else
+            {
+                layout->Init(numSequences, maxNumTimeSteps);
+
+                // We cannot pack as some of the sequences are truncated and thus all sequences have to be
+                // kept in their original parallel streams
+                placement.resize(numSequences);
+                for (size_t i = 0; i < numSequences; ++i)
+                {
+                    layout->AddSequence(i, i, sequenceBeginIndices[i], sequenceLengths[i]);
+
+                    // Add the gap if there is one
+                    if (sequenceLengths[i] < maxNumTimeSteps)
+                        layout->AddSequence(GAP_SEQUENCE_ID, i, sequenceLengths[i], maxNumTimeSteps);
+
+                    placement[i] = std::make_pair(i, 0);
+                }
+            }
+
            if (maxNumTimeSteps != layout->GetNumTimeSteps())
                LogicError("The number of time steps in the packed MBLayout does not match the longest sequence's length in the Value object");

            if (numSequences != layout->GetNumSequences())
                LogicError("The number of sequences in the packed MBLayout does not match the sequence count in the Value object");

+            // The data needs to be rearranged since CNTK requires sequences to be interleaved across timesteps
            // Now generate the gather indices
-            auto matrixData = std::make_shared<Matrix<ElementType>>(var.Shape().TotalSize(),
+            auto matrixData = std::make_shared<Matrix<ElementType>>(varShape.TotalSize(),
                                                                    layout->GetNumCols(),
-                                                                    AsCNTKImplDeviceId(value->Data()->Device()),
-                                                                    value->Data()->IsSparse() ? MatrixType::SPARSE : MatrixType::DENSE,
-                                                                    AsCNTKImplMatrixFormat(value->Data()->GetStorageFormat()));
+                                                                    AsCNTKImplDeviceId(value->Device()),
+                                                                    value->IsSparse() ? MatrixType::SPARSE : MatrixType::DENSE,
+                                                                    AsCNTKImplMatrixFormat(value->GetStorageFormat()));

            std::vector<size_t> sequencesShorterThanLongestSequence;
            for (size_t i = 0; i < numSequences; ++i)
@ -1342,8 +1454,8 @@ namespace CNTK
                    gatherIndicesVector[((targetStartIdxInParallelStream + j) * layout->GetNumParallelSequences()) + targetParallelStreamIdx] = (ElementType)((i * maxNumTimeSteps) + j);
            }

-            auto gatherIdxMatrix = std::make_shared<Matrix<ElementType>>(1, layout->GetNumCols(), gatherIndicesVector.data(), AsCNTKImplDeviceId(value->Data()->Device()));
-            matrixData->DoGatherColumnsOf(0, *gatherIdxMatrix, *(value->Data()->GetMatrix<ElementType>(var.Shape().Rank())), 1);
+            auto gatherIdxMatrix = std::make_shared<Matrix<ElementType>>(1, layout->GetNumCols(), gatherIndicesVector.data(), AsCNTKImplDeviceId(value->Device()));
+            matrixData->DoGatherColumnsOf(0, *gatherIdxMatrix, *(value->Data()->GetMatrix<ElementType>(varShape.Rank())), 1);
            return{ matrixData, layout };
        }
    }
@ -1352,53 +1464,111 @@ namespace CNTK
    /*static*/ ValuePtr CompositeFunction::GetValueObjectFromCNTKImplMatrixAndMBLayout(const NDShape& sampleShape, const Matrix<ElementType>& matrix, const MBLayoutPtr& layout, bool readOnly /*= true*/)
    {
        NDShape valueDataShape = sampleShape;
+
+        size_t maxNumTimeSteps = 1;
+        size_t numSequences = 1;
        if (layout != nullptr)
-            valueDataShape = valueDataShape.AppendShape({ layout->GetNumTimeSteps(), layout->GetNumSequences() });
+        {
+            maxNumTimeSteps = layout->GetNumTimeSteps();
+            numSequences = layout->GetNumSequences();
+            valueDataShape = valueDataShape.AppendShape({ maxNumTimeSteps, numSequences });
+        }
+
+        auto createMaskFunc = [](const MBLayoutPtr& layout, const DeviceDescriptor& device, std::vector<size_t>& sequencesShorterThanLongestSequence) {
+            std::vector<bool> sequenceBeginFlags;
+            std::vector<size_t> sequenceLengths;
+            sequencesShorterThanLongestSequence.clear();
+
+            size_t maxNumTimeSteps = layout->GetNumTimeSteps();
+            size_t numSequences = layout->GetNumSequences();
+            auto& layoutSequences = layout->GetAllSequences();
+
+            size_t sequenceIdx = 0;
+            bool allSequencesStartInThisMB = true;
+            bool allSequencesSameLength = true;
+            for (auto sequenceInfo : layoutSequences)
+            {
+                if (sequenceInfo.seqId != GAP_SEQUENCE_ID)
+                {
+                    auto currentSequenceBeginIdx = std::max<ptrdiff_t>(0, sequenceInfo.tBegin);
+                    auto currentSequenceEndIdx = std::min(maxNumTimeSteps, sequenceInfo.tEnd);
+                    auto currentSequenceLength = (currentSequenceEndIdx - currentSequenceBeginIdx);
+                    auto isCurrentSequenceBeginningInsideThisMB = sequenceInfo.tBegin >= 0;
+
+                    allSequencesStartInThisMB = allSequencesStartInThisMB && isCurrentSequenceBeginningInsideThisMB;
+                    allSequencesSameLength = allSequencesSameLength && (currentSequenceLength == maxNumTimeSteps);
+
+                    sequenceBeginFlags.push_back(isCurrentSequenceBeginningInsideThisMB);
+                    sequenceLengths.push_back(currentSequenceLength);
+
+                    if (currentSequenceLength != maxNumTimeSteps)
+                        sequencesShorterThanLongestSequence.push_back(sequenceIdx);
+
+                    sequenceIdx++;
+                }
+            }
+
+            if (!allSequencesStartInThisMB && (numSequences != layout->GetNumParallelSequences()))
+                LogicError("Cannot create an unpacked Value object from packed data where one or more sequences are truncated");
+
+            bool maskNeeded = !allSequencesSameLength || !allSequencesStartInThisMB;
+
+            NDMaskPtr mask;
+            if (maskNeeded)
+            {
+                mask = MakeSharedObject<NDMask>(NDShape({ maxNumTimeSteps, numSequences }), device);
+                for (size_t i = 0; i < numSequences; ++i)
+                    if (sequenceBeginFlags[i])
+                        mask->MarkSequenceBegin({0, i});
+
+                for (auto shortSequenceIdx : sequencesShorterThanLongestSequence)
+                    mask->InvalidateSection({ sequenceLengths[shortSequenceIdx], shortSequenceIdx }, { NDShape::InferredDimension, 1 });
+            }
+
+            return mask;
+        };

        // No data shuffling needed if no layout or the layout has just one time-step or just one sequence
-        if ((layout == nullptr) || (layout->GetNumTimeSteps() == 1) || (layout->GetNumSequences() == 1))
+        std::vector<size_t> sequencesShorterThanLongestSequence;
+        if ((maxNumTimeSteps == 1) || (numSequences == 1))
        {
            // Just create a view over the existing matrix itself
            auto tensorView = new TensorView<ElementType>(std::make_shared<Matrix<ElementType>>(matrix.AsReference()), AsTensorViewShape(valueDataShape));
            auto data = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), AsStorageFormat(matrix.GetFormat()), valueDataShape, readOnly, tensorView);
+            if (layout == nullptr)
                return MakeSharedObject<Value>(data);
+            else
+            {
+                auto mask = createMaskFunc(layout, AsDeviceDescriptor(matrix.GetDeviceId()), sequencesShorterThanLongestSequence);
+                return MakeSharedObject<Value>(data, mask);
+            }
        }

        if (layout->GetNumCols() != matrix.GetNumCols())
            LogicError("Bad MBLayout: The number of columns in the MBLayout does not match the number of columns in the data matrix!");

-        size_t maxNumTimeSteps = layout->GetNumTimeSteps();
-        size_t numSequences = layout->GetNumSequences();
-
-        std::vector<size_t> sequenceLengths;
-        auto& layoutSequences = layout->GetAllSequences();
-        for (auto sequenceInfo : layoutSequences)
-        {
-            if (sequenceInfo.seqId != GAP_SEQUENCE_ID)
-                sequenceLengths.push_back(sequenceInfo.GetNumTimeSteps());
-        }
-
        // Reshuffle to data to unpack and uninterleave the CNTK form packed data
        // Now generate the scatter indices
        auto shuffledMatrixData = std::make_shared<Matrix<ElementType>>(matrix.GetNumRows(), maxNumTimeSteps * numSequences, matrix.GetDeviceId(), matrix.GetMatrixType(), matrix.GetFormat());
-
-        std::vector<size_t> sequencesShorterThanLongestSequence;
-        for (size_t i = 0; i < numSequences; ++i)
-            if (sequenceLengths[i] != maxNumTimeSteps)
-                sequencesShorterThanLongestSequence.push_back(i);
+        auto mask = createMaskFunc(layout, AsDeviceDescriptor(matrix.GetDeviceId()), sequencesShorterThanLongestSequence);

        // Set the target location of all gaps to be the last step of the first sequence that is shorter than the longest sequence in the batch
        size_t targetColIdxForInvalidColumns = sequencesShorterThanLongestSequence.empty() ? 0 : (((sequencesShorterThanLongestSequence[0] + 1) * maxNumTimeSteps) - 1);
        std::vector<ElementType> scatterIndicesVector(layout->GetNumCols(), (ElementType)targetColIdxForInvalidColumns);
+
        size_t i = 0;
+        auto& layoutSequences = layout->GetAllSequences();
        for (auto sequenceInfo : layoutSequences)
        {
            if (sequenceInfo.seqId != GAP_SEQUENCE_ID)
            {
                size_t targetParallelStreamIdx = sequenceInfo.s;
-                size_t targetStartIdxInParallelStream = sequenceInfo.tBegin;
-                for (size_t j = 0; j < sequenceInfo.GetNumTimeSteps(); ++j)
-                    scatterIndicesVector[((targetStartIdxInParallelStream + j) * layout->GetNumParallelSequences()) + targetParallelStreamIdx] = (ElementType)((i * maxNumTimeSteps) + j);
+                auto currentSequenceBeginIdx = std::max<ptrdiff_t>(0, sequenceInfo.tBegin);
+                auto currentSequenceEndIdx = std::min(maxNumTimeSteps, sequenceInfo.tEnd);
+                size_t currentSequenceLength = (currentSequenceEndIdx - currentSequenceBeginIdx);
+
+                for (size_t j = 0; j < currentSequenceLength; ++j)
+                    scatterIndicesVector[((currentSequenceBeginIdx + j) * layout->GetNumParallelSequences()) + targetParallelStreamIdx] = (ElementType)((i * maxNumTimeSteps) + j);

                i++;
            }
@ -1407,17 +1577,6 @@ namespace CNTK
        auto scatterIdxMatrix = std::make_shared<Matrix<ElementType>>(1, layout->GetNumCols(), scatterIndicesVector.data(), matrix.GetDeviceId());
        shuffledMatrixData->DoScatterColumnsOf(0, *scatterIdxMatrix, matrix, 1);

-        // Create the mask if needed
-        NDMaskPtr mask;
-        if (!sequencesShorterThanLongestSequence.empty())
-        {
-            mask = MakeSharedObject<NDMask>(NDShape({ maxNumTimeSteps, numSequences }), AsDeviceDescriptor(matrix.GetDeviceId()));
-            for (auto shortSequenceIdx : sequencesShorterThanLongestSequence)
-            {
-                mask->MaskSection({ sequenceLengths[shortSequenceIdx], shortSequenceIdx }, { NDShape::InferredDimension, 1 });
-            }
-        }
-
        auto tensorView = new TensorView<ElementType>(shuffledMatrixData, AsTensorViewShape(valueDataShape));
        auto data = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), AsDeviceDescriptor(matrix.GetDeviceId()), AsStorageFormat(shuffledMatrixData->GetFormat()), valueDataShape, readOnly, tensorView);
        return MakeSharedObject<Value>(data, mask);
@ -1441,7 +1600,13 @@ namespace CNTK
    template <typename ElementType>
    /*static*/ void CompositeFunction::PopulateComputationNodeValue(const std::pair<Variable, ValuePtr>& variableValue, ComputationNodeBasePtr& computationNode)
    {
-        auto CNTKMatrixAndMBLayout = GetCNTKImplMatrixAndMBLayoutFromValueObject<ElementType>(variableValue.first, variableValue.second);
+        std::pair<std::shared_ptr<const Matrix<ElementType>>, MBLayoutPtr> CNTKMatrixAndMBLayout;
+        auto packedValue = dynamic_cast<PackedValue*>(variableValue.second.get());
+        if (packedValue)
+            CNTKMatrixAndMBLayout = packedValue->PackedData<ElementType>();
+        else
+            CNTKMatrixAndMBLayout = GetCNTKImplMatrixAndMBLayoutFromValueObject<ElementType>(variableValue.first, variableValue.second);
+
        MBLayoutPtr layout = CNTKMatrixAndMBLayout.second;

        auto& nodeData = computationNode->As<ComputationNode<ElementType>>()->Value();
@ -1464,7 +1629,7 @@ namespace CNTK
            ValuePtr argumentValue = arguments.at(argument);

            MBLayoutPtr layout;
-            switch (argumentValue->Data()->GetDataType())
+            switch (argumentValue->GetDataType())
            {
            case DataType::Float:
                PopulateComputationNodeValue<float>({ argument, argumentValue }, argumentComputationNode);
@ -1473,7 +1638,7 @@ namespace CNTK
                PopulateComputationNodeValue<double>({ argument, argumentValue }, argumentComputationNode);
                break;
            default:
-                LogicError("Unsupported DataType %s", DataTypeName(argumentValue->Data()->GetDataType()));
+                LogicError("Unsupported DataType %s", DataTypeName(argumentValue->GetDataType()));
                break;
            }
        }
@ -1484,7 +1649,13 @@ namespace CNTK
    template <typename ElementType>
    /*static*/ void CompositeFunction::PopulateComputationNodeGradient(const std::pair<Variable, ValuePtr>& variableGradient, Microsoft::MSR::CNTK::ComputationNodeBasePtr& computationNode)
    {
-        auto CNTKMatrixAndMBLayout = GetCNTKImplMatrixAndMBLayoutFromValueObject<ElementType>(variableGradient.first, variableGradient.second);
+        std::pair<std::shared_ptr<const Matrix<ElementType>>, MBLayoutPtr> CNTKMatrixAndMBLayout;
+        auto packedValue = dynamic_cast<PackedValue*>(variableGradient.second.get());
+        if (packedValue)
+            CNTKMatrixAndMBLayout = packedValue->PackedData<ElementType>();
+        else
+            CNTKMatrixAndMBLayout = GetCNTKImplMatrixAndMBLayoutFromValueObject<ElementType>(variableGradient.first, variableGradient.second);
+
        MBLayoutPtr layout = CNTKMatrixAndMBLayout.second;
        auto nodeLayout = computationNode->GetMBLayout();
        if (((layout == nullptr) != (nodeLayout == nullptr)) || ((layout != nullptr) && (*layout != *nodeLayout)))
@ -1505,7 +1676,7 @@ namespace CNTK
            auto outputComputationNode = m_variableToNodeMap[gradientVarValuePair.first];
            ValuePtr gradientValue = gradientVarValuePair.second;

-            switch (gradientValue->Data()->GetDataType())
+            switch (gradientValue->GetDataType())
            {
            case DataType::Float:
                PopulateComputationNodeGradient<float>(gradientVarValuePair, outputComputationNode);
@ -1514,7 +1685,7 @@ namespace CNTK
                PopulateComputationNodeGradient<double>(gradientVarValuePair, outputComputationNode);
                break;
            default:
-                LogicError("Unsupported DataType %s", DataTypeName(gradientValue->Data()->GetDataType()));
+                LogicError("Unsupported DataType %s", DataTypeName(gradientValue->GetDataType()));
                break;
            }
        }
@ -1547,23 +1718,32 @@ namespace CNTK
        if (varValue != nullptr)
        {
            // TODO: The shape of the specified output Value object must match the actual output shape
-            if (varValue->Data()->Shape() != valueShape)
-                InvalidArgument("The shape %S of the specified Value object for %s does not match the actual shape %S", AsStringForErrorReporting(varValue->Data()->Shape()).c_str(), getGradient ? "gradient" : "output", AsStringForErrorReporting(valueShape).c_str());
+            if (varValue->Shape() != valueShape)
+                InvalidArgument("The shape %S of the specified Value object for %s does not match the actual shape %S", AsStringForErrorReporting(varValue->Shape()).c_str(), getGradient ? "gradient" : "output", AsStringForErrorReporting(valueShape).c_str());
        }

        ValuePtr nodeValue;
+        auto layout = computationNode->GetMBLayout();
        switch (var.GetDataType())
        {
        case DataType::Float:
-            nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(var,
-                                                                           getGradient ? computationNode->As<ComputationNode<float>>()->Gradient() : computationNode->As<ComputationNode<float>>()->Value(),
-                                                                           computationNode->GetMBLayout());
+        {
+            auto& matrix = getGradient ? computationNode->As<ComputationNode<float>>()->Gradient() : computationNode->As<ComputationNode<float>>()->Value();
+            if (varValue == nullptr)
+                nodeValue = MakeSharedObject<PackedValue>(var.Shape(), std::make_shared<Matrix<float>>(matrix.AsReference()), layout, /*readOnly =*/ false);
+            else
+                nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(var, matrix, layout);
            break;
+        }
        case DataType::Double:
-            nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(var,
-                                                                            getGradient ? computationNode->As<ComputationNode<double>>()->Gradient() : computationNode->As<ComputationNode<double>>()->Value(),
-                                                                            computationNode->GetMBLayout());
+        {
+            auto& matrix = getGradient ? computationNode->As<ComputationNode<double>>()->Gradient() : computationNode->As<ComputationNode<double>>()->Value();
+            if (varValue == nullptr)
+                nodeValue = MakeSharedObject<PackedValue>(var.Shape(), std::make_shared<Matrix<double>>(matrix.AsReference()), layout, /*readOnly =*/ false);
+            else
+                nodeValue = GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(var, matrix, layout);
            break;
+        }
        default:
            LogicError("Unsupported DataType %s", DataTypeName(var.GetDataType()));
            break;
@ -1605,6 +1785,20 @@ namespace CNTK
        }
    }

+    // Returns the Arguments of the composite wrapping the owner of 'output'; computed on first request and cached per output.
+    const std::vector<Variable>& CompositeFunction::GetArgumentDependencies(const Variable& output)
+    {
+        assert(output.IsOutput());
+        auto iter = m_perOutputVarArgumentDependencies.find(output);
+        if (iter == m_perOutputVarArgumentDependencies.end())
+        {
+            // Cache miss: insert once and keep the returned iterator instead of looking 'output' up twice more via operator[]
+            auto wrappedComposite = CompositeFunction::Create(output.Owner());
+            iter = m_perOutputVarArgumentDependencies.emplace(output, wrappedComposite->Arguments()).first;
+        }
+        return iter->second;
+    }
+
    /*virtual*/ BackPropStatePtr CompositeFunction::Forward(const std::unordered_map<Variable, ValuePtr>& arguments,
                                                            std::unordered_map<Variable, ValuePtr>& outputs,
                                                            const DeviceDescriptor& computeDevice,
@ -1641,8 +1835,31 @@ namespace CNTK
        else
            InvalidArgument("Unsupported DataType %s", DataTypeName(dataType));

+        std::unordered_set<Variable> functionOutputs(this->Outputs().begin(), this->Outputs().end());
+        std::vector<ComputationNodeBasePtr> outputsToEvaluate;
+        std::unordered_set<Variable> requiredArguments;
+        for (auto outputVarValuePair : outputs)
+        {
+            // Ensure that only a subset of this function's outputs are being asked to be evaluated
+            if (functionOutputs.find(outputVarValuePair.first) == functionOutputs.end())
+                InvalidArgument("Requested output is not an Ouptut of the Function");
+
+            auto& requiredArgumentsForCurrentOutput = GetArgumentDependencies(outputVarValuePair.first);
+            requiredArguments.insert(requiredArgumentsForCurrentOutput.begin(), requiredArgumentsForCurrentOutput.end());
+
+            auto outputComputationNode = m_variableToNodeMap[outputVarValuePair.first];
+            outputsToEvaluate.push_back(outputComputationNode);
+        }
+
        // TODO: Avoid copying the data when possible

+        // We should have argument values supplied for all required argument dependencies for the requested outputs
+        for (auto requiredArgument : requiredArguments)
+        {
+            if (arguments.find(requiredArgument) == arguments.end())
+                InvalidArgument("Function::Forward: Required argument's (%S) value that the requested output(s) depend on has not been provided", requiredArgument.Name().c_str());
+        }
+
        // Feed data into the arguments of the network
        PopulateNetworkInputs(arguments);

@ -1653,19 +1870,6 @@ namespace CNTK
        for (auto& nodeIter : dropoutNodes)
            nodeIter->SetEvalTimeStampOutdatedWrtAll();

-        std::unordered_set<Variable> functionOutputs(this->Outputs().begin(), this->Outputs().end());
-        std::vector<ComputationNodeBasePtr> outputsToEvaluate;
-
-        for (auto outputVarValuePair : outputs)
-        {
-            // Ensure that only a subset of this function's outputs are being asked to be evaluated
-            if (functionOutputs.find(outputVarValuePair.first) == functionOutputs.end())
-                InvalidArgument("Requested output is not an Ouptut of the Function");
-
-            auto outputComputationNode = m_variableToNodeMap[outputVarValuePair.first];
-            outputsToEvaluate.push_back(outputComputationNode);
-        }
-
        // The 'outputsToRetainBackwardStateFor' nodes also need to be evaluated if not already specified in 'outputs'
        for (auto rootVarForBackprop : outputsToRetainBackwardStateFor)
        {
@ -1879,7 +2083,7 @@ namespace CNTK
                newDynamicAxes.push_back(operandAxis);
        }

-        return Internal::Gather(operand, flags, newDynamicAxes);
+        return Internal::Gather(operand, flags, newDynamicAxes, name);
    }

    FunctionPtr Dropout(const Variable& operand, double dropoutRate, const std::wstring& name /*= L""*/)
@ -1968,23 +2172,25 @@ namespace CNTK

    FunctionPtr SquaredError(const Variable& prediction, const Variable& targets, const std::wstring& name/* = L""*/)
    {
-        return BinaryOp(PrimitiveOpType::SquaredError, prediction, targets, Dictionary(), name);
+        auto difference = Minus(prediction, targets);
+        auto squaredDifference = ElementTimes(difference, difference);
+        return Internal::ReduceElements(squaredDifference, PrimitiveFunction::InternalSumReductionOpName, Axis::AllStaticAxes(), name);
    }

    FunctionPtr CrossEntropyWithSoftmax(const Variable& prediction, const Variable& labels, const std::wstring& name/* = L""*/)
    {
-        return ReduceSum(Minus(ReduceLogSum(prediction, Axis(0)), TransposeTimes(labels, prediction)), name);
+        return Minus(ReduceLogSum(prediction, Axis(0)), TransposeTimes(labels, prediction), name);
    }

    FunctionPtr ClassificationError(const Variable& prediction, const Variable& labels, const std::wstring& name/* = L""*/)
    {
-        return ReduceSum(Minus(Constant::Scalar(prediction.GetDataType(), 1.0), TransposeTimes(labels, Hardmax(prediction))), name);
+        return Minus(Constant::Scalar(prediction.GetDataType(), 1.0), TransposeTimes(labels, Hardmax(prediction)), name);
    }

    FunctionPtr PastValue(const Variable& operand, const Variable& initialState, size_t offset, const std::wstring& name)
    {
        if (operand.DynamicAxes().size() != 2)
-            InvalidArgument("PastValue overload that does not explicitly specify a dynamic axis can only be used for operands with exactly one dynamic sequence-axis");
+            InvalidArgument("PastValue can only be used for operands with exactly one dynamic sequence-axis and one dynamic batch axis");

        auto additionalProperties = Dictionary();
        additionalProperties[PrimitiveFunction::AttributeNameOffset] = DictionaryValue(offset);
@ -1994,7 +2200,7 @@ namespace CNTK
    FunctionPtr FutureValue(const Variable& operand, const Variable& initialState, size_t offset, const std::wstring& name)
    {
        if (operand.DynamicAxes().size() != 2)
-            InvalidArgument("FutureValue overload that does not explicitly specify a dynamic axis can only be used for operands with exactly one dynamic sequence-axis");
+            InvalidArgument("FutureValue can only be used for operands with exactly one dynamic sequence-axis and one dynamic batch axis");

        auto additionalProperties = Dictionary();
        additionalProperties[PrimitiveFunction::AttributeNameOffset] = DictionaryValue(offset);
@ -2035,7 +2241,7 @@ namespace CNTK
        Constant meanVar(mean);
        Constant invStdDevVar(invStdDev);

-        return ElementTimes(Minus(operand, meanVar), invStdDevVar);
+        return ElementTimes(Minus(operand, meanVar), invStdDevVar, name);
    }

    FunctionPtr Convolution(const Variable& convolutionMap,
@ -2049,6 +2255,12 @@ namespace CNTK
                            size_t maxTempMemSizeInSamples,
                            const std::wstring& name)
    {
+        // Currently we require that the Convolution function's operand have a dynamic axis since otherwise
+        // the internal implementation incorrectly infers the batch axis dimension by picking up the first axis as 
+        // the sample shape and considering the rest to be part of the batch axis
+        if (operand.DynamicAxes().empty())
+            LogicError("Convolution currently requires the main operand to have dynamic axes");
+
        auto additionalProperties = Dictionary();
        additionalProperties[PrimitiveFunction::AttributeNameStrides] = strides;
        additionalProperties[PrimitiveFunction::AttributeNameSharing] = AsDictionaryValueVector(sharing);
@ -2129,16 +2341,18 @@ namespace CNTK
        return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Splice, operands, std::move(additionalProperties), name), name);
    }

-    FunctionPtr Combine(const std::vector<FunctionPtr>& operands, const std::wstring& name/* = L""*/)
+    FunctionPtr Combine(const std::vector<Variable>& operands, const std::wstring& name /*= L""*/)
    {
-        std::vector<Variable> inputs;
+        std::unordered_set<Variable> uniqueOperands;
        for (auto operand : operands)
        {
-            auto currentFunctionOutputs = operand->Outputs();
-            std::copy(currentFunctionOutputs.begin(), currentFunctionOutputs.end(), std::back_inserter(inputs));
+            if (uniqueOperands.find(operand) != uniqueOperands.end())
+                LogicError("All operands specified to Combine must be unique");
+
+            uniqueOperands.insert(operand);
        }

-        return Internal::Combine(inputs, name);
+        return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Combine, operands, Dictionary(), name), name);
    }

    namespace Sequence
@ -2153,25 +2367,25 @@ namespace CNTK
        FunctionPtr IsFirst(const Variable& operand, const std::wstring& name /*= L""*/)
        {
            VerifyIsSequence(operand);
-            return Internal::IsWithin(operand, 1);
+            return Internal::IsWithin(operand, 1, name);
        }

        FunctionPtr IsLast(const Variable& operand, const std::wstring& name /*= L""*/)
        {
            VerifyIsSequence(operand);
-            return Internal::IsWithin(operand, -1);
+            return Internal::IsWithin(operand, -1, name);
        }

        FunctionPtr First(const Variable& operand, const std::wstring& name /*= L""*/)
        {
            VerifyIsSequence(operand);
-            return Slice(operand, operand.DynamicAxes()[0], 0, 1);
+            return Slice(operand, operand.DynamicAxes()[0], 0, 1, name);
        }

        FunctionPtr Last(const Variable& operand, const std::wstring& name /*= L""*/)
        {
            VerifyIsSequence(operand);
-            return Slice(operand, operand.DynamicAxes()[0], -1, 0);
+            return Slice(operand, operand.DynamicAxes()[0], -1, 0, name);
        }

        std::vector<Axis> WhereOpDynamicAxes(const Variable& operand)
@ -2211,20 +2425,6 @@ namespace CNTK

    namespace Internal
    {
-        FunctionPtr Combine(const std::vector<Variable>& operands, const std::wstring& name /*= L""*/)
-        {
-            std::unordered_set<Variable> uniqueOperands;
-            for (auto operand : operands)
-            {
-                if (uniqueOperands.find(operand) != uniqueOperands.end())
-                    LogicError("All operands specified to Combine must be unique");
-
-                uniqueOperands.insert(operand);
-            }
-
-            return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Combine, operands, Dictionary(), name), name);
-        }
-
        FunctionPtr IsWithin(const Variable& operand, int offset, const std::wstring& name /*= L""*/)
        {
            Sequence::VerifyIsSequence(operand);
@ -2266,14 +2466,8 @@ namespace CNTK
            }
            else
            {
-                auto rowSliceFunc = Internal::Slice(operand, Axis(0), 0, 1);
-                auto result = Minus(rowSliceFunc, rowSliceFunc);
-
-                // Reduce away all but the static axis 0
-                for (size_t i = 1; i < result->Output().Shape().Rank(); ++i)
-                    result = ReduceSum(result, Axis(i));
-
-                return result;
+                auto reduceAllStaticAxesFunc = Internal::ReduceElements(operand, PrimitiveFunction::InternalSumReductionOpName, Axis::AllStaticAxes());
+                return Minus(reduceAllStaticAxesFunc, reduceAllStaticAxesFunc);
            }
        }

@ -2286,12 +2480,12 @@ namespace CNTK

        FunctionPtr Gather(const Variable& operand, const Variable& condition, const std::vector<Axis>& newDynamicAxes, const std::wstring& name /*= L""*/)
        {
-            return Internal::GatherPacked(operand, Internal::PackedIndex(operand, Where(condition, newDynamicAxes)));
+            return Internal::GatherPacked(operand, Internal::PackedIndex(/*layout of*/ operand, Where(condition, newDynamicAxes)), name);
        }

        FunctionPtr Scatter(const Variable& operand, const Variable& condition, const std::vector<Axis>& newDynamicAxes, const std::wstring& name /*= L""*/)
        {
-            return Internal::ScatterPacked(operand, Internal::PackedIndex(operand, Where(condition, newDynamicAxes)), condition);
+            return Internal::ScatterPacked(operand, Internal::PackedIndex(/*layout of*/ condition, Where(condition, newDynamicAxes)), /*layout of*/ condition, name);
        }

        FunctionPtr Slice(const Variable& operand, const Axis& axis, int beginIndex, int endIndex, const std::wstring& name /*= L""*/)
@ -2308,7 +2502,7 @@ namespace CNTK
        {
            using namespace std::placeholders;

-            if (axis.IsStaticAxis())
+            if (axis.IsStaticAxis() || (axis == Axis::AllStaticAxes()))
            {
                auto additionalProperties = Dictionary();
                additionalProperties[PrimitiveFunction::AttributeNameAxis] = axis;
@ -2332,7 +2526,7 @@ namespace CNTK
            auto cumulativeSumFunction = reductionFunctor(prevAccumulatedValuesFunction, operand);
            cumulativeSumFunction->ReplacePlaceholders({ { cumulativeSumFunctionPlaceholder, cumulativeSumFunction } });

-            return CNTK::Slice(cumulativeSumFunction, axis, -1, 0);
+            return CNTK::Slice(cumulativeSumFunction, axis, -1, 0, name);
        }
   }
 }
--- a/Source/CNTKv2LibraryDll/Function.h
+++ b/Source/CNTKv2LibraryDll/Function.h
@ -77,54 +77,54 @@ namespace std

 namespace CNTK
 {
-    inline const char* PrimitiveOpTypeName(PrimitiveOpType opType)
+    inline const std::wstring& PrimitiveOpTypeName(PrimitiveOpType opType)
    {
-        static const std::unordered_map<PrimitiveOpType, const char*> primitiveOpNames = {
-            { PrimitiveOpType::Negate, "Negate" },
-            { PrimitiveOpType::Sigmoid, "Sigmoid" },
-            { PrimitiveOpType::Tanh, "Tanh" },
-            { PrimitiveOpType::ReLU, "ReLU" },
-            { PrimitiveOpType::Exp, "Exp" },
-            { PrimitiveOpType::Log, "Log" },
-            { PrimitiveOpType::Sqrt, "Sqrt" },
-            { PrimitiveOpType::Floor, "Floor" },
-            { PrimitiveOpType::Abs, "Abs" },
-            { PrimitiveOpType::Reciprocal, "Reciprocal" },
-            { PrimitiveOpType::Softmax, "Softmax" },
-            { PrimitiveOpType::Hardmax, "Hardmax" },
-            { PrimitiveOpType::TransposeAxes, "TransposeAxes" },
-            { PrimitiveOpType::Where, "Where" },
-            { PrimitiveOpType::Slice, "Slice" },
-            { PrimitiveOpType::Dropout, "Dropout" },
-            { PrimitiveOpType::Reshape, "Reshape" },
-            { PrimitiveOpType::Pooling, "Pooling" },
-            { PrimitiveOpType::SumAll, "SumAll" },
-            { PrimitiveOpType::Plus, "Plus" },
-            { PrimitiveOpType::Minus, "Minus" },
-            { PrimitiveOpType::ElementTimes, "ElementTimes" },
-            { PrimitiveOpType::Equal, "Equal" },
-            { PrimitiveOpType::NotEqual, "NotEqual" },
-            { PrimitiveOpType::Less, "Less" },
-            { PrimitiveOpType::LessEqual, "LessEqual" },
-            { PrimitiveOpType::Greater, "Greater" },
-            { PrimitiveOpType::GreaterEqual, "GreaterEqual" },
-            { PrimitiveOpType::PackedIndex, "PackedIndex" },
-            { PrimitiveOpType::GatherPacked, "GatherPacked" },
-            { PrimitiveOpType::ScatterPacked, "ScatterPacked" },
-            { PrimitiveOpType::Times, "Times" },
-            { PrimitiveOpType::TransposeTimes, "TransposeTimes" },
-            { PrimitiveOpType::Convolution, "Convolution" },
-            { PrimitiveOpType::SquaredError, "SquaredError" },
-            { PrimitiveOpType::CrossEntropyWithSoftmax, "CrossEntropyWithSoftmax" },
-            { PrimitiveOpType::ClassificationError, "ClassificationError" },
-            { PrimitiveOpType::PastValue, "PastValue" },
-            { PrimitiveOpType::FutureValue, "FutureValue" },
-            { PrimitiveOpType::ReduceElements, "ReduceElements" },
-            { PrimitiveOpType::BatchNormalization, "BatchNormalization" },
-            { PrimitiveOpType::Clip, "Clip" },
-            { PrimitiveOpType::Select, "Select" },
-            { PrimitiveOpType::Splice, "Splice" },
-            { PrimitiveOpType::Combine, "Combine" }
+        static const std::unordered_map<PrimitiveOpType, std::wstring> primitiveOpNames = {
+            { PrimitiveOpType::Negate, L"Negate" },
+            { PrimitiveOpType::Sigmoid, L"Sigmoid" },
+            { PrimitiveOpType::Tanh, L"Tanh" },
+            { PrimitiveOpType::ReLU, L"ReLU" },
+            { PrimitiveOpType::Exp, L"Exp" },
+            { PrimitiveOpType::Log, L"Log" },
+            { PrimitiveOpType::Sqrt, L"Sqrt" },
+            { PrimitiveOpType::Floor, L"Floor" },
+            { PrimitiveOpType::Abs, L"Abs" },
+            { PrimitiveOpType::Reciprocal, L"Reciprocal" },
+            { PrimitiveOpType::Softmax, L"Softmax" },
+            { PrimitiveOpType::Hardmax, L"Hardmax" },
+            { PrimitiveOpType::TransposeAxes, L"TransposeAxes" },
+            { PrimitiveOpType::Where, L"Where" },
+            { PrimitiveOpType::Slice, L"Slice" },
+            { PrimitiveOpType::Dropout, L"Dropout" },
+            { PrimitiveOpType::Reshape, L"Reshape" },
+            { PrimitiveOpType::Pooling, L"Pooling" },
+            { PrimitiveOpType::SumAll, L"SumAll" },
+            { PrimitiveOpType::Plus, L"Plus" },
+            { PrimitiveOpType::Minus, L"Minus" },
+            { PrimitiveOpType::ElementTimes, L"ElementTimes" },
+            { PrimitiveOpType::Equal, L"Equal" },
+            { PrimitiveOpType::NotEqual, L"NotEqual" },
+            { PrimitiveOpType::Less, L"Less" },
+            { PrimitiveOpType::LessEqual, L"LessEqual" },
+            { PrimitiveOpType::Greater, L"Greater" },
+            { PrimitiveOpType::GreaterEqual, L"GreaterEqual" },
+            { PrimitiveOpType::PackedIndex, L"PackedIndex" },
+            { PrimitiveOpType::GatherPacked, L"GatherPacked" },
+            { PrimitiveOpType::ScatterPacked, L"ScatterPacked" },
+            { PrimitiveOpType::Times, L"Times" },
+            { PrimitiveOpType::TransposeTimes, L"TransposeTimes" },
+            { PrimitiveOpType::Convolution, L"Convolution" },
+            { PrimitiveOpType::SquaredError, L"SquaredError" },
+            { PrimitiveOpType::CrossEntropyWithSoftmax, L"CrossEntropyWithSoftmax" },
+            { PrimitiveOpType::ClassificationError, L"ClassificationError" },
+            { PrimitiveOpType::PastValue, L"PastValue" },
+            { PrimitiveOpType::FutureValue, L"FutureValue" },
+            { PrimitiveOpType::ReduceElements, L"ReduceElements" },
+            { PrimitiveOpType::BatchNormalization, L"BatchNormalization" },
+            { PrimitiveOpType::Clip, L"Clip" },
+            { PrimitiveOpType::Select, L"Select" },
+            { PrimitiveOpType::Splice, L"Splice" },
+            { PrimitiveOpType::Combine, L"Combine" },
        };

        if (primitiveOpNames.find(opType) == primitiveOpNames.end())
@ -220,7 +220,7 @@ namespace CNTK

    public:
        PrimitiveFunction(PrimitiveOpType op, const std::vector<Variable>& inputs, Dictionary&& functionConfig, const std::wstring& functionName = L"")
-            : Function(inputs, GetOutputVariables(op, inputs, this, functionConfig), std::move(functionConfig), nullptr, functionName), m_op(op)
+            : Function(inputs, GetOutputVariables(op, inputs, this, functionConfig, functionName), std::move(functionConfig), nullptr, functionName), m_op(op)
        {
        }

@ -239,6 +239,11 @@ namespace CNTK
            NOT_IMPLEMENTED;
        }

+        virtual const std::wstring& OpName() override
+        {
+            return PrimitiveOpTypeName(OpType());
+        }
+
    public:
        PrimitiveOpType OpType() const
        {
@ -343,7 +348,10 @@ namespace CNTK
                else
                {
                    if (leftOperandShape[i] != rightOperandShape[i])
-                        RuntimeError("Left operand's shape %S is not compatible with right operand's shape %S for the binary elementwise operation %s", AsStringForErrorReporting(leftOperandShape).c_str(), AsStringForErrorReporting(rightOperandShape).c_str(), PrimitiveOpTypeName(op));
+                        RuntimeError("Left operand's shape %S is not compatible with right operand's shape %S for the binary elementwise operation %S",
+                                     AsStringForErrorReporting(leftOperandShape).c_str(),
+                                     AsStringForErrorReporting(rightOperandShape).c_str(),
+                                     PrimitiveOpTypeName(op).c_str());

                    outputDims[i] = leftOperandShape[i];
                }
@ -399,19 +407,25 @@ namespace CNTK
            return leftOperandShape.SubShape(0, outputRank).AppendShape(rightOperandShape.SubShape(numReductionAxes));
        }

-        static NDShape ReductionOpOutputShape(PrimitiveOpType op, const NDShape& operandShape, const std::vector<size_t>& reductionAxes)
+        static NDShape ReductionOpOutputShape(PrimitiveOpType op, const NDShape& operandShape, const std::vector<size_t>& reductionAxes, bool preserveReductionAxes)
        {
            if (reductionAxes.size() > operandShape.Rank())
-                RuntimeError("The number of reduction axes %d exceeds the number of axes in the operand shape %S of the reduction operation %s", (int)reductionAxes.size(), AsStringForErrorReporting(operandShape).c_str(), PrimitiveOpTypeName(op));
+                RuntimeError("The number of reduction axes %d exceeds the rank in the operand shape %S of the reduction operation %S",
+                             (int)reductionAxes.size(),
+                             AsStringForErrorReporting(operandShape).c_str(),
+                             PrimitiveOpTypeName(op).c_str());

-            size_t numOutputAxes = operandShape.Rank() - reductionAxes.size();
+            size_t numOutputAxes = operandShape.Rank() - (preserveReductionAxes ? 0 : reductionAxes.size());
            std::vector<size_t> outputDims(numOutputAxes);
            for (size_t i = 0, j = 0; i < operandShape.Rank(); ++i)
            {
                // Skip axes being reduced over
                if (std::find(reductionAxes.begin(), reductionAxes.end(), i) != reductionAxes.end())
-                    continue;
-
+                {
+                    if (preserveReductionAxes)
+                        outputDims[j++] = 1;
+                }
+                else
                    outputDims[j++] = operandShape[i];
            }

@ -433,7 +447,7 @@ namespace CNTK
        }

        // TODO: Reconcile this with the ComputationNode::Validate functionality in core CNTK to avoid duplication of inference logic
-        static std::vector<Variable> GetOutputVariables(PrimitiveOpType op, const std::vector<Variable>& inputs, Function* owner, const Dictionary& functionConfig);
+        static std::vector<Variable> GetOutputVariables(PrimitiveOpType op, const std::vector<Variable>& inputs, Function* owner, const Dictionary& functionConfig, const std::wstring& functionName);

    private:
        PrimitiveOpType m_op;
@ -464,6 +478,7 @@ namespace CNTK
        friend class Function;
        friend class Trainer;
        friend class CompositeMinibatchSource;
+        friend class PackedValue;

        template <typename T, typename ...CtorArgTypes>
        friend inline std::shared_ptr<T> MakeSharedObject(CtorArgTypes&& ...ctorArgs);
@ -476,6 +491,8 @@ namespace CNTK

        static std::atomic<unsigned int> s_nextAutoGeneratedDynamicAxis;

+        static const std::wstring CompositeFunctionOpName;
+
    public:
        static const std::wstring InternalDefaultDynamicAxisName;
        static const std::wstring InternalNoSequenceAxisName;
@ -506,15 +523,9 @@ namespace CNTK
                              const std::unordered_map<Variable, ValuePtr>& rootGradientValues,
                              std::unordered_map<Variable, ValuePtr>& backPropagatedGradientValuesForInputs) override;

-    public:
-        bool NetworkMatricesAllocated() const 
+        virtual const std::wstring& OpName() override
        {
-            return (m_computationNetwork != nullptr) && m_networkMatricesAllocated; 
-        }
-
-        void PurgeComputationNetwork()
-        {
-            m_computationNetwork = nullptr;
+            return CompositeFunctionOpName;
        }

    private:
@ -523,7 +534,7 @@ namespace CNTK
                                                std::unordered_set<Variable>& replacedPlaceholders) override;

        CompositeFunction(const FunctionPtr& rootFunction, std::unordered_set<FunctionPtr>&& allPrimitiveFunctions, const std::wstring& name)
-            : Function({}, rootFunction->Outputs(), Dictionary(), rootFunction, name), m_allPrimitiveFunctions(std::move(allPrimitiveFunctions))
+            : Function({}, rootFunction->Outputs(), Dictionary(), rootFunction, name), m_allPrimitiveFunctions(std::move(allPrimitiveFunctions)), m_networkMatricesAllocated(false)
        {}

        std::vector<Variable> DetermineInputs() const
@ -597,6 +608,8 @@ namespace CNTK
        template <typename ElementType>
        static ValuePtr GetValueObjectFromCNTKImplMatrixAndMBLayout(Variable var, const Microsoft::MSR::CNTK::Matrix<ElementType>& matrix, const Microsoft::MSR::CNTK::MBLayoutPtr& layout, bool readOnly = true);

+        const std::vector<Variable>& GetArgumentDependencies(const Variable& output);
+
    private:

        // Set of all primitive functions in the graph underlying 'this' Function. Also keeps the primitive Function objects alive 
@ -617,6 +630,8 @@ namespace CNTK
        // the next 'Backward' call.
        std::unordered_set<Variable> m_currentBackpropRoots;

+        std::unordered_map<Variable, std::vector<Variable>> m_perOutputVarArgumentDependencies;
+
        bool m_networkMatricesAllocated;
    };

--- a/Source/CNTKv2LibraryDll/Learner.cpp
+++ b/Source/CNTKv2LibraryDll/Learner.cpp
@ -26,6 +26,9 @@ using namespace std;

 namespace CNTK
 {
+    /*static*/ const std::wstring Learner::LearningRateAttributeName = L"learningRate";
+    /*static*/ const std::wstring LearnerBase::WasLearningRateResetAttributeName = L"wasLearningRateReset";
+
    template <typename ElementType>
    /*static*/ shared_ptr<const Matrix<ElementType>> LearnerBase::GetMatrix(const NDArrayViewPtr& arrayView)
    {
@ -141,7 +144,7 @@ namespace CNTK
        // L1 regularizer with proximal gradient descent method
        if (m_additionalOptions.l1RegularizationWeight > 0)
        {
-            auto learningRate = ElementType(m_learningRates[m_sampleCount]);
+            auto learningRate = ElementType(LearningRate());
            // multiply by actualMBSize so that it's invariant to minibatch size since learning rate is per sample
            auto weight = ElementType(learningRate * m_additionalOptions.l1RegularizationWeight * actualMBSize);
            parameterValue->GetWritableMatrix<ElementType>()->InplaceSoftThreshold(weight);
@ -156,17 +159,15 @@ namespace CNTK

    LearnerBase::LearnerBase(const vector<Parameter>& parameters, 
                             const LearningRatesPerSample& learningRates,
-                             bool allocateSmoothGradients /* = true */,
-                             double clippingThresholdPerSample /*= std::numeric_limits<double>::infinity()*/,
-                             bool gradientClippingWithTruncation /*= true*/)
-        : Learner(parameters),
-        m_learningRates(learningRates),
+                             AdditionalLearningOptions additionalOptions,
+                             bool allocateSmoothGradients /* = true */)
+        : Learner(parameters, learningRates[0]),
+        m_wasLearningRateReset(false),
+        m_learningRateSchedule(learningRates),
        m_sampleCount(0),
-        m_minibatchCount(0)
+        m_minibatchCount(0),
+        m_additionalOptions(additionalOptions)
    {
-        m_additionalOptions.gradientClippingThresholdPerSample = clippingThresholdPerSample;
-        m_additionalOptions.gradientClippingWithTruncation = gradientClippingWithTruncation;
-
        for (const auto& parameter : parameters)
        {
            if (!allocateSmoothGradients)
@ -225,8 +226,8 @@ namespace CNTK
 #endif

 #if DUMPOUTPUT
-            auto learningRate = ElementType(m_learningRates[m_sampleCount]);
-            auto momentum = ElementType(MomentumPerMB(m_momentums[m_sampleCount], trainingSampleCount));
+            auto learningRate = ElementType(LearningRate());
+            auto momentum = ElementType(MomentumValueForMB(m_momentumValues[m_sampleCount], trainingSampleCount));
            LOGPRINTF(stderr, "learnRatePerSample=%0.8f, momentum=%0.8f, actualMBSize=%ld\n",
                        learningRate, momentum, trainingSampleCount);
            LOGPRINTF(stderr, "GradUpdateType()=%s, GradientUpdateNoiseStd()=%0.8f\n",
@ -280,6 +281,9 @@ namespace CNTK
        checkpoint[L"sampleCount"] = m_sampleCount;
        checkpoint[L"minibatchCount"] = m_minibatchCount;

+        if (m_wasLearningRateReset)
+            checkpoint[WasLearningRateResetAttributeName] = m_wasLearningRateReset;
+
        // TODO: should we also save learning rate schedule into the checkpoint?
        // If that is the case, need to be able to override this method in subclasses
        // and save momentum schedule as well.
@ -294,11 +298,19 @@ namespace CNTK
            const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
            checkpoint[parameter.Uid()] = *smoothedGradientValue;
        }
+
+        // Add the base Learner's checkpoint state
+        auto baseCheckpointState = Learner::GetCheckpointState();
+        checkpoint.Add(baseCheckpointState);
+
        return checkpoint;
    }

    /*virtual*/ void LearnerBase::RestoreFromCheckpoint(const Dictionary& checkpoint) /*override*/
    {
+        // Restore the base learner's checkpoint state
+        Learner::RestoreFromCheckpoint(checkpoint);
+
        m_sampleCount = checkpoint[L"sampleCount"].Value<size_t>();
        m_minibatchCount = checkpoint[L"minibatchCount"].Value<size_t>();

@ -309,6 +321,9 @@ namespace CNTK
            LogicError("Unsupported checkpoint version.");
        }

+        if (checkpoint.Contains(WasLearningRateResetAttributeName))
+            m_wasLearningRateReset = checkpoint[WasLearningRateResetAttributeName].Value<bool>();
+
        for (const auto& parameter : Parameters())
        {
            if (!checkpoint.Contains(parameter.Uid()))
@ -348,25 +363,16 @@ namespace CNTK
        const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
        const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);

-        auto learningRate = ElementType(m_learningRates[m_sampleCount]);
-        auto momentum = ElementType(MomentumPerMB(m_momentums[m_sampleCount], trainingSampleCount));
+        auto learningRate = ElementType(LearningRate());
+        auto momentum = ElementType(MomentumValueForMB(m_momentumValues[m_sampleCount], trainingSampleCount));

        // TODO: break up the NormalGrad into 3 different functions, each with its own set of parameters
+        // Also, come up with a better name for NormalGrad (Default? Regular? Plain?).
        // (one for vanilla SGD, the other for momentum SGD, and the third one for NAG).
        smoothedGradientMatrix->NormalGrad(*gradientMatrix, *parameterMatrix,
                                           learningRate, momentum, m_useNesterovAcceleration);
    }

-    LearnerAdaGrad::LearnerAdaGrad(const vector<Parameter>& parameters,
-                                   const LearningRatesPerSample& learningRates,
-                                   bool needAveMultiplier,
-                                   double clippingThresholdPerSample /*= std::numeric_limits<double>::infinity()*/,
-                                   bool gradientClippingWithTruncation /*= true*/)
-        : LearnerBase(parameters, learningRates, true, clippingThresholdPerSample, gradientClippingWithTruncation), 
-        m_needAveMultiplier(needAveMultiplier)
-    {
-    }
-
    /*virtual*/ void LearnerAdaGrad::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
    {
        UPDATE_FUNCTION;
@ -382,7 +388,7 @@ namespace CNTK
        const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
        const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);

-        auto learningRate = ElementType(m_learningRates[m_sampleCount]);
+        auto learningRate = ElementType(LearningRate());

        auto aveMultiplier = smoothedGradientMatrix->Adagrad(*gradientMatrix, m_needAveMultiplier);
        Matrix<ElementType>::ScaleAndAdd(ElementType(-learningRate / aveMultiplier), *gradientMatrix, *parameterMatrix);
@ -390,16 +396,20 @@ namespace CNTK

    LearnerFSAdaGrad::LearnerFSAdaGrad(const vector<Parameter>& parameters,
                                       const LearningRatesPerSample& learningRates, 
-                                       const MomentumsPerSample& momentums,
-                                       double clippingThresholdPerSample /*= std::numeric_limits<double>::infinity()*/,
-                                       bool gradientClippingWithTruncation /*= true*/)
-        : LearnerMomentumSGD(parameters, learningRates, momentums, /*allocateSmoothGradients*/ false, clippingThresholdPerSample, gradientClippingWithTruncation)
+                                       const MomentumValuesPerSample& momentumValues,
+                                       const double targetAdagradAvDenom,
+                                       const size_t adagradT,
+                                       AdditionalLearningOptions additionalOptions)
+        : LearnerMomentumSGD(parameters, learningRates, momentumValues, additionalOptions, /*allocateSmoothGradients*/ false),
+        m_targetAdagradAvDenom(targetAdagradAvDenom),
+        m_adagradT(adagradT)
    {
        for (const auto& parameter : parameters)
        {  
            auto shape = GetMatrixShape(parameter);
            NDArrayViewPtr view = AllocateNDArrayView(parameter, {shape[0], 2 * shape[1]});
            m_smoothedGradientValues.insert(make_pair(parameter, view));
+            m_smoothedCounts.insert(make_pair(parameter, 0.0));
        }
    }

@ -411,36 +421,31 @@ namespace CNTK
    // Applies one FSAdaGrad step to a single parameter by delegating to Matrix::FSAdagradUpdate.
    //   parameter             - the parameter whose value matrix is updated
    //   gradientValue         - gradient for this parameter
    //   smoothedGradientValue - this learner's smoothed-gradient state for the parameter
    //   trainingSampleCount   - number of samples in the current minibatch; used both to convert the
    //                           scheduled momentum via MomentumValueForMB and to compute varMomentum
    template <typename ElementType>
    void LearnerFSAdaGrad::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
    {
-        UNUSED(trainingSampleCount);
-
        const auto& parameterValue = parameter.Value();
        const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue);
        const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
        const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);
        
-        auto learningRate = m_learningRates[m_sampleCount];
-        auto momentum = MomentumPerMB(m_momentums[m_sampleCount], trainingSampleCount);
+        auto learningRate = LearningRate();
+        auto momentum = MomentumValueForMB(m_momentumValues[m_sampleCount], trainingSampleCount);

-        const double targetAdagradAvDenom = 0.0025; // 1/400 magic constant
-        const size_t adagradT = 2 * 3600 * 100;
        // varMomentum = exp(-trainingSampleCount / m_adagradT), computed in double precision.
+        const double varMomentum = (exp(-1.0 * trainingSampleCount / m_adagradT));
        // Per-parameter smoothed count (replaces the former function-local static shared by all
        // parameters). It is bound by non-const reference and handed to FSAdagradUpdate;
        // presumably FSAdagradUpdate updates it in place - confirm against Matrix::FSAdagradUpdate.
        // unordered_map::at() throws if the parameter has no entry; entries are created in the constructor.
+        double& smoothedCount = m_smoothedCounts.at(parameter); 

-        const double varMomentum = (exp(-1.0 * trainingSampleCount / adagradT));
-        static double smoothedCount = 0;  // BUGBUG!!! Carried over from Alexey's original implementation, needs to be fixed.
-
-        smoothedGradientMatrix->FSAdagradUpdate(trainingSampleCount, *gradientMatrix, *parameterMatrix, smoothedCount, learningRate, targetAdagradAvDenom, momentum, varMomentum);
+        smoothedGradientMatrix->FSAdagradUpdate(trainingSampleCount, *gradientMatrix, *parameterMatrix, smoothedCount, learningRate, m_targetAdagradAvDenom, momentum, varMomentum);
    }

-    LearnerRMSProp::LearnerRMSProp(const vector<Parameter>& parameters, const LearningRatesPerSample& learningRates,
-                                   double gamma, double inc, double dec, double max, double min, bool needAveMultiplier,
-                                   double clippingThresholdPerSample /*= std::numeric_limits<double>::infinity()*/,
-                                   bool gradientClippingWithTruncation /*= true*/)
-    : LearnerBase(parameters, learningRates, /*allocateSmoothGradients*/ false, clippingThresholdPerSample, gradientClippingWithTruncation),
+    LearnerRMSProp::LearnerRMSProp(const vector<Parameter>& parameters, 
+                                   const LearningRatesPerSample& learningRates,
+                                   double gamma, double inc, double dec, double max, double min,
+                                   bool needAveMultiplier,
+                                   AdditionalLearningOptions additionalOptions)
+    : LearnerBase(parameters, learningRates, additionalOptions, /*allocateSmoothGradients*/ false),
    m_gamma(gamma), m_inc(inc), m_dec(dec), m_max(max), m_min(min), m_needAveMultiplier(needAveMultiplier)
    {
        for (const auto& parameter : parameters)
        {  
            // When needAveMultiplier == true, CPU and GPU implementations of RMSProp require different number of columns.
-            // TODO: verify that this is correct.
            size_t factor = 3;
            if (needAveMultiplier && parameter.Value()->Device().Type() == DeviceKind::GPU)
            {
@ -469,12 +474,15 @@ namespace CNTK
        const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
        const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);

-        auto learningRate = ElementType(m_learningRates[m_sampleCount]);
+        auto learningRate = ElementType(LearningRate());

        auto aveMultiplier = smoothedGradientMatrix->RmsProp(*gradientMatrix,
-                                                             ElementType(m_gamma), ElementType(m_inc),
-                                                             ElementType(m_max), ElementType(m_dec),
-                                                             ElementType(m_min), m_needAveMultiplier);
+                                                             ElementType(m_gamma), 
+                                                             ElementType(m_inc),
+                                                             ElementType(m_max), 
+                                                             ElementType(m_dec),
+                                                             ElementType(m_min), 
+                                                             m_needAveMultiplier);
        Matrix<ElementType>::ScaleAndAdd(ElementType(-learningRate / aveMultiplier), *gradientMatrix, *parameterMatrix);
    }

@ -484,54 +492,51 @@ namespace CNTK
    
    // Factory: creates a LearnerSGD for the given parameters and learning-rate schedule
    // and returns it as a LearnerPtr.
    // additionalOptions is passed through to the learner unchanged; its default is only shown in the
    // inline comment here (presumably declared with the prototype elsewhere - confirm).
    LearnerPtr SGDLearner(const vector<Parameter>& parameters,
                          const LearningRatesPerSample& learningRates,
-                          double clippingThresholdPerSample /*= std::numeric_limits<double>::infinity()*/,
-                          bool gradientClippingWithTruncation /*= true*/)
+                          AdditionalLearningOptions additionalOptions /*= AdditionalLearningOptions()*/)
    {
-        return MakeSharedObject<LearnerSGD>(parameters, learningRates, true, clippingThresholdPerSample, gradientClippingWithTruncation);
+        return MakeSharedObject<LearnerSGD>(parameters, learningRates, additionalOptions);
    }

    // Factory: creates a LearnerMomentumSGD for the given parameters, learning-rate schedule and
    // momentum schedule, and returns it as a LearnerPtr.
    // additionalOptions is passed through to the learner unchanged.
    LearnerPtr MomentumSGDLearner(const vector<Parameter>& parameters,
                                  const LearningRatesPerSample& learningRates,
-                                  const MomentumsPerSample& momentums,
-                                  double clippingThresholdPerSample /*= std::numeric_limits<double>::infinity()*/,
-                                  bool gradientClippingWithTruncation /*= true*/)
+                                  const MomentumValuesPerSample& momentumValues,
+                                  AdditionalLearningOptions additionalOptions /*= AdditionalLearningOptions()*/)
    {
-        return MakeSharedObject<LearnerMomentumSGD>(parameters, learningRates, momentums, true, clippingThresholdPerSample, gradientClippingWithTruncation);
+        return MakeSharedObject<LearnerMomentumSGD>(parameters, learningRates, momentumValues, additionalOptions);
    }

    // Factory: creates a LearnerNesterov for the given parameters, learning-rate schedule and
    // momentum schedule, and returns it as a LearnerPtr.
    // additionalOptions is passed through to the learner unchanged.
    LearnerPtr NesterovLearner(const vector<Parameter>& parameters,
                               const LearningRatesPerSample& learningRates,
-                               const MomentumsPerSample& momentums,
-                               double clippingThresholdPerSample /*= std::numeric_limits<double>::infinity()*/,
-                               bool gradientClippingWithTruncation /*= true*/)
+                               const MomentumValuesPerSample& momentumValues,
+                               AdditionalLearningOptions additionalOptions /*= AdditionalLearningOptions()*/)
    {
-        return MakeSharedObject<LearnerNesterov>(parameters, learningRates, momentums, clippingThresholdPerSample, gradientClippingWithTruncation);
+        return MakeSharedObject<LearnerNesterov>(parameters, learningRates, momentumValues, additionalOptions);
    }

    // Factory: creates a LearnerFSAdaGrad and returns it as a LearnerPtr.
    // targetAdagradAvDenom and adagradT were previously hard-coded inside LearnerFSAdaGrad::Update
    // (0.0025 and 2 * 3600 * 100); they are now caller-supplied and forwarded to the learner.
    // The defaults appear only in the inline comments here (presumably declared with the prototype
    // elsewhere - confirm).
    LearnerPtr FSAdaGradLearner(const vector<Parameter>& parameters,
                                const LearningRatesPerSample& learningRates,
-                                const MomentumsPerSample& momentums,
-                                double clippingThresholdPerSample /*= std::numeric_limits<double>::infinity()*/,
-                                bool gradientClippingWithTruncation /*= true*/)
+                                const MomentumValuesPerSample& momentumValues,
+                                const double targetAdagradAvDenom /*= 0.0025*/,
+                                const size_t adagradT /*= 2 * 3600 * 100*/,
+                                AdditionalLearningOptions additionalOptions /*= AdditionalLearningOptions()*/)
    {
-        return MakeSharedObject<LearnerFSAdaGrad>(parameters, learningRates, momentums, clippingThresholdPerSample, gradientClippingWithTruncation);
+        return MakeSharedObject<LearnerFSAdaGrad>(parameters, learningRates, momentumValues, targetAdagradAvDenom, adagradT, additionalOptions);
    }

    // Factory: creates a LearnerAdaGrad for the given parameters and learning-rate schedule
    // and returns it as a LearnerPtr.
    // needAveMultiplier and additionalOptions are passed through to the learner unchanged.
    LearnerPtr AdaGradLearner(const vector<Parameter>& parameters,
                              const LearningRatesPerSample& learningRates,
                              bool needAveMultiplier /*= true*/,
-                              double clippingThresholdPerSample /*= std::numeric_limits<double>::infinity()*/,
-                              bool gradientClippingWithTruncation /*= true*/)
+                              AdditionalLearningOptions additionalOptions /*= AdditionalLearningOptions()*/)
    {
-        return MakeSharedObject<LearnerAdaGrad>(parameters, learningRates, needAveMultiplier, clippingThresholdPerSample, gradientClippingWithTruncation);
+        return MakeSharedObject<LearnerAdaGrad>(parameters, learningRates, needAveMultiplier, additionalOptions);
    }

-    LearnerPtr RMSPropLearner(const vector<Parameter>& parameters, const LearningRatesPerSample& learningRates,
+    LearnerPtr RMSPropLearner(const vector<Parameter>& parameters,
+                              const LearningRatesPerSample& learningRates,
                              double gamma, double inc, double dec, double max, double min, 
                              bool needAveMultiplier /*= true*/,
-                              double clippingThresholdPerSample /*= std::numeric_limits<double>::infinity()*/,
-                              bool gradientClippingWithTruncation /*= true*/)
+                              AdditionalLearningOptions additionalOptions /*= AdditionalLearningOptions()*/)
    {
        // Factory: creates a LearnerRMSProp and returns it as a LearnerPtr.
        // All arguments (gamma, inc, dec, max, min, needAveMultiplier, additionalOptions)
        // are forwarded to the learner's constructor unchanged and in the same order.
-        return MakeSharedObject<LearnerRMSProp>(parameters, learningRates, gamma, inc, dec, max, min, needAveMultiplier, clippingThresholdPerSample, gradientClippingWithTruncation);
+        return MakeSharedObject<LearnerRMSProp>(parameters, learningRates, gamma, inc, dec, max, min, needAveMultiplier, additionalOptions);
    }
 }
 }
--- a/Source/CNTKv2LibraryDll/Learner.h
+++ b/Source/CNTKv2LibraryDll/Learner.h
@ -3,29 +3,21 @@
 // Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
 //

+#pragma once
+
 #include "stdafx.h"
 #include "CNTKLibrary.h"
 #include <numeric>

 namespace CNTK 
 {
-    // TODO: Move this to Trainer along with Pre-, PostProcess and ClipGradient.
-    // A collection of additional options that are applicable for all standard learners 
-    // (after these options are set, they retain their value for the entire lifespan of a learner).
-    struct AdditionalLearningOptions
-    {
-        double l1RegularizationWeight = 0.0;
-        double l2RegularizationWeight = 0.0;
-        double gaussianNoiseInjectionStdDev = 0.0;
-        bool gradientClippingWithTruncation = true;
-        double gradientClippingThresholdPerSample = std::numeric_limits<double>::infinity();
-    };
-
    // An abstract base class at the root of the standard learners hierarchy
    // It implements most of the learner functionality, except for the actual update function,
    // and adds a few pre-/postprocessing methods (which are invoked before and after the update).
    class LearnerBase : public Learner
    {
+        static const std::wstring WasLearningRateResetAttributeName;
+
    public:
        virtual bool Update(const std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, size_t trainingSampleCount) override final;

@ -33,18 +25,36 @@ namespace CNTK

        virtual void RestoreFromCheckpoint(const Dictionary& checkpoint) override final;

+        virtual void ResetLearningRate(double learningRate) override final
+        {
            // Record that the rate was explicitly reset, then let the base Learner store the new value.
            // LearningRate() consults this flag to decide between the reset value and the schedule.
+            m_wasLearningRateReset = true;
+            Learner::ResetLearningRate(learningRate);
+        }
+
+        virtual double LearningRate() const override final
+        {
            // After an explicit ResetLearningRate() call the base Learner's value is returned;
            // otherwise the rate comes from the schedule, indexed by the current m_sampleCount.
+            if (m_wasLearningRateReset)
+                return Learner::LearningRate();
+            else
+                return m_learningRateSchedule[m_sampleCount];
+        }
+
    protected:
+        // allocateSmoothGradients flag specifies whether NDArrayViews for smoothed gradients can be allocated 
+        // in the base class constructor (in which case they are allocated with the shapes identical to the shapes of
+        // the corresponding parameters) or if the allocation should be deferred to the subclass constructor (which
+        // performs allocation that is specific to the particular learner, see FSAdaGrad and RMSProp).
        LearnerBase(const std::vector<Parameter>& parameters, 
                    const LearningRatesPerSample& learningRates,
-                    bool allocateSmoothGradients = true,
-                    double clippingThresholdPerSample = std::numeric_limits<double>::infinity(),
-                    bool gradientClippingWithTruncation = true);
+                    AdditionalLearningOptions additionalOptions,
+                    bool allocateSmoothGradients = true);

        virtual void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const = 0;

        std::string LearnerType() const;

-        LearningRatesPerSample m_learningRates;
+        bool m_wasLearningRateReset;
+        LearningRatesPerSample m_learningRateSchedule;

        AdditionalLearningOptions m_additionalOptions;

@ -84,6 +94,7 @@ namespace CNTK
        // Retrieves the shape of the matrix corresponding to the parameter value.
        static NDShape GetMatrixShape(const Parameter& parameter);

+
        size_t m_sampleCount;
        size_t m_minibatchCount;

@ -106,11 +117,10 @@ namespace CNTK
    public:
        // Constructs a plain SGD learner: all arguments are forwarded to LearnerBase, the momentum
        // schedule is initialized to 0.0 and Nesterov acceleration is switched off.
        // Derived learners (LearnerMomentumSGD, LearnerNesterov) overwrite those two members
        // in their own constructor bodies.
        LearnerSGD(const std::vector<Parameter>& parameters, 
                   const LearningRatesPerSample& learningRates, 
-                   bool allocateSmoothGradients = true,
-                   double clippingThresholdPerSample = std::numeric_limits<double>::infinity(),
-                   bool gradientClippingWithTruncation = true)
-                   : LearnerBase(parameters, learningRates, allocateSmoothGradients, clippingThresholdPerSample, gradientClippingWithTruncation),
-            m_momentums(0.0), 
+                   AdditionalLearningOptions additionalOptions,
+                   bool allocateSmoothGradients = true)
+                   : LearnerBase(parameters, learningRates, additionalOptions, allocateSmoothGradients),
+            m_momentumValues(0.0), 
            m_useNesterovAcceleration(false)
        {}

@ -121,8 +131,8 @@ namespace CNTK
        template <typename ElementType>
        void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;

-        // TODO: Move m_momentums to LearnerMomentumSGD as soon as NormalGrad is refactored.
-        MomentumsPerSample m_momentums;
+        // TODO: Move m_momentumValues to LearnerMomentumSGD as soon as NormalGrad is refactored.
+        MomentumValuesPerSample m_momentumValues;
        bool m_useNesterovAcceleration;
    };

@ -132,13 +142,12 @@ namespace CNTK
    public:
        // Constructs an SGD learner with momentum: delegates to LearnerSGD and then replaces the
        // momentum schedule (which LearnerSGD initializes to 0.0) with the supplied one.
        LearnerMomentumSGD(const std::vector<Parameter>& parameters,
                           const LearningRatesPerSample& learningRates,
-                           const MomentumsPerSample& momentums,
-                           bool allocateSmoothGradients = true,
-                           double clippingThresholdPerSample = std::numeric_limits<double>::infinity(),
-                           bool gradientClippingWithTruncation = true)
-                           : LearnerSGD(parameters, learningRates, allocateSmoothGradients, clippingThresholdPerSample, gradientClippingWithTruncation)
+                           const MomentumValuesPerSample& momentumValues,
+                           AdditionalLearningOptions additionalOptions,
+                           bool allocateSmoothGradients = true)
+                           : LearnerSGD(parameters, learningRates, additionalOptions, allocateSmoothGradients)
        {
-            m_momentums = momentums;
+            m_momentumValues = momentumValues;
        }
    };

@ -149,10 +158,9 @@ namespace CNTK

        // Constructs a Nesterov learner: delegates to LearnerMomentumSGD (with base-class allocation
        // of smoothed gradients enabled) and then turns on the Nesterov acceleration flag.
        LearnerNesterov(const std::vector<Parameter>& parameters,
                        const LearningRatesPerSample& learningRates,
-                        const MomentumsPerSample& momentums,
-                        double clippingThresholdPerSample = std::numeric_limits<double>::infinity(),
-                        bool gradientClippingWithTruncation = true)
-                        : LearnerMomentumSGD(parameters, learningRates, momentums, true, clippingThresholdPerSample, gradientClippingWithTruncation)
+                        const MomentumValuesPerSample& momentumValues,
+                        AdditionalLearningOptions additionalOptions)
+                        : LearnerMomentumSGD(parameters, learningRates, momentumValues, additionalOptions, /*allocateSmoothGradients*/ true)
        {
            m_useNesterovAcceleration = true;
        }
@ -165,8 +173,11 @@ namespace CNTK
        // Constructs an AdaGrad learner. What used to be a declaration is now defined inline:
        // arguments are forwarded to LearnerBase (with base-class allocation of smoothed gradients
        // enabled) and needAveMultiplier is stored for use by the update step.
        LearnerAdaGrad(const std::vector<Parameter>& parameters,
                       const LearningRatesPerSample& learningRates,
                       bool needAveMultiplier,
-                       double clippingThresholdPerSample = std::numeric_limits<double>::infinity(),
-                       bool gradientClippingWithTruncation = true);
+                       AdditionalLearningOptions additionalOptions)
+                       : LearnerBase(parameters, learningRates, additionalOptions, /*allocateSmoothGradients*/ true),
+                       m_needAveMultiplier(needAveMultiplier)
+    {
+    }

    protected:
        bool m_needAveMultiplier;
@ -183,9 +194,10 @@ namespace CNTK

        LearnerFSAdaGrad(const std::vector<Parameter>& parameters,
                         const LearningRatesPerSample& learningRates,
-                         const MomentumsPerSample& momentums,
-                         double clippingThresholdPerSample = std::numeric_limits<double>::infinity(),
-                         bool gradientClippingWithTruncation = true);
+                         const MomentumValuesPerSample& momentumValues,
+                         const double targetAdagradAvDenom,
+                         const size_t adagradT,
+                         AdditionalLearningOptions additionalOptions);

    protected:

@ -193,6 +205,11 @@ namespace CNTK

        template <typename ElementType>
        void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;
+
+    private:
+        mutable std::unordered_map<Parameter, double> m_smoothedCounts;
+        double m_targetAdagradAvDenom;
+        size_t m_adagradT;
    };

    class LearnerRMSProp : public LearnerBase
@ -203,8 +220,7 @@ namespace CNTK
                       const LearningRatesPerSample& learningRates,
                       double gamma, double inc, double dec, double max, double min,
                       bool needAveMultiplier,
-                       double clippingThresholdPerSample = std::numeric_limits<double>::infinity(),
-                       bool gradientClippingWithTruncation = true);
+                       AdditionalLearningOptions additionalOptions);

    protected:

--- a/Source/CNTKv2LibraryDll/MinibatchSource.cpp
+++ b/Source/CNTKv2LibraryDll/MinibatchSource.cpp
@ -12,6 +12,7 @@
 #include "ReaderShim.h"
 #include "Function.h"
 #include <tuple>
+#include "Value.h"

 using namespace Microsoft::MSR::CNTK;

@ -78,6 +79,8 @@ namespace CNTK
            static const std::unordered_map<std::wstring, std::wstring> deserializerTypeNameToModuleNameMap = {
                { L"CNTKTextFormatDeserializer", L"CNTKTextFormatReader" },
                { L"ImageDeserializer",          L"ImageReader"          },
+                { L"HTKFeatureDeserializer",     L"HTKDeserializers"     },
+                { L"HTKMLFDeserializer",         L"HTKDeserializers"     },
            };

            auto& deserializerConfigDict = deserializerConfig.Value<Dictionary>();
@ -103,6 +106,10 @@ namespace CNTK
                }

            }
+
+            if (deserializerTypeNameToModuleNameMap.find(deserializerTypeName) == deserializerTypeNameToModuleNameMap.end())
+                InvalidArgument("Unknown deserializer type (%S)", deserializerTypeName.c_str());
+
            deserializerConfigDict[L"module"] = deserializerTypeNameToModuleNameMap.at(deserializerTypeName);
        }

@ -197,7 +204,7 @@ namespace CNTK

                    // TODO: Eliminate the unnecessary CPU to CPU copy
                    ReaderShim<float>::FillMatrixFromStream(currentStreamDesc->m_storageType, dataMatrix.get(), sampleSize, currentStreamMinibatchData, nullptr);
-                    minibatchValuePtr = CompositeFunction::GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(sampleShape, *dataMatrix, currentStreamMinibatchData->m_layout, false);
+                    minibatchValuePtr = MakeSharedObject<PackedValue>(sampleShape, dataMatrix, currentStreamMinibatchData->m_layout, /*readOnly =*/ false);

                    size_t numSamples = currentStreamMinibatchData->m_layout->GetActualNumSamples();
                    size_t numSequences = currentStreamMinibatchData->m_layout->GetNumSequences();
--- a/Source/CNTKv2LibraryDll/NDMask.cpp
+++ b/Source/CNTKv2LibraryDll/NDMask.cpp
@ -36,19 +36,18 @@ namespace CNTK
    }

    NDMask::~NDMask()
-    {
-    }
+    {}

-    void NDMask::MaskSection(const std::vector<size_t>& sectionOffset, const NDShape& sectionShape)
+    void NDMask::MarkSectionAs(const std::vector<size_t>& sectionOffset, const NDShape& sectionShape, MaskKind maskKind)
    {
        // TODO: Implement batching of masking operation for masks residing on GPUs to avoid making
        // GPU invocations for each MaskSection call.

        if (sectionOffset.size() > m_maskShape.Rank())
-            LogicError("NDMask::MaskSection: The sectionOffset cannot have dimensionality higher than the number of axes of 'this' mask");
+            LogicError("NDMask::MaskSection: The sectionOffset cannot have dimensionality higher than the rank of 'this' mask");

        if (sectionShape.Rank() > m_maskShape.Rank())
-            LogicError("NDMask::MaskSection: The section shape cannot have an axes count higher than the number of axes of 'this' mask");
+            LogicError("NDMask::MaskSection: The section shape cannot have an axes count higher than the rank of 'this' mask");

        std::vector<size_t> offset(m_maskShape.Rank(), 0);
        for (size_t i = 0; i < sectionOffset.size(); ++i)
@ -62,7 +61,7 @@ namespace CNTK
        size_t sliceRowLength = (shape[0] != NDShape::InferredDimension) ? shape[0] : (maskMatrix->GetNumRows() - rowOffset);
        size_t sliceColLength = (shape[1] != NDShape::InferredDimension) ? shape[1] : (maskMatrix->GetNumCols() - colOffset);
        if ((rowOffset == 0) && (sliceRowLength == maskMatrix->GetNumRows()))
-            maskMatrix->ColumnSlice(colOffset, sliceColLength).SetValue(0);
+            maskMatrix->ColumnSlice(colOffset, sliceColLength).SetValue((char)maskKind);
        else
        {
            // Since Matrix does not support strides in the row dimension, we will need to create separate slices for each column
@ -70,15 +69,15 @@ namespace CNTK
            {
                auto column = maskMatrix->ColumnSlice(i, 1);
                column.Reshape(1, maskMatrix->GetNumRows());
-                column.ColumnSlice(rowOffset, sliceRowLength).SetValue(0);
+                column.ColumnSlice(rowOffset, sliceRowLength).SetValue((char)maskKind);
            }
        }
    }

    // Resets the mask: every element of the underlying mask matrix is set to MaskKind::Valid
    // (stored as a char), replacing the previous hard-coded literal 1.
    void NDMask::Clear()
    {
-        // Clear the mask by marking all samples as Valid; i.e. a value of 1
-        GetMatrix()->SetValue(1);
+        // Clear the mask by marking all samples as Valid
+        GetMatrix()->SetValue((char)MaskKind::Valid);
    }

    size_t NDMask::MaskedCount() const
@ -86,17 +85,17 @@ namespace CNTK
        auto maskMatrix = GetMatrix();
        std::unique_ptr<char[]> maskData(maskMatrix->CopyToArray());
        return std::count_if(maskData.get(), maskData.get() + maskMatrix->GetNumElements(), [](const char& val) {
-            return val == 0;
+            return val == (char)MaskKind::Invalid;
        });
    }

    // Returns a read-only pointer to the mask's element buffer, after making sure the underlying
    // matrix resides on this mask's device (m_device).
    // The Matrix<char> storage is reinterpreted as MaskKind through a C-style cast;
    // NOTE(review): this assumes MaskKind has a char-sized underlying type - confirm against its declaration.
    // TODO: This could actually be strided?
-    const char* NDMask::DataBuffer() const
+    const MaskKind* NDMask::DataBuffer() const
    {
        // First make sure that the underlying matrix is on the right device
        auto matrix = GetMatrix();
        matrix->TransferToDeviceIfNotThere(AsCNTKImplDeviceId(m_device), true);
-        return matrix->Data();
+        return (const MaskKind*)(matrix->Data());
    }

    Matrix<char>* NDMask::GetMatrix() const
@ -112,9 +111,9 @@ namespace CNTK
        GetMatrix()->AssignValuesOf(*source.GetMatrix());
    }

-    NDMaskPtr NDMask::DeepClone() const
+    NDMaskPtr NDMask::DeepClone(const DeviceDescriptor& device) const
    {
-        NDMaskPtr newMask = MakeSharedObject<NDMask>(this->Shape(), this->Device());
+        NDMaskPtr newMask = MakeSharedObject<NDMask>(this->Shape(), device);
        newMask->CopyFrom(*this);

        return newMask;
--- a/Source/CNTKv2LibraryDll/Trainer.cpp
+++ b/Source/CNTKv2LibraryDll/Trainer.cpp
@ -13,7 +13,24 @@ namespace CNTK
    Trainer::Trainer(const FunctionPtr& model, const FunctionPtr& lossFunction, const FunctionPtr& evaluationFunction, const std::unordered_set<LearnerPtr>& parameterLearners)
        : m_model(model), m_lossFunction(lossFunction), m_evaluationFunction(evaluationFunction), m_parameterLearners(parameterLearners), m_prevMinibatchNumSamples(1)
    {
-        m_combinedTrainingFunction = Combine({ model, lossFunction, evaluationFunction });
+        if (m_lossFunction->Output().DynamicAxes().empty())
+            InvalidArgument("The loss function specified in the Trainer constructor must correspond to minibatch data and have dynamic axes");
+
+        if (m_evaluationFunction && m_evaluationFunction->Output().DynamicAxes().empty())
+            InvalidArgument("The evaluation function specified in the Trainer constructor must correspond to minibatch data and have dynamic axes");
+
+        m_aggregatedLossFunction = ReduceSum(lossFunction);
+        if (m_evaluationFunction)
+            m_aggregatedEvaluationFunction = ReduceSum(m_evaluationFunction);
+
+        std::vector<Variable> combinedFunctionArgs = { m_model, m_aggregatedLossFunction, m_lossFunction };
+        if (m_evaluationFunction)
+        {
+            combinedFunctionArgs.push_back(m_aggregatedEvaluationFunction);
+            combinedFunctionArgs.push_back(m_evaluationFunction);
+        }
+
+        m_combinedTrainingFunction = Combine(combinedFunctionArgs);

        auto modelParameters = m_combinedTrainingFunction->Parameters();
        std::unordered_set<Parameter> learnerParameters;
@ -66,20 +83,11 @@ namespace CNTK
        return scalar;
    }

-    static size_t GetSampleCountFromArguments(const Variable& evalOrLossArgument, const std::unordered_map<Variable, ValuePtr>& arguments)
+    static size_t GetSampleCount(const Variable& var, const ValuePtr& value)
    {
-        // Find the argument whose dynamic axes match the criterion operation's dynamic axes (i.e. label dynamic axes)
-        // Then we determine the actual number of samples contributing to the training loss from the argument's Value object
-        auto argumentIter = std::find_if(arguments.begin(), arguments.end(), [evalOrLossArgument](const std::pair<Variable, ValuePtr>& currentPair) {
-            return (currentPair.first.DynamicAxes() == evalOrLossArgument.DynamicAxes());
-        });
-
-        auto argumentValue = argumentIter->second;
-        auto argumentVar = argumentIter->first;
-        auto argumentDataShape = argumentValue->Data()->Shape();
-        auto mask = argumentValue->Mask();
-        size_t numMaskedSamples = (mask != nullptr) ? mask->MaskedCount() : 0;
-        size_t numSamplesInDataArrayView = argumentDataShape.SubShape(argumentVar.Shape().Rank()).TotalSize();
+        auto valueDataShape = value->Shape();
+        size_t numMaskedSamples = value->MaskedCount();
+        size_t numSamplesInDataArrayView = valueDataShape.SubShape(var.Shape().Rank()).TotalSize();
        if (numMaskedSamples > numSamplesInDataArrayView)
            LogicError("Number of masked values cannot exceed the number of samples that the Value object's Data NDArrayView can hold");

@ -88,15 +96,15 @@ namespace CNTK

    // Evaluates the trainer's evaluation criterion on one minibatch and returns its per-sample average.
    //   arguments     - values bound to the model's input variables for this minibatch
    //   computeDevice - device on which the forward pass is run
    // Calls InvalidArgument if the trainer was constructed without an evaluation function.
    // The forward pass fetches both the aggregated (ReduceSum) evaluation output and the
    // non-aggregated one: the former supplies the scalar total, the latter is passed to
    // GetSampleCount to obtain the divisor.
    // NOTE(review): no guard against a sample count of zero is visible here.
    double Trainer::TestMinibatch(const std::unordered_map<Variable, ValuePtr>& arguments, const DeviceDescriptor& computeDevice /*= DeviceDescriptor::UseDefaultDevice()*/)
    {
-        if (!m_evaluationFunction)
+        if (!m_aggregatedEvaluationFunction)
            InvalidArgument("Trainer::TestMinibatch: Cannot test when no evaluation function was specified during 'this' trainer's construction");

        // TODO: Should we refactor this code that is somewhat similar to the prologue of the TrainMinibatch function?
-        std::unordered_map<Variable, ValuePtr> outputs = { { m_evaluationFunction, nullptr } };
+        std::unordered_map<Variable, ValuePtr> outputs = { { m_aggregatedEvaluationFunction, nullptr }, {m_evaluationFunction, nullptr} };
        m_combinedTrainingFunction->Forward(arguments, outputs, computeDevice);

-        auto sampleCount = GetSampleCountFromArguments(*(m_evaluationFunction->Arguments().begin()), arguments);
-        return (GetScalarValue(outputs[m_evaluationFunction]) / sampleCount);
+        auto sampleCount = GetSampleCount(m_evaluationFunction, outputs[m_evaluationFunction]);
+        return (GetScalarValue(outputs[m_aggregatedEvaluationFunction]) / sampleCount);
    }

    bool Trainer::TrainMinibatch(const std::unordered_map<Variable, ValuePtr>& arguments, const DeviceDescriptor& computeDevice /*= DeviceDescriptor::UseDefaultDevice()*/)
@ -107,16 +115,16 @@ namespace CNTK

    bool Trainer::TrainMinibatch(const std::unordered_map<Variable, ValuePtr>& arguments, std::unordered_map<Variable, ValuePtr>& outputsToFetch, const DeviceDescriptor& computeDevice /*= DeviceDescriptor::UseDefaultDevice()*/)
    {
-        std::unordered_map<Variable, ValuePtr> outputs = { { m_lossFunction, nullptr } };
-        if (m_evaluationFunction)
-            outputs.insert({ m_evaluationFunction, nullptr });
+        std::unordered_map<Variable, ValuePtr> outputs = { { m_aggregatedLossFunction, nullptr }, { m_lossFunction, nullptr } };
+        if (m_aggregatedEvaluationFunction)
+            outputs.insert({ m_aggregatedEvaluationFunction, nullptr });

        outputs.insert(outputsToFetch.begin(), outputsToFetch.end());

-        auto backPropSate = m_combinedTrainingFunction->Forward(arguments, outputs, computeDevice, { m_lossFunction });
-        m_prevMinibatchAggregateTrainingLossValue = outputs[m_lossFunction];
-        if (m_evaluationFunction)
-            m_prevMinibatchAggregateEvalCriterionValue = outputs[m_evaluationFunction];
+        auto backPropSate = m_combinedTrainingFunction->Forward(arguments, outputs, computeDevice, { m_aggregatedLossFunction });
+        m_prevMinibatchAggregateTrainingLossValue = outputs[m_aggregatedLossFunction];
+        if (m_aggregatedEvaluationFunction)
+            m_prevMinibatchAggregateEvalCriterionValue = outputs[m_aggregatedEvaluationFunction];

        for (auto outputToFetch : outputsToFetch)
        {
@ -124,8 +132,8 @@ namespace CNTK
                outputsToFetch[outputToFetch.first] = outputs[outputToFetch.first];
        }

-        ValuePtr rootGradientValue = MakeSharedObject<Value>(MakeSharedObject<NDArrayView>(m_lossFunction->Output().GetDataType(), m_prevMinibatchAggregateTrainingLossValue->Data()->Shape(), computeDevice), outputs.at(m_lossFunction)->Mask());
-        if (m_lossFunction->Output().GetDataType() == DataType::Float)
+        ValuePtr rootGradientValue = MakeSharedObject<Value>(MakeSharedObject<NDArrayView>(m_aggregatedLossFunction->Output().GetDataType(), m_prevMinibatchAggregateTrainingLossValue->Shape(), computeDevice), outputs.at(m_aggregatedLossFunction)->Mask());
+        if (m_aggregatedLossFunction->Output().GetDataType() == DataType::Float)
            rootGradientValue->Data()->SetValue(1.0f);
        else
            rootGradientValue->Data()->SetValue(1.0);
@ -135,9 +143,9 @@ namespace CNTK
        for (const auto& parameter : modelParameters)
            parameterGradients[parameter] = nullptr;

-        m_combinedTrainingFunction->Backward(backPropSate, { { m_lossFunction, rootGradientValue } }, parameterGradients);
+        m_combinedTrainingFunction->Backward(backPropSate, { { m_aggregatedLossFunction, rootGradientValue } }, parameterGradients);

-        m_prevMinibatchNumSamples = GetSampleCountFromArguments(*(m_lossFunction->Arguments().begin()), arguments);
+        m_prevMinibatchNumSamples = GetSampleCount(m_lossFunction, outputs[m_lossFunction]);

        bool anyUpdatesPerformed = false;
        for (auto learner : m_parameterLearners)
--- a/Source/CNTKv2LibraryDll/Utils.cpp
+++ b/Source/CNTKv2LibraryDll/Utils.cpp
@ -13,6 +13,7 @@ using namespace std;

 namespace CNTK
 {
+
    // This wrapper redefines operator<< in terms of unformatted (binary) write operation.
    struct BinaryOStreamWrapper
    {
@ -527,6 +528,17 @@ namespace CNTK
        return (m_dictionaryData->find(key) != m_dictionaryData->end());
    }

+    // Copies every entry of 'other' into this dictionary. Reports InvalidArgument on the first key that is already present here; entries copied before that point stay in place.
+    void Dictionary::Add(const Dictionary& other)
+    {
+        for (const auto& kv : *(other.m_dictionaryData)) // bind by reference so that no key/value pair is copied per iteration
+        {
+            if (Contains(kv.first))
+                InvalidArgument("Dictionary::Add: This dictionary already contains an entry with key %S that is being attempted to add from the 'other' dictionary", kv.first.c_str());
+            (*this)[kv.first] = kv.second;
+        }
+    }
+
    bool Dictionary::operator==(const Dictionary& other) const
    {
        if (this == &other)
@ -539,7 +551,7 @@ namespace CNTK
            return false;
        }
        
-        for (auto& kv : *m_dictionaryData)
+        for (const auto& kv : *m_dictionaryData)
        {
            auto result = other.m_dictionaryData->find(kv.first);
            if (result == other.m_dictionaryData->end() || kv.second != result->second)
@ -561,7 +573,7 @@ namespace CNTK
        BinaryOStreamWrapper stream(stdStream);
        stream << us.version;
        stream << us.m_dictionaryData->size();
-        for (auto& kv : *(us.m_dictionaryData))
+        for (const auto& kv : *(us.m_dictionaryData))
        {
            stream << kv.first;
            stream << kv.second;
@ -586,10 +598,62 @@ namespace CNTK
        return stream;
    }

+    template <typename T>
+    TrainingParameterSchedule<T>::TrainingParameterSchedule(T value) // single-entry schedule: key 0 maps to 'value', and the unit is 1
+        : m_schedule({ make_pair(0, value) }), m_unit(1)
+    {
+    }
+
+    // Builds the schedule from a plain list of values: every value is paired with a unit count of 1
+    // and the resulting (unit count, value) list is handed to ConstructSchedule.
+    template <typename T>
+    TrainingParameterSchedule<T>::TrainingParameterSchedule(const vector<T>& schedule, size_t unit) 
+        : m_unit(unit)
+    {
+        std::vector<std::pair<size_t, T>> s;
+        s.reserve(schedule.size());
+        for (const T& value : schedule) // range-for: no index, hence no signed/unsigned comparison against size()
+            s.emplace_back(1, value);
+        ConstructSchedule(s);
+    }
+
+    template <typename T>
+    TrainingParameterSchedule<T>::TrainingParameterSchedule(const vector<std::pair<size_t, T>>& schedule, size_t unit) // each pair is (unit count, value)
+        : m_unit(unit)
+    {
+        ConstructSchedule(schedule);
+    }
+
+    // Fills m_schedule from (unit count, value) pairs: each value is stored under the running total of
+    // unit counts multiplied by m_unit. A zero m_unit, an empty schedule or a zero unit count is a RuntimeError.
+    template <typename T>
+    void TrainingParameterSchedule<T>::ConstructSchedule(const std::vector<std::pair<size_t, T>>& schedule)
+    {
+        // TODO: 0 will be used to mean "the entire sweep"
+        if (m_unit == 0)
+            RuntimeError("TrainingParameterSchedule::ConstructSchedule : 'unit' cannot be 0.");
+        if (schedule.empty())
+            RuntimeError("TrainingParameterSchedule::ConstructSchedule : schedule is empty.");
+        size_t cumulativeUnitCount = 0;
+        for (const auto& entry : schedule)
+        {
+            const size_t unitCount = entry.first;
+            if (unitCount == 0)
+                RuntimeError("TrainingParameterSchedule::ConstructSchedule : unit count cannot be 0.");
+            cumulativeUnitCount += unitCount;
+            m_schedule[m_unit * cumulativeUnitCount] = entry.second;
+        }
+    }
+
+    template <typename T>
+    /*virtual*/ TrainingParameterSchedule<T>::~TrainingParameterSchedule()
+    {
+    }
+
    // Returns the element whose key is greater than the required sample count 
    // or the last element if no such key exists.
    template <typename T>
-    const T& TrainingParameterSchedule<T>::operator[](size_t sampleCount) const
+    /*virtual*/ const T& TrainingParameterSchedule<T>::operator[](size_t sampleCount) const
    {
        assert(m_schedule.size() > 0);
        auto it = m_schedule.upper_bound(sampleCount);
@ -600,6 +664,38 @@ namespace CNTK
        return it->second;
    }

+    template <typename T>
+    TrainingParameterSchedule<T>::TrainingParameterSchedule(const TrainingParameterSchedule<T>&) = default;
+
+    // cannot be defaulted due to a bug in VS2013 (https://connect.microsoft.com/VisualStudio/feedback/details/1255564)
+    template <typename T>
+    TrainingParameterSchedule<T>::TrainingParameterSchedule(TrainingParameterSchedule<T>&& that)
+        :m_schedule(move(that.m_schedule)), m_unit(that.m_unit)
+    {
+    }
+
+    template <typename T>
+    TrainingParameterSchedule<T>& TrainingParameterSchedule<T>::operator=(const TrainingParameterSchedule<T>&) = default;
+
+    // cannot be defaulted due to a bug in VS2013 (https://connect.microsoft.com/VisualStudio/feedback/details/1255564)
+    template <typename T>
+    TrainingParameterSchedule<T>& TrainingParameterSchedule<T>::operator=(TrainingParameterSchedule<T>&& that)
+    {
+        m_schedule = move(that.m_schedule);
+        m_unit = that.m_unit;
+        return *this;
+    }
+
+    // Rewrites every scheduled value in place: a time constant tc becomes exp(-1 / tc), and a zero time constant becomes 0.
+    void MomentumValuesAsTimeConstants::ConvertToPerSampleValues()
+    {
+        for (auto& entry : m_schedule)
+        {
+            const double timeConstant = entry.second;
+            entry.second = (timeConstant != 0.0) ? exp(-1.0 / timeConstant) : 0.0;
+        }
+    }
+
    template void DictionaryValue::AllocateDataPtr<NDShape>(const NDShape& value);
    template void DictionaryValue::AllocateDataPtr<Axis>(const Axis& value);
    template void DictionaryValue::AllocateDataPtr<vector<DictionaryValue>>(const vector<DictionaryValue>& value);
@ -614,5 +710,5 @@ namespace CNTK
    template void DictionaryValue::FreePtrAsType<Dictionary>();
    template void DictionaryValue::FreePtrAsType<NDArrayView>();

-    template const double& TrainingParameterSchedule<double>::operator[](size_t key) const;
+    template class TrainingParameterSchedule<double>;
 }
--- a/Source/CNTKv2LibraryDll/Utils.h
+++ b/Source/CNTKv2LibraryDll/Utils.h
@ -32,7 +32,7 @@ namespace CNTK
    inline DEVICEID_TYPE AsCNTKImplDeviceId(const DeviceDescriptor& device)
    {
        if (device.Type() == DeviceKind::CPU)
-            return -1;
+            return CPUDEVICE;
        else if (device.Type() == DeviceKind::GPU)
            return device.Id();
        else
@ -304,16 +304,20 @@ namespace CNTK
        }
    }

+    static size_t const CNTKInternalIdxValueForAllStaticAxes = 0; // internal axis index reserved for "all static axes"
    inline Axis AsAxis(size_t CNTKInternalAxisIdx)
    {
-        if (CNTKInternalAxisIdx == 0)
-            LogicError("CNTK internal axis indices must be > 0");
+        if (CNTKInternalAxisIdx == CNTKInternalIdxValueForAllStaticAxes) // the reserved index is mapped instead of being rejected
+            return Axis::AllStaticAxes();

        return Axis(CNTKInternalAxisIdx - 1);
    }

    inline int AsCNTKInternalAxisIdx(const Axis& axis)
    {
+        if (axis == Axis::AllStaticAxes())
+            return CNTKInternalIdxValueForAllStaticAxes;
+
        if (!axis.IsStaticAxis())
            LogicError("Only Axis that represent static indices can be converted to a CNTK internal axis index");

@ -322,19 +326,16 @@ namespace CNTK

    inline std::pair<NDShape, NDShape> GetConvolutionOutputMapCountAndKernelShape(const NDShape& convolutionMapShape, const NDShape& operandShape)
    {
-        auto outputMapCount = convolutionMapShape.SubShape(0, convolutionMapShape.Rank() - operandShape.Rank());
+        NDShape kernelShape = convolutionMapShape.SubShape(0, operandShape.Rank()); // presumably the leading operandShape.Rank() axes - TODO confirm SubShape(begin, end) semantics
+        auto outputMapCount = convolutionMapShape.SubShape(kernelShape.Rank()); // the axes that follow the kernel axes
        NDShape paddedOutputMapCount(operandShape.Rank(), 1);
        for (size_t i = 0; i < outputMapCount.Rank(); ++i)
            paddedOutputMapCount[paddedOutputMapCount.Rank() - 1 - i] = outputMapCount[outputMapCount.Rank() - 1 - i];
-        //for (size_t i = 0; i < outputMapCount.Rank(); ++i)
-        //    paddedOutputMapCount[i] = outputMapCount[i];
-
-        NDShape kernelShape = convolutionMapShape.SubShape(outputMapCount.Rank());

        return{ paddedOutputMapCount, kernelShape };
    }

-    inline double MomentumPerMB(double momentumPerSample, size_t minibatchSize)
+    inline double MomentumValueForMB(double momentumPerSample, size_t minibatchSize) // returns momentumPerSample raised to the power minibatchSize
    {
        return std::pow(momentumPerSample, minibatchSize);
    }
@ -369,4 +370,45 @@ namespace CNTK
        double* castValue = Copy<float, double>(source->DataBuffer<float>(), sourceSize);
        return MakeSharedObject<NDArrayView>(sourceShape, castValue, sourceSize, DeviceDescriptor::CPUDevice(), readOnly);
    }
+
+    // Returns the given name wrapped in parentheses;
+    // an empty name is returned unchanged (still empty).
+    inline std::wstring ParanthesizedName(const std::wstring& name)
+    {
+        const bool hasName = !name.empty();
+        return hasName ? (L"(" + name + L")") : name;
+    }
+
+    static const std::wstring UidPrefix = L"__v2libuid__"; // marker placed in front of the uid inside an internal node name
+    static const std::wstring NamePrefix = L"__v2libname__"; // marker placed in front of the name inside an internal node name
+    // Builds an internal node name as UidPrefix + uid + NamePrefix + name.
+    inline std::wstring CNTKInternalNodeNameFromUidAndName(const std::wstring& uid, const std::wstring& name)
+    {
+        return UidPrefix + uid + NamePrefix + name;
+    }
+
+    // Splits an internal node name of the form UidPrefix + uid + NamePrefix + name into { uid, name }.
+    // A node name without the uid marker is taken verbatim as the name and paired with a newly generated uid.
+    inline std::pair<std::wstring, std::wstring> UidAndNameFromCNTKInternalNodeName(const std::wstring& CNTKInternalNodeName, VariableKind varKind)
+    {
+        const auto uidMarkerPos = CNTKInternalNodeName.find(UidPrefix);
+        if (uidMarkerPos == std::wstring::npos)
+        {
+            // no uid marker: the whole string is the name
+            return{ Internal::GenerateUid(varKind), CNTKInternalNodeName };
+        }
+
+        // the uid sits between the uid marker and the name marker
+        const auto uidStart = uidMarkerPos + UidPrefix.length();
+        const auto nameMarkerPos = CNTKInternalNodeName.find(NamePrefix, uidStart);
+        if (nameMarkerPos == std::wstring::npos)
+            LogicError("CNTK internal node name found to contain uid but not name!");
+
+        // the name is everything after the name marker
+        const auto nameStart = nameMarkerPos + NamePrefix.length();
+        std::wstring uid = CNTKInternalNodeName.substr(uidStart, nameMarkerPos - uidStart);
+        std::wstring name = CNTKInternalNodeName.substr(nameStart);
+        return{ uid, name };
+    }
 }
+
--- a/Source/CNTKv2LibraryDll/Value.cpp
+++ b/Source/CNTKv2LibraryDll/Value.cpp
@ -11,6 +11,8 @@

 #include "CNTKLibrary.h"
 #include "Utils.h"
+#include "Value.h"
+#include "Function.h"

 namespace CNTK
 {
@ -28,7 +30,7 @@ namespace CNTK
            auto maskShape = mask->Shape();

            if (maskShape.Rank() > dataShape.Rank())
-                InvalidArgument("The number of axes (%d) of the mask of a Value object cannot exceed the number of axes (%d) of the data NDArrayView object", (int)maskShape.Rank(), (int)dataShape.Rank());
+                InvalidArgument("The rank (%d) of the mask of a Value object cannot exceed the rank (%d) of the data NDArrayView object", (int)maskShape.Rank(), (int)dataShape.Rank());

            if (dataShape.SubShape(dataShape.Rank() - maskShape.Rank()) != maskShape)
                InvalidArgument("Invalid Value object; the data and mask are incompatible. The trailing dimensions of the data with shape %S do not match the dimensions of the mask with shape %S", AsStringForErrorReporting(dataShape).c_str(), AsStringForErrorReporting(maskShape).c_str());
@ -60,7 +62,10 @@ namespace CNTK
            NDShape valueMaskShape = { maxSequenceLength, numSequences };
            deviceValueMask = MakeSharedObject<NDMask>(valueMaskShape, device);
            for (size_t i = 0; i < numSequences; ++i)
-                deviceValueMask->MaskSection({ sequenceLengths[i], i }, { NDShape::InferredDimension, 1 });
+            {
+                deviceValueMask->MarkSequenceBegin({0, i});
+                deviceValueMask->InvalidateSection({ sequenceLengths[i], i }, { NDShape::InferredDimension, 1 });
+            }
        }

        return deviceValueMask;
@ -179,6 +184,39 @@ namespace CNTK
        }
    }

+    void PackedValue::Unpack() const // converts the packed matrix and layout into this Value's data and mask; does nothing once m_isPacked is false
+    {
+        if (m_packedDataLayout && (m_packedDataLayout->GetNumTimeSteps() != 1) && (m_packedDataLayout->GetNumSequences() != 1) && Internal::IsAutomaticUnpackingOfPackedValuesDisabled())
+            LogicError("PackedValue::Unpack: Automatic unpacking of PackedValue objects is disabled"); // NOTE(review): only enforced when the layout has neither exactly 1 time step nor exactly 1 sequence - confirm '&&' is intended
+
+        if (m_isPacked)
+        {
+            ValuePtr valueObject;
+            auto dataType = m_packedData->GetDataType();
+            switch (dataType) // the helper is called with the matrix typed according to the element type
+            {
+            case DataType::Float:
+                valueObject = CompositeFunction::GetValueObjectFromCNTKImplMatrixAndMBLayout(m_sampleShape, *(m_packedData->GetMatrix<float>()), m_packedDataLayout, m_isReadOnly);
+                break;
+            case DataType::Double:
+                valueObject = CompositeFunction::GetValueObjectFromCNTKImplMatrixAndMBLayout(m_sampleShape, *(m_packedData->GetMatrix<double>()), m_packedDataLayout, m_isReadOnly);
+                break;
+            default:
+                LogicError("Unsupported DataType %s", DataTypeName(dataType));
+            }
+
+            m_data = valueObject->Data(); // take over the unpacked data and mask as this object's own
+            m_mask = valueObject->Mask();
+
+            m_packedData = nullptr; // drop the packed representation and mark the object as unpacked
+            m_packedDataLayout = nullptr;
+            m_isPacked = false;
+
+            if (m_unpackedShape != m_data->Shape()) // sanity check against the shape computed in the constructor
+                LogicError("The computed unpacked shape of the PackedValue object does not match the actual Data NDArrayView's shape after unpacking");
+        }
+    }
+
    // Explicit template instantiations
    template /*static*/ CNTK_API ValuePtr Value::Create<float>(const NDShape& sampleShape, const std::vector<std::vector<float>>& sequences, const DeviceDescriptor& device, bool readOnly/* = false*/);
    template /*static*/ CNTK_API ValuePtr Value::Create<double>(const NDShape& sampleShape, const std::vector<std::vector<double>>& sequences, const DeviceDescriptor& device, bool readOnly/* = false*/);
--- a/Source/CNTKv2LibraryDll/Value.h
+++ b/Source/CNTKv2LibraryDll/Value.h
@ -7,14 +7,112 @@

 #include "stdafx.h"
 #include "CNTKLibrary.h"
+#include "Sequences.h"
+#include "Utils.h"

 namespace CNTK
 {
-    class CNTKValue final : public Value
+    class PackedValue final : public Value
    {
+        template <typename T, typename ...CtorArgTypes>
+        friend inline std::shared_ptr<T> MakeSharedObject(CtorArgTypes&& ...ctorArgs);
+
    public:
+        template <typename ElementType>
+        PackedValue(const NDShape& sampleShape, const std::shared_ptr<Microsoft::MSR::CNTK::Matrix<ElementType>>& packedDataMatrix, const std::shared_ptr<Microsoft::MSR::CNTK::MBLayout>& packedDataLayout, bool isReadOnly)
+            : Value(nullptr), m_isPacked(true), m_sampleShape(sampleShape), m_packedData(nullptr), m_packedDataLayout(packedDataLayout), m_isReadOnly(isReadOnly)
+        {
+            NDShape packedMatrixShape({ packedDataMatrix->GetNumRows(), packedDataMatrix->GetNumCols() });
+            auto tensorView = new Microsoft::MSR::CNTK::TensorView<ElementType>(packedDataMatrix, AsTensorViewShape(packedMatrixShape));
+            m_packedData = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), AsDeviceDescriptor(packedDataMatrix->GetDeviceId()), AsStorageFormat(packedDataMatrix->GetFormat()), packedMatrixShape, m_isReadOnly, tensorView);
+
+            // Determine unpacked shape
+            m_unpackedShape = sampleShape;
+            if (packedDataLayout)
+                m_unpackedShape = m_unpackedShape.AppendShape({ packedDataLayout->GetNumTimeSteps(), packedDataLayout->GetNumSequences() });
+        }
+
+        void Unpack() const;
+
+        const NDShape& Shape() const override { return m_unpackedShape; }
+        DeviceDescriptor Device() const override { return m_isPacked ? m_packedData->Device() : Value::Device(); }
+        DataType GetDataType() const override { return m_isPacked ? m_packedData->GetDataType() : Value::GetDataType(); }
+        StorageFormat GetStorageFormat() const override { return m_isPacked? m_packedData->GetStorageFormat() : Value::GetStorageFormat(); }
+        bool IsReadOnly() const override { return m_isPacked ? m_packedData->IsReadOnly() : Value::IsReadOnly(); }
+
+        size_t MaskedCount() const override
+        {
+            if (m_isPacked)
+                // Compute the number of masked samples after the data will be unpacked
+                return m_packedDataLayout ? ((m_packedDataLayout->GetNumTimeSteps() * m_packedDataLayout->GetNumSequences()) - m_packedDataLayout->GetActualNumSamples()) : 0;
+            else
+                return Value::MaskedCount();
+        }
+
+        NDArrayViewPtr Data() const override
+        {
+            Unpack();
+            return Value::Data();
+        }
+
+        NDMaskPtr Mask() const override
+        {
+            Unpack();
+            return Value::Mask();
+        }
+
+        // Deep-copies this value. A still-packed value is cloned in packed form (data and layout are copied); an unpacked one is cloned by the base class.
+        ValuePtr DeepClone(bool readOnly) const override
+        {
+            if (m_isPacked) // NOTE(review): this branch clones with m_isReadOnly and does not yet honor 'readOnly' - confirm the intended behavior
+            {
+                std::shared_ptr<Microsoft::MSR::CNTK::MBLayout> packedLayoutCopy;
+                if (m_packedDataLayout)
+                {
+                    packedLayoutCopy = std::make_shared<Microsoft::MSR::CNTK::MBLayout>();
+                    packedLayoutCopy->CopyFrom(m_packedDataLayout);
+                }
+                return MakeSharedObject<PackedValue>(m_sampleShape, m_packedData->DeepClone(), packedLayoutCopy, m_isReadOnly);
+            }
+            return Value::DeepClone(readOnly); // forward the caller's flag instead of silently dropping it
+        }
+
+        ValuePtr Alias(bool /*readOnly = false*/) const override
+        {
+            LogicError("Alias is currently unsupported for PackedValue objects");
+        }
+
+        void CopyFrom(const Value& /*source*/) override
+        {
+            LogicError("CopyFrom is currently unsupported for PackedValue objects");
+        }
+
+        template <typename ElementType>
+        std::pair<std::shared_ptr<const Microsoft::MSR::CNTK::Matrix<ElementType>>, std::shared_ptr<Microsoft::MSR::CNTK::MBLayout>> PackedData()
+        {
+            if (!m_isPacked)
+                InvalidArgument("PackedValue::PackedData called on a Value object that has already been unpacked");
+
+            return { m_packedData->GetMatrix<ElementType>(), m_packedDataLayout };
+        }

    private:
+        PackedValue(const NDShape& sampleShape, const NDArrayViewPtr& packedData, const std::shared_ptr<Microsoft::MSR::CNTK::MBLayout>& packedDataLayout, bool isReadOnly)
+            : Value(nullptr), m_isPacked(true), m_sampleShape(sampleShape), m_packedData(packedData), m_packedDataLayout(packedDataLayout), m_isReadOnly(isReadOnly)
+        {
+            // Determine unpacked shape
+            m_unpackedShape = sampleShape;
+            if (packedDataLayout)
+                m_unpackedShape = m_unpackedShape.AppendShape({ packedDataLayout->GetNumTimeSteps(), packedDataLayout->GetNumSequences() });
+        }

+    private:
+        bool m_isReadOnly;
+        NDShape m_sampleShape;
+        NDShape m_unpackedShape;
+
+        mutable bool m_isPacked;
+        mutable NDArrayViewPtr m_packedData;
+        mutable std::shared_ptr<Microsoft::MSR::CNTK::MBLayout> m_packedDataLayout;
    };
 }
--- a/Source/CNTKv2LibraryDll/Variable.cpp
+++ b/Source/CNTKv2LibraryDll/Variable.cpp
@ -30,7 +30,7 @@ namespace CNTK
        if (varOwner)
            return CompositeFunction::Create(varOwner, varOwner->Name());
        else
-            return Internal::Combine({ *this });
+            return Combine({ *this });
    }

    NDArrayViewPtr Variable::Value() const
@ -70,14 +70,24 @@ namespace CNTK
    static const std::wstring KernelWidthAttributeName = L"kernelWidth";
    static const std::wstring KernelHeightAttributeName = L"kernelHeight";

-    ParameterInitializer UniformInitializer(double scale, unsigned long seed)
+    void Variable::VariableFields::SetValueInitialization(const ParameterInitializer& initializationConfig, const DeviceDescriptor& device)
    {
-        Dictionary initConfig;
-        initConfig[InitializerTypeAttributeName] = Microsoft::MSR::CNTK::UniformInitializerTypeName;
-        initConfig[ScaleAttributeName] = scale;
-        initConfig[RandomSeedAttributeName] = (size_t)seed;
+        if (m_value != nullptr)
+            LogicError("Value initialization config cannot be set if a value already exists");

-        return initConfig;
+        assert(!m_valueInitializer);
+        assert(!m_valueInitializationDevice);
+
+        if (initializationConfig.Contains(FilterRankAttributeName))
+        {
+            auto filterRank = (int)initializationConfig[FilterRankAttributeName].Value<size_t>();
+            auto outputRank = (int)initializationConfig[OutputRankAttributeName].Value<size_t>();
+            if ((filterRank + outputRank) > m_shape.Rank())
+                InvalidArgument("Sum of filter rank (%d) and output rank (%d) of the parameter initializer cannot exceed the Parameter's rank(%d)", filterRank, outputRank, (int)m_shape.Rank());
+        }
+
+        m_valueInitializer.reset(new ParameterInitializer(initializationConfig));
+        m_valueInitializationDevice.reset(new DeviceDescriptor(device));
    }

    static ParameterInitializer CreateInitializer(const std::wstring& initializerTypeName, int outputRank, int filterRank, double scale, unsigned long seed)
@ -92,6 +102,16 @@ namespace CNTK
        return initConfig;
    }

+    ParameterInitializer UniformInitializer(double scale, unsigned long seed) // returns a config dictionary holding the uniform initializer type name, the scale and the seed
+    {
+        Dictionary initConfig;
+        initConfig[InitializerTypeAttributeName] = Microsoft::MSR::CNTK::UniformInitializerTypeName;
+        initConfig[ScaleAttributeName] = scale;
+        initConfig[RandomSeedAttributeName] = (size_t)seed; // the seed is stored as a size_t
+
+        return initConfig;
+    }
+
    ParameterInitializer GaussianInitializer(int outputRank, int filterRank, double scale, unsigned long seed)
    {
        return CreateInitializer(Microsoft::MSR::CNTK::GaussianInitializerTypeName, outputRank, filterRank, scale, seed);
--- a/Source/Common/BestGpu.cpp
+++ b/Source/Common/BestGpu.cpp
@ -124,7 +124,7 @@ private:
 };

 static DEVICEID_TYPE s_bestDeviceId = DEVICEID_NOTYETDETERMINED;
-static BestGpu* s_bestGpu = nullptr;
+static std::unique_ptr<BestGpu> s_bestGpu = nullptr;

 // DeviceFromConfig - Parse 'deviceId' config parameter to determine what type of behavior is desired
 //Symbol - Meaning
@ -149,7 +149,7 @@ static DEVICEID_TYPE SelectDevice(DEVICEID_TYPE deviceId, bool bLockGPU, const i
            // GPU device to be auto-selected, so init our class
            if (s_bestGpu == nullptr)
            {
-                s_bestGpu = new BestGpu();
+                s_bestGpu = make_unique<BestGpu>();
                for (int i = 0; i < excludedDevices.size(); ++i)
                {
                    s_bestGpu->DisallowDevice(excludedDevices[i]);
@ -270,6 +270,8 @@ void BestGpu::GetCudaProperties()
    if (m_cudaData)
        return;

+    int currentDevice, rc;
+    rc = cudaGetDevice(&currentDevice);
    int dev = 0;

    for (ProcessorData* pd : m_procData)
@ -284,9 +286,16 @@ void BestGpu::GetCudaProperties()
        pd->cudaFreeMem = free;
        pd->cudaTotalMem = total;
        dev++;
-        cudaDeviceReset();
+        // cudaDeviceReset() explicitly destroys and cleans up all resources associated with the 
+        // current device in the current process.
+        // This results in a segmentation fault if it is called, for instance, after cudnnCreate but before cudnnDestroy.
+        // cudaDeviceReset();
    }
    m_cudaData = m_procData.size() > 0;
+    if (rc == CUDA_SUCCESS)
+    {
+        cudaSetDevice(currentDevice);
+    }
 }

 void BestGpu::Init()
@ -325,8 +334,11 @@ BestGpu::~BestGpu()

    if (m_nvmlData)
    {
-        // TODO: Check for error code and throw if !std::uncaught_exception()
-        nvmlShutdown();
+        nvmlReturn_t r = nvmlShutdown();
+        if ((r != NVML_SUCCESS) && !std::uncaught_exception())
+        {
+            RuntimeError("BestGPU Destructor: failed to shut down NVML. \n");
+        }
    }
 }

--- a/Source/Common/CrossProcessMutex.h
+++ b/Source/Common/CrossProcessMutex.h
@ -7,6 +7,10 @@
 #include <cassert>
 #include <string>

+#define CLOSEHANDLE_ERROR 0
+#define RELEASEMUTEX_ERROR 0
+#define FCNTL_ERROR -1
+
 #ifdef WIN32 // --- Windows version

 #define NOMINMAX
@ -46,7 +50,11 @@ public:
        if (::WaitForSingleObject(m_handle, wait ? INFINITE : 0) != WAIT_OBJECT_0)
        {
            // failed to acquire
-            ::CloseHandle(m_handle);
+            int rc = ::CloseHandle(m_handle);
+            if ((rc == CLOSEHANDLE_ERROR) && !std::uncaught_exception())
+            {
+                RuntimeError("Acquire: Handler close failure with error code %d", ::GetLastError());
+            }
            m_handle = NULL;
            return false;
        }
@ -58,9 +66,17 @@ public:
    void Release()
    {
        assert(m_handle != NULL);
-        // TODO: Check for error code and throw if !std::uncaught_exception()
-        ::ReleaseMutex(m_handle);
-        ::CloseHandle(m_handle);
+        // Release and close unconditionally, and forget the handle before reporting anything, so that a
+        // failed ReleaseMutex can neither leak the handle nor leave m_handle pointing at it.
+        const bool releaseFailed = (::ReleaseMutex(m_handle) == RELEASEMUTEX_ERROR);
+        const int releaseError = releaseFailed ? (int)::GetLastError() : 0; // captured right away: CloseHandle may overwrite the last error
+        const bool closeFailed = (::CloseHandle(m_handle) == CLOSEHANDLE_ERROR);
+        const int closeError = closeFailed ? (int)::GetLastError() : 0;
+        m_handle = NULL;
+        if (releaseFailed && !std::uncaught_exception())
+            RuntimeError("Mutex Release: Failed to release mutex %s: %d", m_name.c_str(), releaseError);
+        if (closeFailed && !std::uncaught_exception())
+            RuntimeError("Mutex Release: Failed to close handler %s: %d", m_name.c_str(), closeError);
        m_handle = NULL;
    }

@ -121,6 +137,8 @@ public:
    // Returns false if !wait and lock cannot be acquired, or in case of a system error that prevents us from acquiring the lock.
    bool Acquire(bool wait)
    {
+        mode_t mask = umask(0);
+
        assert(m_fd == -1);
        for (;;)
        {
@ -146,6 +164,7 @@ public:
            {
                // acquire failed
                close(fd);
+                umask(mask);
                return false;
            }
            // we own the exclusive lock on file descriptor, but we need to double-check
@ -165,6 +184,7 @@ public:
            {
                // lock acquired successfully
                m_fd = fd;
+                umask(mask);
                return true;
            }
        }
@ -181,8 +201,11 @@ public:
        m_lock.l_type = F_UNLCK;
        // Now removing the lock and closing the file descriptor
        // waiting processes will be notified
-        // TODO: Check for error code and throw if !std::uncaught_exception()
-        fcntl(m_fd, F_SETLKW, &m_lock);
+        if ((fcntl(m_fd, F_SETLKW, &m_lock) == FCNTL_ERROR) && !std::uncaught_exception())
+        {
+            close(m_fd); m_fd = -1; // close before throwing so the descriptor is not leaked on the error path
+            RuntimeError("Mutex Release: Failed to release mutex %s", m_fileName.c_str());
+        }
        close(m_fd);
        m_fd = -1;
    }
--- a/Source/Common/DataReader.cpp
+++ b/Source/Common/DataReader.cpp
@ -184,6 +184,23 @@ bool DataReader::SupportsDistributedMBRead() const
    return supportsDistributedMBRead;
 }

+// IsLegacyReader - Reports whether at least one of the wrapped readers is a legacy reader.
+bool DataReader::IsLegacyReader() const
+{
+    for (const auto& ioName : m_ioNames)
+    {
+        const auto readerPos = m_dataReaders.find(ioName);
+        assert(readerPos != m_dataReaders.end());
+
+        const bool isLegacy = readerPos->second->IsLegacyReader();
+        if (isLegacy)
+            return true;
+    }
+
+    // none of the wrapped readers reported itself as legacy
+    return false;
+}
+
 //StartDistributedMinibatchLoop - Startup a distributed minibatch loop for parallel training
 // mbSize - [in] size of the minibatch (number of frames, etc.)
 // epoch - [in] epoch number for this loop
@ -207,6 +224,13 @@ void DataReader::StartDistributedMinibatchLoop(size_t mbSize, size_t epoch, size
    }
 }

+size_t DataReader::GetCurrentSamplePosition()
+{
+    // BUGBUG: composition of old readers is not supported; only the last reader is queried.
+    // at() rather than operator[]: a missing name raises std::out_of_range instead of inserting a null entry that is then dereferenced.
+    return m_dataReaders.at(m_ioNames.back())->GetCurrentSamplePosition();
+}
+
 // GetMinibatch - Get the next minibatch (features and labels)
 // matrices - [in] a map with named matrix types (i.e. 'features', 'labels') mapped to the corresponding matrix,
 //             [out] each matrix resized if necessary containing data.
--- a/Source/Common/File.cpp
+++ b/Source/Common/File.cpp
@ -26,6 +26,8 @@
 #include <linux/limits.h> // for PATH_MAX
 #endif

+#define PCLOSE_ERROR -1
+
 namespace Microsoft { namespace MSR { namespace CNTK {

 // File creation
@ -255,17 +257,23 @@ bool File::IsTextBased()
 // Note: this does not check for errors when the File corresponds to pipe stream. In this case, use Flush() before closing a file you are writing.
 File::~File(void)
 {
+    int rc = 0;
    if (m_pcloseNeeded)
    {
-        // TODO: Check for error code and throw if !std::uncaught_exception()     
-        _pclose(m_file);
+        rc = _pclose(m_file);
+        if ((rc == PCLOSE_ERROR) && !std::uncaught_exception())
+        {
+            RuntimeError("File: failed to close file at %S", m_filename.c_str());
+        }
    }
    else if (m_file != stdin && m_file != stdout && m_file != stderr)
    {
-        int rc = fclose(m_file);
-        if ((rc != 0) && !std::uncaught_exception())
+        rc = fclose(m_file);
+        if ((rc != FCLOSE_SUCCESS) && !std::uncaught_exception())
+        {
            RuntimeError("File: failed to close file at %S", m_filename.c_str());
        }
+    }
 }

 void File::Flush()
--- a/Source/Common/Include/Basics.h
+++ b/Source/Common/Include/Basics.h
@ -26,6 +26,7 @@

 #define EPSILON 1e-5
 #define ISCLOSE(a, b, threshold) (abs(a - b) < threshold) ? true : false
+#define DLCLOSE_SUCCESS 0

 #define UNUSED(x) (void)(x) // for variables that are, e.g., only used in _DEBUG builds

@ -705,9 +706,14 @@ public:
    }
    ~Plugin()
    {
-        // TODO: Check for error code and throw if !std::uncaught_exception()
        if (handle != NULL)
-            dlclose(handle);
+        {
+            int rc = dlclose(handle);
+            if ((rc != DLCLOSE_SUCCESS) && !std::uncaught_exception())
+            {
+                RuntimeError("Plugin: Failed to decrements the reference count.");
+            }
+        }
    }
 };
 #endif
--- a/Source/Common/Include/DataReader.h
+++ b/Source/Common/Include/DataReader.h
@ -239,6 +239,18 @@ public:
        return false;
    };

+    // Returns true if this reader is built on the old (legacy) DataReader architecture; that is the default here.
+    virtual bool IsLegacyReader() const
+    {
+        return true;
+    };
+    
+    // Gets current sample position on the global timeline.
+    virtual size_t GetCurrentSamplePosition()
+    {
+        NOT_IMPLEMENTED;
+    }
+
    virtual void StartDistributedMinibatchLoop(size_t mbSize, size_t epoch, size_t subsetNum, size_t numSubsets, size_t requestedEpochSamples = requestDataSize)
    {
        if (SupportsDistributedMBRead() || (numSubsets != 1) || (subsetNum != 0))
@ -410,6 +422,8 @@ public:
    }
    virtual ~DataReader();

+    size_t GetCurrentSamplePosition() override;
+
    // StartMinibatchLoop - Startup a minibatch loop
    // mbSize - [in] size of the minibatch (number of frames, etc.)
    // epoch - [in] epoch number for this loop
@ -417,6 +431,7 @@ public:
    virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, size_t requestedEpochSamples = requestDataSize);

    virtual bool SupportsDistributedMBRead() const override;
+    virtual bool IsLegacyReader() const override;
    virtual void StartDistributedMinibatchLoop(size_t mbSize, size_t epoch, size_t subsetNum, size_t numSubsets, size_t requestedEpochSamples = requestDataSize) override;

    virtual void StartMinibatchLoop(size_t mbSize, size_t epoch, const std::unordered_set<InputStreamDescription>&, size_t requestedEpochSamples = requestDataSize) override;
--- a/Source/Common/Include/MPIWrapper.h
+++ b/Source/Common/Include/MPIWrapper.h
@ -14,12 +14,14 @@
 #endif
 #pragma comment(lib, "msmpi.lib")

-
+#include <errno.h> 
 #include <string>
 #include <array>
 #include <vector>
 #include <memory>

+#define FFLUSH_SUCCESS 0
+
 namespace Microsoft { namespace MSR { namespace CNTK {

 struct MpiFail : public std::string
@ -138,6 +140,14 @@ public:
        MPI_Comm_size(MPI_COMM_WORLD, &m_numMPINodes);
        m_numNodesInUse = m_numMPINodes;

+        // Verify that the environment variable used by GetTotalNumberOfMPINodes()
+        // matches what the MPI API says. There are two possible cases:
+        // 1) when we're running with mpiexec, both values have to match;
+        // 2) when we're running without mpiexec, the former will return 0, and
+        // the latter will be set to 1.
+        assert((GetTotalNumberOfMPINodes() == 0 && m_numNodesInUse == 1) ||
+                (GetTotalNumberOfMPINodes() == m_numNodesInUse));
+
        // Applying MPI workaround
        s_myRank = m_myRank;
        atexit(&MPIWrapper::MPIWorkaroundAtExit);
@ -160,19 +170,50 @@ public:
        ::Sleep((DWORD)(500 * CurrentNodeRank()));
    }

+    // Note: this function is deliberately written so that it does not require
+    // MPI initialization. Moreover, it can be used without actually loading any
+    // MPI libs.
+    // TODO: Once we move to dynamic loading for MPI libs on Linux, move it to utilities.
+    static int GetTotalNumberOfMPINodes()
+    {
+#ifdef WIN32
+        const char* p = std::getenv("PMI_SIZE");
+#else
+        const char* p = std::getenv("OMPI_COMM_WORLD_SIZE");
+#endif
+        if (!p)
+        {
+            return 0;
+        }
+        else
+        {
+            return std::stoi(string(p));
+        }
+    }
+
    // Note: we don't clear the sub-communication here although we should, because in case of a crash, this prevents the EXE from terminating.
    // It's OK since this class is a singleton anyway that gets instantiated exactly once at program startup.
    ~MPIWrapper()
    {
        fprintf(stderr, "~MPIWrapper\n");
-        fflush(stderr);
-        // TODO: Check for error code and throw if !std::uncaught_exception()

        // Do not finalize in event of an exception since calling MPI_Finalize without
        // all pending communications being finished results in a hang
+        int rc = fflush(stderr);
        if (!std::uncaught_exception())
+        {
+            if (rc != FFLUSH_SUCCESS)
+            {
+            #ifdef _WIN32
+                RuntimeError("MPIWrapper: Failed to flush stderr, %d", ::GetLastError());
+            #else
+                RuntimeError("MPIWrapper: Failed to flush stderr, %d", errno);
+            #endif
+            }
+
            MPI_Finalize();
        }
+    }

 private:
    void Ping(const char *msg) const
--- a/Показать больше
+++ b/Показать больше