This commit is contained in:
Frank Seide 2017-03-02 11:46:22 -08:00
Родитель c1413f5bd2 de6371bb0d
Коммит 736d2f006c
112 изменённых файлов: 3196 добавлений и 1359 удалений

1
.gitignore поставляемый
Просмотреть файл

@ -186,6 +186,7 @@ core
# prebuild file
Source/CNTK/buildinfo.h
Source/CNTK/buildinfo.h$$
Source/CNTKv2LibraryDll/buildinfo.h
# Unit test output
Tests/UnitTests/ReaderTests/Control/**/*_Output.txt

Просмотреть файл

@ -1497,6 +1497,15 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "V2LibraryEndToEndTests", "T
{E5606ECE-48CA-4464-BB12-09D81D02B9EF} = {E5606ECE-48CA-4464-BB12-09D81D02B9EF}
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Extensibility", "Extensibility", "{3BF56127-6F0F-41CF-BFCE-31165A0A5E73}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "CPP", "CPP", "{7A27E076-296E-41A8-BA76-164071251372}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CPPExtensibilityExamples", "Examples\Extensibility\CPP\CPPExtensibilityExamples.vcxproj", "{40A8CC31-8C08-4156-AE08-E8C0FADC3509}"
ProjectSection(ProjectDependencies) = postProject
{E5606ECE-48CA-4464-BB12-09D81D02B9EF} = {E5606ECE-48CA-4464-BB12-09D81D02B9EF}
EndProjectSection
EndProject
Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "PythonExamples", "Examples\PythonExamples.pyproj", "{292FF4EE-D9DD-4BA7-85F7-6A22148D1E01}"
EndProject
Global
@ -1965,6 +1974,16 @@ Global
{743FC7AA-3884-4C96-983A-A33FD6C56227}.Release_NoOpt|x64.Build.0 = Release_NoOpt|x64
{743FC7AA-3884-4C96-983A-A33FD6C56227}.Release|x64.ActiveCfg = Release|x64
{743FC7AA-3884-4C96-983A-A33FD6C56227}.Release|x64.Build.0 = Release|x64
{40A8CC31-8C08-4156-AE08-E8C0FADC3509}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64
{40A8CC31-8C08-4156-AE08-E8C0FADC3509}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64
{40A8CC31-8C08-4156-AE08-E8C0FADC3509}.Debug|x64.ActiveCfg = Debug|x64
{40A8CC31-8C08-4156-AE08-E8C0FADC3509}.Debug|x64.Build.0 = Debug|x64
{40A8CC31-8C08-4156-AE08-E8C0FADC3509}.Release_CpuOnly|x64.ActiveCfg = Release_CpuOnly|x64
{40A8CC31-8C08-4156-AE08-E8C0FADC3509}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
{40A8CC31-8C08-4156-AE08-E8C0FADC3509}.Release_NoOpt|x64.ActiveCfg = Release_NoOpt|x64
{40A8CC31-8C08-4156-AE08-E8C0FADC3509}.Release_NoOpt|x64.Build.0 = Release_NoOpt|x64
{40A8CC31-8C08-4156-AE08-E8C0FADC3509}.Release|x64.ActiveCfg = Release|x64
{40A8CC31-8C08-4156-AE08-E8C0FADC3509}.Release|x64.Build.0 = Release|x64
{292FF4EE-D9DD-4BA7-85F7-6A22148D1E01}.Debug_CpuOnly|x64.ActiveCfg = Debug|Any CPU
{292FF4EE-D9DD-4BA7-85F7-6A22148D1E01}.Debug|x64.ActiveCfg = Debug|Any CPU
{292FF4EE-D9DD-4BA7-85F7-6A22148D1E01}.Release_CpuOnly|x64.ActiveCfg = Release|Any CPU
@ -2176,6 +2195,9 @@ Global
{5CC403B9-2405-4FFB-A73B-DAE0DC986C76} = {CE223840-1DEE-4849-B530-F06BEE05BAA8}
{D771A06D-CC25-4582-B5CD-D2A4782BB005} = {05E45AF7-C069-4057-BC16-0A532D068CE4}
{743FC7AA-3884-4C96-983A-A33FD6C56227} = {43ED3FD0-824C-4201-BD96-B824DF959ADC}
{3BF56127-6F0F-41CF-BFCE-31165A0A5E73} = {47755F2E-D674-4175-9E38-8EA053455072}
{7A27E076-296E-41A8-BA76-164071251372} = {3BF56127-6F0F-41CF-BFCE-31165A0A5E73}
{40A8CC31-8C08-4156-AE08-E8C0FADC3509} = {7A27E076-296E-41A8-BA76-164071251372}
{292FF4EE-D9DD-4BA7-85F7-6A22148D1E01} = {47755F2E-D674-4175-9E38-8EA053455072}
EndGlobalSection
EndGlobal

Просмотреть файл

@ -0,0 +1,125 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- MSBuild project for the CPPExtensibilityExamples console app: builds Main.cpp
     against the CNTK v2 C++ API (CNTKLibrary-2.0.lib). Shared build settings come
     from the solution-level CNTK.Cpp.props import below, which defines the
     $(DebugBuild)/$(ReleaseBuild)/$(CpuOnlyBuild) switches used in the conditions. -->
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<!-- Five x64-only configurations; the *_CpuOnly ones build without GPU support. -->
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release_NoOpt|x64">
<Configuration>Release_NoOpt</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Debug_CpuOnly|x64">
<Configuration>Debug_CpuOnly</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release_CpuOnly|x64">
<Configuration>Release_CpuOnly</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<ItemGroup>
<ClCompile Include="Main.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="UserMatrixMultiplicationOp.h" />
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{40A8CC31-8C08-4156-AE08-E8C0FADC3509}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>CPPExtensibilityExamples</RootNamespace>
<ProjectName>CPPExtensibilityExamples</ProjectName>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<!-- Solution-wide CNTK C++ defaults; defines DebugBuild/ReleaseBuild/CpuOnlyBuild. -->
<Import Project="$(SolutionDir)\CNTK.Cpp.props" />
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings" />
<ImportGroup Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="$(DebugBuild)">
<LinkIncremental>true</LinkIncremental>
</PropertyGroup>
<PropertyGroup Condition="$(ReleaseBuild)">
<LinkIncremental>false</LinkIncremental>
<ExecutablePath>$(ExecutablePath)</ExecutablePath>
</PropertyGroup>
<!-- Settings common to all configurations: headers from the v2 library API,
     libraries from the build output directory. -->
<ItemDefinitionGroup>
<ClCompile>
<AdditionalIncludeDirectories>$(SolutionDir)Source\CNTKv2LibraryDll\API</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<AdditionalLibraryDirectories>$(OutDir);$(SolutionDir)$(Platform)\$(Configuration)</AdditionalLibraryDirectories>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(DebugBuild)">
<ClCompile>
<PrecompiledHeader>
</PrecompiledHeader>
<WarningLevel>Level4</WarningLevel>
<Optimization>Disabled</Optimization>
<PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>true</SDLCheck>
<TreatWarningAsError>true</TreatWarningAsError>
<AdditionalOptions>/bigobj %(AdditionalOptions)</AdditionalOptions>
<RuntimeLibrary Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">MultiThreadedDebug</RuntimeLibrary>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>CNTKLibrary-2.0.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(ReleaseBuild)">
<ClCompile>
<WarningLevel>Level4</WarningLevel>
<PrecompiledHeader>
</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>true</SDLCheck>
<FavorSizeOrSpeed>Speed</FavorSizeOrSpeed>
<AdditionalOptions>/d2Zi+ %(AdditionalOptions)</AdditionalOptions>
<TreatWarningAsError>true</TreatWarningAsError>
<RuntimeLibrary Condition="'$(Configuration)|$(Platform)'=='Release|x64'">MultiThreaded</RuntimeLibrary>
<RuntimeLibrary Condition="'$(Configuration)|$(Platform)'=='Release_NoOpt|x64'">MultiThreaded</RuntimeLibrary>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>CNTKLibrary-2.0.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<!-- CPU-only builds add the CPUONLY define on top of the Debug/Release groups above. -->
<ItemDefinitionGroup Condition="$(CpuOnlyBuild)">
<ClCompile>
<PreprocessorDefinitions>CPUONLY;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<RuntimeLibrary Condition="'$(Configuration)|$(Platform)'=='Debug_CpuOnly|x64'">MultiThreadedDebug</RuntimeLibrary>
<RuntimeLibrary Condition="'$(Configuration)|$(Platform)'=='Release_CpuOnly|x64'">MultiThreaded</RuntimeLibrary>
</ClCompile>
</ItemDefinitionGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>

Просмотреть файл

@ -0,0 +1,27 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- Visual Studio filter file: maps this project's files into the standard
     Source Files / Header Files / Resource Files groups in Solution Explorer.
     Purely cosmetic for the IDE; does not affect the build. -->
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
<Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
<Extensions>h;hh;hpp;hxx;hm;inl;inc;xsd</Extensions>
</Filter>
<Filter Include="Resource Files">
<UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
<Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="Main.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="UserMatrixMultiplicationOp.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>
</Project>

Просмотреть файл

@ -0,0 +1,6 @@
// Entry point for the CPP extensibility examples: runs the user-defined
// matrix-multiplication op example implemented in UserMatrixMultiplicationOp.h.
#include "UserMatrixMultiplicationOp.h"

// BUG FIX: 'void main()' is non-conforming C++; the standard requires main to
// return int. Returning 0 signals success to the calling environment.
int main()
{
    UserTimesFunctionExample();
    return 0;
}

Просмотреть файл

@ -0,0 +1,187 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "CNTKLibrary.h"

#include <stdexcept>
using namespace CNTK;
// User-defined CNTK Function implementing a dense float matrix product
// (left * right) with a CPU reference implementation of forward and of the
// gradient w.r.t. the left operand. Demonstrates the v2 extensibility API.
class UserTimesFunction final : public Function
{
public:
    // Factory: wraps a new UserTimesFunction in a composite so it can be
    // combined with built-in CNTK functions.
    static FunctionPtr Create(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name)
    {
        return AsComposite(MakeSharedObject<UserTimesFunction>(leftOperand, rightOperand, name));
    }

    UserTimesFunction(const Variable& leftOperand, const Variable& rightOperand, const std::wstring& name)
        : Function({ leftOperand, rightOperand }, Dictionary(), name)
    {}

private:
    // Naive triple-loop float GEMM: outputMatrix += leftMatrix * rightMatrix,
    // with rightMatrix optionally read transposed. All operands must be dense;
    // values are laid out in column-major order.
    static void MatrixMultiply(const NDArrayViewPtr& leftMatrix, const NDArrayViewPtr& rightMatrix, NDArrayViewPtr& outputMatrix, bool transposeRight = false)
    {
        // First and last shape dimensions are rows and columns respectively.
        auto GetNumRowsAndCols = [](const NDShape& shape, bool transpose = false) {
            auto numRows = shape[0];
            auto numCols = shape[shape.Rank() - 1];
            if (transpose)
                std::swap(numRows, numCols);

            return std::make_pair(numRows, numCols);
        };

        size_t leftNumRows, leftNumCols;
        std::tie(leftNumRows, leftNumCols) = GetNumRowsAndCols(leftMatrix->Shape());

        size_t rightNumRows, rightNumCols;
        std::tie(rightNumRows, rightNumCols) = GetNumRowsAndCols(rightMatrix->Shape(), transposeRight);

        auto numOutRows = leftNumRows;
        auto K = leftNumCols; // shared (inner) dimension
        auto numOutCols = rightNumCols;

        assert(!leftMatrix->IsSparse() && !rightMatrix->IsSparse() && !outputMatrix->IsSparse());
        assert(K == rightNumRows);
        assert((outputMatrix->Shape()[0] == numOutRows) && (outputMatrix->Shape()[1] == numOutCols));

        // Zero the accumulator before the += loop below.
        outputMatrix->SetValue(0.0f);

        // The operands' values are in column-major layout: element (row, col)
        // lives at col * numRows + row.
        auto Offset = [](size_t rowIdx, size_t colIdx, const NDShape& matrixShape, bool transpose = false) {
            if (transpose)
                std::swap(rowIdx, colIdx);

            return (colIdx * matrixShape[0]) + rowIdx;
        };

        auto leftBuffer = leftMatrix->DataBuffer<float>();
        auto rightBuffer = rightMatrix->DataBuffer<float>();
        auto outBuffer = outputMatrix->WritableDataBuffer<float>();
        for (size_t j = 0; j < numOutCols; ++j)
            for (size_t k = 0; k < K; ++k)
                for (size_t i = 0; i < numOutRows; ++i)
                    outBuffer[Offset(i, j, outputMatrix->Shape())] += leftBuffer[Offset(i, k, leftMatrix->Shape())] * rightBuffer[Offset(k, j, rightMatrix->Shape(), transposeRight)];
    }

    // Forward pass: output = left * right. Lazily allocates the output Value
    // and saves the right operand's Value for use in Backward.
    BackPropStatePtr Forward(const std::vector<ValuePtr>& inputValues,
                             std::unordered_map<Variable, ValuePtr>& outputs,
                             const DeviceDescriptor& computeDevice,
                             const std::unordered_set<Variable>& /*outputsToRetainBackwardStateFor*/) override
    {
        auto leftOperandData = inputValues[0]->Data();
        auto rightOperandData = inputValues[1]->Data();

        // Allocate outputValue if needed
        auto& outputValue = outputs[this->Output()];
        if (outputValue == nullptr)
        {
            auto numOutRows = leftOperandData->Shape()[0];
            auto numOutCols = rightOperandData->Shape()[rightOperandData->Shape().Rank() - 1];
            outputValue = MakeSharedObject<Value>(MakeSharedObject<NDArrayView>(DataType::Float, NDShape({ numOutRows , numOutCols }), computeDevice));
        }

        auto outputData = outputValue->Data();
        MatrixMultiply(leftOperandData, rightOperandData, outputData);

        // Let's save the right input's Value in the BackPropState to be used in the backward pass for computing gradients
        return MakeSharedObject<BackPropState>(this->shared_from_this(), computeDevice, std::unordered_map<Variable, ValuePtr>({ {Inputs()[1], inputValues[1] } }));
    }

    // Backward pass: gradient w.r.t. the left operand only, computed as
    // rootGradient * right^T. Gradient w.r.t. the right operand is unsupported.
    void Backward(const BackPropStatePtr& state,
                  const std::unordered_map<Variable, ValuePtr>& rootGradientValues,
                  std::unordered_map<Variable, ValuePtr>& backPropagatedGradientValuesForInputs) override
    {
        auto leftInputVariable = Inputs()[0];
        auto rightInputVariable = Inputs()[1];
        // BUG FIX: the exception was previously constructed but never thrown,
        // silently ignoring an unsupported gradient request.
        if (backPropagatedGradientValuesForInputs.find(rightInputVariable) != backPropagatedGradientValuesForInputs.end())
            throw std::runtime_error("UserTimesFunction does not support computing gradient wrt right operand");

        auto rightInputData = state->SavedForwardPropValues().at(rightInputVariable)->Data();

        // Allocate input gradient Value if needed
        auto& inputGradientValue = backPropagatedGradientValuesForInputs[leftInputVariable];
        if (inputGradientValue == nullptr)
            inputGradientValue = MakeSharedObject<Value>(MakeSharedObject<NDArrayView>(DataType::Float, leftInputVariable.Shape(), state->Device()));

        auto rootGradientData = rootGradientValues.at(this->Output())->Data();
        auto inputGradientData = inputGradientValue->Data();
        MatrixMultiply(rootGradientData, rightInputData, inputGradientData, /*transposeRight =*/ true);
    }

    const std::wstring& OpName() const override
    {
        static const std::wstring opName = L"UserTimesOp";
        return opName;
    }

    Dictionary Serialize() const override { NOT_IMPLEMENTED; }
    size_t CurrentVersion() const override { NOT_IMPLEMENTED; }

    // Validates operand shapes and declares a single 1D output of size
    // leftOperand.Shape()[0], inheriting the right operand's dynamic axes.
    void InferOutputs(std::vector<Variable>& outputs) override
    {
        auto leftOperand = Inputs()[0];
        auto rightOperand = Inputs()[1];

        // BUG FIX: all three validation errors below were constructed but
        // never thrown, so invalid operands were silently accepted.
        if (leftOperand.Shape().Rank() != 2)
            throw std::runtime_error("Left operand must be 2D");

        if (rightOperand.Shape().Rank() != 1)
            throw std::runtime_error("Right operand must be 1D");

        if (!leftOperand.DynamicAxes().empty())
            throw std::runtime_error("Left operand must not have dynamic axes (i.e. should not be minibatch data, but be a Parameter of fixed size)");

        outputs.push_back(OutputVariable(NDShape({ leftOperand.Shape()[0] }), leftOperand.GetDataType(), rightOperand.DynamicAxes()));
    }
};
#pragma warning(push)
#pragma warning(disable: 4459)

// End-to-end check of UserTimesFunction against CNTK's built-in Times op:
// runs Forward and Backward through both implementations on the same random
// batch and verifies that the outputs and the W-gradients agree within
// tolerance. Throws std::runtime_error on mismatch.
void UserTimesFunctionExample()
{
    auto device = DeviceDescriptor::CPUDevice();
    size_t outDim = 15;
    size_t inDim = 10;
    auto W = Parameter(NDShape({ outDim, inDim }), DataType::Float, GlorotUniformInitializer(), device);
    auto x = InputVariable(NDShape({ inDim }), DataType::Float, { Axis::DefaultBatchAxis() });
    auto userDefinedTimes = UserTimesFunction::Create(W, x, L"UserDefinedTimes");

    // Random input batch in [0, 1]; root gradient of all ones.
    size_t batchSize = 3;
    std::vector<float> inputData(inDim * batchSize);
    for (size_t i = 0; i < inputData.size(); ++i)
        inputData[i] = (float)rand() / RAND_MAX;

    auto inputDataValue = Value::CreateBatch(x.Shape(), inputData, device);

    std::vector<float> rootGradientData(outDim * batchSize, 1);
    auto rootGradientValue = Value::CreateBatch(userDefinedTimes->Output().Shape(), rootGradientData, device);

    std::unordered_map<Variable, ValuePtr> outputValues = { { userDefinedTimes->Output(), nullptr } };
    auto backPropState = userDefinedTimes->Forward({ { x, inputDataValue } }, outputValues, device, { userDefinedTimes->Output() });
    std::unordered_map<Variable, ValuePtr> inputGradientValues = { { W, nullptr } };
    userDefinedTimes->Backward(backPropState, { { userDefinedTimes->Output(), rootGradientValue } }, inputGradientValues);
    auto userDefinedTimesOutputValue = outputValues[userDefinedTimes->Output()];
    auto userDefinedTimesInputGradientValue = inputGradientValues[W];

    // Compare against the CNTK built-in implementation
    auto builtInTimes = Times(W, x, L"BuiltInTimes");
    outputValues = { { builtInTimes->Output(), nullptr } };
    backPropState = builtInTimes->Forward({ { x, inputDataValue } }, outputValues, device, { builtInTimes->Output() });
    inputGradientValues = { { W, nullptr } };
    builtInTimes->Backward(backPropState, { { builtInTimes->Output(), rootGradientValue } }, inputGradientValues);
    auto builtInTimesOutputValue = outputValues[builtInTimes->Output()];
    auto builtInTimesInputGradientValue = inputGradientValues[W];

    const double relativeTolerance = 0.001f;
    const double absoluteTolerance = 0.000001f;

    // BUG FIX: both exceptions below were constructed but never thrown, so
    // mismatches went unreported; the second message also wrongly said
    // "Forward" while checking the Backward (gradient) result.
    if (!Internal::AreEqual(*userDefinedTimesOutputValue, *builtInTimesOutputValue, relativeTolerance, absoluteTolerance))
        throw std::runtime_error("UserTimesOp's Forward result does not match built-in result");

    if (!Internal::AreEqual(*userDefinedTimesInputGradientValue, *builtInTimesInputGradientValue, relativeTolerance, absoluteTolerance))
        throw std::runtime_error("UserTimesOp's Backward result does not match built-in result");
}

#pragma warning(pop)

Просмотреть файл

@ -13,6 +13,7 @@ import cntk
import _cntk_py
from cntk.utils import *
from cntk.training_session import *
from cntk.ops import *
from cntk.distributed import data_parallel_distributed_learner, Communicator
from cntk.io import ImageDeserializer, MinibatchSource, StreamDef, StreamDefs, FULL_DATA_SWEEP
@ -140,7 +141,7 @@ def create_alexnet():
}
# Create trainer
def create_trainer(network, epoch_size, num_quantization_bits):
def create_trainer(network, epoch_size, num_quantization_bits, printer):
# Set learning parameters
lr_per_mb = [0.01]*25 + [0.001]*25 + [0.0001]*25 + [0.00001]*25 + [0.000001]
lr_schedule = cntk.learning_rate_schedule(lr_per_mb, unit=cntk.learner.UnitType.minibatch, epoch_size=epoch_size)
@ -156,10 +157,10 @@ def create_trainer(network, epoch_size, num_quantization_bits):
distributed_after=0)
# Create trainer
return cntk.Trainer(network['output'], (network['ce'], network['pe']), parameter_learner)
return cntk.Trainer(network['output'], (network['ce'], network['pe']), parameter_learner, printer)
# Train and test
def train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore):
def train_and_test(network, trainer, train_source, test_source, minibatch_size, epoch_size, restore):
# define mapping from intput streams to network inputs
input_map = {
@ -167,23 +168,15 @@ def train_and_test(network, trainer, train_source, test_source, progress_printer
network['label']: train_source.streams.labels
}
training_session = cntk.training_session(
training_minibatch_source = train_source,
trainer = trainer,
model_inputs_to_mb_source_mapping = input_map,
mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
progress_printer = progress_printer,
# checkpoint_frequency = epoch_size,
checkpoint_filename = os.path.join(model_path, model_name),
# save_all_checkpoints = True,
progress_frequency = epoch_size,
cv_source = test_source,
cv_mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
# cv_frequency = epoch_size,
restore = restore)
# Train all minibatches
training_session.train()
# Train all minibatches
training_session(
trainer=trainer, mb_source = train_source,
var_to_stream = input_map,
mb_size = minibatch_size,
progress_frequency=epoch_size,
checkpoint_config = CheckpointConfig(filename=os.path.join(model_path, model_name), restore=restore),
cv_config= CrossValidationConfig(source=test_source, mb_size=minibatch_size)
).train()
# Train and evaluate the network.
def alexnet_train_and_eval(train_data, test_data, num_quantization_bits=32, minibatch_size=256, epoch_size = 1281167, max_epochs=112,
@ -199,10 +192,10 @@ def alexnet_train_and_eval(train_data, test_data, num_quantization_bits=32, mini
num_epochs=max_epochs)
network = create_alexnet()
trainer = create_trainer(network, epoch_size, num_quantization_bits)
trainer = create_trainer(network, epoch_size, num_quantization_bits, progress_printer)
train_source = create_image_mb_source(train_data, True, total_number_of_samples=max_epochs * epoch_size)
test_source = create_image_mb_source(test_data, False, total_number_of_samples=FULL_DATA_SWEEP)
train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore)
train_and_test(network, trainer, train_source, test_source, minibatch_size, epoch_size, restore)
if __name__=='__main__':

Просмотреть файл

@ -12,6 +12,7 @@ import numpy as np
import cntk
import _cntk_py
import cntk.io.transforms as xforms
from cntk.training_session import *
# default Paths relative to current python file.
abs_path = os.path.dirname(os.path.abspath(__file__))
@ -90,7 +91,7 @@ def create_conv_network():
# Create trainer
def create_trainer(network, epoch_size, num_quantization_bits, block_size, warm_up):
def create_trainer(network, epoch_size, num_quantization_bits, block_size, warm_up, progress_writers):
# Set learning parameters
lr_per_sample = [0.0015625]*20 + [0.00046875]*20 + [0.00015625]*20 + [0.000046875]*10 + [0.000015625]
lr_schedule = cntk.learning_rate_schedule(lr_per_sample, unit=cntk.learner.UnitType.sample, epoch_size=epoch_size)
@ -112,10 +113,10 @@ def create_trainer(network, epoch_size, num_quantization_bits, block_size, warm_
parameter_learner = cntk.distributed.data_parallel_distributed_learner(local_learner, num_quantization_bits=num_quantization_bits, distributed_after=warm_up)
# Create trainer
return cntk.Trainer(network['output'], (network['ce'], network['pe']), parameter_learner)
return cntk.Trainer(network['output'], (network['ce'], network['pe']), parameter_learner, progress_writers)
# Train and test
def train_and_test(network, trainer, train_source, test_source, progress_writers, minibatch_size, epoch_size, restore, profiling=False):
def train_and_test(network, trainer, train_source, test_source, minibatch_size, epoch_size, restore, profiling=False):
# define mapping from intput streams to network inputs
input_map = {
@ -123,26 +124,20 @@ def train_and_test(network, trainer, train_source, test_source, progress_writers
network['label']: train_source.streams.labels
}
training_session = cntk.training_session(
training_minibatch_source = train_source,
trainer = trainer,
model_inputs_to_mb_source_mapping = input_map,
mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
progress_printer = progress_writers,
checkpoint_frequency = epoch_size,
checkpoint_filename = os.path.join(model_path, "ConvNet_CIFAR10_DataAug"),
# save_all_checkpoints = False,
progress_frequency=epoch_size,
cv_source = test_source,
cv_mb_size_schedule=cntk.minibatch_size_schedule(minibatch_size),
# cv_frequency = epoch_size,
restore=restore)
# Train all minibatches
if profiling:
cntk.start_profiler(sync_gpu=True)
training_session.train()
training_session(
trainer=trainer, mb_source = train_source,
var_to_stream = input_map,
mb_size = minibatch_size,
progress_frequency=epoch_size,
checkpoint_config = CheckpointConfig(frequency = epoch_size,
filename = os.path.join(model_path, "ConvNet_CIFAR10_DataAug"),
restore = restore),
cv_config = CrossValidationConfig(source = test_source, mb_size=minibatch_size)
).train()
if profiling:
cntk.stop_profiler()
@ -169,10 +164,10 @@ def convnet_cifar10_dataaug(train_data, test_data, mean_data, minibatch_size=64,
rank=cntk.distributed.Communicator.rank(),
model=network['output'])
trainer = create_trainer(network, epoch_size, num_quantization_bits, block_size, warm_up)
trainer = create_trainer(network, epoch_size, num_quantization_bits, block_size, warm_up, [progress_printer, tensorboard_writer])
train_source = create_image_mb_source(train_data, mean_data, train=True, total_number_of_samples=max_epochs * epoch_size)
test_source = create_image_mb_source(test_data, mean_data, train=False, total_number_of_samples=cntk.io.FULL_DATA_SWEEP)
train_and_test(network, trainer, train_source, test_source, [progress_printer, tensorboard_writer], minibatch_size,
train_and_test(network, trainer, train_source, test_source, minibatch_size,
epoch_size, restore, profiling)

Просмотреть файл

@ -7,12 +7,13 @@
import numpy as np
import sys
import os
from cntk import Trainer, training_session, minibatch_size_schedule
from cntk import Trainer, minibatch_size_schedule
from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs, INFINITELY_REPEAT, FULL_DATA_SWEEP
from cntk.device import cpu, set_default_device
from cntk.learner import sgd, learning_rate_schedule, UnitType
from cntk.ops import input_variable, cross_entropy_with_softmax, classification_error, relu, element_times, constant
from cntk.utils import ProgressPrinter
from cntk.training_session import *
abs_path = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.join(abs_path, "..", "..", "..", "..", "common"))
@ -65,8 +66,6 @@ def simple_mnist():
}
lr_per_minibatch=learning_rate_schedule(0.2, UnitType.minibatch)
# Instantiate the trainer object to drive the model training
trainer = Trainer(z, (ce, pe), sgd(z.parameters, lr=lr_per_minibatch))
# Get minibatches of images to train with and perform model training
minibatch_size = 64
@ -79,16 +78,17 @@ def simple_mnist():
tag='Training',
num_epochs=num_sweeps_to_train_with)
session = training_session(
training_minibatch_source = reader_train,
trainer = trainer,
mb_size_schedule = minibatch_size_schedule(minibatch_size),
progress_printer = progress_printer,
model_inputs_to_mb_source_mapping = input_map,
progress_frequency = num_samples_per_sweep,
max_training_samples = num_samples_per_sweep * num_sweeps_to_train_with)
session.train()
# Instantiate the trainer object to drive the model training
trainer = Trainer(z, (ce, pe), sgd(z.parameters, lr=lr_per_minibatch), progress_printer)
training_session(
trainer=trainer,
mb_source = reader_train,
mb_size = minibatch_size,
var_to_stream = input_map,
max_samples = num_samples_per_sweep * num_sweeps_to_train_with,
progress_frequency=num_samples_per_sweep
).train()
# Load test data
path = os.path.normpath(os.path.join(data_dir, "Test-28x28_cntk_text.txt"))

Просмотреть файл

@ -18,6 +18,7 @@ from cntk.learner import momentum_sgd, learning_rate_schedule, momentum_as_time_
from _cntk_py import set_computation_network_trace_level
from cntk.device import set_default_device, gpu
from cntk.distributed import data_parallel_distributed_learner, block_momentum_distributed_learner, Communicator
from cntk.training_session import *
from resnet_models import *
@ -66,7 +67,7 @@ def create_resnet_network(network_name):
# Create trainer
def create_trainer(network, minibatch_size, epoch_size, num_quantization_bits, block_size, warm_up):
def create_trainer(network, minibatch_size, epoch_size, num_quantization_bits, block_size, warm_up, progress_printer):
if network['name'] == 'resnet20':
lr_per_mb = [1.0]*80+[0.1]*40+[0.01]
elif network['name'] == 'resnet110':
@ -94,10 +95,10 @@ def create_trainer(network, minibatch_size, epoch_size, num_quantization_bits, b
else:
learner = data_parallel_distributed_learner(local_learner, num_quantization_bits=num_quantization_bits, distributed_after=warm_up)
return Trainer(network['output'], (network['ce'], network['pe']), learner)
return Trainer(network['output'], (network['ce'], network['pe']), learner, progress_printer)
# Train and test
def train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, profiling=False):
def train_and_test(network, trainer, train_source, test_source, minibatch_size, epoch_size, profiling=False):
# define mapping from intput streams to network inputs
input_map = {
@ -105,23 +106,17 @@ def train_and_test(network, trainer, train_source, test_source, progress_printer
network['label']: train_source.streams.labels
}
training_session = cntk.training_session(
training_minibatch_source = train_source,
trainer = trainer,
mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
progress_printer = progress_printer,
model_inputs_to_mb_source_mapping = input_map,
checkpoint_frequency = epoch_size,
checkpoint_filename="ResNet_CIFAR10_DataAug",
progress_frequency=epoch_size,
cv_source=test_source,
cv_mb_size_schedule=cntk.minibatch_size_schedule(16),
restore=False)
if profiling:
start_profiler(sync_gpu=True)
training_session.train()
training_session(
trainer=trainer, mb_source = train_source,
mb_size = minibatch_size,
var_to_stream = input_map,
checkpoint_config = CheckpointConfig(frequency=epoch_size, filename="ResNet_CIFAR10_DataAug", restore=False),
progress_frequency=epoch_size,
cv_config = CrossValidationConfig(source=test_source, mb_size=16)
).train()
if profiling:
stop_profiler()
@ -146,10 +141,10 @@ def resnet_cifar10(train_data, test_data, mean_data, network_name, epoch_size, n
num_epochs=max_epochs)
network = create_resnet_network(network_name)
trainer = create_trainer(network, minibatch_size, epoch_size, num_quantization_bits, block_size, warm_up)
trainer = create_trainer(network, minibatch_size, epoch_size, num_quantization_bits, block_size, warm_up, progress_printer)
train_source = create_image_mb_source(train_data, mean_data, train=True, total_number_of_samples=max_epochs * epoch_size)
test_source = create_image_mb_source(test_data, mean_data, train=False, total_number_of_samples=cntk.io.FULL_DATA_SWEEP)
train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, profiling)
train_and_test(network, trainer, train_source, test_source, minibatch_size, epoch_size, profiling)
if __name__=='__main__':

Просмотреть файл

@ -18,6 +18,7 @@ from cntk.distributed import data_parallel_distributed_learner, Communicator
from cntk.io import ImageDeserializer, MinibatchSource, StreamDef, StreamDefs, FULL_DATA_SWEEP
from cntk.layers import Placeholder, Block, Convolution2D, Activation, MaxPooling, Dense, Dropout, default_options, Sequential, For
from cntk.initializer import normal
from cntk.training_session import *
# default Paths relative to current python file.
abs_path = os.path.dirname(os.path.abspath(__file__))
@ -131,7 +132,7 @@ def create_vgg16():
}
# Create trainer
def create_trainer(network, epoch_size, num_quantization_bits):
def create_trainer(network, epoch_size, num_quantization_bits, progress_printer):
# Set learning parameters
lr_per_mb = [0.01]*20 + [0.001]*20 + [0.0001]*20 + [0.00001]*10 + [0.000001]
lr_schedule = cntk.learning_rate_schedule(lr_per_mb, unit=cntk.learner.UnitType.minibatch, epoch_size=epoch_size)
@ -147,10 +148,10 @@ def create_trainer(network, epoch_size, num_quantization_bits):
distributed_after=0)
# Create trainer
return cntk.Trainer(network['output'], (network['ce'], network['pe']), parameter_learner)
return cntk.Trainer(network['output'], (network['ce'], network['pe']), parameter_learner, progress_printer)
# Train and test
def train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore):
def train_and_test(network, trainer, train_source, test_source, minibatch_size, epoch_size, restore):
# define mapping from intput streams to network inputs
input_map = {
@ -158,23 +159,17 @@ def train_and_test(network, trainer, train_source, test_source, progress_printer
network['label']: train_source.streams.labels
}
training_session = cntk.training_session(
training_minibatch_source = train_source,
trainer = trainer,
model_inputs_to_mb_source_mapping = input_map,
mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
progress_printer = progress_printer,
# checkpoint_frequency = epoch_size,
checkpoint_filename = os.path.join(model_path, model_name),
# save_all_checkpoints = True,
progress_frequency = epoch_size,
cv_source = test_source,
cv_mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
# cv_frequency = epoch_size,
restore = restore)
mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size)
# Train all minibatches
training_session.train()
training_session(
trainer=trainer, mb_source = train_source,
var_to_stream = input_map,
mb_size_schedule = mb_size_schedule,
progress_frequency=epoch_size,
checkpoint_config = CheckpointConfig(filename = os.path.join(model_path, model_name), restore=restore),
cv_config = CrossValidationConfig(source=test_source, schedule=mb_size_schedule)
).train()
# Train and evaluate the network.
def vgg16_train_and_eval(train_data, test_data, num_quantization_bits=32, minibatch_size=128, epoch_size = 1281167, max_epochs=80,
@ -190,10 +185,10 @@ def vgg16_train_and_eval(train_data, test_data, num_quantization_bits=32, miniba
num_epochs=max_epochs)
network = create_vgg16()
trainer = create_trainer(network, epoch_size, num_quantization_bits)
trainer = create_trainer(network, epoch_size, num_quantization_bits, progress_printer)
train_source = create_image_mb_source(train_data, True, total_number_of_samples=max_epochs * epoch_size)
test_source = create_image_mb_source(test_data, False, total_number_of_samples=FULL_DATA_SWEEP)
train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore)
train_and_test(network, trainer, train_source, test_source, minibatch_size, epoch_size, restore)
if __name__=='__main__':

Просмотреть файл

@ -18,6 +18,7 @@ from cntk.distributed import data_parallel_distributed_learner, Communicator
from cntk.io import ImageDeserializer, MinibatchSource, StreamDef, StreamDefs, FULL_DATA_SWEEP
from cntk.layers import Placeholder, Block, Convolution2D, Activation, MaxPooling, Dense, Dropout, default_options, Sequential, For
from cntk.initializer import normal
from cntk.training_session import *
# default Paths relative to current python file.
abs_path = os.path.dirname(os.path.abspath(__file__))
@ -131,7 +132,7 @@ def create_vgg19():
}
# Create trainer
def create_trainer(network, epoch_size, num_quantization_bits):
def create_trainer(network, epoch_size, num_quantization_bits, progress_printer):
# Set learning parameters
lr_per_mb = [0.01]*20 + [0.001]*20 + [0.0001]*20 + [0.00001]*10 + [0.000001]
lr_schedule = cntk.learning_rate_schedule(lr_per_mb, unit=cntk.learner.UnitType.minibatch, epoch_size=epoch_size)
@ -147,10 +148,10 @@ def create_trainer(network, epoch_size, num_quantization_bits):
distributed_after=0)
# Create trainer
return cntk.Trainer(network['output'], (network['ce'], network['pe']), parameter_learner)
return cntk.Trainer(network['output'], (network['ce'], network['pe']), parameter_learner, progress_printer)
# Train and test
def train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore):
def train_and_test(network, trainer, train_source, test_source, minibatch_size, epoch_size, restore):
# define mapping from intput streams to network inputs
input_map = {
@ -158,23 +159,15 @@ def train_and_test(network, trainer, train_source, test_source, progress_printer
network['label']: train_source.streams.labels
}
training_session = cntk.training_session(
training_minibatch_source = train_source,
trainer = trainer,
model_inputs_to_mb_source_mapping = input_map,
mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
progress_printer = progress_printer,
# checkpoint_frequency = epoch_size,
checkpoint_filename = os.path.join(model_path, model_name),
# save_all_checkpoints = True,
progress_frequency = epoch_size,
cv_source = test_source,
cv_mb_size_schedule = cntk.minibatch_size_schedule(minibatch_size),
# cv_frequency = epoch_size,
restore = restore)
# Train all minibatches
training_session.train()
training_session(
trainer=trainer, mb_source = train_source,
var_to_stream = input_map,
mb_size = minibatch_size,
progress_frequency=epoch_size,
checkpoint_config = CheckpointConfig(filename = os.path.join(model_path, model_name), restore=restore),
cv_config = CrossValidationConfig(source=test_source, mb_size=minibatch_size)
).train()
# Train and evaluate the network.
def vgg19_train_and_eval(train_data, test_data, num_quantization_bits=32, minibatch_size=128, epoch_size = 1281167, max_epochs=80,
@ -190,10 +183,10 @@ def vgg19_train_and_eval(train_data, test_data, num_quantization_bits=32, miniba
num_epochs=max_epochs)
network = create_vgg19()
trainer = create_trainer(network, epoch_size, num_quantization_bits)
trainer = create_trainer(network, epoch_size, num_quantization_bits, progress_printer)
train_source = create_image_mb_source(train_data, True, total_number_of_samples=max_epochs * epoch_size)
test_source = create_image_mb_source(test_data, False, total_number_of_samples=FULL_DATA_SWEEP)
train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size, restore)
train_and_test(network, trainer, train_source, test_source, minibatch_size, epoch_size, restore)
if __name__=='__main__':

Просмотреть файл

@ -22,10 +22,10 @@ trainNetwork = {
BrainScriptNetworkBuilder = {
cMap = 1
model = inputFeatures => {
conv1 = ConvolutionalLayer {cMap, (5:5), pad = true, activation=ReLU}(inputFeatures)
conv1 = ConvolutionalLayer {cMap, (5:5), pad=true, activation=ReLU}(inputFeatures)
pool1 = MaxPoolingLayer {(4:4), stride=(4:4)}(conv1)
unpool1 = MaxUnpoolingLayer {(4:4), stride=(4:4)}(pool1, conv1)
deconv1 = DeconvLayer {1, (5:5), cMap, lowerPad=(2:2:0), upperPad=(2:2:0), bias=false}(unpool1)
deconv1 = ConvolutionTransposeLayer {1, (5:5), cMap, pad=true, bias=false}(unpool1)
}.deconv1
# inputs

Просмотреть файл

@ -40,7 +40,7 @@ def deconv_mnist(max_epochs=3):
conv1 = cntk.layers.Convolution ((5,5), cMap, pad=True, activation=cntk.ops.relu)(scaled_input)
pool1 = cntk.layers.MaxPooling ((4,4), (4,4))(conv1)
unpool1 = cntk.layers.MaxUnpooling ((4,4), (4,4))(pool1, conv1)
z = cntk.layers.Deconvolution((5,5), num_channels, cMap, lower_pad=(0,2,2), upper_pad=(0,2,2), bias=False, init=cntk.glorot_uniform(0.001))(unpool1)
z = cntk.layers.ConvolutionTranspose((5,5), num_channels, cMap, pad=True, bias=False, init=cntk.glorot_uniform(0.001))(unpool1)
# define rmse loss function (should be 'err = cntk.ops.minus(deconv1, scaled_input)')
f2 = cntk.ops.element_times(cntk.ops.constant(0.00390625), input_var)

Просмотреть файл

@ -90,7 +90,7 @@ PROTOC = $(PROTOBUF_PATH)/bin/protoc
#SSE_FLAGS =
SOURCEDIR:= Source
INCLUDEPATH:= $(addprefix $(SOURCEDIR)/, Common/Include CNTKv2LibraryDll CNTKv2LibraryDll/API CNTKv2LibraryDll/proto Math CNTK ActionsLib ComputationNetworkLib SGDLib SequenceTrainingLib CNTK/BrainScript Readers/ReaderLib PerformanceProfilerDll)
INCLUDEPATH:= $(addprefix $(SOURCEDIR)/, Common/Include CNTKv2LibraryDll CNTKv2LibraryDll/API CNTKv2LibraryDll/proto ../Examples/Extensibility/CPP Math CNTK ActionsLib ComputationNetworkLib SGDLib SequenceTrainingLib CNTK/BrainScript Readers/ReaderLib PerformanceProfilerDll)
INCLUDEPATH+=$(PROTOBUF_PATH)/include
# COMMON_FLAGS include settings that are passed both to NVCC and C++ compilers.
COMMON_FLAGS:= -DHAS_MPI=$(HAS_MPI) -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K -std=c++11
@ -326,7 +326,8 @@ READER_SRC =\
$(SOURCEDIR)/Readers/ReaderLib/FramePacker.cpp \
$(SOURCEDIR)/Readers/ReaderLib/ReaderBase.cpp \
$(SOURCEDIR)/Readers/ReaderLib/Indexer.cpp \
$(SOURCEDIR)/Readers/ReaderLib/ChunkCache.cpp \
$(SOURCEDIR)/Readers/ReaderLib/ChunkCache.cpp \
$(SOURCEDIR)/Readers/ReaderLib/ReaderUtil.cpp \
COMMON_SRC =\
$(SOURCEDIR)/Common/Config.cpp \

Просмотреть файл

@ -5,10 +5,16 @@ Effective January 25, 2017 CNTK [1-bit Stochastic Gradient Descent (1bit-SGD)](h
Give us feedback through these [channels](https://github.com/Microsoft/CNTK/wiki/Feedback-Channels).
# Latest news
***2017-02-28.* V 2.0 Beta 12 Release available at Docker Hub**
CNTK V 2.0 Beta 12 Runtime packages are now available as [Public Images at Docker Hub](https://hub.docker.com/r/microsoft/cntk/).
See more on CNTK as Docker Images in this [Wiki article](https://github.com/Microsoft/CNTK/wiki/CNTK-Docker-Containers).
***2017-02-23.* V 2.0 Beta 12 Release**
Highlights of this Release:
* New and updated features: new activation functions, support of `Argmax` and `Argmin`, improved performance of `numpy` interop, new functionality of existing operators, and more.
* [CNTK for CPU on Windows can now be installed via `pip install` on Anaconda 3](https://github.com/Microsoft/CNTK/wiki/Setup-Windows-Python). Other configurations will be enabled soon.
* [CNTK for CPU on Windows can now be installed via `pip install` on Anaconda 3](https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-your-machine). Other configurations will be enabled soon.
* HTK deserializers are now exposed in Python. All deserializers are exposed in C++.
* The memory pool implementation of CNTK has been updated with a new global optimization algorithm. Hyper memory compression has been removed.
* New features in C++ API.
* [New Eval examples for RNN models](https://github.com/Microsoft/CNTK/blob/master/Examples/Evaluation/CNTKLibraryCSEvalCPUOnlyExamples/CNTKLibraryCSEvalExamples.cs).
* New [CNTK NuGet Packages](https://github.com/Microsoft/CNTK/wiki/NuGet-Package) with CNTK V2 C++ Library.
@ -35,19 +41,6 @@ Get the Release from the [CNTK Releases page](https://github.com/Microsoft/CNTK/
CNTK V 2.0 Beta 10 Runtime packages are now available as [Public Images at Docker Hub](https://hub.docker.com/r/microsoft/cntk/).
See more on CNTK as Docker Images in this [Wiki article](https://github.com/Microsoft/CNTK/wiki/CNTK-Docker-Containers).
***2017-02-01.* V 2.0 Beta 10 Release**
* New and updated core and Python API features ([Operators with UserFunctions](https://www.cntk.ai/pythondocs/extend.html), [Tensorboard support](https://github.com/Microsoft/CNTK/wiki/Using-TensorBoard-for-Visualization), [Python API Fast R CNN](https://github.com/Microsoft/CNTK/wiki/Object-Detection-using-Fast-R-CNN)).
* Improved speed of CrossEntropyWithSoftmax and ClassificationError for sparse labels.
* New Tutorials and Examples:
* A Python version of the deconvolution layer and image auto encoder example was added ([Example **07_Deconvolution** in *Image - Getting Started*](https://github.com/Microsoft/CNTK/tree/v2.0.beta10.0/Examples/Image/GettingStarted)).
* A Python distributed training example for image classification using AlexNet was added, cf. [here](https://github.com/Microsoft/CNTK/tree/v2.0.beta10.0/Examples/Image/Classification/AlexNet/Python)
* [Basic implementation of Generative Adversarial Networks (GAN) networks](https://github.com/Microsoft/CNTK/blob/v2.0.beta10.0/Tutorials/CNTK_206_Basic_GAN.ipynb)
* [Training with Sampled Softmax](https://github.com/Microsoft/CNTK/blob/v2.0.beta10.0/Tutorials/CNTK_207_Training_with_Sampled_Softmax.ipynb)
* New [CNTK NuGet Packages](https://github.com/Microsoft/CNTK/wiki/NuGet-Package).
See more in the [Release Notes](https://github.com/Microsoft/CNTK/wiki/CNTK_2_0_beta_10_Release_Notes).
Get the Release from the [CNTK Releases page](https://github.com/Microsoft/CNTK/releases).
See [all news](https://github.com/Microsoft/CNTK/wiki/News).
# What is The Microsoft Cognitive Toolkit

Просмотреть файл

@ -43,8 +43,9 @@ class Converter(object):
def appendSample(self, sample):
if( len(sample) != self.sampleDim ):
print( "Invalid sample dimension for input {0}" ).format( self.name )
sys.exit()
raise ValueError(
"Invalid sample dimension for input {0}".format(self.name))
if( len(self.vals) == 0 ):
self.vals.append( list() )
@ -65,7 +66,7 @@ class DenseConverter(Converter):
Converter.__init__(self, name, sampleDim)
def headerBytes(self):
output = ""
output = bytearray()
# First is the matrix type. Dense is type 0
output += struct.pack( "i", 0 )
# Next is the elem type, currently float only
@ -77,11 +78,11 @@ class DenseConverter(Converter):
def toBytes(self):
output = ""
output = bytearray()
for sequence in self.vals:
if( len(sequence) != 1 ):
print( "Converter does not support dense sequences." )
sys.exit()
raise ValueError("Dense sequences currently not supported.")
for sample in sequence[0]:
output += struct.pack( "f", float(sample) )
@ -94,17 +95,18 @@ class SparseConverter(Converter):
Converter.__init__(self, name, sampleDim)
def appendSample(self, sample):
for samp in sample:
if( int(samp.split(":")[0]) >= self.sampleDim ):
print( "Invalid sample dimension for input {0}. Max {1}, given {2}" ).format( self.name, self.sampleDim, sample.split( ":" )[0] )
sys.exit()
for pair in sample:
index = int(pair.split(":")[0])
if (index >= self.sampleDim):
raise ValueError("Invalid sample dimension for input {0}. Max {1}, given {2}"
.format(self.name, self.sampleDim, index))
if( len(self.vals) == 0 ):
self.vals.append( list() )
self.vals[-1].append( sample )
def headerBytes(self):
output = ""
output = bytearray()
# First is the matrix type. Sparse is type 1
output += struct.pack( "i", 1 )
# Next is the storage type, currently sparse csc only
@ -120,7 +122,7 @@ class SparseConverter(Converter):
return output
def toBytes(self):
output = ""
output = bytearray()
values = list()
rowInd = list()
colInd = [0]
@ -139,9 +141,9 @@ class SparseConverter(Converter):
colInd.append( nnz )
output += struct.pack( "i", nnz )
output += "".join( [ struct.pack( "f", float(val) ) for val in values ] )
output += "".join( [ struct.pack( "i", int(ind) ) for ind in rowInd ] )
output += "".join( [ struct.pack( "i", int(ind) ) for ind in colInd ] )
output += b''.join( [ struct.pack( "f", float(val) ) for val in values ] )
output += b''.join( [ struct.pack( "i", int(ind) ) for ind in rowInd ] )
output += b''.join( [ struct.pack( "i", int(ind) ) for ind in colInd ] )
return output
@ -174,7 +176,7 @@ def GetConverter( inputtype, name, sampleDim ):
elif( inputtype.lower() == 'sparse' ):
converter = SparseConverter( name, sampleDim )
else:
print( 'Invalid input format {0}' ).format( inputtype )
print('Invalid input format {0}'.format( inputtype ))
sys.exit()
return converter
@ -240,7 +242,6 @@ if __name__ == '__main__':
id += 1
OutputHeader( binaryHeaderFile, converters )
numChunks = 0
with open( args.input, "r" ) as inputFile:
curSequence = list()
@ -280,7 +281,7 @@ if __name__ == '__main__':
binaryHeaderFile.close()
binaryDataFile.close()
destination = open( args.output, 'awb+' )
destination = open( args.output, 'ab+' )
shutil.copyfileobj( open( dataPath, "rb" ), destination )
destination.flush()

Просмотреть файл

@ -420,8 +420,9 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
else
{
bool transpose = node->GetOptionalParameter("transpose", "false");
auto outputShape = paramResolver("outputShape", 0);
nodePtr = builder.Convolution(NULL, NULL, kernelShape, mapCount, stride, sharing,
autoPad, lowerPad, upperPad, transpose, imageLayout, maxTempMemSizeInSamples, name);
autoPad, lowerPad, upperPad, transpose, outputShape, imageLayout, maxTempMemSizeInSamples, name);
}
}

Просмотреть файл

@ -94,34 +94,33 @@ ConvolutionalLayer {numOutputChannels, # e.g. (1) or BS.Constants.None
b = ParameterTensor(_ConcatArrays (Repeat (Length (filterShape), 1), outputChannelsShape), initValue = initBias) # [ 1 x 1 x K ]
sharing = true # TODO: support this
apply (x) = {
c = Convolution (W, x, filterShape, mapDims = numOutputChannels, stride = stride, sharing = sharing, autoPadding = pad, lowerPad = lowerPad, upperPad = upperPad, deconv = false, maxTempMemSizeInSamples = maxTempMemSizeInSamples)
c = Convolution (W, x, filterShape, mapDims = numOutputChannels, stride = stride, sharing = sharing, autoPadding = pad, lowerPad = lowerPad, upperPad = upperPad, maxTempMemSizeInSamples = maxTempMemSizeInSamples)
res = activation (if bias then c + b else c)
}.res
}.apply
# DeconvLayer -- create a deconvolution layer with optional non-linearity
DeconvLayer {numOutputChannels,
filterShape, # e.g. (3:3)
numInputChannels,
bias = true,
activation = (x=>x),
init = 'glorotUniform',
initValueScale = 0.001,
initBias = 0,
stride = 1,
sharing = true,
autoPadding = false,
lowerPad = 0,
upperPad = 0,
maxTempMemSizeInSamples = 0} =
# ConvolutionTransposeLayer -- create a convolution transpose layer with optional non-linearity
ConvolutionTransposeLayer {numOutputChannels,
filterShape, # e.g. (3:3)
numInputChannels,
bias = true,
activation = (x=>x),
init = 'glorotUniform',
initValueScale = 0.001,
initBias = 0,
stride = 1, pad = false,
lowerPad = 0, upperPad = 0,
outputShape = 0,
maxTempMemSizeInSamples = 0} =
{
outputChannelsShape = _AsArray (numOutputChannels)
kernelShape = _ConcatArrays (filterShape, outputChannelsShape)
paramShape = _ConcatArrays (kernelShape, _AsArray (numInputChannels))
W = ParameterTensor{paramShape, init=init, initValueScale=initValueScale, initOnCPUOnly=true}
b = ParameterTensor(_ConcatArrays (Repeat (Length (filterShape), 1), outputChannelsShape), initValue = initBias)
sharing = true # TODO: support this
apply (x) = {
c = Convolution(W, x, kernelShape, mapDims=numInputChannels, stride=stride, sharing=sharing, autoPadding=autoPadding, lowerPad=lowerPad, upperPad=upperPad, deconv=true, maxTempMemSizeInSamples = maxTempMemSizeInSamples)
c = ConvolutionTranspose (W, x, kernelShape, mapDims=numInputChannels, stride=stride, sharing=sharing, autoPadding=pad, lowerPad=lowerPad, upperPad=upperPad, outputShape = outputShape, maxTempMemSizeInSamples = maxTempMemSizeInSamples)
res = activation (if bias then c + b else c)
}.res
}.apply
@ -607,7 +606,8 @@ ReconcileDynamicAxis(dataInput, layoutInput, tag='') = new ComputationNode [ ope
ReconcileMBLayout = ReconcileDynamicAxis # back compat
CastAs (type, data) = ReconcileDynamicAxis (data, type) # read as CastAs<type>(data) where the cast may consist of rearranging the data w.r.t. MBLayout or broadcasting across sequence items
# ND convo & pooling/unpooling --why is autoPadding true? Normally one would want to reduce dimensions, no?
Convolution(weightNode, inputValueNode, kernelDims, mapDims = 0, stride = 1, sharing = true, autoPadding = true, lowerPad = 0, upperPad = 0, deconv=false, imageLayout='CHW', maxTempMemSizeInSamples = 0, tag='') = new ComputationNode [ operation = 'Convolution' ; inputs = _AsNodes (weightNode : inputValueNode); kernelShape = new TensorShape [ dims = kernelDims ] ; mapCount = new TensorShape [ dims = mapDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimSharing = new BoolVector [ items = sharing ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] ; transpose = deconv /*plus the function args*/ ]
Convolution(weightNode, inputValueNode, kernelDims, mapDims = 0, stride = 1, sharing = true, autoPadding = true, lowerPad = 0, upperPad = 0, imageLayout='CHW', maxTempMemSizeInSamples = 0, tag='') = new ComputationNode [ operation = 'Convolution' ; inputs = _AsNodes (weightNode : inputValueNode); kernelShape = new TensorShape [ dims = kernelDims ] ; mapCount = new TensorShape [ dims = mapDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimSharing = new BoolVector [ items = sharing ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] ; transpose = false ; dimOutputShape = new TensorShape [ dims = 0 ] /*plus the function args*/ ]
ConvolutionTranspose(weightNode, inputValueNode, kernelDims, mapDims = 0, stride = 1, sharing = true, autoPadding = true, lowerPad = 0, upperPad = 0, outputShape = 0, imageLayout='CHW', maxTempMemSizeInSamples = 0, tag='') = new ComputationNode [ operation = 'Convolution' ; inputs = _AsNodes (weightNode : inputValueNode); kernelShape = new TensorShape [ dims = kernelDims ] ; mapCount = new TensorShape [ dims = mapDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimSharing = new BoolVector [ items = sharing ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] ; transpose = true ; dimOutputShape = new TensorShape [ dims = outputShape ] /*plus the function args*/ ]
Pooling(input, poolKind/*'max'|'average'*/, kernelDims, stride=1, autoPadding = true, lowerPad = 0, upperPad = 0, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'Pooling' ; inputs = _AsNodes (input); pool = poolKind ; kernelShape = new TensorShape [ dims = kernelDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
MaxUnpooling(unpoolInput, poolInput, kernelDims, stride=1, autoPadding = true, lowerPad = 0, upperPad = 0, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'MaxUnpooling' ; inputs = _AsNodes (unpoolInput : poolInput); kernelShape = new TensorShape [ dims = kernelDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
# 2D pooling

Просмотреть файл

@ -359,18 +359,27 @@ namespace CNTK
///
std::wstring AsString() const
{
std::wstringstream wStrStream;
wStrStream << L"[";
for (size_t i = 0; i < Rank(); i++)
if (IsUnknown())
{
if (i != 0)
wStrStream << L" x ";
wStrStream << m_shapeDims[i];
return L"[???]";
}
else
{
std::wstringstream wStrStream;
wStrStream << L"[";
for (size_t i = 0; i < Rank(); i++)
{
if (i != 0)
wStrStream << L" x ";
wStrStream << L"]";
return wStrStream.str();
if (m_shapeDims[i] != InferredDimension)
wStrStream << m_shapeDims[i];
else
wStrStream << "?";
}
wStrStream << L"]";
return wStrStream.str();
}
}
private:
@ -991,6 +1000,7 @@ namespace std {
};
}
namespace CNTK
{
///
@ -1612,6 +1622,11 @@ namespace CNTK
///
CNTK_API bool NeedsGradient() const;
///
/// Returns a string representation for this variable.
///
CNTK_API std::wstring AsString() const;
protected:
#ifdef SWIG
public:
@ -2065,6 +2080,12 @@ namespace CNTK
return Create(sampleShape, sequences, {}, device, readOnly);
}
///
/// Create a new Value object containing a collection of variable length sequences.
///
CNTK_API static ValuePtr Create(const NDShape& sampleShape, const std::vector<NDArrayViewPtr>& sequences, const std::vector<bool>& sequenceStartFlags, const DeviceDescriptor& device, bool readOnly, bool createNewCopy);
///
/// Create a new Value object containing a collection of variable length sequences.
/// The created Value object contains a copy of the specified 'sequences' data.
@ -2442,8 +2463,6 @@ namespace CNTK
template <typename ElementType>
static void AppendSparseSequenceData(const NDArrayViewPtr& sequenceData, std::vector<SparseIndexType>& colStarts, std::vector<SparseIndexType>& rowIndices, std::vector<char>& nonZeroValues, size_t maxSequenceLength);
CNTK_API static ValuePtr Create(const NDShape& sampleShape, const std::vector<NDArrayViewPtr>& sequences, const std::vector<bool>& sequenceStartFlags, const DeviceDescriptor& device, bool readOnly, bool createNewCopy);
///
/// Copy the data stored in 'this' Value object to the buffer 'sequences' as a collection of variable length sequences.
/// The output data is in the dense format.
@ -2764,7 +2783,7 @@ namespace CNTK
///
/// Returns the root of the Function graph underlying this block Function.
/// Throws an exception of this is not a block Function
/// Throws an exception if this is not a block Function
///
CNTK_API FunctionPtr BlockRoot() const;
@ -2808,11 +2827,11 @@ namespace CNTK
///
/// Returns a set comprising of all input variables of 'this' Function's variables that are not of kind 'Parameter' or 'Constant'.
///
std::vector<Variable> Arguments() const
std::vector<Variable> Arguments(bool rowMajor = false) const
{
return FilteredInputs<Variable>([](const Variable& var) {
return (var.IsInput() || var.IsPlaceholder() || var.IsOutput());
});
}, rowMajor);
}
///
@ -2916,6 +2935,11 @@ namespace CNTK
///
CNTK_API void PrintGraph() const;
///
/// Returns a string representation of this Function
///
CNTK_API std::wstring AsString() const;
///
/// Maximum number of outputs that is currently supported.
///
@ -2982,11 +3006,11 @@ namespace CNTK
CNTK_API std::vector<Variable>& InitOutputs();
template <typename VariableType, typename FilterFunction>
std::vector<VariableType> FilteredInputs(FilterFunction&& filterFunc) const
std::vector<VariableType> FilteredInputs(FilterFunction&& filterFunc, bool rowMajor = false) const
{
std::vector<VariableType> filteredInputs;
std::unordered_set<Variable> uniqueFilteredInputs;
auto inputs = Inputs();
auto inputs = Inputs(rowMajor);
for (auto inputVar : inputs)
{
if (filterFunc(inputVar) && (uniqueFilteredInputs.find(inputVar) == uniqueFilteredInputs.end()))
@ -3370,7 +3394,18 @@ namespace CNTK
///
/// Create an instance of the CNTK built-in operation for computing the edit distance error for specified operands.
///
CNTK_API FunctionPtr EditDistanceError(const Variable& prediction, const Variable& labels, float substitutionPenalty, float deletionPenalty, float insertionPenalty, bool squashInputs, const std::vector<size_t>& samplesToIgnore, const std::wstring& name = L"");
CNTK_API FunctionPtr EditDistanceError(const Variable& prediction, const Variable& labels, float substitutionPenalty, float deletionPenalty, float insertionPenalty, bool squashInputs, const std::vector<size_t>& tokensToIgnore, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in operation for computing the forwardbackward for specified operands.
///
CNTK_API FunctionPtr ForwardBackward(const Variable& graph, const Variable& features, size_t blankTokenId, int delayConstraint, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in operation for computing the labels to graph for input operands.
///
CNTK_API FunctionPtr LabelsToGraph(const Variable& labels, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in operation for computing the classification prediction error for specified operands.
@ -3495,10 +3530,23 @@ namespace CNTK
const std::vector<bool>& autoPadding = {true},
const NDShape& lowerPad = {0},
const NDShape& upperPad = {0},
bool transpose = false,
size_t maxTempMemSizeInSamples = 0,
const std::wstring& name = L"");
///
/// TODO:
///
CNTK_API FunctionPtr ConvolutionTranspose(const Variable& convolutionMap,
const Variable& operand,
const NDShape& strides = { 1 },
const std::vector<bool>& sharing = { true },
const std::vector<bool>& autoPadding = { true },
const NDShape& lowerPad = { 0 },
const NDShape& upperPad = { 0 },
const NDShape& outputShape = { 0 },
size_t maxTempMemSizeInSamples = 0,
const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in ROI pooling operation on specified tensor input operands with the specified output shape
///
@ -3621,6 +3669,11 @@ namespace CNTK
///
CNTK_API FunctionPtr PReLU(const Variable& alpha, const Variable& operand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in elementwise softplus operation
///
CNTK_API FunctionPtr Softplus(const Variable& operand, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in argmax operation on specified tensor input operand along the specified axis
///
@ -4275,6 +4328,9 @@ namespace CNTK
void UpdateTestProgress(size_t numSamples, const ValuePtr& evalCriterion, const DeviceDescriptor& computeDevice);
void AddProgressWriters(const std::vector<ProgressWriterPtr>& progressWriters);
// TODO: Workaround for back compat. Should not be used and will be removed in the next version.
friend CNTK_API void ::CNTK::Internal::AddProgressWriters(const TrainerPtr&, const std::vector<ProgressWriterPtr>&);
FunctionPtr m_combinedTrainingFunction;
FunctionPtr m_model;
FunctionPtr m_lossFunction;
@ -4698,6 +4754,55 @@ namespace CNTK
///
CNTK_API QuantizedDistributedCommunicatorPtr QuantizedMPICommunicator(bool zeroThresholdFor1Bit, bool useQuantizationForSelfStripe, size_t numQuantizationBits);
///
/// Cross validation configuration
///
struct CrossValidationConfig
{
public:
/// Cross validation configuration.
/// crossValidationSource: a minibatch source that will be used for cross validation.
/// crossValidationSchedule : a minibatch size schedule for cross validation.
/// crossValidationFrequencyInSamples: frequency in samples when to perform cross validation.
///
CNTK_API CrossValidationConfig(const MinibatchSourcePtr& crossValidationSource,
const MinibatchSizeSchedule& crossValidationSchedule = MinibatchSizeSchedule(1),
size_t crossValidationFrequencyInSamples = std::numeric_limits<size_t>::max());
private:
friend class TrainingSession;
const MinibatchSourcePtr m_source;
const MinibatchSizeSchedule m_mbSize;
const size_t m_frequency;
};
///
/// Checkpoint configuration
///
struct CheckpointConfig
{
public:
///
/// Checkpoint configuration.
/// checkPointFileName: a file name where the checkpoint will be stored.
/// checkpointFrequencyInSamples: frequency in samples when to perform checkpointing.
/// restoreFromCheckpointIfExists: if flag is set, the training session will try to restore before training.
/// preserveAllCheckpoints: if flag is set, all checkpoints will be preserved.
///
CNTK_API CheckpointConfig(
const std::wstring& checkPointFileName,
size_t checkpointFrequencyInSamples = std::numeric_limits<size_t>::max(),
bool restoreFromCheckpointIfExists = true,
bool preserveAllCheckpoints = false);
private:
friend class TrainingSession;
const std::wstring m_fileName;
const bool m_restore;
const bool m_preserveAll;
const size_t m_frequency;
};
///
/// Base abstract class that represents a training session.
/// Derived classes can redefine different aspects of training, overriding base virtual methods (GetMinibatchSize, OnMinibatchStart, etc.)
@ -4709,12 +4814,31 @@ namespace CNTK
size_t frequency;
size_t currentIndex;
size_t sampleCountWhenLastCalled;
std::function<void(size_t currentIndex, const DeviceDescriptor&)> action;
std::function<bool(size_t currentIndex, const DeviceDescriptor&)> action;
};
public:
///
///
/// Constructor of the training session:
/// trainer : an instance of a trainer
/// trainingSource: minibatch source
/// minibatchSizeSchedule: mb size schedule
/// inputVarToStream: var to stream mapping
/// maxNumTrainingSamples: max number of training samples
/// progress : a training configuration
///
CNTK_API TrainingSession(
const TrainerPtr& trainer,
const MinibatchSourcePtr& trainingSource,
const MinibatchSizeSchedule& minibatchSizeSchedule,
const std::unordered_map<Variable, StreamInformation>& inputVarToStream,
size_t maxNumTrainingSamples,
size_t progressFrequency,
const CheckpointConfig& checkpointing,
const CrossValidationConfig& crossValidation);
/// !!! DEPRECATED !!!
/// Constructor of the training session:
/// trainingSource : a minibatch source that will be used for training
/// trainer : an instance of a trainer
/// modelInputsToMinibatchSourceMapping : mapping between the input node of the model and the corresponding stream
@ -4765,7 +4889,7 @@ namespace CNTK
///
virtual size_t GetMinibatchSize()
{
return m_minibatchSizeSchedule[Trainer()->TotalNumberOfSamplesSeen()];
return m_mbSize[Trainer()->TotalNumberOfSamplesSeen()];
}
///
@ -4775,8 +4899,9 @@ namespace CNTK
///
/// Optionally overridable callback that is invoked after each minibatch.
/// If return value is false, the training will be stopped.
///
CNTK_API virtual void OnMinibatchEnd() {};
CNTK_API virtual bool OnMinibatchEnd() { return true; };
///
/// Optionally overridable callback that is invoked before each checkpoint.
@ -4795,8 +4920,12 @@ namespace CNTK
///
/// Optionally overridable callback that is invoked after each cross validation.
/// If return value is false, the training will be stopped.
///
CNTK_API virtual void OnCrossValidationEnd(size_t /*validationIndex*/, double /*averageError*/, size_t /*numberOfSamples*/, size_t /*numberOfMinibatches*/) {};
CNTK_API virtual bool OnCrossValidationEnd(size_t /*validationIndex*/, double /*averageError*/, size_t /*numberOfSamples*/, size_t /*numberOfMinibatches*/)
{
return true;
}
protected:
///
@ -4804,8 +4933,6 @@ namespace CNTK
///
TrainerPtr Trainer() const { return m_trainer; }
MinibatchSourcePtr TrainingMinibatchSource() const { return m_trainingSource; }
private:
/// Disallow copy and move construction and assignment
TrainingSession(const TrainingSession&) = delete; TrainingSession& operator=(const TrainingSession&) = delete; TrainingSession& operator=(TrainingSession&&) = delete; TrainingSession(TrainingSession&&) = delete;
@ -4819,32 +4946,30 @@ namespace CNTK
void SaveCheckpoint(size_t currentIndex);
void SaveFinalCheckpoint();
void CrossValidate(size_t currentIndex, const DeviceDescriptor& computeDevice);
bool CrossValidate(size_t currentIndex, const DeviceDescriptor& computeDevice);
void ReportProgress(size_t currentIndex);
// Checkpointing
const std::wstring m_checkPointFileName;
const bool m_restoreFromCheckpointIfExists;
const bool m_saveAllCheckpoints;
// Training
MinibatchSourcePtr m_trainingSource;
TrainerPtr m_trainer;
std::unordered_map<Variable, StreamInformation> m_modelInputToMinibatchSourceStream;
size_t m_parallelAfterSamples;
size_t m_workerRank;
size_t m_numberOfWorkers;
const MinibatchSizeSchedule m_minibatchSizeSchedule;
const size_t m_maxNumberOfSamples;
// Cross validation.
MinibatchSourcePtr m_crossValidationSource;
const MinibatchSizeSchedule m_crossValidationSchedule;
std::vector<PeriodicAction> m_actions;
// Training.
TrainerPtr m_trainer;
const MinibatchSourcePtr m_source;
const MinibatchSizeSchedule m_mbSize;
const std::unordered_map<Variable, StreamInformation> m_varToStream;
const size_t m_maxNumSamples;
const size_t m_progressFrequency;
// Additional configuration.
CheckpointConfig m_checkpoint;
CrossValidationConfig m_cv;
};
///
/// !!! DEPRECATED !!!
/// Creates an instance of the training session class. Parameters match the paramters of the TrainingSession constructor.
///
CNTK_API TrainingSessionPtr CreateBasicTrainingSession(
@ -4961,6 +5086,18 @@ namespace CNTK
std::unique_ptr<Impl> m_training;
std::unique_ptr<Impl> m_test;
};
/// Creates an instance of the training session class. Parameters match the parameters of the TrainingSession constructor.
///
CNTK_API TrainingSessionPtr CreateTrainingSession(
const TrainerPtr& trainer,
const MinibatchSourcePtr& trainingSource,
const MinibatchSizeSchedule& minibatchSizeSchedule,
const std::unordered_map<Variable, StreamInformation>& inputVarToStream,
size_t maxNumTrainingSamples,
size_t progressFrequency,
const CheckpointConfig& checkpointing,
const CrossValidationConfig& crossValidation);
}

Просмотреть файл

@ -134,6 +134,7 @@ namespace CNTK
{
// Forward declarations
class Utils;
class NDShape;
class PrimitiveFunction;
class CompositeFunction;
class BlockFunction;
@ -224,9 +225,15 @@ namespace CNTK
CNTK_API FunctionPtr ReduceElements(const Variable& operand, const std::wstring& reductionOpName, const Axis& axis, const std::wstring& name = L"");
CNTK_API FunctionPtr ReconcileDynamicAxes(const Variable& operand, const Variable& axesAsOperand, const std::wstring& name = L"");
CNTK_API FunctionPtr Convolution(const Variable& convolutionMap, const Variable& operand, const NDShape& strides, const std::vector<bool>& sharing, const std::vector<bool>& autoPadding,
const NDShape& lowerPad, const NDShape& upperPad, bool transpose, const NDShape& outputShape, size_t maxTempMemSizeInSamples, const std::wstring& name = L"");
// This is meant for debugging purposes only and is very likely to be deprecated in the future.
CNTK_API void SaveAsLegacyModel(const FunctionPtr& rootFunction, const std::wstring& modelFile);
// TODO: Workaround for back compat. Should not be used and will be removed in the next version.
CNTK_API void AddProgressWriters(const TrainerPtr&, const std::vector<ProgressWriterPtr>&);
CNTK_API size_t NewUniqueId();
// Internal hooks for testing and higher-level bindings

Просмотреть файл

@ -22,6 +22,7 @@
#include "RNNNodes.h"
#include "PreComputeNodes.h"
#include "DeprecatedNodes.h"
#include "SpecialPurposeNodes.h"
using namespace Microsoft::MSR::CNTK;
@ -349,6 +350,7 @@ namespace CNTK
primitiveFunctionConfigParameters[PrimitiveFunction::AttributeNameLowerPad] = AsNDShape(convolutionNode->LowerPad());
primitiveFunctionConfigParameters[PrimitiveFunction::AttributeNameUpperPad] = AsNDShape(convolutionNode->UpperPad());
primitiveFunctionConfigParameters[PrimitiveFunction::AttributeNameTranspose] = convolutionNode->Transpose();
primitiveFunctionConfigParameters[PrimitiveFunction::AttributeNameOutputShape] = AsNDShape(convolutionNode->OutputShape());
primitiveFunctionConfigParameters[PrimitiveFunction::AttributeNameMaxTempMemSizeInSamples] = convolutionNode->MaxTempMemSizeInSamples();
opType = PrimitiveOpType::Convolution;
@ -456,6 +458,14 @@ namespace CNTK
opType = PrimitiveOpType::EditDistanceError;
}
else if (node->OperationName() == OperationNameOf(ForwardBackwardNode))
{
auto edNode = node->As<ForwardBackwardNode<ElementType>>();
primitiveFunctionConfigParameters[PrimitiveFunction::AttributeNameDelayConstraint] = edNode->DelayConstraint();
primitiveFunctionConfigParameters[PrimitiveFunction::AttributeNameBlankTokenId] = edNode->BlankTokenId();
opType = PrimitiveOpType::ForwardBackward;
}
else if ((node->OperationName() == OperationNameOf(MeanNode)) || (node->OperationName() == OperationNameOf(InvStdDevNode)))
{
auto precomputeNode = node->As<MeanInvStdDevNodeBase<ElementType>>();
@ -475,7 +485,9 @@ namespace CNTK
return PerDimMeanVarianceNormalize(inputVars[0], meanValue, invStdDevValue, name);
}
else
LogicError("Unsupported ComputationNode with OperationName='%S' found when loading legacy CNTK model", node->OperationName().c_str());
InvalidArgument("Unsupported ComputationNode with OperationName='%S' found when loading legacy CNTK model.\n"
"This is likely a deprecated operation; loading Brainscript/NDL models that contain deprecated operations, is not supported in Python/C++ API.\n"
"Please refer to CNTK documentation and edit/modify your Brainscript model/script to replace the deprecated operation with a supported operation.\n" , node->OperationName().c_str());
if (node->Is<RngUser>())
{

Просмотреть файл

@ -539,6 +539,9 @@ namespace CNTK
case PrimitiveOpType::Sqrt:
computationNodePtr = New<SqrtNode<ElementType>>(network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::ELU:
computationNodePtr = New<ExponentialLinearUnitNode<ElementType>>(network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Floor:
computationNodePtr = New<FloorNode<ElementType>>(network->GetDeviceId(), internalNodeName);
break;
@ -690,8 +693,15 @@ namespace CNTK
auto sharing = AsVector<bool>(functionConfig[PrimitiveFunction::AttributeNameSharing].Value<std::vector<DictionaryValue>>());
auto autoPadding = AsVector<bool>(functionConfig[PrimitiveFunction::AttributeNameAutoPadding].Value<std::vector<DictionaryValue>>());
auto transpose = functionConfig[PrimitiveFunction::AttributeNameTranspose].Value<bool>();
NDShape outputShape = NDShape::Unknown;
if (functionConfig.Contains(PrimitiveFunction::AttributeNameOutputShape))
outputShape = functionConfig[PrimitiveFunction::AttributeNameOutputShape].Value<NDShape>();
auto maxTempMemSizeInSamples = functionConfig[PrimitiveFunction::AttributeNameMaxTempMemSizeInSamples].Value<size_t>();
computationNodePtr = New<ConvolutionNode<ElementType>>(network->GetDeviceId(), internalNodeName, AsTensorShape(kernelShape), AsTensorShape(outputMapCount), AsTensorShape(strides), sharing, autoPadding, AsTensorShape(lowerPad), AsTensorShape(upperPad), transpose, ImageLayoutKind::CHW, maxTempMemSizeInSamples);
computationNodePtr = New<ConvolutionNode<ElementType>>(network->GetDeviceId(), internalNodeName,
AsTensorShape(kernelShape), AsTensorShape(outputMapCount), AsTensorShape(strides),
sharing, autoPadding, AsTensorShape(lowerPad), AsTensorShape(upperPad), transpose,
outputShape.IsUnknown()? TensorShape(0) : AsTensorShape(outputShape),
ImageLayoutKind::CHW, maxTempMemSizeInSamples);
break;
}
case PrimitiveOpType::CosDistance:
@ -719,6 +729,13 @@ namespace CNTK
computationNodePtr = New<EditDistanceErrorNode<ElementType>>(network->GetDeviceId(), internalNodeName, subPen, delPen, insPen, squashInputs, tokensToIgnore);
break;
}
case PrimitiveOpType::ForwardBackward:
{
auto delayContraint = functionConfig[PrimitiveFunction::AttributeNameDelayConstraint].Value<int>();
auto blankTokenId = functionConfig[PrimitiveFunction::AttributeNameBlankTokenId].Value<size_t>();
computationNodePtr = New<ForwardBackwardNode<ElementType>>(network->GetDeviceId(), internalNodeName, blankTokenId, delayContraint);
break;
}
case PrimitiveOpType::LambdaRank:
computationNodePtr = New<LambdaRankNode<ElementType>>(network->GetDeviceId(), internalNodeName);
break;
@ -847,22 +864,34 @@ namespace CNTK
}
else
{
computationNodePtr = New<UserDefinedV2FunctionNode<ElementType>>(network->GetDeviceId(), internalNodeName, function->shared_from_this());
// For user defined functions, we only attach unique inputs in the internal computation network since, the UDF
// backward implementations directly compute aggregate gradient values for unique inputs
std::vector<ComputationNodeBasePtr> uniqueInputNodesBasePtrs;
for (auto inputNodeBasePtr : inputNodesBasePtrs)
auto outputs = function->RawOutputs();
if (variable == outputs[0])
{
if (std::find(uniqueInputNodesBasePtrs.begin(), uniqueInputNodesBasePtrs.end(), inputNodeBasePtr) == uniqueInputNodesBasePtrs.end())
uniqueInputNodesBasePtrs.push_back(inputNodeBasePtr);
}
computationNodePtr = New<UserDefinedV2FunctionNode<ElementType>>(network->GetDeviceId(), internalNodeName, function->shared_from_this());
inputNodesBasePtrs = uniqueInputNodesBasePtrs;
// For user defined functions, we only attach unique inputs in the internal computation network since, the UDF
// backward implementations directly compute aggregate gradient values for unique inputs
std::vector<ComputationNodeBasePtr> uniqueInputNodesBasePtrs;
for (auto inputNodeBasePtr : inputNodesBasePtrs)
{
if (std::find(uniqueInputNodesBasePtrs.begin(), uniqueInputNodesBasePtrs.end(), inputNodeBasePtr) == uniqueInputNodesBasePtrs.end())
uniqueInputNodesBasePtrs.push_back(inputNodeBasePtr);
}
inputNodesBasePtrs = uniqueInputNodesBasePtrs;
}
else
{
size_t i = 1;
while (outputs[i] != variable) i++;
assert(i < outputs.size());
computationNodePtr = New<SelectUserDefinedV2FunctionOutputNode<ElementType>>(network->GetDeviceId(), CNTKInternalNodeNameFromUidAndName(variable.Uid(), variable.Name()), i);
inputNodesBasePtrs = { variableToNodeMap[outputs[0]] };
}
}
network->AddNodeToNetAndAttachInputs(computationNodePtr, inputNodesBasePtrs);
return computationNodePtr;
}
@ -1012,10 +1041,9 @@ namespace CNTK
};
PreorderTraverseFunctions(rootFunction, PatchBlockArgumentsMapping);
std::function<bool(const Variable&)> IsVariableRoot;
IsVariableRoot = [this, &IsVariableRoot](const Variable& outputVar) {
std::function<bool(const Variable&)> IsVariableRoot = [this, &IsVariableRoot](const Variable& outputVar) {
auto mappingVariable = GetMappingVariable(outputVar);
return (m_isVariableRootMap[outputVar] && ((mappingVariable == outputVar) || IsVariableRoot(mappingVariable)));
return (m_isVariableRootMap[outputVar] && !IsFirstOutputOfMultiOutputUDF(mappingVariable) && ((mappingVariable == outputVar) || IsVariableRoot(mappingVariable)));
};
// If any of the function or requested outputs is not a root node, we need to explicitly

Просмотреть файл

@ -748,6 +748,24 @@ namespace CNTK
});
}
std::wstring Function::AsString() const
{
wstringstream wss;
bool first = true;
if (IsComposite())
wss << "Composite(" << RootFunction()->OpName() << "): ";
else
wss << OpName() <<": ";
bool reverse = Internal::IsReversingTensorShapesInErrorMessagesEnabled();
for (auto arg : Arguments(reverse))
wss << (first ? (first = false, "") : ", ") << arg.AsString();
wss << " -> ";
first = true;
for (auto out : Outputs())
wss << (first ? (first = false, "") : ", ") << out.AsString();
return wss.str();
}
FunctionPtr UnaryOp(PrimitiveOpType op, const Variable& operand, Dictionary&& opConfig, const std::wstring& name)
{
std::vector<Variable> operands = { operand };
@ -785,14 +803,14 @@ namespace CNTK
}
FunctionPtr Exp(const Variable& operand, const std::wstring& name)
{
{
return UnaryOp(PrimitiveOpType::Exp, operand, Dictionary(), name);
}
FunctionPtr Log(const Variable& operand, const std::wstring& name)
{
return UnaryOp(PrimitiveOpType::Log, operand, Dictionary(), name);
}
}
FunctionPtr Square(const Variable& operand, const std::wstring& name)
{
@ -1090,6 +1108,20 @@ namespace CNTK
return BinaryOp(PrimitiveOpType::EditDistanceError, prediction, labels, std::move(additionalProperties), name);
}
FunctionPtr ForwardBackward(const Variable& graph, const Variable& features, size_t blankTokenId, int delayConstraint, const std::wstring& name)
{
auto additionalProperties = Dictionary();
additionalProperties[PrimitiveFunction::AttributeNameBlankTokenId] = blankTokenId;
additionalProperties[PrimitiveFunction::AttributeNameDelayConstraint] = delayConstraint;
return BinaryOp(PrimitiveOpType::ForwardBackward, graph, features, std::move(additionalProperties), name);
}
FunctionPtr LabelsToGraph(const Variable& labels, const std::wstring& name)
{
return UnaryOp(PrimitiveOpType::LabelsToGraph, labels, Dictionary(), name);
}
FunctionPtr PastValue(const Variable& operand, const Variable& initialState, size_t offset, const std::wstring& name)
{
auto additionalProperties = Dictionary();
@ -1155,26 +1187,44 @@ namespace CNTK
const std::vector<bool>& autoPadding,
const NDShape& lowerPad,
const NDShape& upperPad,
bool transpose,
size_t maxTempMemSizeInSamples,
const std::wstring& name)
{
// Currently we require that the Convolution function's operand have a dynamic axis since otherwise
// the internal implementation incorrectly infers the batch axis dimension by picking up the first axis as
// the sample shape and considering the rest to be part of the batch axis
if (operand.DynamicAxes().empty())
LogicError("Convolution currently requires the main operand to have dynamic axes");
return Internal::Convolution(convolutionMap,
operand,
strides,
sharing,
autoPadding,
lowerPad,
upperPad,
false,
{0},
maxTempMemSizeInSamples,
name);
}
auto additionalProperties = Dictionary();
additionalProperties[PrimitiveFunction::AttributeNameStrides] = strides;
additionalProperties[PrimitiveFunction::AttributeNameSharing] = AsDictionaryValueVector(sharing);
additionalProperties[PrimitiveFunction::AttributeNameAutoPadding] = AsDictionaryValueVector(autoPadding);
additionalProperties[PrimitiveFunction::AttributeNameLowerPad] = lowerPad;
additionalProperties[PrimitiveFunction::AttributeNameUpperPad] = upperPad;
additionalProperties[PrimitiveFunction::AttributeNameTranspose] = transpose;
additionalProperties[PrimitiveFunction::AttributeNameMaxTempMemSizeInSamples] = maxTempMemSizeInSamples;
return BinaryOp(PrimitiveOpType::Convolution, convolutionMap, operand, std::move(additionalProperties), name);
FunctionPtr ConvolutionTranspose(const Variable& convolutionMap,
const Variable& operand,
const NDShape& strides,
const std::vector<bool>& sharing,
const std::vector<bool>& autoPadding,
const NDShape& lowerPad,
const NDShape& upperPad,
const NDShape& outputShape,
size_t maxTempMemSizeInSamples,
const std::wstring& name)
{
return Internal::Convolution(convolutionMap,
operand,
strides,
sharing,
autoPadding,
lowerPad,
upperPad,
true,
outputShape,
maxTempMemSizeInSamples,
name);
}
FunctionPtr ROIPooling(const Variable& convolutionMap, const Variable& rois, const NDShape& roiOutputShape, const std::wstring& name/* = L""*/)
@ -1312,13 +1362,7 @@ namespace CNTK
FunctionPtr ELU(const Variable& operand, const std::wstring& name)
{
auto operandPlaceholder = PlaceholderVariable();
auto lessThanZero = Less(operandPlaceholder, Constant::Scalar(operand.GetDataType(), 0.0));
auto result = ElementSelect(lessThanZero,
Minus(Exp(operandPlaceholder), Constant::Scalar(operand.GetDataType(), 1.0)),
operandPlaceholder);
return AsBlock(std::move(result), { { operandPlaceholder, operand } }, L"ELU", name);
return UnaryOp(PrimitiveOpType::ELU, operand, Dictionary(), name);
}
FunctionPtr LeakyReLU(const Variable& operand, const std::wstring& name)
@ -1343,6 +1387,14 @@ namespace CNTK
return AsBlock(std::move(result), { { operandPlaceholder, operand } }, L"PReLU", name);
}
FunctionPtr Softplus(const Variable& operand, const std::wstring& name)
{
auto operandPlaceholder = PlaceholderVariable();
auto result = LogAddExp(operandPlaceholder, Constant::Scalar(operand.GetDataType(), 0.0));
return AsBlock(std::move(result), { { operandPlaceholder, operand } }, L"Softplus", name);
}
FunctionPtr Argmax(const Variable& operand, const Axis& axis, const std::wstring& name)
{
return Internal::ReduceElements(operand, PrimitiveFunction::InternalArgmaxReductionOpName, axis, name);
@ -1607,5 +1659,36 @@ namespace CNTK
// E.g. used for seq2seq.
return BinaryOp(PrimitiveOpType::ReconcileDynamicAxis, operand, axesAsOperand, Dictionary(), name);
}
FunctionPtr Convolution(const Variable& convolutionMap,
const Variable& operand,
const NDShape& strides,
const std::vector<bool>& sharing,
const std::vector<bool>& autoPadding,
const NDShape& lowerPad,
const NDShape& upperPad,
bool transpose,
const NDShape& outputShape,
size_t maxTempMemSizeInSamples,
const std::wstring& name)
{
// Currently we require that the Convolution function's operand have a dynamic axis since otherwise
// the internal implementation incorrectly infers the batch axis dimension by picking up the first axis as
// the sample shape and considering the rest to be part of the batch axis
if (operand.DynamicAxes().empty())
LogicError("Convolution currently requires the main operand to have dynamic axes");
auto additionalProperties = Dictionary();
additionalProperties[PrimitiveFunction::AttributeNameStrides] = strides;
additionalProperties[PrimitiveFunction::AttributeNameSharing] = AsDictionaryValueVector(sharing);
additionalProperties[PrimitiveFunction::AttributeNameAutoPadding] = AsDictionaryValueVector(autoPadding);
additionalProperties[PrimitiveFunction::AttributeNameLowerPad] = lowerPad;
additionalProperties[PrimitiveFunction::AttributeNameUpperPad] = upperPad;
additionalProperties[PrimitiveFunction::AttributeNameTranspose] = transpose;
additionalProperties[PrimitiveFunction::AttributeNameOutputShape] = outputShape;
additionalProperties[PrimitiveFunction::AttributeNameMaxTempMemSizeInSamples] = maxTempMemSizeInSamples;
return BinaryOp(PrimitiveOpType::Convolution, convolutionMap, operand, std::move(additionalProperties), name);
}
}
}

24
Source/CNTKv2LibraryDll/PrimitiveFunction.cpp Executable file → Normal file
Просмотреть файл

@ -54,6 +54,7 @@ namespace CNTK
/*static*/ const std::wstring PrimitiveFunction::AttributeNameLowerPad = L"lowerPad";
/*static*/ const std::wstring PrimitiveFunction::AttributeNameUpperPad = L"upperPad";
/*static*/ const std::wstring PrimitiveFunction::AttributeNameTranspose = L"transpose";
/*static*/ const std::wstring PrimitiveFunction::AttributeNameOutputShape = L"outputShape";
/*static*/ const std::wstring PrimitiveFunction::AttributeNameMaxTempMemSizeInSamples = L"maxTempMemSizeInSamples";
/*static*/ const std::wstring PrimitiveFunction::AttributeNameROIOutputShape = L"roiOutputShape";
/*static*/ const std::wstring PrimitiveFunction::AttributeNamePoolingType = L"poolingType";
@ -81,6 +82,8 @@ namespace CNTK
/*static*/ const std::wstring PrimitiveFunction::AttributeNameInsertionPenalty = L"InsertionPenalty";
/*static*/ const std::wstring PrimitiveFunction::AttributeNameSquashInputs = L"SquashInputs";
/*static*/ const std::wstring PrimitiveFunction::AttributeNameTokensToIgnore = L"TokensToIgnore";
/*static*/ const std::wstring PrimitiveFunction::AttributeNameDelayConstraint = L"DelayConstraint";
/*static*/ const std::wstring PrimitiveFunction::AttributeNameBlankTokenId = L"BlankTokenId";
/*static*/ DataType PrimitiveFunction::GetOutputDataType(PrimitiveOpType op, std::vector<Variable>& inputs, bool inferDimensions)
{
@ -306,7 +309,9 @@ namespace CNTK
case PrimitiveOpType::Sin:
case PrimitiveOpType::Cos:
case PrimitiveOpType::Pass:
case PrimitiveOpType::LabelsToGraph:
case PrimitiveOpType::StopGradient:
case PrimitiveOpType::ELU:
assert(m_inputs.size() == 1);
outputShape = UnaryElementwiseOpOutputShape(m_inputs[0].Shape());
break;
@ -518,6 +523,9 @@ namespace CNTK
auto& strides = m_attributes[PrimitiveFunction::AttributeNameStrides].Value<NDShape>();
auto& lowerPad = m_attributes[PrimitiveFunction::AttributeNameLowerPad].Value<NDShape>();
auto& upperPad = m_attributes[PrimitiveFunction::AttributeNameUpperPad].Value<NDShape>();
NDShape tmpShape = NDShape::Unknown;
if (m_attributes.Contains(PrimitiveFunction::AttributeNameOutputShape))
tmpShape = m_attributes[PrimitiveFunction::AttributeNameOutputShape].Value<NDShape>();
auto sharing = AsVector<bool>(m_attributes[PrimitiveFunction::AttributeNameSharing].Value<std::vector<DictionaryValue>>());
auto autoPadding = AsVector<bool>(m_attributes[PrimitiveFunction::AttributeNameAutoPadding].Value<std::vector<DictionaryValue>>());
bool transpose = m_attributes[PrimitiveFunction::AttributeNameTranspose].Value<bool>();
@ -527,7 +535,20 @@ namespace CNTK
NDShape outputMapCount, kernelShape;
std::tie(outputMapCount, kernelShape) = GetConvolutionOutputMapCountAndKernelShape(m_inputs[0].Shape(), m_inputs[1].Shape());
auto originalKernelShape = kernelShape;
outputShape = ConvolutionOpOutputShape(m_op, m_inputs[1].Shape(), kernelShape, outputMapCount, strides, sharing, autoPadding, lowerPad, upperPad, transpose, true);
auto inputShape = m_inputs[1].Shape();
if (!transpose || tmpShape.IsUnknown() || tmpShape[0] == 0)
{
outputShape = ConvolutionOpOutputShape(m_op, inputShape, kernelShape, outputMapCount, strides, sharing, autoPadding, lowerPad, upperPad, transpose, true);
}
else
{
NDShape inferredInputShape = ConvolutionOpOutputShape(m_op, tmpShape, kernelShape, outputMapCount, strides, sharing, autoPadding, lowerPad, upperPad, false, true);
if (inferredInputShape != inputShape)
RuntimeError("The shape of the convolution transpose operand %ls is different from the result of convoluting the specified output argument using the provided options %ls", inputShape.AsString().c_str(), inferredInputShape.AsString().c_str());
outputShape = tmpShape;
}
if (originalKernelShape != kernelShape)
{
for (size_t i2 = 0; i2 < kernelShape.Rank(); ++i2)
@ -540,6 +561,7 @@ namespace CNTK
}
case PrimitiveOpType::CosDistance:
case PrimitiveOpType::EditDistanceError:
case PrimitiveOpType::ForwardBackward:
case PrimitiveOpType::Logistic:
case PrimitiveOpType::SquaredError:
case PrimitiveOpType::CrossEntropyWithSoftmax:

22
Source/CNTKv2LibraryDll/PrimitiveFunction.h Executable file → Normal file
Просмотреть файл

@ -68,6 +68,8 @@ namespace CNTK
{PrimitiveOpType::CrossEntropyWithSoftmax, L"CrossEntropyWithSoftmax"},
{PrimitiveOpType::ClassificationError, L"ClassificationError"},
{PrimitiveOpType::EditDistanceError, L"EditDistanceError" },
{PrimitiveOpType::ForwardBackward, L"ForwardBackward" },
{PrimitiveOpType::LabelsToGraph, L"LabelsToGraph" },
{PrimitiveOpType::PastValue, L"PastValue"},
{PrimitiveOpType::FutureValue, L"FutureValue"},
{PrimitiveOpType::ReduceElements, L"ReduceElements"},
@ -87,12 +89,13 @@ namespace CNTK
{PrimitiveOpType::Sin, L"Sin"},
{PrimitiveOpType::Cos, L"Cos"},
{PrimitiveOpType::Pass, L"Pass"},
{ PrimitiveOpType::Block, L"Block" },
{ PrimitiveOpType::Unpooling, L"Unpooling" },
{ PrimitiveOpType::LambdaRank, L"LambdaRank" },
{ PrimitiveOpType::NDCG, L"NDCG" },
{ PrimitiveOpType::NoOp, L"NoOp" },
{ PrimitiveOpType::StopGradient, L"StopGradient" }
{PrimitiveOpType::Block, L"Block" },
{PrimitiveOpType::Unpooling, L"Unpooling" },
{PrimitiveOpType::LambdaRank, L"LambdaRank" },
{PrimitiveOpType::NDCG, L"NDCG" },
{PrimitiveOpType::NoOp, L"NoOp" },
{PrimitiveOpType::StopGradient, L"StopGradient" },
{PrimitiveOpType::ELU, L"ELU" },
};
inline const std::wstring& PrimitiveOpTypeName(PrimitiveOpType opType)
@ -210,6 +213,7 @@ namespace CNTK
static const std::wstring AttributeNameLowerPad;
static const std::wstring AttributeNameUpperPad;
static const std::wstring AttributeNameTranspose;
static const std::wstring AttributeNameOutputShape;
static const std::wstring AttributeNameMaxTempMemSizeInSamples;
static const std::wstring AttributeNameROIOutputShape;
static const std::wstring AttributeNamePoolingType;
@ -237,6 +241,8 @@ namespace CNTK
static const std::wstring AttributeNameInsertionPenalty;
static const std::wstring AttributeNameSquashInputs;
static const std::wstring AttributeNameTokensToIgnore;
static const std::wstring AttributeNameDelayConstraint;
static const std::wstring AttributeNameBlankTokenId;
protected:
PrimitiveFunction(PrimitiveOpType op, const std::vector<Variable>& inputs, Dictionary&& functionConfig, const std::wstring& functionName, const std::wstring& uid)
@ -731,6 +737,8 @@ namespace CNTK
// version 2: changed in 7af3a7c0e46cb12f873f1289400a9c5d86746662. TODO(n17s): add description.
// version 3: changed in df0ab4e58186738931968e806b61bc80d7b6e20e. TODO(pkrannen): add description.
// version 4: added extra parameter (#6) for the running mean sample count in BatchNormalization.
static const size_t s_serializationVersion = 7;
// Version 6: Add argmax and argmin to ReduceElement.
// Version 8: Add ELU node.
static const size_t s_serializationVersion = 8;
};
}

2
Source/CNTKv2LibraryDll/PrimitiveOpType.h Executable file → Normal file
Просмотреть файл

@ -74,6 +74,8 @@ namespace CNTK
NoOp = 62,
LabelsToGraph = 63,
StopGradient = 64,
ELU = 65,
ForwardBackward = 66,
// New op types should only be appended to the end of this list
UnknownOP
// and UnknownOP should always be last.

Просмотреть файл

@ -12,6 +12,15 @@
namespace CNTK
{
namespace Internal
{
// TODO: Workaround for back compat. Should not be used and will be removed in the next version.
CNTK_API void AddProgressWriters(const TrainerPtr& t, const std::vector<ProgressWriterPtr>& w)
{
t->AddProgressWriters(w);
}
}
using namespace std;
const static std::wstring s_trainingMinibatchSource = L"TrainingMinibatchSource";
@ -22,6 +31,36 @@ namespace CNTK
find_if(s.begin(), s.end(), [](wchar_t c) { return !isdigit(c); }) == s.end();
}
CheckpointConfig::CheckpointConfig(
const std::wstring& checkPointFileName,
size_t checkpointFrequencyInSamples,
bool restoreFromCheckpointIfExists,
bool preserveAllCheckpoints) :
m_preserveAll(preserveAllCheckpoints),
m_restore(restoreFromCheckpointIfExists),
m_fileName(checkPointFileName),
m_frequency(checkpointFrequencyInSamples)
{
if (m_fileName.empty())
{
if (checkpointFrequencyInSamples != 0 && checkpointFrequencyInSamples != std::numeric_limits<size_t>::max())
InvalidArgument("Checkpoint file name is not allowed to be empty if checkpoint frequency is non zero.");
if (preserveAllCheckpoints)
InvalidArgument("Checkpoint file name is not allowed to be empty if 'preserve all checkpoints' is specified.");
checkpointFrequencyInSamples = 0;
}
}
CrossValidationConfig::CrossValidationConfig(
const MinibatchSourcePtr& crossValidationSource,
const MinibatchSizeSchedule& crossValidationSchedule,
size_t crossValidationFrequencyInSamples):
m_source(crossValidationSource),
m_mbSize(crossValidationSchedule),
m_frequency(crossValidationFrequencyInSamples)
{
}
TrainingSessionPtr CreateBasicTrainingSession(
const MinibatchSourcePtr& trainingSource,
const TrainerPtr& trainer,
@ -38,6 +77,10 @@ namespace CNTK
size_t progressFrequency,
const std::vector<ProgressWriterPtr>& progressWriters)
{
fprintf(stderr, "WARNING:CreateBasicTrainingSession is deprecated and will be removed in the next beta (13)."
"Instructions for updating:"
"Please switch to CreateTrainingSession function and then call SetCheckpointing/SetCrossValidation/SetPrintingProgress as needed.");
return MakeSharedObject<TrainingSession>(trainingSource,
trainer,
modelInputToMinibatchSourceStream,
@ -54,6 +97,24 @@ namespace CNTK
progressWriters);
}
TrainingSessionPtr CreateTrainingSession(
const TrainerPtr& trainer,
const MinibatchSourcePtr& trainingSource,
const MinibatchSizeSchedule& minibatchSizeSchedule,
const std::unordered_map<Variable, StreamInformation>& inputVarToStream,
size_t maxNumTrainingSamples,
size_t progressFrequency,
const CheckpointConfig& checkpointing,
const CrossValidationConfig& crossValidation)
{
return MakeSharedObject<TrainingSession>(trainer,
trainingSource,
minibatchSizeSchedule,
inputVarToStream,
maxNumTrainingSamples,
progressFrequency, checkpointing, crossValidation);
}
TrainingSession::TrainingSession(
const MinibatchSourcePtr& trainingSource,
const TrainerPtr& trainer,
@ -68,49 +129,56 @@ namespace CNTK
bool saveAllCheckpoints,
size_t maxNumberOfSamples,
size_t progressFrequencyInSamples,
const std::vector<ProgressWriterPtr>& progressWriters) :
m_trainingSource(trainingSource),
const std::vector<ProgressWriterPtr>& progressWriters)
: TrainingSession(
trainer, trainingSource, schedule, modelInputToMinibatchSourceStream, maxNumberOfSamples, progressFrequencyInSamples,
CheckpointConfig(checkPointFileName, checkpointFrequencyInSamples, restoreFromCheckpointIfExists, saveAllCheckpoints),
CrossValidationConfig(crossValidationSource, crossValidationSchedule, crossValidationFrequencyInSamples))
{
if (progressFrequencyInSamples)
{
trainer->AddProgressWriters(progressWriters);
}
}
TrainingSession::TrainingSession(
const TrainerPtr& trainer,
const MinibatchSourcePtr& trainingSource,
const MinibatchSizeSchedule& minibatchSizeSchedule,
const std::unordered_map<Variable, StreamInformation>& inputVarToStream,
size_t maxNumTrainingSamples,
size_t progressFrequency,
const CheckpointConfig& checkpointing,
const CrossValidationConfig& crossValidation) :
m_trainer(trainer),
m_modelInputToMinibatchSourceStream(modelInputToMinibatchSourceStream),
m_checkPointFileName(checkPointFileName),
m_source(trainingSource),
m_mbSize(minibatchSizeSchedule),
m_varToStream(inputVarToStream),
m_maxNumSamples(maxNumTrainingSamples),
m_progressFrequency(progressFrequency),
m_checkpoint(checkpointing),
m_cv(crossValidation),
m_parallelAfterSamples(0),
m_workerRank(0),
m_numberOfWorkers(1),
m_minibatchSizeSchedule(schedule),
m_maxNumberOfSamples(maxNumberOfSamples),
m_restoreFromCheckpointIfExists(restoreFromCheckpointIfExists),
m_saveAllCheckpoints(saveAllCheckpoints),
m_crossValidationSource(crossValidationSource),
m_crossValidationSchedule(crossValidationSchedule)
m_numberOfWorkers(1)
{
if (!trainingSource)
InvalidArgument("Training minibatch source is not allowed to be null.");
if (!trainer)
if (!m_trainer)
InvalidArgument("Trainer is not allowed to be null.");
if(modelInputToMinibatchSourceStream.empty())
InvalidArgument("Input mapping is not allowed to be empty.");
if (m_checkPointFileName.empty())
{
if(checkpointFrequencyInSamples != 0 && checkpointFrequencyInSamples != std::numeric_limits<size_t>::max())
InvalidArgument("Checkpoint file name is not allowed to be empty if checkpoint frequency is non zero.");
if(saveAllCheckpoints)
InvalidArgument("Checkpoint file name is not allowed to be empty if 'save all checkpoints' is specified.");
checkpointFrequencyInSamples = 0;
}
if (!m_source)
InvalidArgument("Training source must not be null.");
if (!m_crossValidationSource)
{
if(crossValidationFrequencyInSamples != 0 && crossValidationFrequencyInSamples != std::numeric_limits<size_t>::max())
InvalidArgument("Cross validation minibatch source is not allowed to be empty.");
crossValidationFrequencyInSamples = 0;
}
if (m_maxNumSamples == 0)
InvalidArgument("maxNumTrainingSamples must not be zero.");
if (m_varToStream.empty())
InvalidArgument("inputVarToStream mapping must not be empty.");
// Let's calculate the warm up period the distributed learners may need.
// We will take the maximum warm up period required.
auto learners = trainer->ParameterLearners();
auto learners = m_trainer->ParameterLearners();
m_parallelAfterSamples = 0;
for (const auto& l: learners)
for (const auto& l : learners)
{
auto distributed = std::dynamic_pointer_cast<DistributedLearner>(l);
if (distributed)
@ -122,47 +190,49 @@ namespace CNTK
}
// Fill-in required actions.
if (checkpointFrequencyInSamples != 0)
m_actions.push_back({ checkpointFrequencyInSamples, 0, 0,
if (m_checkpoint.m_frequency != 0)
m_actions.push_back({ m_checkpoint.m_frequency, 0, 0,
[this](size_t currentIndex, const DeviceDescriptor&)
{
SaveCheckpoint(currentIndex);
// enable profiler after the first checkpoint
// This has effect only if the profiler is globally enabled by StartProfiler()
Microsoft::MSR::CNTK::ProfilerEnable(true);
return true;
} });
if(crossValidationFrequencyInSamples != 0)
m_actions.push_back({ crossValidationFrequencyInSamples, 0, 0,
[this](size_t currentIndex, const DeviceDescriptor& d) { CrossValidate(currentIndex, d); } });
if (m_cv.m_frequency != 0)
m_actions.push_back({ m_cv.m_frequency , 0, 0,
[this](size_t currentIndex, const DeviceDescriptor& d) { return CrossValidate(currentIndex, d); } });
if (progressFrequencyInSamples != 0)
m_actions.push_back({ progressFrequencyInSamples, 0, 0,
[this](size_t currentIndex, const DeviceDescriptor&) { ReportProgress(currentIndex); } });
m_trainer->AddProgressWriters(progressWriters);
if (m_progressFrequency != 0)
{
m_actions.push_back({ m_progressFrequency, 0, 0,
[this](size_t currentIndex, const DeviceDescriptor&) { ReportProgress(currentIndex); return true; } });
}
}
void TrainingSession::Train(const DeviceDescriptor& computeDevice)
{
std::unordered_map<Variable, ValuePtr> minibatch;
bool shouldTrain = m_maxNumberOfSamples > 0;
bool shouldTrain = m_maxNumSamples > 0;
// Let's try to restore if required.
size_t restoredNumberOfSamples = 0;
if (m_restoreFromCheckpointIfExists && !m_checkPointFileName.empty())
if (m_checkpoint.m_restore && !m_checkpoint.m_fileName.empty())
{
RestoreFromCheckpoint();
restoredNumberOfSamples = m_trainer->TotalNumberOfSamplesSeen();
}
// Main train loop.
bool earlyExit = false;
while (shouldTrain)
{
// Get next minibatch.
size_t samplesLeft = m_maxNumberOfSamples > m_trainer->TotalNumberOfSamplesSeen()
? m_maxNumberOfSamples - m_trainer->TotalNumberOfSamplesSeen()
: 0;
size_t samplesLeft = earlyExit || m_maxNumSamples <= Trainer()->TotalNumberOfSamplesSeen()
? 0
: m_maxNumSamples - Trainer()->TotalNumberOfSamplesSeen();
// Note that in case of distributed training we don't want to stop if the local minibatch
// is empty - it is possible that the other workers are still processing their minibatches.
@ -170,32 +240,34 @@ namespace CNTK
// Train on the minibatch.
OnMinibatchStart();
shouldTrain = m_trainer->TrainMinibatch(minibatch, computeDevice);
OnMinibatchEnd();
shouldTrain = Trainer()->TrainMinibatch(minibatch, computeDevice);
earlyExit |= !OnMinibatchEnd(); // If the callback wants to have early exit - we stop training.
auto profMisc = Microsoft::MSR::CNTK::ScopeProfile(Microsoft::MSR::CNTK::profilerEvtMainPost);
// Peform actions if required.
size_t totalNumberOfSamples = m_trainer->TotalNumberOfSamplesSeen();
size_t totalNumberOfSamples = Trainer()->TotalNumberOfSamplesSeen();
for (auto& action : m_actions)
{
size_t index = totalNumberOfSamples / action.frequency;
if (index != action.currentIndex)
{
action.action(action.currentIndex, computeDevice);
// If any action wants to have early exit - we stop training.
earlyExit |= !action.action(action.currentIndex, computeDevice);
action.currentIndex = index;
action.sampleCountWhenLastCalled = totalNumberOfSamples;
}
}
}
if (restoredNumberOfSamples != m_trainer->TotalNumberOfSamplesSeen())
if (restoredNumberOfSamples != Trainer()->TotalNumberOfSamplesSeen())
{
// Let's do all actions on the last probably a partial data at the end.
for (auto& action: m_actions)
{
if (m_trainer->TotalNumberOfSamplesSeen() % action.frequency != 0 &&
m_trainer->TotalNumberOfSamplesSeen() != action.sampleCountWhenLastCalled)
if (Trainer()->TotalNumberOfSamplesSeen() % action.frequency != 0 &&
Trainer()->TotalNumberOfSamplesSeen() != action.sampleCountWhenLastCalled)
action.action(action.currentIndex, computeDevice);
}
}
@ -203,38 +275,48 @@ namespace CNTK
// In case of incremental - save final checkpoint.
// This is required only when we keep all existing checkpoints, otherwise
// The checkpoint was already saved with the proper name.
if (m_saveAllCheckpoints && !fexists(m_checkPointFileName))
if (m_checkpoint.m_frequency &&
m_checkpoint.m_preserveAll &&
!fexists(m_checkpoint.m_fileName))
SaveFinalCheckpoint();
}
// TODO: Possibly expose a limiting counter on the number of samples for validation.
void TrainingSession::CrossValidate(size_t currentIndex, const DeviceDescriptor& computeDevice)
bool TrainingSession::CrossValidate(size_t currentIndex, const DeviceDescriptor& computeDevice)
{
std::unordered_map<Variable, ValuePtr> minibatch;
double accumulatedError = 0;
double error;
size_t totalNumberOfSamples = 0;
size_t numberOfMinibatches = 0;
auto checkpoint = m_crossValidationSource->GetCheckpointState();
size_t sampleCount = 0;
while(GetCrossValidationMinibatch(minibatch, m_crossValidationSchedule[sampleCount], computeDevice), !minibatch.empty())
if (m_cv.m_source) // Running cross validation
{
// TODO: it may be slow to rely on TestMinibatch to return error each time, since it may require transfer
// of error from the GPU each time.
error = m_trainer->TestMinibatch(minibatch, computeDevice, sampleCount);
accumulatedError += error * sampleCount;
totalNumberOfSamples += sampleCount;
numberOfMinibatches++;
std::unordered_map<Variable, ValuePtr> minibatch;
double accumulatedError = 0;
double error = 0;
size_t totalNumberOfSamples = 0;
size_t numberOfMinibatches = 0;
auto checkpoint = m_cv.m_source->GetCheckpointState();
size_t sampleCount = 0;
while (GetCrossValidationMinibatch(minibatch, m_cv.m_mbSize[sampleCount], computeDevice), !minibatch.empty())
{
// TODO: it may be slow to rely on TestMinibatch to return error each time, since it may require transfer
// of error from the GPU each time.
error = m_trainer->TestMinibatch(minibatch, computeDevice, sampleCount);
accumulatedError += error * sampleCount;
totalNumberOfSamples += sampleCount;
numberOfMinibatches++;
}
m_cv.m_source->RestoreFromCheckpoint(checkpoint);
Trainer()->SummarizeTestProgress();
return OnCrossValidationEnd(currentIndex, accumulatedError / totalNumberOfSamples, totalNumberOfSamples, numberOfMinibatches);
}
else // Only invoking the callback.
{
return OnCrossValidationEnd(currentIndex, 0, 0, 0);
}
m_crossValidationSource->RestoreFromCheckpoint(checkpoint);
m_trainer->SummarizeTestProgress();
OnCrossValidationEnd(currentIndex, accumulatedError / totalNumberOfSamples, totalNumberOfSamples, numberOfMinibatches);
}
inline void TrainingSession::ReportProgress(size_t /*currentIndex*/)
{
m_trainer->SummarizeTrainingProgress();
Trainer()->SummarizeTrainingProgress();
}
void TrainingSession::GetTrainingMinibatch(std::unordered_map<Variable, ValuePtr>& minibatch, size_t maxMbSize, const DeviceDescriptor& computeDevice)
@ -242,7 +324,7 @@ namespace CNTK
size_t workerRank = m_workerRank, numberOfWorkers = m_numberOfWorkers;
// Check if we are operating in distributed mode.
if (m_parallelAfterSamples > m_trainer->TotalNumberOfSamplesSeen())
if (m_parallelAfterSamples > Trainer()->TotalNumberOfSamplesSeen())
{
numberOfWorkers = 1;
workerRank = 0;
@ -250,13 +332,13 @@ namespace CNTK
size_t mbSize = GetMinibatchSize();
mbSize = std::min(mbSize, maxMbSize);
GetNextMinibatch(m_trainingSource, minibatch, mbSize, workerRank, numberOfWorkers, computeDevice);
GetNextMinibatch(m_source, minibatch, mbSize, workerRank, numberOfWorkers, computeDevice);
}
void TrainingSession::GetCrossValidationMinibatch(std::unordered_map<Variable, ValuePtr>& minibatch, size_t maxMbSize, const DeviceDescriptor& computeDevice)
{
// TODO: Support distributed cross-validation, when TestMinibatch supports it.
GetNextMinibatch(m_crossValidationSource, minibatch, maxMbSize, 0, 1, computeDevice);
GetNextMinibatch(m_cv.m_source, minibatch, maxMbSize, 0, 1, computeDevice);
}
void TrainingSession::GetNextMinibatch(const MinibatchSourcePtr& source, std::unordered_map<Variable, ValuePtr>& minibatch, size_t mbSize, size_t workerRank, size_t numberOfWorkers, const DeviceDescriptor& computeDevice)
@ -271,34 +353,34 @@ namespace CNTK
if (minibatchData.empty())
return;
for (auto v : m_modelInputToMinibatchSourceStream)
for (auto v : m_varToStream)
minibatch.insert({ v.first, minibatchData[v.second].data });
}
void TrainingSession::RestoreFromCheckpoint(const std::wstring& checkpointFileName)
{
Dictionary externalState = m_trainer->RestoreFromCheckpoint(checkpointFileName);
m_trainingSource->RestoreFromCheckpoint(externalState[s_trainingMinibatchSource].Value<Dictionary>());
Dictionary externalState = Trainer()->RestoreFromCheckpoint(checkpointFileName);
m_source->RestoreFromCheckpoint(externalState[s_trainingMinibatchSource].Value<Dictionary>());
}
void TrainingSession::SaveCheckpoint(size_t currentIndex)
{
OnCheckpointStart(currentIndex);
Dictionary externalState;
externalState[s_trainingMinibatchSource] = m_trainingSource->GetCheckpointState();
externalState[s_trainingMinibatchSource] = m_source->GetCheckpointState();
wstring checkpointFile = m_checkPointFileName;
if (m_saveAllCheckpoints)
wstring checkpointFile = m_checkpoint.m_fileName;
if (m_checkpoint.m_preserveAll)
checkpointFile += std::to_wstring(currentIndex);
m_trainer->SaveCheckpoint(checkpointFile, externalState);
Trainer()->SaveCheckpoint(checkpointFile, externalState);
OnCheckpointEnd(currentIndex);
}
void TrainingSession::SaveFinalCheckpoint()
{
Dictionary externalState;
externalState[s_trainingMinibatchSource] = m_trainingSource->GetCheckpointState();
m_trainer->SaveCheckpoint(m_checkPointFileName, externalState);
externalState[s_trainingMinibatchSource] = m_source->GetCheckpointState();
Trainer()->SaveCheckpoint(m_checkpoint.m_fileName, externalState);
}
// Restores from a m_checkPointFileName file.
@ -308,29 +390,30 @@ namespace CNTK
// Where N is some positive integer.
void TrainingSession::RestoreFromCheckpoint()
{
assert(!m_checkPointFileName.empty());
assert(!m_checkpoint.m_fileName.empty());
auto checkpoint = m_checkpoint.m_fileName;
// Make sure the intermediate directories exist, so no need for further checks.
msra::files::make_intermediate_dirs(m_checkPointFileName);
msra::files::make_intermediate_dirs(checkpoint);
size_t pos = m_checkPointFileName.find_last_of(L"\\/");
size_t pos = checkpoint.find_last_of(L"\\/");
wstring parent;
wstring fileName;
if (pos == wstring::npos)
{
parent = L"..";
fileName = m_checkPointFileName;
fileName = checkpoint;
}
else
{
parent = m_checkPointFileName.substr(0, pos);
fileName = m_checkPointFileName.substr(pos);
parent = checkpoint.substr(0, pos);
fileName = checkpoint.substr(pos);
}
std::wstring restoreFile;
if (fexists(m_checkPointFileName))
if (fexists(checkpoint))
{
restoreFile = m_checkPointFileName;
restoreFile = checkpoint;
}
else
{
@ -375,7 +458,7 @@ namespace CNTK
this->RestoreFromCheckpoint(restoreFile);
// Recalculate actions indicies.
size_t totalNumberOfSamples = m_trainer->TotalNumberOfSamplesSeen();
size_t totalNumberOfSamples = Trainer()->TotalNumberOfSamplesSeen();
for (auto& action : m_actions)
{
action.currentIndex = totalNumberOfSamples / action.frequency;

Просмотреть файл

@ -454,6 +454,18 @@ namespace CNTK
#endif
}
bool IsFirstOutputOfMultiOutputUDF(const Variable& var)
{
if (!var.IsOutput())
return false;
auto owner = var.Owner();
if (dynamic_cast<PrimitiveFunction*>(owner.get()))
return false;
return (var == owner->Outputs()[0]) && (owner->Outputs().size() > 1);
}
std::vector<Axis> DynamicAxesFromInternalDynamicAxisName(const std::wstring& internalDynamicAxisName)
{
std::vector<Axis> inputVarDynamicAxes;
@ -513,7 +525,8 @@ namespace CNTK
if (var.GetDataType() != value->GetDataType())
LogicError("The Variable's DataType %s does not match the corresponding Value's DataType %s", DataTypeName(var.GetDataType()), DataTypeName(value->GetDataType()));
bool isPackedValue = (dynamic_cast<PackedValue*>(value.get()) != nullptr);
auto packedValue = dynamic_cast<PackedValue*>(value.get());
bool isPackedValue = (packedValue != nullptr) && packedValue->IsPacked();
// TODO: Is supplying dense data for an Input variable tagged as sparse, a fatal error even for packed value objects?
if (!isPackedValue)
@ -571,7 +584,7 @@ namespace CNTK
LogicError("The specified ElementType %s does not match the DataType %s", typeid(ElementType).name(), DataTypeName(value->GetDataType()));
auto packedValue = dynamic_cast<PackedValue*>(value.get());
if (packedValue)
if (packedValue && packedValue->IsPacked())
return packedValue->PackedData<ElementType>();
auto varShape = var.Shape();
@ -953,4 +966,34 @@ namespace CNTK
Data()->SetValue(0.0);
}
}
std::wstring DynamicAxesAsString(std::vector<Axis> da, bool rowMajor)
{
if (da.size() == 0)
return L"[]";
std::wstringstream wss;
wss << "[";
if (da == Axis::UnknownDynamicAxes())
wss << "???";
else
{
if (rowMajor)
std::reverse(da.begin(), da.end());
bool first = true;
for (auto d : da)
{
wss << (first ? "" : ", ");
if (d == Axis::DefaultBatchAxis())
wss << "#";
else if (d == Axis::DefaultDynamicAxis())
wss << "*";
else
wss << d.Name();
first = false;
}
}
wss << "]";
return wss.str();
}
}

Просмотреть файл

@ -141,9 +141,9 @@ namespace CNTK
inline std::wstring AsStringForErrorReporting(const NDShape& shape)
{
bool invertShape = Internal::IsReversingTensorShapesInErrorMessagesEnabled();
bool reverseShape = Internal::IsReversingTensorShapesInErrorMessagesEnabled();
auto displayShape = shape;
if (invertShape)
if (reverseShape)
{
for (size_t i = 0, j = shape.Rank() - 1; i < shape.Rank(); ++i, --j)
displayShape[i] = shape[j];
@ -526,6 +526,8 @@ namespace CNTK
InvalidArgument("The specified axis index (%d) exceeds the static #axes (%d) of the corresponding operand", (int)axis.StaticAxisIndex(), (int)operandShape.Rank());
}
bool IsFirstOutputOfMultiOutputUDF(const Variable& var);
std::vector<Axis> DynamicAxesFromInternalDynamicAxisName(const std::wstring& internalDynamicAxisName);
// Construct the dynamic axis name to be used internally for the CNTK InputNodes
@ -625,4 +627,7 @@ namespace CNTK
size_t m_numUpdates;
};
std::wstring DynamicAxesAsString(std::vector<Axis> da, bool rowMajor = false);
}

Просмотреть файл

@ -33,6 +33,8 @@ namespace CNTK
m_unpackedShape = m_unpackedShape.AppendShape({ packedDataLayout->GetNumTimeSteps(), packedDataLayout->GetNumSequences() });
}
bool IsPacked() const { return m_isPacked; }
void Unpack() const;
const NDShape& Shape() const override { return m_unpackedShape; }

Просмотреть файл

@ -183,6 +183,22 @@ namespace CNTK
}
}
std::wstring Variable::AsString() const
{
std::wstringstream wss;
wss << VariableKindName(Kind()) << "('";
if (Name() != L"")
wss << Name();
else
wss << Uid();
bool reverse = Internal::IsReversingTensorShapesInErrorMessagesEnabled();
if (reverse)
wss << "', " << DynamicAxesAsString(DynamicAxes(), reverse) << ", " << AsStringForErrorReporting(Shape()) << ")";
else
wss << "', " << AsStringForErrorReporting(Shape()) << ", " << DynamicAxesAsString(DynamicAxes(), reverse) << ")";
return wss.str();
}
static const std::wstring InitializerTypeAttributeName = L"initializerType";
static const std::wstring OutputRankAttributeName = L"outputRank";
static const std::wstring FilterRankAttributeName = L"filterRank";

Просмотреть файл

@ -0,0 +1,13 @@
// Constants.h -- the constants used by CNTK
//
#pragma once
#ifndef _CONSTANTS_H_
#define _CONSTANTS_H_
// Constants used in aggregation
const size_t DEFAULT_PACK_THRESHOLD_SIZE_IN_KB = 32;
const size_t DEFAULT_PACK_THRESHOLD_SIZE_IN_BYTES = DEFAULT_PACK_THRESHOLD_SIZE_IN_KB * 1024;
#endif

Просмотреть файл

@ -267,12 +267,12 @@ template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateConvolutionNode(const std::wstring& nodeName, const TensorShape& kernelShape, const TensorShape& mapCount,
const TensorShape& strideShape, const std::vector<bool>& sharing,
const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
bool transpose, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples)
bool transpose, const TensorShape& outputShape, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples)
{
return net.AddNodeToNetWithElemType(New<ConvolutionNode<ElemType>>(net.GetDeviceId(), nodeName,
kernelShape, mapCount, strideShape,
sharing, autoPadding, lowerPad, upperPad,
transpose, imageLayout, maxTempMemSizeInSamples));
transpose, outputShape, imageLayout, maxTempMemSizeInSamples));
}
template <class ElemType>
@ -344,13 +344,13 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Convo
const TensorShape& kernelShape, const TensorShape& mapCount,
const TensorShape& strideShape, const std::vector<bool>& sharing,
const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
bool transpose, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples,
bool transpose, const TensorShape& outputShape, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ConvolutionNode<ElemType>>(net.GetDeviceId(), nodeName,
kernelShape, mapCount, strideShape,
sharing, autoPadding, lowerPad, upperPad,
transpose, imageLayout, maxTempMemSizeInSamples),
transpose, outputShape, imageLayout, maxTempMemSizeInSamples),
{ weight, inputValues });
}
@ -502,9 +502,9 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Seque
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::ForwardBackward(const ComputationNodePtr label, const ComputationNodePtr prediction, int blankTokenId, int delayConstraint, const std::wstring nodeName)
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::ForwardBackward(const ComputationNodePtr graph, const ComputationNodePtr features, int blankTokenId, int delayConstraint, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ForwardBackwardNode<ElemType>>(net.GetDeviceId(), nodeName, blankTokenId, delayConstraint), { label, prediction });
return net.AddNodeToNetAndAttachInputs(New<ForwardBackwardNode<ElemType>>(net.GetDeviceId(), nodeName, blankTokenId, delayConstraint), { graph, features });
}
template <class ElemType>

Просмотреть файл

@ -54,7 +54,7 @@ public:
ComputationNodePtr CreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName = L"");
ComputationNodePtr CreateConvolutionNode(const std::wstring& nodeName, const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& strideShape,
const std::vector<bool>& sharing, const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
bool transpose, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples);
bool transpose, const TensorShape& outputShape, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples);
ComputationNodePtr CreateConvolutionNode(const std::wstring& nodeName, const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels,
const size_t horizontalSubsample, const size_t verticalSubsample,
ImageLayoutKind imageLayoutKind, const bool zeroPadding = false, const size_t maxTempMemSizeInSamples = 0);
@ -84,7 +84,7 @@ public:
const ComputationNodePtr inputValues,
const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& strideShape,
const std::vector<bool>& sharing, const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
bool transpose, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples,
bool transpose, const TensorShape& outputShape, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples,
const std::wstring nodeName = L"");
ComputationNodePtr Pooling(const ComputationNodePtr inputValues,
PoolKind poolKind, const TensorShape& kernelShape, const TensorShape& strideShape,
@ -126,7 +126,7 @@ public:
ComputationNodePtr CosDistance(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr CrossEntropy(const ComputationNodePtr label, const ComputationNodePtr prediction, const std::wstring nodeName = L"");
ComputationNodePtr CrossEntropyWithSoftmax(const ComputationNodePtr label, const ComputationNodePtr prediction, const std::wstring nodeName = L"");
ComputationNodePtr ForwardBackward(const ComputationNodePtr label, const ComputationNodePtr prediction, int blankTokenId, int delayConstraint, const std::wstring nodeName = L"");
ComputationNodePtr ForwardBackward(const ComputationNodePtr graph, const ComputationNodePtr features, int blankTokenId, int delayConstraint, const std::wstring nodeName = L"");
ComputationNodePtr DiagTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr Diagonal(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr Dropout(const ComputationNodePtr a, const std::wstring nodeName = L"");

Просмотреть файл

@ -105,7 +105,7 @@ template<class ElemType>
}
std::shared_ptr<Matrix<ElemType>> unpackedData;
if ((maxNumTimeSteps == 1) || (numSequences == 1))
if ((maxNumTimeSteps == 1) || (numSequences == 1) || (batchMajor && (layout->GetNumParallelSequences() == layout->GetNumSequences())))
unpackedData = std::make_shared<Matrix<ElemType>>(packedData.AsReference());
else
{

Просмотреть файл

@ -49,7 +49,8 @@
#define CNTK_MODEL_VERSION_17 17 // use 8 bytes for rng seeds on both platforms
#define CNTK_MODEL_VERSION_18 18 // reserving 18 for dilated convolution, write out one more TensorShape
#define CNTK_MODEL_VERSION_19 19 // batch norm: flag whether running mean count is 0
#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_19
#define CNTK_MODEL_VERSION_20 20 // adding output shape to convolution node
#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_20
// helper mode for debugging
@ -1417,13 +1418,13 @@ public:
// for debugging, set the gaps to NaN instead (to track whether it bubbles up somewhere)
void InvalidateMissingValueColumns(const FrameRange& fr) override final
{
// fprintf(stderr, "invalidating %ls %ls m_value column range %d\n", NodeName().c_str(), OperationName().c_str(), (int)fr.timeIdxInSeq);
MaskMissingColumnsTo(*m_value, m_pMBLayout, fr, Matrix<ElemType>::MakeNan(__LINE__));
if (m_value->GetMatrixType() != SPARSE) // Sparse matrices can only be masked with 0s
MaskMissingColumnsTo(*m_value, m_pMBLayout, fr, Matrix<ElemType>::MakeNan(__LINE__));
}
void InvalidateMissingGradientColumns(const FrameRange& fr) override final
{
// fprintf(stderr, "invalidating %ls %ls m_gradient column range %d\n", NodeName().c_str(), OperationName().c_str(), (int)fr.timeIdxInSeq);
MaskMissingColumnsTo(*m_gradient, m_pMBLayout, fr, Matrix<ElemType>::MakeNan(__LINE__));
if (m_gradient->GetMatrixType() != SPARSE) // Sparse matrices can only be masked with 0s
MaskMissingColumnsTo(*m_gradient, m_pMBLayout, fr, Matrix<ElemType>::MakeNan(__LINE__));
}
static TensorView<ElemType> Unpack(const TensorShape& sampleShape,

Просмотреть файл

@ -53,14 +53,14 @@ class ConvolutionNodeBase : public ComputationNode<ElemType>
public:
ConvolutionNodeBase(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name), m_poolKind(PoolKind::None), m_transpose(false), m_maxTempMemSizeInSamples(0)
: Base(deviceId, name), m_poolKind(PoolKind::None), m_transpose(false), m_outputShape(TensorShape(0)), m_maxTempMemSizeInSamples(0)
{
}
ConvolutionNodeBase(DEVICEID_TYPE deviceId, const wstring& name, const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& strideShape,
const std::vector<bool>& sharing, const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
PoolKind poolKind, bool transpose, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples)
PoolKind poolKind, bool transpose, const TensorShape& outputShape, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples)
: Base(deviceId, name), m_kernelShape(kernelShape), m_mapCount(mapCount), m_stride(strideShape), m_sharing(sharing),
m_autoPad(autoPadding), m_lowerPad(lowerPad), m_upperPad(upperPad), m_poolKind(poolKind), m_transpose(transpose),
m_autoPad(autoPadding), m_lowerPad(lowerPad), m_upperPad(upperPad), m_poolKind(poolKind), m_transpose(transpose), m_outputShape(outputShape),
m_imageLayout(imageLayout), m_maxTempMemSizeInSamples(maxTempMemSizeInSamples)
{
}
@ -81,6 +81,7 @@ public:
fstream << (int32_t)m_imageLayout;
fstream << m_maxTempMemSizeInSamples;
fstream << m_transpose;
m_outputShape.Save(fstream);
}
void Load(File& fstream, size_t modelVersion) override
@ -109,6 +110,10 @@ public:
{
fstream >> m_transpose;
}
if (modelVersion >= CNTK_MODEL_VERSION_20)
{
m_outputShape.Load(fstream);
}
}
void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
@ -126,6 +131,7 @@ public:
node->m_upperPad = m_upperPad;
node->m_poolKind = m_poolKind;
node->m_transpose = m_transpose;
node->m_outputShape = m_outputShape;
node->m_imageLayout = m_imageLayout;
node->m_maxTempMemSizeInSamples = m_maxTempMemSizeInSamples;
}
@ -148,6 +154,7 @@ public:
TensorShape LowerPad() const { return m_lowerPad; }
TensorShape UpperPad() const { return m_upperPad; }
bool Transpose() const { return m_transpose; }
TensorShape OutputShape() const { return m_outputShape; }
size_t MaxTempMemSizeInSamples() const { return m_maxTempMemSizeInSamples; }
PoolKind PoolingKind() const { return m_poolKind; }
@ -216,7 +223,8 @@ protected:
TensorShape m_lowerPad;
TensorShape m_upperPad;
PoolKind m_poolKind;
bool m_transpose; // means de-convolution ...I think
bool m_transpose;
TensorShape m_outputShape;
ImageLayoutKind m_imageLayout;
size_t m_maxTempMemSizeInSamples;
@ -238,6 +246,7 @@ protected: \
using Base::m_upperPad; \
using Base::m_poolKind; \
using Base::m_transpose; \
using Base::m_outputShape; \
using Base::m_imageLayout; \
using Base::m_maxTempMemSizeInSamples; \
using Base::m_tempMatrixForward; \
@ -262,8 +271,8 @@ public:
}
ConvolutionNode(DEVICEID_TYPE deviceId, const wstring& name, const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& strideShape,
const std::vector<bool>& sharing, const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
bool transpose, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples)
: Base(deviceId, name, kernelShape, mapCount, strideShape, sharing, autoPadding, lowerPad, upperPad, PoolKind::None, transpose, imageLayout, maxTempMemSizeInSamples),
bool transpose, const TensorShape &outputShape, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples)
: Base(deviceId, name, kernelShape, mapCount, strideShape, sharing, autoPadding, lowerPad, upperPad, PoolKind::None, transpose, outputShape, imageLayout, maxTempMemSizeInSamples),
m_convolution2D(false)
{
}
@ -273,14 +282,14 @@ public:
: ConvolutionNode(deviceId, name, TensorShape(kernelWidth, kernelHeight, 1), TensorShape(1, 1, outputChannels),
TensorShape(horizontalSubsample, verticalSubsample, 1), vector<bool>{true},
vector<bool>{zeroPadding}, TensorShape(0), TensorShape(0),
false, imageLayout, maxTempMemSizeInSamples)
false, TensorShape(0), imageLayout, maxTempMemSizeInSamples)
{
m_convolution2D = true;
}
ConvolutionNode(const ScriptableObjects::IConfigRecordPtr configp)
: ConvolutionNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"kernelShape"), configp->Get(L"mapCount"), configp->Get(L"strideShape"),
configp->Get(L"dimSharing"), configp->Get(L"dimPadding"), configp->Get(L"dimPadLower"), configp->Get(L"dimPadUpper"),
configp->Get(L"transpose"), ImageLayoutKindFrom(configp->Get(L"imageLayout")), configp->Get(L"maxTempMemSizeInSamples"))
configp->Get(L"transpose"), configp->Get(L"dimOutputShape"), ImageLayoutKindFrom(configp->Get(L"imageLayout")), configp->Get(L"maxTempMemSizeInSamples"))
{
AttachInputsFromConfig(configp, GetExpectedNumInputs());
}
@ -443,13 +452,41 @@ public:
{
outputShape = ConvolveGeometry::ComputeOutputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
if (m_outputShape.GetRank() > 0 && m_outputShape != TensorShape(0)) // user have explicitly set m_outputShape, we check if it's the same as outputShape
{
if (m_outputShape != outputShape)
{
InvalidArgument("%ls %ls the shape of the specified convolution output %ls is different from "
"the result of convoluting the input argument using the provided options %ls. It is recommonded "
"that the output shape is not specified for convolution.", NodeName().c_str(), OperationName().c_str(),
static_cast<std::wstring>(m_outputShape).c_str(),
static_cast<std::wstring>(outputShape).c_str());
}
}
}
else
{
// In case of transpose (deconvolution), node input (inputShape) is really the output of the convolution
// and node output (outDims) is convolution input. ConvolveGeometry does not care about deconvolutions (it does not have to).
outputShape = ConvolveGeometry::ComputeInputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
if (m_outputShape.GetRank() <= 0 || m_outputShape == TensorShape(0))
{
// In case of convolution transpose (deconvolution), node input (inputShape) is really the output of the convolution
// and node output (outDims) is convolution input. ConvolveGeometry does not care about deconvolutions (it does not have to).
outputShape = ConvolveGeometry::ComputeInputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
}
else
{
// in case the user specifies the output shape, we make sure the input shape can be the result of
// convolution from the specified output shape
auto inferredShape = ConvolveGeometry::ComputeOutputShape(m_outputShape, m_kernelShape, m_mapCount, m_stride, m_sharing, m_autoPad, m_lowerPad, m_upperPad);
if (inputShape != inferredShape)
InvalidArgument("%ls %ls the shape of the convolution transpose operand %ls is different from "
"the result of convoluting the specified output argument using "
"the provided options %ls", NodeName().c_str(), OperationName().c_str(),
static_cast<std::wstring>(inputShape).c_str(),
static_cast<std::wstring>(inferredShape).c_str());
outputShape = m_outputShape;
}
}
if (m_imageLayout == ImageLayoutKind::CHW)
@ -760,7 +797,7 @@ public:
PoolingNode(DEVICEID_TYPE deviceId, const wstring& name, PoolKind pool, const TensorShape& kernelShape, const TensorShape& strideShape,
const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout)
: Base(deviceId, name, kernelShape, TensorShape(1), strideShape, vector<bool>{true}, autoPadding, lowerPad, upperPad, pool, false, imageLayout, 0)
: Base(deviceId, name, kernelShape, TensorShape(1), strideShape, vector<bool>{true}, autoPadding, lowerPad, upperPad, pool, false, TensorShape(0), imageLayout, 0)
{
}
PoolingNode(const ScriptableObjects::IConfigRecordPtr configp)
@ -882,7 +919,7 @@ public:
MaxUnpoolingNode(DEVICEID_TYPE deviceId, const wstring& name, const TensorShape& kernelShape, const TensorShape& strideShape,
const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout)
: Base(deviceId, name, kernelShape, TensorShape(1), strideShape, vector<bool>{true}, autoPadding, lowerPad, upperPad, PoolKind::Max, true, imageLayout, 0)
: Base(deviceId, name, kernelShape, TensorShape(1), strideShape, vector<bool>{true}, autoPadding, lowerPad, upperPad, PoolKind::Max, true, TensorShape(0), imageLayout, 0)
{
}
MaxUnpoolingNode(const ScriptableObjects::IConfigRecordPtr configp)

Просмотреть файл

@ -6,6 +6,7 @@
#include "Basics.h"
#include "ComputationNode.h"
#include "Constants.h"
#include "Matrix.h"
#include "TensorView.h"
#include <unordered_set>
@ -1451,7 +1452,8 @@ void AggregateAccumulatorValuesAndUpdateEvaluation(
shared_ptr<ComputationNetwork> net,
set<shared_ptr<ComputationNodeBase>> evalNodesWhichAccumulateResult,
shared_ptr<DistGradHeader> gradHeader,
shared_ptr<MPIWrapper> mpi);
shared_ptr<MPIWrapper> mpi,
size_t packThresholdSizeInBytes = (size_t)DEFAULT_PACK_THRESHOLD_SIZE_IN_BYTES);
// -----------------------------------------------------------------------
// EpochAccumulatorNode calculates mean values of all samples used in forward pass.
@ -1502,7 +1504,8 @@ protected:
shared_ptr<ComputationNetwork> net,
set<shared_ptr<ComputationNodeBase>> evalNodesWhichAccumulateResult,
shared_ptr<DistGradHeader> gradHeader,
shared_ptr<MPIWrapper> mpi);
shared_ptr<MPIWrapper> mpi,
size_t packThresholdSize);
void Reset();

Просмотреть файл

@ -118,6 +118,7 @@ public:
// Negate (input)
// Sqrt (input)
// Reciprocal (input)
// ExponentialLinearUnitDerivative (input)
// These are all implemented by single-opcode functions and can thus be declared by a macro.
// -----------------------------------------------------------------------
@ -141,21 +142,22 @@ public:
} \
}
// Name Forward and Backward opcodes Gradient optype
DeclareUnaryElementWiseWithOpCodeNode(Abs, Abs, ElementwiseProductWithAbsDerivative, binaryWithInputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Cosine, Cosine, ElementwiseProductWithCosDerivative, binaryWithInputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Exp, Exp, ElementwiseProduct, binaryWithOutputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Floor, Floor, None, noGradient);
DeclareUnaryElementWiseWithOpCodeNode(Log, Log, ElementwiseProductWithLogDerivativeFromOutput, binaryWithOutputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Negate, Negate, Negate, unaryGradient);
DeclareUnaryElementWiseWithOpCodeNode(Pass, Copy, Copy, unaryGradient);
DeclareUnaryElementWiseWithOpCodeNode(LabelsToGraph, Copy, Copy, unaryGradient);
DeclareUnaryElementWiseWithOpCodeNode(Reciprocal, Reciprocal, ElementwiseProductWithReciprocalDerivative, binaryWithOutputGradient);
DeclareUnaryElementWiseWithOpCodeNode(RectifiedLinear, LinearRectifier, ElementwiseProductWithLinearRectifierDerivativeFromOutput, binaryWithOutputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Sigmoid, Sigmoid, ElementwiseProductWithSigmoidDerivativeFromOutput, binaryWithOutputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Sin, Sin, ElementwiseProductWithSinDerivative, binaryWithInputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Sqrt, Sqrt, ElementwiseProductWithSqrtDerivative, binaryWithOutputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Tanh, Tanh, ElementwiseProductWithTanhDerivativeFromOutput, binaryWithOutputGradient);
// Name Forward and Backward opcodes Gradient optype
DeclareUnaryElementWiseWithOpCodeNode(Abs, Abs, ElementwiseProductWithAbsDerivative, binaryWithInputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Cosine, Cosine, ElementwiseProductWithCosDerivative, binaryWithInputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Exp, Exp, ElementwiseProduct, binaryWithOutputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Floor, Floor, None, noGradient);
DeclareUnaryElementWiseWithOpCodeNode(Log, Log, ElementwiseProductWithLogDerivativeFromOutput, binaryWithOutputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Negate, Negate, Negate, unaryGradient);
DeclareUnaryElementWiseWithOpCodeNode(Pass, Copy, Copy, unaryGradient);
DeclareUnaryElementWiseWithOpCodeNode(LabelsToGraph, Copy, Copy, unaryGradient);
DeclareUnaryElementWiseWithOpCodeNode(Reciprocal, Reciprocal, ElementwiseProductWithReciprocalDerivative, binaryWithOutputGradient);
DeclareUnaryElementWiseWithOpCodeNode(RectifiedLinear, LinearRectifier, ElementwiseProductWithLinearRectifierDerivativeFromOutput, binaryWithOutputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Sigmoid, Sigmoid, ElementwiseProductWithSigmoidDerivativeFromOutput, binaryWithOutputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Sin, Sin, ElementwiseProductWithSinDerivative, binaryWithInputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Sqrt, Sqrt, ElementwiseProductWithSqrtDerivative, binaryWithOutputGradient);
DeclareUnaryElementWiseWithOpCodeNode(Tanh, Tanh, ElementwiseProductWithTanhDerivativeFromOutput, binaryWithOutputGradient);
DeclareUnaryElementWiseWithOpCodeNode(ExponentialLinearUnit, ExponentialLinearUnit, ElementwiseProductWithExponentialLinearUnitDerivativeFromOutput, binaryWithOutputGradient);
#pragma pop_macro("DeclareUnaryElementWiseWithOpCodeNode")

Просмотреть файл

@ -767,10 +767,10 @@ template class DummyCriterionNode<double>;
// ForwardBackwardNode (graph, prediction, delayConstraint)
// CTC training criterion, primarily based on the paper "Connectionist Temporal Classification: Labelling Unsegmented
// Sequence Data with Recurrent Neural Networks", ftp://ftp.idsia.ch/pub/juergen/icml2006.pdf
//
// blankTokenId (input): id of the blank token. If specified as SIZE_MAX, will be replaced with (numberOfLabels - 1)
// delayConstraint -- label output delay constraint introduced during training that allows to have shorter delay during inference.
// This using the original time information to enforce that CTC tokens only get aligned within a time margin.
// Setting this parameter smaller will result in shorted delay between label output during decoding, yet may hurt accuracy.
// Setting this parameter smaller will result in shorter delay between label output during decoding, yet may hurt accuracy.
// delayConstraint=-1 means no constraint
// -----------------------------------------------------------------------
@ -785,7 +785,7 @@ class ForwardBackwardNode : public ComputationNodeNonLooping<ElemType>, public
}
public:
DeclareConstructorFromConfigWithNumInputs(ForwardBackwardNode);
ForwardBackwardNode(DEVICEID_TYPE deviceId, const wstring & name, int blankTokenId=INT_MIN, int delayConstraint=-1) :
ForwardBackwardNode(DEVICEID_TYPE deviceId, const wstring & name, size_t blankTokenId=SIZE_MAX, int delayConstraint=-1) :
Base(deviceId, name), m_blankTokenId(blankTokenId), m_delayConstraint(delayConstraint)
{
}
@ -936,6 +936,9 @@ public:
m_maxValues->Resize(1, cols);
}
int DelayConstraint() { return m_delayConstraint; }
size_t BlankTokenId() { return m_blankTokenId; }
protected:
virtual bool NodeDoesItsOwnCustomizedMissingColumnsMasking() { return true; }
shared_ptr<Matrix<ElemType>> m_logSoftmaxOfRight;
@ -945,7 +948,7 @@ protected:
shared_ptr<Matrix<ElemType>> m_maxValues;
msra::lattices::GammaCalculation<ElemType> m_GammaCal;
int m_blankTokenId;
size_t m_blankTokenId;
int m_delayConstraint;
};

Просмотреть файл

@ -12,6 +12,9 @@
namespace Microsoft { namespace MSR { namespace CNTK {
template <typename ElemType>
class SelectUserDefinedV2FunctionOutputNode;
// -----------------------------------------------------------------------
// UserDefinedV2Function
// Proxy ComputationNode type for a V2 user-defined custom Function, instances
@ -25,6 +28,8 @@ class UserDefinedV2FunctionNode final : public ComputationNodeNonLooping<ElemTyp
{
typedef ComputationNodeNonLooping<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName() { return L"UserDefinedV2Function"; }
friend class SelectUserDefinedV2FunctionOutputNode<ElemType>;
public:
UserDefinedV2FunctionNode(DEVICEID_TYPE deviceId, const wstring& name, const ::CNTK::FunctionPtr& externalFunction = nullptr)
@ -32,10 +37,18 @@ public:
{
if (!m_externalFunction)
LogicError("UserDefinedV2FunctionNode ctor should never be called with externalFunction == nullptr");
m_numOutputs = m_externalFunction->Outputs().size();
m_values.resize(m_numOutputs);
m_gradients.resize(m_numOutputs);
m_MBLayouts.resize(m_numOutputs);
m_outputHasNewMBLayout.resize(m_numOutputs);
}
virtual void ForwardPropNonLooping() override
{
m_values[0] = m_value;
// Get the arguments of the external function
auto arguments = m_externalFunction->Arguments();
std::unordered_map<::CNTK::Variable, ::CNTK::ValuePtr> argumentValues;
@ -53,40 +66,50 @@ public:
}
assert(j == arguments.size());
auto outputs = m_externalFunction->Outputs();
// TODO: Instead of passing null for output values, we should have the forward call directly produce the outputs in the output Value() of this node
std::unordered_map<::CNTK::Variable, ::CNTK::ValuePtr> outputValue = { { m_externalFunction->Output(), nullptr } };
std::unordered_map<::CNTK::Variable, ::CNTK::ValuePtr> outputValues;
for (auto output : outputs)
outputValues.insert({output, nullptr});
std::unordered_set<::CNTK::Variable> outputsToRetainBackwardStateFor;
if (Environment().IsTraining())
outputsToRetainBackwardStateFor.insert(m_externalFunction->Output());
outputsToRetainBackwardStateFor.insert(outputs.begin(), outputs.end());
auto computeDevice = ::CNTK::AsDeviceDescriptor(InputRef(0).Value().GetDeviceId());
m_currentBackpropStatePtr = m_externalFunction->Forward(argumentValues, outputValue, computeDevice, outputsToRetainBackwardStateFor);
m_currentBackpropStatePtr = m_externalFunction->Forward(argumentValues, outputValues, computeDevice, outputsToRetainBackwardStateFor);
// Copy the computed output to Value() of this node
// TODO: We currently assume that the external Function does not generate a new MBLayout
auto outputMatrixAndLayout = ::CNTK::Utils::GetCNTKImplMatrixAndMBLayoutFromValueObject<ElemType>(outputValue.begin()->first, outputValue.begin()->second);
Value().AssignValuesOf(*outputMatrixAndLayout.first);
if ((GetMBLayout() != nullptr) && (outputMatrixAndLayout.second == nullptr))
LogicError("The UserDefinedFunction node has a non-null output MBLayout but none found from the (%S) user Function::Forward output Value", m_externalFunction->Name().c_str());
else if ((GetMBLayout() == nullptr) && (outputMatrixAndLayout.second != nullptr))
LogicError("The UserDefinedFunction node does not have an output MBLayout but the (%S) user Function::Forward output Value have a non-null layout", m_externalFunction->Name().c_str());
else if ((GetMBLayout() == nullptr) && (outputMatrixAndLayout.second == nullptr))
;
else
// Copy the computed output
for (size_t i = 0; i < outputs.size(); ++i)
{
if (m_hasNewOutputMBLayout)
GetMBLayout()->CopyFrom(outputMatrixAndLayout.second);
auto output = outputs[i];
auto outputMatrixAndLayout = ::CNTK::Utils::GetCNTKImplMatrixAndMBLayoutFromValueObject<ElemType>(output, outputValues[output]);
m_values[i]->SetValue(*outputMatrixAndLayout.first);
if ((m_MBLayouts[i] != nullptr) && (outputMatrixAndLayout.second == nullptr))
LogicError("The UserDefinedFunction node has a non-null output MBLayout but none found from the (%S) user Function::Forward output Value", m_externalFunction->Name().c_str());
else if ((m_MBLayouts[i] == nullptr) && (outputMatrixAndLayout.second != nullptr))
LogicError("The UserDefinedFunction node does not have an output MBLayout but the (%S) user Function::Forward output Value have a non-null layout", m_externalFunction->Name().c_str());
else if ((m_MBLayouts[i] == nullptr) && (outputMatrixAndLayout.second == nullptr))
;
else
{
if (*GetMBLayout() != *outputMatrixAndLayout.second)
LogicError("The MBLayout of the output computed by the external function (%S) does not match the expected MBLayout", m_externalFunction->Name().c_str());
if (m_outputHasNewMBLayout[i])
m_MBLayouts[i]->CopyFrom(outputMatrixAndLayout.second);
else
{
if (*m_MBLayouts[i] != *outputMatrixAndLayout.second)
LogicError("The MBLayout of the output computed by the external function (%S) does not match the expected MBLayout", m_externalFunction->Name().c_str());
}
}
}
}
virtual void BackpropToNonLooping(size_t inputIndex) override
{
m_gradients[0] = m_gradient;
std::vector<::CNTK::Variable> externalFunctionUniqueInputs;
auto externalFunctionInputs = m_externalFunction->Inputs();
for (auto input : externalFunctionInputs)
@ -97,10 +120,21 @@ public:
auto input = externalFunctionUniqueInputs[inputIndex];
auto gradientValue = ::CNTK::Utils::GetValueObjectFromCNTKImplMatrixAndMBLayout(m_externalFunction->Output(), Gradient(), GetMBLayout());
std::unordered_map<::CNTK::Variable, ::CNTK::ValuePtr> outputGradientValue = { { m_externalFunction->Output(), gradientValue } };
std::unordered_map<::CNTK::Variable, ::CNTK::ValuePtr> outputGradientValues;
auto outputs = m_externalFunction->Outputs();
for (size_t i = 0; i < outputs.size(); ++i)
{
auto output = outputs[i];
// TODO: We unpack the same output gradients each time this method is called for a different input.
// We should be able to cache the unpacked values during backpropagation of gradients to the first
// input, and reuse them for subsequence inputs.
auto gradientValue = ::CNTK::Utils::GetValueObjectFromCNTKImplMatrixAndMBLayout(output, *m_gradients[i], m_MBLayouts[i]);
outputGradientValues.insert({ output, gradientValue });
}
std::unordered_map<::CNTK::Variable, ::CNTK::ValuePtr> inputGradientValue = { { input, nullptr } };
m_externalFunction->Backward(m_currentBackpropStatePtr, outputGradientValue, inputGradientValue);
m_externalFunction->Backward(m_currentBackpropStatePtr, outputGradientValues, inputGradientValue);
// Accumulate the computed input gradient value into the existing input gradient value
// TODO: We should directly pass the actual input gradient tensor to the Backward method
@ -116,71 +150,160 @@ public:
{
Base::Validate(isFinalValidationPass);
// The external Function can only have a single output
auto numOutputs = m_externalFunction->Outputs().size();
if (numOutputs != 1)
InvalidArgument("Found user defined function (%S) with %lu outputs. User defined functions must have exactly one output", this->GetName().c_str(), (unsigned long)numOutputs);
auto output = m_externalFunction->Output();
if (output.GetDataType() != ::CNTK::AsDataType<ElemType>())
auto outputs = m_externalFunction->Outputs();
for (size_t i = 0; i < outputs.size(); ++i)
{
LogicError("The DataType (%s) of the external user defined Function's output does not match the internal ComputationNode's ElemType (%s)",
DataTypeName(output.GetDataType()),
DataTypeName(::CNTK::AsDataType<ElemType>()));
}
auto output = outputs[i];
auto outputNDShape = output.Shape();
if (outputNDShape.IsUnknown() || outputNDShape.HasInferredDimension())
LogicError("The output shape of an external user defined Function should be fully determined by the time CNTK engine validation executes");
auto outputDynamicAxes = output.DynamicAxes();
if (outputDynamicAxes.empty())
{
m_hasNewOutputMBLayout = true;
m_pMBLayout = nullptr;
}
else
{
auto argumentVariables = m_externalFunction->Arguments();
size_t j = 0;
auto numInputs = GetNumInputs();
for (size_t i = 0; i < numInputs; ++i)
if (output.GetDataType() != ::CNTK::AsDataType<ElemType>())
{
auto& input = InputRef(i);
if (input.template Is<LearnableParameter<ElemType>>())
continue;
auto argumentVar = argumentVariables[j];
if (argumentVar.DynamicAxes() == outputDynamicAxes)
{
m_pMBLayout = input.GetMBLayout();
break;
}
j++;
LogicError("The DataType (%s) of the external user defined Function's output does not match the internal ComputationNode's ElemType (%s)",
DataTypeName(output.GetDataType()),
DataTypeName(::CNTK::AsDataType<ElemType>()));
}
if (!m_pMBLayout)
auto outputNDShape = output.Shape();
if (outputNDShape.IsUnknown() || outputNDShape.HasInferredDimension())
LogicError("The output shape of an external user defined Function should be fully determined by the time CNTK engine validation executes");
auto outputDynamicAxes = output.DynamicAxes();
if (outputDynamicAxes.empty())
{
m_pMBLayout = make_shared<MBLayout>(); // this generates a new layout
m_pMBLayout->SetUniqueAxisName(InternalDynamicAxisNameFromDynamicAxes(output.DynamicAxes()));
m_hasNewOutputMBLayout = true;
m_outputHasNewMBLayout[i] = true;
m_MBLayouts[i] = nullptr;
}
else
m_hasNewOutputMBLayout = false;
{
auto argumentVariables = m_externalFunction->Arguments();
size_t j = 0;
auto numInputs = GetNumInputs();
for (size_t k = 0; k < numInputs; ++k)
{
auto& input = InputRef(k);
if (input.template Is<LearnableParameter<ElemType>>())
continue;
auto argumentVar = argumentVariables[j];
if (argumentVar.DynamicAxes() == outputDynamicAxes)
{
m_MBLayouts[i] = input.GetMBLayout();
break;
}
j++;
}
if (!m_MBLayouts[i])
{
m_MBLayouts[i] = make_shared<MBLayout>(); // this generates a new layout
m_MBLayouts[i]->SetUniqueAxisName(InternalDynamicAxisNameFromDynamicAxes(output.DynamicAxes()));
m_outputHasNewMBLayout[i] = true;
}
else
m_outputHasNewMBLayout[i] = false;
}
if (i == 0)
{
m_pMBLayout = m_MBLayouts[i];
SetDims(::CNTK::AsTensorShape(outputNDShape), HasMBLayout());
}
}
auto outputTensorShape = ::CNTK::AsTensorShape(outputNDShape);
SetDims(outputTensorShape, HasMBLayout());
}
void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override
{
Base::RequestMatricesBeforeForwardProp(matrixPool);
for (size_t i = 1 ; i < m_numOutputs; ++i)
RequestMatrixFromPool(m_values[i], matrixPool);
}
void RequestMatricesBeforeBackprop(MatrixPool& matrixPool) override
{
Base::RequestMatricesBeforeBackprop(matrixPool);
for (size_t i = 1; i < m_numOutputs; ++i)
RequestMatrixFromPool(m_gradients[i], matrixPool);
}
void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) override
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
for (size_t i = 1; i < m_numOutputs; ++i)
ReleaseMatrixToPool(m_values[i], matrixPool);
for (size_t i = 1; i < m_numOutputs; ++i)
ReleaseMatrixToPool(m_gradients[i], matrixPool);
}
private:
::CNTK::FunctionPtr m_externalFunction;
bool m_hasNewOutputMBLayout;
::CNTK::BackPropStatePtr m_currentBackpropStatePtr;
size_t m_numOutputs;
std::vector<std::shared_ptr<Matrix<ElemType>>> m_values;
std::vector<std::shared_ptr<Matrix<ElemType>>> m_gradients;
std::vector<std::shared_ptr<MBLayout>> m_MBLayouts;
std::vector<bool> m_outputHasNewMBLayout;
};
template class UserDefinedV2FunctionNode<float>;
template class UserDefinedV2FunctionNode<double>;
// -----------------------------------------------------------------------
// SelectUserDefinedV2FunctionOutputNode(userDefinedV2FunctionNode, outputIndex)
// ComputationNode for selecting one of the multiple outputs of UserDefinedV2FunctionNode
// This is needed since the CNTK computation engin natively does not support
// nodes with multiple outputs and hence, we need a separate node to multiplex
// the additional outputs.
// -----------------------------------------------------------------------
// TODO: We currently only support external nodes that cannot be part of CNTK recurrent loops
template <class ElemType>
class SelectUserDefinedV2FunctionOutputNode final : public ComputationNodeNonLooping<ElemType>, public NumInputs<1>
{
typedef ComputationNodeNonLooping<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName() { return L"SelectUserDefinedV2FunctionOutput"; }
public:
SelectUserDefinedV2FunctionOutputNode(DEVICEID_TYPE deviceId, const wstring& name, size_t outputIndex = 0)
: Base(deviceId, name), m_outputIndex(outputIndex)
{}
virtual void ForwardPropNonLooping() override
{
// TODO: We should avoid this copy but that requires carefully managing the
// lifetimes of the Value objects since to be able to directly use the
// input Value as its output, we have to make sure that the input's Value
// is not reused until all dependents of this node are finished.
auto inputNode = Input(0)->template As<UserDefinedV2FunctionNode<ElemType>>();
Value().AssignValuesOf(*inputNode->m_values[m_outputIndex]);
}
virtual void BackpropToNonLooping(size_t inputIndex) override
{
// TODO: We should avoid this copy but that requires carefully managing the
// lifetimes of the Gradient objects since to be able to directly use the
// Gradient as input's gradient, we have to make sure that the Gradient
// is not reused until all the inputs are finished backpropagating to their inputs.
auto inputNode = Input(0)->template As<UserDefinedV2FunctionNode<ElemType>>();
inputNode->m_gradients[m_outputIndex]->SetValue(Gradient());
}
virtual void Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
auto inputNode = Input(0)->template As<UserDefinedV2FunctionNode<ElemType>>();
m_pMBLayout = inputNode->m_MBLayouts[m_outputIndex];
auto outputNDShape = inputNode->m_externalFunction->Outputs()[m_outputIndex].Shape();
SetDims(::CNTK::AsTensorShape(outputNDShape), HasMBLayout());
}
private:
size_t m_outputIndex;
};
template class SelectUserDefinedV2FunctionOutputNode<float>;
template class SelectUserDefinedV2FunctionOutputNode<double>;
}}}

Просмотреть файл

@ -31,6 +31,7 @@ using namespace std;
using namespace System;
using namespace System::Collections::Generic;
using namespace System::Collections;
using namespace System::Runtime::InteropServices;
namespace Microsoft { namespace MSR { namespace CNTK { namespace Extensibility { namespace Managed {
@ -291,19 +292,18 @@ public:
throw gcnew ObjectDisposedException("Object has been disposed.");
}
// Hold all buffers that should be pinned during native operations
List<GCHandle>^ pinnedGCHandleList = gcnew List<GCHandle>;
try
{
Native::ValueRefs<ElemType> stdInputs;
Native::ValueRefs<ElemType> stdOutputs;
// Hold gc objects in the stack, while performing native actions
vector<gcroot<cli::array<ElemType>^>> pinBuffers;
vector<gcroot<cli::array<int>^>> pinIndices;
// Map the managed space into the native space, results will be written directly into the managed memory space
// https://msdn.microsoft.com/en-us/library/1dz8byfh.aspx
TransferVectorsToValueBuffers(inputs, stdInputs, pinBuffers, pinIndices, StorageType::Sparse);
TransferVectorsToValueBuffers(outputs, stdOutputs, pinBuffers, pinIndices, StorageType::Dense);
TransferVectorsToValueBuffers(inputs, stdInputs, pinnedGCHandleList, StorageType::Sparse);
TransferVectorsToValueBuffers(outputs, stdOutputs, pinnedGCHandleList, StorageType::Dense);
try
{
@ -324,6 +324,13 @@ public:
{
throw;
}
finally
{
for each (auto h in pinnedGCHandleList)
{
h.Free();
}
}
}
~ModelEvaluationExtended()
@ -431,37 +438,31 @@ private:
}
}
void PinBuffer(cli::array<ElemType>^ itemBuffer, vector<gcroot<cli::array<ElemType>^>>& pinBuffers, Native::ValueBuffer<ElemType, Native::VectorRef>* vb, StorageType storageType, int bufferSize)
void PinBuffer(cli::array<ElemType>^ itemBuffer, List<GCHandle>^ pinnedGCHandleList, Native::ValueBuffer<ElemType, Native::VectorRef>* vb, StorageType storageType, int bufferSize)
{
// gcroot object manages the pointer so that it always corresponds to the correct managed location (even after gc relocation)
gcroot<cli::array<ElemType>^> pBuf(itemBuffer);
pin_ptr<ElemType> pp = &(pBuf[0]);
pinBuffers.push_back(pBuf);
GCHandle h = GCHandle::Alloc(itemBuffer, GCHandleType::Pinned);
pinnedGCHandleList->Add(h);
ElemType* pp = reinterpret_cast<ElemType *>(h.AddrOfPinnedObject().ToPointer());
vb->m_buffer.InitFrom(pp, bufferSize, storageType == StorageType::Sparse ? bufferSize : 0);
pp = nullptr;
}
void PinIndices(cli::array<int>^ itemBuffer, vector<gcroot<cli::array<int>^>>& pinBuffers, Native::ValueBuffer<ElemType, Native::VectorRef>* vb, StorageType storageType, int bufferSize)
void PinIndices(cli::array<int>^ itemBuffer, List<GCHandle>^ pinnedGCHandleList, Native::ValueBuffer<ElemType, Native::VectorRef>* vb, StorageType storageType, int bufferSize)
{
// gcroot object manages the pointer so that it always corresponds to the correct managed location (even after gc relocation)
gcroot<cli::array<int>^> pBuf(itemBuffer);
pin_ptr<int> pp = &(pBuf[0]);
pinBuffers.push_back(pBuf);
GCHandle h = GCHandle::Alloc(itemBuffer, GCHandleType::Pinned);
pinnedGCHandleList->Add(h);
int* pp = reinterpret_cast<int *>(h.AddrOfPinnedObject().ToPointer());
vb->m_indices.InitFrom(pp, bufferSize, storageType == StorageType::Sparse ? bufferSize : 0);
pp = nullptr;
}
void PinColIndices(cli::array<int>^ itemBuffer, vector<gcroot<cli::array<int>^>>& pinBuffers, Native::ValueBuffer<ElemType, Native::VectorRef>* vb, StorageType storageType, int bufferSize)
void PinColIndices(cli::array<int>^ itemBuffer, List<GCHandle>^ pinnedGCHandleList, Native::ValueBuffer<ElemType, Native::VectorRef>* vb, StorageType storageType, int bufferSize)
{
// gcroot object manages the pointer so that it always corresponds to the correct managed location (even after gc relocation)
gcroot<cli::array<int>^> pBuf(itemBuffer);
pin_ptr<int> pp = &(pBuf[0]);
pinBuffers.push_back(pBuf);
GCHandle h = GCHandle::Alloc(itemBuffer, GCHandleType::Pinned);
pinnedGCHandleList->Add(h);
int* pp = reinterpret_cast<int *>(h.AddrOfPinnedObject().ToPointer());
vb->m_colIndices.InitFrom(pp, bufferSize, storageType == StorageType::Sparse ? bufferSize : 0);
pp = nullptr;
}
void TransferVectorsToValueBuffers(cli::array<ValueBuffer<ElemType>^>^ list, Native::ValueRefs<ElemType>& valueRefs, vector<gcroot<cli::array<ElemType>^>>& pinBuffers, vector<gcroot<cli::array<int>^>>& pinIndices, StorageType storageType)
void TransferVectorsToValueBuffers(cli::array<ValueBuffer<ElemType>^>^ list, Native::ValueRefs<ElemType>& valueRefs, List<GCHandle>^ pinnedGCHandleList, StorageType storageType)
{
for each (auto item in list)
{
@ -476,16 +477,16 @@ private:
throw gcnew CNTKRuntimeException("Invalid buffer (empty) for argument into ForwardPass", String::Empty);
}
PinBuffer(item->Buffer, pinBuffers, &vb, storageType, bufferSize);
PinBuffer(item->Buffer, pinnedGCHandleList, &vb, storageType, bufferSize);
if (item->Indices != nullptr)
{
PinIndices(item->Indices, pinIndices, &vb, storageType, bufferSize);
PinIndices(item->Indices, pinnedGCHandleList, &vb, storageType, bufferSize);
}
if (item->ColIndices != nullptr)
{
PinColIndices(item->ColIndices, pinIndices, &vb, storageType, numElements);
PinColIndices(item->ColIndices, pinnedGCHandleList, &vb, storageType, numElements);
}
valueRefs.push_back(vb);

Просмотреть файл

@ -5888,6 +5888,7 @@ void CPUMatrix<ElemType>::RCRFBackwardCompute(const CPUMatrix<ElemType>& alpha,
// t (input): time stamp to process
// maxPhoneNum (input): the max number of phones between utterances
// totalPhoneNum (input): the total number of phones of all utterances
// blankTokenId (input): id of the CTC blank token
// delayConstraint -- label output delay constraint introduced during training that allows to have shorter delay during inference.
// Alpha and Beta scores outside of the delay boundary are set to zero.
// Setting this parameter smaller will result in shorted delay between label output during decoding.
@ -5907,6 +5908,7 @@ void _assignAlphaScore(
const size_t t,
const size_t maxPhoneNum, // Maximum length of utterance in this MB
const size_t totalPhoneNum, // Total number of phones
const size_t blankTokenId,
const int delayConstraint)
{
for (size_t uttId = 0;uttId < uttNum;uttId++) {
@ -5958,7 +5960,7 @@ void _assignAlphaScore(
{
size_t labelid_2 = labelid - 2;
// if current label is not blank and not equal prev non-blank label
if ((size_t)(phoneSeq[labelid]) != totalPhoneNum - 1 && phoneId != (size_t)(phoneSeq[labelid_2]))
if ((size_t)(phoneSeq[labelid]) != blankTokenId && phoneId != (size_t)(phoneSeq[labelid_2]))
{
x = LogAdd(x, alphaScore[alphaId_2]);
}
@ -5980,13 +5982,13 @@ void _assignAlphaScore(
{
size_t labelid_r = labelid + 2;
size_t phoneBoundId_r = (size_t)(phoneBound[labelid_r]);
if (phoneId == totalPhoneNum - 1)
if (phoneId == blankTokenId)
{
// only constraint right side
if (t > phoneBoundId_r + delayConstraint - 1)
alphaScore[alphaId] = LZERO;
}
else if (phoneId != totalPhoneNum - 1)
else if (phoneId != blankTokenId)
{
if (t > phoneBoundId_r + delayConstraint)
alphaScore[alphaId] = LZERO;
@ -6016,6 +6018,7 @@ void _assignBetaScore(
const long t,
const size_t maxPhoneNum,
const size_t totalPhoneNum,
const size_t blankTokenId,
const int delayConstraint)
{
for (size_t uttId = 0;uttId < uttNum;uttId++) {
@ -6055,7 +6058,7 @@ void _assignBetaScore(
ElemType ascore;
if (phoneSeqId < phoneNum - 3)
{
if (phoneSeq[labelid] != totalPhoneNum - 1 && phoneId != phoneSeq[labelid_2])
if (phoneSeq[labelid] != blankTokenId && phoneId != phoneSeq[labelid_2])
{
x = LogAdd(x, betaScore[betaid_2]);
}
@ -6076,12 +6079,12 @@ void _assignBetaScore(
if (delayConstraint != -1)
{
size_t phoneBoundId_r = (size_t)(phoneBound[labelid_2]);
if (phoneId == totalPhoneNum - 1)
if (phoneId == blankTokenId)
{
if (t > phoneBoundId_r + delayConstraint - 1)
betaScore[betaid] = LZERO;
}
else if (phoneId != totalPhoneNum - 1)
else if (phoneId != blankTokenId)
{
if (t > phoneBoundId_r + delayConstraint)
betaScore[betaid] = LZERO;
@ -6171,7 +6174,7 @@ template<class ElemType>
CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignCTCScore(
const CPUMatrix<ElemType>& prob, CPUMatrix<ElemType>& alpha, CPUMatrix<ElemType>& beta,
const CPUMatrix<ElemType>& phoneSeq, const CPUMatrix<ElemType>& phoneBoundary, ElemType &totalScore, const std::vector<size_t>& uttToChanInd, const std::vector<size_t> & uttBeginFrame, const std::vector<size_t> & uttFrameNum,
const std::vector<size_t> & uttPhoneNum, const size_t numParallelSequences, const size_t maxFrameNum, const int delayConstraint, const bool isColWise)
const std::vector<size_t> & uttPhoneNum, const size_t numParallelSequences, const size_t maxFrameNum, const size_t blankTokenId, const int delayConstraint, const bool isColWise)
{
// Column wise representation of sequences in input matrices (each column is one sequence/utterance)
if (isColWise)
@ -6186,13 +6189,13 @@ CPUMatrix<ElemType>& CPUMatrix<ElemType>::AssignCTCScore(
for (size_t t = 0; t < maxFrameNum; t++)
{
_assignAlphaScore(prob.Data(), alpha.Data(), phoneSeq.Data(), phoneBoundary.Data(), uttToChanInd,
uttFrameNum, uttBeginFrame, uttPhoneNum, numParallelSequences, uttNum, t, maxPhoneNum, totalPhoneNum, delayConstraint);
uttFrameNum, uttBeginFrame, uttPhoneNum, numParallelSequences, uttNum, t, maxPhoneNum, totalPhoneNum, blankTokenId, delayConstraint);
}
for (LONG64 t = maxFrameNum - 1; t >= 0; t--)
{
_assignBetaScore(prob.Data(), beta.Data(), phoneSeq.Data(), phoneBoundary.Data(), uttToChanInd,
uttFrameNum, uttBeginFrame, uttPhoneNum, numParallelSequences, uttNum, t, maxPhoneNum, totalPhoneNum, delayConstraint);
uttFrameNum, uttBeginFrame, uttPhoneNum, numParallelSequences, uttNum, t, maxPhoneNum, totalPhoneNum, blankTokenId, delayConstraint);
}
std::vector<ElemType> scores(uttNum);

Просмотреть файл

@ -231,7 +231,7 @@ public:
// sequence training
CPUMatrix<ElemType>& DropFrame(const CPUMatrix<ElemType>& label, const CPUMatrix<ElemType>& gamma, const ElemType& threshhold);
CPUMatrix<ElemType>& AssignSequenceError(const ElemType hsmoothingWeight, const CPUMatrix<ElemType>& label, const CPUMatrix<ElemType>& dnnoutput, const CPUMatrix<ElemType>& gamma, ElemType alpha);
CPUMatrix<ElemType>& AssignCTCScore(const CPUMatrix<ElemType>& prob, CPUMatrix<ElemType>& alpha, CPUMatrix<ElemType>& beta, const CPUMatrix<ElemType>& phoneSeq, const CPUMatrix<ElemType>& phoneBoundary, ElemType &totalScore, const vector<size_t>& uttMap, const vector<size_t> & uttBeginFrame, const vector<size_t> & uttFrameNum, const vector<size_t> & uttPhoneNum, const size_t samplesInRecurrentStep, const size_t maxFrameNum, const int delayConstraint, const bool isColWise);
CPUMatrix<ElemType>& AssignCTCScore(const CPUMatrix<ElemType>& prob, CPUMatrix<ElemType>& alpha, CPUMatrix<ElemType>& beta, const CPUMatrix<ElemType>& phoneSeq, const CPUMatrix<ElemType>& phoneBoundary, ElemType &totalScore, const vector<size_t>& uttMap, const vector<size_t> & uttBeginFrame, const vector<size_t> & uttFrameNum, const vector<size_t> & uttPhoneNum, const size_t samplesInRecurrentStep, const size_t maxFrameNum, const size_t blankTokenId, const int delayConstraint, const bool isColWise);
CPUMatrix<ElemType>& InplaceSqrt();
CPUMatrix<ElemType>& AssignSqrtOf(const CPUMatrix<ElemType>& a);

Просмотреть файл

@ -85,9 +85,9 @@ enum ElementWiseOperator
// unary (or binary with constant parameter)
opCopy,
opNegate, opNot, opAbs, opFloor, opReciprocal,
opSigmoid, opTanh, opSqr, opSqrt, opExp, opLog, opLinearRectifier, opCosine, opSin,
opSigmoid, opTanh, opSqr, opSqrt, opExp, opLog, opLinearRectifier, opCosine, opSin, opExponentialLinearUnit,
// unary ops for use by Matrix class only (there is no TensorView implementation)
opSigmoidDerivative, opLinearRectifierDerivative, opNegativeSine,
opSigmoidDerivative, opLinearRectifierDerivative, opNegativeSine, opExponentialLinearUnitDerivative,
// binary
opCopyIf, opCopyIfNot, opSum, opDifference, opElementwiseProduct, opElementwiseQuotient, opLogSum,
opMax, opMin, opArgmax, opArgmin,
@ -98,6 +98,7 @@ enum ElementWiseOperator
opElementwiseProductWithCosDerivative, opElementwiseProductWithSinDerivative,
opElementwiseProductWithAbsDerivative, opElementwiseProductWithSqrtDerivative,
opElementwiseProductWithReciprocalDerivative, opSqrOfDifference,
opElementwiseProductWithExponentialLinearUnitDerivativeFromOutput,
// binary ops for indexing
// opIndex,
// ternary
@ -114,53 +115,55 @@ enum ElementWiseOperator
#define ForAllNullaryOps(Macro) \
Macro(ConstOne);
#define ForAllUnaryOps(Macro) \
Macro(Copy); \
Macro(Negate); \
Macro(Not); \
Macro(Abs); \
Macro(Floor); \
Macro(Reciprocal); \
Macro(Sigmoid); \
Macro(Tanh); \
Macro(Sqr); \
Macro(Sqrt); \
Macro(Exp); \
Macro(Log); \
Macro(LinearRectifier); \
Macro(Cosine); \
Macro(Sin);
#define ForAllUnaryOps(Macro) \
Macro(Copy); \
Macro(Negate); \
Macro(Not); \
Macro(Abs); \
Macro(Floor); \
Macro(Reciprocal); \
Macro(Sigmoid); \
Macro(Tanh); \
Macro(Sqr); \
Macro(Sqrt); \
Macro(Exp); \
Macro(Log); \
Macro(LinearRectifier); \
Macro(Cosine); \
Macro(Sin); \
Macro(ExponentialLinearUnit);
#define ForAllBinaryOps(Macro) \
Macro(CopyIf); \
Macro(CopyIfNot); \
Macro(Sum); \
Macro(Difference); \
Macro(ElementwiseProduct); \
Macro(ElementwiseQuotient); \
Macro(LogSum); \
Macro(Max); \
Macro(Min); \
Macro(Equal); \
Macro(NotEqual); \
Macro(Greater); \
Macro(Less); \
Macro(GreaterEqual); \
Macro(LessEqual); \
Macro(And); \
Macro(Or); \
Macro(Xor); \
Macro(MaskNegative); \
Macro(ElementwiseProductWithSigmoidDerivativeFromOutput); \
Macro(ElementwiseProductWithTanhDerivativeFromOutput); \
Macro(ElementwiseProductWithLinearRectifierDerivativeFromOutput); \
Macro(ElementwiseProductWithLogDerivativeFromOutput); \
Macro(ElementwiseProductWithCosDerivative); \
Macro(ElementwiseProductWithSinDerivative); \
Macro(ElementwiseProductWithAbsDerivative); \
Macro(ElementwiseProductWithReciprocalDerivative); \
Macro(ElementwiseProductWithSqrtDerivative); \
Macro(SqrOfDifference); \
#define ForAllBinaryOps(Macro) \
Macro(CopyIf); \
Macro(CopyIfNot); \
Macro(Sum); \
Macro(Difference); \
Macro(ElementwiseProduct); \
Macro(ElementwiseQuotient); \
Macro(LogSum); \
Macro(Max); \
Macro(Min); \
Macro(Equal); \
Macro(NotEqual); \
Macro(Greater); \
Macro(Less); \
Macro(GreaterEqual); \
Macro(LessEqual); \
Macro(And); \
Macro(Or); \
Macro(Xor); \
Macro(MaskNegative); \
Macro(ElementwiseProductWithSigmoidDerivativeFromOutput); \
Macro(ElementwiseProductWithTanhDerivativeFromOutput); \
Macro(ElementwiseProductWithLinearRectifierDerivativeFromOutput); \
Macro(ElementwiseProductWithLogDerivativeFromOutput); \
Macro(ElementwiseProductWithCosDerivative); \
Macro(ElementwiseProductWithSinDerivative); \
Macro(ElementwiseProductWithAbsDerivative); \
Macro(ElementwiseProductWithReciprocalDerivative); \
Macro(ElementwiseProductWithSqrtDerivative); \
Macro(SqrOfDifference); \
Macro(ElementwiseProductWithExponentialLinearUnitDerivativeFromOutput);
//Macro(Index);
#define ForAllTernaryOps(Macro) \

Просмотреть файл

@ -4299,7 +4299,10 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignCTCScore(const GPUMatrix<ElemTyp
const std::vector<size_t> & uttFrameNum,
const std::vector<size_t> & uttPhoneNum,
const size_t numParallelSequences,
const size_t maxFrameNum, const int delayConstraint, const bool isColWise)
const size_t maxFrameNum,
const size_t blankTokenId,
const int delayConstraint,
const bool isColWise)
{
if (isColWise)
{
@ -4340,13 +4343,13 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignCTCScore(const GPUMatrix<ElemTyp
for (long t = 0; t < maxFrameNum; t++)
{
_assignAlphaScore << <block_tail, thread_tail, 0, t_stream >> >(prob.Data(), alpha.Data(), phoneSeq.Data(), phoneBoundary.Data(), gpuUttToChanInd,
gpuFrameNum, gpuBeginFrame, gpuPhoneNum, numParallelSequences, uttNum, t, maxPhoneNum, totalPhoneNum, delayConstraint);
gpuFrameNum, gpuBeginFrame, gpuPhoneNum, numParallelSequences, uttNum, t, maxPhoneNum, totalPhoneNum, blankTokenId, delayConstraint);
}
for (long t = maxFrameNum - 1; t >= 0; t--)
{
_assignBetaScore << <block_tail, thread_tail, 0, t_stream >> >(prob.Data(), beta.Data(), phoneSeq.Data(), phoneBoundary.Data(), gpuUttToChanInd,
gpuFrameNum, gpuBeginFrame, gpuPhoneNum, numParallelSequences, uttNum, t, maxPhoneNum, totalPhoneNum, delayConstraint);
gpuFrameNum, gpuBeginFrame, gpuPhoneNum, numParallelSequences, uttNum, t, maxPhoneNum, totalPhoneNum, blankTokenId, delayConstraint);
}
_assignTotalScore << <uttNum, 1, 0, t_stream >> > (beta.Data(), gpuScores, uttNum, gpuUttToChanInd, gpuBeginFrame, numParallelSequences, maxPhoneNum);

Просмотреть файл

@ -351,7 +351,7 @@ public:
GPUMatrix<ElemType>& AssignCTCScore(const GPUMatrix<ElemType>& prob, GPUMatrix<ElemType>& alpha, GPUMatrix<ElemType>& beta,
const GPUMatrix<ElemType> phoneSeq, const GPUMatrix<ElemType> phoneBoundary, ElemType &totalScore, const vector<size_t>& uttMap, const vector<size_t> & uttBeginFrame, const vector<size_t> & uttFrameNum,
const vector<size_t> & uttPhoneNum, const size_t samplesInRecurrentStep, const size_t maxFrameNum, const int delayConstraint, const bool isColWise);
const vector<size_t> & uttPhoneNum, const size_t samplesInRecurrentStep, const size_t maxFrameNum, const size_t blankTokenId, const int delayConstraint, const bool isColWise);
GPUMatrix<ElemType>& InplaceSqrt();
GPUMatrix<ElemType>& AssignSqrtOf(const GPUMatrix<ElemType>& a);

Просмотреть файл

@ -5208,6 +5208,7 @@ __global__ void _adam4BlockSparseCol(CUDA_LONG size,
// t (input): time stamp to process
// maxPhoneNum (input): the max number of phones between utterances
// totalPhoneNum (input): the total number of phones of all utterances
// blankTokenId (input): id of the CTC blank token
// delayConstraint -- label output delay constraint introduced during training that allows to have shorter delay during inference.
// Alpha and Beta scores outside of the delay boundary are set to zero.
// Setting this parameter smaller will result in shorted delay between label output during decoding.
@ -5227,6 +5228,7 @@ __global__ void _assignAlphaScore(
const size_t t,
const size_t maxPhoneNum, // Maximum length of utterance in this MB
const size_t totalPhoneNum, // Total number of phones
const size_t blankTokenId,
const int delayConstraint)
{
LONG64 uttId = blockDim.x * blockIdx.x + threadIdx.x;
@ -5277,7 +5279,7 @@ __global__ void _assignAlphaScore(
if (phoneSeqId > 2)
{
// if current label is not blank and not equal prev non-blank label
if ((LONG64)(phoneSeq[labelid]) != totalPhoneNum - 1 && phoneId != (LONG64)(phoneSeq[labelid_2]))
if ((LONG64)(phoneSeq[labelid]) != blankTokenId && phoneId != (LONG64)(phoneSeq[labelid_2]))
{
x = logaddk(x, alphaScore[alphaId_2]);
}
@ -5299,13 +5301,13 @@ __global__ void _assignAlphaScore(
{
LONG64 labelid_r = labelid + 2;
LONG64 phoneBoundId_r = (LONG64)(phoneBound[labelid_r]);
if (phoneId == totalPhoneNum - 1)
if (phoneId == blankTokenId)
{
// only constraint right side
if (t > phoneBoundId_r + delayConstraint - 1)
alphaScore[alphaId] = LZERO;
}
else if (phoneId != totalPhoneNum - 1)
else if (phoneId != blankTokenId)
{
if (t > phoneBoundId_r + delayConstraint)
alphaScore[alphaId] = LZERO;
@ -5332,6 +5334,7 @@ __global__ void _assignBetaScore(
const size_t t,
const size_t maxPhoneNum,
const size_t totalPhoneNum,
const size_t blankTokenId,
const int delayConstraint)
{
LONG64 uttId = blockDim.x * blockIdx.x + threadIdx.x;
@ -5368,7 +5371,7 @@ __global__ void _assignBetaScore(
ElemType ascore;
if (phoneSeqId < phoneNum - 3)
{
if (phoneSeq[labelid] != totalPhoneNum - 1 && phoneId != phoneSeq[labelid_2])
if (phoneSeq[labelid] != blankTokenId && phoneId != phoneSeq[labelid_2])
{
x = logaddk(x, betaScore[betaid_2]);
}
@ -5389,12 +5392,12 @@ __global__ void _assignBetaScore(
if (delayConstraint != -1)
{
LONG64 phoneBoundId_r = (LONG64)(phoneBound[labelid_2]);
if (phoneId == totalPhoneNum - 1)
if (phoneId == blankTokenId)
{
if (t > phoneBoundId_r + delayConstraint - 1)
betaScore[betaid] = LZERO;
}
else if (phoneId != totalPhoneNum - 1)
else if (phoneId != blankTokenId)
{
if (t > phoneBoundId_r + delayConstraint)
betaScore[betaid] = LZERO;

Просмотреть файл

@ -5732,6 +5732,7 @@ Matrix<ElemType>& Matrix<ElemType>::AssignSequenceError(const ElemType hsmoothin
// uttPhoneNum (input): the phone number of each utterance. The size of this vector = the number of all utterances in this minibatch
// numParallelSequences (input): num of parallel sequences
// mbsize (input): the maximum channel frame number
// blankTokenId (input): id of the CTC blank token
// delayConstraint -- label output delay constraint introduced during training that allows to have shorter delay during inference. This using the original time information to enforce that CTC tokens only get aligned within a time margin.
// Setting this parameter smaller will result in shorted delay between label output during decoding, yet may hurt accuracy.
// delayConstraint=-1 means no constraint
@ -5739,7 +5740,7 @@ template<class ElemType>
Matrix<ElemType>& Matrix<ElemType>::AssignCTCScore(const Matrix<ElemType>& prob, Matrix<ElemType>& alpha, Matrix<ElemType>& beta,
const Matrix<ElemType>& phoneSeq, const Matrix<ElemType>& phoneBound, ElemType &totalScore, const std::vector<size_t> & uttToChanInd,
const std::vector<size_t> & uttBeginFrame, const std::vector<size_t> & uttFrameNum, const std::vector<size_t> & uttPhoneNum,
const size_t numParallelSequences, const size_t mbsize, const int delayConstraint, const bool isColWise)
const size_t numParallelSequences, const size_t mbsize, const size_t blankTokenId, const int delayConstraint, const bool isColWise)
{
DecideAndMoveToRightDevice(prob, *this);
alpha.Resize(phoneSeq.GetNumRows(), prob.GetNumCols());
@ -5754,9 +5755,9 @@ Matrix<ElemType>& Matrix<ElemType>::AssignCTCScore(const Matrix<ElemType>& prob,
DISPATCH_MATRIX_ON_FLAG(&prob,
this,
this->m_CPUMatrix->AssignCTCScore(*prob.m_CPUMatrix, *alpha.m_CPUMatrix, *beta.m_CPUMatrix, *phoneSeq.m_CPUMatrix, *phoneBound.m_CPUMatrix, totalScore,
uttToChanInd, uttBeginFrame, uttFrameNum, uttPhoneNum, numParallelSequences, mbsize, delayConstraint, isColWise),
uttToChanInd, uttBeginFrame, uttFrameNum, uttPhoneNum, numParallelSequences, mbsize, blankTokenId, delayConstraint, isColWise),
this->m_GPUMatrix->AssignCTCScore(*prob.m_GPUMatrix, *alpha.m_GPUMatrix, *beta.m_GPUMatrix, *phoneSeq.m_GPUMatrix, *phoneBound.m_GPUMatrix, totalScore,
uttToChanInd, uttBeginFrame, uttFrameNum, uttPhoneNum, numParallelSequences, mbsize, delayConstraint, isColWise),
uttToChanInd, uttBeginFrame, uttFrameNum, uttPhoneNum, numParallelSequences, mbsize, blankTokenId, delayConstraint, isColWise),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED
);

Просмотреть файл

@ -380,7 +380,7 @@ public:
Matrix<ElemType>& AssignCTCScore(const Matrix<ElemType>& prob, Matrix<ElemType>& alpha, Matrix<ElemType>& beta, const Matrix<ElemType>& phoneSeq, const Matrix<ElemType>& phoneBound, ElemType &totalScore,
const vector<size_t> & extraUttMap, const vector<size_t> & uttBeginFrame, const vector<size_t> & uttFrameNum, const vector<size_t> & uttPhoneNum, const size_t samplesInRecurrentStep,
const size_t mbSize, const int delayConstraint, const bool isColWise);
const size_t mbSize, const size_t blankTokenId, const int delayConstraint, const bool isColWise);
Matrix<ElemType>& InplaceSqrt();
Matrix<ElemType>& AssignSqrtOf(const Matrix<ElemType>& a);

Просмотреть файл

@ -93,6 +93,23 @@ void NcclComm::AllReduceImpl(void* buffer, size_t count, DataType dtype)
RuntimeError("NcclComm ncclAllReduce failed: %s", ncclGetErrorString(res));
}
void NcclComm::BroadcastImpl(void* buffer, size_t count, MPI_Datatype dtype, int root)
{
ncclResult_t res;
if (dtype == MPI_CHAR)
{
res = ncclBcast(buffer, count, ncclChar, root, m_ncclComm, m_stream);
}
else
{
RuntimeError("NcclComm Broadcast supports Char type only");
}
if (res != ncclSuccess)
{
RuntimeError("NcclComm ncclBcast failed: %s", ncclGetErrorString(res));
}
}
void NcclComm::Sync()
{
cudaStreamSynchronize(m_stream) || "NcclComm: cudaStreamSynchronize failed";

Просмотреть файл

@ -23,6 +23,7 @@ class NcclComm
private:
enum class DataType : int {FLOAT, DOUBLE};
void AllReduceImpl(void* buffer, size_t count, DataType dtype);
void BroadcastImpl(void* buffer, size_t count, MPI_Datatype dtype, int root);
cudaStream_t m_stream;
ncclComm_t m_ncclComm;
#endif
@ -53,6 +54,20 @@ public:
RuntimeError("NcclComm: CNTK was built without NCCL support.");
#endif
}
#pragma warning( push )
#pragma warning ( disable : 4100 ) // Disable warning 4100 in Broadcast function
void Broadcast(void* buffer, size_t count, MPI_Datatype dtype, int root)
{
#ifdef USE_NCCL
BroadcastImpl(buffer, count, dtype, root);
#else
RuntimeError("NcclComm: CNTK was built without NCCL support.");
#endif
}
};
#pragma warning( pop )
}}}

Просмотреть файл

@ -1395,7 +1395,7 @@ GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignSequenceError(const ElemType hsm
template <class ElemType>
GPUMatrix<ElemType>& GPUMatrix<ElemType>::AssignCTCScore(const GPUMatrix<ElemType>& prob, GPUMatrix<ElemType>& alpha, GPUMatrix<ElemType>& beta,
const GPUMatrix<ElemType> phoneSeq, const GPUMatrix<ElemType> phoneBound, ElemType &totalScore, const std::vector<size_t>& uttMap, const std::vector<size_t> & uttBeginFrame, const std::vector<size_t> & uttFrameNum,
const std::vector<size_t> & uttPhoneNum, const size_t samplesInRecurrentStep, const size_t maxFrameNum, const int delayConstraint, const bool isColWise)
const std::vector<size_t> & uttPhoneNum, const size_t samplesInRecurrentStep, const size_t maxFrameNum, const size_t blankTokenId, const int delayConstraint, const bool isColWise)
{
return *this;
}

Просмотреть файл

@ -48,6 +48,7 @@ OverloadUnaryMathFns(fabs);
OverloadUnaryMathFns(cos);
OverloadUnaryMathFns(sin);
OverloadUnaryMathFns(floor);
OverloadUnaryMathFns(log1p);
#pragma pop_macro("OverloadUnaryMathFns")
@ -97,6 +98,12 @@ DECL ElemType LinearRectifierDerivative(ElemType z)
return z > 0 ? (ElemType) 1 : 0;
}
template <class ElemType>
DECL ElemType ExponentialLinearUnitDerivative(ElemType z)
{
return z >= 0 ? (ElemType)1 : exp_(z);
}
template <class ElemType>
DECL ElemType Sgn(ElemType z)
{
@ -141,21 +148,9 @@ template <typename ElemType>
DECL ElemType LogAdd(ElemType x, ElemType y)
{
if (x < y)
{
ElemType temp = x;
x = y;
y = temp;
}
ElemType diff = y - x;
if (diff < (ElemType) MINLOGEXP)
{
return (x < (ElemType) LSMALL) ? (ElemType) LZERO : x;
}
else
{
ElemType z = exp_(diff);
return x + log_((ElemType) 1.0 + z);
}
std::swap(x, y);
return x + log1p_(exp_(y - x));
}
// IndexElement reindexes a tensor along one dimension.
@ -206,6 +201,7 @@ DefUnaryOp(LinearRectifier, a > 0 ? a : 0);
DefUnaryOp(Cosine, cos_(a));
DefUnaryOp(Sin, sin_(a));
DefUnaryOp(Reciprocal, a == 0 ? 0 : 1 / a);
DefUnaryOp(ExponentialLinearUnit, a >= 0 ? a : (exp_(a)-1));
#pragma pop_macro("DefUnaryOp")
#pragma push_macro("DefBinaryOp")
@ -245,6 +241,7 @@ DefBinaryOp(ElementwiseProductWithAbsDerivative, a * Sgn(b)); // note: b = input
DefBinaryOp(ElementwiseProductWithReciprocalDerivative, a * -Sqr(b)); // b = output
DefBinaryOp(ElementwiseProductWithSqrtDerivative, a / (2 * b)); // b = output; d/dx sqrt(x) = 1/(2 * sqrt(x)) --> note this is the same as ElementwiseQuotient w a constant; if more show up like this we should add more template params
DefBinaryOp(SqrOfDifference, Sqr(a - b));
DefBinaryOp(ElementwiseProductWithExponentialLinearUnitDerivativeFromOutput, b >= 0 ? a : a*(1+b)); // b = output;
//DefBinaryOp(Index, IndexElement(a, b, i)); // note: this one uses the third argument
#pragma pop_macro("DefBinaryOp")

Просмотреть файл

@ -10,6 +10,8 @@
#include "BinaryConfigHelper.h"
#include "DataReader.h"
#include "StringUtil.h"
#include "ReaderConstants.h"
#include "ReaderUtil.h"
using std::string;
using std::wstring;
@ -49,23 +51,16 @@ namespace Microsoft { namespace MSR { namespace CNTK {
m_filepath = msra::strfun::utf16(config(L"file"));
m_keepDataInMemory = config(L"keepDataInMemory", false);
// EvalActions inserts randomize = "none" into the reader config in DoWriteOutoput. We would like this to be true/false,
// but we can't for this reason. So we will assume false unless we specifically get "true"
m_randomize = false;
wstring randomizeString = config(L"randomize", L"false");
if (!_wcsicmp(randomizeString.c_str(), L"true")) // TODO: don't support case-insensitive option strings in the new reader
m_randomize = true;
if (m_randomize)
m_randomizationWindow = GetRandomizationWindowFromConfig(config);
m_sampleBasedRandomizationWindow = config(L"sampleBasedRandomizationWindow", false);
if (!m_sampleBasedRandomizationWindow && m_randomizationWindow == randomizeAuto)
{
if (config.Exists(L"randomizationWindow"))
m_randomizationWindow = config(L"randomizationWindow");
else
m_randomizationWindow = randomizeAuto;
// The size of the chunk for the binary reader is specified in terms of the number of sequences
// per chunk and is fixed at the time when the data is serialized into the binary format.
// As a result, the on-disk size of a chunk can be arbitrary, and 32MB number used here is
// merely a heuristic.
m_randomizationWindow = g_4GB / g_32MB; // 128 chunks.
}
else
m_randomizationWindow = randomizeNone;
m_traceLevel = config(L"traceLevel", 1);
}

Просмотреть файл

@ -26,10 +26,10 @@ public:
// Get full path to the input file.
const wstring& GetFilePath() const { return m_filepath; }
size_t GetRandomize() const { return m_randomize; }
size_t GetRandomizationWindow() const { return m_randomizationWindow; }
bool UseSampleBasedRandomizationWindow() const { return m_sampleBasedRandomizationWindow; }
unsigned int GetTraceLevel() const { return m_traceLevel; }
bool ShouldKeepDataInMemory() const { return m_keepDataInMemory; }
@ -40,7 +40,9 @@ private:
std::wstring m_filepath;
std::map<std::wstring, std::wstring> m_streams;
size_t m_randomizationWindow;
bool m_randomize;
// Specifies how to interpret randomization window, if true randomization window == number of samples, else
// randomization window = number of chunks (default).
bool m_sampleBasedRandomizationWindow;
unsigned int m_traceLevel;
bool m_keepDataInMemory; // if true the whole dataset is kept in memory
};

Просмотреть файл

@ -35,9 +35,9 @@ CNTKBinaryReader::CNTKBinaryReader(const ConfigParameters& config)
log += " | keeping data in memory";
}
if (configHelper.GetRandomize())
size_t window = configHelper.GetRandomizationWindow();
if (window > 0)
{
size_t window = configHelper.GetRandomizationWindow();
// Verbosity is a general config parameter, not specific to the binary format reader.
log += " | randomizing with window: " + (int)window;
int verbosity = config(L"verbosity", 0);
@ -46,8 +46,9 @@ CNTKBinaryReader::CNTKBinaryReader(const ConfigParameters& config)
window, /* randomizationRangeInSamples */
m_deserializer, /* deserializer */
true, /* shouldPrefetch */
false /* multithreadedGetNextSequences */
);
false, /* multithreadedGetNextSequences */
0, /*maxNumberOfInvalidSequences */
configHelper.UseSampleBasedRandomizationWindow() /*sampleBasedRandomizationWindow */);
}
else
{

Просмотреть файл

@ -11,6 +11,7 @@
#include "DataReader.h"
#include "StringUtil.h"
#include "ReaderConstants.h"
#include "ReaderUtil.h"
using std::string;
using std::wstring;
@ -117,35 +118,6 @@ TextConfigHelper::TextConfigHelper(const ConfigParameters& config)
}
m_filepath = msra::strfun::utf16(config(L"file"));
wstring randomizeString = config(L"randomize", wstring());
if (!_wcsicmp(randomizeString.c_str(), L"none")) // TODO: don't support case-insensitive option strings in the new reader
{
// "none" is only accepted to be backwards-compatible (DoWriteOutput() in EvalActions.cpp
// inserts this magic constant into the reader config to prevent it from shuffling the input).
// In user-defined configurations, 'randomize' should be a boolean.
m_randomizationWindow = randomizeNone;
}
else
{
bool randomize = config(L"randomize", true);
if (!randomize)
{
m_randomizationWindow = randomizeNone;
}
else if (config.Exists(L"randomizationWindow"))
{
m_randomizationWindow = config(L"randomizationWindow");
}
else
{
m_randomizationWindow = randomizeAuto;
}
}
m_sampleBasedRandomizationWindow = config(L"sampleBasedRandomizationWindow", false);
m_skipSequenceIds = config(L"skipSequenceIds", false);
m_maxErrors = config(L"maxErrors", 0);
m_traceLevel = config(L"traceLevel", 1);
@ -153,6 +125,8 @@ TextConfigHelper::TextConfigHelper(const ConfigParameters& config)
m_keepDataInMemory = config(L"keepDataInMemory", false);
m_frameMode = config(L"frameMode", false);
m_randomizationWindow = GetRandomizationWindowFromConfig(config);
m_sampleBasedRandomizationWindow = config(L"sampleBasedRandomizationWindow", false);
if (!m_sampleBasedRandomizationWindow && m_randomizationWindow == randomizeAuto)
{
m_randomizationWindow = g_4GB / m_chunkSizeBytes; // ~ 4 GB (on disk) worth of chunks

Просмотреть файл

@ -87,6 +87,7 @@
<ClCompile Include="FramePacker.cpp" />
<ClCompile Include="ReaderBase.cpp" />
<ClCompile Include="ReaderShim.cpp" />
<ClCompile Include="ReaderUtil.cpp" />
<ClCompile Include="SequencePacker.cpp" />
<ClCompile Include="SequenceRandomizer.cpp" />
<ClCompile Include="TruncatedBpttPacker.cpp" />

Просмотреть файл

@ -135,6 +135,9 @@
<ClCompile Include="Indexer.cpp">
<Filter>Utils</Filter>
</ClCompile>
<ClCompile Include="ReaderUtil.cpp">
<Filter>Utils</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<Filter Include="Interfaces">

Просмотреть файл

@ -0,0 +1,37 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "Config.h"
#include "DataReader.h"
namespace Microsoft { namespace MSR { namespace CNTK {
size_t GetRandomizationWindowFromConfig(const ConfigParameters& config)
{
wstring randomizeString = config(L"randomize", wstring());
if (!_wcsicmp(randomizeString.c_str(), L"none")) // TODO: don't support case-insensitive option strings in the new reader
{
// "none" is only accepted to be backwards-compatible (DoWriteOutput() in EvalActions.cpp
// inserts this magic constant into the reader config to prevent it from shuffling the input).
// In user-defined configurations, 'randomize' should be a boolean.
return randomizeNone;
}
bool randomize = config(L"randomize", true);
if (!randomize)
{
return randomizeNone;
}
if (config.Exists(L"randomizationWindow"))
{
return config(L"randomizationWindow");
}
return randomizeAuto;
}
}}}

Просмотреть файл

@ -10,6 +10,10 @@
namespace Microsoft { namespace MSR { namespace CNTK {
class ConfigParameters;
size_t GetRandomizationWindowFromConfig(const ConfigParameters& config);
// Returns the size of the type.
inline size_t GetSizeByType(ElementType type)
{

Просмотреть файл

@ -26,7 +26,8 @@ void AggregateAccumulatorValuesAndUpdateEvaluation(
std::shared_ptr<ComputationNetwork> net,
std::set<std::shared_ptr<ComputationNodeBase>> evalNodesWhichAccumulateResult,
std::shared_ptr<DistGradHeader> gradHeader,
std::shared_ptr<MPIWrapper> mpi)
std::shared_ptr<MPIWrapper> mpi,
size_t packThresholdSizeInBytes)
{
// Accumulator stores mean value and number of samples. Aggregation performs simple summation of values,
// so we transfer sum instead of mean, and calculate mean after aggregation is finished.
@ -58,7 +59,8 @@ void AggregateAccumulatorValuesAndUpdateEvaluation(
mpi,
false /*useAsyncAggregation*/,
net->GetDeviceId(),
0 /*syncStatsTrace*/);
0 /*syncStatsTrace*/,
packThresholdSizeInBytes);
// Prepare header.
const size_t c_evalNodes = 1;
@ -127,10 +129,11 @@ void AggregateAccumulatorValuesAndUpdateEpochEvaluation(
std::vector<EpochCriterion>& epochEvalErrors,
const std::vector<ComputationNodeBasePtr>& evaluationNodes,
CriterionAccumulator<ElemType> localEpochEvalErrors,
std::function<bool(ComputationNodeBasePtr)> containsAccumulatedResult)
std::function<bool(ComputationNodeBasePtr)> containsAccumulatedResult,
size_t packThresholdSizeInBytes = DEFAULT_PACK_THRESHOLD_SIZE_IN_BYTES)
{
// Each node contains accumulated values for part of the data set, we have to aggregate accumulated values.
AggregateAccumulatorValuesAndUpdateEvaluation<ElemType>(net, evalNodesWhichAccumulateResult, gradHeader, mpi);
AggregateAccumulatorValuesAndUpdateEvaluation<ElemType>(net, evalNodesWhichAccumulateResult, gradHeader, mpi, packThresholdSizeInBytes);
// After values of accumulators have been aggregated accross nodes, we have to update evaluation results for
// evaluation nodes that accumulate results.

Просмотреть файл

@ -1511,7 +1511,7 @@ size_t SGD<ElemType>::TrainOneEpoch(ComputationNetworkPtr net,
// and recalculate evaluation errors based on accumulators.
AggregateAccumulatorValuesAndUpdateEpochEvaluation<ElemType>(
net, evaluationNodesWhichAccumulateResult, m_gradHeader, m_mpi, epochEvalErrors, evaluationNodes,
localEpochEvalErrors, ContainsAccumulatedResult);
localEpochEvalErrors, ContainsAccumulatedResult, m_packThresholdSizeInBytes);
}
return totalEpochSamples;
@ -2111,7 +2111,7 @@ void SGD<ElemType>::InitDistGradAgg(int numEvalNodes, int numGradientBits, int d
if (Globals::UseV2Aggregator()) // Currently used to check V2 against baselines.
m_distGradAgg = std::make_shared<V2SimpleDistGradAggregator<ElemType>>(m_mpi, m_bufferedAsyncGradientAggregation, deviceId, m_syncStatsTrace, ::CNTK::MPICommunicator());
else
m_distGradAgg = std::make_shared<SimpleDistGradAggregator<ElemType>>(m_mpi, m_bufferedAsyncGradientAggregation, deviceId, m_syncStatsTrace);
m_distGradAgg = std::make_shared<SimpleDistGradAggregator<ElemType>>(m_mpi, m_bufferedAsyncGradientAggregation, deviceId, m_syncStatsTrace, m_packThresholdSizeInBytes);
}
m_gradHeader.reset(DistGradHeader::Create(numEvalNodes), [](DistGradHeader* ptr) { DistGradHeader::Destroy(ptr); });
@ -2701,6 +2701,8 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
m_maxSamplesInRAM = configSGD(L"maxSamplesInRAM", (size_t) SIZE_MAX);
m_numSubminiBatches = configSGD(L"numSubminibatches", (size_t) 1);
m_packThresholdSizeInBytes = configSGD(L"packThresholdSizeInKB", DEFAULT_PACK_THRESHOLD_SIZE_IN_KB) * 1024;
if (configAALR.Exists(L"numMiniBatch4LRSearch"))
{
LOGPRINTF(stderr, "WARNING: 'numMiniBatch4LRSearch' is deprecated, please remove it and use 'numSamples4Search' instead.\n");

Просмотреть файл

@ -200,6 +200,9 @@ protected:
intargvector m_numSamples4Search;
size_t m_numBestSearchEpoch;
// Threshold size in bytes for single gradient to do packing
size_t m_packThresholdSizeInBytes;
LearningRateSearchAlgorithm m_autoLearnRateSearchType;
AdaptationRegType m_adaptationRegType;

Просмотреть файл

@ -6,6 +6,7 @@
#pragma once
#include "Constants.h"
#include "IDistGradAggregator.h"
#include "CUDAPageLockedMemAllocator.h"
#include "NcclComm.h"
@ -22,8 +23,9 @@ class SimpleDistGradAggregator : public IDistGradAggregator<ElemType>
UsingIDistGradAggregatorMembers;
public:
SimpleDistGradAggregator(const MPIWrapperPtr& mpi, bool useAsyncAggregation, int deviceId, int syncStatsTrace)
: IDistGradAggregator<ElemType>(mpi), m_useAsyncAggregation(useAsyncAggregation), m_initialized(false), m_bufferedGradHeader(nullptr), m_syncStatsTrace(syncStatsTrace), m_iterationCount(0), m_nccl(deviceId, mpi)
SimpleDistGradAggregator(const MPIWrapperPtr& mpi, bool useAsyncAggregation, int deviceId, int syncStatsTrace, size_t packThresholdSizeInBytes = DEFAULT_PACK_THRESHOLD_SIZE_IN_BYTES)
: IDistGradAggregator<ElemType>(mpi), m_useAsyncAggregation(useAsyncAggregation), m_initialized(false), m_bufferedGradHeader(nullptr), m_syncStatsTrace(syncStatsTrace),
m_iterationCount(0), m_nccl(deviceId, mpi), m_packThresholdSizeInBytes(packThresholdSizeInBytes)
{}
~SimpleDistGradAggregator()
@ -144,25 +146,65 @@ private:
m_initialized = true;
int deviceId = gradients[0]->GetDeviceId();
if (!m_nccl.IsSupported() && deviceId != CPUDEVICE)
if (!m_nccl.IsSupported() && (deviceId != CPUDEVICE))
m_allocator.reset(new CUDAPageLockedMemAllocator(deviceId));
size_t packedGradientsSizeInElements = 0;
for (size_t i = 0; i < gradients.size(); i++)
{
if (!m_useAsyncAggregation && sizeof(ElemType) * gradients[i]->GetNumElements() <= m_packThresholdSizeInBytes)
{
packedGradientsSizeInElements += gradients[i]->GetNumElements();
m_packedGradientsIndex.push_back(i);
}
else
{
m_gradientIndexToAggregate.push_back(i);
}
// Make sure none of the gradient matrixes are sparse - we currently do not support aggregation of sparse gradient matrices
if (gradients[i]->GetMatrixType() != DENSE)
RuntimeError("Gradient aggregation for sparse gradient matrices is currently unsupported!");
if (!m_nccl.IsSupported() && deviceId != CPUDEVICE)
{
m_gpuDataTransferers.push_back(std::make_unique<GPUDataTransferer>(deviceId, m_useAsyncAggregation));
m_intermediateCPUBuffers.push_back(AllocateIntermediateBuffer(deviceId, gradients[i]->GetNumElements()));
}
if (m_useAsyncAggregation)
m_bufferedGradients[gradients[i]].reset(new Matrix<ElemType>(gradients[i]->GetNumRows(), gradients[i]->GetNumCols(), deviceId));
}
// Packing matrices into continous buffer if not doing async aggregation
m_aggregationBuffer.reset();
if (packedGradientsSizeInElements > 0)
{
m_aggregationBuffer.reset(new (std::nothrow) Matrix<ElemType>(1, packedGradientsSizeInElements, deviceId));
}
// If no extra continous buffer allocated or using async aggregation
if (m_aggregationBuffer == nullptr)
{
m_gradientIndexToAggregate.clear();
m_packedGradientsIndex.clear();
packedGradientsSizeInElements = 0;
// Reuse "@param m_gradientIndexToAggregate" for following code, if no continous buffer allocated
for (size_t i = 0; i < gradients.size(); i++)
{
m_gradientIndexToAggregate.push_back(i);
}
}
else
{
// First element is reserved for continous buffer
m_gradientIndexToAggregate.insert(m_gradientIndexToAggregate.begin(), 1, (size_t)-1);
}
// If running on GPU and NCCL not supported, initialize GPU and CPU data transfer
if (!m_nccl.IsSupported() && (deviceId != CPUDEVICE))
{
for (size_t i : m_gradientIndexToAggregate)
{
m_gpuDataTransferers.push_back(std::make_unique<GPUDataTransferer>(deviceId, m_useAsyncAggregation));
m_intermediateCPUBuffers.push_back(AllocateIntermediateBuffer(deviceId,
(i == -1) ? packedGradientsSizeInElements : gradients[i]->GetNumElements()));
}
}
if (m_useAsyncAggregation)
{
m_bufferedGradHeader = DistGradHeader::Create(numEvalNodes);
@ -223,11 +265,33 @@ private:
}
}
// Initiate transfer of the gradient matrices to the CPU if needed
if (!m_nccl.IsSupported() && deviceId >= 0)
// Copy all gradient data into a single contiguous buffer, if additional continous buffer allocated
size_t offset = 0;
for (size_t i : m_packedGradientsIndex)
{
for (size_t i = 0; i < numGradMatrices; ++i)
m_gpuDataTransferers[i]->CopyGPUToCPUAsync(gradients[i]->Data(), gradients[i]->GetNumElements(), m_intermediateCPUBuffers[i].get());
m_aggregationBuffer->ColumnSlice(offset, gradients[i]->GetNumElements()).AssignValuesOf(gradients[i]->Reshaped(1, gradients[i]->GetNumElements()));
offset += gradients[i]->GetNumElements();
}
// Initiate transfer of the bufferred data to the CPU if needed
if (!m_nccl.IsSupported() && deviceId != CPUDEVICE)
{
size_t gpuDataTransfersIdx = 0;
Matrix<ElemType>* gpuCopyBuffer = m_aggregationBuffer.get();
for (size_t i : m_gradientIndexToAggregate)
{
if (i != -1)
{
gpuCopyBuffer = gradients[i];
}
else
{
// i == -1, first element is for packed gradients, which should not be with AsyncAggregation
assert(m_useAsyncAggregation == false);
}
m_gpuDataTransferers[gpuDataTransfersIdx]->CopyGPUToCPUAsync(gpuCopyBuffer->Data(), gpuCopyBuffer->GetNumElements(), m_intermediateCPUBuffers[gpuDataTransfersIdx].get());
gpuDataTransfersIdx++;
}
}
// Initiate receive of the header on the main node
@ -248,26 +312,35 @@ private:
m_mpi->Isend(headerCPU, headerCPU->Size(), MPI_CHAR, m_mpi->MainNodeRank(), numGradMatrices, &sendHeaderRequest) || MpiFail("MPI_Isend");
// Perform async allreduce on the gradient data
std::vector<MPI_Request> allReduceRequests(numGradMatrices);
std::vector<MPI_Request> allReduceRequests;
if (!m_nccl.IsSupported())
{
for (size_t i = 0; i < numGradMatrices; ++i)
size_t allReduceIndex = 0;
ElemType* reductionBuffer;
for (size_t i : m_gradientIndexToAggregate)
{
ElemType* reductionBuffer = gradients[i]->Data();
if (deviceId >= 0)
allReduceRequests.push_back(MPI_Request());
reductionBuffer = (i == -1)? m_aggregationBuffer->Data() : gradients[i]->Data();
if (deviceId != CPUDEVICE)
{
m_gpuDataTransferers[i]->WaitForCopyGPUToCPUAsync();
reductionBuffer = m_intermediateCPUBuffers[i].get();
m_gpuDataTransferers[allReduceIndex]->WaitForCopyGPUToCPUAsync();
reductionBuffer = m_intermediateCPUBuffers[allReduceIndex].get();
}
// On Windows this async MPI_Iallreduce call requires MS MPI v7 or higher to be installed
m_mpi->Iallreduce(MPI_IN_PLACE, reductionBuffer, gradients[i]->GetNumElements(),
MPIWrapper::GetDataType(reductionBuffer), MPI_SUM,
&allReduceRequests[i]) || MpiFail("MPI_Iallreduce");
m_mpi->Iallreduce(MPI_IN_PLACE, reductionBuffer, (i == -1) ? m_aggregationBuffer->GetNumElements() : gradients[i]->GetNumElements(),
MPIWrapper::GetDataType(reductionBuffer), MPI_SUM, &allReduceRequests.back()) || MpiFail("MPI_Iallreduce");
allReduceIndex++;
}
}
}
else
m_nccl.AllReduce(gradients);
{
std::vector<Matrix<ElemType>*> ncclReduceGradients;
for (size_t i : m_gradientIndexToAggregate)
{
ncclReduceGradients.push_back((i == -1) ? m_aggregationBuffer.get() : gradients[i]);
}
m_nccl.AllReduce(ncclReduceGradients);
}
// On the main node wait for the headers to arrive and aggregate
if (m_mpi->IsMainNode())
@ -290,52 +363,48 @@ private:
assert(numNodesHeadersReceivedFrom == (NumProc() - 1));
}
// Initiate receive of the aggregate header
MPI_Request recvAggHeaderRequest;
if (!m_mpi->IsMainNode())
m_mpi->Irecv(headerCPU, headerCPU->Size(), MPI_CHAR, m_mpi->MainNodeRank(), numGradMatrices + 1 + numGradMatrices, &recvAggHeaderRequest) || MpiFail("MPI_Irecv");
// Broadcast the aggregated header to all nodes
m_mpi->Bcast(headerCPU, headerCPU->Size(), MPI_CHAR, m_mpi->MainNodeRank());
// Intiate send of the aggregate header from main node
std::vector<MPI_Request> sendAggHeaderRequests(NumProc() - 1);
if (m_mpi->IsMainNode())
{
for (size_t j = 0; j < NumProc() - 1; ++j)
{
int dest = (j >= MyRank()) ? (j + 1) : j;
// TODO: Should we use MPI_Bcast instead for better performance
m_mpi->Isend(headerCPU, headerCPU->Size(), MPI_CHAR, dest, numGradMatrices + 1 + numGradMatrices, &(sendAggHeaderRequests[j])) || MpiFail("MPI_Isend");
}
}
// Wait for the allreduce operations to finish and initiate transfer back to the GPU if needed
if (!m_nccl.IsSupported())
{
for (size_t i = 0; i < numGradMatrices; ++i)
{
m_mpi->Wait(&allReduceRequests[i], MPI_STATUSES_IGNORE) || MpiFail("MPI_Wait");
if (deviceId >= 0)
m_gpuDataTransferers[i]->CopyCPUToGPUAsync(m_intermediateCPUBuffers[i].get(), gradients[i]->GetNumElements(), gradients[i]->Data());
}
}
// Wait to receive aggregate header
if (!m_mpi->IsMainNode())
m_mpi->Wait(&recvAggHeaderRequest, MPI_STATUSES_IGNORE) || MpiFail("MPI_Wait");
// Wait for all the transfers to finish
if (m_nccl.IsSupported())
m_nccl.Sync();
else if (deviceId >= 0)
{
for (size_t i = 0; i < numGradMatrices; ++i)
m_gpuDataTransferers[i]->WaitForCopyCPUToGPUAsync();
m_nccl.Sync();
}
else
{
// Wait for the allreduce operations to finish and initiate transfer back to the GPU if needed
size_t gpuDataTransfersIdx = 0; // Index of allReduceRequest for each un-packed gradient
for (size_t i : m_gradientIndexToAggregate)
{
m_mpi->Wait(&allReduceRequests[gpuDataTransfersIdx], MPI_STATUSES_IGNORE) || MpiFail("MPI_Wait");
if (deviceId != CPUDEVICE)
{
m_gpuDataTransferers[gpuDataTransfersIdx]->CopyCPUToGPUAsync(m_intermediateCPUBuffers[gpuDataTransfersIdx].get(),
(i == -1) ? m_aggregationBuffer->GetNumElements() : gradients[i]->GetNumElements(),
(i == -1) ? m_aggregationBuffer->Data() : gradients[i]->Data());
}
gpuDataTransfersIdx++;
}
// Wait for copy data from CPU to GPU, if not running on CPU and not NCCL enabled
if (deviceId != CPUDEVICE)
{
for (size_t i = 0; i < m_gradientIndexToAggregate.size(); i++)
m_gpuDataTransferers[i]->WaitForCopyCPUToGPUAsync();
}
}
// Copy data back to the packed gradients from the continous buffer
offset = 0;
for (size_t i : m_packedGradientsIndex)
{
gradients[i]->AssignValuesOf(m_aggregationBuffer->ColumnSlice(offset, gradients[i]->GetNumElements()).Reshaped(gradients[i]->GetNumRows(), gradients[i]->GetNumCols()));
offset += gradients[i]->GetNumElements();
}
// Wait for completion of the async send requests
if (!m_mpi->IsMainNode())
m_mpi->Wait(&sendHeaderRequest, MPI_STATUSES_IGNORE) || MpiFail("MPI_Wait");
else
m_mpi->Waitall(sendAggHeaderRequests.size(), sendAggHeaderRequests.data(), MPI_STATUSES_IGNORE) || MpiFail("MPI_Waitall");
if (showSyncPerfStats)
{
@ -347,8 +416,8 @@ private:
private:
std::unique_ptr<CUDAPageLockedMemAllocator> m_allocator;
std::vector<std::shared_ptr<ElemType>> m_intermediateCPUBuffers;
std::vector<std::shared_ptr<ElemType>> m_intermediateCPUBuffers;
std::vector<std::unique_ptr<GPUDataTransferer>> m_gpuDataTransferers;
std::vector<DistGradHeader*> m_recvHeaders;
@ -363,6 +432,13 @@ private:
std::unordered_map<Matrix<ElemType>*, std::unique_ptr<Matrix<ElemType>>> m_bufferedGradients;
DistGradHeader* m_bufferedGradHeader;
// Packing small gradients (size not larger than threshold size) into a continous buffer to reduce MPI calls.
// Threshold size to pack a gradient into the continous buffer, default 32KB (tunable by define "packThresholdSizeInKB=[value]")
const size_t m_packThresholdSizeInBytes;
std::unique_ptr<Matrix<ElemType>> m_aggregationBuffer;
std::vector<size_t> m_packedGradientsIndex;
std::vector<size_t> m_gradientIndexToAggregate;
int m_syncStatsTrace;
// Only used for controlling frequency of measuring/showing gradient aggregation perf stats

Просмотреть файл

@ -258,7 +258,7 @@ public:
// maxValues (input): values of max elements in label input vectors
// labels (input): 1-hot vector with frame-level phone labels
// CTCPosterior (output): CTC posterior
// blankTokenId (input): id of the blank token
// blankTokenId (input): id of the blank token. If specified as SIZE_MAX, will be replaced with (numberOfLabels - 1)
// delayConstraint -- label output delay constraint introduced during training that allows to have shorter delay during inference. This using the original time information to enforce that CTC tokens only get aligned within a time margin.
// Setting this parameter smaller will result in shorted delay between label output during decoding, yet may hurt accuracy.
// delayConstraint=-1 means no constraint
@ -285,7 +285,7 @@ public:
std::vector<size_t> phoneBound;
ElemType finalScore = 0;
if (blankTokenId == INT_MIN)
if (blankTokenId == SIZE_MAX)
blankTokenId = numRows - 1;
size_t mbsize = numCols / numParallelSequences;
@ -374,7 +374,7 @@ public:
Microsoft::MSR::CNTK::Matrix<ElemType> alpha(m_deviceid);
Microsoft::MSR::CNTK::Matrix<ElemType> beta(m_deviceid);
CTCPosterior.AssignCTCScore(prob, alpha, beta, matrixPhoneSeqs, matrixPhoneBounds, finalScore, uttToChanInd, uttBeginFrame,
uttFrameNum, uttPhoneNum, numParallelSequences, mbsize, delayConstraint, /*isColWise=*/true );
uttFrameNum, uttPhoneNum, numParallelSequences, mbsize, blankTokenId, delayConstraint, /*isColWise=*/true );
Microsoft::MSR::CNTK::Matrix<ElemType> rowSum(m_deviceid);
rowSum.Resize(1, numCols);

Просмотреть файл

@ -209,6 +209,15 @@ void TrainSequenceToSequenceTranslator(const DeviceDescriptor& device, bool useS
break;
trainer->TrainMinibatch({ { rawInput, minibatchData[rawInputStreamInfo] }, { rawLabels, minibatchData[rawLabelsStreamInfo] } }, device);
// Some basic sanity tests on the training loss and evaluation error values
auto IsNegativeOrNan = [](double value) {
return (value < 0) || std::isnan(value);
};
if (IsNegativeOrNan(trainer->PreviousMinibatchLossAverage()) || IsNegativeOrNan(trainer->PreviousMinibatchEvaluationAverage()))
ReportFailure("SequenceToSequence: Invalid (-ve or nan) loss or evaluation metric encountered in training of the SequenceToSequence model.");
PrintTrainingProgress(trainer, i, outputFrequencyInMinibatches);
if ((i + 1) == numMinibatchesToCheckpointAfter)
@ -232,9 +241,12 @@ void TrainSequenceToSequenceTranslator()
{
fprintf(stderr, "\nTrainSequenceToSequenceTranslator..\n");
// TODO: Also test with sparse input variables in the graph
TrainSequenceToSequenceTranslator(DeviceDescriptor::CPUDevice(), false, true, false, false, true, true);
TrainSequenceToSequenceTranslator(DeviceDescriptor::CPUDevice(), true, false, false, false, true, true);
if (IsGPUAvailable())
{
TrainSequenceToSequenceTranslator(DeviceDescriptor::GPUDevice(0), false, false, true, true, false, false);
TrainSequenceToSequenceTranslator(DeviceDescriptor::GPUDevice(0), true, true, true, true, false, false);
}
}

Просмотреть файл

@ -12,7 +12,7 @@ DOC_DIR=$TEST_ROOT_DIR/../../bindings/python/doc
pushd $DOC_DIR || exit $?
echo Current dir: $PWD
py.test --deviceid $TEST_DEVICE
py.test --verbose --deviceid $TEST_DEVICE
if [ "$?" -eq "0" ]; then
echo "__COMPLETED__"

Просмотреть файл

@ -7,7 +7,7 @@ python -c "import numpy; print('NumPy: %s'%numpy.version.full_version)"
python -c "import scipy; print('SciPy: %s'%scipy.version.full_version)"
python -c "import pytest; print('PyTest: %s'%pytest.__version__)"
py.test --deviceid $TEST_DEVICE --is1bitsgd $TEST_1BIT_SGD
py.test --verbose --deviceid $TEST_DEVICE --is1bitsgd $TEST_1BIT_SGD
if [ "$?" -eq "0" ]; then
echo "__COMPLETED__"

Просмотреть файл

@ -10,7 +10,7 @@ python -c "import pytest; print('PyTest: %s'%pytest.__version__)"
# TODO why doesn't "py.test --pyargs cntk" work?
MODULE_DIR="$(python -c "import cntk, os, sys; sys.stdout.write(os.path.dirname(os.path.abspath(cntk.__file__)))")"
[ $? -eq 0 ] || exit $?
py.test "$MODULE_DIR" --deviceid $TEST_DEVICE --is1bitsgd $TEST_1BIT_SGD --doctest-modules
py.test "$MODULE_DIR" --verbose --deviceid $TEST_DEVICE --is1bitsgd $TEST_1BIT_SGD --doctest-modules
if [ "$?" -eq "0" ]; then
echo "__COMPLETED__"

Просмотреть файл

@ -7,7 +7,7 @@ python -c "import numpy; print('NumPy: %s'%numpy.version.full_version)"
python -c "import scipy; print('SciPy: %s'%scipy.version.full_version)"
python -c "import pytest; print('PyTest: %s'%pytest.__version__)"
py.test --deviceid $TEST_DEVICE --is1bitsgd $TEST_1BIT_SGD
py.test --verbose --deviceid $TEST_DEVICE --is1bitsgd $TEST_1BIT_SGD
if [ "$?" -eq "0" ]; then
echo "__COMPLETED__"

Просмотреть файл

@ -471,67 +471,67 @@ Here are the ones that don't share memory:
02/23/2017 05:17:41: Precomputing --> Completed.
02/23/2017 05:17:41: Starting Epoch 1: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/18/2017 07:06:38: Starting Epoch 1: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/23/2017 05:17:41: Starting minibatch loop.
02/23/2017 05:17:42: Finished Epoch[ 1 of 10]: [Training] cr = 4.15554279 * 368; Err = 2.48529418 * 368; totalSamplesSeen = 368; learningRatePerSample = 0.0049999999; epochTime=0.465629s
02/23/2017 05:17:42: SGD: Saving checkpoint model '/tmp/cntk-test-20170223051714.228082/Speech_LSTM_CTC@release_cpu/models/simple.dnn.1'
02/18/2017 07:06:38: Starting minibatch loop.
02/18/2017 07:06:39: Finished Epoch[ 1 of 10]: [Training] cr = 4.16293501 * 368; Err = 2.52941181 * 368; totalSamplesSeen = 368; learningRatePerSample = 0.0049999999; epochTime=1.00518s
02/18/2017 07:06:39: SGD: Saving checkpoint model '/tmp/cntk-test-20170218070416.834755/Speech_LSTM_CTC@debug_gpu/models/simple.dnn.1'
02/23/2017 05:17:42: Starting Epoch 2: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/18/2017 07:06:39: Starting Epoch 2: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/23/2017 05:17:42: Starting minibatch loop.
02/23/2017 05:17:42: Finished Epoch[ 2 of 10]: [Training] cr = 3.68123707 * 438; Err = 1.00000000 * 438; totalSamplesSeen = 806; learningRatePerSample = 0.0049999999; epochTime=0.604311s
02/23/2017 05:17:43: SGD: Saving checkpoint model '/tmp/cntk-test-20170223051714.228082/Speech_LSTM_CTC@release_cpu/models/simple.dnn.2'
02/18/2017 07:06:39: Starting minibatch loop.
02/18/2017 07:06:41: Finished Epoch[ 2 of 10]: [Training] cr = 3.68804012 * 438; Err = 1.00000000 * 438; totalSamplesSeen = 806; learningRatePerSample = 0.0049999999; epochTime=1.21725s
02/18/2017 07:06:41: SGD: Saving checkpoint model '/tmp/cntk-test-20170218070416.834755/Speech_LSTM_CTC@debug_gpu/models/simple.dnn.2'
02/23/2017 05:17:43: Starting Epoch 3: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/18/2017 07:06:41: Starting Epoch 3: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/23/2017 05:17:43: Starting minibatch loop.
02/23/2017 05:17:43: Finished Epoch[ 3 of 10]: [Training] cr = 0.00000000 * 0; Err = 0.00000000 * 0; totalSamplesSeen = 806; learningRatePerSample = 0.0049999999; epochTime=0.000737422s
02/23/2017 05:17:43: SGD: Saving checkpoint model '/tmp/cntk-test-20170223051714.228082/Speech_LSTM_CTC@release_cpu/models/simple.dnn.3'
02/18/2017 07:06:41: Starting minibatch loop.
02/18/2017 07:06:41: Finished Epoch[ 3 of 10]: [Training] cr = 0.00000000 * 0; Err = 0.00000000 * 0; totalSamplesSeen = 806; learningRatePerSample = 0.0049999999; epochTime=0.00115141s
02/18/2017 07:06:41: SGD: Saving checkpoint model '/tmp/cntk-test-20170218070416.834755/Speech_LSTM_CTC@debug_gpu/models/simple.dnn.3'
02/23/2017 05:17:43: Starting Epoch 4: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/18/2017 07:06:41: Starting Epoch 4: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/23/2017 05:17:43: Starting minibatch loop.
02/23/2017 05:17:43: Finished Epoch[ 4 of 10]: [Training] cr = 2.09130743 * 368; Err = 1.00000000 * 368; totalSamplesSeen = 1174; learningRatePerSample = 0.0049999999; epochTime=0.442559s
02/23/2017 05:17:43: SGD: Saving checkpoint model '/tmp/cntk-test-20170223051714.228082/Speech_LSTM_CTC@release_cpu/models/simple.dnn.4'
02/18/2017 07:06:41: Starting minibatch loop.
02/18/2017 07:06:42: Finished Epoch[ 4 of 10]: [Training] cr = 2.14839206 * 368; Err = 1.00000000 * 368; totalSamplesSeen = 1174; learningRatePerSample = 0.0049999999; epochTime=0.99298s
02/18/2017 07:06:42: SGD: Saving checkpoint model '/tmp/cntk-test-20170218070416.834755/Speech_LSTM_CTC@debug_gpu/models/simple.dnn.4'
02/23/2017 05:17:43: Starting Epoch 5: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/18/2017 07:06:42: Starting Epoch 5: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/23/2017 05:17:43: Starting minibatch loop.
02/23/2017 05:17:44: Finished Epoch[ 5 of 10]: [Training] cr = 464.95003780 * 248; Err = 1.00000000 * 248; totalSamplesSeen = 1422; learningRatePerSample = 0.0049999999; epochTime=0.28969s
02/23/2017 05:17:44: SGD: Saving checkpoint model '/tmp/cntk-test-20170223051714.228082/Speech_LSTM_CTC@release_cpu/models/simple.dnn.5'
02/18/2017 07:06:42: Starting minibatch loop.
02/18/2017 07:06:43: Finished Epoch[ 5 of 10]: [Training] cr = 383.37273185 * 248; Err = 1.00000000 * 248; totalSamplesSeen = 1422; learningRatePerSample = 0.0049999999; epochTime=0.675824s
02/18/2017 07:06:43: SGD: Saving checkpoint model '/tmp/cntk-test-20170218070416.834755/Speech_LSTM_CTC@debug_gpu/models/simple.dnn.5'
02/23/2017 05:17:44: Starting Epoch 6: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/18/2017 07:06:43: Starting Epoch 6: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/23/2017 05:17:44: Starting minibatch loop.
02/23/2017 05:17:44: Finished Epoch[ 6 of 10]: [Training] cr = 1.84468669 * 248; Err = 1.00000000 * 248; totalSamplesSeen = 1670; learningRatePerSample = 0.0049999999; epochTime=0.283613s
02/23/2017 05:17:44: SGD: Saving checkpoint model '/tmp/cntk-test-20170223051714.228082/Speech_LSTM_CTC@release_cpu/models/simple.dnn.6'
02/18/2017 07:06:43: Starting minibatch loop.
02/18/2017 07:06:43: Finished Epoch[ 6 of 10]: [Training] cr = 1.82054593 * 248; Err = 1.00000000 * 248; totalSamplesSeen = 1670; learningRatePerSample = 0.0049999999; epochTime=0.662799s
02/18/2017 07:06:44: SGD: Saving checkpoint model '/tmp/cntk-test-20170218070416.834755/Speech_LSTM_CTC@debug_gpu/models/simple.dnn.6'
02/23/2017 05:17:44: Starting Epoch 7: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/18/2017 07:06:44: Starting Epoch 7: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/23/2017 05:17:44: Starting minibatch loop.
02/23/2017 05:17:44: Finished Epoch[ 7 of 10]: [Training] cr = 1.71730664 * 358; Err = 1.00000000 * 358; totalSamplesSeen = 2028; learningRatePerSample = 0.0049999999; epochTime=0.423181s
02/23/2017 05:17:45: SGD: Saving checkpoint model '/tmp/cntk-test-20170223051714.228082/Speech_LSTM_CTC@release_cpu/models/simple.dnn.7'
02/18/2017 07:06:44: Starting minibatch loop.
02/18/2017 07:06:45: Finished Epoch[ 7 of 10]: [Training] cr = 1.70413907 * 358; Err = 1.00000000 * 358; totalSamplesSeen = 2028; learningRatePerSample = 0.0049999999; epochTime=0.973855s
02/18/2017 07:06:45: SGD: Saving checkpoint model '/tmp/cntk-test-20170218070416.834755/Speech_LSTM_CTC@debug_gpu/models/simple.dnn.7'
02/23/2017 05:17:45: Starting Epoch 8: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/18/2017 07:06:45: Starting Epoch 8: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/23/2017 05:17:45: Starting minibatch loop.
02/23/2017 05:17:45: Finished Epoch[ 8 of 10]: [Training] cr = 0.00000000 * 0; Err = 0.00000000 * 0; totalSamplesSeen = 2028; learningRatePerSample = 0.0049999999; epochTime=0.000582587s
02/23/2017 05:17:45: SGD: Saving checkpoint model '/tmp/cntk-test-20170223051714.228082/Speech_LSTM_CTC@release_cpu/models/simple.dnn.8'
02/18/2017 07:06:45: Starting minibatch loop.
02/18/2017 07:06:45: Finished Epoch[ 8 of 10]: [Training] cr = 0.00000000 * 0; Err = 0.00000000 * 0; totalSamplesSeen = 2028; learningRatePerSample = 0.0049999999; epochTime=0.00162166s
02/18/2017 07:06:45: SGD: Saving checkpoint model '/tmp/cntk-test-20170218070416.834755/Speech_LSTM_CTC@debug_gpu/models/simple.dnn.8'
02/23/2017 05:17:45: Starting Epoch 9: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/18/2017 07:06:45: Starting Epoch 9: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/23/2017 05:17:45: Starting minibatch loop.
02/23/2017 05:17:45: Finished Epoch[ 9 of 10]: [Training] cr = 1.20227814 * 308; Err = 1.00000000 * 308; totalSamplesSeen = 2336; learningRatePerSample = 0.0049999999; epochTime=0.358506s
02/23/2017 05:17:45: SGD: Saving checkpoint model '/tmp/cntk-test-20170223051714.228082/Speech_LSTM_CTC@release_cpu/models/simple.dnn.9'
02/18/2017 07:06:45: Starting minibatch loop.
02/18/2017 07:06:46: Finished Epoch[ 9 of 10]: [Training] cr = 1.19612240 * 308; Err = 1.00000000 * 308; totalSamplesSeen = 2336; learningRatePerSample = 0.0049999999; epochTime=0.835254s
02/18/2017 07:06:46: SGD: Saving checkpoint model '/tmp/cntk-test-20170218070416.834755/Speech_LSTM_CTC@debug_gpu/models/simple.dnn.9'
02/23/2017 05:17:45: Starting Epoch 10: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/18/2017 07:06:46: Starting Epoch 10: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/23/2017 05:17:45: Starting minibatch loop.
02/23/2017 05:17:46: Finished Epoch[10 of 10]: [Training] cr = 1.33477291 * 608; Err = 1.00000000 * 608; totalSamplesSeen = 2944; learningRatePerSample = 0.0049999999; epochTime=0.912103s
02/23/2017 05:17:46: SGD: Saving checkpoint model '/tmp/cntk-test-20170223051714.228082/Speech_LSTM_CTC@release_cpu/models/simple.dnn'
02/18/2017 07:06:46: Starting minibatch loop.
02/18/2017 07:06:48: Finished Epoch[10 of 10]: [Training] cr = 1.33511935 * 608; Err = 1.00000000 * 608; totalSamplesSeen = 2944; learningRatePerSample = 0.0049999999; epochTime=1.71111s
02/18/2017 07:06:48: SGD: Saving checkpoint model '/tmp/cntk-test-20170218070416.834755/Speech_LSTM_CTC@debug_gpu/models/simple.dnn'
02/23/2017 05:17:46: Action "train" complete.
02/18/2017 07:06:48: Action "train" complete.
02/23/2017 05:17:46: __COMPLETED__
02/18/2017 07:06:48: __COMPLETED__
=== Deleting last epoch data

Просмотреть файл

@ -342,13 +342,13 @@ Memory Sharing: Out of 200 matrices, 71 are shared as 27, and 129 are not shared
02/18/2017 07:06:38: Starting Epoch 1: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/18/2017 07:06:38: Starting minibatch loop.
02/18/2017 07:06:39: Finished Epoch[ 1 of 10]: [Training] cr = 4.15554279 * 368; Err = 2.48529418 * 368; totalSamplesSeen = 368; learningRatePerSample = 0.0049999999; epochTime=1.00518s
02/18/2017 07:06:39: Finished Epoch[ 1 of 10]: [Training] cr = 4.16293501 * 368; Err = 2.52941181 * 368; totalSamplesSeen = 368; learningRatePerSample = 0.0049999999; epochTime=1.00518s
02/18/2017 07:06:39: SGD: Saving checkpoint model '/tmp/cntk-test-20170218070416.834755/Speech_LSTM_CTC@debug_gpu/models/simple.dnn.1'
02/18/2017 07:06:39: Starting Epoch 2: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/18/2017 07:06:39: Starting minibatch loop.
02/18/2017 07:06:41: Finished Epoch[ 2 of 10]: [Training] cr = 3.68123763 * 438; Err = 1.00000000 * 438; totalSamplesSeen = 806; learningRatePerSample = 0.0049999999; epochTime=1.21725s
02/18/2017 07:06:41: Finished Epoch[ 2 of 10]: [Training] cr = 3.68804068 * 438; Err = 1.00000000 * 438; totalSamplesSeen = 806; learningRatePerSample = 0.0049999999; epochTime=1.21725s
02/18/2017 07:06:41: SGD: Saving checkpoint model '/tmp/cntk-test-20170218070416.834755/Speech_LSTM_CTC@debug_gpu/models/simple.dnn.2'
02/18/2017 07:06:41: Starting Epoch 3: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
@ -360,25 +360,25 @@ Memory Sharing: Out of 200 matrices, 71 are shared as 27, and 129 are not shared
02/18/2017 07:06:41: Starting Epoch 4: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/18/2017 07:06:41: Starting minibatch loop.
02/18/2017 07:06:42: Finished Epoch[ 4 of 10]: [Training] cr = 2.09130760 * 368; Err = 1.00000000 * 368; totalSamplesSeen = 1174; learningRatePerSample = 0.0049999999; epochTime=0.99298s
02/18/2017 07:06:42: Finished Epoch[ 4 of 10]: [Training] cr = 2.14839604 * 368; Err = 1.00000000 * 368; totalSamplesSeen = 1174; learningRatePerSample = 0.0049999999; epochTime=0.99298s
02/18/2017 07:06:42: SGD: Saving checkpoint model '/tmp/cntk-test-20170218070416.834755/Speech_LSTM_CTC@debug_gpu/models/simple.dnn.4'
02/18/2017 07:06:42: Starting Epoch 5: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/18/2017 07:06:42: Starting minibatch loop.
02/18/2017 07:06:43: Finished Epoch[ 5 of 10]: [Training] cr = 464.94988029 * 248; Err = 1.00000000 * 248; totalSamplesSeen = 1422; learningRatePerSample = 0.0049999999; epochTime=0.675824s
02/18/2017 07:06:43: Finished Epoch[ 5 of 10]: [Training] cr = 383.36677797 * 248; Err = 1.00000000 * 248; totalSamplesSeen = 1422; learningRatePerSample = 0.0049999999; epochTime=0.675824s
02/18/2017 07:06:43: SGD: Saving checkpoint model '/tmp/cntk-test-20170218070416.834755/Speech_LSTM_CTC@debug_gpu/models/simple.dnn.5'
02/18/2017 07:06:43: Starting Epoch 6: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/18/2017 07:06:43: Starting minibatch loop.
02/18/2017 07:06:43: Finished Epoch[ 6 of 10]: [Training] cr = 1.84473739 * 248; Err = 1.00000000 * 248; totalSamplesSeen = 1670; learningRatePerSample = 0.0049999999; epochTime=0.662799s
02/18/2017 07:06:43: Finished Epoch[ 6 of 10]: [Training] cr = 1.82060106 * 248; Err = 1.00000000 * 248; totalSamplesSeen = 1670; learningRatePerSample = 0.0049999999; epochTime=0.662799s
02/18/2017 07:06:44: SGD: Saving checkpoint model '/tmp/cntk-test-20170218070416.834755/Speech_LSTM_CTC@debug_gpu/models/simple.dnn.6'
02/18/2017 07:06:44: Starting Epoch 7: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/18/2017 07:06:44: Starting minibatch loop.
02/18/2017 07:06:45: Finished Epoch[ 7 of 10]: [Training] cr = 1.71734363 * 358; Err = 1.00000000 * 358; totalSamplesSeen = 2028; learningRatePerSample = 0.0049999999; epochTime=0.973855s
02/18/2017 07:06:45: Finished Epoch[ 7 of 10]: [Training] cr = 1.70418050 * 358; Err = 1.00000000 * 358; totalSamplesSeen = 2028; learningRatePerSample = 0.0049999999; epochTime=0.973855s
02/18/2017 07:06:45: SGD: Saving checkpoint model '/tmp/cntk-test-20170218070416.834755/Speech_LSTM_CTC@debug_gpu/models/simple.dnn.7'
02/18/2017 07:06:45: Starting Epoch 8: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
@ -390,13 +390,13 @@ Memory Sharing: Out of 200 matrices, 71 are shared as 27, and 129 are not shared
02/18/2017 07:06:45: Starting Epoch 9: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/18/2017 07:06:45: Starting minibatch loop.
02/18/2017 07:06:46: Finished Epoch[ 9 of 10]: [Training] cr = 1.20229756 * 308; Err = 1.00000000 * 308; totalSamplesSeen = 2336; learningRatePerSample = 0.0049999999; epochTime=0.835254s
02/18/2017 07:06:46: Finished Epoch[ 9 of 10]: [Training] cr = 1.19614708 * 308; Err = 1.00000000 * 308; totalSamplesSeen = 2336; learningRatePerSample = 0.0049999999; epochTime=0.835254s
02/18/2017 07:06:46: SGD: Saving checkpoint model '/tmp/cntk-test-20170218070416.834755/Speech_LSTM_CTC@debug_gpu/models/simple.dnn.9'
02/18/2017 07:06:46: Starting Epoch 10: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/18/2017 07:06:46: Starting minibatch loop.
02/18/2017 07:06:48: Finished Epoch[10 of 10]: [Training] cr = 1.33476970 * 608; Err = 1.00000000 * 608; totalSamplesSeen = 2944; learningRatePerSample = 0.0049999999; epochTime=1.71111s
02/18/2017 07:06:48: Finished Epoch[10 of 10]: [Training] cr = 1.33512417 * 608; Err = 1.00000000 * 608; totalSamplesSeen = 2944; learningRatePerSample = 0.0049999999; epochTime=1.71111s
02/18/2017 07:06:48: SGD: Saving checkpoint model '/tmp/cntk-test-20170218070416.834755/Speech_LSTM_CTC@debug_gpu/models/simple.dnn'
02/18/2017 07:06:48: Action "train" complete.

Просмотреть файл

@ -467,13 +467,13 @@ Here are the ones that don't share memory:
02/22/2017 21:20:37: Starting Epoch 1: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/22/2017 21:20:37: Starting minibatch loop.
02/22/2017 21:20:38: Finished Epoch[ 1 of 10]: [Training] cr = 4.15554279 * 368; Err = 2.48529418 * 368; totalSamplesSeen = 368; learningRatePerSample = 0.0049999999; epochTime=0.926568s
02/22/2017 21:20:38: Finished Epoch[ 1 of 10]: [Training] cr = 4.16293534 * 368; Err = 2.52941181 * 368; totalSamplesSeen = 368; learningRatePerSample = 0.0049999999; epochTime=0.926568s
02/22/2017 21:20:38: SGD: Saving checkpoint model 'C:\Users\svcphil\AppData\Local\Temp\cntk-test-20170222211959.232036\Speech_LSTM_CTC@release_cpu/models/simple.dnn.1'
02/22/2017 21:20:38: Starting Epoch 2: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/22/2017 21:20:38: Starting minibatch loop.
02/22/2017 21:20:40: Finished Epoch[ 2 of 10]: [Training] cr = 3.68123707 * 438; Err = 1.00000000 * 438; totalSamplesSeen = 806; learningRatePerSample = 0.0049999999; epochTime=1.07618s
02/22/2017 21:20:40: Finished Epoch[ 2 of 10]: [Training] cr = 3.68804096 * 438; Err = 1.00000000 * 438; totalSamplesSeen = 806; learningRatePerSample = 0.0049999999; epochTime=1.07618s
02/22/2017 21:20:40: SGD: Saving checkpoint model 'C:\Users\svcphil\AppData\Local\Temp\cntk-test-20170222211959.232036\Speech_LSTM_CTC@release_cpu/models/simple.dnn.2'
02/22/2017 21:20:40: Starting Epoch 3: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
@ -485,25 +485,25 @@ Here are the ones that don't share memory:
02/22/2017 21:20:40: Starting Epoch 4: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/22/2017 21:20:40: Starting minibatch loop.
02/22/2017 21:20:41: Finished Epoch[ 4 of 10]: [Training] cr = 2.09131158 * 368; Err = 1.00000000 * 368; totalSamplesSeen = 1174; learningRatePerSample = 0.0049999999; epochTime=0.749923s
02/22/2017 21:20:41: Finished Epoch[ 4 of 10]: [Training] cr = 2.14839720 * 368; Err = 1.00000000 * 368; totalSamplesSeen = 1174; learningRatePerSample = 0.0049999999; epochTime=0.749923s
02/22/2017 21:20:41: SGD: Saving checkpoint model 'C:\Users\svcphil\AppData\Local\Temp\cntk-test-20170222211959.232036\Speech_LSTM_CTC@release_cpu/models/simple.dnn.4'
02/22/2017 21:20:41: Starting Epoch 5: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/22/2017 21:20:41: Starting minibatch loop.
02/22/2017 21:20:41: Finished Epoch[ 5 of 10]: [Training] cr = 464.94304435 * 248; Err = 1.00000000 * 248; totalSamplesSeen = 1422; learningRatePerSample = 0.0049999999; epochTime=0.369816s
02/22/2017 21:20:41: Finished Epoch[ 5 of 10]: [Training] cr = 383.36570691 * 248; Err = 1.00000000 * 248; totalSamplesSeen = 1422; learningRatePerSample = 0.0049999999; epochTime=0.369816s
02/22/2017 21:20:43: SGD: Saving checkpoint model 'C:\Users\svcphil\AppData\Local\Temp\cntk-test-20170222211959.232036\Speech_LSTM_CTC@release_cpu/models/simple.dnn.5'
02/22/2017 21:20:43: Starting Epoch 6: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/22/2017 21:20:43: Starting minibatch loop.
02/22/2017 21:20:43: Finished Epoch[ 6 of 10]: [Training] cr = 1.84468571 * 248; Err = 1.00000000 * 248; totalSamplesSeen = 1670; learningRatePerSample = 0.0049999999; epochTime=0.626039s
02/22/2017 21:20:43: Finished Epoch[ 6 of 10]: [Training] cr = 1.82054569 * 248; Err = 1.00000000 * 248; totalSamplesSeen = 1670; learningRatePerSample = 0.0049999999; epochTime=0.626039s
02/22/2017 21:20:43: SGD: Saving checkpoint model 'C:\Users\svcphil\AppData\Local\Temp\cntk-test-20170222211959.232036\Speech_LSTM_CTC@release_cpu/models/simple.dnn.6'
02/22/2017 21:20:44: Starting Epoch 7: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/22/2017 21:20:44: Starting minibatch loop.
02/22/2017 21:20:44: Finished Epoch[ 7 of 10]: [Training] cr = 1.71730425 * 358; Err = 1.00000000 * 358; totalSamplesSeen = 2028; learningRatePerSample = 0.0049999999; epochTime=0.607594s
02/22/2017 21:20:44: Finished Epoch[ 7 of 10]: [Training] cr = 1.70413839 * 358; Err = 1.00000000 * 358; totalSamplesSeen = 2028; learningRatePerSample = 0.0049999999; epochTime=0.607594s
02/22/2017 21:20:44: SGD: Saving checkpoint model 'C:\Users\svcphil\AppData\Local\Temp\cntk-test-20170222211959.232036\Speech_LSTM_CTC@release_cpu/models/simple.dnn.7'
02/22/2017 21:20:44: Starting Epoch 8: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
@ -515,13 +515,13 @@ Here are the ones that don't share memory:
02/22/2017 21:20:45: Starting Epoch 9: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/22/2017 21:20:45: Starting minibatch loop.
02/22/2017 21:20:46: Finished Epoch[ 9 of 10]: [Training] cr = 1.20227734 * 308; Err = 1.00000000 * 308; totalSamplesSeen = 2336; learningRatePerSample = 0.0049999999; epochTime=0.754015s
02/22/2017 21:20:46: Finished Epoch[ 9 of 10]: [Training] cr = 1.19612221 * 308; Err = 1.00000000 * 308; totalSamplesSeen = 2336; learningRatePerSample = 0.0049999999; epochTime=0.754015s
02/22/2017 21:20:46: SGD: Saving checkpoint model 'C:\Users\svcphil\AppData\Local\Temp\cntk-test-20170222211959.232036\Speech_LSTM_CTC@release_cpu/models/simple.dnn.9'
02/22/2017 21:20:46: Starting Epoch 10: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/22/2017 21:20:46: Starting minibatch loop.
02/22/2017 21:20:47: Finished Epoch[10 of 10]: [Training] cr = 1.33477251 * 608; Err = 1.00000000 * 608; totalSamplesSeen = 2944; learningRatePerSample = 0.0049999999; epochTime=1.0302s
02/22/2017 21:20:47: Finished Epoch[10 of 10]: [Training] cr = 1.33511985 * 608; Err = 1.00000000 * 608; totalSamplesSeen = 2944; learningRatePerSample = 0.0049999999; epochTime=1.0302s
02/22/2017 21:20:47: SGD: Saving checkpoint model 'C:\Users\svcphil\AppData\Local\Temp\cntk-test-20170222211959.232036\Speech_LSTM_CTC@release_cpu/models/simple.dnn'
02/22/2017 21:20:48: Action "train" complete.

Просмотреть файл

@ -340,13 +340,13 @@ Memory Sharing: Out of 200 matrices, 71 are shared as 27, and 129 are not shared
02/18/2017 09:28:11: Starting Epoch 1: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/18/2017 09:28:11: Starting minibatch loop.
02/18/2017 09:28:13: Finished Epoch[ 1 of 10]: [Training] cr = 4.15554279 * 368; Err = 2.48529418 * 368; totalSamplesSeen = 368; learningRatePerSample = 0.0049999999; epochTime=1.03913s
02/18/2017 09:28:13: Finished Epoch[ 1 of 10]: [Training] cr = 4.16293501 * 368; Err = 2.52941181 * 368; totalSamplesSeen = 368; learningRatePerSample = 0.0049999999; epochTime=1.03913s
02/18/2017 09:28:13: SGD: Saving checkpoint model 'C:\Users\svcphil\AppData\Local\Temp\cntk-test-20170218092651.300722\Speech_LSTM_CTC@release_gpu/models/simple.dnn.1'
02/18/2017 09:28:13: Starting Epoch 2: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/18/2017 09:28:13: Starting minibatch loop.
02/18/2017 09:28:14: Finished Epoch[ 2 of 10]: [Training] cr = 3.68123735 * 438; Err = 1.00000000 * 438; totalSamplesSeen = 806; learningRatePerSample = 0.0049999999; epochTime=1.33313s
02/18/2017 09:28:14: Finished Epoch[ 2 of 10]: [Training] cr = 3.68804068 * 438; Err = 1.00000000 * 438; totalSamplesSeen = 806; learningRatePerSample = 0.0049999999; epochTime=1.33313s
02/18/2017 09:28:14: SGD: Saving checkpoint model 'C:\Users\svcphil\AppData\Local\Temp\cntk-test-20170218092651.300722\Speech_LSTM_CTC@release_gpu/models/simple.dnn.2'
02/18/2017 09:28:14: Starting Epoch 3: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
@ -358,25 +358,25 @@ Memory Sharing: Out of 200 matrices, 71 are shared as 27, and 129 are not shared
02/18/2017 09:28:14: Starting Epoch 4: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/18/2017 09:28:14: Starting minibatch loop.
02/18/2017 09:28:15: Finished Epoch[ 4 of 10]: [Training] cr = 2.09130859 * 368; Err = 1.00000000 * 368; totalSamplesSeen = 1174; learningRatePerSample = 0.0049999999; epochTime=1.03006s
02/18/2017 09:28:15: Finished Epoch[ 4 of 10]: [Training] cr = 2.14839438 * 368; Err = 1.00000000 * 368; totalSamplesSeen = 1174; learningRatePerSample = 0.0049999999; epochTime=1.03006s
02/18/2017 09:28:16: SGD: Saving checkpoint model 'C:\Users\svcphil\AppData\Local\Temp\cntk-test-20170218092651.300722\Speech_LSTM_CTC@release_gpu/models/simple.dnn.4'
02/18/2017 09:28:16: Starting Epoch 5: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/18/2017 09:28:16: Starting minibatch loop.
02/18/2017 09:28:16: Finished Epoch[ 5 of 10]: [Training] cr = 464.94795867 * 248; Err = 1.00000000 * 248; totalSamplesSeen = 1422; learningRatePerSample = 0.0049999999; epochTime=0.691575s
02/18/2017 09:28:16: Finished Epoch[ 5 of 10]: [Training] cr = 383.36923513 * 248; Err = 1.00000000 * 248; totalSamplesSeen = 1422; learningRatePerSample = 0.0049999999; epochTime=0.691575s
02/18/2017 09:28:16: SGD: Saving checkpoint model 'C:\Users\svcphil\AppData\Local\Temp\cntk-test-20170218092651.300722\Speech_LSTM_CTC@release_gpu/models/simple.dnn.5'
02/18/2017 09:28:17: Starting Epoch 6: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/18/2017 09:28:17: Starting minibatch loop.
02/18/2017 09:28:17: Finished Epoch[ 6 of 10]: [Training] cr = 1.84468029 * 248; Err = 1.00000000 * 248; totalSamplesSeen = 1670; learningRatePerSample = 0.0049999999; epochTime=0.722621s
02/18/2017 09:28:17: Finished Epoch[ 6 of 10]: [Training] cr = 1.82059245 * 248; Err = 1.00000000 * 248; totalSamplesSeen = 1670; learningRatePerSample = 0.0049999999; epochTime=0.722621s
02/18/2017 09:28:17: SGD: Saving checkpoint model 'C:\Users\svcphil\AppData\Local\Temp\cntk-test-20170218092651.300722\Speech_LSTM_CTC@release_gpu/models/simple.dnn.6'
02/18/2017 09:28:18: Starting Epoch 7: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/18/2017 09:28:18: Starting minibatch loop.
02/18/2017 09:28:19: Finished Epoch[ 7 of 10]: [Training] cr = 1.71730442 * 358; Err = 1.00000000 * 358; totalSamplesSeen = 2028; learningRatePerSample = 0.0049999999; epochTime=1.0906s
02/18/2017 09:28:19: Finished Epoch[ 7 of 10]: [Training] cr = 1.70416839 * 358; Err = 1.00000000 * 358; totalSamplesSeen = 2028; learningRatePerSample = 0.0049999999; epochTime=1.0906s
02/18/2017 09:28:19: SGD: Saving checkpoint model 'C:\Users\svcphil\AppData\Local\Temp\cntk-test-20170218092651.300722\Speech_LSTM_CTC@release_gpu/models/simple.dnn.7'
02/18/2017 09:28:19: Starting Epoch 8: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
@ -388,13 +388,13 @@ Memory Sharing: Out of 200 matrices, 71 are shared as 27, and 129 are not shared
02/18/2017 09:28:19: Starting Epoch 9: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/18/2017 09:28:19: Starting minibatch loop.
02/18/2017 09:28:20: Finished Epoch[ 9 of 10]: [Training] cr = 1.20227705 * 308; Err = 1.00000000 * 308; totalSamplesSeen = 2336; learningRatePerSample = 0.0049999999; epochTime=0.945437s
02/18/2017 09:28:20: Finished Epoch[ 9 of 10]: [Training] cr = 1.19614064 * 308; Err = 1.00000000 * 308; totalSamplesSeen = 2336; learningRatePerSample = 0.0049999999; epochTime=0.945437s
02/18/2017 09:28:20: SGD: Saving checkpoint model 'C:\Users\svcphil\AppData\Local\Temp\cntk-test-20170218092651.300722\Speech_LSTM_CTC@release_gpu/models/simple.dnn.9'
02/18/2017 09:28:20: Starting Epoch 10: learning rate per sample = 0.005000 effective momentum = 0.900000 momentum as time constant = 189.8 samples
02/18/2017 09:28:20: Starting minibatch loop.
02/18/2017 09:28:22: Finished Epoch[10 of 10]: [Training] cr = 1.33477301 * 608; Err = 1.00000000 * 608; totalSamplesSeen = 2944; learningRatePerSample = 0.0049999999; epochTime=1.86033s
02/18/2017 09:28:22: Finished Epoch[10 of 10]: [Training] cr = 1.33511915 * 608; Err = 1.00000000 * 608; totalSamplesSeen = 2944; learningRatePerSample = 0.0049999999; epochTime=1.86033s
02/18/2017 09:28:22: SGD: Saving checkpoint model 'C:\Users\svcphil\AppData\Local\Temp\cntk-test-20170218092651.300722\Speech_LSTM_CTC@release_gpu/models/simple.dnn'
02/18/2017 09:28:22: Action "train" complete.

Просмотреть файл

@ -48,7 +48,7 @@ speechTrain = [
input = [
labels = [
alias = "l"
dim = 132
dim = 133
format = "sparse"
]
]
@ -98,7 +98,7 @@ speechTrain = [
// define basic I/O
baseFeatDim = 33
featDim = 11 * baseFeatDim
labelDim = 132
labelDim = 133
// hidden dimensions
cellDim = 1024

Просмотреть файл

@ -498,7 +498,7 @@ Test module "V2LibraryTests" has passed with:
Test case "TensorSuite/TensorPlusRightOperandWithoutAxes" has passed
Test suite "UserDefinedFunctionSuite" has passed with:
4 test cases out of 4 passed
5 test cases out of 5 passed
Test case "UserDefinedFunctionSuite/DuplicateVariablesInCPU" has passed
@ -508,6 +508,8 @@ Test module "V2LibraryTests" has passed with:
Test case "UserDefinedFunctionSuite/TimesAndPlusInGPU" has passed
Test case "UserDefinedFunctionSuite/UserTimesFunctionExample" has passed
Test suite "ValueSuite" has passed with:
30 test cases out of 30 passed
332 assertions out of 332 passed

Просмотреть файл

@ -46,22 +46,20 @@ fi
pushd $TestDataDir
# Note: Run the device selection tests first since later tests may interfere with
# device selection by freezing default device
if [ "$OS" == "Windows_NT" ]; then
TEST_BINARY=V2LibraryTests.exe
else
TEST_BINARY=v2librarytests
fi
# Note: Run the device selection suite first since later tests may interfere with
# device selection by freezing default device
$TEST_BIN_DIR/$TEST_BINARY --report_level=detailed --run_test=DeviceSelectionSuite
ExitCode1=$?
ExitCode=$?
$TEST_BIN_DIR/$TEST_BINARY --report_level=detailed --run_test=!DeviceSelectionSuite
ExitCode2=$?
(( ExitCode1 != 0)) && ExitCode=$ExitCode1
(( ExitCode2 != 0)) && ExitCode=$ExitCode2
(( ExitCode1 == ExitCode2)) && ExitCode=$ExitCode1
(( ExitCode2 > ExitCode )) && ExitCode=$ExitCode2
# Delete the test data
popd

Просмотреть файл

@ -60,7 +60,7 @@ BOOST_AUTO_TEST_CASE(CheckModelVersion)
// This is a watch guard to make sure that any change in the model version will be detected.
// If you change the CNTK model version, please do not silently adapt this test.
// Instead, please do notify the CNTK release team (AlexeyO, Wolfgang, Zhou, Mark) to prepare required steps for the next release.
BOOST_REQUIRE_MESSAGE(CURRENT_CNTK_MODEL_VERSION == 19, "The model version has been changed. Before making changes in this test, please first notify the CNTK release team to prepare required steps in the next release. Thanks!\n");
BOOST_REQUIRE_MESSAGE(CURRENT_CNTK_MODEL_VERSION == 20, "The model version has been changed. Before making changes in this test, please first notify the CNTK release team to prepare required steps in the next release. Thanks!\n");
}
BOOST_AUTO_TEST_CASE(EvalConstantPlusTest)

Просмотреть файл

@ -326,7 +326,9 @@ void CheckEnumValuesNotModified() {
static_cast<size_t>(PrimitiveOpType::EditDistanceError) == 61 &&
static_cast<size_t>(PrimitiveOpType::NoOp) == 62 &&
static_cast<size_t>(PrimitiveOpType::LabelsToGraph) == 63 &&
static_cast<size_t>(PrimitiveOpType::StopGradient) == 64,
static_cast<size_t>(PrimitiveOpType::StopGradient) == 64 &&
static_cast<size_t>(PrimitiveOpType::ELU) == 65 &&
static_cast<size_t>(PrimitiveOpType::ForwardBackward) == 66,
"PrimitiveOpType enum value was modified.");
}

Просмотреть файл

@ -6,6 +6,7 @@
#include "CNTKLibrary.h"
#include <functional>
#include "Common.h"
#include "UserMatrixMultiplicationOp.h"
using namespace CNTK;
// TODO: Need to further cleanup/simplify definition of user defined functions
@ -391,6 +392,11 @@ BOOST_AUTO_TEST_CASE(TimesAndPlusInGPU)
}
}
BOOST_AUTO_TEST_CASE(UserTimesFunctionExample)
{
UserTimesFunctionExample();
}
BOOST_AUTO_TEST_SUITE_END()
}}

Просмотреть файл

@ -61,7 +61,7 @@
<SDLCheck>true</SDLCheck>
<UseFullPaths>true</UseFullPaths>
<OpenMPSupport>true</OpenMPSupport>
<AdditionalIncludeDirectories>$(SolutionDir)Source\CNTKv2LibraryDll\API;$(SolutionDir)Source\CNTKv2LibraryDll\;$(BOOST_INCLUDE_PATH)</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>$(SolutionDir)Examples\Extensibility\CPP;$(SolutionDir)Source\CNTKv2LibraryDll\API;$(SolutionDir)Source\CNTKv2LibraryDll\;$(BOOST_INCLUDE_PATH)</AdditionalIncludeDirectories>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>

Просмотреть файл

@ -42,9 +42,6 @@
<ClCompile Include="DeviceSelectionTests.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="Common.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="MinibatchSourceTest.cpp">
<Filter>Source Files</Filter>
</ClCompile>
@ -63,6 +60,9 @@
<ClCompile Include="LoadLegacyModelTests.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="..\..\EndToEndTests\CNTKv2Library\Common\Common.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="stdafx.h">
@ -74,7 +74,7 @@
<ClInclude Include="Common.h">
<Filter>Header Files</Filter>
</ClInclude>
<ClInclude Include="Image.h">
<ClInclude Include="..\..\EndToEndTests\CNTKv2Library\Common\Image.h">
<Filter>Header Files</Filter>
</ClInclude>
</ItemGroup>

Просмотреть файл

@ -33,12 +33,6 @@ If (-not $buildConfig) {Throw "buildConfig" + $usage}
If (-not $targetConfig) {Throw "targetConfig" + $usage}
If (-not $sharePath) {Throw "sharePath" + $usage}
# Set Verbose mode
If ($verbose)
{
$VerbosePreference = "continue"
}
Write-Verbose "Making binary drops..."
# If not a Release build quit
@ -84,10 +78,7 @@ Remove-Item $baseDropPath\cntk\*.lib -Exclude EvalDll.lib, CNTKLibrary-2.0.lib
Remove-Item $baseDropPath\cntk\*.exp
Remove-Item $baseDropPath\cntk\*.metagen
# Remove specific items
If (Test-Path $baseDropPath\cntk\CommandEval.exe)
{
Remove-Item $baseDropPath\cntk\CommandEval.exe
}
Remove-Item $baseDropPath\cntk\CommandEval.exe -Force -ErrorAction SilentlyContinue
Remove-Item $baseDropPath\cntk\Microsoft.VisualStudio.QualityTools.UnitTestFramework.*
# Make Include folder
@ -112,14 +103,8 @@ Copy-Item Tutorials -Recurse -Destination $baseDropPath\Tutorials
Write-Verbose "Copying Scripts ..."
Copy-Item Scripts -Recurse -Destination $baseDropPath\Scripts
# Remove some files if they exist
If (Test-Path $baseDropPath\Scripts\pytest.ini)
{
Remove-Item $baseDropPath\Scripts\pytest.ini
}
If (Test-Path $baseDropPath\Scripts\install\linux)
{
Remove-Item -Recurse $baseDropPath\Scripts\install\linux
}
Remove-Item $baseDropPath\Scripts\pytest.ini -Force -ErrorAction SilentlyContinue
Remove-Item -Recurse $baseDropPath\Scripts\install\linux -Force -ErrorAction SilentlyContinue
# Copy all items from the share
# For whatever reason Copy-Item in the line below does not work
@ -138,19 +123,24 @@ If ($LastExitCode -gt 7)
Write-Verbose "Making ZIP and cleaning up..."
# Make ZIP file
# Switched to use 7zip because of the backslash separator issue in .NET compressor
# (fixed in 4.6.1, which is not a standard component of build machines
# see https://msdn.microsoft.com/en-us/library/mt712573(v=vs.110).aspx?f=255&MSPPError=-2147217396 )
$workSpace = $PWD.Path
$source = Join-Path $PWD.Path -ChildPath $basePath
$destination = Join-Path $PWD.Path -ChildPath $outputPath
Add-Type -assembly "system.io.compression.filesystem"
[io.compression.zipfile]::CreateFromDirectory($source, $destination)
Set-Location -Path $source
7za a -bd $destination .
If ($LastExitCode -ne 0)
{
throw "7za returned exit code $LastExitCode"
}
Set-Location -Path $workSpace
# Log the file hash
Get-FileHash -Algorithm SHA256 -Path $destination, *.whl
# Remove ZIP sources
If (Test-Path $basePath)
{
Remove-Item $basePath -Recurse
}
Remove-Item -Recurse $basePath -Force -ErrorAction SilentlyContinue
# Return zero exit code code from here (N.B.: can be non-zero from robocopy above)
exit 0

Просмотреть файл

@ -170,7 +170,7 @@
"**Note** if we have less than 8 datapoints for a day we skip over the day assuming something is missing in the raw data. If we get more than 14 data points in a day we truncate the readings.\n",
"\n",
"## Training / Testing / Validation data preparation\n",
"The raw data is sorted by time and we should randomize it before splitting into training, validation and test datasets but this would make it impractical to visualize results in the tutorial. Hence, we split the dataset in the following manner: pick in sequence, 8 values for training, 1 for validation and 1 for test until there is no more data. This will spread training, validation and test datasets across the full timeline while preserving time order.\n"
"We start by reading the csv file for use with CNTK. The raw data is sorted by time and we should randomize it before splitting into training, validation and test datasets but this would make it impractical to visualize results in the tutorial. Hence, we split the dataset in the following manner: pick in sequence, 8 values for training, 1 for validation and 1 for test until there is no more data. This will spread training, validation and test datasets across the full timeline while preserving time order.\n"
]
},
{

Просмотреть файл

@ -63,6 +63,7 @@
%ignore_function CNTK::PlaceholderVariable;
%ignore_function CNTK::InputVariable;
%ignore_function CNTK::OutputVariable;
%ignore_function CNTK::Internal::AddProgressWriters;
%ignore_class CNTK::Variable::CompositeFunction;
%ignore_class CNTK::Variable::Trainer;
@ -237,6 +238,7 @@
%ignore_class CNTK::TrainingSession;
%ignore_function CNTK::CreateBasicTrainingSession;
%ignore_function CNTK::CreateTrainingSession;
%ignore_function CNTK::CreateDataParallelDistributedTrainer;
%ignore_function CNTK::CreateQuantizedDataParallelDistributedTrainer;

Просмотреть файл

@ -151,9 +151,25 @@ class Axis(cntk_py.Axis):
@typemap
def end_static_axis():
'''
Creates an Axis object representing the end (one past last) static axis.
DEPRECATED.
Creates an Axis object representing a new leading static axis.
Returns:
:class:`Axis`: axis object representing the end (one past last) static axis.
:class:`Axis`: axis object representing a new leading static axis.
'''
import warnings
warnings.warn('This will be removed in future versions. Please use '
'Axis.new_leading_axis() instead.', DeprecationWarning)
return cntk_py.Axis.end_static_axis()
@staticmethod
@typemap
def new_leading_axis():
'''
Creates an Axis object representing a new leading static axis.
Returns:
:class:`Axis`: axis object representing a new leading static axis.
'''
return cntk_py.Axis.end_static_axis()

Просмотреть файл

@ -16,6 +16,7 @@
%implicitconv CNTK::Variable;
%rename(_forward) CNTK::Function::Forward;
%rename(_add_progress_writers) CNTK::Internal::AddProgressWriters;
%rename(_backward) CNTK::Function::Backward;
%rename(_infer_outputs) CNTK::Function::InferOutputs;
%rename(_update) CNTK::Learner::Update;
@ -150,6 +151,7 @@
%ignore CNTK::Internal::IsRenamingFunctionsAllowed;
%ignore CNTK::Internal::IsAutomaticUnpackingOfPackedValuesDisabled;
%ignore CNTK::Internal::GetComputationNetworkTraceLevel;
%ignore CNTK::Internal::Convolution;
%ignore CNTK::Function::Function(const std::vector<Variable>& inputs, Dictionary&& functionConfig, const std::wstring& name = L"", const std::wstring& uid = Internal::GenerateUid(L"UserDefinedFunction"));
@ -1422,20 +1424,23 @@ std::unordered_map<CNTK::StreamInformation, std::pair<CNTK::NDArrayViewPtr, CNTK
//
%extend CNTK::NDArrayView {
NDArrayView(PyObject* pyobj, const CNTK::DeviceDescriptor& device, bool readOnly)
NDArrayView(PyObject* numpyArrayObject, const CNTK::DeviceDescriptor& device, bool readOnly, bool borrow)
{
if (!PyArray_Check((PyArrayObject*)pyobj))
if (!PyArray_Check((PyArrayObject*)numpyArrayObject))
{
// Note that in contrast to numpy.i's implementation we demand NumPy arrays
// and do not accept arbitrary sequences, which would needed to be copied around.
throw std::logic_error("NumPy array expected");
}
PyArrayObject* array = (PyArrayObject*)pyobj;
// Borrowing the memory is only allowed on CPU for now
borrow &= device == DeviceDescriptor::CPUDevice();
int rank = PyArray_NDIM(array);
npy_intp* np_shape = PyArray_SHAPE(array);
PyArrayObject* array = (PyArrayObject*)numpyArrayObject;
int rank = PyArray_NDIM(array);
npy_intp* np_shape = PyArray_SHAPE(array);
std::vector<size_t> shape(rank);
npy_intp num_elements = 1;
@ -1451,15 +1456,29 @@ std::unordered_map<CNTK::StreamInformation, std::pair<CNTK::NDArrayViewPtr, CNTK
NDArrayView* view;
if (typecode == NPY_FLOAT)
{
NDArrayView tmp(NDShape(shape), (float*)PyArray_DATA(array), num_elements, DeviceDescriptor::CPUDevice(), readOnly);
view = new NDArrayView(DataType::Float, tmp.Shape(), device);
view->CopyFrom(tmp);
if (borrow)
{
view = new NDArrayView(NDShape(shape), (float*)PyArray_DATA(array), num_elements, DeviceDescriptor::CPUDevice(), readOnly);
}
else
{
NDArrayView tmp(NDShape(shape), (float*)PyArray_DATA(array), num_elements, DeviceDescriptor::CPUDevice(), readOnly);
view = new NDArrayView(DataType::Float, tmp.Shape(), device);
view->CopyFrom(tmp);
}
}
else if (typecode == NPY_DOUBLE)
{
NDArrayView tmp(NDShape(shape), (double*)PyArray_DATA(array), num_elements, DeviceDescriptor::CPUDevice(), readOnly);
view = new NDArrayView(DataType::Double, tmp.Shape(), device);
view->CopyFrom(tmp);
if (borrow)
{
view = new NDArrayView(NDShape(shape), (double*)PyArray_DATA(array), num_elements, DeviceDescriptor::CPUDevice(), readOnly);
}
else
{
NDArrayView tmp(NDShape(shape), (double*)PyArray_DATA(array), num_elements, DeviceDescriptor::CPUDevice(), readOnly);
view = new NDArrayView(DataType::Double, tmp.Shape(), device);
view->CopyFrom(tmp);
}
}
else
{
@ -1469,7 +1488,8 @@ std::unordered_map<CNTK::StreamInformation, std::pair<CNTK::NDArrayViewPtr, CNTK
return view;
}
NDArrayView(const CNTK::NDShape& shape, PyObject* pyData, PyObject* pyColStarts, PyObject* pyRowIndices, const CNTK::DeviceDescriptor& device, bool readOnly)
NDArrayView(const CNTK::NDShape& shape, PyObject* pyData, PyObject* pyColStarts, PyObject* pyRowIndices, const CNTK::DeviceDescriptor& device, bool readOnly, bool borrow)
{
//
// pyData, pyColStarts, and pyRowIndices are fed by
@ -1491,33 +1511,58 @@ std::unordered_map<CNTK::StreamInformation, std::pair<CNTK::NDArrayViewPtr, CNTK
throw std::logic_error("index pointers must be a NumPy array");
}
// Borrowing the memory is only allowed on CPU for now
borrow &= device == DeviceDescriptor::CPUDevice();
PyArrayObject* data = (PyArrayObject*)pyData;
PyArrayObject* indices = (PyArrayObject*)pyColStarts;
PyArrayObject* indptr = (PyArrayObject*)pyRowIndices;
int typecode = PyArray_TYPE(data);
size_t numNonZeroValues = PyArray_SIZE(data);
NDArrayView* view;
if (typecode == NPY_FLOAT)
{
NDArrayView tmp(shape,
(CNTK::SparseIndexType*)PyArray_DATA(indices),
(CNTK::SparseIndexType*)PyArray_DATA(indptr),
(float*)PyArray_DATA(data), numNonZeroValues,
DeviceDescriptor::CPUDevice(), readOnly);
view = new NDArrayView(DataType::Float, StorageFormat::SparseCSC, tmp.Shape(), device);
view->CopyFrom(tmp);
if (borrow)
{
view = new NDArrayView(shape,
(CNTK::SparseIndexType*)PyArray_DATA(indices),
(CNTK::SparseIndexType*)PyArray_DATA(indptr),
(float*)PyArray_DATA(data), numNonZeroValues,
DeviceDescriptor::CPUDevice(), readOnly);
}
else
{
NDArrayView tmp(shape,
(CNTK::SparseIndexType*)PyArray_DATA(indices),
(CNTK::SparseIndexType*)PyArray_DATA(indptr),
(float*)PyArray_DATA(data), numNonZeroValues,
DeviceDescriptor::CPUDevice(), readOnly);
view = new NDArrayView(DataType::Float, StorageFormat::SparseCSC, tmp.Shape(), device);
view->CopyFrom(tmp);
}
}
else if (typecode == NPY_DOUBLE)
{
NDArrayView tmp(shape,
(CNTK::SparseIndexType*)PyArray_DATA(indices),
(CNTK::SparseIndexType*)PyArray_DATA(indptr),
(double*)PyArray_DATA(data), numNonZeroValues,
DeviceDescriptor::CPUDevice(), readOnly);
view = new NDArrayView(DataType::Double, StorageFormat::SparseCSC, tmp.Shape(), device);
view->CopyFrom(tmp);
if (borrow)
{
view = new NDArrayView(shape,
(CNTK::SparseIndexType*)PyArray_DATA(indices),
(CNTK::SparseIndexType*)PyArray_DATA(indptr),
(double*)PyArray_DATA(data), numNonZeroValues,
DeviceDescriptor::CPUDevice(), readOnly);
}
else
{
NDArrayView tmp(shape,
(CNTK::SparseIndexType*)PyArray_DATA(indices),
(CNTK::SparseIndexType*)PyArray_DATA(indptr),
(double*)PyArray_DATA(data), numNonZeroValues,
DeviceDescriptor::CPUDevice(), readOnly);
view = new NDArrayView(DataType::Double, StorageFormat::SparseCSC, tmp.Shape(), device);
view->CopyFrom(tmp);
}
}
else
{
@ -1607,6 +1652,7 @@ namespace CNTK {
Py_END_ALLOW_THREADS;
}
//
// Setting up hash calculation so that __hash__ on Swig objects
// are redirected to the std::hash computation of the C++ API
@ -1619,12 +1665,32 @@ namespace CNTK {
}
%enddef
//
// Setting __str__ and __repr__ methods for frequently used Swig objects
//
%define %py_repr_for(TYPE)
%extend CNTK::TYPE {
const std::wstring __str__() {
return self->AsString();
}
const std::wstring __repr__() {
return self->AsString();
}
}
%enddef
%define %py_eq_for(DATA_TYPE, EQ)
%pythoncode %{
DATA_TYPE.__eq__ = lambda a,b: (a is not None and b is not None and EQ(a,b)) or (a is None and b is None)
%}
%enddef
%py_repr_for(Variable)
%py_repr_for(Parameter)
%py_repr_for(Constant)
%py_repr_for(Function)
%py_eq_for(Variable, Variable_eq)
%py_hash_for(Variable)

Просмотреть файл

@ -8,26 +8,29 @@ import numpy as np
from scipy import sparse
from . import cntk_py
from .device import use_default_device, cpu
from .device import use_default_device, cpu, DeviceKind
from .utils.swig_helper import typemap
def _is_c_contiguous(data):
while isinstance(data, list):
data = data[0]
return data.flags.c_contiguous
class NDArrayView(cntk_py.NDArrayView):
'''
Creates an empty dense internal data representation of a :class:`~cntk.core.Value` object.
Creates an empty dense internal data representation of a
:class:`~cntk.core.Value` object.
To create an NDArrayView from a NumPy array, use :meth:`from_dense`.
To create an NDArrayView from a sparse array, use :meth:`from_csr`.
Args:
shape (tuple): shape of the data
data_type (np.float32, np.float64): data type of the data
device (:class:`~cntk.device.DeviceDescriptor`): device this value should be put
on
device (:class:`~cntk.device.DeviceDescriptor`): device this value
should be put on
'''
def __init__(self, shape, data_type, device=None):
@ -37,39 +40,44 @@ class NDArrayView(cntk_py.NDArrayView):
if device is None:
device = use_default_device()
super(NDArrayView, self).__init__(data_type, cntk_py.StorageFormat_Dense, shape,
device)
device)
@staticmethod
@typemap
def from_dense(np_array, device=None, read_only=False):
def from_dense(np_array, device=None, read_only=False, borrow=False):
'''
Create a :class:`NDArrayView` instance from a NumPy array.
Args:
np_array (numpy.ndarray): NumPy array
device (:class:`~cntk.device.DeviceDescriptor`): device this value should be put
on
read_only (bool): whether the data can be modified or not
device (:class:`~cntk.device.DeviceDescriptor`): device this value
should be put on
borrow (bool, default False): whether nd_arrary memory can be
borrowed internally to speed up the data creation
read_only (bool, optional): whether the data can be modified or
not (default False)
Returns:
:class:`NDArrayView` instance
'''
if not isinstance(np_array, np.ndarray):
raise TypeError('data must be of type numpy.ndarray'
' and not %s'%type(np_array))
' and not %s' % type(np_array))
if not _is_c_contiguous(np_array):
warnings.warn('data is not C contiguous; rearrange your data/computation to avoid costly data conversions', RuntimeWarning)
warnings.warn('data is not C contiguous; rearrange your '
'data/computation to avoid costly data conversions',
RuntimeWarning)
np_array = np.ascontiguousarray(np_array)
if device is None:
device = use_default_device()
return cntk_py.NDArrayView(np_array, device, read_only)
return cntk_py.NDArrayView(np_array, device, read_only, borrow)
@staticmethod
@typemap
def from_csr(csr_array, device=None, read_only=False):
def from_csr(csr_array, device=None, read_only=False, borrow=False):
'''
Create a :class:`NDArrayView` instance from a SciPy sparse array in CSR
format.
@ -77,35 +85,42 @@ class NDArrayView(cntk_py.NDArrayView):
Args:
csr_array (scipy.sparse.csr.csr_matrix): SciPy sparse matrix in CSR
format
device (:class:`~cntk.device.DeviceDescriptor`): device this value should be put
on
read_only (bool): whether the data can be modified or not
device (:class:`~cntk.device.DeviceDescriptor`): device this value
should be put on
read_only (bool, optional): whether the data can be modified or
not (default False)
borrow (bool, default False): whether nd_arrary memory can be
borrowed internally to speed up the data creation
Returns:
:class:`NDArrayView` instance
'''
if not sparse.isspmatrix_csr(csr_array):
raise TypeError("only CSR is supported as of now. Please "
"convert your data using 'tocsr()'")
"convert your data using 'tocsr()'")
if device is None:
device = use_default_device()
return cntk_py.NDArrayView(csr_array.shape, csr_array.data,
csr_array.indptr, csr_array.indices, device, read_only)
csr_array.indptr, csr_array.indices, device,
read_only, borrow)
@staticmethod
@typemap
def from_data(data, device=None, read_only=False):
def from_data(data, device=None, read_only=False, borrow=False):
'''
Create a :class:`NDArrayView` instance from a NumPy or SciPy sparse array in CSR
format.
Create a :class:`NDArrayView` instance from a NumPy or SciPy sparse
array in CSR format.
Args:
data (numpy.ndarray or scipy.sparse.csr.csr_matrix): data
device (:class:`~cntk.device.DeviceDescriptor`): device this value should be put
on
read_only (bool): whether the data can be modified or not
device (:class:`~cntk.device.DeviceDescriptor`): device this value
should be put on
read_only (bool, optional): whether the data can be modified or
not (default False)
borrow (bool, default False): whether nd_array memory can be
borrowed internally to speed up the data creation
Returns:
:class:`NDArrayView` instance
@ -117,17 +132,17 @@ class NDArrayView(cntk_py.NDArrayView):
data = np.asarray(data)
if isinstance(data, np.ndarray):
ndav = NDArrayView.from_dense(data, device)
ndav = NDArrayView.from_dense(data, device, borrow=borrow)
elif sparse.issparse(data):
ndav = NDArrayView.from_csr(data, device)
ndav = NDArrayView.from_csr(data, device, borrow=borrow)
else:
raise TypeError('data type "%s" is not supported. Please '
'provide the data as a Python list of NumPy arrays '
'or Scipy CSR matrices.'%type(data))
'provide the data as a Python list of NumPy '
'arrays or Scipy CSR matrices.' % type(data))
return ndav
class Value(cntk_py.Value):
'''
Internal representation of minibatch data.
@ -147,9 +162,10 @@ class Value(cntk_py.Value):
Booleans that tell whether a sequence is a new sequence (`True`) or a
continuation of the sequence in the same slot of the previous
minibatch (`False`)
device (:class:`~cntk.device.DeviceDescriptor`): device this value should be put
on
device (:class:`~cntk.device.DeviceDescriptor`): device this value
should be put on
'''
def __init__(self, shape=None, dtype=None, batch=None, seq_starts=None, device=None):
if device is None:
device = use_default_device()
@ -178,12 +194,13 @@ class Value(cntk_py.Value):
sample = np.asarray(sample, dtype=var.dtype)
except ValueError:
s = sample
while isinstance(s, list) and len(s)>0:
while isinstance(s, list) and len(s) > 0:
s = s[0]
if sparse.issparse(s):
raise ValueError('if you provide sparse data, every '
'sequence has to be encoded as one '
'csr_matrix instance. Your sequence was: \'%s\''%str(sample))
'sequence has to be encoded as one '
'csr_matrix instance. Your sequence '
'was: \'%s\'' % str(sample))
else:
raise
@ -203,10 +220,10 @@ class Value(cntk_py.Value):
'supported, you gave %s' % sample.dtype)
if convert_to_var_dtype:
warnings.warn('your data is of type "%s", but your input'
'expects "%s". Please convert your data '
'beforehand to speed up training.' %
(sample.dtype, str(var.dtype)))
warnings.warn('your data is of type "%s", but your input '
'variable (uid "%s") expects "%s". Please convert '
'your data beforehand to speed up training.' %
(sample.dtype, var.uid, str(var.dtype)))
sample = sample.astype(var.dtype)
return sample
@ -238,24 +255,32 @@ class Value(cntk_py.Value):
:class:`~cntk.core.Value` object.
'''
if not isinstance(var, cntk_py.Variable):
raise TypeError('Variable expected, but got "%s"'%type(var))
cpu_dev = cpu()
raise TypeError('Variable expected, but got "%s"' % type(var))
if not var.dynamic_axes:
# No dynamic axes -> no batch
# No dynamic axes -> we can pass everything in one go
data = Value._as_best_data_type(var, data)
ndav = NDArrayView.from_data(data, device)
# Since the core API's Value does not copy single NDArrayViews,
# we cannot borrow the memory here.
ndav = NDArrayView.from_data(data, device=cpu(), borrow=False)
return cntk_py.Value(ndav)
elif len(var.dynamic_axes) <= 1 and isinstance(data, list):
warnings.warn('you provided the minibatch data as a list, but '
'your corresponding input variable (uid "%s") has '
'only one dynamic axis (batch axis). To speed up '
'graph execution, please convert the data '
'beforehand into one NumPy array.' % var.uid)
if isinstance(data, np.ndarray):
# The outermost axis has to be a Python list. If the user passes a
# full minibatch as one NumPy array, we have to convert it.
if data.dtype == object:
raise ValueError('dtype object is not supported. If this is a batch '
'of sequences, you need to pass them as a pure-Python list '
'of NumPy arrays')
raise ValueError('dtype object is not supported. If this is a '
'batch of sequences, you need to pass them as a '
'pure-Python list of NumPy arrays')
if seq_starts:
data = list(np.atleast_1d(data))
@ -267,25 +292,30 @@ class Value(cntk_py.Value):
if not isinstance(data, list):
raise ValueError('batch has to be a list of NumPy arrays or '
'SciPy CSR matrices')
list_of_ndavs = []
'SciPy CSR matrices')
# NDArrayViews are all created on CPU. The Value object later then will
# move it to the requested device.
for sample in data:
sample = Value._as_best_data_type(var, sample)
ndav = NDArrayView.from_data(sample, cpu_dev)
list_of_ndavs.append(ndav)
# As Value will later create copies anyways, we do not create copies in
# NDArrayView itself. Because of that, we need to keep around the
# instances _as_best_data_type() until we have passed them to
# Value_create() where it will be copied further.
data = [Value._as_best_data_type(var, sample) for sample in data]
borrow = device.type() == DeviceKind.CPU
list_of_ndavs = [NDArrayView.from_data(sample, device=cpu(),
borrow=borrow)
for sample in data]
from .utils import sanitize_shape
return cntk_py.Value_create(
sanitize_shape(var.shape), list_of_ndavs,
seq_starts or [],
device or use_default_device(),
read_only)
value = cntk_py.Value_create(
sanitize_shape(var.shape),
list_of_ndavs,
seq_starts or [],
device or use_default_device(),
read_only,
True) # always create a copy in Value
return value
@property
def shape(self):
@ -315,13 +345,13 @@ class Value(cntk_py.Value):
'''
return np.asarray(super(Value, self).mask())
def __len__(self):
'''
Number of samples in this value object.
'''
return self.shape[0]
def user_function(user_func):
'''
Wraps the passed Function to create a composite representing the

Просмотреть файл

@ -4,8 +4,20 @@
# for full license information.
# ==============================================================================
from enum import Enum, unique
from . import cntk_py
@unique
class DeviceKind(Enum):
'''
Describes different device kinds like CPU or GPU.
'''
CPU = cntk_py.DeviceKind_CPU
GPU = cntk_py.DeviceKind_GPU
class DeviceDescriptor(cntk_py.DeviceDescriptor):
'''
Describes a device by a unique id and its type. If the device corresponds to a GPU its type is 1,
@ -30,6 +42,7 @@ class DeviceDescriptor(cntk_py.DeviceDescriptor):
'''
return super(DeviceDescriptor, self).type()
def all_devices():
'''
Returns a device descriptor list with all the available devices
@ -39,6 +52,7 @@ def all_devices():
'''
return cntk_py.DeviceDescriptor.all_devices()
def best():
'''
Returns a device descriptor with the best configuration.
@ -48,6 +62,7 @@ def best():
'''
return cntk_py.DeviceDescriptor.best_device()
def cpu():
'''
Returns CPU device descriptor
@ -57,6 +72,7 @@ def cpu():
'''
return cntk_py.DeviceDescriptor.cpu_device()
def default():
'''
Returns default device
@ -66,6 +82,7 @@ def default():
'''
return cntk_py.DeviceDescriptor.default_device()
def gpu(device_id):
'''
Returns GPU device
@ -75,6 +92,7 @@ def gpu(device_id):
'''
return cntk_py.DeviceDescriptor.gpu_device(device_id)
def use_default_device():
'''
Use default device
@ -84,6 +102,7 @@ def use_default_device():
'''
return cntk_py.DeviceDescriptor.use_default_device()
def set_default_device(new_default_device):
'''
Set new device descriptor as default

Просмотреть файл

@ -234,6 +234,26 @@ class MinibatchSource(cntk_py.MinibatchSource):
'''
return super(MinibatchSource, self).is_distributed()
@property
def current_position(self):
'''
Gets current position in the minibatch source.
Returns:
Minibatch position :class:`~cntk.cntk_py.Dictionary` on the global timeline.
'''
return self.get_checkpoint_state()
@current_position.setter
def current_position(self, position):
'''
Sets current position in the minibatch source.
Args:
position (:class:`~cntk.cntk_py.Dictionary`): position returned from :func:`~get_current_position`.
'''
self.restore_from_checkpoint(position)
def _py_dict_to_cntk_dict(py_dict):
'''
Converts a Python dictionary into a CNTK Dictionary whose values are CNTK DictionaryValue instances.
@ -455,25 +475,25 @@ def StreamDef(field=None, shape=None, is_sparse=False, transforms=None, context=
Args:
field (str): this is the name of the stream:
* for CTFDeserializer the name is inside the CTF file
* for ImageDeserializer the acceptable names are `image` or `label`
* for HTKFeatureDeserializer and HTKMLFDeserializer only the default
* for HTKFeatureDeserializer and HTKMLFDeserializer only the default
value of None is acceptable
shape (int, tuple): dimensions of this stream. HTKFeatureDeserializer,
shape (int, tuple): dimensions of this stream. HTKFeatureDeserializer,
HTKMLFDeserializer, and CTFDeserializer read data
as flat arrays. If you need different shapes you can
:func:`~cntk.ops.reshape` it later.
is_sparse (bool): whether the provided data is sparse.
`False` by default, unless mlf is provided.
transforms (list): list of transforms to be applied by the Deserializer.
transforms (list): list of transforms to be applied by the Deserializer.
Currently only ImageDeserializer supports transforms.
context (tuple): left and right context to consider when reading in HTK
context (tuple): left and right context to consider when reading in HTK
data. Only supported by HTKFeatureDeserializer.
scp (str, list): scp files for HTK data
mlf (str, list): mlf files for HTK data
broadcast (bool): whether the features in this stream should be
broadcast (bool): whether the features in this stream should be
broadcast to the whole sequence (useful in e.g. ivectors with HTK)
'''
config = dict(stream_alias=field, is_sparse=is_sparse)
@ -592,5 +612,3 @@ def sequence_to_cntk_text_format(seq_idx, alias_tensor_map):
lines.append('%i\t|' % seq_idx + ' |'.join(line))
return '\n'.join(lines)

Просмотреть файл

@ -12,7 +12,7 @@ import numpy as np
from ..ops.functions import Function
from ..ops.variables import Variable
from ..ops import parameter, input_variable, placeholder_variable, combine
from ..ops import times, element_times, convolution, pooling, unpooling, batch_normalization, dropout, splice, reshape, sequence, softmax, tanh, reduce_sum, reduce_mean, sqrt
from ..ops import times, element_times, convolution, convolution_transpose, pooling, unpooling, batch_normalization, dropout, splice, reshape, sequence, softmax, tanh, reduce_sum, reduce_mean, sqrt
from ..utils import Record, _as_tuple
from .blocks import *
from .blocks import _initializer_for, _get_initial_state_or_default, _INFERRED # helpers
@ -443,7 +443,6 @@ def Convolution1D(rf_shape, # shape of receptive field, e.g. (3)
init=default_override_or(glorot_uniform()),
pad=default_override_or(False),
strides=1,
sharing=True, # (must be True currently)
bias=default_override_or(True),
init_bias=default_override_or(0),
reduction_rank=1, # (0 means input has no depth dimension, e.g. audio signal or B&W image)
@ -460,7 +459,7 @@ def Convolution1D(rf_shape, # shape of receptive field, e.g. (3)
init_bias = get_default_override(Convolution1D, init_bias=init_bias)
if len(_as_tuple(rf_shape)) != 1:
raise ValueError('Convolution1D: rf_shape must be a scalar')
return Convolution(rf_shape, num_filters=num_filters, activation=activation, init=init, pad=pad, strides=strides, sharing=sharing, bias=bias, init_bias=init_bias, reduction_rank=reduction_rank, op_name='Convolution1D', name=name)
return Convolution(rf_shape, num_filters=num_filters, activation=activation, init=init, pad=pad, strides=strides, sharing=True, bias=bias, init_bias=init_bias, reduction_rank=reduction_rank, op_name='Convolution1D', name=name)
def Convolution2D(rf_shape, # shape of receptive field, e.g. (3,3). Must be a 2-element tuple.
@ -469,7 +468,6 @@ def Convolution2D(rf_shape, # shape of receptive field, e.g. (3,3). Must
init=default_override_or(glorot_uniform()),
pad=default_override_or(False),
strides=1,
sharing=True, # (must be True currently)
bias=default_override_or(True),
init_bias=default_override_or(0),
reduction_rank=1, # (0 means input has no depth dimension, e.g. audio signal or B&W image)
@ -486,7 +484,7 @@ def Convolution2D(rf_shape, # shape of receptive field, e.g. (3,3). Must
init_bias = get_default_override(Convolution2D, init_bias=init_bias)
if len(rf_shape) != 2:
raise ValueError('Convolution2D: rf_shape must be a 2D tuple, e.g. (3,3)')
return Convolution(rf_shape, num_filters=num_filters, activation=activation, init=init, pad=pad, strides=strides, sharing=sharing, bias=bias, init_bias=init_bias, reduction_rank=reduction_rank, op_name='Convolution2D', name=name)
return Convolution(rf_shape, num_filters=num_filters, activation=activation, init=init, pad=pad, strides=strides, sharing=True, bias=bias, init_bias=init_bias, reduction_rank=reduction_rank, op_name='Convolution2D', name=name)
def Convolution3D(rf_shape, # shape of receptive field, e.g. (3,3,3). Must be a 3-element tuple.
@ -495,7 +493,6 @@ def Convolution3D(rf_shape, # shape of receptive field, e.g. (3,3,3). Mu
init=default_override_or(glorot_uniform()),
pad=default_override_or(False),
strides=1,
sharing=True, # (must be True currently)
bias=default_override_or(True),
init_bias=default_override_or(0),
reduction_rank=1, # (0 means input has no depth dimension, e.g. audio signal or B&W image)
@ -512,42 +509,41 @@ def Convolution3D(rf_shape, # shape of receptive field, e.g. (3,3,3). Mu
init_bias = get_default_override(Convolution3D, init_bias=init_bias)
if len(rf_shape) != 3:
raise ValueError('Convolution3D: rf_shape must be a 3D tuple, e.g. (3,3,3)')
return Convolution(rf_shape, num_filters=num_filters, activation=activation, init=init, pad=pad, strides=strides, sharing=sharing, bias=bias, init_bias=init_bias, reduction_rank=reduction_rank, op_name='Convolution3D', name=name)
return Convolution(rf_shape, num_filters=num_filters, activation=activation, init=init, pad=pad, strides=strides, sharing=True, bias=bias, init_bias=init_bias, reduction_rank=reduction_rank, op_name='Convolution3D', name=name)
# Deconvolution -- create a deconvolution layer with optional non-linearity
# ConvolutionTranspose -- create a deconvolution layer with optional non-linearity
# TODO: need to merge with above. Can it simply be transpose=True?
def Deconvolution(rf_shape, # shape of receptive field, e.g. (3,3)
num_filters,
num_input_filters,
activation=default_override_or(identity),
init=default_override_or(glorot_uniform()),
pad=default_override_or(False),
strides=1,
sharing=True, # (must be True currently)
lower_pad=(0,),
upper_pad=(0,),
bias=default_override_or(True),
init_bias=default_override_or(0),
reduction_rank=1, # (must be 1 currently)
max_temp_mem_size_in_samples=0,
name=''):
def ConvolutionTranspose(rf_shape, # shape of receptive field, e.g. (3,3)
num_filters,
num_input_filters,
activation=default_override_or(identity),
init=default_override_or(glorot_uniform()),
pad=default_override_or(False),
strides=1,
sharing=True, # (must be True currently)
bias=default_override_or(True),
init_bias=default_override_or(0),
output_shape=(0,),
reduction_rank=1, # (must be 1 currently)
max_temp_mem_size_in_samples=0,
name=''):
'''
Layer factory function to create a deconvolution layer.
'''
#UntestedBranchError("Deconvolution not tested after merge to new Layers lib") # it's actually tested by a end-to-end test
#UntestedBranchError("ConvolutionTranspose not tested after merge to new Layers lib") # it's actually tested by a end-to-end test
activation = get_default_override(Deconvolution, activation=activation)
init = get_default_override(Deconvolution, init=init)
pad = get_default_override(Deconvolution, pad=pad)
bias = get_default_override(Deconvolution, bias=bias)
init_bias = get_default_override(Deconvolution, init_bias=init_bias)
activation = get_default_override(ConvolutionTranspose, activation=activation)
init = get_default_override(ConvolutionTranspose, init=init)
pad = get_default_override(ConvolutionTranspose, pad=pad)
bias = get_default_override(ConvolutionTranspose, bias=bias)
init_bias = get_default_override(ConvolutionTranspose, init_bias=init_bias)
if reduction_rank != 1:
NotImplementedError("Deconvolution: reduction_rank other than 1 currently not supported")
NotImplementedError("ConvolutionTranspose: reduction_rank other than 1 currently not supported")
if not sharing:
NotImplementedError("Deconvolution: sharing option currently must be True")
NotImplementedError("ConvolutionTranspose: sharing option currently must be True")
output_channels_shape = _as_tuple(num_filters)
input_channels_shape = _as_tuple(num_input_filters)
kernel_shape = output_channels_shape + rf_shape
@ -559,22 +555,65 @@ def Deconvolution(rf_shape, # shape of receptive field, e.g. (3,3)
b = Parameter(output_channels_shape + (1,) * len(rf_shape), init=init_bias, name='b') if bias else None
# expression
@BlockFunction('Deconvolution', name)
def deconvolve(x):
r = convolution (W, x,
strides=_as_tuple(strides),
sharing=_as_tuple(sharing),
auto_padding=_as_tuple(pad),
lower_pad=lower_pad,
upper_pad=upper_pad,
transpose=True,
max_temp_mem_size_in_samples=max_temp_mem_size_in_samples)
@BlockFunction('ConvolutionTranspose', name)
def convolve_transposed(x):
r = convolution_transpose (W, x,
strides=_as_tuple(strides),
sharing=_as_tuple(sharing),
auto_padding=_as_tuple(pad),
output_shape=output_shape,
max_temp_mem_size_in_samples=max_temp_mem_size_in_samples)
if bias:
r = r + b
if activation is not None:
r = activation(r)
return r
return deconvolve
return convolve_transposed
# ConvolutionTranspose1D -- create a 1D convolution transpose layer with optional non-linearity
def ConvolutionTranspose1D(filter_shape, # a scalar, e.g., 3
num_filters=None,
activation=activation_default_or_None,
init=init_default_or_glorot_uniform,
pad=pad_default_or_False,
strides=1,
bias=bias_default_or_True,
init_bias=init_bias_default_or_0,
output_shape=None,
name=''):
if len(filter_shape) != 1:
raise ValueError('ConvolutionTranspose1D: filter_shape must be a scalar')
return ConvolutionTranspose(filter_shape, num_filters, activation, init, pad, strides, True, bias, init_bias, output_shape, name=name)
# ConvolutionTranspose2D -- create a 2D convolution transpose layer with optional non-linearity
def ConvolutionTranspose2D(filter_shape, # a 2D tuple, e.g., (3,3)
num_filters=None,
activation=activation_default_or_None,
init=init_default_or_glorot_uniform,
pad=pad_default_or_False,
strides=1,
output_shape=None,
bias=bias_default_or_True,
init_bias=init_bias_default_or_0,
name=''):
if len(filter_shape) != 2:
raise ValueError('ConvolutionTranspose2D: filter_shape must be a 2D tuple, e.g. (3,3)')
return ConvolutionTranspose(filter_shape, num_filters, activation, init, pad, strides, True, bias, init_bias, output_shape, name=name)
# ConvolutionTranspose3D -- create a 3D convolution transpose layer with optional non-linearity
def ConvolutionTranspose3D(filter_shape, # a 3D tuple, e.g., (3,3,3)
num_filters=None,
activation=activation_default_or_None,
init=init_default_or_glorot_uniform,
pad=pad_default_or_False,
strides=1,
output_shape=None,
bias=bias_default_or_True,
init_bias=init_bias_default_or_0,
name=''):
if len(filter_shape) != 3:
raise ValueError('ConvolutionTranspose3D: filter_shape must be a 3D tuple, e.g. (3,3,3)')
return ConvolutionTranspose(filter_shape, num_filters, activation, init, pad, strides, True, bias, init_bias, output_shape, name=name)
# TODO: add sequential mode like Convolution()
from cntk.cntk_py import PoolingType_Max, PoolingType_Average, NDShape

Просмотреть файл

@ -219,8 +219,8 @@ def cross_entropy_with_softmax(output_vector, target_vector, axis=-1, name=''):
target_vector: usually it is one-hot vector where the hot bit
corresponds to the label index. But it can be any probability
distribution over the labels.
axis (int or :class:`~cntk.axis.Axis`): axis along which the cross
entropy will be computed.
axis (int or :class:`~cntk.axis.Axis`, optional): if given, cross entropy will be computed
along this axis
name (str, optional): the name of the Function instance in the network
Returns:
:class:`~cntk.ops.functions.Function`
@ -404,7 +404,7 @@ def classification_error(output_vector, target_vector, axis=-1, topN=1, name='')
return classification_error(output_vector, target_vector, topN, axis, name)
@typemap
def edit_distance_error(input_a, input_b, subPen=0, delPen=0, insPen=0, squashInputs=False, samplesToIgnore=[], name=''):
def edit_distance_error(input_a, input_b, subPen=0, delPen=0, insPen=0, squashInputs=False, tokensToIgnore=[], name=''):
'''
Edit distance error evaluation node with the option of specifying penalty of substitution, deletion and insertion, as well as squashing the input sequences and ignoring certain samples.
Using the classic DP algorithm as described in https://en.wikipedia.org/wiki/Edit_distance, adjusted to take into account the penalties.
@ -415,7 +415,7 @@ def edit_distance_error(input_a, input_b, subPen=0, delPen=0, insPen=0, squashIn
3 0 3 2
will be represented as the vector of labels (indices) as [1, 0, 0, 1], on which edit distance will be actually evaluated.
The node allows to squash sequences of repeating labels and ignore certain labels. For example, if squashInputs is true and samplesToIgnore contains label '-' then
The node allows to squash sequences of repeating labels and ignore certain labels. For example, if squashInputs is true and tokensToIgnore contains label '-' then
given first input sequence as s1="1-12-" and second as s2="-11--122" the edit distance will be computed against s1' = "112" and s2' = "112".
The returned error is computed as: EditDistance(s1,s2) * length(s1') / length(s1)
@ -435,9 +435,9 @@ def edit_distance_error(input_a, input_b, subPen=0, delPen=0, insPen=0, squashIn
input_a: first input sequence
input_b: second input sequence
subPen, delPen, insPen: substitution, deletion and insertion penalties
squashInputs: whether to merge sequences of identical samples (in both input sequences). If true and samplesToIgnore contains label '-' then
squashInputs: whether to merge sequences of identical samples (in both input sequences). If true and tokensToIgnore contains label '-' then
given first input sequence as s1="a-ab-" and second as s2="-aa--abb" the edit distance will be computed against s1' = "aab" and s2' = "aab".
samplesToIgnore: list of samples to ignore during edit distance evaluation (in both sequences)
tokensToIgnore: list of samples to ignore during edit distance evaluation (in both sequences)
name (str, optional): the name of the Function instance in the network
Returns:
:class:`~cntk.ops.functions.Function`
@ -446,8 +446,51 @@ def edit_distance_error(input_a, input_b, subPen=0, delPen=0, insPen=0, squashIn
dtype = get_data_type(input_a, input_b)
input_a = sanitize_input(input_a, dtype)
input_b = sanitize_input(input_b, dtype)
return edit_distance_error(input_a, input_b, subPen, delPen, insPen, squashInputs, samplesToIgnore, name)
return edit_distance_error(input_a, input_b, subPen, delPen, insPen, squashInputs, tokensToIgnore, name)
@typemap
def labels_to_graph(labels, name=''):
'''
Conversion node from labels to graph. Typically used as an input to ForwardBackward node.
This node's objective is to transform input labels into a graph representing exact forward-backward criterion.
Example:
num_classes = 2
labels = cntk.input_variable((num_classes))
graph = cntk.labels_to_graph(labels)
Args:
labels: input training labels
Returns:
:class:`~cntk.ops.functions.Function`
'''
from cntk.cntk_py import labels_to_graph
dtype = get_data_type(labels)
labels = sanitize_input(labels, dtype)
return labels_to_graph(labels, name)
@typemap
def forward_backward(graph, features, blankTokenId, delayConstraint=-1, name=''):
'''
Criterion node for training methods that rely on forward-backward Viterbi-like passes, e.g. Connectionist Temporal Classification (CTC) training
The node takes as the input the graph of labels, produced by the labels_to_graph operation that determines the exact forward/backward procedure.
Example:
graph = cntk.labels_to_graph(labels)
networkOut = model(features)
fb = C.forward_backward(graph, networkOut, 132)
Args:
graph: labels graph
features: network output
blankTokenId: id of the CTC blank label
delayConstraint: label output delay constraint introduced during training that allows to have shorter delay during inference. This is using the original time information to enforce that CTC tokens only get aligned within a time margin. Setting this parameter smaller will result in shorter delay between label output during decoding, yet may hurt accuracy. delayConstraint=-1 means no constraint
Returns:
:class:`~cntk.ops.functions.Function`
'''
from cntk.cntk_py import forward_backward
dtype = get_data_type(features, graph)
features = sanitize_input(features, dtype)
graph = sanitize_input(graph, dtype)
return forward_backward(graph, features, blankTokenId, delayConstraint, name)
##########################################################################
# convolution ops
@ -455,7 +498,7 @@ def edit_distance_error(input_a, input_b, subPen=0, delPen=0, insPen=0, squashIn
@typemap
def convolution(convolution_map, operand, strides=(1,), sharing=[True],
auto_padding=[True], lower_pad=(0,), upper_pad=(0,), transpose=False,
auto_padding=[True], lower_pad=(0,), upper_pad=(0,),
max_temp_mem_size_in_samples=0, name=''):
'''
Computes the convolution of ``convolution_map`` (typically a tensor of learnable parameters) with
@ -501,7 +544,7 @@ def convolution(convolution_map, operand, strides=(1,), sharing=[True],
the input dimension. The last value that lines up with the number of input channels must be false.
lower_pad: precise lower padding for each input dimension.
upper_pad : precise upper padding for each input dimension.
transpose (bool): set to true for deconvolution.
output_shape: user expected output shape after convolution transpose.
max_temp_mem_size_in_samples (int): maximum amount of auxiliary memory (in samples) that should be reserved to perform convolution
operations. Some convolution engines (e.g. cuDNN and GEMM-based engines) can benefit from using workspace as it may improve
performance. However, sometimes this may lead to higher memory utilization. Default is 0 which means the same as the input
@ -516,8 +559,74 @@ def convolution(convolution_map, operand, strides=(1,), sharing=[True],
lower_pad = sanitize_shape(lower_pad)
upper_pad = sanitize_shape(upper_pad)
return convolution(convolution_map, operand, strides, sharing, auto_padding,
lower_pad, upper_pad, transpose,
max_temp_mem_size_in_samples, name)
lower_pad, upper_pad, max_temp_mem_size_in_samples, name)
@typemap
def convolution_transpose(convolution_map, operand, strides=(1,), sharing=[True],
auto_padding=[True], lower_pad=(0,), upper_pad=(0,), output_shape=(0,),
max_temp_mem_size_in_samples=0, name=''):
'''
Computes the transposed convolution of ``convolution_map`` (typically a tensor of learnable parameters) with
``operand`` (commonly an image or output of a previous convolution/pooling operation).
This is also known as ``fractionally strided convolutional layers``, or, ``deconvolution``.
This operation is used in image and language processing applications. It supports arbitrary
dimensions, strides, sharing, and padding.
This function operates on input tensors with dimensions :math:`[C \\times M_1 \\times M_2 \\times \\ldots \\times M_n]`. This can be understood as a rank-n
object, where each entry consists of a :math:`C`-dimensional vector. For example, an RGB image would have dimensions
:math:`[3 \\times W \\times H]`, i.e. a :math:`[W \\times H]`-sized structure, where each entry (pixel) consists of a 3-tuple.
`convolution_transpose` convolves the input ``operand`` with a :math:`n+2` rank tensor of (typically learnable) filters called
``convolution_map`` of shape :math:`[O \\times I \\times m_1 \\times m_2 \\times \\ldots \\times m_n ]` (typically :math:`m_i \\ll M_i`).
The first dimension, :math:`O`, is the number of convolution filters (i.e. the number of
channels in the output). The second dimension, :math:`I`, must match the number of channels in the input.
The last n dimensions are the spatial extent of the filter. I.e. for each output position, a vector of
dimension :math:`O` is computed. Hence, the total number of filter parameters is :math:`O \\times I \\times m_1 \\times m_2 \\times \\ldots \\times m_n`
Example:
>>> img = np.reshape(np.arange(9.0, dtype = np.float32), (1, 3, 3))
>>> x = C.input_variable(img.shape)
>>> filter = np.reshape(np.array([2, -1, -1, 2], dtype = np.float32), (1, 2, 2))
>>> kernel = C.constant(value = filter)
>>> np.round(C.convolution_transpose(kernel, x, auto_padding = [False]).eval({x: [img]}),5)
array([[[[[ 0., 2., 3., -2.],
[ 6., 4., 6., -1.],
[ 9., 10., 12., 2.],
[ -6., 5., 6., 16.]]]]], dtype=float32)
Args:
convolution_map: convolution filter weights, stored as a tensor of dimensions :math:`[O \\times I \\times m_1 \\times m_2 \\times \\ldots \\times m_n]`,
where :math:`[m_1 \\times m_2 \\times \\ldots \\times m_n]` must be the kernel dimensions (spatial extent of the filter).
operand: convolution input. A tensor with dimensions :math:`[I \\times M_1 \\times M_2 \\times \\ldots \\times M_n]`.
strides (tuple, optional): stride dimensions. If strides[i] > 1 then only pixel positions that are multiples of strides[i] are computed.
For example, a stride of 2 will lead to a halving of that dimension. The first stride dimension that lines up with the number
of input channels can be set to any non-zero value.
sharing (bool): sharing flags for each input dimension
auto_padding (bool): flags for each input dimension whether it should be padded automatically (that is,
symmetrically) or not padded at all. Padding means that the convolution kernel is applied to all pixel positions, where all
pixels outside the area are assumed zero ("padded with zeroes"). Without padding, the kernels are only shifted over
positions where all inputs to the kernel still fall inside the area. In this case, the output dimension will be less than
the input dimension. The last value that lines up with the number of input channels must be false.
lower_pad: precise lower padding for each input dimension.
upper_pad : precise upper padding for each input dimension.
max_temp_mem_size_in_samples (int): maximum amount of auxiliary memory (in samples) that should be reserved to perform convolution
operations. Some convolution engines (e.g. cuDNN and GEMM-based engines) can benefit from using workspace as it may improve
performance. However, sometimes this may lead to higher memory utilization. Default is 0 which means the same as the input
samples.
name (str, optional): the name of the Function instance in the network
Returns:
:class:`~cntk.ops.functions.Function`
'''
from cntk.cntk_py import convolution_transpose
operand = sanitize_input(operand)
strides = sanitize_shape(strides)
lower_pad = sanitize_shape(lower_pad)
upper_pad = sanitize_shape(upper_pad)
output_shape = sanitize_shape(output_shape)
return convolution_transpose(convolution_map, operand, strides, sharing, auto_padding,
lower_pad, upper_pad, output_shape,
max_temp_mem_size_in_samples, name)
@typemap
@ -1461,6 +1570,44 @@ def param_relu(alpha, x, name=''):
x = sanitize_input(x)
return pre_lu(alpha, x, name)
@typemap
def softplus(x, steepness=1, name=''):
    r'''
    Softplus operation. Computes the element-wise softplus of ``x``:

    :math:`\textrm{softplus}(x) = {\log(1+\exp(x))}`

    The optional ``steepness`` makes the knee sharper (``steepness>1``) or
    softer by evaluating ``softplus(x * steepness) / steepness``; for very
    large steepness this approaches a linear rectifier.
    The output tensor has the same shape as ``x``.

    Example:
        >>> C.softplus([[-1, -0.5, 0, 1, 2]]).eval()
        array([[ 0.313262, 0.474077, 0.693147, 1.313262, 2.126928]], dtype=float32)

        >>> C.softplus([[-1, -0.5, 0, 1, 2]], steepness=4).eval()
        array([[ 0.004537, 0.031732, 0.173287, 1.004537, 2.000084]], dtype=float32)

    Args:
        x (`numpy.array` or :class:`~cntk.ops.functions.Function`): any :class:`~cntk.ops.functions.Function` that outputs a tensor.
        steepness (float, optional): optional steepness factor
        name (`str`, default to ''): the name of the Function instance in the network
    Returns:
        cntk.ops.functions.Function:
        An instance of :class:`~cntk.ops.functions.Function`
    '''
    from cntk.cntk_py import softplus
    x = sanitize_input(x)
    if steepness != 1:
        # Wrap the scaled computation in a block so the graph shows a single
        # 'softplus' node instead of the scale/softplus/divide subgraph.
        ph = placeholder_variable()
        scaled = softplus(steepness * ph) / steepness
        return as_block(scaled, [(ph, x)], 'softplus', name)
    # Fast path: plain softplus needs no scaling wrapper.
    return softplus(x, name)
@typemap
def sigmoid(x, name=''):
'''
@ -1485,44 +1632,6 @@ def sigmoid(x, name=''):
return sigmoid(x, name)
@typemap
def softplus(x, steepness=1, name=''):
    r'''
    Softplus operation. Computes the element-wise softplus
    of ``x``:

    ``softplus(x) = log(1 + exp(x))``

    The optional ``steepness`` makes the knee sharper (``steepness>1``) or
    softer by evaluating ``softplus(x * steepness) / steepness``; for very
    large steepness this approaches a linear rectifier.
    The output tensor has the same shape as ``x``.

    Example:
        >>> C.softplus([[-1, -0.5, 0, 1, 2]]).eval()
        array([[ 0.313262, 0.474077, 0.693147, 1.313262, 2.126928]], dtype=float32)

        >>> C.softplus([[-1, -0.5, 0, 1, 2]], steepness=4).eval()
        array([[ 0.004537, 0.031732, 0.173287, 1.004537, 2.000084]], dtype=float32)

    Args:
        x: numpy array or any :class:`~cntk.ops.functions.Function` that outputs a tensor
        steepness (float, optional): optional steepness factor
        name (str, optional): the name of the Function instance in the network
    Returns:
        :class:`~cntk.ops.functions.Function`
    '''
    def _stable_softplus(v):
        # log_add_exp(0, v) == log(exp(0) + exp(v)) == log(1 + exp(v)),
        # but avoids overflowing exp(v) for large v.
        return log_add_exp(0, v)

    ph = placeholder_variable()
    if steepness == 1:
        body = _stable_softplus(ph)
    else:
        body = _stable_softplus(steepness * ph) / steepness
    x = sanitize_input(x)
    return as_block(body, [(ph, x)], 'softplus', name)
@typemap
def tanh(x, name=''):
'''
@ -2046,7 +2155,7 @@ def reshape(x, shape, begin_axis=None, end_axis=None, name=''):
begin_axis = Axis(0)
if end_axis is None:
end_axis = Axis.end_static_axis()
end_axis = Axis.new_leading_axis()
# Pass begin_axis as the end_axis and vice versa to account for
# the automatic shape reversal across the python SWIG boundary
@ -2057,10 +2166,10 @@ def reshape(x, shape, begin_axis=None, end_axis=None, name=''):
if not axis.is_static_axis:
return axis
if (axis == Axis.end_static_axis()):
if (axis == Axis.new_leading_axis()):
return Axis(0)
elif (axis == Axis(0)):
return Axis.end_static_axis()
return Axis.new_leading_axis()
else:
return Axis(-axis.static_axis_index())

View file

@ -1235,12 +1235,16 @@ class UserFunction(Function):
map_if_possible(variables)
if len(variables)>1:
self.backward(state, root_gradients, variables)
else:
if len(root_gradients) == 1:
for rg in root_gradients.values():
break
result = self.backward(state, rg)
root_gradients = rg
possible_wrt = [input for input in self.inputs if input.needs_gradient]
if len(possible_wrt) > 1:
self.backward(state, root_gradients, variables)
else:
result = self.backward(state, root_gradients)
for k in variables:
variables[k] = result

View file

@ -282,12 +282,12 @@ EDIT_DISTANCE_ERROR_TEST_CASES = [
([[1, 3], [2, 0]], [[2, 0], [2, 0]], 0, 1, 1, True, [1], 2.0),
]
@pytest.mark.parametrize("left_input, right_input, subPen, delPen, insPen, squashInputs, samplesToIgnore, result", EDIT_DISTANCE_ERROR_TEST_CASES)
def test_edit_distance_error(left_input, right_input, subPen, delPen, insPen, squashInputs, samplesToIgnore, result, device_id, precision):
@pytest.mark.parametrize("left_input, right_input, subPen, delPen, insPen, squashInputs, tokensToIgnore, result", EDIT_DISTANCE_ERROR_TEST_CASES)
def test_edit_distance_error(left_input, right_input, subPen, delPen, insPen, squashInputs, tokensToIgnore, result, device_id, precision):
i1 = input_variable(shape=(2,))
i2 = input_variable(shape=(2,))
arguments = {i1 : left_input, i2 : right_input}
a = edit_distance_error(i1, i2, subPen, delPen, insPen, squashInputs, samplesToIgnore)
a = edit_distance_error(i1, i2, subPen, delPen, insPen, squashInputs, tokensToIgnore)
assert np.allclose(result, a.eval(arguments))
def test_sequence_grad_as_numpy_false(device_id, precision):

View file

@ -17,6 +17,8 @@ from .. import constant, parameter, input_variable, placeholder_variable, times,
from ... import InferredDimension
from .ops_test_utils import compare_lists_of_np_arrays, AA
from cntk.io import MinibatchSource, CTFDeserializer, StreamDefs, StreamDef
def test_variable_forwarding():
op = constant(value=2, shape=(3,4)) + 1
assert op.shape == (3,4)
@ -31,7 +33,7 @@ def test_eval_by_node_name():
assert res.eval({'i': [[3]]}) == [6]
assert res.eval({u'i': [[3]]}) == [6]
def test_replace_placeholders():
p = placeholder_variable(shape=(1,))
i = input_variable(shape=(1,),
@ -219,19 +221,19 @@ def test_clone_with_function_in_substitution_map():
t = times(x, w)
b = parameter((proj_dim))
t_plus_b = t + b
p = placeholder_variable()
just_b = t_plus_b.clone('clone', {t : p})
t_plus_b_clone = just_b.clone('share', {p : t})
def test_clone_with_slice():
def test_clone_with_slice():
i1 = input_variable((2,2), name='i1')
i2 = input_variable((2,2), name='i2')
x = splice(i1, i2, axis=0)
W = constant(1, (4,1), name='W')
x = splice(i1, i2, axis=0)
W = constant(1, (4,1), name='W')
y = convolution(W, x)
assert(y.shape == (4,2))
assert(y.shape == (4,2))
from ..functions import CloneMethod
x1 = input_variable((2,1), name='x1')
x2 = input_variable((2,1), name='x2')
@ -267,7 +269,7 @@ def test_input_order():
t = times(x, w)
t_plus_b = plus(t, b, name=func_name)
def compare_var_names(vars, names):
def compare_var_names(vars, names):
num_vars = len(vars)
for i in range(num_vars):
if (vars[i].name != names[i]):
@ -290,8 +292,8 @@ def test_combine_duplicated_inputs():
t_plus_b = plus(t, b, name=func_name)
duplicated_t_plus_b = combine([t_plus_b, t_plus_b])
def compare_var_names(vars, names):
def compare_var_names(vars, names):
num_vars = len(vars)
for i in range(num_vars):
if (vars[i].name != names[i]):
@ -300,7 +302,7 @@ def test_combine_duplicated_inputs():
return True
assert compare_var_names(duplicated_t_plus_b.outputs, [func_name, func_name])
def test_extra_arguments_in_eval():
x1 = input_variable((1,), name='x1')
@ -310,4 +312,35 @@ def test_extra_arguments_in_eval():
result = x1_plus_1.eval({x1 : np.asarray([[1]]), x2 : np.asarray([[1]])})
assert np.allclose(result, [[[2]]])
def test_MinibatchData_and_Value_as_input(tmpdir):
mbdata = r'''0 |S0 100'''
tmpfile = str(tmpdir/'mbtest.txt')
with open(tmpfile, 'w') as f:
f.write(mbdata)
defs = StreamDefs(f1 = StreamDef(field='S0', shape=1))
mb_source = MinibatchSource(CTFDeserializer(tmpfile, defs),
randomize=False)
f1_si = mb_source.stream_info('f1')
mb = mb_source.next_minibatch(1)
f1 = input_variable(shape=(1,),
needs_gradient=True,
name='f')
res = f1 * 2
assert res.eval({f1: mb[f1_si]}) == [[200]]
# Test MinibatchData
assert res.eval(mb[f1_si]) == [[200]]
# Test Value
assert res.eval(mb[f1_si].data) == [[200]]
# Test NumPy (converted back from MinibatchData)
assert res.eval(mb[f1_si].value) == [[200]]
# Test Value
assert res.eval(mb[f1_si].data) == [[200]]

View file

@ -360,3 +360,82 @@ def test_op_roipooling(input_map, input_rois, expected_fwd, expected_bkwd, devic
unittest_helper(input_op,
forward_input, exp_fwd_value, expected_backward,
device_id=device_id, precision=precision)
CONVOLUTION_TRANSPOSE_DATA = [
    ([1, 1, 1, 3, 3], # input_size
     [1, 2, 2], # convolution size
     [[[[ 0, 0, 1, 2],
        [ 0, 5, 11, 11],
        [ 6, 23, 29, 23],
        [ 12, 32, 37, 24]]]]) # result
]

# Convolution transpose without an explicitly requested output shape:
# the output dimensions are inferred from the input and kernel sizes.
@pytest.mark.parametrize("input_size, conv_size, result", CONVOLUTION_TRANSPOSE_DATA)
def test_convolution_transpose(input_size, conv_size, result, device_id, precision):
    dt = PRECISION_TO_TYPE[precision]
    dev = cntk_device(device_id)

    # Input operand is the ramp 0,1,2,... reshaped to input_size.
    input_operand = np.arange(np.prod(input_size), dtype=dt).reshape(input_size)
    a = I(shape=input_operand.shape[2:],
          dtype=sanitize_dtype_cntk(precision),
          needs_gradient=False,
          name='a')

    # The convolution kernel is a ramp of the same kind, reshaped to conv_size.
    kernel = np.arange(np.prod(conv_size), dtype=dt).reshape(conv_size)
    conv_map = constant(value=kernel, device=dev)

    from cntk import convolution_transpose
    input_op = convolution_transpose(conv_map, a, auto_padding=[False])

    expected_forward = AA([result])
    unittest_helper(input_op, {a: input_operand}, expected_forward,
                    None, device_id=device_id, precision=precision)
CONVOLUTION_TRANSPOSE_OUTPUT_DATA = [
    ([1, 1, 1, 3, 3], # input_size
     [1, 3, 3], # convolution size
     [[[[ 0, 3, 4, 11, 8, 10],
        [ 3, 12, 11, 28, 19, 26],
        [ 12, 27, 16, 35, 20, 25],
        [ 27, 60, 35, 76, 43, 56],
        [ 24, 51, 28, 59, 32, 40]]]]) # result
]

# Convolution transpose with an explicitly requested output shape
# (strided, auto-padded) instead of letting CNTK infer it.
@pytest.mark.parametrize("input_size, conv_size, result", CONVOLUTION_TRANSPOSE_OUTPUT_DATA)
def test_convolution_transpose_with_output(input_size, conv_size, result, device_id, precision):
    dt = PRECISION_TO_TYPE[precision]
    dev = cntk_device(device_id)

    # Input operand is the ramp 0,1,2,... reshaped to input_size.
    input_operand = np.arange(np.prod(input_size), dtype=dt).reshape(input_size)
    a = I(shape=input_operand.shape[2:],
          dtype=sanitize_dtype_cntk(precision),
          needs_gradient=False,
          name='a')

    # The convolution kernel is a ramp of the same kind, reshaped to conv_size.
    kernel = np.arange(np.prod(conv_size), dtype=dt).reshape(conv_size)
    conv_map = constant(value=kernel, device=dev)

    from cntk import convolution_transpose
    input_op = convolution_transpose(conv_map, a, auto_padding=[True],
                                     strides=2, output_shape=(1, 5, 6))

    expected_forward = AA([result])
    unittest_helper(input_op, {a: input_operand}, expected_forward,
                    None, device_id=device_id, precision=precision)

View file

@ -301,11 +301,8 @@ def test_op_elu(operand, device_id, precision):
from cntk import elu
#BUGBUG: There is a bug in ElementSelect that cause nan in the output
# for float32.
if PRECISION_TO_TYPE[precision] == np.float64:
_test_unary_op(precision, device_id, elu, operand,
expected_forward, expected_backward)
_test_unary_op(precision, device_id, elu, operand,
expected_forward, expected_backward)
@pytest.mark.parametrize("operand", TENSORS)
def test_op_leaky_relu(operand, device_id, precision):
@ -347,6 +344,21 @@ def test_op_param_relu(operand, device_id, precision):
_test_unary_op(precision, device_id, prelu, operand,
expected_forward, expected_backward)
@pytest.mark.parametrize("operand", TENSORS)
def test_op_softplus(operand, device_id, precision):
    # softplus(x) = log(1 + exp(x)); np.logaddexp(x, 0) computes this in a
    # numerically stable way and is already a ufunc, so the np.vectorize
    # wrappers (which loop over scalars in Python) are unnecessary.
    # The derivative of softplus is the logistic sigmoid 1 / (1 + exp(-x)).
    t = AA(operand, dtype=PRECISION_TO_TYPE[precision])

    expected_forward = [[np.logaddexp(t, 0)]]
    expected_backward = {
        'arg': [[1.0 / (1.0 + np.exp(-t))]]
    }

    from .. import softplus
    _test_unary_op(precision, device_id, softplus, operand,
                   expected_forward, expected_backward)
SAMPLES = [ # 2 samples having 4 classes
[1, 1, 2, 3],

Some files were not shown because too many files have changed in this diff. Show more