Frank Seide 2017-01-22 18:50:15 -08:00
Parents: 2413087902 d15bec5a5b
Commit: 29bcb50ba8
375 changed files, 10,456 additions and 3,011 deletions

.gitignore (vendored)

@@ -249,6 +249,7 @@ Examples/Image/DataSets/grocery/testImages/
Examples/Image/DataSets/grocery/*.txt
Examples/Image/PretrainedModels/*.model
Examples/Image/FeatureExtraction/*.txt
Examples/Image/GettingStarted/Output/
Tests/EndToEndTests/CNTKv2Python/Examples/layerOutput.txt
Tutorials/HelloWorld-LogisticRegression/LR.txt.p
Tutorials/HelloWorld-LogisticRegression/Models/


@@ -1451,7 +1451,37 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "PerformanceProfilerDll", "S
{86883653-8A61-4038-81A0-2379FAE4200A} = {86883653-8A61-4038-81A0-2379FAE4200A}
EndProjectSection
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CNTKLibraryManagedExamplesTest", "Tests\EndToEndTests\EvalClientTests\CNTKLibraryManagedExamplesTest\CNTKLibraryManagedExamplesTest.csproj", "{3500A847-E024-4E7D-92DD-CC587C17460B}"
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CNTKLibraryCSEvalExamplesTest", "Tests\EndToEndTests\EvalClientTests\CNTKLibraryCSEvalExamplesTest\CNTKLibraryCSEvalExamplesTest.csproj", "{3500A847-E024-4E7D-92DD-CC587C17460B}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "GoogLeNet", "GoogLeNet", "{789B4AB8-40F1-4A37-823A-BC20D80C8BF1}"
ProjectSection(SolutionItems) = preProject
Examples\Image\Classification\GoogLeNet\README.md = Examples\Image\Classification\GoogLeNet\README.md
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "BN-Inception", "BN-Inception", "{CE223840-1DEE-4849-B530-F06BEE05BAA8}"
ProjectSection(SolutionItems) = preProject
Examples\Image\Classification\GoogLeNet\BN-Inception\README.md = Examples\Image\Classification\GoogLeNet\BN-Inception\README.md
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "InceptionV3", "InceptionV3", "{824766FA-759A-4466-9C39-13200D2D3159}"
ProjectSection(SolutionItems) = preProject
Examples\Image\Classification\GoogLeNet\InceptionV3\README.md = Examples\Image\Classification\GoogLeNet\InceptionV3\README.md
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "BrainScript", "BrainScript", "{BD07C9F3-B10C-4C21-82BC-4F249B65DDFE}"
ProjectSection(SolutionItems) = preProject
Examples\Image\Classification\GoogLeNet\InceptionV3\BrainScript\InceptionBlocks.bs = Examples\Image\Classification\GoogLeNet\InceptionV3\BrainScript\InceptionBlocks.bs
Examples\Image\Classification\GoogLeNet\InceptionV3\BrainScript\InceptionV3.bs = Examples\Image\Classification\GoogLeNet\InceptionV3\BrainScript\InceptionV3.bs
Examples\Image\Classification\GoogLeNet\InceptionV3\BrainScript\InceptionV3.cntk = Examples\Image\Classification\GoogLeNet\InceptionV3\BrainScript\InceptionV3.cntk
Examples\Image\Classification\GoogLeNet\InceptionV3\BrainScript\README.md = Examples\Image\Classification\GoogLeNet\InceptionV3\BrainScript\README.md
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "BrainScript", "BrainScript", "{5CC403B9-2405-4FFB-A73B-DAE0DC986C76}"
ProjectSection(SolutionItems) = preProject
Examples\Image\Classification\GoogLeNet\BN-Inception\BrainScript\BN-Inception.bs = Examples\Image\Classification\GoogLeNet\BN-Inception\BrainScript\BN-Inception.bs
Examples\Image\Classification\GoogLeNet\BN-Inception\BrainScript\BN-Inception.cntk = Examples\Image\Classification\GoogLeNet\BN-Inception\BrainScript\BN-Inception.cntk
Examples\Image\Classification\GoogLeNet\BN-Inception\BrainScript\InceptionLayers.bs = Examples\Image\Classification\GoogLeNet\BN-Inception\BrainScript\InceptionLayers.bs
EndProjectSection
EndProject
Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "PythonExamples", "Examples\PythonExamples.pyproj", "{292FF4EE-D9DD-4BA7-85F7-6A22148D1E01}"
EndProject
@@ -2114,6 +2144,11 @@ Global
{CB4566F1-6C8F-4270-83EE-F6AED84EBB2B} = {39C3C8CA-9A8A-4733-ADBB-3E19D0F52528}
{4B442D34-641A-4B37-9A4B-D18DBE28A979} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
{3500A847-E024-4E7D-92DD-CC587C17460B} = {05E45AF7-C069-4057-BC16-0A532D068CE4}
{789B4AB8-40F1-4A37-823A-BC20D80C8BF1} = {151202CF-C2E4-47A6-A31C-CE039D698519}
{CE223840-1DEE-4849-B530-F06BEE05BAA8} = {789B4AB8-40F1-4A37-823A-BC20D80C8BF1}
{824766FA-759A-4466-9C39-13200D2D3159} = {789B4AB8-40F1-4A37-823A-BC20D80C8BF1}
{BD07C9F3-B10C-4C21-82BC-4F249B65DDFE} = {824766FA-759A-4466-9C39-13200D2D3159}
{5CC403B9-2405-4FFB-A73B-DAE0DC986C76} = {CE223840-1DEE-4849-B530-F06BEE05BAA8}
{292FF4EE-D9DD-4BA7-85F7-6A22148D1E01} = {47755F2E-D674-4175-9E38-8EA053455072}
EndGlobalSection
EndGlobal


@@ -2,7 +2,7 @@
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// CPPEvalV2Client.cpp : Sample application shows how to evaluate a model using CNTK V2 API.
// CNTKLibraryCPPEvalExamples.cpp : Sample application shows how to evaluate a model using CNTK V2 API.
//
#include <stdio.h>


@@ -7,14 +7,14 @@
</ProjectConfiguration>
</ItemGroup>
<ItemGroup>
<ClCompile Include="CPPEvalV2Client.cpp" />
<ClCompile Include="CNTKLibraryCPPEvalExamples.cpp" />
<ClCompile Include="EvalMultithreads.cpp" />
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{D771A06D-CC25-4582-B5CD-D2A4782BB005}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>CPPEvalV2Client</RootNamespace>
<ProjectName>CPPEvalV2Client</ProjectName>
<RootNamespace>CNTKLibraryCPPEvalExamples</RootNamespace>
<ProjectName>CNTKLibraryCPPEvalExamples</ProjectName>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">


@@ -15,7 +15,7 @@
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="CPPEvalV2Client.cpp">
<ClCompile Include="CNTKLibraryCPPEvalExamples.cpp">
<Filter>Source Files</Filter>
</ClCompile>
<ClCompile Include="EvalMultithreads.cpp">


@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="utf-8"?>
<configuration>
<startup>
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5"/>
</startup>
</configuration>


@@ -0,0 +1,75 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<Platform Condition=" '$(Platform)' == '' ">x64</Platform>
<ProjectGuid>{8AAD7322-10B1-48C3-9BC7-005A7910C5E6}</ProjectGuid>
<OutputType>Exe</OutputType>
<AppDesignerFolder>Properties</AppDesignerFolder>
<RootNamespace>CNTKLibraryCSEvalExamples</RootNamespace>
<AssemblyName>CNTKLibraryCSEvalCPUOnlyExamples</AssemblyName>
<TargetFrameworkVersion>v4.5</TargetFrameworkVersion>
<FileAlignment>512</FileAlignment>
<AutoGenerateBindingRedirects>true</AutoGenerateBindingRedirects>
<NuGetPackageImportStamp>
</NuGetPackageImportStamp>
<TargetFrameworkProfile />
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Debug|x64'">
<DebugSymbols>true</DebugSymbols>
<OutputPath>$(SolutionDir)..\..\$(Platform)\CNTKLibraryCSEvalCPUOnlyExamples.$(Configuration)\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<WarningLevel>4</WarningLevel>
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
<DebugType>full</DebugType>
<PlatformTarget>x64</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Release|x64'">
<OutputPath>$(SolutionDir)..\..\$(Platform)\CNTKLibraryCSEvalCPUOnlyExamples.$(Configuration)\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<WarningLevel>4</WarningLevel>
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
<Optimize>true</Optimize>
<DebugType>pdbonly</DebugType>
<PlatformTarget>x64</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
</PropertyGroup>
<ItemGroup>
<Reference Include="CNTKLibraryManaged-2.0, Version=1.0.0.0, Culture=neutral, processorArchitecture=AMD64">
<HintPath>..\packages\CNTK.CPUOnly.2.0-beta9\lib\net45\x64\CNTKLibraryManaged-2.0.dll</HintPath>
<Private>True</Private>
</Reference>
<Reference Include="System" />
<Reference Include="System.Core" />
<Reference Include="System.Drawing" />
</ItemGroup>
<ItemGroup>
<Compile Include="CntkBitmapExtensions.cs" />
<Compile Include="CNTKLibraryCSEvalExamples.cs" />
<Compile Include="Program.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
</ItemGroup>
<ItemGroup>
<None Include="App.config" />
<None Include="packages.config" />
</ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
<Import Project="..\packages\CNTK.CPUOnly.2.0-beta9\build\net45\CNTK.CPUOnly.targets" Condition="Exists('..\packages\CNTK.CPUOnly.2.0-beta9\build\net45\CNTK.CPUOnly.targets')" />
<Target Name="EnsureNuGetPackageBuildImports" BeforeTargets="PrepareForBuild">
<PropertyGroup>
<ErrorText>This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}.</ErrorText>
</PropertyGroup>
<Error Condition="!Exists('..\packages\CNTK.CPUOnly.2.0-beta9\build\net45\CNTK.CPUOnly.targets')" Text="$([System.String]::Format('$(ErrorText)', '..\packages\CNTK.CPUOnly.2.0-beta9\build\net45\CNTK.CPUOnly.targets'))" />
</Target>
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
Other similar extension points exist, see Microsoft.Common.targets.
<Target Name="BeforeBuild">
</Target>
<Target Name="AfterBuild">
</Target>
-->
</Project>


@@ -2,7 +2,7 @@
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// program.cs -- Example for using C# Eval V2 API.
// CNTKLibraryCSEvalExamples.cs -- Examples for using CNTK Library C# Eval API.
//
using System;
@@ -10,13 +10,11 @@ using System.Collections.Generic;
using System.Drawing;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using CNTK;
namespace CNTKLibraryManagedExampleTest
namespace CNTKLibraryCSEvalExamples
{
public class Program
public class CNTKLibraryManagedExamples
{
//
// The example shows
@@ -26,13 +24,13 @@ namespace CNTKLibraryManagedExampleTest
// - how to evaluate a model.
// - how to retrieve evaluation result and retrieve output data in dense format.
//
static void EvaluationSingleImage(DeviceDescriptor device)
public static void EvaluationSingleImage(DeviceDescriptor device)
{
const string outputName = "Plus2060";
var inputDataMap = new Dictionary<Variable, Value>();
// Load the model.
Function modelFunc = Function.LoadModel("z.model");
Function modelFunc = Function.LoadModel("z.model", device);
// Get output variable based on name
Variable outputVar = modelFunc.Outputs.Where(variable => string.Equals(variable.Name, outputName)).Single();
@@ -85,13 +83,13 @@ namespace CNTKLibraryManagedExampleTest
// - how to evaluate a model.
// - how to retrieve evaluation result and retrieve output data in dense format.
//
static void EvaluationBatchOfImages(DeviceDescriptor device)
public static void EvaluationBatchOfImages(DeviceDescriptor device)
{
const string outputName = "Plus2060";
var inputDataMap = new Dictionary<Variable, Value>();
// Load the model.
Function modelFunc = Function.LoadModel("z.model");
Function modelFunc = Function.LoadModel("z.model", device);
// Get output variable based on name
Variable outputVar = modelFunc.Outputs.Where(variable => string.Equals(variable.Name, outputName)).Single();
@@ -157,12 +155,12 @@ namespace CNTKLibraryManagedExampleTest
// - how to evaluate a model.
// - how to retrieve evaluation result and retrieve output data in the one-hot vector format.
//
static void EvaluationSingleSequenceUsingOneHot(DeviceDescriptor device)
public static void EvaluationSingleSequenceUsingOneHot(DeviceDescriptor device)
{
var vocabToIndex = buildVocabIndex("ATIS.vocab");
var indexToVocab = buildInvVocabIndex("ATIS.label");
Function myFunc = Function.LoadModel("atis.model");
Function myFunc = Function.LoadModel("atis.model", device);
Console.WriteLine("Evaluate single sequence using one-hot vector");
@@ -237,12 +235,12 @@ namespace CNTKLibraryManagedExampleTest
// - how to evaluate a model.
// - how to retrieve evaluation result and retrieve output data in the one-hot vector format.
//
static void EvaluationBatchOfSequencesUsingOneHot(DeviceDescriptor device)
public static void EvaluationBatchOfSequencesUsingOneHot(DeviceDescriptor device)
{
var vocabToIndex = buildVocabIndex("ATIS.vocab");
var indexToVocab = buildInvVocabIndex("ATIS.label");
Function myFunc = Function.LoadModel("atis.model");
Function myFunc = Function.LoadModel("atis.model", device);
Console.WriteLine("Evaluate batch of sequences with variable length using one-hot vector");
@@ -367,22 +365,5 @@ namespace CNTKLibraryManagedExampleTest
{
return File.ReadAllLines(filePath);
}
static void Main(string[] args)
{
Console.WriteLine("======== Evaluate model using C# ========");
EvaluationSingleImage(DeviceDescriptor.CPUDevice);
EvaluationBatchOfImages(DeviceDescriptor.CPUDevice);
//TODO: Add examples with OneHot.
//EvaluationSingleSequenceUsingOneHot(DeviceDescriptor.CPUDevice);
//EvaluationBatchOfSequencesUsingOneHot(DeviceDescriptor.CPUDevice);
// TODO: using GPU.
//EvaluationSingleImage(DeviceDescriptor.GPUDevice(0));
//EvaluationBatchOfImages(DeviceDescriptor.GPUDevice(0));
Console.WriteLine("======== Evaluation completes. ========");
}
}
}


@@ -9,12 +9,11 @@ using System.Collections.Generic;
using System.Drawing;
using System.Drawing.Imaging;
using System.Linq;
using System.Runtime.InteropServices;
using System.Threading.Tasks;
namespace CNTK
namespace CNTKLibraryCSEvalExamples
{
public static class BitmapExtensions
public static class CntkBitmapExtensions
{
/// <summary>
/// Resizes an image


@@ -0,0 +1,25 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// Program.cs -- Example for using CNTK Library C# Eval CPUOnly Nuget Package.
//
using System;
using CNTK;
namespace CNTKLibraryCSEvalExamples
{
class Program
{
static void Main(string[] args)
{
Console.WriteLine("======== Evaluate model using C# CPUOnly Build ========");
CNTKLibraryManagedExamples.EvaluationSingleImage(DeviceDescriptor.CPUDevice);
CNTKLibraryManagedExamples.EvaluationBatchOfImages(DeviceDescriptor.CPUDevice);
Console.WriteLine("======== Evaluation completes. ========");
}
}
}


@@ -0,0 +1,36 @@
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
// General Information about an assembly is controlled through the following
// set of attributes. Change these attribute values to modify the information
// associated with an assembly.
[assembly: AssemblyTitle("CNTKLibraryCSEvalCPUOnlyExamples")]
[assembly: AssemblyDescription("")]
[assembly: AssemblyConfiguration("")]
[assembly: AssemblyCompany("")]
[assembly: AssemblyProduct("CNTKLibraryCSEvalCPUOnlyExamples")]
[assembly: AssemblyCopyright("Copyright © 2017")]
[assembly: AssemblyTrademark("")]
[assembly: AssemblyCulture("")]
// Setting ComVisible to false makes the types in this assembly not visible
// to COM components. If you need to access a type in this assembly from
// COM, set the ComVisible attribute to true on that type.
[assembly: ComVisible(false)]
// The following GUID is for the ID of the typelib if this project is exposed to COM
[assembly: Guid("8aad7322-10b1-48c3-9bc7-005a7910c5e6")]
// Version information for an assembly consists of the following four values:
//
// Major Version
// Minor Version
// Build Number
// Revision
//
// You can specify all the values or you can default the Build and Revision Numbers
// by using the '*' as shown below:
// [assembly: AssemblyVersion("1.0.*")]
[assembly: AssemblyVersion("1.0.0.0")]
[assembly: AssemblyFileVersion("1.0.0.0")]


@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="CNTK.CPUOnly" version="2.0-beta9" targetFramework="net45" />
</packages>


@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="utf-8"?>
<configuration>
<startup>
<supportedRuntime version="v4.0" sku=".NETFramework,Version=v4.5"/>
</startup>
</configuration>


@@ -0,0 +1,79 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="14.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Import Project="$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props" Condition="Exists('$(MSBuildExtensionsPath)\$(MSBuildToolsVersion)\Microsoft.Common.props')" />
<PropertyGroup>
<Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
<Platform Condition=" '$(Platform)' == '' ">x64</Platform>
<ProjectGuid>{307E5BAC-DA03-45D2-ADEC-FE6620090109}</ProjectGuid>
<OutputType>Exe</OutputType>
<AppDesignerFolder>Properties</AppDesignerFolder>
<RootNamespace>CNTKLibraryCSEvalExamples</RootNamespace>
<AssemblyName>CNTKLibraryCSEvalGPUExamples</AssemblyName>
<TargetFrameworkVersion>v4.5</TargetFrameworkVersion>
<FileAlignment>512</FileAlignment>
<AutoGenerateBindingRedirects>true</AutoGenerateBindingRedirects>
<TargetFrameworkProfile />
<NuGetPackageImportStamp>
</NuGetPackageImportStamp>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Debug|x64'">
<DebugSymbols>true</DebugSymbols>
<OutputPath>$(SolutionDir)..\..\$(Platform)\CNTKLibraryCSEvalGPUExamples.$(Configuration)\</OutputPath>
<DefineConstants>DEBUG;TRACE</DefineConstants>
<WarningLevel>4</WarningLevel>
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
<DebugType>full</DebugType>
<PlatformTarget>x64</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
</PropertyGroup>
<PropertyGroup Condition="'$(Configuration)|$(Platform)' == 'Release|x64'">
<OutputPath>$(SolutionDir)..\..\$(Platform)\CNTKLibraryCSEvalGPUExamples.$(Configuration)\</OutputPath>
<DefineConstants>TRACE</DefineConstants>
<WarningLevel>4</WarningLevel>
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
<Optimize>true</Optimize>
<DebugType>pdbonly</DebugType>
<PlatformTarget>x64</PlatformTarget>
<ErrorReport>prompt</ErrorReport>
<CodeAnalysisRuleSet>MinimumRecommendedRules.ruleset</CodeAnalysisRuleSet>
</PropertyGroup>
<ItemGroup>
<Reference Include="CNTKLibraryManaged-2.0, Version=1.0.0.0, Culture=neutral, processorArchitecture=AMD64">
<HintPath>..\packages\CNTK.GPU.2.0-beta9\lib\net45\x64\CNTKLibraryManaged-2.0.dll</HintPath>
<Private>True</Private>
</Reference>
<Reference Include="System" />
<Reference Include="System.Core" />
<Reference Include="System.Drawing" />
</ItemGroup>
<ItemGroup>
<Compile Include="..\CNTKLibraryCSEvalCPUOnlyExamples\CntkBitmapExtensions.cs">
<Link>CntkBitmapExtensions.cs</Link>
</Compile>
<Compile Include="..\CNTKLibraryCSEvalCPUOnlyExamples\CNTKLibraryCSEvalExamples.cs">
<Link>CNTKLibraryCSEvalExamples.cs</Link>
</Compile>
<Compile Include="Program.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
</ItemGroup>
<ItemGroup>
<None Include="App.config" />
<None Include="packages.config" />
</ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
<Import Project="..\packages\CNTK.GPU.2.0-beta9\build\net45\CNTK.GPU.targets" Condition="Exists('..\packages\CNTK.GPU.2.0-beta9\build\net45\CNTK.GPU.targets')" />
<Target Name="EnsureNuGetPackageBuildImports" BeforeTargets="PrepareForBuild">
<PropertyGroup>
<ErrorText>This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}.</ErrorText>
</PropertyGroup>
<Error Condition="!Exists('..\packages\CNTK.GPU.2.0-beta9\build\net45\CNTK.GPU.targets')" Text="$([System.String]::Format('$(ErrorText)', '..\packages\CNTK.GPU.2.0-beta9\build\net45\CNTK.GPU.targets'))" />
</Target>
<!-- To modify your build process, add your task inside one of the targets below and uncomment it.
Other similar extension points exist, see Microsoft.Common.targets.
<Target Name="BeforeBuild">
</Target>
<Target Name="AfterBuild">
</Target>
-->
</Project>


@@ -0,0 +1,25 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// Program.cs -- Example for using CNTK Library C# Eval GPU Nuget Package.
//
using System;
using CNTK;
namespace CNTKLibraryCSEvalExamples
{
class Program
{
static void Main(string[] args)
{
Console.WriteLine("======== Evaluate model using C# GPU Build ========");
CNTKLibraryManagedExamples.EvaluationSingleImage(DeviceDescriptor.GPUDevice(0));
CNTKLibraryManagedExamples.EvaluationBatchOfImages(DeviceDescriptor.GPUDevice(0));
Console.WriteLine("======== Evaluation completes. ========");
}
}
}


@@ -0,0 +1,36 @@
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
// General Information about an assembly is controlled through the following
// set of attributes. Change these attribute values to modify the information
// associated with an assembly.
[assembly: AssemblyTitle("CSharpEvalGPUExamples")]
[assembly: AssemblyDescription("")]
[assembly: AssemblyConfiguration("")]
[assembly: AssemblyCompany("")]
[assembly: AssemblyProduct("CSharpEvalGPUExamples")]
[assembly: AssemblyCopyright("Copyright © 2017")]
[assembly: AssemblyTrademark("")]
[assembly: AssemblyCulture("")]
// Setting ComVisible to false makes the types in this assembly not visible
// to COM components. If you need to access a type in this assembly from
// COM, set the ComVisible attribute to true on that type.
[assembly: ComVisible(false)]
// The following GUID is for the ID of the typelib if this project is exposed to COM
[assembly: Guid("307e5bac-da03-45d2-adec-fe6620090109")]
// Version information for an assembly consists of the following four values:
//
// Major Version
// Minor Version
// Build Number
// Revision
//
// You can specify all the values or you can default the Build and Revision Numbers
// by using the '*' as shown below:
// [assembly: AssemblyVersion("1.0.*")]
[assembly: AssemblyVersion("1.0.0.0")]
[assembly: AssemblyFileVersion("1.0.0.0")]


@@ -0,0 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="CNTK.GPU" version="2.0-beta9" targetFramework="net45" />
</packages>


@@ -0,0 +1,33 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio 14
VisualStudioVersion = 14.0.25420.1
MinimumVisualStudioVersion = 10.0.40219.1
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CNTKLibraryCPPEvalExamples", "CNTKLibraryCPPEvalExamples\CNTKLibraryCPPEvalExamples.vcxproj", "{D771A06D-CC25-4582-B5CD-D2A4782BB005}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CNTKLibraryCSEvalCPUOnlyExamples", "CNTKLibraryCSEvalCPUOnlyExamples\CNTKLibraryCSEvalCPUOnlyExamples.csproj", "{8AAD7322-10B1-48C3-9BC7-005A7910C5E6}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CNTKLibraryCSEvalGPUExamples", "CNTKLibraryCSEvalGPUExamples\CNTKLibraryCSEvalGPUExamples.csproj", "{307E5BAC-DA03-45D2-ADEC-FE6620090109}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
Release|x64 = Release|x64
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{D771A06D-CC25-4582-B5CD-D2A4782BB005}.Debug|x64.ActiveCfg = Release|x64
{D771A06D-CC25-4582-B5CD-D2A4782BB005}.Release|x64.ActiveCfg = Release|x64
{D771A06D-CC25-4582-B5CD-D2A4782BB005}.Release|x64.Build.0 = Release|x64
{8AAD7322-10B1-48C3-9BC7-005A7910C5E6}.Debug|x64.ActiveCfg = Debug|x64
{8AAD7322-10B1-48C3-9BC7-005A7910C5E6}.Debug|x64.Build.0 = Debug|x64
{8AAD7322-10B1-48C3-9BC7-005A7910C5E6}.Release|x64.ActiveCfg = Release|x64
{8AAD7322-10B1-48C3-9BC7-005A7910C5E6}.Release|x64.Build.0 = Release|x64
{307E5BAC-DA03-45D2-ADEC-FE6620090109}.Debug|x64.ActiveCfg = Debug|x64
{307E5BAC-DA03-45D2-ADEC-FE6620090109}.Debug|x64.Build.0 = Debug|x64
{307E5BAC-DA03-45D2-ADEC-FE6620090109}.Release|x64.ActiveCfg = Release|x64
{307E5BAC-DA03-45D2-ADEC-FE6620090109}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
EndGlobal


@@ -50,7 +50,7 @@
</PropertyGroup>
<ItemGroup>
<Reference Include="EvalWrapper, Version=0.0.0.0, Culture=neutral, PublicKeyToken=52681d72504348ec, processorArchitecture=AMD64">
<HintPath>..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta7\lib\net45\x64\EvalWrapper.dll</HintPath>
<HintPath>..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta9\lib\net45\x64\EvalWrapper.dll</HintPath>
<Private>True</Private>
</Reference>
<Reference Include="System" />
@@ -59,7 +59,9 @@
<Reference Include="System.Drawing" />
</ItemGroup>
<ItemGroup>
<Compile Include="CntkBitmapExtensions.cs" />
<Compile Include="..\CNTKLibraryCSEvalCPUOnlyExamples\CntkBitmapExtensions.cs">
<Link>CntkBitmapExtensions.cs</Link>
</Compile>
<Compile Include="ModelEvaluator.cs" />
<Compile Include="Program.cs" />
<Compile Include="Properties\AssemblyInfo.cs" />
@@ -86,11 +88,11 @@
</BootstrapperPackage>
</ItemGroup>
<Import Project="$(MSBuildToolsPath)\Microsoft.CSharp.targets" />
<Import Project="..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta7\build\net45\Microsoft.Research.CNTK.CpuEval-mkl.targets" Condition="Exists('..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta7\build\net45\Microsoft.Research.CNTK.CpuEval-mkl.targets')" />
<Import Project="..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta9\build\net45\Microsoft.Research.CNTK.CpuEval-mkl.targets" Condition="Exists('..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta9\build\net45\Microsoft.Research.CNTK.CpuEval-mkl.targets')" />
<Target Name="EnsureNuGetPackageBuildImports" BeforeTargets="PrepareForBuild">
<PropertyGroup>
<ErrorText>This project references NuGet package(s) that are missing on this computer. Use NuGet Package Restore to download them. For more information, see http://go.microsoft.com/fwlink/?LinkID=322105. The missing file is {0}.</ErrorText>
</PropertyGroup>
<Error Condition="!Exists('..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta7\build\net45\Microsoft.Research.CNTK.CpuEval-mkl.targets')" Text="$([System.String]::Format('$(ErrorText)', '..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta7\build\net45\Microsoft.Research.CNTK.CpuEval-mkl.targets'))" />
<Error Condition="!Exists('..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta9\build\net45\Microsoft.Research.CNTK.CpuEval-mkl.targets')" Text="$([System.String]::Format('$(ErrorText)', '..\packages\Microsoft.Research.CNTK.CpuEval-mkl.2.0-beta9\build\net45\Microsoft.Research.CNTK.CpuEval-mkl.targets'))" />
</Target>
</Project>
</Project>


@@ -1,214 +0,0 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// CntkBitmapExtensions.cs -- extension methods for transforming images used in CNTK.
//
using System;
using System.Collections.Generic;
using System.Drawing;
using System.Drawing.Imaging;
using System.Linq;
using System.Runtime.InteropServices;
using System.Threading.Tasks;
namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
{
public static class CntkBitmapExtensions
{
/// <summary>
/// Resizes an image
/// </summary>
/// <param name="image">The image to resize</param>
/// <param name="width">New width in pixels</param>
/// <param name="height">New height in pixesl</param>
/// <param name="useHighQuality">Resize quality</param>
/// <returns>The resized image</returns>
public static Bitmap Resize(this Bitmap image, int width, int height, bool useHighQuality)
{
var newImg = new Bitmap(width, height);
newImg.SetResolution(image.HorizontalResolution, image.VerticalResolution);
using (var g = Graphics.FromImage(newImg))
{
g.CompositingMode = System.Drawing.Drawing2D.CompositingMode.SourceCopy;
if (useHighQuality)
{
g.InterpolationMode = System.Drawing.Drawing2D.InterpolationMode.HighQualityBicubic;
g.CompositingQuality = System.Drawing.Drawing2D.CompositingQuality.HighQuality;
g.SmoothingMode = System.Drawing.Drawing2D.SmoothingMode.HighQuality;
g.PixelOffsetMode = System.Drawing.Drawing2D.PixelOffsetMode.HighQuality;
}
else
{
g.InterpolationMode = System.Drawing.Drawing2D.InterpolationMode.Default;
g.CompositingQuality = System.Drawing.Drawing2D.CompositingQuality.Default;
g.SmoothingMode = System.Drawing.Drawing2D.SmoothingMode.Default;
g.PixelOffsetMode = System.Drawing.Drawing2D.PixelOffsetMode.Default;
}
var attributes = new ImageAttributes();
attributes.SetWrapMode(System.Drawing.Drawing2D.WrapMode.TileFlipXY);
g.DrawImage(image, new Rectangle(0, 0, width, height), 0, 0, image.Width, image.Height, GraphicsUnit.Pixel, attributes);
}
return newImg;
}
/// <summary>
/// Extracts image pixels in CHW
/// </summary>
/// <param name="image">The bitmap image to extract features from</param>
/// <returns>A list of pixels in HWC order</returns>
public static List<float> ExtractCHW(this Bitmap image)
{
var features = new List<float>(image.Width * image.Height * 3);
for (int c = 0; c < 3; c++)
{
for (int h = 0; h < image.Height; h++)
{
for (int w = 0; w < image.Width; w++)
{
var pixel = image.GetPixel(w, h);
float v = c == 0 ? pixel.B : c == 1 ? pixel.G : pixel.R;
features.Add(v);
}
}
}
return features;
}
/// <summary>
/// Extracts image pixels in CHW using parallelization
/// </summary>
/// <param name="image">The bitmap image to extract features from</param>
/// <returns>A list of pixels in CHW order</returns>
public static List<float> ParallelExtractCHW(this Bitmap image)
{
// We use local variables to avoid contention on the image object across the multiple threads.
int channelStride = image.Width * image.Height;
int imageWidth = image.Width;
int imageHeight = image.Height;
var features = new byte[imageWidth * imageHeight * 3];
var bitmapData = image.LockBits(new Rectangle(0, 0, imageWidth, imageHeight), ImageLockMode.ReadOnly, image.PixelFormat);
IntPtr ptr = bitmapData.Scan0;
int bytes = Math.Abs(bitmapData.Stride) * bitmapData.Height;
byte[] rgbValues = new byte[bytes];
int stride = bitmapData.Stride;
// Copy the RGB values into the array.
System.Runtime.InteropServices.Marshal.Copy(ptr, rgbValues, 0, bytes);
// The mapping depends on the pixel format
// The mapPixel lambda will return the right color channel for the desired pixel
Func<int, int, int, int> mapPixel = GetPixelMapper(image.PixelFormat, stride);
// Averaged over a large number of images, these loops here execute fastest
// when doing Parallel.For only over c, but not over h and w.
Parallel.For(0, 3, (int c) =>
{
for (int h = 0; h < imageHeight; h++)
{
for (int w = 0; w < imageWidth; w++)
{
features[channelStride * c + imageWidth * h + w] = rgbValues[mapPixel(h, w, c)];
}
}
});
image.UnlockBits(bitmapData);
return features.Select(b => (float)b).ToList();
}
/// <summary>
/// Extracts image pixels in HWC
/// </summary>
/// <param name="image">The bitmap image to extract features from</param>
/// <returns>A list of pixels in HWC order</returns>
public static List<float> ExtractHWC(this Bitmap image)
{
var features = new List<float>(image.Width * image.Height * 3);
for (int w = 0; w < image.Width; w++)
{
for (int h = 0; h < image.Height; h++)
{
for (int c = 0; c < 3; c++)
{
var pixel = image.GetPixel(w, h);
float v = c == 0 ? pixel.B : c == 1 ? pixel.G : pixel.R;
features.Add(v);
}
}
}
return features;
}
/// <summary>
/// Extracts image pixels in HWC using multiple threads
/// </summary>
/// <param name="image">The bitmap image to extract features from</param>
/// <returns>A list of pixels in HWC order</returns>
public static List<float> ParallelExtractHWC(this Bitmap image)
{
int heightStride = image.Width * 3;
int widthStride = image.Height * 3;
int imageWidth = image.Width;
int imageHeight = image.Height;
var features = new byte[image.Width * image.Height * 3];
var bitmapData = image.LockBits(new Rectangle(0, 0, image.Width, image.Height), ImageLockMode.ReadOnly, image.PixelFormat);
IntPtr ptr = bitmapData.Scan0;
int bytes = Math.Abs(bitmapData.Stride) * bitmapData.Height;
byte[] rgbValues = new byte[bytes];
int stride = bitmapData.Stride;
// Copy the RGB values into the array.
System.Runtime.InteropServices.Marshal.Copy(ptr, rgbValues, 0, bytes);
// The mapping depends on the pixel format
// The mapPixel lambda will return the right color channel for the desired pixel
Func<int, int, int, int> mapPixel = GetPixelMapper(image.PixelFormat, stride);
Parallel.For(0, 3, (int c) =>
{
for (int h = 0; h < imageHeight; h++)
{
for (int w = 0; w < imageWidth; w++)
{
features[w * widthStride + h * 3 + c] = rgbValues[mapPixel(h, w, c)];
};
};
});
image.UnlockBits(bitmapData);
return features.Select(b => (float)b).ToList();
}
/// <summary>
/// Returns a function for extracting the R-G-B values properly from an image based on its pixel format
/// </summary>
/// <param name="pixelFormat">The image's pixel format</param>
/// <param name="heightStride">The stride (row byte count)</param>
/// <returns>A function with signature (height, width, channel) returning the corresponding color value</returns>
private static Func<int, int, int, int> GetPixelMapper(PixelFormat pixelFormat, int heightStride)
{
switch (pixelFormat)
{
case PixelFormat.Format32bppArgb:
return (h, w, c) => h * heightStride + w * 4 + c; // bytes are B-G-R-A
case PixelFormat.Format24bppRgb:
default:
return (h, w, c) => h * heightStride + w * 3 + c; // bytes are B-G-R
}
}
}
}


@@ -14,7 +14,7 @@ using System.Linq;
using System.Linq.Expressions;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.MSR.CNTK.Extensibility.Managed;
using CNTKLibraryCSEvalExamples;
namespace Microsoft.MSR.CNTK.Extensibility.Managed.CSEvalClient
{


@@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<packages>
<package id="Microsoft.Research.CNTK.CpuEval-mkl" version="2.0-beta7" targetFramework="net45" />
</packages>
<package id="Microsoft.Research.CNTK.CpuEval-mkl" version="2.0-beta9" targetFramework="net45" />
</packages>


@@ -124,9 +124,9 @@ Train = {
transforms = (
{
type = "Crop"
cropType = "random"
cropRatio = 0.88671875
jitterType = "uniRatio"
cropType = "RandomSide"
sideRatio = 0.88671875
jitterType = "UniRatio"
}:{
type = "Scale"
width = 227
@@ -167,7 +167,7 @@ Test = {
{
type = "Crop"
cropType = "center"
cropRatio = 0.88671875
sideRatio = 0.88671875
}:{
type = "Scale"
width = 227


@@ -94,7 +94,7 @@ TrainConvNet = {
file = "$dataDir$/train_map.txt"
input = {
features = { transforms = (
{ type = "Crop" ; cropType = "random" ; cropRatio = 0.8 ; jitterType = "uniRatio" } :
{ type = "Crop" ; cropType = "RandomSide" ; sideRatio = 0.8 ; jitterType = "UniRatio" } :
{ type = "Scale" ; width = 32 ; height = 32 ; channels = 3 ; interpolations = "linear" } :
{ type = "Mean" ; meanFile = "$dataDir$/CIFAR-10_mean.xml" } :
{ type = "Transpose" }


@@ -72,7 +72,7 @@ TrainConvNet = {
file = "$dataDir$/train_map.txt"
input = {
features = { transforms = (
{ type = "Crop" ; cropType = "random" ; cropRatio = 0.8 ; jitterType = "uniRatio" } :
{ type = "Crop" ; cropType = "RandomSide" ; sideRatio = 0.8 ; jitterType = "UniRatio" } :
{ type = "Scale" ; width = 32 ; height = 32 ; channels = 3 ; interpolations = "linear" } :
{ type = "Mean" ; meanFile = "$dataDir$/CIFAR-10_mean.xml" } :
{ type = "Transpose" }


@@ -0,0 +1,166 @@
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================
from __future__ import print_function
import os
import math
import numpy as np
import cntk
import _cntk_py
# Paths relative to current python file.
abs_path = os.path.dirname(os.path.abspath(__file__))
data_path = os.path.join(abs_path, "..", "..", "..", "DataSets", "CIFAR-10")
model_path = os.path.join(abs_path, "Models")
# model dimensions
image_height = 32
image_width = 32
num_channels = 3 # RGB
num_classes = 10
# Define the reader for both training and evaluation action.
def create_reader(map_file, mean_file, is_training):
if not os.path.exists(map_file) or not os.path.exists(mean_file):
raise RuntimeError("File '%s' or '%s' does not exist. Please run install_cifar10.py from DataSets/CIFAR-10 to fetch them" %
(map_file, mean_file))
# transformation pipeline for the features has jitter/crop only when training
transforms = []
if is_training:
transforms += [
cntk.io.ImageDeserializer.crop(crop_type='RandomSide', side_ratio=0.8, jitter_type='uniRatio') # train uses jitter
]
transforms += [
cntk.io.ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
cntk.io.ImageDeserializer.mean(mean_file)
]
# deserializer
return cntk.io.MinibatchSource(cntk.io.ImageDeserializer(map_file, cntk.io.StreamDefs(
features = cntk.io.StreamDef(field='image', transforms=transforms), # first column in map file is referred to as 'image'
labels = cntk.io.StreamDef(field='label', shape=num_classes))), # and second as 'label'
randomize=is_training)
# Local Response Normalization layer. See Section 3.3 of the paper:
# https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf
# The mathematical equation is:
# b_{x,y}^i=a_{x,y}^i/(k+\alpha\sum_{j=max(0,i-n)}^{min(N-1, i+n)}(a_{x,y}^j)^2)^\beta
# where a_{x,y}^i is the activity of a neuron computed by applying kernel i at position (x,y),
# N is the total number of kernels, and n is half the normalization width.
def LRN(k, n, alpha, beta):
x = cntk.blocks.Placeholder(name='lrn_arg')
x2 = cntk.ops.square(x)
# reshape to insert a fake singleton reduction dimension after the 3rd axis (channel axis). Note that the Python axis order is the reverse of BrainScript's.
x2s = cntk.ops.reshape(x2, (1, cntk.InferredDimension), 0, 1)
W = cntk.ops.constant(alpha/(2*n+1), (1,2*n+1,1,1), name='W')
# 3D convolution with a filter that is larger than 1 only in the 3rd axis; it does not reduce, since the reduction dimension is the fake singleton of size 1
y = cntk.ops.convolution (W, x2s)
# reshape back to remove the fake singleton reduction dimension
b = cntk.ops.reshape(y, cntk.InferredDimension, 0, 2)
den = cntk.ops.exp(beta * cntk.ops.log(k + b))
apply_x = cntk.ops.element_divide(x, den)
return cntk.blocks.Block(apply_x, 'LRN')
# Train and evaluate the network.
def convnetlrn_cifar10_dataaug(reader_train, reader_test, epoch_size=50000, max_epochs = 80):
_cntk_py.set_computation_network_trace_level(1)
# Input variables denoting the features and label data
input_var = cntk.ops.input_variable((num_channels, image_height, image_width))
label_var = cntk.ops.input_variable((num_classes))
# apply model to input
scaled_input = cntk.ops.element_times(cntk.ops.constant(0.00390625), input_var)
with cntk.layers.default_options (activation=cntk.ops.relu, pad=True):
z = cntk.models.Sequential([
cntk.models.LayerStack(2, lambda : [
cntk.layers.Convolution((3,3), 64),
cntk.layers.Convolution((3,3), 64),
LRN (1.0, 4, 0.001, 0.75),
cntk.layers.MaxPooling((3,3), (2,2))
]),
cntk.models.LayerStack(2, lambda i: [
cntk.layers.Dense([256,128][i]),
cntk.layers.Dropout(0.5)
]),
cntk.layers.Dense(num_classes, activation=None)
])(scaled_input)
# loss and metric
ce = cntk.ops.cross_entropy_with_softmax(z, label_var)
pe = cntk.ops.classification_error(z, label_var)
# training config
minibatch_size = 64
# Set learning parameters
lr_per_sample = [0.0015625]*20 + [0.00046875]*20 + [0.00015625]*20 + [0.000046875]*10 + [0.000015625]
lr_schedule = cntk.learning_rate_schedule(lr_per_sample, unit=cntk.learner.UnitType.sample, epoch_size=epoch_size)
mm_time_constant = [0]*20 + [600]*20 + [1200]
mm_schedule = cntk.learner.momentum_as_time_constant_schedule(mm_time_constant, epoch_size=epoch_size)
l2_reg_weight = 0.002
# trainer object
learner = cntk.learner.momentum_sgd(z.parameters, lr_schedule, mm_schedule,
unit_gain = True,
l2_regularization_weight = l2_reg_weight)
trainer = cntk.Trainer(z, ce, pe, learner)
# define mapping from reader streams to network inputs
input_map = {
input_var: reader_train.streams.features,
label_var: reader_train.streams.labels
}
cntk.utils.log_number_of_parameters(z) ; print()
progress_printer = cntk.utils.ProgressPrinter(tag='Training')
# perform model training
for epoch in range(max_epochs): # loop over epochs
sample_count = 0
while sample_count < epoch_size: # loop over minibatches in the epoch
data = reader_train.next_minibatch(min(minibatch_size, epoch_size-sample_count), input_map=input_map) # fetch minibatch.
trainer.train_minibatch(data) # update model with it
sample_count += trainer.previous_minibatch_sample_count # count samples processed so far
progress_printer.update_with_trainer(trainer, with_metric=True) # log progress
progress_printer.epoch_summary(with_metric=True)
z.save_model(os.path.join(model_path, "ConvNet_CIFAR10_DataAug_{}.dnn".format(epoch)))
### Evaluation action
epoch_size = 10000
minibatch_size = 16
# process minibatches and evaluate the model
metric_numer = 0
metric_denom = 0
sample_count = 0
minibatch_index = 0
while sample_count < epoch_size:
current_minibatch = min(minibatch_size, epoch_size - sample_count)
# Fetch the next test minibatch.
data = reader_test.next_minibatch(current_minibatch, input_map=input_map)
# Evaluate the minibatch and accumulate the error metric.
metric_numer += trainer.test_minibatch(data) * current_minibatch
metric_denom += current_minibatch
# Keep track of the number of samples processed so far.
sample_count += data[label_var].num_samples
minibatch_index += 1
print("")
print("Final Results: Minibatch[1-{}]: errs = {:0.2f}% * {}".format(minibatch_index+1, (metric_numer*100.0)/metric_denom, metric_denom))
print("")
return metric_numer/metric_denom
if __name__=='__main__':
reader_train = create_reader(os.path.join(data_path, 'train_map.txt'), os.path.join(data_path, 'CIFAR-10_mean.xml'), True)
reader_test = create_reader(os.path.join(data_path, 'test_map.txt'), os.path.join(data_path, 'CIFAR-10_mean.xml'), False)
convnetlrn_cifar10_dataaug(reader_train, reader_test)


@@ -73,7 +73,8 @@ def convnet_cifar10(debug_output=False):
l2_reg_weight = 0.002
# Instantiate the trainer object to drive the model training
learner = cntk.learner.momentum_sgd(z.parameters, lr_schedule, mm_schedule, l2_regularization_weight = l2_reg_weight)
learner = cntk.learner.momentum_sgd(z.parameters, lr_schedule, mm_schedule,
l2_regularization_weight = l2_reg_weight)
trainer = cntk.Trainer(z, (ce, pe), learner)
# define mapping from reader streams to network inputs


@@ -46,17 +46,17 @@ def create_reader(map_file, mean_file, is_training):
transforms = []
if is_training:
transforms += [
ImageDeserializer.crop(crop_type='Random', ratio=0.8, jitter_type='uniRatio') # train uses jitter
cntk.io.ImageDeserializer.crop(crop_type='randomside', side_ratio=0.8, jitter_type='uniratio') # train uses jitter
]
transforms += [
ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
ImageDeserializer.mean(mean_file)
]
# deserializer
return MinibatchSource(ImageDeserializer(map_file, StreamDefs(
features = StreamDef(field='image', transforms=transforms), # first column in map file is referred to as 'image'
labels = StreamDef(field='label', shape=num_classes) # and second as 'label'
)), randomize=is_training, epoch_size = INFINITELY_REPEAT if is_training else FULL_DATA_SWEEP)
return cntk.io.MinibatchSource(cntk.io.ImageDeserializer(map_file, cntk.io.StreamDefs(
features = cntk.io.StreamDef(field='image', transforms=transforms), # first column in map file is referred to as 'image'
labels = cntk.io.StreamDef(field='label', shape=num_classes))), # and second as 'label'
randomize=is_training, epoch_size = INFINITELY_REPEAT if is_training else FULL_DATA_SWEEP)
########################
# define the model #


@@ -12,6 +12,9 @@ import numpy as np
import cntk
import _cntk_py
from cntk.utils import *
from cntk.distributed import data_parallel_distributed_learner, Communicator
# default Paths relative to current python file.
abs_path = os.path.dirname(os.path.abspath(__file__))
data_path = os.path.join(abs_path, "..", "..", "..", "DataSets", "CIFAR-10")
@@ -24,8 +27,8 @@ image_width = 32
num_channels = 3 # RGB
num_classes = 10
# Define the reader for both training and evaluation action.
def create_reader(map_file, mean_file, train, total_number_of_samples, distributed_after=cntk.io.INFINITE_SAMPLES):
# Create a minibatch source.
def create_image_mb_source(map_file, mean_file, train, total_number_of_samples):
if not os.path.exists(map_file) or not os.path.exists(mean_file):
raise RuntimeError("File '%s' or '%s' does not exist. Please run install_cifar10.py from DataSets/CIFAR-10 to fetch them" %
(map_file, mean_file))
@@ -34,7 +37,7 @@ def create_reader(map_file, mean_file, train, total_number_of_samples, distribut
transforms = []
if train:
transforms += [
cntk.io.ImageDeserializer.crop(crop_type='Random', ratio=0.8, jitter_type='uniRatio') # train uses jitter
cntk.io.ImageDeserializer.crop(crop_type='randomside', side_ratio=0.8, jitter_type='uniratio') # train uses jitter
]
transforms += [
@@ -48,19 +51,17 @@ def create_reader(map_file, mean_file, train, total_number_of_samples, distribut
features = cntk.io.StreamDef(field='image', transforms=transforms), # first column in map file is referred to as 'image'
labels = cntk.io.StreamDef(field='label', shape=num_classes))), # and second as 'label'
epoch_size=total_number_of_samples,
multithreaded_deserializer = False, # turn off omp as CIFAR-10 is not heavy for deserializer
distributed_after = distributed_after)
multithreaded_deserializer = True)
# Train and evaluate the network.
def convnet_cifar10_dataaug(create_train_reader, test_reader, create_dist_learner, max_epochs=80, log_to_file=None, num_mbs_per_log=None, gen_heartbeat=False):
_cntk_py.set_computation_network_trace_level(0)
# Create the network.
def create_conv_network():
# Input variables denoting the features and label data
input_var = cntk.ops.input_variable((num_channels, image_height, image_width))
feature_var = cntk.ops.input_variable((num_channels, image_height, image_width))
label_var = cntk.ops.input_variable((num_classes))
# apply model to input
scaled_input = cntk.ops.element_times(cntk.ops.constant(0.00390625), input_var)
scaled_input = cntk.ops.element_times(cntk.ops.constant(0.00390625), feature_var)
with cntk.layers.default_options(activation=cntk.ops.relu, pad=True):
z = cntk.models.Sequential([
@@ -80,55 +81,49 @@ def convnet_cifar10_dataaug(create_train_reader, test_reader, create_dist_learne
ce = cntk.ops.cross_entropy_with_softmax(z, label_var)
pe = cntk.ops.classification_error(z, label_var)
# training config
epoch_size = 50000 # for now we manually specify epoch size
minibatch_size = 64
cntk.utils.log_number_of_parameters(z) ; print()
return {
'feature': feature_var,
'label': label_var,
'ce' : ce,
'pe' : pe,
'output': z
}
# Create trainer
def create_trainer(network, epoch_size, num_quantization_bits):
# Set learning parameters
lr_per_sample = [0.0015625]*20 + [0.00046875]*20 + [0.00015625]*20 + [0.000046875]*10 + [0.000015625]
lr_schedule = cntk.learning_rate_schedule(lr_per_sample, unit=cntk.learner.UnitType.sample, epoch_size=epoch_size)
mm_time_constant = [0]*20 + [600]*20 + [1200]
mm_schedule = cntk.learner.momentum_as_time_constant_schedule(mm_time_constant, epoch_size=epoch_size)
l2_reg_weight = 0.002
# Create learner
learner = data_parallel_distributed_learner(
cntk.learner.momentum_sgd(network['output'].parameters, lr_schedule, mm_schedule, l2_regularization_weight=l2_reg_weight),
num_quantization_bits=num_quantization_bits,
distributed_after=0)
# trainer object
learner = create_dist_learner(
cntk.learner.momentum_sgd(z.parameters, lr_schedule, mm_schedule, l2_regularization_weight=l2_reg_weight))
# Create trainer
return cntk.Trainer(network['output'], network['ce'], network['pe'], learner)
trainer = cntk.Trainer(z, ce, pe, learner)
# Train and test
def train_and_test(network, trainer, train_source, test_source, progress_printer, epoch_size):
total_number_of_samples = max_epochs * epoch_size
train_reader = create_train_reader(total_number_of_samples)
# define mapping from reader streams to network inputs
# define mapping from input streams to network inputs
input_map = {
input_var: train_reader.streams.features,
label_var: train_reader.streams.labels
network['feature']: train_source.streams.features,
network['label']: train_source.streams.labels
}
cntk.utils.log_number_of_parameters(z) ; print()
progress_printer = cntk.utils.ProgressPrinter(
freq=num_mbs_per_log,
tag='Training',
log_to_file=log_to_file,
distributed_learner=learner,
gen_heartbeat=gen_heartbeat,
num_epochs=max_epochs)
training_session = cntk.training_session(train_source, trainer,
cntk.minibatch_size_schedule(64), progress_printer, input_map, "ConvNet_CIFAR10_DataAug_", epoch_size)
training_session.train()
# perform model training
updated=True
epoch=0
while updated:
data = train_reader.next_minibatch(minibatch_size, input_map=input_map) # fetch minibatch.
updated = trainer.train_minibatch(data) # update model with it
progress_printer.update_with_trainer(trainer, with_metric=True) # log progress
current_epoch = int(trainer.total_number_of_samples_seen/epoch_size)
if epoch != current_epoch:
progress_printer.epoch_summary(with_metric=True)
epoch = current_epoch
trainer.save_checkpoint(os.path.join(model_path, "ConvNet_CIFAR10_DataAug_{}.dnn".format(epoch)))
### TODO: Stay tuned for an upcoming simpler EvalSession API for test/validation.
### Evaluation action
minibatch_size = 16
@@ -139,9 +134,9 @@ def convnet_cifar10_dataaug(create_train_reader, test_reader, create_dist_learne
minibatch_index = 0
while True:
data = test_reader.next_minibatch(minibatch_size, input_map=input_map)
data = test_source.next_minibatch(minibatch_size, input_map=input_map)
if not data: break
local_mb_samples=data[label_var].num_samples
local_mb_samples=data[network['label']].num_samples
metric_numer += trainer.test_minibatch(data) * local_mb_samples
metric_denom += local_mb_samples
minibatch_index += 1
@@ -156,7 +151,28 @@ def convnet_cifar10_dataaug(create_train_reader, test_reader, create_dist_learne
return metric_numer/metric_denom
# Train and evaluate the network.
def convnet_cifar10_dataaug(train_data, test_data, mean_data, num_quantization_bits=32, epoch_size = 50000, max_epochs=80, log_to_file=None, num_mbs_per_log=None, gen_heartbeat=False):
_cntk_py.set_computation_network_trace_level(0)
progress_printer = ProgressPrinter(
freq=num_mbs_per_log,
tag='Training',
log_to_file=log_to_file,
rank=Communicator.rank(),
gen_heartbeat=gen_heartbeat,
num_epochs=max_epochs)
network = create_conv_network()
trainer = create_trainer(network, epoch_size, num_quantization_bits)
train_source = create_image_mb_source(train_data, mean_data, train=True, total_number_of_samples=max_epochs * epoch_size)
test_source = create_image_mb_source(test_data, mean_data, train=False, total_number_of_samples=cntk.io.FULL_DATA_SWEEP)
train_and_test(network, trainer, train_source, test_source, progress_printer, epoch_size)
if __name__=='__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-datadir', help='only interested in changes to that file');
@@ -174,21 +190,9 @@ if __name__=='__main__':
if args['outputdir'] != None:
model_path = args['outputdir'] + "/models"
distributed_after_samples = 0
num_quantization_bits = 32
create_dist_learner = \
lambda learner: cntk.distributed.data_parallel_distributed_learner(learner,
num_quantization_bits=num_quantization_bits,
distributed_after=distributed_after_samples)
mean=os.path.join(data_path, 'CIFAR-10_mean.xml')
mean_data=os.path.join(data_path, 'CIFAR-10_mean.xml')
train_data=os.path.join(data_path, 'train_map.txt')
test_data=os.path.join(data_path, 'test_map.txt')
create_train_reader = lambda data_size: create_reader(train_data, mean, True, data_size, distributed_after_samples)
test_reader = create_reader(test_data, mean, False, cntk.io.FULL_DATA_SWEEP)
convnet_cifar10_dataaug(create_train_reader, test_reader, create_dist_learner, log_to_file=log_dir, num_mbs_per_log=10, gen_heartbeat=False)
cntk.distributed.Communicator.finalize()
convnet_cifar10_dataaug(train_data, test_data, mean_data, num_quantization_bits=32, max_epochs=80, log_to_file=log_dir, num_mbs_per_log=10)
Communicator.finalize()


@@ -37,9 +37,19 @@ Run the example from the current folder using:
We use a fixed crop ratio of `0.8` and scale the image to `32x32` pixels for training. Since all training images are pre-padded to `40x40` pixels, we effectively perform only a translation transform, without scaling. The error rate of the network on test data is around `14%`, which is a lot better than the previous model.
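As a rough sketch, these training transforms correspond to the reader code shown elsewhere in this commit (the `40x40` pre-padding is done during dataset preparation, not by these transforms):

```python
import cntk

# Sketch of the train-time transforms, not the shipped script:
# side_ratio=0.8 of a 40x40 pre-padded image gives a 32x32 crop, so after
# scaling (back) to 32x32 the random crop acts as a pure translation.
transforms = [
    cntk.io.ImageDeserializer.crop(crop_type='RandomSide', side_ratio=0.8,
                                   jitter_type='uniRatio'),
    cntk.io.ImageDeserializer.scale(width=32, height=32, channels=3,
                                    interpolations='linear')
]
```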
### ConvNetLRN_CIFAR10_DataAug.py
The fourth example adds local response normalization (LRN) to the previous example. LRN was used in [AlexNet](../../AlexNet), but it has since lost popularity. We implement LRN with a 3D convolution.
Run the example from the current folder using:
`python ConvNetLRN_CIFAR10_DataAug.py`
All settings are identical to the previous example. The accuracy of the network on test data is slightly better (`0.1-0.2%`) than in the previous example.
### ConvNet_CIFAR10_DataAug_Distributed.py
The fourth example uses the same CNN as ConvNet_CIFAR10_DataAug.py, but it adds support for distributed training with simple aggregation. For a reference on distributed training, please check [here](https://github.com/Microsoft/CNTK/wiki/Multiple-GPUs-and-machines).
The fifth example uses the same CNN as ConvNet_CIFAR10_DataAug.py, but it adds support for distributed training with simple aggregation. For a reference on distributed training, please check [here](https://github.com/Microsoft/CNTK/wiki/Multiple-GPUs-and-machines).
Note that [this example](./ConvNet_CIFAR10_DataAug_Distributed.py) supports a CPU-only build.
`mpiexec -n <#workers> python ConvNet_CIFAR10_DataAug_Distributed.py`
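For orientation, here is a minimal sketch of the distributed wiring this example uses, with names taken from the Python diff earlier in this commit; `z`, `ce`, `pe`, `lr_schedule`, and `mm_schedule` are assumed to be defined as in the non-distributed script:

```python
import cntk
from cntk.distributed import data_parallel_distributed_learner, Communicator

# Wrap the local learner so gradients are aggregated across MPI workers
# (a sketch under the assumptions above, not the shipped script).
learner = data_parallel_distributed_learner(
    cntk.learner.momentum_sgd(z.parameters, lr_schedule, mm_schedule),
    num_quantization_bits=32,  # 32 bits = plain aggregation, no 1-bit SGD
    distributed_after=0)       # distributed training from the first sample
trainer = cntk.Trainer(z, ce, pe, learner)
# ... training loop as usual; every MPI worker must call finalize() on exit:
Communicator.finalize()
```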


@@ -0,0 +1,47 @@
#
# BN-Inception network definition
# Details are in https://arxiv.org/pdf/1502.03167v3.pdf
#
BN_Inception(input, labelDim, bnScale, bnTimeConst) =
{
# 224 x 224 x 3
conv1 = ConvBNReLULayer{64, (7:7), (2:2), true, bnScale, bnTimeConst}(input)
# 112 x 112 x 64
pool1 = MaxPoolingLayer{(3:3), stride = (2:2), pad = true}(conv1)
# 56 x 56 x 64
conv2a = ConvBNReLULayer{64, (1:1), (1:1), true, bnScale, bnTimeConst}(pool1)
# 56 x 56 x 64
conv2b = ConvBNReLULayer{192, (3:3), (1:1), true, bnScale, bnTimeConst}(conv2a)
# 56 x 56 x 192
pool2 = MaxPoolingLayer{(3:3), stride = (2:2), pad = true}(conv2b)
# Inception Blocks
# 28 x 28 x 192
inception3a = InceptionWithAvgPoolLayer{64, 64, 64, 64, 96, 32, bnScale, bnTimeConst}(pool2)
# 28 x 28 x 256
inception3b = InceptionWithAvgPoolLayer{64, 64, 96, 64, 96, 64, bnScale, bnTimeConst}(inception3a)
# 28 x 28 x 320
inception3c = InceptionPassThroughLayer{0, 128, 160, 64, 96, 0, bnScale, bnTimeConst}(inception3b)
# 14 x 14 x 576
inception4a = InceptionWithAvgPoolLayer{224, 64, 96, 96, 128, 128, bnScale, bnTimeConst}(inception3c)
# 14 x 14 x 576
inception4b = InceptionWithAvgPoolLayer{192, 96, 128, 96, 128, 128, bnScale, bnTimeConst}(inception4a)
# 14 x 14 x 576
inception4c = InceptionWithAvgPoolLayer{160, 128, 160, 128, 160, 128, bnScale, bnTimeConst}(inception4b)
# 14 x 14 x 576
inception4d = InceptionWithAvgPoolLayer{96, 128, 192, 160, 192, 128, bnScale, bnTimeConst}(inception4c)
# 14 x 14 x 576
inception4e = InceptionPassThroughLayer{0, 128, 192, 192, 256, 0, bnScale, bnTimeConst}(inception4d)
# 7 x 7 x 1024
inception5a = InceptionWithAvgPoolLayer{352, 192, 320, 160, 224, 128, bnScale, bnTimeConst}(inception4e)
# 7 x 7 x 1024
inception5b = InceptionWithMaxPoolLayer{352, 192, 320, 192, 224, 128, bnScale, bnTimeConst}(inception5a)
# Global Average
# 7 x 7 x 1024
pool3 = AveragePoolingLayer{(7:7)}(inception5b)
# 1 x 1 x 1024
z = LinearLayer{labelDim, init = 'heNormal'}(pool3)
}

View file

@ -0,0 +1,153 @@
#
# BN-Inception network
# Details are in https://arxiv.org/pdf/1502.03167v3.pdf
#
RootDir = "."
ParentDir = ".."
ConfigDir = "$RootDir$"
DataDir = "$ParentDir$/Data"
OutputDir = "$ParentDir$/Output"
ModelDir = "$OutputDir$/Models"
MeanDir = "$ConfigDir$"
stderr = "$OutputDir$/BN-Inception"
precision = "float"
deviceId = "Auto"
command = Train:Eval
parallelTrain = "true"
traceLevel = 1
numMBsToShowResult = 100
###################
# TRAINING CONFIG #
###################
Train = [
action = "train"
modelPath = "$ModelDir$/BN-Inception"
BrainScriptNetworkBuilder = {
include "InceptionLayers.bs"
include "BN-Inception.bs"
imageShape = 224:224:3 # image dimensions
labelDim = 1000 # number of distinct labels
bnScale = 1
bnTimeConst = 4096
# inputs
features = Input {imageShape}
labels = Input {labelDim}
# apply model to features
model = BN_Inception(features, labelDim, bnScale, bnTimeConst)
z = model.z
# connect to system
ce = CrossEntropyWithSoftmax (labels, z)
errs = ClassificationError (labels, z)
top5Errs = ClassificationError (labels, z, topN = 5)
# define special nodes
featureNodes = (features)
labelNodes = (labels)
criterionNodes = (ce)
evaluationNodes = (errs : top5Errs)
outputNodes = (z)
}
SGD = [
epochSize = 0
minibatchSize = 256 # 8 GPUs
learningRatesPerMB = 3.6*2:3.384
momentumPerMB = 0.9
maxEpochs = 300
gradUpdateType = "None"
L2RegWeight = 0.0001
numMBsToShowResult = 100
autoAdjust = [
autoAdjustLR = "adjustAfterEpoch"
reduceLearnRateIfImproveLessThan = 1000
learnRateAdjustInterval = 2
learnRateDecreaseFactor = 0.94
loadBestModel = false
]
ParallelTrain = [
parallelizationMethod = "DataParallelSGD"
distributedMBReading = "true"
parallelizationStartEpoch = 1
DataParallelSGD = [
gradientBits = 32
]
]
]
reader = [
readerType = "ImageReader"
file = "$DataDir$/train_map.txt"
randomize = "Auto"
features = [
width = 224
height = 224
channels = 3
cropType = "Random"
cropRatio = 0.46666:0.875
jitterType = "UniRatio"
meanFile = "$MeanDir$/ImageNet1K_mean.xml"
]
labels = [
labelDim = 1000
]
]
cvreader = [
readerType = "ImageReader"
file = "$DataDir$/val_map.txt"
randomize = "None"
features = [
width = 224
height = 224
channels = 3
cropType = "Center"
cropRatio = 0.875
meanFile = "$MeanDir$/ImageNet1K_mean.xml"
]
labels = [
labelDim = 1000
]
]
]
Eval = [
action = "test"
modelPath = "$ModelDir$/BN-Inception"
evalNodeNames = errs:top5Errs # also test top-5 error rate
minibatchSize = 256
reader = [
readerType = "ImageReader"
file = "$DataDir$/val_map.txt"
randomize = "None"
features = [
width = 224
height = 224
channels = 3
cropType = "Center"
cropRatio = 0.875
meanFile = "$MeanDir$/ImageNet1K_mean.xml"
]
labels = [
labelDim = 1000
]
]
]
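Reading the `SGD` block above: the explicit `learningRatesPerMB` gives 3.6 for two epochs and 3.384 thereafter, and `autoAdjust` then cuts the rate by a factor of 0.94 every 2 epochs (the `reduceLearnRateIfImproveLessThan = 1000` threshold makes the reduction effectively unconditional). A quick Python sketch of the resulting per-epoch rates, under that reading:

```python
# Approximate the per-epoch learning rate produced by this config.
rate, rates = 3.384, [3.6, 3.6]        # learningRatesPerMB = 3.6*2:3.384
while len(rates) < 300:                # maxEpochs = 300
    rates.extend([rate, rate])         # learnRateAdjustInterval = 2
    rate *= 0.94                       # learnRateDecreaseFactor = 0.94
print([round(r, 5) for r in rates[:6]])
# [3.6, 3.6, 3.384, 3.384, 3.18096, 3.18096]
```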

View file

@ -0,0 +1,88 @@
#
# BN-Inception network components
# Details are in https://arxiv.org/pdf/1502.03167v3.pdf
#
ConvBNReLULayer {outChannels, kernel, stride, pad, bnScale, bnTimeConst} = Sequential(
ConvolutionalLayer{outChannels, kernel, init = 'heNormal', stride = stride, pad = pad, bias = false} :
BatchNormalizationLayer{spatialRank = 2, normalizationTimeConstant = bnTimeConst, initialScale = bnScale} :
ReLU
)
InceptionWithAvgPoolLayer {num1x1, num3x3r, num3x3, num3x3dblr, num3x3dbl, numPool, bnScale, bnTimeConst} = {
apply(x) = {
# 1x1 Convolution
branch1x1 = ConvBNReLULayer{num1x1, (1:1), (1:1), true, bnScale, bnTimeConst}(x)
# 3x3 Convolution
branch3x3 = Sequential(
ConvBNReLULayer{num3x3r, (1:1), (1:1), true, bnScale, bnTimeConst} :
ConvBNReLULayer{num3x3, (3:3), (1:1), true, bnScale, bnTimeConst}
) (x)
# Double 3x3 Convolution
branch3x3dbl = Sequential(
ConvBNReLULayer{num3x3dblr, (1:1), (1:1), true, bnScale, bnTimeConst} :
ConvBNReLULayer{num3x3dbl, (3:3), (1:1), true, bnScale, bnTimeConst} :
ConvBNReLULayer{num3x3dbl, (3:3), (1:1), true, bnScale, bnTimeConst}
) (x)
# Average Pooling
branch_pool = Sequential(
AveragePoolingLayer{(3:3), stride = (1:1), pad = true} :
ConvBNReLULayer{numPool, (1:1), (1:1), true, bnScale, bnTimeConst}
) (x)
out = Splice((branch1x1:branch3x3:branch3x3dbl:branch_pool), axis=3)
}.out
}.apply
InceptionWithMaxPoolLayer {num1x1, num3x3r, num3x3, num3x3dblr, num3x3dbl, numPool, bnScale, bnTimeConst} = {
apply(x) = {
# 1x1 Convolution
branch1x1 = ConvBNReLULayer{num1x1, (1:1), (1:1), true, bnScale, bnTimeConst}(x)
# 3x3 Convolution
branch3x3 = Sequential(
ConvBNReLULayer{num3x3r, (1:1), (1:1), true, bnScale, bnTimeConst} :
ConvBNReLULayer{num3x3, (3:3), (1:1), true, bnScale, bnTimeConst}
) (x)
# Double 3x3 Convolution
branch3x3dbl = Sequential(
ConvBNReLULayer{num3x3dblr, (1:1), (1:1), true, bnScale, bnTimeConst} :
ConvBNReLULayer{num3x3dbl, (3:3), (1:1), true, bnScale, bnTimeConst} :
ConvBNReLULayer{num3x3dbl, (3:3), (1:1), true, bnScale, bnTimeConst}
) (x)
# Max Pooling
branch_pool = Sequential(
MaxPoolingLayer{(3:3), stride=(1:1), pad=true} :
ConvBNReLULayer{numPool, (1:1), (1:1), true, bnScale, bnTimeConst}
) (x)
out = Splice((branch1x1:branch3x3:branch3x3dbl:branch_pool), axis=3)
}.out
}.apply
InceptionPassThroughLayer {num1x1, num3x3r, num3x3, num3x3dblr, num3x3dbl, numPool, bnScale, bnTimeConst} = {
apply(x) = {
# 3x3 Convolution
branch3x3 = Sequential(
ConvBNReLULayer{num3x3r, (1:1), (1:1), true, bnScale, bnTimeConst} :
ConvBNReLULayer{num3x3, (3:3), (2:2), true, bnScale, bnTimeConst}
) (x)
# Double 3x3 Convolution
branch3x3dbl = Sequential(
ConvBNReLULayer{num3x3dblr, (1:1), (1:1), true, bnScale, bnTimeConst} :
ConvBNReLULayer{num3x3dbl, (3:3), (1:1), true, bnScale, bnTimeConst} :
ConvBNReLULayer{num3x3dbl, (3:3), (2:2), true, bnScale, bnTimeConst}
) (x)
# Max Pooling
branch_pool = MaxPoolingLayer{(3:3), stride=(2:2), pad=true}(x)
out = Splice((branch3x3:branch3x3dbl:branch_pool), axis=3)
}.out
}.apply
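Since every inception layer splices its branches along the channel axis (`axis=3`), the output depth is just the sum of the branch depths, which is how the `# 28 x 28 x 256`-style comments in BN-Inception.bs can be verified. A small check in Python:

```python
# Output depth of an inception block = sum of its spliced branch depths.
def inception_out_channels(*branch_depths):
    return sum(branch_depths)

# inception3a{64, 64, 64, 64, 96, 32}: branches end with 64, 64, 96, 32 maps.
assert inception_out_channels(64, 64, 96, 32) == 256    # "28 x 28 x 256"
# inception3c (pass-through): 160 + 96 + the 320 input maps carried by pooling.
assert inception_out_channels(160, 96, 320) == 576      # "14 x 14 x 576"
print("channel arithmetic matches the comments")
```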

View file

@ -0,0 +1,24 @@
# CNTK Examples: Image/Classification/GoogLeNet/BN-Inception
## Overview
|Data: |The ILSVRC2012 dataset (http://www.image-net.org/challenges/LSVRC/2012/) for image classification.
|:---------|:---
|Purpose |This folder contains examples that demonstrate how to use CNTK to define BN-Inception (https://arxiv.org/abs/1502.03167) for image classification.
|Network |Deep convolutional neural networks codenamed "Inception" (GoogLeNet) with Batch Normalization.
|Training |Stochastic gradient descent with momentum.
|Comments |See below.
## Running the example
### Getting the data
We use the ILSVRC2012 datasets to demonstrate how to train a BN-Inception network. BN-Inception was first described by researchers at Google in the Batch Normalization paper (https://arxiv.org/abs/1502.03167), where it demonstrates the power of Batch Normalization through minor changes to the original GoogLeNet. It has been shown to train faster and reach better accuracy than GoogLeNet v1, which is well known for winning first place in the [ILSVRC](http://www.image-net.org/challenges/LSVRC/) 2014 detection challenge.
ILSVRC2012 datasets are not included in the CNTK distribution. You may obtain them through http://image-net.org.
## Details
We currently offer the BN-Inception model (https://arxiv.org/abs/1502.03167). Only the BrainScript version is available at the moment.
### [BrainScript](./BrainScript)

View file

@ -1,13 +1,16 @@
#
# This file contains the basic build block of Inception Network as defined in:
#
# https://arxiv.org/pdf/1512.00567.pdf
#
# and in the TensorFlow implementation
#
#
# Convolution layer with Batch Normalization and Rectified Linear (ReLU) activation.
#
ConvBNReLULayer {numOutputChannels, filterShape, stride, pad = true, bnTimeConst = 4096} = Sequential(
ConvolutionalLayer {numOutputChannels, filterShape, init = "heNormal", stride = stride, pad = pad, bias = false} :
ConvolutionalLayer {numOutputChannels, filterShape, init = "glorotUniform", stride = stride, pad = pad, bias = false} :
BatchNormalizationLayer {spatialRank = 2, normalizationTimeConstant = bnTimeConst, useCntkEngine = false} :
ReLU
)

View file

@ -1,8 +1,11 @@
#
# Inception V3 model from:
#
# https://arxiv.org/pdf/1512.00567.pdf
#
# and in the TensorFlow implementation
#
InceptionV3(input, labelDim, bnTimeConst) =
{
# 299 x 299 x 3
@ -21,14 +24,15 @@ InceptionV3(input, labelDim, bnTimeConst) =
pool_2 = MaxPoolingLayer{(3:3), stride = (2:2), pad = false}(conv_5)
# 35 x 35 x 192
#
# Inception Blocks
# 35 x 35 x 256
#
mixed_1 = InceptionBlock1{64, (48:64), (64:96:96), 32, bnTimeConst}(pool_2)
# 35 x 35 x 288
# 35 x 35 x 256
mixed_2 = InceptionBlock1{64, (48:64), (64:96:96), 64, bnTimeConst}(mixed_1)
# 35 x 35 x 288
mixed_3 = InceptionBlock1{64, (48:64), (64:96:96), 64, bnTimeConst}(mixed_2)
# 17 x 17 x 768
# 35 x 35 x 288
mixed_4 = InceptionBlock2{384, (64:96:96), bnTimeConst}(mixed_3)
# 17 x 17 x 768
mixed_5 = InceptionBlock3{192, (128:128:192), (128:128:128:128:192), 192, bnTimeConst}(mixed_4)
@ -38,28 +42,47 @@ InceptionV3(input, labelDim, bnTimeConst) =
mixed_7 = InceptionBlock3{192, (160:160:192), (160:160:160:160:192), 192, bnTimeConst}(mixed_6)
# 17 x 17 x 768
mixed_8 = InceptionBlock3{192, (192:192:192), (192:192:192:192:192), 192, bnTimeConst}(mixed_7)
# 8 x 8 x 1280
# 17 x 17 x 768
mixed_9 = InceptionBlock4{(192:320), (192:192:192:192), bnTimeConst}(mixed_8)
# 8 x 8 x 2048
# 17 x 17 x 1280
mixed_10 = InceptionBlock5{320, (384:384:384), (448:384:384:384), 192, bnTimeConst}(mixed_9)
# 8 x 8 x 2048
mixed_11 = InceptionBlock5{320, (384:384:384), (448:384:384:384), 192, bnTimeConst}(mixed_10)
# 8 x 8 x 2048
# Global average
#
# Prediction
#
pool_3 = AveragePoolingLayer{(8:8), pad = false}(mixed_11)
# 1 x 1 x 2048
drop = Dropout(pool_3)
# 1 x 1 x 2048
z = DenseLayer{labelDim}(drop)
z = LinearLayer{labelDim}(drop)
#
# Auxiliary
# 8 x 8 x 1280
aux_pool_1 = AveragePoolingLayer{(5:5), pad = false}(mixed_8)
# 3 x 3 x 1280
#
# 17 x 17 x 768
aux_pool_1 = AveragePoolingLayer{(5:5), stride = (3:3), pad = false}(mixed_8)
# 5 x 5 x 768
aux_conv_1 = ConvBNReLULayer{128, (1:1), (1:1), pad=true, bnTimeConst = bnTimeConst}(aux_pool_1)
# 3 x 3 x 128
aux_conv_2 = ConvBNReLULayer{768, (3:3), (1:1), pad=false, bnTimeConst = bnTimeConst}(aux_conv_1)
aux = DenseLayer{labelDim}(aux_conv_2)
# 5 x 5 x 128
aux_conv_2 = ConvBNReLULayer{768, (5:5), (1:1), pad=false, bnTimeConst = bnTimeConst}(aux_conv_1)
# 1 x 1 x 768
aux = LinearLayer{labelDim}(aux_conv_2)
}
#
# Inception V3 model with normalized input, to use the below function
# remove "ImageNet1K_mean.xml" from each reader.
#
InceptionV3Norm(input, labelDim, bnTimeConst) =
{
# Normalize inputs to -1 and 1.
featMean = 128
featScale = 1/128
Normalize{m,f} = x => f .* (x - m)
inputNorm = Normalize{featMean, featScale}(input)
model = InceptionV3(inputNorm, labelDim, bnTimeConst)
}.model
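`Normalize{m,f}` is a plain affine rescaling; the same mapping of byte-valued pixels into roughly `[-1, 1]` looks like this in NumPy (minimal sketch):

```python
import numpy as np

def normalize(x, mean=128.0, scale=1.0 / 128.0):
    # f .* (x - m): shift by the mean, then rescale, mapping [0, 255] to ~[-1, 1].
    return scale * (x - mean)

pixels = np.array([0, 128, 255], dtype=np.float32)
print(normalize(pixels))  # [-1.  0.  0.9921875]
```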

View file

@ -7,19 +7,19 @@ command = Train:Eval
deviceId = "Auto"
precision = "float"
traceLevel = 1
#traceLevel = 1
#perfTraceLevel = 1
parallelTrain = true
RootDir = "."
ConfigDir = "$RootDir$"
ImageNetDir = "$ConfigDir$"
DataDir = "$RootDir$"
OutputDir = "$RootDir$/Output"
ModelDir = "$OutputDir$/Models"
modelPath = "$ModelDir$/InceptionV3"
#stderr = "$OutputDir$/InceptionV3.log"
ModelDir = "$OutputDir$/Model"
stderr = "$OutputDir$/InceptionV3.log"
modelPath = "$ModelDir$/InceptionV3.model"
ImageH = 299
ImageW = 299
ImageC = 3
@ -27,7 +27,7 @@ NumLabels = 1000
Train = {
action = "train"
traceLevel = 1
BrainScriptNetworkBuilder = {
include "$ConfigDir$/InceptionBlocks.bs"
include "$ConfigDir$/InceptionV3.bs"
@ -35,16 +35,16 @@ Train = {
imageShape = $ImageH$:$ImageW$:$ImageC$
labelDim = $NumLabels$
bnTimeConst = 4096
auxWeight = Constant(0.4)
auxWeight = Constant(0.3)
# inputs
features = Input {imageShape}
labels = Input {labelDim}
# apply model to features
model = InceptionV3(features, labelDim, bnTimeConst)
z = model.z
aux = model.aux
model = InceptionV3Norm(features, labelDim, bnTimeConst)
z = model.z
aux = model.aux
# connect to system
ceAux = CrossEntropyWithSoftmax (labels, aux)
@ -61,52 +61,60 @@ Train = {
}
SGD = {
epochSize = 256000
maxEpochs = 1
minibatchSize = 128 # 16 GPU
epochSize = 0
maxEpochs = 160
minibatchSize = 512 # 16 GPUs, 32 per GPU.
dropoutRate = 0.2
learningRatesPerMB = 1
momentumAsTimeConstant = 4096
#momentumPerMB = 0.9
gradUpdateType = "rmsProp"
normWithAveMultiplier = true
rms_wgt_inc = 1.2
rms_wgt_dec = 0.75
rms_wgt_max = 10.0
rms_wgt_min = 0.1
rms_gamma = 0.9
learningRatesPerMB = 3.2*10: 1.6*10: 0.8*10: 0.4*10: 0.2*10: 0.1*10: 0.05*10: 0.025*10: 0.0125*10: 0.00625*10: 0.003125*10: 0.0015625*10: 0.00078125*10: 0.000390625*10: 0.0001953125
momentumPerMB = 0.9
disableRegInBatchNormalization = true
numMBsToShowResult = 20
parallelTrain = {
parallelizationMethod = "dataParallelSGD"
parallelizationStartEpoch = 1
distributedMBReading = true
dataParallelSGD = {
gradientBits = 32
gradientBits = 32
}
}
firstMBsToShowResult = 10 ; numMBsToShowResult = 500
}
reader = {
verbosity = 0 ; randomize = true
deserializers = ({
type = "ImageDeserializer" ; module = "ImageReader"
file = "$DataDir$/val_map.txt"
file = "$DataDir$/train_map.txt"
input = {
features = { transforms = (
{ type = "Crop" ; cropType = "random" ; cropRatio = 0.8 ; jitterType = "uniRatio" } :
{ type = "Crop" ; cropType = "randomArea" ; areaRatio = 0.08:1.0 ; jitterType = "uniRatio" ; aspectRatio = 0.75:1.0 } :
{ type = "Scale" ; width = $ImageW$ ; height = $ImageH$ ; channels = $ImageC$ ; interpolations = "linear" } :
{ type = "Mean" ; meanFile = "$ConfigDir$/ImageNet1K_mean.xml" } :
{ type = "Color" ; brightnessRadius = 0.2 ; contrastRadius = 0.2 ; saturationRadius = 0.4 } :
{ type = "Transpose" }
)}
labels = { labelDim = $NumLabels$ }
}
})
}
cvreader = {
verbosity = 0 ; randomize = false
deserializers = ({
type = "ImageDeserializer" ; module = "ImageReader"
file = "$DataDir$/val_map.txt"
input = {
features = { transforms = (
{ type = "Crop" ; cropType = "Center" ; sideRatio = 0.875 } :
{ type = "Scale" ; width = $ImageW$ ; height = $ImageH$ ; channels = $ImageC$ ; interpolations = "linear" } :
{ type = "Transpose" }
)}
labels = { labelDim = $NumLabels$ }
}
})
}
}
# Eval action
@ -123,8 +131,8 @@ Eval = {
file = "$DataDir$/val_map.txt"
input = {
features = { transforms = (
{ type = "Crop" ; cropType = "Center" ; sideRatio = 0.875 } :
{ type = "Scale" ; width = $ImageW$ ; height = $ImageH$ ; channels = $ImageC$ ; interpolations = "linear" } :
{ type = "Mean"; meanFile = "$ConfigDir$/ImageNet1K_mean.xml" } :
{ type = "Transpose" }
)}
labels = { labelDim = $NumLabels$ }
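The updated `learningRatesPerMB` schedule above is just 3.2 halved every 10 epochs down to a final tail value; a sketch that regenerates the exact string, assuming that reading:

```python
# Regenerate learningRatesPerMB: start at 3.2 per MB, halve every 10 epochs.
rate, pieces = 3.2, []
for _ in range(14):
    pieces.append("%s*10" % rate)
    rate /= 2
pieces.append(str(rate))                  # final open-ended rate
print(": ".join(pieces))
# -> 3.2*10: 1.6*10: 0.8*10: ... : 0.0001953125
```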

View file

@ -1,4 +1,4 @@
# CNTK Examples: Image/Classification/GoogLeNet
# CNTK Examples: Image/Classification/GoogLeNet/InceptionV3
## BrainScript

View file

@ -0,0 +1,24 @@
# CNTK Examples: Image/Classification/GoogLeNet/InceptionV3
## Overview
|Data: |The ILSVRC2012 dataset (http://www.image-net.org/challenges/LSVRC/2012/) for image classification.
|:---------|:---
|Purpose |This folder contains examples that demonstrate how to use CNTK to define Inception V3 (https://arxiv.org/abs/1512.00567) for image classification.
|Network |Deep convolutional neural networks codenamed "Inception" (GoogLeNet) version 3.
|Training |RMSProp.
|Comments |See below.
## Running the example
### Getting the data
We use the ILSVRC2012 datasets to demonstrate how to train an Inception V3 network. Inception V3 was initially published by researchers at Google and is fine-tuned for excellent classification accuracy at low computational cost. Its original version, GoogLeNet, won first place in the [ILSVRC](http://www.image-net.org/challenges/LSVRC/) 2014 detection challenge.
ILSVRC2012 datasets are not included in the CNTK distribution. You may obtain them through http://image-net.org.
## Details
We currently offer the Inception V3 model, published in December 2015 (https://arxiv.org/abs/1512.00567). Only the BrainScript version is available at the moment.
### [BrainScript](./BrainScript)

View file

@ -4,9 +4,9 @@
|Data: |The ILSVRC2012 dataset (http://www.image-net.org/challenges/LSVRC/2012/) for image classification.
|:---------|:---
|Purpose |This folder contains examples that demonstrate how to use CNTK to define GoogLeNet (https://arxiv.org/abs/1409.4842) for image classification.
|Purpose |This folder contains examples that demonstrate how to use CNTK to define GoogLeNet (https://arxiv.org/abs/1409.4842) and its derivations for image classification.
|Network |Deep convolutional neural networks codenamed "Inception" (GoogLeNet).
|Training |RMSProp.
|Training |See the details.
|Comments |See below.
## Running the example
@ -19,6 +19,8 @@ ILSVRC2012 datasets are not included in the CNTK distribution. You may obtain it
## Details
We currently offer the Inception V3 model, published in December 2015 (https://arxiv.org/abs/1512.00567). Only the BrainScript version is available at the moment.
We currently offer the BN-Inception (https://arxiv.org/abs/1502.03167) and Inception V3 (https://arxiv.org/abs/1512.00567) models.
### [BrainScript](./BrainScript)
### [BN-Inception](./BN-Inception)
### [Inception V3](./InceptionV3)

View file

@ -111,9 +111,9 @@ TrainNetwork = {
width = 224
height = 224
channels = 3
cropType = "Random"
cropType = "RandomSide"
jitterType = "UniRatio"
cropRatio = 0.46666:0.875
sideRatio = 0.46666:0.875
hflip = true
meanFile = "$meanDir$/ImageNet1K_mean.xml"
}
@ -131,7 +131,7 @@ TrainNetwork = {
height = 224
channels = 3
cropType = "Center"
cropRatio = 0.875
sideRatio = 0.875
meanFile = "$meanDir$/ImageNet1K_mean.xml"
}
labels = {
@ -156,9 +156,9 @@ BNStatistics = {
width = 224
height = 224
channels = 3
cropType = "Random"
cropType = "RandomSide"
hflip = true
cropRatio = 0.46666:0.875
sideRatio = 0.46666:0.875
jitterType = "UniRatio"
meanFile = "$meanDir$/ImageNet1K_mean.xml"
}
@ -183,7 +183,7 @@ Eval = {
height = 224
channels = 3
cropType = "Center"
cropRatio = 0.875
sideRatio = 0.875
meanFile = "$meanDir$/ImageNet1K_mean.xml"
}
labels = {

View file

@ -83,7 +83,7 @@ TrainConvNet = {
file = "$dataDir$/train_map.txt"
input = {
features = { transforms = (
{ type = "Crop" ; cropType = "random" ; cropRatio = 0.8 ; jitterType = "uniRatio" } :
{ type = "Crop" ; cropType = "RandomSide" ; sideRatio = 0.8 ; jitterType = "UniRatio" } :
{ type = "Scale" ; width = 32 ; height = 32 ; channels = 3 ; interpolations = "linear" } :
{ type = "Mean" ; meanFile = "$dataDir$/CIFAR-10_mean.xml" } :
{ type = "Transpose" }

View file

@ -111,9 +111,9 @@ TrainNetwork = {
width = 224
height = 224
channels = 3
cropType = "Random"
cropType = "RandomSide"
jitterType = "UniRatio"
cropRatio = 0.46666:0.875
sideRatio = 0.46666:0.875
hflip = true
meanFile = "$meanDir$/ImageNet1K_mean.xml"
}
@ -131,7 +131,7 @@ TrainNetwork = {
height = 224
channels = 3
cropType = "Center"
cropRatio = 0.875
sideRatio = 0.875
meanFile = "$meanDir$/ImageNet1K_mean.xml"
}
labels = {
@ -156,9 +156,9 @@ BNStatistics = {
width = 224
height = 224
channels = 3
cropType = "Random"
cropType = "RandomSide"
hflip = true
cropRatio = 0.46666:0.875
sideRatio = 0.46666:0.875
jitterType = "UniRatio"
meanFile = "$meanDir$/ImageNet1K_mean.xml"
}
@ -183,7 +183,7 @@ Eval = {
height = 224
channels = 3
cropType = "Center"
cropRatio = 0.875
sideRatio = 0.875
meanFile = "$meanDir$/ImageNet1K_mean.xml"
}
labels = {

View file

@ -83,7 +83,7 @@ TrainConvNet = {
file = "$dataDir$/train_map.txt"
input = {
features = { transforms = (
{ type = "Crop" ; cropType = "random" ; cropRatio = 0.8 ; jitterType = "uniRatio" } :
{ type = "Crop" ; cropType = "RandomSide" ; sideRatio = 0.8 ; jitterType = "UniRatio" } :
{ type = "Scale" ; width = 32 ; height = 32 ; channels = 3 ; interpolations = "linear" } :
{ type = "Mean" ; meanFile = "$dataDir$/CIFAR-10_mean.xml" } :
{ type = "Transpose" }

View file

@ -110,9 +110,9 @@ TrainNetwork = {
width = 224
height = 224
channels = 3
cropType = "Random"
cropType = "RandomSide"
jitterType = "UniRatio"
cropRatio = 0.46666:0.875
sideRatio = 0.46666:0.875
hflip = true
meanFile = "$meanDir$/ImageNet1K_mean.xml"
}
@ -130,7 +130,7 @@ TrainNetwork = {
height = 224
channels = 3
cropType = "Center"
cropRatio = 0.875
sideRatio = 0.875
meanFile = "$meanDir$/ImageNet1K_mean.xml"
}
labels = {
@ -156,9 +156,9 @@ BNStatistics = {
width = 224
height = 224
channels = 3
cropType = "Random"
cropType = "RandomSide"
hflip = true
cropRatio = 0.46666:0.875
sideRatio = 0.46666:0.875
jitterType = "UniRatio"
meanFile = "$meanDir$/ImageNet1K_mean.xml"
}
@ -183,7 +183,7 @@ Eval = {
height = 224
channels = 3
cropType = "Center"
cropRatio = 0.875
sideRatio = 0.875
meanFile = "$meanDir$/ImageNet1K_mean.xml"
}
labels = {

View file

@ -40,7 +40,7 @@ def create_reader(map_file, mean_file, train):
transforms = []
if train:
transforms += [
ImageDeserializer.crop(crop_type='Random', ratio=0.8, jitter_type='uniRatio') # train uses jitter
ImageDeserializer.crop(crop_type='randomside', side_ratio=0.8, jitter_type='uniratio') # train uses jitter
]
transforms += [
ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
@ -53,7 +53,7 @@ def create_reader(map_file, mean_file, train):
# Train and evaluate the network.
def train_and_evaluate(reader_train, reader_test, network_name, max_epochs):
def train_and_evaluate(reader_train, reader_test, network_name, epoch_size, max_epochs):
set_computation_network_trace_level(0)
@ -76,7 +76,6 @@ def train_and_evaluate(reader_train, reader_test, network_name, max_epochs):
pe = classification_error(z, label_var)
# shared training parameters
epoch_size = 50000 # for now we manually specify epoch size
minibatch_size = 128
momentum_time_constant = -minibatch_size/np.log(0.9)
l2_reg_weight = 0.0001
@ -112,7 +111,7 @@ def train_and_evaluate(reader_train, reader_test, network_name, max_epochs):
z.save_model(os.path.join(model_path, network_name + "_{}.dnn".format(epoch)))
# Evaluation parameters
epoch_size = 10000
test_epoch_size = 10000
minibatch_size = 16
# process minibatches and evaluate the model
@ -121,8 +120,8 @@ def train_and_evaluate(reader_train, reader_test, network_name, max_epochs):
sample_count = 0
minibatch_index = 0
while sample_count < epoch_size:
current_minibatch = min(minibatch_size, epoch_size - sample_count)
while sample_count < test_epoch_size:
current_minibatch = min(minibatch_size, test_epoch_size - sample_count)
# Fetch the next test minibatch.
data = reader_test.next_minibatch(current_minibatch, input_map=input_map)
# minibatch data to be trained with
@ -150,4 +149,5 @@ if __name__=='__main__':
reader_train = create_reader(os.path.join(data_path, 'train_map.txt'), os.path.join(data_path, 'CIFAR-10_mean.xml'), True)
reader_test = create_reader(os.path.join(data_path, 'test_map.txt'), os.path.join(data_path, 'CIFAR-10_mean.xml'), False)
train_and_evaluate(reader_train, reader_test, network_name, epochs)
epoch_size = 50000
train_and_evaluate(reader_train, reader_test, network_name, epoch_size, epochs)
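Pieced together from the hunks above, the evaluation pass is a sample-weighted average of per-minibatch errors; a sketch of its logic using the script's names (`reader_test`, `trainer`, `input_map`, and `label_var` as defined earlier in the file):

```python
# Sample-weighted average of trainer.test_minibatch() over the test sweep.
metric_numer, metric_denom, sample_count = 0.0, 0, 0
minibatch_size, test_epoch_size = 16, 10000
while sample_count < test_epoch_size:
    current_minibatch = min(minibatch_size, test_epoch_size - sample_count)
    data = reader_test.next_minibatch(current_minibatch, input_map=input_map)
    n = data[label_var].num_samples
    metric_numer += trainer.test_minibatch(data) * n   # mean error on this batch
    metric_denom += n
    sample_count += n
print("Final test error: {:.2f}%".format(100.0 * metric_numer / metric_denom))
```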

View file

@ -8,14 +8,15 @@ from __future__ import print_function
import os
import sys
import argparse
import math
import cntk
import numpy as np
from cntk.utils import *
from cntk.ops import input_variable, cross_entropy_with_softmax, classification_error
from cntk.io import MinibatchSource, ImageDeserializer, StreamDef, StreamDefs, INFINITE_SAMPLES, FULL_DATA_SWEEP
from cntk import Trainer, cntk_py, distributed
from cntk import Trainer, cntk_py
from cntk.learner import momentum_sgd, learning_rate_schedule, momentum_as_time_constant_schedule, UnitType
from _cntk_py import set_computation_network_trace_level
from cntk.distributed import data_parallel_distributed_learner, Communicator
from resnet_models import *
@ -24,43 +25,19 @@ abs_path = os.path.dirname(os.path.abspath(__file__))
data_path = os.path.join(abs_path, "..", "..", "..", "DataSets", "CIFAR-10")
model_path = os.path.join(abs_path, "Models")
# model dimensions
# For this example we are using the same data source as for conv net - CIFAR
sys.path.append(os.path.join(abs_path, "..", "..", "ConvNet", "Python"))
from ConvNet_CIFAR10_DataAug_Distributed import create_image_mb_source
# model dimensions - these match the ones from convnet_cifar10_dataaug
# so we can use the same data source
image_height = 32
image_width = 32
num_channels = 3 # RGB
num_classes = 10
# Define the reader for both training and evaluation action.
def create_reader(map_file, mean_file, train, total_data_size, distributed_after=INFINITE_SAMPLES):
if not os.path.exists(map_file) or not os.path.exists(mean_file):
raise RuntimeError("File '%s' or '%s' does not exist. Please run install_cifar10.py from DataSets/CIFAR-10 to fetch them" %
(map_file, mean_file))
# transformation pipeline for the features has jitter/crop only when training
transforms = []
if train:
transforms += [
ImageDeserializer.crop(crop_type='Random', ratio=0.8, jitter_type='uniRatio') # train uses jitter
]
transforms += [
ImageDeserializer.scale(width=image_width, height=image_height, channels=num_channels, interpolations='linear'),
ImageDeserializer.mean(mean_file)
]
# deserializer
return MinibatchSource(
ImageDeserializer(map_file, StreamDefs(
features = StreamDef(field='image', transforms=transforms), # first column in map file is referred to as 'image'
labels = StreamDef(field='label', shape=num_classes))), # and second as 'label'
epoch_size=total_data_size,
multithreaded_deserializer = False, # turn off omp as CIFAR-10 is not heavy for deserializer
distributed_after = distributed_after)
# Train and evaluate the network.
def train_and_evaluate(create_train_reader, test_reader, network_name, max_epochs, create_dist_learner, scale_up=False):
set_computation_network_trace_level(0)
# Create network
def create_resnet_network(network_name):
# Input variables denoting the features and label data
input_var = input_variable((num_channels, image_height, image_width))
label_var = input_variable((num_classes))
@ -68,10 +45,8 @@ def train_and_evaluate(create_train_reader, test_reader, network_name, max_epoch
# create model, and configure learning parameters
if network_name == 'resnet20':
z = create_cifar10_model(input_var, 3, num_classes)
lr_per_mb = [1.0]*80+[0.1]*40+[0.01]
elif network_name == 'resnet110':
z = create_cifar10_model(input_var, 18, num_classes)
lr_per_mb = [0.1]*1+[1.0]*80+[0.1]*40+[0.01]
else:
raise RuntimeError("Unknown model name!")
@ -79,14 +54,24 @@ def train_and_evaluate(create_train_reader, test_reader, network_name, max_epoch
ce = cross_entropy_with_softmax(z, label_var)
pe = classification_error(z, label_var)
# shared training parameters
epoch_size = 50000 # for now we manually specify epoch size
# NOTE: scaling up minibatch_size increases sample throughput. On an 8-GPU machine,
# ResNet110 samples-per-second is ~7x that of a single GPU, compared to ~3x without
# scaling up. However, a bigger minibatch size over the same number of samples means
# fewer updates, which leads to higher training error. This is a speed/accuracy trade-off.
minibatch_size = 128 * (distributed.Communicator.num_workers() if scale_up else 1)
return {
'name' : network_name,
'feature': input_var,
'label': label_var,
'ce' : ce,
'pe' : pe,
'output': z
}
# Create trainer
def create_trainer(network, minibatch_size, epoch_size, num_quantization_bits):
if network['name'] == 'resnet20':
lr_per_mb = [1.0]*80+[0.1]*40+[0.01]
elif network['name'] == 'resnet110':
lr_per_mb = [0.1]*1+[1.0]*80+[0.1]*40+[0.01]
else:
raise RuntimeError("Unknown model name!")
momentum_time_constant = -minibatch_size/np.log(0.9)
l2_reg_weight = 0.0001
@ -96,37 +81,29 @@ def train_and_evaluate(create_train_reader, test_reader, network_name, max_epoch
lr_schedule = learning_rate_schedule(lr_per_sample, epoch_size=epoch_size, unit=UnitType.sample)
mm_schedule = momentum_as_time_constant_schedule(momentum_time_constant)
# trainer object
learner = create_dist_learner(momentum_sgd(z.parameters, lr_schedule, mm_schedule,
l2_regularization_weight = l2_reg_weight))
trainer = Trainer(z, ce, pe, learner)
# learner object
local_learner = momentum_sgd(network['output'].parameters, lr_schedule, mm_schedule,
l2_regularization_weight = l2_reg_weight)
total_number_of_samples = max_epochs * epoch_size
train_reader=create_train_reader(total_number_of_samples)
learner = data_parallel_distributed_learner(learner=local_learner,
num_quantization_bits=num_quantization_bits,
distributed_after=0)
return Trainer(network['output'], network['ce'], network['pe'], learner)
# define mapping from reader streams to network inputs
# Train and test
def train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size):
# define mapping from input streams to network inputs
input_map = {
input_var: train_reader.streams.features,
label_var: train_reader.streams.labels
network['feature']: train_source.streams.features,
network['label']: train_source.streams.labels
}
log_number_of_parameters(z) ; print()
progress_printer = ProgressPrinter(tag='Training')
training_session = cntk.training_session(train_source, trainer,
cntk.minibatch_size_schedule(minibatch_size), progress_printer, input_map, "ConvNet_CIFAR10_DataAug_", epoch_size)
training_session.train()
# perform model training
current_epoch=0
updated=True
while updated:
data=train_reader.next_minibatch(minibatch_size, input_map=input_map) # fetch minibatch.
updated=trainer.train_minibatch(data) # update model with it
progress_printer.update_with_trainer(trainer, with_metric=True) # log progress
epoch_index = int(trainer.total_number_of_samples_seen/epoch_size)
if current_epoch != epoch_index: # new epoch reached
progress_printer.epoch_summary(with_metric=True)
current_epoch=epoch_index
trainer.save_checkpoint(os.path.join(model_path, network_name + "_{}.dnn".format(current_epoch)))
# Evaluation parameters
# TODO: Stay tuned for an upcoming simpler EvalSession API for test/validation.
epoch_size = 10000
minibatch_size = 16
@ -137,10 +114,10 @@ def train_and_evaluate(create_train_reader, test_reader, network_name, max_epoch
minibatch_index = 0
while True:
data = test_reader.next_minibatch(minibatch_size, input_map=input_map)
data = test_source.next_minibatch(minibatch_size, input_map=input_map)
if not data: break
local_mb_samples=data[label_var].num_samples
local_mb_samples=data[network['label']].num_samples
metric_numer += trainer.test_minibatch(data) * local_mb_samples
metric_denom += local_mb_samples
minibatch_index += 1
@ -151,6 +128,33 @@ def train_and_evaluate(create_train_reader, test_reader, network_name, max_epoch
return metric_numer/metric_denom
# Train and evaluate the network.
def resnet_cifar10(train_data, test_data, mean_data, network_name, num_quantization_bits=32, epoch_size=50000, max_epochs=160, log_to_file=None, num_mbs_per_log=None, gen_heartbeat=False, scale_up=False):
set_computation_network_trace_level(0)
# NOTE: scaling up minibatch_size increases sample throughput. On an 8-GPU machine,
# ResNet110 samples-per-second is ~7x that of a single GPU, compared to ~3x without
# scaling up. However, a bigger minibatch size over the same number of samples means
# fewer updates, which leads to higher training error. This is a speed/accuracy trade-off.
minibatch_size = 128 * (Communicator.num_workers() if scale_up else 1)
progress_printer = ProgressPrinter(
freq=num_mbs_per_log,
tag='Training',
log_to_file=log_to_file,
rank=Communicator.rank(),
gen_heartbeat=gen_heartbeat,
num_epochs=max_epochs)
network = create_resnet_network(network_name)
trainer = create_trainer(network, minibatch_size, epoch_size, num_quantization_bits)
train_source = create_image_mb_source(train_data, mean_data, train=True, total_number_of_samples=max_epochs * epoch_size)
test_source = create_image_mb_source(test_data, mean_data, train=False, total_number_of_samples=cntk.io.FULL_DATA_SWEEP)
return train_and_test(network, trainer, train_source, test_source, progress_printer, minibatch_size, epoch_size)
if __name__=='__main__':
parser = argparse.ArgumentParser()
parser.add_argument('-n', '--network', help='network type, resnet20 or resnet110', required=False, default='resnet20')
@ -168,17 +172,15 @@ if __name__=='__main__':
# Create distributed trainer factory
print("Start training: quantize_bit = {}, epochs = {}, distributed_after = {}".format(num_quantization_bits, epochs, distributed_after_samples))
create_dist_learner = lambda learner: distributed.data_parallel_distributed_learner(learner=learner,
num_quantization_bits=num_quantization_bits,
distributed_after=distributed_after_samples)
train_data=os.path.join(data_path, 'train_map.txt')
test_data=os.path.join(data_path, 'test_map.txt')
mean=os.path.join(data_path, 'CIFAR-10_mean.xml')
mean_data=os.path.join(data_path, 'CIFAR-10_mean.xml')
create_train_reader=lambda data_size: create_reader(train_data, mean, True, data_size, distributed_after_samples)
test_reader=create_reader(test_data, mean, False, FULL_DATA_SWEEP)
train_and_evaluate(create_train_reader, test_reader, network_name, epochs, create_dist_learner, scale_up)
epoch_size = 50000
resnet_cifar10(train_data, test_data, mean_data,
network_name, num_quantization_bits, epoch_size, epochs,
scale_up=scale_up)
# Must call MPI finalize when process exit
distributed.Communicator.finalize()
Communicator.finalize()
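One detail worth spelling out: `momentum_time_constant = -minibatch_size/np.log(0.9)` converts a per-minibatch momentum of 0.9 into CNTK's sample-based time constant, which keeps the effective momentum comparable when `scale_up` multiplies the minibatch size by the worker count. A quick check of the identity:

```python
import numpy as np

minibatch_size = 128
time_constant = -minibatch_size / np.log(0.9)   # as in the script
print(np.exp(-minibatch_size / time_constant))  # 0.9 by construction

# Scaled up to 8 workers: minibatch 1024, same target per-minibatch momentum.
scaled_tc = -(128 * 8) / np.log(0.9)
print(np.exp(-1024 / scaled_tc))                # still 0.9
```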

View file

@ -58,13 +58,13 @@ Train=[
channels=3
# Below are the optional parameters.
# Possible values: Center, Random. Default: Center
cropType="Random"
# Horizontal random flip, will be enabled by default if cropType=Random
cropType="RandomSide"
# Horizontal random flip; enabled by default because cropType=RandomSide
#hflip="true"
# Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
cropRatio=0.875
# Crop scale side ratio. Examples: sideRatio=0.9, sideRatio=0.7:0.9.
sideRatio=0.875
# Crop scale ratio jitter type.
# Possible values: None, UniRatio, UniLength, UniArea. Default: UniRatio
# Possible values: None, UniRatio. Default: None
jitterType="UniRatio"
# Interpolation to use when scaling image to width x height size.
# Possible values: nearest, linear, cubic, lanczos. Default: linear.
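As a mental model of the renamed options (my paraphrase of the semantics, not the reader's actual code): with `cropType=RandomSide` the square crop side is `sideRatio` times the image's shorter side, and `jitterType=UniRatio` draws that ratio uniformly from the given range. A NumPy sketch of that arithmetic:

```python
import numpy as np

def random_side_crop_size(height, width, side_ratio=(0.46666, 0.875)):
    # Assumed semantics: draw the ratio uniformly (jitterType=UniRatio),
    # then size the square crop relative to the shorter image side.
    ratio = np.random.uniform(*side_ratio)
    return int(round(ratio * min(height, width)))

print(random_side_crop_size(256, 340))  # e.g. 150 for a drawn ratio of ~0.59
```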

View file

@ -67,13 +67,13 @@ Train=[
channels=3
# Below are the optional parameters.
# Possible values: Center, Random. Default: Center
cropType="Random"
# Horizontal random flip, will be enabled by default if cropType=Random
cropType="RandomSide"
# Horizontal random flip; enabled because cropType=RandomSide
#hflip="true"
# Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
cropRatio=0.875
# Crop scale side ratio. Examples: sideRatio=0.9, sideRatio=0.7:0.9.
sideRatio=0.875
# Crop scale ratio jitter type.
# Possible values: None, UniRatio, UniLength, UniArea. Default: UniRatio
# Possible values: None, UniRatio. Default: None
jitterType="UniRatio"
# Interpolation to use when scaling image to width x height size.
# Possible values: nearest, linear, cubic, lanczos. Default: linear.

View file

@ -67,13 +67,13 @@ Train=[
channels=3
# Below are the optional parameters.
# Possible values: Center, Random. Default: Center
cropType="Random"
# Horizontal random flip, will be enabled by default if cropType=Random
cropType="RandomSide"
# Horizontal random flip; enabled because cropType=RandomSide
#hflip="true"
# Crop scale ratio. Examples: cropRatio=0.9, cropRatio=0.7:0.9. Default: 1.
cropRatio=0.875
# Crop scale side ratio. Examples: sideRatio=0.9, sideRatio=0.7:0.9.
sideRatio=0.875
# Crop scale ratio jitter type.
# Possible values: None, UniRatio, UniLength, UniArea. Default: UniRatio
# Possible values: None, UniRatio. Default: None
jitterType="UniRatio"
# Interpolation to use when scaling image to width x height size.
# Possible values: nearest, linear, cubic, lanczos. Default: linear.

View file

@ -0,0 +1,109 @@
# Parameters can be overwritten on the command line
# for example: cntk configFile=myConfigFile RootDir=../..
# For running from Visual Studio add
# currentDirectory=$(SolutionDir)/<path to corresponding data folder>
#command = trainNetwork:testNetwork:writeResults
command = trainNetwork:testNetwork
precision = "float"; traceLevel = 1 ; deviceId = "auto"
rootDir = ".." ; dataDir = "$rootDir$/DataSets/MNIST" ;
outputDir = "./Output" ;
modelPath = "$outputDir$/Models/07_Deconvolution.model"
#stderr = "$outputDir$/07_Deconvolution_bs_out.txt"
#makemode=false
# TRAINING CONFIG
trainNetwork = {
action = "train"
BrainScriptNetworkBuilder = {
cMap = 1
model = inputFeatures => {
conv1 = ConvolutionalLayer {cMap, (5:5), pad = true, activation=ReLU}(inputFeatures)
pool1 = MaxPoolingLayer {(4:4), stride=(4:4)}(conv1)
unpool1 = MaxUnpoolingLayer {(4:4), stride=(4:4)}(pool1, conv1)
deconv1 = DeconvLayer {1, (5:5), cMap, lowerPad=(2:2:0), upperPad=(2:2:0), bias=false}(unpool1)
}.deconv1
# inputs
imageShape = 28:28:1
features = Input {imageShape}
featScale = 1/256
Scale{f} = x => Constant(f) .* x
# apply model to features
f1 = Scale{featScale} (features)
z = model (f1)
# rmse loss function
f2 = Scale{featScale} (features)
err = z - f2
sqErr = err .* err
mse = ReduceMean(sqErr)
rmse = Sqrt(mse)
# declare special nodes
featureNodes = (features)
criterionNodes = (rmse)
evaluationNodes = (rmse)
outputNodes = (z)
}
SGD = {
epochSize = 60000
minibatchSize = 64
maxEpochs = 3
learningRatesPerSample = 0.00015
momentumAsTimeConstant = 600
firstMBsToShowResult = 5
numMBsToShowResult = 235
}
reader = {
readerType = "CNTKTextFormatReader"
# See DataSets/MNIST/README.md for details on getting the data (Train-28x28_cntk_text.txt).
file = "$DataDir$/Train-28x28_cntk_text.txt"
input = {
features = { dim = 784 ; format = "dense" }
labels = { dim = 10 ; format = "dense" }
}
}
}
# TEST CONFIG
testNetwork = {
action = "test"
minibatchSize = 1024 # reduce this if you run out of memory
reader = {
readerType = "CNTKTextFormatReader"
file = "$DataDir$/Test-28x28_cntk_text.txt"
input = {
features = { dim = 784 ; format = "dense" }
labels = { dim = 10 ; format = "dense" }
}
}
}
# WRITE CONFIG
writeResults = {
action = "write"
minibatchSize = 1
outputPath = "$outputDir$/decoder_output_bs.txt"
reader = {
randomize = False
readerType = "CNTKTextFormatReader"
file = "$DataDir$/Test-28x28_cntk_text.txt"
input = {
features = { dim = 784 ; format = "dense" }
labels = { dim = 10 ; format = "dense" }
}
}
}

View file

@ -0,0 +1,108 @@
# Copyright (c) Microsoft. All rights reserved.
# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================
from __future__ import print_function
import os
import numpy as np
from cntk import load_model
from cntk.ops import combine
from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs, FULL_DATA_SWEEP
from PIL import Image
from cntk import graph
# Paths relative to current python file.
abs_path = os.path.dirname(os.path.abspath(__file__))
data_path = os.path.join(abs_path, "..", "DataSets", "MNIST")
model_path = os.path.join(abs_path, "Output", "Models")
# Helper to print all node names
def print_all_node_names(model_file, is_BrainScript=True):
loaded_model = load_model(model_file)
if is_BrainScript:
loaded_model = combine([loaded_model.outputs[0]])
node_list = graph.depth_first_search(loaded_model, lambda x: x.is_output)
print("printing node information in the format")
print("node name (tensor shape)")
for node in node_list:
print(node.name, node.shape)
# Helper to save array as grayscale image
def save_as_png(val_array, img_file_name, dim=28):
img_array = val_array.reshape((dim, dim))
img_array = np.clip(img_array, 0, img_array.max())
img_array *= 255.0 / img_array.max()
img_array = np.rint(img_array).astype('uint8')
try:
os.remove(img_file_name)
except OSError:
pass
im = Image.fromarray(img_array)
im2 = im.resize((224,224))
im2.save(img_file_name)
if __name__ == '__main__':
num_objects_to_eval = 5
# define location of output, model and data and check existence
output_path = os.path.join(abs_path, "Output")
model_file = os.path.join(model_path, "07_Deconvolution.model")
data_file = os.path.join(data_path, "Test-28x28_cntk_text.txt")
if not (os.path.exists(model_file) and os.path.exists(data_file)):
print("Cannot find required data or model. "
"Please get the MNIST data set and run 'cntk configFile=07_Deconvolution.cntk' to create the model.")
exit(0)
# create minibatch source
minibatch_source = MinibatchSource(CTFDeserializer(data_file, StreamDefs(
features = StreamDef(field='features', shape=(28*28)),
labels = StreamDef(field='labels', shape=10)
)), randomize=False, epoch_size = FULL_DATA_SWEEP)
# use this to print all node names in the model
# print_all_node_names(model_file)
# load model and pick desired nodes as output
loaded_model = load_model(model_file)
output_nodes = combine(
[loaded_model.find_by_name('f1').owner,
loaded_model.find_by_name('z.p1').owner,
loaded_model.find_by_name('z').owner])
# evaluate the model and save its output
features_si = minibatch_source['features']
with open(os.path.join(output_path, "decoder_output_py.txt"), 'wb') as decoder_text_file:
with open(os.path.join(output_path, "encoder_output_py.txt"), 'wb') as encoder_text_file:
for i in range(0, num_objects_to_eval):
mb = minibatch_source.next_minibatch(1)
raw_dict = output_nodes.eval(mb[features_si])
output_dict = {}
for key in raw_dict.keys(): output_dict[key.name] = raw_dict[key]
encoder_input = output_dict['f1']
encoder_output = output_dict['z.p1']
decoder_output = output_dict['z']
in_values = (encoder_input[0,0].flatten())[np.newaxis]
enc_values = (encoder_output[0,0].flatten())[np.newaxis]
out_values = (decoder_output[0,0].flatten())[np.newaxis]
# write results as text and png
np.savetxt(decoder_text_file, out_values, fmt="%.6f")
np.savetxt(encoder_text_file, enc_values, fmt="%.6f")
save_as_png(in_values, os.path.join(output_path, "imageAutoEncoder_%s__input.png" % i))
save_as_png(out_values, os.path.join(output_path, "imageAutoEncoder_%s_output.png" % i))
# visualizing the encoding is only possible and meaningful with a single conv filter
enc_dim = 7
if(enc_values.size == enc_dim*enc_dim):
save_as_png(enc_values, os.path.join(output_path, "imageAutoEncoder_%s_encoding.png" % i), dim=enc_dim)
print("Done. Wrote output to %s" % output_path)

View file

@ -108,3 +108,15 @@ In the sixth example, we show how to train CNTK with multiple processes (GPUs) for
`mpiexec -n 2 cntk configFile=06_OneConvRegrMultiNode.cntk parallelTrain=True parallelizationMethod=DataParallelSGD`
You can change the parallelizationMethod to any of the other three options. For a more detailed guide on multi-GPU and multi-machine training, please refer to [Multiple GPUs and machines](https://github.com/Microsoft/CNTK/wiki/Multiple-GPUs-and-machines).
### 07_Deconvolution.cntk
Example number seven shows how to use deconvolution and unpooling to build a simple image autoencoder. It uses the MNIST dataset, which has a resolution of 28x28x1, encodes each image into a 7x7x1 representation using convolution and pooling, and decodes it back to the original resolution. The training criterion is root-mean-square error (RMSE). To run this example, use the following command:
`cntk configFile=07_Deconvolution.cntk`
The RMSE values for training and testing are 0.225 and 0.223, respectively. To visualize the encoded and decoded images, run the following command (from a Python CNTK environment):
`python 07_Deconvolution_Visualizer.py`
The visualizations will be stored in the `Output` folder together with text representations of the encoder and decoder outputs.
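For reference, the reported criterion is plain RMSE between the rescaled input and its reconstruction; the same computation in NumPy (minimal sketch with a toy 28x28 image):

```python
import numpy as np

def rmse(reconstruction, target):
    # Matches the BrainScript criterion: Sqrt(ReduceMean((z - f2) .* (z - f2)))
    err = reconstruction - target
    return np.sqrt(np.mean(err * err))

img = np.random.rand(28, 28).astype(np.float32)      # stand-in for a scaled MNIST image
noisy = img + 0.1 * np.random.randn(28, 28).astype(np.float32)
print(rmse(noisy, img))                              # ~0.1 for this toy example
```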

View file

@ -163,7 +163,7 @@ def train_lm(training_file):
momentum_time_constant = momentum_as_time_constant_schedule(1100)
clipping_threshold_per_sample = 5.0
gradient_clipping_with_truncation = True
learner = momentum_sgd(z.parameters, lr_per_sample, momentum_time_constant,
learner = momentum_sgd(z.parameters, lr_per_sample, momentum_time_constant,
gradient_clipping_threshold_per_sample=clipping_threshold_per_sample,
gradient_clipping_with_truncation=gradient_clipping_with_truncation)
trainer = Trainer(z, ce, errs, learner)

View file

@ -198,7 +198,7 @@ def conv3d_ucf11(train_reader, test_reader, max_epochs=30):
mm_schedule = momentum_as_time_constant_schedule(momentum_time_constant, epoch_size=epoch_size)
# Instantiate the trainer object to drive the model training
learner = momentum_sgd(z.parameters, lr_schedule, mm_schedule)
learner = momentum_sgd(z.parameters, lr_schedule, mm_schedule, True)  # 4th positional arg: presumably unit_gain
trainer = Trainer(z, ce, pe, learner)
log_number_of_parameters(z) ; print()

View file

@ -456,6 +456,7 @@ CNTKLIBRARY_COMMON_SRC =\
$(SOURCEDIR)/CNTKv2LibraryDll/Serialization.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/DistributedCommunicator.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/DistributedLearnerBase.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/TrainingSession.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/DataParallelDistributedLearner.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/proto/CNTK.pb.cc \
@ -513,7 +514,7 @@ CNTKLIBRARY_TESTS_SRC =\
$(CNTKLIBRARY_TESTS_SRC_PATH)/DeviceSelectionTests.cpp \
$(CNTKLIBRARY_TESTS_SRC_PATH)/MinibatchSourceTest.cpp \
$(CNTKLIBRARY_TESTS_SRC_PATH)/UserDefinedFunctionTests.cpp \
Examples/Evaluation/CPPEvalV2Client/EvalMultithreads.cpp \
Examples/Evaluation/CNTKLibraryCPPEvalExamples/EvalMultithreads.cpp \
CNTKLIBRARY_TESTS:=$(BINDIR)/v2librarytests
CNTKLIBRARY_TESTS_OBJ := $(patsubst %.cu, $(OBJDIR)/%.o, $(patsubst %.cpp, $(OBJDIR)/%.o, $(CNTKLIBRARY_TESTS_SRC)))
@ -633,11 +634,11 @@ $(EVAL_EXTENDED_CLIENT): $(EVAL_EXTENDED_CLIENT_OBJ) | $(EVAL_LIB) $(READER_LIBS
########################################
# Eval V2 Sample client
########################################
EVALV2_SAMPLE_CLIENT:=$(BINDIR)/cppevalv2client
EVALV2_SAMPLE_CLIENT:=$(BINDIR)/CNTKLibraryCPPEvalExamples
EVALV2_SAMPLE_CLIENT_SRC=\
$(SOURCEDIR)/../Examples/Evaluation/CPPEvalV2Client/CPPEvalV2Client.cpp \
$(SOURCEDIR)/../Examples/Evaluation/CPPEvalV2Client/EvalMultithreads.cpp
$(SOURCEDIR)/../Examples/Evaluation/CNTKLibraryCPPEvalExamples/CNTKLibraryCPPEvalExamples.cpp \
$(SOURCEDIR)/../Examples/Evaluation/CNTKLibraryCPPEvalExamples/EvalMultithreads.cpp
EVALV2_SAMPLE_CLIENT_OBJ:=$(patsubst %.cpp, $(OBJDIR)/%.o, $(EVALV2_SAMPLE_CLIENT_SRC))
@ -1132,6 +1133,7 @@ UNITTEST_NETWORK_SRC = \
$(SOURCEDIR)/../Tests/UnitTests/NetworkTests/OperatorEvaluation.cpp \
$(SOURCEDIR)/../Tests/UnitTests/NetworkTests/stdafx.cpp \
$(SOURCEDIR)/../Tests/UnitTests/NetworkTests/TestHelpers.cpp \
$(SOURCEDIR)/../Tests/UnitTests/NetworkTests/EditDistanceTests.cpp \
$(SOURCEDIR)/CNTK/ModelEditLanguage.cpp \
$(SOURCEDIR)/ActionsLib/TrainActions.cpp \
$(SOURCEDIR)/ActionsLib/EvalActions.cpp \
@ -1185,6 +1187,7 @@ UNITTEST_MATH_SRC = \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixQuantizerTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixSparseDenseInteractionsTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixLearnerTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/stdafx.cpp \
UNITTEST_MATH_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_MATH_SRC))

View file

@ -3,17 +3,44 @@
Give us feedback through these [channels](https://github.com/Microsoft/CNTK/wiki/Feedback-Channels).
# Latest news
***2017-01-20.* V 2.0 Beta 9 Release**
Highlights of this Release:
* The default Python version is now 3.5 (this affects default parameters in client installations as well as the [Runtime Images at Docker Hub](https://github.com/Microsoft/CNTK/wiki/CNTK-Docker-Containers)).
* New and updated core and Python API features.
* New Tutorials and Examples:
* Deconvolution layer and an image autoencoder example using deconvolution and unpooling ([Example **07_Deconvolution** in *Image - Getting Started*](https://github.com/Microsoft/CNTK/tree/v2.0.beta9.0/Examples/Image/GettingStarted)).
* [Basic autoencoder with MNIST data](https://github.com/Microsoft/CNTK/blob/v2.0.beta9.0/Tutorials/CNTK_105_Basic_Autoencoder_for_Dimensionality_Reduction.ipynb).
* [LSTM Timeseries with Simulated Data (Part A)](https://github.com/Microsoft/CNTK/blob/v2.0.beta9.0/Tutorials/CNTK_106A_LSTM_Timeseries_with_Simulated_Data.ipynb). (More to come in upcoming releases.)
* New [CNTK NuGet Packages](https://github.com/Microsoft/CNTK/wiki/NuGet-Package).
See more in the [Release Notes](https://github.com/Microsoft/CNTK/wiki/CNTK_2_0_beta_9_Release_Notes).
Get the Release from the [CNTK Releases page](https://github.com/Microsoft/CNTK/releases).
***2017-01-19.* V 2.0 Beta 8 Release available at Docker Hub**
CNTK V 2.0 Beta 8 Runtime packages are now available as [Public Images at Docker Hub](https://hub.docker.com/r/microsoft/cntk/).
See more on CNTK as Docker Images in this [Wiki article](https://github.com/Microsoft/CNTK/wiki/CNTK-Docker-Containers).
***2017-01-16.* V 2.0 Beta 8 Release**
Highlights of this Release:
* Support of Python versions 2.7, 3.4, and 3.5. See the [binary and source setup](https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-your-machine) instructions to find out how to select the Python version.
* New Python API features.
* New Python example [Feature extraction using a trained model in Python API](https://github.com/Microsoft/CNTK/tree/v2.0.beta8.0/Examples/Image/FeatureExtraction).
* Support of [Visual Studio 2015](https://github.com/Microsoft/CNTK/wiki/Setup-Migrate-VS13-to-VS15) for the Windows version.
* Introduction of [C# API in CNTK Evaluation Library](https://github.com/Microsoft/CNTK/wiki/CNTK-Library-Managed-API) and a new set of [CNTK NuGet Packages](https://github.com/Microsoft/CNTK/wiki/NuGet-Package).
* CNTK Runtime packages are now available as [Public Images at Docker Hub](https://github.com/Microsoft/CNTK/wiki/CNTK-Docker-Containers). (**Beta 7** is currently available; Beta 8 image availability will be announced separately in a few days.)
* Version 3 of [CNTK Custom MKL Library](https://cntk.ai/mkl/) is available.
See more in the [Release Notes](https://github.com/Microsoft/CNTK/wiki/CNTK_2_0_beta_8_Release_Notes).
Get the Release from the [CNTK Releases page](https://github.com/Microsoft/CNTK/releases).
***2017-01-10.* CNTK for Windows supports Visual Studio 2015**
***2017-01-10.*** CNTK for Windows supports Visual Studio 2015
If you pull or merge the master branch, CNTK will now require Visual Studio 2015 to build on Windows. There are two ways to move your development environment to Visual Studio 2015:
[Migrate VS2013 to VS2015](https://github.com/Microsoft/CNTK/wiki/Setup-Migrate-VS13-to-VS15):
This gives you fine-grained control over where components are installed
* [Migrate VS2013 to VS2015](https://github.com/Microsoft/CNTK/wiki/Setup-Migrate-VS13-to-VS15): This gives you fine-grained control over where components are installed
* [Script driven setup](https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-with-script-on-Windows): This gives you a mostly automated migration to Visual Studio 2015
[Script driven setup](https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-with-script-on-Windows):
This gives you a mostly automated migration to Visual Studio 2015
***2016-12-22.*** V 2.0 Beta 7 Release
***2016-12-22.* V 2.0 Beta 7 Release**
Highlights of this Release:
* Python API behaviour is changed to be more strict.
@ -26,42 +53,7 @@ and [GoogLeNet (Inception V3)](https://github.com/Microsoft/CNTK/tree/v2.0.beta7
See more in the [Release Notes](https://github.com/Microsoft/CNTK/wiki/CNTK_2_0_beta_7_Release_Notes)
Get the Release from the [CNTK Releases page](https://github.com/Microsoft/CNTK/releases)
***2016-12-13.*** V 2.0 Beta 6 Release
Highlights of this Release:
* Both Windows and Linux packages are now created using NVIDIA CUDA 8.0 toolkit.
* Linux version now supports Python 3.5 (Windows support is coming soon).
* Support for training on one-hot and sparse arrays via NumPy.
* New Examples and Tutorials: [Video action recognition](https://github.com/Microsoft/CNTK/tree/v2.0.beta6.0/Examples/Video/GettingStarted), [Finance Timeseries with Pandas/Numpy](https://github.com/Microsoft/CNTK/blob/v2.0.beta6.0/Tutorials/CNTK_104_Finance_Timeseries_Basic_with_Pandas_Numpy.ipynb), [Neural Character Language Models](https://github.com/Microsoft/CNTK/tree/v2.0.beta6.0/Examples/Text/CharacterLM/README.md)
* Stability Improvements and bug fixes.
See more in the [Release Notes](https://github.com/Microsoft/CNTK/wiki/CNTK_2_0_beta_6_Release_Notes)
Get the Release from the [CNTK Releases page](https://github.com/Microsoft/CNTK/releases)
***2016-11-25.*** V 2.0 Beta 5 Release
Highlights of this Release:
* The Windows binary packages are now created using the NVIDIA CUDA 8 toolkit, see the [release notes](https://github.com/Microsoft/CNTK/wiki/CNTK_2_0_beta_5_Release_Notes) for details. The CNTK-Linux binary packages are still built with CUDA 7.5. The Linux support for Cuda8 will follow shortly!
* Performance enhancements for evaluation of bitmap images through the new `EvaluateRgbImage` function in the [managed Eval API](https://github.com/Microsoft/CNTK/wiki/Managed-EvalDLL-API).
* A new version of the [CNTK Nuget package](https://github.com/Microsoft/CNTK/wiki/NuGet-Package) is available.
* Stability improvements and bug fixes, e.g. a decreased memory footprint in the CNTK Text Format deserializer.
* We continue to improve documentation and tutorials on an ongoing basis, in this release we added a [Sequence-to-Sequence tutorial](https://github.com/Microsoft/CNTK/blob/v2.0.beta5.0/Tutorials/CNTK_204_Sequence_To_Sequence.ipynb).
See more in the [Release Notes](https://github.com/Microsoft/CNTK/wiki/CNTK_2_0_beta_5_Release_Notes)
Get the Release from the [CNTK Releases page](https://github.com/Microsoft/CNTK/releases)
***2016-11-21.*** V 2.0 Beta 4 Release
Highlights of this Release:
* New ASGD/Hogwild! training using Microsoft's Parameter Server ([Project Multiverso](https://github.com/Microsoft/multiverso))
* Distributed Scenarios now supported in CNTK Python API
* New [Memory Compression](https://github.com/Microsoft/CNTK/wiki/Top-level-configurations#hypercompressmemory) mode to reduce memory usage on GPU
* CNTK Docker image with 1bit-SGD support
* Stability Improvements and bug fixes
See more in the [Release Notes](https://github.com/Microsoft/CNTK/wiki/CNTK_2_0_beta_4_Release_Notes)
Get the Release from the [CNTK Releases page](https://github.com/Microsoft/CNTK/releases)
See [all news](https://github.com/Microsoft/CNTK/wiki/News).
# What is The Microsoft Cognitive Toolkit

View file

@ -6,7 +6,7 @@
# for full license information.
# ==============================================================================
PY_VERSION=34
PY_VERSION=35
while [ $# -gt 0 ]; do
case "$1" in
@ -49,7 +49,7 @@ CNTK_EXAMPLES_PATH="$PWD/Examples"
CNTK_TUTORIALS_PATH="$PWD/Tutorials"
CNTK_BINARY="$CNTK_BIN_PATH/cntk"
CNTK_PY_ENV_FILE="$SCRIPT_DIR/conda-linux-cntk-py$PY_VERSION-environment.yml"
CNTK_WHEEL_PATH="cntk/python/cntk-2.0.beta7.0-$PYWHEEL_QUALIFIER-linux_x86_64.whl"
CNTK_WHEEL_PATH="cntk/python/cntk-2.0.beta9.0-$PYWHEEL_QUALIFIER-linux_x86_64.whl"
test -d "$CNTK_BIN_PATH" && test -d "$CNTK_LIB_PATH" && test -d "$CNTK_DEP_LIB_PATH" &&
test -d "$CNTK_TUTORIALS_PATH" &&

View file

@ -89,14 +89,14 @@ function InstallYml(
$env= $table["Env"]
$ymlFile = $table["ymlFile"]
$envsDir = join-path $basePath "envs"
$targetDir = join-path $envsDir $env
$envsDir = Join-Path $basePath envs
$targetDir = Join-Path $envsDir $env
if (test-path -path $targetDir -PathType Container) {
$newTable = @{ Function = "InstallExe"; Command = "$basepath\Scripts\conda.exe"; Param = "env update --file $ymlFile --name $targetDir"; WorkDir = "$basePath\Scripts"; runAs=$false }
$newTable = @{ Function = "InstallExe"; Command = "$basepath\Scripts\conda.exe"; Param = "env update --file `"$ymlFile`" --name `"$targetDir`""; WorkDir = "$basePath\Scripts"; runAs=$false }
}
else {
$newTable = @{ Function = "InstallExe"; Command = "$basepath\Scripts\conda.exe"; Param = "env create --file $ymlFile --prefix $targetDir"; WorkDir = "$basePath\Scripts"; runAs=$false }
$newTable = @{ Function = "InstallExe"; Command = "$basepath\Scripts\conda.exe"; Param = "env create --file `"$ymlFile`" --prefix `"$targetDir`""; WorkDir = "$basePath\Scripts"; runAs=$false }
}
InstallExe $newTable
@ -149,6 +149,7 @@ function InstallWheel(
$EnvName = $table["EnvName"]
$message = $table["message"]
$whlDirectory = $table["WheelDirectory"]
$pyVersion = $table["PyVersion"]
Write-Host $message
if (-not $Execute) {
@ -156,8 +157,8 @@ function InstallWheel(
return
}
$whlFile = Get-ChildItem $cntkRootDir\cntk\Python\cntk*.whl
if ($whlFile -eq $null) {
$whlFile = Get-ChildItem $cntkRootDir\cntk\Python\cntk*cp$pyVersion-cp$pyVersion*.whl
if (-not $whlFile) {
throw "No WHL file found at $cntkRootDir\cntk\Python"
}
if ($whlFile.Count -gt 1) {
@@ -260,6 +261,7 @@ function CreateBatch(
$func = $table["Function"]
$filename = $table["Filename"]
$pyVersion = $table["PyVersion"]
if (-not $Execute) {
Write-Host "Create-Batch [$filename]:No-Execute flag. No file created"
@@ -277,7 +279,7 @@ if /I "%CMDCMDLINE%" neq ""%COMSPEC%" " (
exit /b 0
)
set PATH=$cntkRootDir\cntk;%PATH%
"$AnacondaBasePath\Scripts\activate" "$AnacondaBasePath\envs\cntk-py34"
"$AnacondaBasePath\Scripts\activate" "$AnacondaBasePath\envs\cntk-py$pyVersion"
"@
add-content -Path $filename -Encoding Ascii -Value $batchScript


@@ -32,11 +32,11 @@ The script will analyse your machine and will determine which components are req
The required components will be downloaded in [$localCache]
Repeated operation of this script will reuse already downloaded components.
- If required VS2012 Runtime and VS2013 Runtime will be installed
- If required VS2015 Runtime will be installed
- If required MSMPI will be installed
- Anaconda3 will be installed into [$AnacondaBasePath]
- A CNTK-PY34 environment will be created or updated in [$AnacondaBasePath\envs]
- CNTK will be installed or updated in the CNTK-PY34 environment
- A CNTK-PY$PyVersion environment will be created or updated in [$AnacondaBasePath\envs]
- CNTK will be installed or updated in the CNTK-PY$PyVersion environment
"
}
@@ -108,7 +108,7 @@ function DisplayStart()
CheckOSVersion
if (-not $Execute) {
Write-Host $(DisplayWarningNoExecuteMessage)
Write-Warning $(DisplayWarningNoExecuteMessage)
}
Write-Host $(DisplayStartContinueMessage)
@@ -131,7 +131,7 @@ Write-Host "
CNTK v2 Python install complete.
To activate the CNTK Python environment and set the PATH to include CNTK, start a command shell and run
$cntkRootDir\scripts\cntkpy34.bat
$cntkRootDir\scripts\cntkpy$PyVersion.bat
Please checkout tutorials and examples here:
$cntkRootDir\Tutorials


@@ -10,15 +10,10 @@ $operations = @(
@{Name = "Verifying Installation contents"; ShortName = "INSTCONTENT"; Info = "Verifying Installation contents";
Verification = @( @{Function = "VerifyInstallationContent"; Path = "$cntkRootDir" } )
},
@{Name = "Installation VS2012 Runtime"; ShortName = "VS2012"; Info = "Install VS2012 Runtime";
Verification = @( @{Function = "VerifyWin32ProductExists"; Match = "^Microsoft Visual C\+\+ 2012 x64 Additional Runtime" },
@{Function = "VerifyWin32ProductExists"; Match = "^Microsoft Visual C\+\+ 2012 x64 Minimum Runtime" } );
Action = @( @{Function = "InstallExe"; Command = "$cntkRootDir\prerequisites\VS2012\vcredist_x64.exe"; Param = "/install /passive /norestart"; Message="Installing VS2012 Runtime...." } )
},
@{Name = "Installation VS2013 Runtime"; ShortName = "VS2013"; Info = "Install VS2013 Runtime";
Verification = @( @{Function = "VerifyWin32ProductExists"; Match = "^Microsoft Visual C\+\+ 2013 x64 Additional Runtime" },
@{Function = "VerifyWin32ProductExists"; Match = "^Microsoft Visual C\+\+ 2013 x64 Minimum Runtime" } );
Action = @( @{Function = "InstallExe"; Command = "$cntkRootDir\prerequisites\VS2013\vcredist_x64.EXE"; Param = "/install /passive /norestart"; Message="Installing VS2013 Runtime...." } )
@{Name = "Installation VS2015 Runtime"; ShortName = "VS2015"; Info = "Install VS2015 Runtime";
Verification = @( @{Function = "VerifyWin32ProductExists"; Match = "^Microsoft Visual C\+\+ 201(5|7) x64 Additional Runtime" },
@{Function = "VerifyWin32ProductExists"; Match = "^Microsoft Visual C\+\+ 201(5|7) x64 Minimum Runtime" } );
Action = @( @{Function = "InstallExe"; Command = "$cntkRootDir\prerequisites\VS2015\vc_redist.x64.exe"; Param = "/install /passive /norestart"; Message="Installing VS2015 Runtime...." } )
},
@{Name = "MSMPI Installation"; ShortName = "CNTK"; Info = "Install MSMPI";
Verification = @( @{Function = "VerifyWin32ProductVersion"; Match = "^Microsoft MPI \(\d+\."; Version = "7.0.12437.6" } );
@@ -29,16 +24,16 @@ $operations = @(
Download = @( @{Function = "Download"; Source = "https://repo.continuum.io/archive/Anaconda3-4.1.1-Windows-x86_64.exe"; Destination = "$localCache\Anaconda3-4.1.1-Windows-x86_64.exe" } );
Action = @( @{Function = "InstallExe"; Command = "$localCache\Anaconda3-4.1.1-Windows-x86_64.exe"; Param = "/InstallationType=JustMe /AddToPath=0 /RegisterPython=0 /S /D=$AnacondaBasePath"; runAs=$false; Message="Installing Anaconda3-4.1.1. This will take several minutes. Please be patient ...."} );
},
@{Name = "CNTK Python Environment 3.4"; ShortName = "CNTKPY34"; Info = "Setup CNTK PythonEnvironment 3.4";
@{Name = "CNTK Python Environment"; ShortName = "CNTKPY"; Info = "Setup CNTK PythonEnvironment $PyVersion";
Verification = @( @{Function = "VerifyRunAlways" } );
Action = @( @{Function = "InstallYml"; BasePath = $AnacondaBasePath; Env = "cntk-py34"; ymlFile= "$MyDir\conda-windows-cntk-py34-environment.yml" } )
Action = @( @{Function = "InstallYml"; BasePath = $AnacondaBasePath; Env = "cntk-py$PyVersion"; ymlFile= "$MyDir\conda-windows-cntk-py$PyVersion-environment.yml"; PyVersion = $PyVersion } )
},
@{Name = "CNTK WHL Install"; ShortName = "CNTKWHL34"; Info = "Setup/Update CNTK Wheel";
@{Name = "CNTK WHL Install"; ShortName = "CNTKWHL"; Info = "Setup/Update CNTK Wheel $PyVersion";
Verification = @( @{Function = "VerifyRunAlways" } );
Action = @( @{Function = "InstallWheel"; BasePath = "$AnacondaBasePath"; EnvName = "cntk-py34"; WheelDirectory="$AnacondaBasePath\envs\cntk-py34\Lib\site-packages\cntk"; Message="Setup/Update of CNTK Wheel environment. Please be patient...." } )
Action = @( @{Function = "InstallWheel"; BasePath = "$AnacondaBasePath"; EnvName = "cntk-py$PyVersion"; WheelDirectory="$AnacondaBasePath\envs\cntk-py$PyVersion\Lib\site-packages\cntk"; PyVersion = $PyVersion; Message="Setup/Update of CNTK Wheel $PyVersion environment. Please be patient...." } )
},
@{Name = "Create CNTKPY34 batch file"; ShortName = "BATCH34"; Info = "Create CNTKPY34 batch file";
Verification = @( @{Function = "VerifyFile"; Path = "$cntkRootDir\scripts\cntkpy34.bat" } );
Action = @( @{Function = "CreateBatch"; Filename = "$cntkRootDir\scripts\cntkpy34.bat" } )
@{Name = "Create CNTKPY batch file"; ShortName = "BATCH"; Info = "Create CNTKPY batch file";
Verification = @( @{Function = "VerifyFile"; Path = "$cntkRootDir\scripts\cntkpy$PyVersion.bat"; PyVersion = $PyVersion } );
Action = @( @{Function = "CreateBatch"; Filename = "$cntkRootDir\scripts\cntkpy$PyVersion.bat"; PyVersion = $PyVersion } )
}
)


@@ -127,8 +127,7 @@ function VerifyInstallationContent(
$path = $table["Path"]
$noInstallRequired = (join-path $path cntk\cntk.exe | test-path -PathType Leaf)
$noInstallRequired = (join-path $path prerequisites\VS2012\vcredist_x64.exe | test-path -PathType Leaf) -and $noInstallRequired
$noInstallRequired = (join-path $path prerequisites\VS2013\vcredist_x64.exe | test-path -PathType Leaf) -and $noInstallRequired
$noInstallRequired = (join-path $path prerequisites\VS2015\vc_redist.x64.exe | test-path -PathType Leaf) -and $noInstallRequired
$noInstallRequired = (join-path $path prerequisites\MSMpiSetup.exe | test-path -PathType Leaf) -and $noInstallRequired
if ($noInstallRequired) {
@@ -136,7 +135,7 @@
return $noInstallRequired
}
throw "`nFatal Error: Files from CNTK binary download package are missing!`nThe install script must be run out of the unpacked binary CNTK package, not from a CNTK source clone."
throw "`nFatal Error: Files from the CNTK binary download package are missing!`nThe install script must be run out of the unpacked binary CNTK package, not from a CNTK source clone."
}
function VerifyDirectory(


@@ -7,26 +7,38 @@
.SYNOPSIS
Use this cmdlet to install CNTK from a precompiled binary drop (see https://github.com/Microsoft/CNTK/releases)
By default the script will:
- Create or reuse Anaconda3 in the folder `C:\local\Anaconda3-4.1.1-Windows-x86_64`
- Create or update a CNTK Python 3.5 environment in `C:\local\Anaconda3-4.1.1-Windows-x86_64\envs\cntk-py35`
.DESCRIPTION
The script will download and install the CNTK prerequisites and Anaconda environment
The script will download and install the CNTK prerequisites and Anaconda environment.
It will analyse your machine and will determine which components are required.
The required components will be downloaded and cached.
Repeated operation of this script will reuse already downloaded components.
- If required VS2012 Runtime and VS2013 Runtime will be installed
- If required VS2015 Runtime will be installed
- If required MSMPI will be installed
- Anaconda3 will be installed into [<AnacondaBasePath>]
- A CNTK-PY34 environment will be created or updated in [<AnacondaBasePath>\envs]
- CNTK will be installed or updated in the CNTK-PY34 environment
- A CNTK-PY<version> environment will be created or updated in [<AnacondaBasePath>\envs]
- CNTK will be installed or updated in the CNTK-PY<version> environment
.PARAMETER Execute
This is an optional parameter. Without setting this switch, no changes to the machine setup/installation will be performed
You need to supply this optional parameter to have the install script perform any changes to your machine.
Without this parameter NO CHANGES will be done to your machine.
.PARAMETER AnacondaBasePath
This is an optional parameter and can be used to specify an already installed Anaconda3 installation.
This optional parameter allows you to specify the location of an Anaconda installation to be used or created on your
machine. If the directory exists on your machine, the script will continue under the assumption that this is a working
Anaconda 3 (4.1.1) (or compatible) installation, and will create the CNTK Python environment in that location.
By default a version of Anaconda3 will be installed into [C:\local\Anaconda3-4.1.1-Windows-x86_64]
.PARAMETER PyVersion
This is an optional parameter and can be used to specify the Python version used in the CNTK Python environment.
Supported values for this parameter are 27, 34, or 35. The default value is 35 (for a CNTK Python 3.5 environment).
.EXAMPLE
.\install.ps1
@@ -39,15 +51,13 @@
.\install.ps1 -Execute -AnacondaBasePath d:\cntkBeta
This will install Anaconda in the [d:\cntkBeta] directory.
#>
[CmdletBinding()]
Param(
[parameter(Mandatory=$false)] [string] $AnacondaBasePath = "C:\local\Anaconda3-4.1.1-Windows-x86_64",
[parameter(Mandatory=$false)] [switch] $Execute
)
[parameter(Mandatory=$false)] [ValidateSet("27", "34", "35")] [string] $PyVersion = "35",
[parameter(Mandatory=$false)] [switch] $Execute)
$MyDir = Split-Path $MyInvocation.MyCommand.Definition

@@ -1 +1 @@
Subproject commit 7bde79e23210f87289af940c6b4e615a335f830f
Subproject commit 4b2396f36b8129d035a0166cd2d1a1e457404249


@@ -160,6 +160,7 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable)
#endif
else if (EqualInsensitive(nodeType, OperationNameOf(ClassBasedCrossEntropyWithSoftmaxNode), L"CBCEWithSM")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(ClassificationErrorNode), L"ErrorPrediction")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(EditDistanceErrorNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(EqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(GreaterEqualNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(GreaterNode))) ret = true;


@@ -84,20 +84,44 @@ ConvolutionalLayer {numOutputChannels, # e.g. (1) or BS.Constants.None
#reductionRank = 1, # TODO: support this
stride = 1, pad = false,
lowerPad = 0, upperPad = 0,
#transpose = false, # TODO: support this
maxTempMemSizeInSamples = 0} =
{
reductionRank = 1 # TODO: shall become an optional parameter
outputChannelsShape = _AsArray (numOutputChannels)
outputRank = Length (outputChannelsShape)
filterRank = Length (filterShape)
kernelShape = _ConcatArrays (filterShape, Repeat (reductionRank, Inferred)) # kernel := filter plus reductionDims
W = ParameterTensor{_ConcatArrays (kernelShape, outputChannelsShape), init = init, initValueScale = initValueScale, initFilterRank = filterRank, initOutputRank = -1} # [ W x H x C x K ]
b = ParameterTensor(_ConcatArrays (Repeat (Length (filterShape), 1), outputChannelsShape), initValue = initBias) # [ 1 x 1 x K ]
sharing = true # TODO: support this
transpose = false # TODO: support this
apply (x) = {
c = Convolution (W, x, filterShape, mapDims = numOutputChannels, stride = stride, sharing = sharing, autoPadding = pad, lowerPad = lowerPad, upperPad = upperPad, transpose = transpose, maxTempMemSizeInSamples = maxTempMemSizeInSamples)
c = Convolution (W, x, filterShape, mapDims = numOutputChannels, stride = stride, sharing = sharing, autoPadding = pad, lowerPad = lowerPad, upperPad = upperPad, deconv = false, maxTempMemSizeInSamples = maxTempMemSizeInSamples)
res = activation (if bias then c + b else c)
}.res
}.apply
# DeconvLayer -- create a deconvolution layer with optional non-linearity
DeconvLayer {numOutputChannels,
filterShape, # e.g. (3:3)
numInputChannels,
bias = true,
activation = (x=>x),
init = 'glorotUniform',
initValueScale = 0.001,
initBias = 0,
stride = 1,
sharing = true,
autoPadding = false,
lowerPad = 0,
upperPad = 0,
maxTempMemSizeInSamples = 0} =
{
outputChannelsShape = _AsArray (numOutputChannels)
kernelShape = _ConcatArrays (filterShape, outputChannelsShape)
paramShape = _ConcatArrays (kernelShape, _AsArray (numInputChannels))
W = ParameterTensor{paramShape, init=init, initValueScale=initValueScale, initOnCPUOnly=true}
b = ParameterTensor(_ConcatArrays (Repeat (Length (filterShape), 1), outputChannelsShape), initValue = initBias)
apply (x) = {
c = Convolution(W, x, kernelShape, mapDims=numInputChannels, stride=stride, sharing=sharing, autoPadding=autoPadding, lowerPad=lowerPad, upperPad=upperPad, deconv=true, maxTempMemSizeInSamples = maxTempMemSizeInSamples)
res = activation (if bias then c + b else c)
}.res
}.apply
@@ -115,6 +139,15 @@ MaxPoolingLayer {filterShape, stride = 1, pad = false, lowerPad = 0, upperPad =
AveragePoolingLayer {filterShape, stride = 1, pad = false, lowerPad = 0, upperPad = 0} =
_PoolingLayer {"average", filterShape, stride = stride, pad = pad, lowerPad = lowerPad, upperPad = upperPad}
MaxUnpoolingLayer {filterShape, # e.g. (3:3)
stride = 1,
pad = false,
lowerPad = 0,
upperPad = 0} =
{
apply (unpoolInput, poolInput) = MaxUnpooling (unpoolInput, poolInput, filterShape, stride = stride, autoPadding = pad, lowerPad = lowerPad, upperPad = upperPad)
}.apply
# RecurrentLSTMLayer -- create an LSTM layer
RecurrentLSTMLayer {outputDim,
cellShape = None, # if set then use a projection
@@ -571,7 +604,7 @@ ReconcileDynamicAxis(dataInput, layoutInput, tag='') = new ComputationNode [ ope
ReconcileMBLayout = ReconcileDynamicAxis # back compat
CastAs (type, data) = ReconcileDynamicAxis (data, type) # read as CastAs<type>(data) where the cast may consist of rearranging the data w.r.t. MBLayout or broadcasting across sequence items
# ND convo & pooling/unpooling --why is autoPadding true? Normally one would want to reduce dimensions, no?
Convolution(weightNode, inputValueNode, kernelDims, mapDims = 0, stride = 1, sharing = true, autoPadding = true, lowerPad = 0, upperPad = 0, transpose=false, imageLayout='CHW', maxTempMemSizeInSamples = 0, tag='') = new ComputationNode [ operation = 'Convolution' ; inputs = _AsNodes (weightNode : inputValueNode); kernelShape = new TensorShape [ dims = kernelDims ] ; mapCount = new TensorShape [ dims = mapDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimSharing = new BoolVector [ items = sharing ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
Convolution(weightNode, inputValueNode, kernelDims, mapDims = 0, stride = 1, sharing = true, autoPadding = true, lowerPad = 0, upperPad = 0, deconv=false, imageLayout='CHW', maxTempMemSizeInSamples = 0, tag='') = new ComputationNode [ operation = 'Convolution' ; inputs = _AsNodes (weightNode : inputValueNode); kernelShape = new TensorShape [ dims = kernelDims ] ; mapCount = new TensorShape [ dims = mapDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimSharing = new BoolVector [ items = sharing ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] ; transpose = deconv /*plus the function args*/ ]
Pooling(input, poolKind/*'max'|'average'*/, kernelDims, stride=1, autoPadding = true, lowerPad = 0, upperPad = 0, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'Pooling' ; inputs = _AsNodes (input); pool = poolKind ; kernelShape = new TensorShape [ dims = kernelDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
MaxUnpooling(unpoolInput, poolInput, kernelDims, stride=1, autoPadding = true, lowerPad = 0, upperPad = 0, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'MaxUnpooling' ; inputs = _AsNodes (unpoolInput : poolInput); kernelShape = new TensorShape [ dims = kernelDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
# 2D pooling


@@ -666,7 +666,7 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
static void PrintBanner(int argc, wchar_t* argv[], const string& timestamp)
{
fprintf(stderr, "CNTK 2.0.beta7.0+ (");
fprintf(stderr, "CNTK 2.0.beta9.0+ (");
#ifdef _GIT_EXIST
fprintf(stderr, "%s %.6s, ", _BUILDBRANCH_, _BUILDSHA1_);
#endif


@@ -121,6 +121,7 @@ namespace CNTK
struct MinibatchInfo
{
bool atEndOfData;
bool atEndOfSweep;
size_t numberOfSamples;
NDArrayViewPtr trainingLossValue;
NDArrayViewPtr evalCriterionValue;
@@ -611,6 +612,11 @@ namespace CNTK
///
CNTK_API NDArrayViewPtr Alias(bool readOnly = false) const;
///
/// Creates a new NDArrayView which is an alias of 'this' view but with a new shape.
///
CNTK_API NDArrayViewPtr AsShape(const NDShape& newShape) const;
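A minimal usage sketch of the new alias (the helper name and shapes are hypothetical; assumes a dense NDArrayViewPtr over 12 contiguous floats obtained elsewhere):
#include "CNTKLibrary.h"
// Reinterpret a flat 12-element view as a 3 x 4 matrix without copying;
// AsShape returns an alias over the same underlying storage.
void ReshapeSketch(const CNTK::NDArrayViewPtr& flatView)
{
    auto matrixView = flatView->AsShape(CNTK::NDShape({ 3, 4 }));
    auto readOnlyMatrix = matrixView->Alias(/*readOnly =*/ true); // read-only alias of the alias
}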
///
/// Copies the contents of the 'source' NDArrayView to 'this' view.
/// The shapes of the 'source' view and 'this' view must be identical.
@@ -2379,6 +2385,7 @@ namespace CNTK
friend class Trainer;
public:
///
/// Computes and stores the values of specified variables in the 'outputs' map, using provided 'inputs' values corresponding
/// to each leaf variable of the Function of VariableKind 'Input'.
@@ -2410,11 +2417,15 @@ namespace CNTK
CNTK_API virtual void Backward(const BackPropStatePtr& state,
const std::unordered_map<Variable, ValuePtr>& rootGradientValues,
std::unordered_map<Variable, ValuePtr>& backPropagatedGradientValuesForInputs);
///
/// Returns the name of the operation that this Function denotes
///
virtual const std::wstring& OpName() const = 0;
virtual const std::wstring& OpName() const
#ifdef SWIG
{ NOT_IMPLEMENTED; }
#else
= 0;
#endif
protected:
///
@@ -2471,6 +2482,11 @@ namespace CNTK
///
CNTK_API static FunctionPtr Deserialize(const Dictionary& dictionary, const ::CNTK::DeviceDescriptor& device = DeviceDescriptor::UseDefaultDevice());
///
/// This method needs to be explicitly overridden in subclasses.
///
size_t CurrentVersion() const override { NOT_IMPLEMENTED; }
public:
///
/// Returns the name of 'this' Function.
@@ -2516,10 +2532,10 @@ namespace CNTK
CNTK_API bool IsBlock() const;
///
/// Returns the composite Function underlying this block Function.
/// Returns the root of the Function graph underlying this block Function.
/// Throws an exception if this is not a block Function
///
CNTK_API FunctionPtr BlockComposite() const;
CNTK_API FunctionPtr BlockRoot() const;
///
/// Returns the mapping from the arguments of the composite underlying this block Function
@@ -2726,7 +2742,10 @@ namespace CNTK
ThrowFormatted<std::invalid_argument>(formatString.c_str(), DiagnosticsName().c_str(), std::forward<_Types>(_Args)...);
}
private:
public:
CNTK_API Function(const std::vector<Variable>& inputs, const std::vector<Variable>& outputs, const std::wstring& name = L"", const std::wstring& uid = Internal::GenerateUid(L"UserDefinedFunction"));
private:
CNTK_API Function(const std::vector<Variable>& inputs, const std::vector<Variable>& outputs, Dictionary&& functionConfig, const FunctionPtr& rootFunction, const std::wstring& name, const std::wstring& uid);
std::vector<Variable> m_inputs;
@@ -3055,6 +3074,16 @@ namespace CNTK
return ClassificationError(prediction, labels, Axis(0), name);
}
///
/// Create an instance of the CNTK built-in LambdaRank loss, an effective proxy for optimizing the NDCG metric
///
CNTK_API FunctionPtr LambdaRank(const Variable& prediction, const Variable& gains, const Variable& groupId, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in operation for evaluating the NDCG at 1 metric
///
CNTK_API FunctionPtr NDCGAt1(const Variable& prediction, const Variable& gains, const Variable& groupId, const std::wstring& name = L"");
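A hedged sketch of wiring the two new ranking criteria together; the input shapes, names, and the use of Combine are illustrative, not prescribed by the API:
#include "CNTKLibrary.h"
using namespace CNTK;
// 'prediction' is assumed to be a model output; gains and group ids are 1-dim inputs.
FunctionPtr RankingCriteriaSketch(const Variable& prediction)
{
    auto gains   = InputVariable({ 1 }, DataType::Float, L"gains");
    auto groupId = InputVariable({ 1 }, DataType::Float, L"groupId");
    auto loss    = LambdaRank(prediction, gains, groupId); // trainable proxy for NDCG
    auto metric  = NDCGAt1(prediction, gains, groupId);    // evaluation-only NDCG@1
    return Combine({ loss, metric });
}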
///
/// Create an instance of the CNTK built-in operation for getting the past value along the lone dynamic axis of the specified operand.
/// Throws an exception if the operand has more than one dynamic axis.
@@ -3281,7 +3310,7 @@ namespace CNTK
///
/// A special value that can be used for the epochSize to indicate that the schedule is sweep-based.
///
static const size_t EntireSweep = 0;
static const size_t FullDataSweep = 0;
///
/// Create a schedule with a constant parameter value.
@@ -3293,7 +3322,7 @@ namespace CNTK
/// schedule[0] is used for the first 'epochSize' samples, schedule[1] -- for the second,
/// and so on. The last value is then used repeatedly until the end of training.
///
CNTK_API TrainingParameterSchedule(const std::vector<T>& schedule, UnitType unit, size_t epochSize = 1);
CNTK_API TrainingParameterSchedule(const std::vector<T>& schedule, UnitType unit, size_t epochSize = FullDataSweep);
///
/// Create a schedule using the list of key-value pairs, where the key specifies
@@ -3304,7 +3333,7 @@ namespace CNTK
/// the first 100 samples, then '0.1' is used for the second 200 samples,
/// after which the value is switched to '0.005'.
///
CNTK_API TrainingParameterSchedule(const std::vector<std::pair<size_t, T>>& schedule, UnitType unit, size_t epochSize = 1);
CNTK_API TrainingParameterSchedule(const std::vector<std::pair<size_t, T>>& schedule, UnitType unit, size_t epochSize = FullDataSweep);
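For instance, a sketch mirroring the comment above (illustrative numbers; assumes the LearningRateSchedule typedef for TrainingParameterSchedule<double>):
using namespace CNTK;
// '0.05' for the first 100 samples, '0.1' for the next 200, then '0.005' until the end:
LearningRateSchedule stepwise(
    std::vector<std::pair<size_t, double>>{ { 1, 0.05 }, { 2, 0.1 }, { 1, 0.005 } },
    LearningRateSchedule::UnitType::Sample, /*epochSize =*/ 100);
// With the new FullDataSweep default, each listed value instead applies per data sweep:
LearningRateSchedule perSweep(std::vector<double>{ 0.05, 0.025 },
                              LearningRateSchedule::UnitType::Sample);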
///
/// Returns a value corresponding to the absolute sample (or sweep)
@@ -3319,7 +3348,7 @@ namespace CNTK
///
UnitType Unit() const { return m_unit; }
bool IsSweepBased() const { return m_epochSize == EntireSweep; }
bool IsSweepBased() const { return m_epochSize == FullDataSweep; }
CNTK_API virtual ~TrainingParameterSchedule();
@@ -3351,20 +3380,23 @@ namespace CNTK
class TrainingParameterPerUnitSchedule : public TrainingParameterSchedule<T>
{
public:
TrainingParameterPerUnitSchedule(double value)
TrainingParameterPerUnitSchedule(T value)
: TrainingParameterSchedule<T>::TrainingParameterSchedule(value, U)
{ }
TrainingParameterPerUnitSchedule(const std::vector<double>& schedule, size_t epochSize = 1)
TrainingParameterPerUnitSchedule(const std::vector<T>& schedule,
size_t epochSize = TrainingParameterSchedule<T>::FullDataSweep)
: TrainingParameterSchedule<T>::TrainingParameterSchedule(schedule, U, epochSize)
{ }
TrainingParameterPerUnitSchedule(const std::vector<std::pair<size_t, double>>& schedule, size_t epochSize = 1)
TrainingParameterPerUnitSchedule(const std::vector<std::pair<size_t, T>>& schedule,
size_t epochSize = TrainingParameterSchedule<T>::FullDataSweep)
: TrainingParameterSchedule<T>::TrainingParameterSchedule(schedule, U, epochSize)
{ }
#ifdef SWIG // for Python interop (adds indexer)
const double __getitem__(size_t count) const
const T __getitem__(size_t count) const
{
return TrainingParameterSchedule<T>::operator[](count);
}
@@ -3391,6 +3423,8 @@ namespace CNTK
typedef TrainingParameterPerSampleSchedule<double> MomentumPerSampleSchedule;
typedef TrainingParameterPerMinibatchSchedule<double> MomentumPerMinibatchSchedule;
typedef TrainingParameterPerSampleSchedule<size_t> MinibatchSizeSchedule;
///
/// This class allows specifying momentum as a time constant in place of momentum per sample in
/// all of the Learner factory methods. The specified values are then automatically converted into per-sample values.
@@ -3405,13 +3439,13 @@ namespace CNTK
ConvertToPerSampleValues();
}
MomentumAsTimeConstantSchedule(const std::vector<double>& schedule, size_t epochSize = 1)
MomentumAsTimeConstantSchedule(const std::vector<double>& schedule, size_t epochSize = FullDataSweep)
: TrainingParameterSchedule<double>::TrainingParameterSchedule(schedule, UnitType::Sample, epochSize)
{
ConvertToPerSampleValues();
}
MomentumAsTimeConstantSchedule(const std::vector<std::pair<size_t, double>>& schedule, size_t epochSize = 1)
MomentumAsTimeConstantSchedule(const std::vector<std::pair<size_t, double>>& schedule, size_t epochSize = FullDataSweep)
: TrainingParameterSchedule<double>::TrainingParameterSchedule(schedule, UnitType::Sample, epochSize)
{
ConvertToPerSampleValues();
@@ -3428,9 +3462,10 @@ namespace CNTK
CNTK_API void ConvertToPerSampleValues();
};
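A sketch of the conversion that ConvertToPerSampleValues performs, assuming the usual exponential-decay reading of a time constant (not copied from the implementation):
#include <cmath>
// A momentum time constant 'tc' is the number of samples after which a
// gradient's contribution has decayed to 1/e, so the per-sample momentum is:
inline double MomentumFromTimeConstant(double timeConstant)
{
    return (timeConstant == 0.0) ? 0.0 : std::exp(-1.0 / timeConstant);
}
// E.g. a time constant of 1100 samples corresponds to exp(-1/1100) ~= 0.99909.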
///
/// A collection of additional options that affect parameter updates and
/// are applicable for all standard learners
///
struct AdditionalLearningOptions
{
double l1RegularizationWeight = 0.0;
@@ -3444,6 +3479,16 @@ namespace CNTK
bool gradientClippingWithTruncation = true;
};
///
/// Returns true if by default momentum is applied in the unit-gain fashion.
///
CNTK_API bool DefaultUnitGainValue();
///
/// Sets the global default value of the unit-gain flag.
///
CNTK_API void SetDefaultUnitGainValue(bool value);
///
/// Abstraction for learning a subset of parameters of a learnable Function using first-order gradient values.
/// E.g., momentum, AdaGrad, RMSProp, etc. are different types of learners with their own algorithms for
@@ -3456,7 +3501,7 @@ namespace CNTK
// Method to update the parameters associated with this learner. By returning false, this method indicates that
// learning has stopped for all of the parameters associated with this learner
//
virtual bool Update(std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, size_t trainingSampleCount) = 0;
virtual bool Update(std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, size_t trainingSampleCount, bool sweepEnd = false) = 0;
///
/// Returns the set of parameters associated with this learner.
@@ -3554,6 +3599,7 @@ namespace CNTK
CNTK_API LearnerPtr MomentumSGDLearner(const std::vector<Parameter>& parameters,
const LearningRateSchedule& learningRateSchedule,
const MomentumSchedule& momentumSchedule,
bool unitGain = DefaultUnitGainValue(),
AdditionalLearningOptions additionalOptions = AdditionalLearningOptions());
///
@@ -3562,6 +3608,7 @@ namespace CNTK
CNTK_API LearnerPtr NesterovLearner(const std::vector<Parameter>& parameters,
const LearningRateSchedule& learningRateSchedule,
const MomentumSchedule& momentumSchedule,
bool unitGain = DefaultUnitGainValue(),
AdditionalLearningOptions additionalOptions = AdditionalLearningOptions());
static MomentumSchedule DefaultVarianceMomentum = MomentumAsTimeConstantSchedule(2 * 3600 * 100);
@@ -3572,6 +3619,7 @@ namespace CNTK
CNTK_API LearnerPtr AdamLearner(const std::vector<Parameter>& parameters,
const LearningRateSchedule& learningRateSchedule,
const MomentumSchedule& momentumSchedule,
bool unitGain = DefaultUnitGainValue(),
const MomentumSchedule& varianceMomentumSchedule = DefaultVarianceMomentum,
bool lowMemory = true,
AdditionalLearningOptions additionalOptions = AdditionalLearningOptions());
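A sketch of how the extended factory signatures compose; the schedules and values are illustrative, and 'parameters' would come from model->Parameters() in real code:
#include "CNTKLibrary.h"
using namespace CNTK;
LearnerPtr MakeMomentumLearnerSketch(const std::vector<Parameter>& parameters)
{
    LearningRateSchedule learningRate(0.01, LearningRateSchedule::UnitType::Sample);
    MomentumAsTimeConstantSchedule momentum(1100); // auto-converted to per-sample values
    // With unitGain = true the gradient contribution is scaled by (1 - momentum),
    // keeping the effective step size independent of the momentum value.
    return MomentumSGDLearner(parameters, learningRate, momentum,
                              /*unitGain =*/ DefaultUnitGainValue());
}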
@@ -3611,9 +3659,9 @@ namespace CNTK
return m_communicator;
}
bool Update(std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, size_t minibatchSampleCount) override
bool Update(std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, size_t minibatchSampleCount, bool sweepEnd = false) override
{
MinibatchInfo info{ false, minibatchSampleCount };
MinibatchInfo info{ false, sweepEnd, minibatchSampleCount };
return Update(gradientValues, info);
}
@@ -3632,6 +3680,16 @@ namespace CNTK
m_learner->ResetSmoothedGradients();
}
//
// Returns the total number of samples needed for warmup.
// After reaching this number of samples the learner switches to the distributed mode.
// Warm up is useful for letting the model stabilize with purely local updates before switching to distributed aggregation.
//
virtual size_t ParallelizationAfter()
{
return m_distributeAfterSamples;
}
//
// Method to update the parameters associated with this learner. By returning false, this method indicates that
// learning has stopped for all of the parameters associated with this learner
@@ -3639,11 +3697,12 @@ namespace CNTK
CNTK_API virtual bool Update(std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, MinibatchInfo& minibatch) = 0;
protected:
DistributedLearner(DistributedCommunicatorPtr communicator, LearnerPtr learner)
DistributedLearner(DistributedCommunicatorPtr communicator, LearnerPtr learner, size_t distributeAfterSamples)
: Learner(learner? learner->Parameters() : std::vector<Parameter>(),
LearningRateSchedule(0, LearningRateSchedule::UnitType::Sample)),
m_learner(learner),
m_communicator(communicator)
m_communicator(communicator),
m_distributeAfterSamples(distributeAfterSamples)
{
if (!m_learner)
InvalidArgument("Learner is not allowed to be null.");
@@ -3654,6 +3713,7 @@ namespace CNTK
const LearnerPtr m_learner;
const DistributedCommunicatorPtr m_communicator;
const size_t m_distributeAfterSamples;
// Disallow copy and move construction and assignment
DistributedLearner(const DistributedLearner&) = delete; DistributedLearner& operator=(const DistributedLearner&) = delete; DistributedLearner& operator=(DistributedLearner&&) = delete; DistributedLearner(DistributedLearner&&) = delete;
@@ -3682,32 +3742,44 @@ namespace CNTK
bool resetSGDMomentumAfterAggregation = true,
double blockLearningRate = 1.0);
///
/// Describes an input stream: its name, element type, storage, etc.
///
struct StreamInformation
{
std::wstring m_name; // Unique name of the stream
size_t m_id; // Unique identifier of the stream
StorageFormat m_storageFormat; // Storage format of the stream
DataType m_elementType; // Element type of the stream
NDShape m_sampleLayout; // Layout of the sample for the stream
};
inline bool operator==(const StreamInformation& left, const StreamInformation& right)
{
return ((left.m_id == right.m_id) &&
(left.m_name == right.m_name) &&
(left.m_storageFormat == right.m_storageFormat) &&
(left.m_elementType == right.m_elementType) &&
(left.m_sampleLayout == right.m_sampleLayout));
}
///
/// Trainer is the top-level abstraction responsible for the orchestration of the training of a model
/// using the specified learners and training data either explicitly supplied as Value objects or from
/// a MinibatchSource object.
///
class Trainer
class Trainer : public std::enable_shared_from_this<Trainer>
{
public:
///
/// Construct a Trainer to train the specified 'model' with the specified 'trainingLoss' Variable as the training criterion
/// and using the specified set of 'parameterLearners' for updating the model's parameters using computed gradients.
///
CNTK_API Trainer(const FunctionPtr& model, const FunctionPtr& lossFunction, const std::vector<LearnerPtr>& parameterLearners);
///
/// Construct a Trainer to train the specified 'model' with the specified 'trainingLoss' as the training criterion,
/// the specified 'evaluationFunction' as the criterion for evaluating the trained model's quality, and using the specified set
/// of 'parameterLearners' for updating the model's parameters using computed gradients.
///
// TODO: Add overload for multiple evaluation criterion
CNTK_API Trainer(const FunctionPtr& model, const FunctionPtr& lossFunction, const FunctionPtr& evaluationFunction, const std::vector<LearnerPtr>& parameterLearners);
///
/// Optimize model parameters using the specified 'arguments' minibatch of training samples.
/// Returns false if all parameter learners indicate end of learning (through their Update method's return value).
///
CNTK_API bool TrainMinibatch(const std::unordered_map<Variable, MinibatchData>& arguments, const DeviceDescriptor& computeDevice = DeviceDescriptor::UseDefaultDevice());
///
/// An overload of the TrainMinibatch above that takes a map of variables and their values (as its first argument).
///
CNTK_API bool TrainMinibatch(const std::unordered_map<Variable, ValuePtr>& arguments, const DeviceDescriptor& computeDevice = DeviceDescriptor::UseDefaultDevice());
///
@@ -3717,12 +3789,22 @@ namespace CNTK
/// for the 'outputs' for which the ValuePtr mapping was left null by the caller.
/// Returns false if all parameter learners indicate end of learning (through their Update method's return value).
///
CNTK_API bool TrainMinibatch(const std::unordered_map<Variable, MinibatchData>& arguments, std::unordered_map<Variable, ValuePtr>& outputsToFetch, const DeviceDescriptor& computeDevice = DeviceDescriptor::UseDefaultDevice());
///
/// An overload of the TrainMinibatch above that takes a map of variables and their values (as its first argument).
///
CNTK_API bool TrainMinibatch(const std::unordered_map<Variable, ValuePtr>& arguments, std::unordered_map<Variable, ValuePtr>& outputsToFetch, const DeviceDescriptor& computeDevice = DeviceDescriptor::UseDefaultDevice());
///
/// Test the model on the specified batch of samples using the evaluation Function specified during construction of the Trainer
/// Returns the average evaluation criterion value per sample for the tested minibatch of samples
///
CNTK_API double TestMinibatch(const std::unordered_map<Variable, MinibatchData>& arguments, const DeviceDescriptor& computeDevice = DeviceDescriptor::UseDefaultDevice());
///
/// An overload of the TestMinibatch above that takes a map of variables and their values (as its first argument).
///
CNTK_API double TestMinibatch(const std::unordered_map<Variable, ValuePtr>& arguments, const DeviceDescriptor& computeDevice = DeviceDescriptor::UseDefaultDevice());
///
@@ -3776,14 +3858,20 @@ namespace CNTK
CNTK_API size_t TotalNumberOfSamplesSeen() const;
private:
template <typename T1, typename ...CtorArgTypes>
friend std::shared_ptr<T1> MakeSharedObject(CtorArgTypes&& ...ctorArgs);
Trainer(const FunctionPtr& model, const FunctionPtr& lossFunction, const std::vector<LearnerPtr>& parameterLearners);
Trainer(const FunctionPtr& model, const FunctionPtr& lossFunction, const FunctionPtr& evaluationFunction, const std::vector<LearnerPtr>& parameterLearners);
void ExecuteForwardBackward(
const std::unordered_map<Variable, ValuePtr>& arguments,
std::unordered_map<Variable, ValuePtr>& outputsToFetch,
const DeviceDescriptor& computeDevice,
std::unordered_map<Variable, ValuePtr>& parameterGradients);
bool TrainLocalMinibatch(const std::unordered_map<Variable, ValuePtr>& arguments, std::unordered_map<Variable, ValuePtr>& outputsToFetch, const DeviceDescriptor& computeDevice);
bool TrainDistributedMinibatch(const std::unordered_map<Variable, ValuePtr>& arguments, std::unordered_map<Variable, ValuePtr>& outputsToFetch, const DeviceDescriptor& computeDevice);
bool TrainLocalMinibatch(const std::unordered_map<Variable, ValuePtr>& arguments, std::unordered_map<Variable, ValuePtr>& outputsToFetch, bool sweepEnd, const DeviceDescriptor& computeDevice);
bool TrainDistributedMinibatch(const std::unordered_map<Variable, ValuePtr>& arguments, std::unordered_map<Variable, ValuePtr>& outputsToFetch, bool sweepEnd, const DeviceDescriptor& computeDevice);
void Save(const std::wstring& modelFilePath, const std::vector<DictionaryValue>& learnerState, const Dictionary& externalState);
@@ -3805,25 +3893,17 @@ namespace CNTK
};
///
/// Describes an input stream: its name, element type, storage, etc.
/// Construct a Trainer to train the specified 'model' with the specified 'trainingLoss' Variable as the training criterion
/// and using the specified set of 'parameterLearners' for updating the model's parameters using computed gradients.
///
struct StreamInformation
{
std::wstring m_name; // Unique name of the stream
size_t m_id; // Unique identifier of the stream
StorageFormat m_storageFormat; // Storage format of the stream
DataType m_elementType; // Element type of the stream
NDShape m_sampleLayout; // Layout of the sample for the stream
};
CNTK_API TrainerPtr CreateTrainer(const FunctionPtr& model, const FunctionPtr& lossFunction, const std::vector<LearnerPtr>& parameterLearners);
inline bool operator==(const StreamInformation& left, const StreamInformation& right)
{
return ((left.m_id == right.m_id) &&
(left.m_name == right.m_name) &&
(left.m_storageFormat == right.m_storageFormat) &&
(left.m_elementType == right.m_elementType) &&
(left.m_sampleLayout == right.m_sampleLayout));
}
///
/// Construct a Trainer to train the specified 'model' with the specified 'trainingLoss' as the training criterion,
/// the specified 'evaluationFunction' as the criterion for evaluating the trained model's quality, and using the specified set
/// of 'parameterLearners' for updating the model's parameters using computed gradients.
///
CNTK_API TrainerPtr CreateTrainer(const FunctionPtr& model, const FunctionPtr& lossFunction, const FunctionPtr& evaluationFunction, const std::vector<LearnerPtr>& parameterLearners);
}
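A minimal end-to-end sketch of the factory plus the MinibatchData-based TrainMinibatch overload; all names, stream handles, and the minibatch size are illustrative and assumed to be set up elsewhere (0 for the sequence cap is assumed to mean "unbounded"):
#include "CNTKLibrary.h"
using namespace CNTK;
void TrainLoopSketch(const FunctionPtr& model, const FunctionPtr& loss, const FunctionPtr& metric,
                     const LearnerPtr& learner, const MinibatchSourcePtr& source,
                     const Variable& features, const Variable& labels,
                     const StreamInformation& featureStream, const StreamInformation& labelStream)
{
    auto trainer = CreateTrainer(model, loss, metric, { learner });
    for (;;)
    {
        const auto& mb = source->GetNextMinibatch(/*minibatchSizeInSequences =*/ 0,
                                                  /*minibatchSizeInSamples =*/ 64);
        if (mb.empty())
            break; // no more data
        // MinibatchData converts where a ValuePtr is expected, and now carries
        // the sweep-end flag along with the data.
        trainer->TrainMinibatch({ { features, mb.at(featureStream) },
                                  { labels,   mb.at(labelStream) } },
                                DeviceDescriptor::UseDefaultDevice());
    }
}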
namespace std {
@@ -3838,11 +3918,34 @@
namespace CNTK
{
///
/// A struct that combines the minibatch meta-data with the actual minibatch data.
/// The former includes the number of sequences and samples in the minibatch,
/// as well as the sweep-end flag, which is set to true to indicate that the minibatch
/// concludes a data sweep (i.e., it's the last minibatch at the end of the sweep).
///
struct MinibatchData
{
size_t m_numSequences;
size_t m_numSamples;
ValuePtr m_data;
MinibatchData() : MinibatchData(nullptr)
{}
// a convenience constructor to allow passing ValuePtr arguments in place
// of MinibatchData parameter (e.g., in Trainer::TrainMinibatch)
MinibatchData(ValuePtr value) : MinibatchData(value, 0)
{}
MinibatchData(ValuePtr value, size_t numSamples, bool sweepEnd = false)
: MinibatchData(value, numSamples, numSamples, sweepEnd)
{}
MinibatchData(ValuePtr value, size_t numSequences, size_t numSamples, bool sweepEnd)
: data(value), numberOfSequences(numSequences), numberOfSamples(numSamples), sweepEnd(sweepEnd)
{}
ValuePtr data;
size_t numberOfSequences;
size_t numberOfSamples;
bool sweepEnd;
};
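Given those constructors, a bare ValuePtr can stand in wherever MinibatchData is expected; a small sketch with an illustrative sample count:
// Wrap an existing Value as a 32-sample minibatch that ends its sweep.
CNTK::MinibatchData WrapValueSketch(const CNTK::ValuePtr& value)
{
    return CNTK::MinibatchData(value, /*numSamples =*/ 32, /*sweepEnd =*/ true);
}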
///
@@ -3868,14 +3971,22 @@
/// In case the size is specified in terms of both #sequences and #samples, the smaller of the two is taken.
/// An empty map is returned when the MinibatchSource has no more data to return.
///
virtual const std::unordered_map<StreamInformation, MinibatchData>& GetNextMinibatch(size_t minibatchSizeInSamples,
CNTK_API const std::unordered_map<StreamInformation, MinibatchData>& GetNextMinibatch(
size_t minibatchSizeInSequences,
const DeviceDescriptor& device = DeviceDescriptor::UseDefaultDevice()) = 0;
size_t minibatchSizeInSamples,
const DeviceDescriptor& device = DeviceDescriptor::UseDefaultDevice());
///
/// Returns whether the MinibatchSource is running in a distributed manner
/// Same as above, but allows specifying a partition of the data in a distributed environment.
/// Depending on the number of workers, the data is split into different partitions,
/// and depending on the worker rank, only a particular partition is read.
///
virtual bool IsDistributed() const = 0;
CNTK_API virtual const std::unordered_map<StreamInformation, MinibatchData>& GetNextMinibatch(
size_t minibatchSizeInSequences,
size_t minibatchSizeInSamples,
size_t numberOfWorkers,
size_t workerRank,
const DeviceDescriptor& device = DeviceDescriptor::UseDefaultDevice()) = 0;
///
/// Destruct this MinibatchSource.
@@ -3942,7 +4053,7 @@
///
/// Instantiate the CNTK built-in text format minibatch source
///
inline MinibatchSourcePtr TextFormatMinibatchSource(const std::wstring& dataFilePath, const std::vector<StreamConfiguration>& streamConfigs, size_t epochSize = MinibatchSource::InfinitelyRepeat, bool randomize = true, size_t distributedAfterSampleCount = MinibatchSource::InfiniteSamples)
inline MinibatchSourcePtr TextFormatMinibatchSource(const std::wstring& dataFilePath, const std::vector<StreamConfiguration>& streamConfigs, size_t epochSize = MinibatchSource::InfinitelyRepeat, bool randomize = true)
{
::CNTK::Dictionary minibatchSourceConfiguration;
minibatchSourceConfiguration[L"epochSize"] = epochSize;
@@ -3973,10 +4084,6 @@
deserializerConfiguration[L"input"] = inputStreamsConfig;
minibatchSourceConfiguration[L"deserializers"] = std::vector<::CNTK::DictionaryValue>({ deserializerConfiguration });
//TODO: change all these dictionary names to string constants
minibatchSourceConfiguration[L"distributedAfterSampleCount"] = distributedAfterSampleCount;
return CreateCompositeMinibatchSource(minibatchSourceConfiguration);
}
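A usage sketch of the helper above; the file name and stream dimensions are hypothetical, and note that the distributedAfterSampleCount argument is gone from this signature:
// Dense 784-dim features plus sparse 10-dim labels from a CNTK text format file.
inline CNTK::MinibatchSourcePtr MakeTextSourceSketch()
{
    return CNTK::TextFormatMinibatchSource(
        L"Train-28x28_cntk_text.txt",
        { CNTK::StreamConfiguration(L"features", 784),
          CNTK::StreamConfiguration(L"labels", 10, /*isSparse =*/ true) });
}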
@@ -4106,6 +4213,101 @@
/// Distributed communicator that allows quantized aggregations.
///
CNTK_API QuantizedDistributedCommunicatorPtr QuantizedMPICommunicator(bool zeroThresholdFor1Bit, bool useQuantizationForSelfStripe, size_t numQuantizationBits);
///
/// Base abstract class that represents a training session.
/// Derived classes can redefine different aspects of training, overriding base virtual methods (GetMinibatchSize, OnMinibatchStart, etc.)
///
class TrainingSession
{
public:
CNTK_API TrainingSession(
const MinibatchSourcePtr& trainingSource,
const TrainerPtr& trainer,
const std::unordered_map<Variable, StreamInformation>& modelInputToMinibatchSourceStream,
const TrainingParameterPerUnitSchedule<size_t, TrainingParameterSchedule<size_t>::UnitType::Sample>& minibatchSizeSchedule,
size_t checkpointFrequencyInSamples,
const std::wstring& checkPointFileName);
///
/// Runs the session.
///
CNTK_API void Train(const DeviceDescriptor& computeDevice);
///
/// Restores a session from a checkpoint.
///
CNTK_API void RestoreFromCheckpoint(const std::wstring& checkpointFileName);
CNTK_API virtual ~TrainingSession() {}
public:
///
/// Optionally overridable, called each time before a new minibatch is requested from the minibatch source
/// during training (from the Train method).
///
virtual size_t GetMinibatchSize()
{
return m_minibatchSizeSchedule[Trainer()->TotalNumberOfSamplesSeen()];
}
///
/// Optionally overridable callback that is invoked before each minibatch.
///
CNTK_API virtual void OnMinibatchStart() {};
///
/// Optionally overridable callback that is invoked after each minibatch.
///
CNTK_API virtual void OnMinibatchEnd() {};
///
/// Optionally overridable callback that is invoked before each checkpoint.
///
CNTK_API virtual void OnCheckpointStart() {};
///
/// Optionally overridable callback that is invoked after each checkpoint.
///
CNTK_API virtual void OnCheckpointEnd() {};
protected:
///
/// Accessors.
///
TrainerPtr Trainer() const { return m_trainer; }
MinibatchSourcePtr TrainingMinibatchSource() const { return m_trainingSource; }
private:
/// Disallow copy and move construction and assignment
TrainingSession(const TrainingSession&) = delete; TrainingSession& operator=(const TrainingSession&) = delete; TrainingSession& operator=(TrainingSession&&) = delete; TrainingSession(TrainingSession&&) = delete;
void SaveCheckpoint();
static const std::wstring s_checkpointIndex;
static const std::wstring s_trainingMinibatchSource;
const size_t m_checkpointFrequencyinSamples;
const std::wstring m_checkPointFileName;
size_t m_currentCheckpointIndex;
MinibatchSourcePtr m_trainingSource;
TrainerPtr m_trainer;
std::unordered_map<Variable, StreamInformation> m_modelInputToMinibatchSourceStream;
size_t m_parallelAfterSamples;
size_t m_workerRank;
size_t m_numberOfWorkers;
const MinibatchSizeSchedule m_minibatchSizeSchedule;
};
CNTK_API TrainingSessionPtr CreateBasicTrainingSession(
const MinibatchSourcePtr& trainingSource,
const TrainerPtr& trainer,
const std::unordered_map<Variable, StreamInformation>& modelInputToMinibatchSourceStream,
const TrainingParameterPerUnitSchedule<size_t, TrainingParameterSchedule<size_t>::UnitType::Sample>& minibatchSizeSchedule,
size_t checkpointFrequencyinSamples,
const std::wstring& checkPointFileName);
}
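A sketch of driving the new session API end to end; the minibatch size, checkpoint settings, and the input-to-stream mapping are illustrative and assumed to be prepared elsewhere:
#include "CNTKLibrary.h"
using namespace CNTK;
void RunSessionSketch(const MinibatchSourcePtr& source, const TrainerPtr& trainer,
                      const std::unordered_map<Variable, StreamInformation>& inputToStream)
{
    auto session = CreateBasicTrainingSession(
        source, trainer, inputToStream,
        MinibatchSizeSchedule(64),                  // constant 64-sample minibatches
        /*checkpointFrequencyInSamples =*/ 100000,  // checkpoint every 100k samples
        L"model.checkpoint");
    session->Train(DeviceDescriptor::UseDefaultDevice());
}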


@@ -160,6 +160,9 @@ namespace CNTK
enum class PrimitiveOpType : unsigned int;
enum class DataType : unsigned int;
struct MinibatchInfo;
struct MinibatchData;
class Serializer;
// Similar to make_shared except that it associates a custom deleter with the shared_ptr to ensure
@@ -208,6 +211,12 @@ namespace CNTK
struct VariableFields;
typedef std::shared_ptr<VariableFields> VariableFieldsPtr;
class TrainingSession;
typedef std::shared_ptr<TrainingSession> TrainingSessionPtr;
class Trainer;
typedef std::shared_ptr<Trainer> TrainerPtr;
namespace Internal
{
CNTK_API FunctionPtr IsWithin(const Variable& operand, int offset, const std::wstring& name = L"");


@@ -284,6 +284,10 @@ namespace CNTK
opType = PrimitiveOpType::CrossEntropyWithSoftmax;
else if (node->OperationName() == OperationNameOf(ClassificationErrorNode))
opType = PrimitiveOpType::ClassificationError;
else if (node->OperationName() == OperationNameOf(LambdaRankNode))
opType = PrimitiveOpType::LambdaRank;
else if (node->OperationName() == OperationNameOf(NDCG1EvalNode))
opType = PrimitiveOpType::NDCG;
else if (node->OperationName() == OperationNameOf(ReduceElementsNode))
{
auto reduceElementsNode = node->As<ReduceElementsNode<ElementType>>();


@@ -176,6 +176,7 @@
<PrecompiledHeader>Create</PrecompiledHeader>
</ClCompile>
<ClCompile Include="Trainer.cpp" />
<ClCompile Include="TrainingSession.cpp" />
<ClCompile Include="Utils.cpp" />
<ClCompile Include="Value.cpp" />
<ClCompile Include="Variable.cpp" />


@@ -24,6 +24,7 @@
<ClCompile Include="PrimitiveFunction.cpp" />
<ClCompile Include="DistributedLearnerBase.cpp" />
<ClCompile Include="DataParallelDistributedLearner.cpp" />
<ClCompile Include="TrainingSession.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="stdafx.h" />
@@ -51,6 +52,7 @@
<ClInclude Include="DataParallelDistributedLearner.h" />
<ClInclude Include="BlockFunction.h" />
<ClInclude Include="Variable.h" />
<ClInclude Include="TrainingSession.h" />
</ItemGroup>
<ItemGroup>
<Filter Include="API">
@@ -65,4 +67,4 @@
<Filter>proto</Filter>
</None>
</ItemGroup>
</Project>
</Project>


@@ -528,4 +528,16 @@ namespace CNTK
{
return Microsoft::MSR::CNTK::CPUMatrix<float>::GetMaxNumThreads();
}
static std::atomic<bool> s_defaultUnitGainValue(true);
bool DefaultUnitGainValue()
{
return s_defaultUnitGainValue;
}
void SetDefaultUnitGainValue(bool value)
{
s_defaultUnitGainValue.store(value);
}
}


@@ -88,7 +88,7 @@ namespace CNTK
// For block functions we need to recursively traverse the underlying composite
if (function->IsBlock())
PreorderTraverseFunctions(function->BlockComposite()->RootFunction(), SerializationTraversalFunc);
PreorderTraverseFunctions(function->BlockRoot(), SerializationTraversalFunc);
};
PreorderTraverseFunctions(RootFunction(), SerializationTraversalFunc);
@@ -663,6 +663,12 @@ namespace CNTK
case PrimitiveOpType::ClassificationError:
computationNodePtr = New<ClassificationErrorNode<ElementType>>(network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::LambdaRank:
computationNodePtr = New<LambdaRankNode<ElementType>>(network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::NDCG:
computationNodePtr = New<NDCG1EvalNode<ElementType>>(network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::PastValue:
case PrimitiveOpType::FutureValue:
{
@@ -909,7 +915,7 @@ namespace CNTK
for (auto compositeArgument : compositeArguments)
m_variableToNodeMap[compositeArgument] = m_variableToNodeMap.at(compositeArgument.BlockFunctionVariableMapping());
PreorderTraverseFunctions(function->BlockComposite()->RootFunction(), PatchBlockArgumentsMapping);
PreorderTraverseFunctions(function->BlockRoot(), PatchBlockArgumentsMapping);
}
};
PreorderTraverseFunctions(rootFunction, PatchBlockArgumentsMapping);
@@ -921,15 +927,11 @@ namespace CNTK
return (m_isVariableRootMap[outputVar] && (!ownerBlockFunc || IsVariableRoot(ownerBlockFunc->CompositeOutputsMap().at(outputVar))));
};
// If any of the function outputs is not a root node, we need to explicitly add it to the 'output' group of the ComputationNetwork
for (auto rootOutput : rootFunctionOutputs)
{
if (!IsVariableRoot(rootOutput))
m_computationNetwork->AddToNodeGroup(L"output", m_variableToNodeMap.at(rootOutput));
}
// If any of the requested outputs is not a root node, we need to explicitly add it to the 'output' group of the ComputationNetwork
for (auto output : outputs)
// If any of the function or requested outputs is not a root node, we need to explicitly
// add it to the 'output' group of the ComputationNetwork
std::unordered_set<Variable> networkOutputs(outputs);
networkOutputs.insert(rootFunctionOutputs.begin(), rootFunctionOutputs.end());
for (auto output : networkOutputs)
{
if (!IsVariableRoot(output))
{
@@ -1011,34 +1013,28 @@ namespace CNTK
if (!m_networkMatricesAllocated && allocateNetworkMatrices)
{
ComputationNodeBasePtr backpropRootNode;
if (!m_currentBackpropRoots.empty())
backpropRootNode = m_variableToNodeMap.at(*m_currentBackpropRoots.begin());
// Now recursively traverse the network in a top-down fashion
auto rootFunction = RootFunction();
auto rootFunctionOutputs = rootFunction->Outputs();
std::vector<ComputationNodeBasePtr> forwardRootNodes;
for (auto rootOutput : rootFunctionOutputs)
{
auto currentRootNode = m_variableToNodeMap.at(rootOutput);
forwardRootNodes.push_back(currentRootNode);
if (m_currentBackpropRoots.find(rootOutput) != m_currentBackpropRoots.end())
backpropRootNode = currentRootNode;
}
forwardRootNodes.push_back(m_variableToNodeMap.at(rootOutput));
std::vector<ComputationNodeBasePtr> forwardOutputNodes;
for (auto output : outputs)
{
auto currentOutputNode = m_variableToNodeMap.at(output);
forwardOutputNodes.push_back(currentOutputNode);
// Select the root node for backpropagation
if (m_currentBackpropRoots.find(output) != m_currentBackpropRoots.end())
backpropRootNode = currentOutputNode;
}
forwardOutputNodes.push_back(m_variableToNodeMap.at(output));
m_computationNetwork->AllocateAllMatrices(forwardRootNodes, forwardOutputNodes, backpropRootNode);
m_networkMatricesAllocated = allocateNetworkMatrices;
std::unordered_set<ComputationNodeBasePtr> allNetworkRoots = { backpropRootNode };
allNetworkRoots.insert(forwardRootNodes.begin(), forwardRootNodes.end());
allNetworkRoots.insert(forwardOutputNodes.begin(), forwardOutputNodes.end());
m_allNetworkRootsInGlobalEvalOrder = m_computationNetwork->SortByGlobalEvalOrder(allNetworkRoots);
m_currentOutputs = outputs;
m_currentOutputs.insert(rootFunctionOutputs.begin(), rootFunctionOutputs.end());
m_currentOutputs.insert(m_currentBackpropRoots.begin(), m_currentBackpropRoots.end());
@@ -1348,7 +1344,7 @@ namespace CNTK
PopulateNetworkInputs(arguments);
// Dropout nodes have an implicit input in the form of the random mask that is applied to its explicit input
// This mask is regerated every minibatch and hence dropout nodes with a non-zero dropout rate must me marked outdated
// This mask is regenerated every minibatch and hence dropout nodes with a non-zero dropout rate must be marked outdated
// w.r.t. inputs to force evaluation in each minibatch
list<ComputationNodeBasePtr> dropoutNodes = m_computationNetwork->GetNodesWithType(OperationNameOf(DropoutNode));
for (auto& nodeIter : dropoutNodes)
@@ -1382,7 +1378,16 @@ namespace CNTK
ScopedNetworkOperationMode modeGuard(m_computationNetwork, outputsToRetainBackwardStateFor.empty() ? NetworkOperationMode::inferring : NetworkOperationMode::training);
m_computationNetwork->ForwardProp(outputsToEvaluate);
// We may have to include additional nodes in the ForwardProp to align with how the memory sharing structure is setup
// We need to include all roots that lie earlier in the global eval order than the actual outputs we are interested
// in evaluating.
// TODO: This may incur additional compute costs in some rare scenarios. We need to come up with a better way to handle this.
outputsToEvaluate = m_computationNetwork->SortByGlobalEvalOrder(outputsToEvaluate);
auto lastOutputInEvalOrder = outputsToEvaluate.back();
auto iterEndRootInEvalOrder = std::find(m_allNetworkRootsInGlobalEvalOrder.begin(), m_allNetworkRootsInGlobalEvalOrder.end(), lastOutputInEvalOrder) + 1;
auto augmentedOutputsToEvaluate = std::vector<ComputationNodeBasePtr>(m_allNetworkRootsInGlobalEvalOrder.begin(), iterEndRootInEvalOrder);
m_computationNetwork->ForwardProp(augmentedOutputsToEvaluate);
GetNetworkOutputs(outputs);


@@ -300,6 +300,8 @@ namespace CNTK
bool m_networkMatricesAllocated;
std::vector<Microsoft::MSR::CNTK::ComputationNodeBasePtr> m_allNetworkRootsInGlobalEvalOrder;
std::unordered_map<Parameter, size_t> m_lastRecordedParameterValueTimeStamps;
// Version history:


@@ -84,7 +84,7 @@ namespace CNTK
break;
for (auto& currentStreamKV : computedMeanAndInvStdDevs)
CompositeFunction::PopulateComputationNodeValue<float>({ streamToDummyInputVariableMap[currentStreamKV.first], minibatchData[currentStreamKV.first].m_data }, streamToInputNodeMap[currentStreamKV.first], layoutsPopulated);
CompositeFunction::PopulateComputationNodeValue<float>({ streamToDummyInputVariableMap[currentStreamKV.first], minibatchData[currentStreamKV.first].data }, streamToInputNodeMap[currentStreamKV.first], layoutsPopulated);
ComputationNetwork::BumpEvalTimeStamp(allInputNodes);


@@ -147,6 +147,6 @@ namespace CNTK
if (info.IsEmpty())
return false;
return m_learner->Update(gradientValues, info.numberOfSamples);
return m_learner->Update(gradientValues, info.numberOfSamples, info.atEndOfSweep);
}
}


@@ -10,8 +10,7 @@
namespace CNTK
{
DistributedLearnerBase::DistributedLearnerBase(DistributedCommunicatorPtr communicator, LearnerPtr learner, size_t distributeAfterSamples)
: DistributedLearner(communicator, learner),
m_distributeAfterSamples(distributeAfterSamples)
: DistributedLearner(communicator, learner, distributeAfterSamples)
{
if (!m_learner)
InvalidArgument("Learner is not allowed to be null.");


@@ -25,8 +25,6 @@ namespace CNTK
static void PrepaireZeroGradients(std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, MinibatchInfo& info);
static void ConvertToOrdered(const std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, std::vector<std::pair<Parameter, NDArrayViewPtr>>& result);
const size_t m_distributeAfterSamples;
std::vector<std::pair<Parameter, NDArrayViewPtr>> m_gradientBuffer;
std::vector<Parameter> m_parameters;


@@ -29,6 +29,9 @@ namespace CNTK
: Function(inputs, outputs, std::move(functionConfig), nullptr, name, uid)
{}
Function::Function(const std::vector<Variable>& inputs, const std::vector<Variable>& outputs, const std::wstring& name, const std::wstring& uid) :
Function(inputs, outputs, Dictionary(), name, uid) {}
Function::Function(const std::vector<Variable>& inputs, const std::vector<Variable>& outputs, Dictionary&& functionConfig, const FunctionPtr& rootFunction, const std::wstring& name, const std::wstring& uid)
: m_rootFunction(rootFunction), m_name(name != L"" ? name : uid), m_uid(uid), m_attributes(std::move(functionConfig))
{
@@ -121,13 +124,13 @@ namespace CNTK
return (blockFunction != nullptr);
}
FunctionPtr Function::BlockComposite() const
FunctionPtr Function::BlockRoot() const
{
if (!IsBlock())
InvalidArgument("Function::BlockComposite() cannot be called for a Function which is not a block");
InvalidArgument("Function::BlockRoot() cannot be called for a Function which is not a block");
auto blockFunction = dynamic_cast<const BlockFunction*>(this);
return blockFunction->Composite();
return blockFunction->Composite()->RootFunction();
}
std::shared_ptr<std::vector<std::pair<Variable, Variable>>> Function::BlockArgumentsMappingImpl() const
@@ -557,9 +560,10 @@ namespace CNTK
clonedFunction = MakeSharedObject<PrimitiveFunction>(primitiveFunction->OpType(), inputs, std::move(attributesCopy), primitiveFunction->Name());
else
{
auto clonedComposite = primitiveFunction->BlockComposite()->Clone(parameterCloneMethod, replacements);
auto cloneeComposite = dynamic_cast<const BlockFunction*>(primitiveFunction)->Composite();
auto clonedComposite = cloneeComposite->Clone(parameterCloneMethod, replacements);
auto cloneeBlockCompositeArguments = primitiveFunction->BlockComposite()->Arguments();
auto cloneeBlockCompositeArguments = cloneeComposite->Arguments();
auto clonedBlockCompositeArguments = clonedComposite->Arguments();
std::unordered_map<Variable, Variable> cloneeToClonedBlockCompositeArgumentsMap;
for (size_t i = 0; i < cloneeBlockCompositeArguments.size(); ++i)
@ -954,6 +958,18 @@ namespace CNTK
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Logistic, operands, Dictionary(), name), name);
}
FunctionPtr LambdaRank(const Variable& prediction, const Variable& gains, const Variable& groupId, const std::wstring& name)
{
std::vector<Variable> operands = { prediction, gains, groupId };
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::LambdaRank, operands, Dictionary(), name), name);
}
FunctionPtr NDCGAt1(const Variable& prediction, const Variable& gains, const Variable& groupId, const std::wstring& name)
{
std::vector<Variable> operands = { prediction, gains, groupId };
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::NDCG, operands, Dictionary(), name), name);
}
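Both ranking criteria follow the established factory pattern above: wrap a PrimitiveFunction in a CompositeFunction. A minimal caller-side sketch, with illustrative shapes and names (not part of this change):
Variable prediction = InputVariable({ 1 }, DataType::Float, L"prediction");
Variable gains      = InputVariable({ 1 }, DataType::Float, L"gains");
Variable groupId    = InputVariable({ 1 }, DataType::Float, L"group");
// LambdaRank produces a trainable ranking loss; NDCGAt1 evaluates ranking quality at position 1.
FunctionPtr rankingLoss   = LambdaRank(prediction, gains, groupId, L"rankingLoss");
FunctionPtr rankingMetric = NDCGAt1(prediction, gains, groupId, L"ndcgAt1");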
FunctionPtr SquaredError(const Variable& prediction, const Variable& targets, const std::wstring& name)
{
auto predictionPlaceholder = PlaceholderVariable(L"prediction");

View file

@ -9,7 +9,7 @@
#include "Utils.h"
#include "Serialization.h"
#define UPDATE_FUNCTION \
#define DISPATCH_TO_TYPED_UPDATE_FUNCTION \
switch (smoothedGradientValue->GetDataType()) \
{ \
case DataType::Float: \
@ -22,6 +22,11 @@
NOT_IMPLEMENTED; \
}
#define GET_WRITABLE_MATRICES \
const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue); \
const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue); \
const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameter.Value());
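For reference, at an instantiation site such as LearnerMomentumSGD::Update&lt;float&gt; the macro expands to exactly the three writable matrix handles every typed update needs:
const auto& smoothedGradientMatrix = GetWritableMatrix<float>(smoothedGradientValue);
const auto& gradientMatrix = GetWritableMatrix<float>(gradientValue);
const auto& parameterMatrix = GetWritableMatrix<float>(parameter.Value());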
using namespace Microsoft::MSR::CNTK;
using namespace std;
@ -184,15 +189,13 @@ namespace CNTK
LogicError("Learner parameters contain duplicates.");
}
for (const auto& parameter : parameters)
if (allocateSmoothGradients)
{
if (!allocateSmoothGradients)
for (const auto& parameter : parameters)
{
continue;
NDArrayViewPtr view = AllocateNDArrayView(parameter, parameter.Shape());
m_smoothedGradientValues.emplace(parameter, view);
}
NDArrayViewPtr view = AllocateNDArrayView(parameter, parameter.Shape());
m_smoothedGradientValues.insert(make_pair(parameter, view));
}
}
@ -222,7 +225,7 @@ namespace CNTK
}
}
/*virtual*/ bool LearnerBase::Update(unordered_map<Parameter, NDArrayViewPtr>& gradientValues, size_t trainingSampleCount) /*override*/
/*virtual*/ bool LearnerBase::Update(unordered_map<Parameter, NDArrayViewPtr>& gradientValues, size_t trainingSampleCount, bool sweepEnd) /*override*/
{
if (LearningRate(trainingSampleCount) == 0.0)
{
@ -230,7 +233,10 @@ namespace CNTK
}
// make sure trainingSampleCount is a valid value
assert(trainingSampleCount > 0);
if (trainingSampleCount == 0)
{
InvalidArgument("Learner::Update(): cannot perform an update with an empty minibatch.");
}
for (const auto& parameter : Parameters())
{
@ -256,7 +262,7 @@ namespace CNTK
Print(gradientValue, "Gradient Update");
Print(smoothedGradientValue, "Smoothed Gradient Input");
#endif
UPDATE_FUNCTION;
DISPATCH_TO_TYPED_UPDATE_FUNCTION;
#if DUMPOUTPUT
Print(parameter.Value(), "Parameter Update");
@ -270,12 +276,17 @@ namespace CNTK
}
m_sampleCount += trainingSampleCount;
m_minibatchCount++;
// TODO: sweep count also needs to be updated.
if (sweepEnd)
{
m_sweepCount++;
}
return true;
}
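Caller-side, the new contract is: the reader reports whether the minibatch touched a sweep boundary, and the learner folds that into m_sweepCount. A sketch, assuming the gradients map and sample count come from the surrounding training loop:
// 'minibatchData' is the map returned by the minibatch source for this minibatch.
bool sweepEnd = minibatchData.begin()->second.sweepEnd; // sweep-end flag reported by the reader
learner->Update(gradients, numSamplesInMinibatch, sweepEnd);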
template <typename ElementType>
void LearnerBase::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
void LearnerBase::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
{
const auto& parameterValue = parameter.Value();
PreProcess<ElementType>(parameterValue, gradientValue, trainingSampleCount);
@ -364,27 +375,39 @@ namespace CNTK
}
}
/*virtual*/ void LearnerSGD::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
LearnerSGD::LearnerSGD(const std::vector<Parameter>& parameters,
const LearningRateSchedule& learningRateSchedule,
AdditionalLearningOptions additionalOptions,
bool allocateSmoothGradients)
: LearnerBase(parameters, learningRateSchedule, additionalOptions, allocateSmoothGradients)
{
UPDATE_FUNCTION;
if (!allocateSmoothGradients)
{
// Vanilla SGD does not need the smoothed gradients per se;
// insert dummy NDArrayViews instead.
for (const auto& parameter : parameters)
{
m_smoothedGradientValues.emplace(parameter, AllocateNDArrayView(parameter, {}));
}
}
}
/*virtual*/ void LearnerSGD::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
{
DISPATCH_TO_TYPED_UPDATE_FUNCTION;
}
template <typename ElementType>
void LearnerSGD::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
void LearnerSGD::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
{
const auto& parameterValue = parameter.Value();
const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue);
UNUSED(smoothedGradientValue);
const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);
const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameter.Value());
const auto learningRate = ElementType(LearningRate(trainingSampleCount));
const auto momentum = ElementType(MomentumValueForMB(trainingSampleCount));
// TODO: break up NormalGrad into 3 different functions, each with its own set of parameters
// (one for vanilla SGD, one for momentum SGD, and one for NAG).
// Also, come up with a better name for NormalGrad (Default? Regular? Plain?).
smoothedGradientMatrix->NormalGrad(*gradientMatrix, *parameterMatrix,
learningRate, momentum, UseNesterovMomentum());
parameterMatrix->SGDUpdate(*gradientMatrix, learningRate);
}
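SGDUpdate replaces the old NormalGrad call for the momentum-free case. As a reference sketch (not the actual Matrix kernel), the step it performs is the plain per-element SGD rule:
// parameter <- parameter - learningRate * gradient, applied element-wise
template <typename ElemType>
void SgdUpdateReference(ElemType* parameter, const ElemType* gradient, size_t n, ElemType learningRate)
{
    for (size_t i = 0; i < n; ++i)
        parameter[i] -= learningRate * gradient[i];
}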
double LearnerMomentumSGD::MomentumValueForMB(const MomentumSchedule& schedule, size_t minibatchSize) const
@ -397,6 +420,44 @@ namespace CNTK
return std::pow(currentMomentum, minibatchSize);
}
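When the momentum schedule is specified per sample, raising the per-sample value to the minibatch size gives the equivalent per-minibatch momentum: for example, a per-sample momentum of 0.99 compounds to 0.99^64 ≈ 0.53 for a 64-sample minibatch.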
/*virtual*/ void LearnerMomentumSGD::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
{
DISPATCH_TO_TYPED_UPDATE_FUNCTION;
}
template <typename ElementType>
void LearnerMomentumSGD::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
{
GET_WRITABLE_MATRICES;
const auto learningRate = ElementType(LearningRate(trainingSampleCount));
const auto momentum = ElementType(MomentumValueForMB(trainingSampleCount));
parameterMatrix->MomentumSGDUpdate(*gradientMatrix, *smoothedGradientMatrix,
learningRate, momentum, UseUnitGainMomentum());
}
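The unitGain flag selects between the two common momentum formulations. A sketch of the semantics the flag toggles (lr = learning rate, m = momentum, v = smoothed gradient, p = parameter):
// classic momentum:    v <- m * v + g;            p <- p - lr * v
// unit-gain momentum:  v <- m * v + (1 - m) * g;  p <- p - lr * v
// With unit gain, v stays a convex combination of past gradients, so the
// effective step size does not blow up as m approaches 1.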
/*virtual*/ void LearnerNesterov::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
{
DISPATCH_TO_TYPED_UPDATE_FUNCTION;
}
template <typename ElementType>
void LearnerNesterov::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
{
GET_WRITABLE_MATRICES;
const auto learningRate = ElementType(LearningRate(trainingSampleCount));
const auto momentum = ElementType(MomentumValueForMB(trainingSampleCount));
parameterMatrix->NesterovAcceleratedMomentumSGDUpdate(*gradientMatrix, *smoothedGradientMatrix,
learningRate, momentum, UseUnitGainMomentum());
}
LearnerAdaGrad::LearnerAdaGrad(const std::vector<Parameter>& parameters,
const LearningRateSchedule& learningRateSchedule,
bool needAveMultiplier,
@ -416,24 +477,21 @@ namespace CNTK
const auto shape = GetMatrixShape(parameter);
NDArrayViewPtr view = AllocateNDArrayView(parameter, { shape[0], factor * shape[1] });
m_smoothedGradientValues.insert(make_pair(parameter, view));
m_smoothedGradientValues.emplace(parameter, view);
}
}
/*virtual*/ void LearnerAdaGrad::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
/*virtual*/ void LearnerAdaGrad::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
{
UPDATE_FUNCTION;
DISPATCH_TO_TYPED_UPDATE_FUNCTION;
}
template <typename ElementType>
void LearnerAdaGrad::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
void LearnerAdaGrad::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
{
UNUSED(trainingSampleCount);
const auto& parameterValue = parameter.Value();
const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue);
const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);
GET_WRITABLE_MATRICES
const auto learningRate = LearningRate(trainingSampleCount);
@ -446,32 +504,33 @@ namespace CNTK
LearnerFSAdaGrad::LearnerFSAdaGrad(const vector<Parameter>& parameters,
const LearningRateSchedule& learningRateSchedule,
const MomentumSchedule& momentumSchedule,
bool unitGain,
const MomentumSchedule& varianceMomentumSchedule,
AdditionalLearningOptions additionalOptions)
: LearnerMomentumSGD(parameters, learningRateSchedule, momentumSchedule, additionalOptions, /*allocateSmoothGradients*/ false),
: LearnerMomentumSGD(parameters, learningRateSchedule, momentumSchedule,
unitGain, additionalOptions, /*allocateSmoothGradients*/ false),
m_varianceMomentumSchedule(varianceMomentumSchedule)
{
for (const auto& parameter : parameters)
{
const auto shape = GetMatrixShape(parameter);
NDArrayViewPtr view = AllocateNDArrayView(parameter, { shape[0], 2 * shape[1] });
m_smoothedGradientValues.insert(make_pair(parameter, view));
m_smoothedCounts.insert(make_pair(parameter, 0.0));
m_smoothedGradientValues.emplace(parameter, view);
m_smoothedCounts.emplace(parameter, 0.0);
}
}
/*virtual*/ void LearnerFSAdaGrad::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
/*virtual*/ void LearnerFSAdaGrad::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
{
UPDATE_FUNCTION;
DISPATCH_TO_TYPED_UPDATE_FUNCTION;
}
template <typename ElementType>
void LearnerFSAdaGrad::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
void LearnerFSAdaGrad::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
{
const auto& parameterValue = parameter.Value();
const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue);
const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);
GET_WRITABLE_MATRICES;
const auto learningRate = LearningRate(trainingSampleCount);
const auto momentum = MomentumValueForMB(trainingSampleCount);
@ -480,7 +539,8 @@ namespace CNTK
double& smoothedCount = m_smoothedCounts.at(parameter);
smoothedGradientMatrix->FSAdagradUpdate(trainingSampleCount, *gradientMatrix, *parameterMatrix, smoothedCount, learningRate, s_targetAdagradAvDenom, momentum, varMomentum);
smoothedGradientMatrix->FSAdagradUpdate(trainingSampleCount, *gradientMatrix, *parameterMatrix, smoothedCount, learningRate,
s_targetAdagradAvDenom, momentum, varMomentum, UseUnitGainMomentum());
}
LearnerRMSProp::LearnerRMSProp(const vector<Parameter>& parameters,
@ -503,24 +563,21 @@ namespace CNTK
const auto shape = GetMatrixShape(parameter);
NDArrayViewPtr view = AllocateNDArrayView(parameter, { shape[0], factor * shape[1] });
m_smoothedGradientValues.insert(make_pair(parameter, view));
m_smoothedGradientValues.emplace(parameter, view);
}
}
/*virtual*/ void LearnerRMSProp::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
/*virtual*/ void LearnerRMSProp::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const /*override*/
{
UPDATE_FUNCTION;
DISPATCH_TO_TYPED_UPDATE_FUNCTION;
}
template <typename ElementType>
void LearnerRMSProp::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
void LearnerRMSProp::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
{
UNUSED(trainingSampleCount);
const auto& parameterValue = parameter.Value();
const auto& smoothedGradientMatrix = GetWritableMatrix<ElementType>(smoothedGradientValue);
const auto& gradientMatrix = GetWritableMatrix<ElementType>(gradientValue);
const auto& parameterMatrix = GetWritableMatrix<ElementType>(parameterValue);
GET_WRITABLE_MATRICES;
const auto learningRate = LearningRate(trainingSampleCount);
@ -548,22 +605,25 @@ namespace CNTK
LearnerPtr MomentumSGDLearner(const vector<Parameter>& parameters,
const LearningRateSchedule& learningRateSchedule,
const MomentumSchedule& momentumSchedule,
bool unitGain,
AdditionalLearningOptions additionalOptions /*= AdditionalLearningOptions()*/)
{
return MakeSharedObject<LearnerMomentumSGD>(parameters, learningRateSchedule, momentumSchedule, additionalOptions);
return MakeSharedObject<LearnerMomentumSGD>(parameters, learningRateSchedule, momentumSchedule, unitGain, additionalOptions);
}
LearnerPtr NesterovLearner(const vector<Parameter>& parameters,
const LearningRateSchedule& learningRateSchedule,
const MomentumSchedule& momentumSchedule,
bool unitGain,
AdditionalLearningOptions additionalOptions /*= AdditionalLearningOptions()*/)
{
return MakeSharedObject<LearnerNesterov>(parameters, learningRateSchedule, momentumSchedule, additionalOptions);
return MakeSharedObject<LearnerNesterov>(parameters, learningRateSchedule, momentumSchedule, unitGain, additionalOptions);
}
LearnerPtr AdamLearner(const vector<Parameter>& parameters,
const LearningRateSchedule& learningRateSchedule,
const MomentumSchedule& momentumSchedule,
bool unitGain,
const MomentumSchedule& varianceMomentumSchedule, /*= MomentumAsTimeConstantSchedulePerSample(2 * 3600 * 100)*/
bool lowMemory, /*= true*/
AdditionalLearningOptions additionalOptions /*= AdditionalLearningOptions()*/)
@ -572,7 +632,7 @@ namespace CNTK
{
LogicError("AdamLearner: only the low-memory variant is supported at the moment.");
}
return MakeSharedObject<LearnerFSAdaGrad>(parameters, learningRateSchedule, momentumSchedule, varianceMomentumSchedule, additionalOptions);
return MakeSharedObject<LearnerFSAdaGrad>(parameters, learningRateSchedule, momentumSchedule, unitGain, varianceMomentumSchedule, additionalOptions);
}
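A usage sketch for the updated factory; the schedule values are illustrative, and the trailing parameters keep their declared defaults:
auto lr  = LearningRatePerSampleSchedule(0.005);
auto mom = MomentumAsTimeConstantSchedule(700);
// unitGain is now an explicit argument rather than an implicit behavior:
auto adam = AdamLearner(parameters, lr, mom, /*unitGain=*/ true);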
LearnerPtr AdaGradLearner(const vector<Parameter>& parameters,

View file

@ -17,7 +17,7 @@ namespace CNTK
class LearnerBase : public Learner
{
public:
virtual bool Update(std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, size_t trainingSampleCount) override final;
virtual bool Update(std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, size_t trainingSampleCount, bool sweepEnd = false) override final;
virtual Dictionary CreateCheckpoint() override final;
@ -108,26 +108,13 @@ namespace CNTK
};
// Vanilla gradient descent optimization algorithm.
class LearnerSGD : public LearnerBase
class LearnerSGD final : public LearnerBase
{
public:
LearnerSGD(const std::vector<Parameter>& parameters,
const LearningRateSchedule& learningRateSchedule,
LearnerSGD(const std::vector<Parameter>& parameters,
const LearningRateSchedule& learningRateSchedule,
AdditionalLearningOptions additionalOptions,
bool allocateSmoothGradients = true)
: LearnerBase(parameters, learningRateSchedule, additionalOptions, allocateSmoothGradients)
{}
// TODO: get rid of this as soon as NormalGrad is refactored.
virtual double MomentumValueForMB(size_t /*minibatchSize*/) const
{
return 0.0;
}
virtual bool UseNesterovMomentum() const
{
return false;
}
bool allocateSmoothGradients = false);
protected:
@ -138,30 +125,45 @@ namespace CNTK
};
// SGD optimization with momentum.
class LearnerMomentumSGD : public LearnerSGD
class LearnerMomentumSGD : public LearnerBase
{
public:
LearnerMomentumSGD(const std::vector<Parameter>& parameters,
const LearningRateSchedule& learningRateSchedule,
const MomentumSchedule& momentumSchedule,
bool unitGain,
AdditionalLearningOptions additionalOptions,
bool allocateSmoothGradients = true)
: LearnerSGD(parameters, learningRateSchedule, additionalOptions, allocateSmoothGradients),
m_momentumSchedule(momentumSchedule)
: LearnerBase(parameters, learningRateSchedule, additionalOptions, allocateSmoothGradients),
m_momentumSchedule(momentumSchedule),
m_unitGain(unitGain)
{ }
// returns current per-minibatch momentum value.
virtual double MomentumValueForMB(size_t minibatchSize) const override
virtual double MomentumValueForMB(size_t minibatchSize) const
{
return MomentumValueForMB(m_momentumSchedule, minibatchSize);
}
protected:
virtual void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const override;
template <typename ElementType>
void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;
// returns current per-minibatch momentum value from the provided schedule.
double MomentumValueForMB(const MomentumSchedule& schedule, size_t minibatchSize) const;
// Return true if the update should use classic momentum and
// false if the unit-gain momentum should be used instead.
bool UseUnitGainMomentum() const
{
return m_unitGain;
}
private:
MomentumSchedule m_momentumSchedule;
bool m_unitGain;
};
// Nesterov's accelerated gradient descent.
@ -172,14 +174,16 @@ namespace CNTK
LearnerNesterov(const std::vector<Parameter>& parameters,
const LearningRateSchedule& learningRateSchedule,
const MomentumSchedule& momentumSchedule,
bool unitGain,
AdditionalLearningOptions additionalOptions)
: LearnerMomentumSGD(parameters, learningRateSchedule, momentumSchedule, additionalOptions, /*allocateSmoothGradients*/ true)
: LearnerMomentumSGD(parameters, learningRateSchedule, momentumSchedule, unitGain, additionalOptions, /*allocateSmoothGradients*/ true)
{}
virtual bool UseNesterovMomentum() const override
{
return true;
}
protected:
virtual void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const override;
template <typename ElementType>
void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;
};
class LearnerAdaGrad : public LearnerBase
@ -206,6 +210,7 @@ namespace CNTK
LearnerFSAdaGrad(const std::vector<Parameter>& parameters,
const LearningRateSchedule& learningRateSchedule,
const MomentumSchedule& momentumSchedule,
bool unitGain,
const MomentumSchedule& varianceMomentumSchedule,
AdditionalLearningOptions additionalOptions);

View file

@ -23,6 +23,11 @@ namespace CNTK
return GetNextMinibatch(0, minibatchSizeInSamples, device);
}
const std::unordered_map<StreamInformation, MinibatchData>& MinibatchSource::GetNextMinibatch(size_t minibatchSizeInSequences, size_t minibatchSizeInSamples, const DeviceDescriptor& device /*= DeviceDescriptor::UseDefaultDevice()*/)
{
return GetNextMinibatch(minibatchSizeInSequences, minibatchSizeInSamples, 1, 0, device);
}
const StreamInformation& MinibatchSource::StreamInfo(const std::wstring& streamName)
{
std::unordered_set<const StreamInformation*> matchingStreamInfos;
@ -68,18 +73,15 @@ namespace CNTK
}
/*static*/ const std::wstring CompositeMinibatchSource::PositionAttributeName = L"minibatchSourcePosition";
/*static*/ const std::wstring CompositeMinibatchSource::DistributedAfterSampleCountAttributeName = L"minibatchDistributedAfterSampleCount";
CompositeMinibatchSource::CompositeMinibatchSource(const Dictionary& configuration)
: m_epochEndReached(false),
m_prevMinibatchSize(0),
m_epochSize(MinibatchSource::InfinitelyRepeat),
m_maxNumSamplesToRead(MinibatchSource::InfinitelyRepeat),
m_randomizedWindow(MinibatchSource::DefaultRandomizationWindow),
m_truncationLength(0),
m_numWorkers(1),
m_workerRank(0),
m_distributed(false),
m_distributedAfterSampleCount(MinibatchSource::InfiniteSamples)
m_workerRank(0)
{
// The CNTK reader implementation requires for each deserializer both the module and deserializer type be specified
// This is redundant and the V2 API users will just specify type from which the module is automatically inferred
@ -134,13 +136,7 @@ namespace CNTK
const wchar_t* epochSizeConfigurationKey = L"epochSize";
if (augmentedConfiguration.Contains(epochSizeConfigurationKey))
m_epochSize = augmentedConfiguration[epochSizeConfigurationKey].Value<size_t>();
if (m_epochSize == MinibatchSource::FullDataSweep)
m_epochSize = Microsoft::MSR::CNTK::requestDataSize;
// Set a big value, but not the max, in order to avoid bit overflow.
else if (m_epochSize == MinibatchSource::InfinitelyRepeat)
m_epochSize = std::numeric_limits<size_t>::max() / 2;
m_maxNumSamplesToRead = augmentedConfiguration[epochSizeConfigurationKey].Value<size_t>();
const wchar_t* randomizedWindowConfigurationKey = L"randomizationWindow";
if (augmentedConfiguration.Contains(randomizedWindowConfigurationKey))
@ -158,11 +154,6 @@ namespace CNTK
m_truncationLength = augmentedConfiguration[truncationLengthConfigurationKey].Value<size_t>();
}
// TODO: change all the dictionary names to string constants
const wchar_t* distributedAfterSampleCountConfigurationKey = L"distributedAfterSampleCount";
if (augmentedConfiguration.Contains(distributedAfterSampleCountConfigurationKey))
m_distributedAfterSampleCount = augmentedConfiguration[distributedAfterSampleCountConfigurationKey].Value<size_t>();
typedef Reader*(*CreateCompositeDataReaderProc)(const ConfigParameters* parameters);
CreateCompositeDataReaderProc createReaderProc = (CreateCompositeDataReaderProc)Plugin().Load(L"CompositeDataReader", "CreateCompositeDataReader");
std::shared_ptr<Microsoft::MSR::CNTK::Reader> compositeDataReader(createReaderProc(&config));
@ -194,6 +185,8 @@ namespace CNTK
/*virtual*/ const std::unordered_map<StreamInformation, MinibatchData>&
CompositeMinibatchSource::GetNextMinibatch(size_t minibatchSizeInSequences,
size_t minibatchSizeInSamples,
size_t numberOfWorkers,
size_t workerRank,
const DeviceDescriptor& device /*= DeviceDescriptor::UseDefaultDevice()*/) /*override*/
{
m_minibatchData.clear();
@ -206,35 +199,31 @@ namespace CNTK
if (minibatchSizeInSamples == 0)
InvalidArgument("GetNextMinibatch: Requested minibatch sizes must be > 0");
// For the first number of m_distributedAfterSampleCount samples, minibatch source won't run distributed.
bool wasDistributed = m_distributed;
if (!m_distributed && IsDistributed())
{
m_distributed = true;
if (m_numWorkers == 1)
{
MPIWrapperPtr mpi = MPIWrapper::GetInstance();
if (mpi == nullptr)
{
// create mpi instance if intended to be distributed
mpi = MPIWrapper::GetInstance(true);
}
m_numWorkers = mpi->NumNodesInUse();
m_workerRank = mpi->CurrentNodeRank();
}
}
if (m_prevMinibatchSize == 0)
{
EpochConfiguration epochConfig;
epochConfig.m_numberOfWorkers = m_distributed ? m_numWorkers : 1;
epochConfig.m_workerRank = m_distributed ? m_workerRank : 0;
epochConfig.m_numberOfWorkers = numberOfWorkers;
epochConfig.m_workerRank = workerRank;
epochConfig.m_minibatchSizeInSamples = minibatchSizeInSamples;
epochConfig.m_truncationSize = m_truncationLength;
epochConfig.m_allowMinibatchesToCrossSweepBoundaries = true;
if (m_maxNumSamplesToRead == MinibatchSource::FullDataSweep)
{
epochConfig.m_totalEpochSizeInSamples = Microsoft::MSR::CNTK::requestDataSize;
}
else if (m_maxNumSamplesToRead == MinibatchSource::InfinitelyRepeat)
{
// Set a big value, but not the max, in order to avoid bit overflow.
epochConfig.m_totalEpochSizeInSamples = std::numeric_limits<size_t>::max() / 2;
}
else
{
epochConfig.m_totalEpochSizeInSamples = m_maxNumSamplesToRead;
}
epochConfig.m_totalEpochSizeInSamples = m_epochSize;
epochConfig.m_epochIndex = 0;
m_matrices.clear();
std::unordered_set<InputStreamDescription> inputs;
@ -262,31 +251,38 @@ namespace CNTK
m_shim->StartEpoch(epochConfig, inputs);
m_prevMinibatchSize = minibatchSizeInSamples;
wasDistributed = m_distributed;
m_workerRank = workerRank;
m_numWorkers = numberOfWorkers;
}
if (minibatchSizeInSamples != m_prevMinibatchSize || wasDistributed != m_distributed)
if (minibatchSizeInSamples != m_prevMinibatchSize || m_workerRank != workerRank || m_numWorkers != numberOfWorkers)
{
std::map<std::wstring, int> inputDescriptions;
for (const auto& s : m_streamInfos)
inputDescriptions[s.m_name] = AsCNTKImplDeviceId(device);
ReaderConfiguration newConfig;
newConfig.m_numberOfWorkers = m_distributed ? m_numWorkers : 1;
newConfig.m_workerRank = m_distributed ? m_workerRank : 0;
newConfig.m_numberOfWorkers = numberOfWorkers;
newConfig.m_workerRank = workerRank;
newConfig.m_minibatchSizeInSamples = minibatchSizeInSamples;
newConfig.m_truncationSize = m_truncationLength;
newConfig.m_allowMinibatchesToCrossSweepBoundaries = true;
m_shim->SetConfiguration(newConfig, inputDescriptions);
m_prevMinibatchSize = minibatchSizeInSamples;
m_workerRank = workerRank;
m_numWorkers = numberOfWorkers;
}
auto hasData = m_shim->GetMinibatch(m_matrices);
m_epochEndReached = m_shim->IsEndOfEpoch();
if (m_epochEndReached && !hasData)
return m_minibatchData;
bool hasReachedSweepEnd = m_shim->IsEndOfSweep();
for (const auto& s: m_streamInfos)
{
auto input = m_matrices.GetInput(s.m_name);
@ -310,7 +306,7 @@ namespace CNTK
size_t numSamples = input.pMBLayout->GetActualNumSamples();
size_t numSequences = input.pMBLayout->GetNumSequences();
m_minibatchData[currentStreamInfo] = { numSequences, numSamples, minibatchValuePtr };
m_minibatchData[currentStreamInfo] = { minibatchValuePtr, numSequences, numSamples, hasReachedSweepEnd };
}
else
LogicError("Input data of type other than DataType::Float is currently unsupported by the CNTK built-in composite MinibatchSource!");
@ -324,7 +320,6 @@ namespace CNTK
{
Dictionary checkpointState;
checkpointState[PositionAttributeName] = m_shim->GetCurrentSamplePosition();
checkpointState[DistributedAfterSampleCountAttributeName] = m_distributedAfterSampleCount;
return checkpointState;
}
@ -332,6 +327,5 @@ namespace CNTK
{
auto checkpointedMinibatchSourcePosition = checkpoint[PositionAttributeName].Value<size_t>();
m_shim->SetCurrentSamplePosition(checkpointedMinibatchSourcePosition);
m_distributedAfterSampleCount = checkpoint[DistributedAfterSampleCountAttributeName].Value<size_t>();
}
}
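The worker topology is now supplied per call instead of being inferred from MPI state inside the source. A caller-side sketch, assuming the rank and worker count come from the communicator:
auto minibatch = source->GetNextMinibatch(/*minibatchSizeInSequences*/ 0, minibatchSizeInSamples,
                                          numberOfWorkers, workerRank, device);
// Each stream's MinibatchData now also carries the sweep-end flag:
bool sweepEnd = !minibatch.empty() && minibatch.begin()->second.sweepEnd;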

View file

@ -24,18 +24,16 @@ namespace CNTK
virtual const std::unordered_set<StreamInformation>& StreamInfos() override { return m_streamInfos; }
virtual const std::unordered_map<StreamInformation, MinibatchData>& GetNextMinibatch(size_t minibatchSizeInSamples,
size_t minibatchSizeInSequences,
const DeviceDescriptor& device = DeviceDescriptor::UseDefaultDevice()) override;
const std::unordered_map<StreamInformation, MinibatchData>& GetNextMinibatch(
size_t minibatchSizeInSamples,
size_t minibatchSizeInSequences,
size_t numberOfWorkers,
size_t workerRank,
const DeviceDescriptor& device = DeviceDescriptor::UseDefaultDevice()) override;
virtual Dictionary GetCheckpointState() const override;
virtual void RestoreFromCheckpoint(const Dictionary& checkpoint) override;
virtual bool IsDistributed() const override
{
return m_shim->GetCurrentSamplePosition() >= m_distributedAfterSampleCount;
}
private:
static Microsoft::MSR::CNTK::InputStreamDescription GetInputStreamDescription(const StreamInformation& s, const DeviceDescriptor& device)
{
@ -46,15 +44,13 @@ namespace CNTK
return Microsoft::MSR::CNTK::InputStreamDescription(s.m_name, CNTKdeviceId, CNTKMatrixType, CNTKMatrixFormat);
}
private:
private:
std::unordered_set<StreamInformation> m_streamInfos;
bool m_epochEndReached;
bool m_distributed;
size_t m_numWorkers;
size_t m_workerRank;
size_t m_distributedAfterSampleCount;
size_t m_prevMinibatchSize;
size_t m_epochSize;
size_t m_maxNumSamplesToRead;
size_t m_randomizedWindow;
size_t m_truncationLength;
std::unordered_map<StreamInformation, MinibatchData> m_minibatchData;

View file

@ -289,6 +289,33 @@ namespace CNTK
return MakeSharedObject<NDArrayView>(GetDataType(), Device(), GetStorageFormat(), Shape(), IsReadOnly() || readOnly, tensorView);
}
NDArrayViewPtr NDArrayView::AsShape(const NDShape& newShape) const
{
if (newShape.TotalSize() != Shape().TotalSize())
{
InvalidArgument("NDArrayView::AsShape: The size (%d) of 'source' view shape's (%S) must be same as the size (%d) of the newShape (%S)!",
(int)Shape().TotalSize(), AsStringForErrorReporting(Shape()).c_str(),
(int)newShape.TotalSize(), AsStringForErrorReporting(newShape).c_str());
}
auto newTensorShape = AsTensorShape(newShape);
void* tensorView = nullptr;
switch (m_dataType)
{
case DataType::Float:
tensorView = new TensorView<float>(*(GetTensorView<float>()), newTensorShape);
break;
case DataType::Double:
tensorView = new TensorView<double>(*(GetTensorView<double>()), newTensorShape);
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(m_dataType));
break;
}
return MakeSharedObject<NDArrayView>(GetDataType(), Device(), GetStorageFormat(), newShape, IsReadOnly(), tensorView);
}
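AsShape aliases the existing buffer under a new shape instead of copying, so it is only valid when the element counts match. A small sketch:
// Given an NDArrayViewPtr 'view' with shape {2, 3}:
auto flat = view->AsShape(NDShape({ 6 })); // same storage, now viewed as shape {6}
// view->AsShape(NDShape({ 4 })) would throw: 6 elements cannot be viewed as 4.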
// TODO: This could actually be strided?
template <typename ElementType>
ElementType* NDArrayView::WritableDataBuffer()

View file

@ -130,7 +130,9 @@ namespace CNTK
(op == PrimitiveOpType::CrossEntropyWithSoftmax) ||
(op == PrimitiveOpType::ClassificationError) ||
(op == PrimitiveOpType::Logistic) ||
(op == PrimitiveOpType::CosDistance))
(op == PrimitiveOpType::CosDistance) ||
(op == PrimitiveOpType::LambdaRank) ||
(op == PrimitiveOpType::NDCG))
{
outputDynamicAxes = std::vector<Axis>({});
}
@ -527,9 +529,13 @@ namespace CNTK
case PrimitiveOpType::SquaredError:
case PrimitiveOpType::CrossEntropyWithSoftmax:
case PrimitiveOpType::ClassificationError:
case PrimitiveOpType::LambdaRank:
case PrimitiveOpType::NDCG:
{
if ((op == PrimitiveOpType::ClassificationError) || (op == PrimitiveOpType::Logistic))
assert(inputs.size() >= 2);
else if ((op == PrimitiveOpType::LambdaRank) || (op == PrimitiveOpType::NDCG))
assert(inputs.size() == 3);
else
assert(inputs.size() == 2);
@ -708,7 +714,8 @@ namespace CNTK
if (m_op == PrimitiveOpType::Block)
{
auto blockCompositeFunc = dynamic_cast<const CompositeFunction*>(BlockComposite().get());
auto blockFunction = dynamic_cast<const BlockFunction*>(this);
auto blockCompositeFunc = dynamic_cast<const CompositeFunction*>(blockFunction->Composite().get());
dict[blockFunctionCompositeKey] = blockCompositeFunc->SerializeBlockComposite();
dict[blockFunctionOpNameKey] = OpName();
@ -742,7 +749,7 @@ namespace CNTK
// The hard requirement that the serialization depends on is that
// new op type values are only added to the end of the list, after Combine.
// This also applies to other enums (DataType, VariableKind, etc.)
if (op > PrimitiveOpType::Unpooling)
if (op > PrimitiveOpType::NDCG)
{
CNTK::LogicError("Unexpected op '%ls':'%u' (%s).",
opKey.c_str(),

View file

@ -86,8 +86,10 @@ namespace CNTK
{PrimitiveOpType::Sin, L"Sin"},
{PrimitiveOpType::Cos, L"Cos"},
{PrimitiveOpType::Pass, L"Pass"},
{PrimitiveOpType::Block, L"Block"},
{PrimitiveOpType::Unpooling, L"Unpooling"},
{ PrimitiveOpType::Block, L"Block" },
{ PrimitiveOpType::Unpooling, L"Unpooling" },
{ PrimitiveOpType::LambdaRank, L"LambdaRank" },
{ PrimitiveOpType::NDCG, L"NDCG" },
};
inline const std::wstring& PrimitiveOpTypeName(PrimitiveOpType opType)
@ -118,6 +120,10 @@ namespace CNTK
if (numFunctionInputs > 2)
indexMap.insert({ 2, 2 });
}
else if (op == PrimitiveOpType::LambdaRank)
indexMap = std::unordered_map<size_t, size_t>({ { 0, 1 }, { 1, 0 }, { 2, 2 } });
else if (op == PrimitiveOpType::NDCG)
indexMap = std::unordered_map<size_t, size_t>({ { 0, 1 }, { 1, 0 }, { 2, 2 } });
else if (op == PrimitiveOpType::CrossEntropyWithSoftmax)
indexMap = std::unordered_map<size_t, size_t>({ { 0, 1 }, { 1, 0 } });
else if (op == PrimitiveOpType::GatherPacked)
@ -711,6 +717,6 @@ namespace CNTK
// Increasing s_serializationVersion every time we add more ops allows us to print
// a more meaningful message when trying to load a new model with a stale binary.
static const size_t s_serializationVersion = 2;
static const size_t s_serializationVersion = 3;
};
}

View file

@ -68,6 +68,8 @@ namespace CNTK
Pass = 56,
Block = 57,
Unpooling = 58,
LambdaRank = 59,
NDCG = 60,
// New op types should only be appended to the end of this list.
// If you append here, also add checks in SerializationTests (CheckEnumValuesNotModified)
// and bump up PrimitiveFunction::s_serializationVersion

View file

@ -119,6 +119,29 @@ namespace CNTK
return (numSamplesInDataArrayView - numMaskedSamples);
}
static std::unordered_map<Variable, ValuePtr> GetInputs(const std::unordered_map<Variable, MinibatchData>& arguments)
{
std::unordered_map<Variable, ValuePtr> inputs(arguments.size());
for (const auto& kv : arguments)
{
inputs[kv.first] = kv.second.data;
}
return inputs;
}
static bool IsAtSweepEnd(const std::unordered_map<Variable, MinibatchData>& arguments)
{
return std::any_of(arguments.begin(), arguments.end(), [](const std::pair<const Variable, MinibatchData>& kv)
{
return kv.second.sweepEnd;
});
}
double Trainer::TestMinibatch(const std::unordered_map<Variable, MinibatchData>& arguments, const DeviceDescriptor& computeDevice /*= DeviceDescriptor::UseDefaultDevice()*/)
{
return TestMinibatch(GetInputs(arguments), computeDevice);
}
double Trainer::TestMinibatch(const std::unordered_map<Variable, ValuePtr>& arguments, const DeviceDescriptor& computeDevice /*= DeviceDescriptor::UseDefaultDevice()*/)
{
if (!m_aggregatedEvaluationFunction)
@ -126,12 +149,26 @@ namespace CNTK
// TODO: Should we refactor this code that is somewhat similar to the prologue of the TrainMinibatch function
std::unordered_map<Variable, ValuePtr> outputs = { { m_aggregatedEvaluationFunction, nullptr }, { m_testSampleCountVar, nullptr } };
m_combinedTrainingFunction->Forward(arguments, outputs, computeDevice);
auto sampleCount = GetSampleCount(m_testSampleCountVar, outputs[m_testSampleCountVar]);
return (GetScalarValue(outputs[m_aggregatedEvaluationFunction]) / sampleCount);
}
bool Trainer::TrainMinibatch(const std::unordered_map<Variable, MinibatchData>& arguments, const DeviceDescriptor& computeDevice /*= DeviceDescriptor::UseDefaultDevice()*/)
{
std::unordered_map<Variable, ValuePtr> outputsToFetch = {};
return TrainMinibatch(arguments, outputsToFetch, computeDevice);
}
bool Trainer::TrainMinibatch(const std::unordered_map<Variable, MinibatchData>& arguments, std::unordered_map<Variable, ValuePtr>& outputsToFetch, const DeviceDescriptor& computeDevice /*= DeviceDescriptor::UseDefaultDevice()*/)
{
if (!m_distributed)
return TrainLocalMinibatch(GetInputs(arguments), outputsToFetch, IsAtSweepEnd(arguments), computeDevice);
return TrainDistributedMinibatch(GetInputs(arguments), outputsToFetch, IsAtSweepEnd(arguments), computeDevice);
}
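The new overloads let a training loop hand the reader's output straight to the trainer; the sweep-end flag is extracted from the MinibatchData internally rather than passed by hand. A sketch with illustrative stream names:
auto minibatchData = source->GetNextMinibatch(minibatchSizeInSamples, device);
std::unordered_map<Variable, MinibatchData> arguments =
{
    { features, minibatchData[featureStreamInfo] },
    { labels,   minibatchData[labelStreamInfo] }
};
trainer->TrainMinibatch(arguments, device);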
bool Trainer::TrainMinibatch(const std::unordered_map<Variable, ValuePtr>& arguments, const DeviceDescriptor& computeDevice /*= DeviceDescriptor::UseDefaultDevice()*/)
{
std::unordered_map<Variable, ValuePtr> outputsToFetch = {};
@ -141,11 +178,11 @@ namespace CNTK
bool Trainer::TrainMinibatch(const std::unordered_map<Variable, ValuePtr>& arguments, std::unordered_map<Variable, ValuePtr>& outputsToFetch, const DeviceDescriptor& computeDevice /*= DeviceDescriptor::UseDefaultDevice()*/)
{
if (!m_distributed)
return TrainLocalMinibatch(arguments, outputsToFetch, computeDevice);
return TrainDistributedMinibatch(arguments, outputsToFetch, computeDevice);
return TrainLocalMinibatch(arguments, outputsToFetch, false, computeDevice);
return TrainDistributedMinibatch(arguments, outputsToFetch, false, computeDevice);
}
bool Trainer::TrainLocalMinibatch(const std::unordered_map<Variable, ValuePtr>& arguments, std::unordered_map<Variable, ValuePtr>& outputsToFetch, const DeviceDescriptor& computeDevice /*= DeviceDescriptor::UseDefaultDevice()*/)
bool Trainer::TrainLocalMinibatch(const std::unordered_map<Variable, ValuePtr>& arguments, std::unordered_map<Variable, ValuePtr>& outputsToFetch, bool sweepEnd, const DeviceDescriptor& computeDevice /*= DeviceDescriptor::UseDefaultDevice()*/)
{
bool emptyMinibatch = arguments.empty() || (arguments.begin()->second == nullptr);
if (emptyMinibatch) // Nothing to train with.
@ -157,10 +194,10 @@ namespace CNTK
std::unordered_map<Parameter, NDArrayViewPtr> gradients;
for (const auto& parameter : m_combinedTrainingFunction->Parameters())
gradients[parameter] = parameterGradients[parameter]->Data();
return m_parameterLearners->Update(gradients, m_prevMinibatchNumSamples);
return m_parameterLearners->Update(gradients, m_prevMinibatchNumSamples, sweepEnd);
}
bool Trainer::TrainDistributedMinibatch(const std::unordered_map<Variable, ValuePtr>& arguments, std::unordered_map<Variable, ValuePtr>& outputsToFetch, const DeviceDescriptor& computeDevice /*= DeviceDescriptor::UseDefaultDevice()*/)
bool Trainer::TrainDistributedMinibatch(const std::unordered_map<Variable, ValuePtr>& arguments, std::unordered_map<Variable, ValuePtr>& outputsToFetch, bool sweepEnd, const DeviceDescriptor& computeDevice /*= DeviceDescriptor::UseDefaultDevice()*/)
{
std::unordered_map<Parameter, NDArrayViewPtr> gradients;
auto modelParameters = m_combinedTrainingFunction->Parameters();
@ -187,7 +224,7 @@ namespace CNTK
evalCriterion = m_prevMinibatchAggregateEvalCriterionValue->Data();
}
MinibatchInfo info { arguments.empty(), m_prevMinibatchNumSamples, trainingLoss, evalCriterion };
MinibatchInfo info{ arguments.empty(), sweepEnd, m_prevMinibatchNumSamples, trainingLoss, evalCriterion };
bool updated = m_parameterLearners->Update(gradients, info);
m_prevMinibatchNumSamples = info.numberOfSamples;
@ -344,4 +381,14 @@ namespace CNTK
{
return m_parameterLearners->ParameterLearners().front()->TotalNumberOfSamplesSeen();
}
TrainerPtr CreateTrainer(const FunctionPtr& model, const FunctionPtr& lossFunction, const std::vector<LearnerPtr>& parameterLearners)
{
return MakeSharedObject<Trainer>(model, lossFunction, parameterLearners);
}
TrainerPtr CreateTrainer(const FunctionPtr& model, const FunctionPtr& lossFunction, const FunctionPtr& evaluationFunction, const std::vector<LearnerPtr>& parameterLearners)
{
return MakeSharedObject<Trainer>(model, lossFunction, evaluationFunction, parameterLearners);
}
}
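The factories mirror the Trainer constructors and return a shared TrainerPtr, so client code no longer needs to construct Trainer directly:
auto trainer = CreateTrainer(model, loss, metric, { learner });
// or, without an evaluation function:
auto trainer2 = CreateTrainer(model, loss, { learner });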

View file

@ -0,0 +1,148 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#include "CNTKLibrary.h"
#include "fileutil.h"
namespace CNTK
{
const std::wstring TrainingSession::s_checkpointIndex = L"CheckpointIndex";
const std::wstring TrainingSession::s_trainingMinibatchSource = L"TrainingMinibatchSource";
TrainingSessionPtr CreateBasicTrainingSession(
const MinibatchSourcePtr& trainingSource,
const TrainerPtr& trainer,
const std::unordered_map<Variable, StreamInformation>& modelInputToMinibatchSourceStream,
const MinibatchSizeSchedule& minibatchSizeSchedule,
size_t checkpointFrequencyinSamples,
const std::wstring& checkPointFileName)
{
return MakeSharedObject<TrainingSession>(trainingSource,
trainer,
modelInputToMinibatchSourceStream,
minibatchSizeSchedule,
checkpointFrequencyinSamples,
checkPointFileName);
}
TrainingSession::TrainingSession(
const MinibatchSourcePtr& trainingSource,
const TrainerPtr& trainer,
const std::unordered_map<Variable, StreamInformation>& modelInputToMinibatchSourceStream,
const MinibatchSizeSchedule& schedule,
size_t checkpointFrequencyInSamples,
const std::wstring& checkPointFileName) :
m_trainingSource(trainingSource),
m_trainer(trainer),
m_modelInputToMinibatchSourceStream(modelInputToMinibatchSourceStream),
m_checkpointFrequencyinSamples(checkpointFrequencyInSamples),
m_checkPointFileName(checkPointFileName),
m_currentCheckpointIndex(0),
m_parallelAfterSamples(0),
m_workerRank(0),
m_numberOfWorkers(1),
m_minibatchSizeSchedule(schedule)
{
if (!trainingSource)
InvalidArgument("Minibatch source is not allowed to be null.");
if (!trainer)
InvalidArgument("Trainer is not allowed to be null.");
if (modelInputToMinibatchSourceStream.empty())
InvalidArgument("Input mapping is not allowed to be empty.");
if (m_checkPointFileName.empty() && checkpointFrequencyInSamples != 0)
InvalidArgument("Checkpoint file name is not allowed to be empty.");
// Let's calculate the warm up period the distributed learners may need.
// We will take the maximum warm up period required.
auto learners = trainer->ParameterLearners();
m_parallelAfterSamples = 0;
for (const auto& l: learners)
{
auto distributed = std::dynamic_pointer_cast<DistributedLearner>(l);
if (distributed)
{
m_parallelAfterSamples = std::max(m_parallelAfterSamples, distributed->ParallelizationAfter());
m_workerRank = distributed->GetCommunicator()->CurrentWorker().m_globalRank;
m_numberOfWorkers = distributed->GetCommunicator()->Workers().size();
}
}
}
void TrainingSession::Train(const DeviceDescriptor& computeDevice)
{
std::unordered_map<Variable, ValuePtr> minibatch;
bool shouldTrain = true;
size_t workerRank = 0, numberOfWorkers = 1;
size_t samplesInEpoch = 0;
while (shouldTrain)
{
// Check whether the warm-up period is over and reading should become distributed.
if (m_trainer->TotalNumberOfSamplesSeen() >= m_parallelAfterSamples)
{
numberOfWorkers = m_numberOfWorkers;
workerRank = m_workerRank;
}
size_t mbSize = GetMinibatchSize();
auto minibatchData = m_trainingSource->GetNextMinibatch(0 /*numberOfSequences*/, mbSize, numberOfWorkers, workerRank, computeDevice);
minibatch.clear();
if (!minibatchData.empty())
{
for (auto v : m_modelInputToMinibatchSourceStream)
minibatch.insert({ v.first, minibatchData[v.second].data });
}
OnMinibatchStart();
shouldTrain = m_trainer->TrainMinibatch(minibatch, computeDevice);
OnMinibatchEnd();
// Local number of samples.
samplesInEpoch += m_trainer->PreviousMinibatchSampleCount();
// Check whether to create a checkpoint
if (m_checkpointFrequencyinSamples > 0)
{
size_t checkpointIndex = m_trainer->TotalNumberOfSamplesSeen() / m_checkpointFrequencyinSamples;
if (checkpointIndex > m_currentCheckpointIndex)
{
samplesInEpoch = 0;
m_currentCheckpointIndex = checkpointIndex;
SaveCheckpoint();
}
}
}
if (m_checkpointFrequencyinSamples > 0)
SaveCheckpoint();
}
void TrainingSession::RestoreFromCheckpoint(const std::wstring& checkpointFileName)
{
Dictionary externalState = m_trainer->RestoreFromCheckpoint(checkpointFileName);
m_currentCheckpointIndex = externalState[s_checkpointIndex].Value<size_t>();
m_trainingSource->RestoreFromCheckpoint(externalState[s_trainingMinibatchSource].Value<Dictionary>());
}
void TrainingSession::SaveCheckpoint()
{
OnCheckpointStart();
Dictionary externalState;
externalState[s_checkpointIndex] = m_currentCheckpointIndex;
externalState[s_trainingMinibatchSource] = m_trainingSource->GetCheckpointState();
std::wstring tempFileName = m_checkPointFileName + L".tmp";
m_trainer->SaveCheckpoint(tempFileName, externalState);
// Perform the actual renaming only on the main worker.
if (m_workerRank == 0)
{
_wunlink(m_checkPointFileName.c_str());
renameOrDie(tempFileName, m_checkPointFileName);
}
OnCheckpointEnd();
}
}
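Putting the new class to work: a minimal driver sketch, assuming a configured minibatch source, trainer, input-to-stream mapping, and minibatch-size schedule (names and values are illustrative):
auto session = CreateBasicTrainingSession(
    source, trainer, inputToStreamMap,
    minibatchSizeSchedule,
    /*checkpointFrequencyInSamples*/ 100000,
    L"model.checkpoint");
session->Train(DeviceDescriptor::UseDefaultDevice());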

View file

@ -241,7 +241,7 @@ namespace CNTK
template <typename T>
TrainingParameterSchedule<T>::TrainingParameterSchedule(T value, UnitType unit)
: m_schedule({ make_pair(0, value) }), m_unit(unit), m_epochSize(EntireSweep)
: m_schedule({ make_pair(0, value) }), m_unit(unit), m_epochSize(FullDataSweep)
{
}
@ -268,13 +268,9 @@ namespace CNTK
template <typename T>
void TrainingParameterSchedule<T>::ConstructSchedule(const std::vector<std::pair<size_t, T>>& schedule)
{
if (m_epochSize == EntireSweep)
{
//Sweep based schedules are currently not functional (learners don't have sweep info).
NOT_IMPLEMENTED;
}
const auto epochSize = (m_epochSize == EntireSweep) ? 1 : m_epochSize;
// In case of the FullDataSweep, the scheduling unit is just 1 sweep,
// otherwise, it's the epoch size in samples.
const auto unitSize = (m_epochSize == FullDataSweep) ? 1 : m_epochSize;
if (schedule.size() == 0)
RuntimeError("TrainingParameterSchedule::ConstructSchedule : schedule is empty.");
@ -288,7 +284,7 @@ namespace CNTK
RuntimeError("TrainingParameterSchedule::ConstructSchedule : unit count in the 'schedule' argument cannot be 0.");
unitCount += (pair.first != 0) ? pair.first : 1;
m_schedule[epochSize * unitCount] = pair.second;
m_schedule[unitSize * unitCount] = pair.second;
}
}
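Worked example: with schedule {{2, 0.05}, {3, 0.01}} and an epoch size of 100 samples, the loop produces m_schedule[200] = 0.05 and m_schedule[500] = 0.01; with FullDataSweep the same schedule yields keys 2 and 5, i.e. the units become whole sweeps.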
@ -830,6 +826,7 @@ namespace CNTK
template void DictionaryValue::FreePtrAsType<NDArrayView>();
template class TrainingParameterSchedule<double>;
template class TrainingParameterSchedule<size_t>;
Learners::Learners(const std::vector<LearnerPtr>& learners) :
m_learners(learners),
@ -879,14 +876,14 @@ namespace CNTK
}
}
bool Learners::Update(std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, size_t sampleInMinibatch)
bool Learners::Update(std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, size_t sampleInMinibatch, bool sweepEnd)
{
bool anyUpdatesPerformed = false;
for (auto learner : m_learners)
{
std::unordered_map<Parameter, NDArrayViewPtr> learnerGradients;
GetLearnerGradients(learner, gradientValues, learnerGradients);
anyUpdatesPerformed |= learner->Update(learnerGradients, sampleInMinibatch);
anyUpdatesPerformed |= learner->Update(learnerGradients, sampleInMinibatch, sweepEnd);
}
return anyUpdatesPerformed;
}

View file

@ -501,7 +501,7 @@ namespace CNTK
public:
explicit Learners(const std::vector<LearnerPtr>& learners);
bool Update(std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, size_t trainingSampleCount);
bool Update(std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, size_t trainingSampleCount, bool sweepEnd);
bool Update(std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, MinibatchInfo& minibatchInfo);
std::vector<DictionaryValue> CreateCheckpoint();

View file

@ -194,16 +194,19 @@ namespace CNTK
NDMaskPtr deviceValueMask = CreateMask(sequenceLengths, sequenceStartFlags, DeviceDescriptor::CPUDevice());
NDArrayViewPtr valueData;
NDShape valueDataShape = sampleShape.AppendShape({ maxSequenceLength, numSequences });
if (numSequences == 1)
{
if (createNewCopy)
valueData = sequences[0]->DeepClone();
else
valueData = sequences[0];
// We can use the original buffer directly but need to reshape to the valueDataShape
valueData = valueData->AsShape(valueDataShape);
}
else
{
NDShape valueDataShape = sampleShape.AppendShape({ maxSequenceLength, numSequences });
if (isDataSparse)
{
if (storageFormat != StorageFormat::SparseCSC)

View file

@ -61,8 +61,6 @@ struct ProcessorData
nvmlMemory_t memory;
nvmlUtilization_t utilization;
cudaDeviceProp deviceProp;
size_t cudaFreeMem;
size_t cudaTotalMem;
bool cntkFound;
int deviceId; // the deviceId (cuda side) for this processor
};
@ -270,29 +268,16 @@ void BestGpu::GetCudaProperties()
if (m_cudaData)
return;
int currentDevice, rc;
rc = cudaGetDevice(&currentDevice);
int dev = 0;
for (ProcessorData* pd : m_procData)
{
cudaSetDevice(dev);
pd->deviceId = dev;
cudaGetDeviceProperties(&pd->deviceProp, dev);
size_t free;
size_t total;
cudaMemGetInfo(&free, &total);
pd->cores = _ConvertSMVer2Cores(pd->deviceProp.major, pd->deviceProp.minor) * pd->deviceProp.multiProcessorCount;
pd->cudaFreeMem = free;
pd->cudaTotalMem = total;
dev++;
cudaDeviceReset();
}
m_cudaData = m_procData.size() > 0;
if (rc == CUDA_SUCCESS)
{
cudaSetDevice(currentDevice);
}
}
void BestGpu::Init()
@ -486,10 +471,7 @@ std::vector<int> BestGpu::GetDevices(int number, BestGpuFlags p_bestFlags)
score = (1.0 - pd->utilization.gpu / 75.0f) * utilGpuW;
score += (1.0 - pd->utilization.memory / 60.0f) * utilMemW;
score += pd->cores / 1000.0f * speedW;
double mem = pd->memory.total > 0 ? pd->memory.free / (double) pd->memory.total : 1000000; // memory.total has been observed to be 0 when remoted in
// if it's not a tcc driver, then it's WDDM driver and values will be off because windows allocates all the memory from the nvml point of view
if (!pd->deviceProp.tccDriver || pd->memory.total == 0)
mem = pd->cudaFreeMem / (double) pd->cudaTotalMem;
double mem = pd->memory.total > 0 ? pd->memory.free / (double) pd->memory.total : 1; // memory.total has been observed to be 0 when remoted in
score += mem * freeMemW;
score += (pd->cntkFound ? 0 : 1) * mlAppRunningW;
for (int i = 0; i < best.size(); i++)

View file

@ -294,6 +294,13 @@ public:
return GetNumTimeSteps() * GetNumParallelSequences();
}
// Get the number of frames of the input sequence that belong to the MB, i.e. disregarding sequence elements that are outside of the MB boundaries
// Input sequence is expected to belong to this MBLayout
size_t GetNumSequenceFramesInCurrentMB(const SequenceInfo& sequenceInfo) const
{
return min(sequenceInfo.tEnd, GetNumTimeSteps()) - max(sequenceInfo.tBegin, (ptrdiff_t)0);
}
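For example, a sequence with tBegin = -3 and tEnd = 10 inside a minibatch of 8 time steps contributes min(10, 8) - max(-3, 0) = 8 frames; only the overlap with the minibatch window is counted.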
// return all sequences stored in this minibatch
const vector<SequenceInfo>& GetAllSequences() const
{
@ -515,6 +522,18 @@ public:
return col;
}
// get the matrix-column indices for a given sequence
// sequence is expected to belong to this MB
vector<size_t> GetColumnIndices(const SequenceInfo& seq) const
{
size_t numFrames = GetNumSequenceFramesInCurrentMB(seq);
vector<size_t> res;
res.reserve(numFrames);
for (size_t i = 0; i < numFrames; ++i)
res.push_back(GetColumnIndex(seq, i));
return res;
}
private:
// we are trying to access content--this verifies that the structure is consistent
// All frames must now be declared.
@ -836,7 +855,7 @@ inline bool MBLayout::IsBeyondMinibatch(const FrameRange& fr) const
if (fr.IsAllFrames())
LogicError("MBLayout::IsBeyondStartOrEnd() cannot be applied to FrameRange that specifies more than a single time step.");
const auto beginTime = (ptrdiff_t)fr.timeIdxInSeq + fr.m_timeOffset; // we test off the frame without offset
const auto beginTime = (ptrdiff_t)fr.timeIdxInSeq + fr.m_timeOffset; // we test off the frame with offset
const auto endTime = beginTime + (ptrdiff_t)fr.m_timeRange;
return beginTime < 0 || endTime > (ptrdiff_t)GetNumTimeSteps();
}

View file

@ -446,6 +446,7 @@ bool ComputationNetwork::IsTypicalCriterionNode(ComputationNodeBasePtr nodePtr)
nodePtr->OperationName() == OperationNameOf(CrossEntropyNode) ||
nodePtr->OperationName() == OperationNameOf(ClassBasedCrossEntropyWithSoftmaxNode) ||
nodePtr->OperationName() == OperationNameOf(ClassificationErrorNode) ||
nodePtr->OperationName() == OperationNameOf(EditDistanceErrorNode) ||
#ifdef COMING_SOON
nodePtr->OperationName() == OperationNameOf(CRFNode) ||
#endif

View file

@ -54,6 +54,7 @@ static shared_ptr<ComputationNode<ElemType>> CreateStandardNode(const std::wstri
else if (nodeType == OperationNameOf(DropoutNode)) return New<DropoutNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(DummyCriterionNode)) return New<DummyCriterionNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(DynamicAxisNode)) return New<DynamicAxisNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(EditDistanceErrorNode)) return New<EditDistanceErrorNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(ElementTimesNode)) return New<ElementTimesNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(EnvironmentInputNode)) return New<EnvironmentInputNode<ElemType>>(forward<_Types>(_Args)...);
else if (nodeType == OperationNameOf(EpochAccumulatorNode)) return New<EpochAccumulatorNode<ElemType>>(forward<_Types>(_Args)...);
@ -428,6 +429,12 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Class
return net.AddNodeToNetAndAttachInputs(New<ClassificationErrorNode<ElemType>>(net.GetDeviceId(), nodeName), { a, b });
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::EditDistanceError(const ComputationNodePtr a, const ComputationNodePtr b, float subPen, float delPen, float insPen, bool squashInputs, vector<int> samplesToIgnore, const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<EditDistanceErrorNode<ElemType>>(net.GetDeviceId(), nodeName, subPen, delPen, insPen, squashInputs, samplesToIgnore), { a, b });
}
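A builder-side sketch against the signature above; the penalty values are illustrative, and 'builder' stands for a ComputationNetworkBuilder<ElemType> instance:
// Substitution/deletion/insertion penalties of 1 reproduce plain edit distance:
auto errNode = builder.EditDistanceError(outputNode, labelNode,
    /*subPen*/ 1.0f, /*delPen*/ 1.0f, /*insPen*/ 1.0f,
    /*squashInputs*/ false, /*samplesToIgnore*/ {}, L"editDistanceError");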
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::PerDimMeanVarNormalization(const ComputationNodePtr feature, const ComputationNodePtr mean,
const ComputationNodePtr InvStdDev, const std::wstring nodeName)

View file

@ -129,6 +129,7 @@ public:
ComputationNodePtr Diagonal(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr Dropout(const ComputationNodePtr a, const std::wstring nodeName = L"");
ComputationNodePtr DummyCriterion(const ComputationNodePtr objectives, const ComputationNodePtr derivatives, const ComputationNodePtr prediction, const std::wstring nodeName = L"");
ComputationNodePtr EditDistanceError(const ComputationNodePtr a, const ComputationNodePtr b, float subPen, float delPen, float insPen, bool squashInputs, vector<int> samplesToIgnore, const std::wstring nodeName = L"");
ComputationNodePtr ElementTimes(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");
ComputationNodePtr DynamicAxis(const ComputationNodePtr a, const std::wstring& nodeName = L"");
ComputationNodePtr ClassificationError(const ComputationNodePtr a, const ComputationNodePtr b, const std::wstring nodeName = L"");

View file

@ -1005,15 +1005,27 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
bool performingBackPropagation = (trainRootNode != nullptr) || (Globals::ShouldEnableHyperCompressMemory());
// Construct the composite forward prop eval order by enumerating the
// nodes corresponding to each of our roots in global eval order
forwardPropRoots = SortByGlobalEvalOrder(forwardPropRoots);
// Create a composite Eval order with the specified nodes as roots
// For each node determine parents and whether the output of the
// node is needed during back propagation
std::unordered_map<ComputationNodeBasePtr, bool> outputValueNeededDuringBackProp;
std::unordered_map<ComputationNodeBasePtr, std::unordered_set<ComputationNodeBasePtr>> parentsMap;
std::vector<ComputationNodeBasePtr> compositeForwardPropEvalOrder;
std::unordered_set<ComputationNodeBasePtr> uniqueForwardPropEvalNodes;
for (auto& rootNode : forwardPropRoots)
{
for (const auto& node : GetEvalOrder(rootNode))
{
if (uniqueForwardPropEvalNodes.find(node) == uniqueForwardPropEvalNodes.end())
{
uniqueForwardPropEvalNodes.insert(node);
compositeForwardPropEvalOrder.push_back(node);
}
for (int i = 0; i < node->GetNumInputs(); i++)
{
ComputationNodeBasePtr input = node->GetInputs()[i];
@ -1050,13 +1062,6 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
}
}
// Construct the composite forward prop eval order by enumerating the
// nodes corresponding to each of our roots and then arranging them in the
// relative order that they appear in the global evaluation order
std::list<ComputationNodeBasePtr> nodesForForwardPropRoots = ComputationNodeBase::EnumerateNodes(forwardPropRoots);
std::vector<ComputationNodeBasePtr> compositeForwardPropEvalOrder = SortByGlobalEvalOrder(nodesForForwardPropRoots);
set<ComputationNodeBasePtr> completedEvaluate;
for (auto& nodeIter : compositeForwardPropEvalOrder)
{

Some files were not shown because too many files have changed in this diff.