Merge branch 'master' into qiwye/asgd-dev

# Conflicts:
#	CNTK.sln
#	Makefile
This commit is contained in:
unknown 2016-11-10 16:50:46 +08:00
Parent 10a6535ef9 ac1a9469ef
Commit 249989b95f
91 changed files with 12485 additions and 974 deletions

View file

@ -158,7 +158,7 @@
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(ConfigurationType)' == 'StaticLibrary' And $(ReleaseBuild) And !$(NoOptBuild)">
<ItemDefinitionGroup Condition="$(ReleaseBuild) And !$(NoOptBuild)">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
@ -180,8 +180,16 @@
<IntrinsicFunctions>false</IntrinsicFunctions>
</ClCompile>
<Link>
<EnableCOMDATFolding>false</EnableCOMDATFolding>
<OptimizeReferences>false</OptimizeReferences>
<Profile>false</Profile>
</Link>
</ItemDefinitionGroup>
<PropertyGroup Condition="$(NoOptBuild)" Label="Configuration">
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>false</WholeProgramOptimization>
<LinkIncremental>true</LinkIncremental>
</PropertyGroup>
</Project>

View file

@ -1292,6 +1292,11 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "Multiverso", "Source\Multiv
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "MultiversoTests", "Source\Multiverso\Test\unittests\MultiversoTests.vcxproj", "{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CPPEvalExtendedClientTest", "Tests\EndToEndTests\EvalClientTests\CPPEvalExtendedClientTest\CPPEvalExtendedClientTest.vcxproj", "{5D29C76D-648A-456F-920D-48230F2FB3C8}"
ProjectSection(ProjectDependencies) = postProject
{482999D1-B7E2-466E-9F8D-2119F93EAFD9} = {482999D1-B7E2-466E-9F8D-2119F93EAFD9}
EndProjectSection
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug_CpuOnly|Any CPU = Debug_CpuOnly|Any CPU
@ -2297,6 +2302,31 @@ Global
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release|Mixed Platforms.Build.0 = Release|x64
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release|x64.ActiveCfg = Release|x64
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F}.Release|x64.Build.0 = Release|x64
{5D29C76D-648A-456F-920D-48230F2FB3C8}.Debug_CpuOnly|Any CPU.ActiveCfg = Debug_CpuOnly|x64
{5D29C76D-648A-456F-920D-48230F2FB3C8}.Debug_CpuOnly|Mixed Platforms.ActiveCfg = Debug_CpuOnly|x64
{5D29C76D-648A-456F-920D-48230F2FB3C8}.Debug_CpuOnly|Mixed Platforms.Build.0 = Debug_CpuOnly|x64
{5D29C76D-648A-456F-920D-48230F2FB3C8}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64
{5D29C76D-648A-456F-920D-48230F2FB3C8}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64
{5D29C76D-648A-456F-920D-48230F2FB3C8}.Debug|Any CPU.ActiveCfg = Debug|x64
{5D29C76D-648A-456F-920D-48230F2FB3C8}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
{5D29C76D-648A-456F-920D-48230F2FB3C8}.Debug|Mixed Platforms.Build.0 = Debug|x64
{5D29C76D-648A-456F-920D-48230F2FB3C8}.Debug|x64.ActiveCfg = Debug|x64
{5D29C76D-648A-456F-920D-48230F2FB3C8}.Debug|x64.Build.0 = Debug|x64
{5D29C76D-648A-456F-920D-48230F2FB3C8}.Release_CpuOnly|Any CPU.ActiveCfg = Release_CpuOnly|x64
{5D29C76D-648A-456F-920D-48230F2FB3C8}.Release_CpuOnly|Mixed Platforms.ActiveCfg = Release_CpuOnly|x64
{5D29C76D-648A-456F-920D-48230F2FB3C8}.Release_CpuOnly|Mixed Platforms.Build.0 = Release_CpuOnly|x64
{5D29C76D-648A-456F-920D-48230F2FB3C8}.Release_CpuOnly|x64.ActiveCfg = Release_CpuOnly|x64
{5D29C76D-648A-456F-920D-48230F2FB3C8}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
{5D29C76D-648A-456F-920D-48230F2FB3C8}.Release_NoOpt|Any CPU.ActiveCfg = Release_CpuOnly|x64
{5D29C76D-648A-456F-920D-48230F2FB3C8}.Release_NoOpt|Mixed Platforms.ActiveCfg = Release_CpuOnly|x64
{5D29C76D-648A-456F-920D-48230F2FB3C8}.Release_NoOpt|Mixed Platforms.Build.0 = Release_CpuOnly|x64
{5D29C76D-648A-456F-920D-48230F2FB3C8}.Release_NoOpt|x64.ActiveCfg = Release_CpuOnly|x64
{5D29C76D-648A-456F-920D-48230F2FB3C8}.Release_NoOpt|x64.Build.0 = Release_CpuOnly|x64
{5D29C76D-648A-456F-920D-48230F2FB3C8}.Release|Any CPU.ActiveCfg = Release|x64
{5D29C76D-648A-456F-920D-48230F2FB3C8}.Release|Mixed Platforms.ActiveCfg = Release|x64
{5D29C76D-648A-456F-920D-48230F2FB3C8}.Release|Mixed Platforms.Build.0 = Release|x64
{5D29C76D-648A-456F-920D-48230F2FB3C8}.Release|x64.ActiveCfg = Release|x64
{5D29C76D-648A-456F-920D-48230F2FB3C8}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
@ -2476,5 +2506,6 @@ Global
{F4CCAAB2-0DB2-4281-929A-2E68E30F0F6E} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE}
{16F14058-B116-49D9-8BA0-209F3AFFE849} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
{EC7157E9-A51F-4702-A5FD-8DAF88C7029F} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE}
{5D29C76D-648A-456F-920D-48230F2FB3C8} = {05E45AF7-C069-4057-BC16-0A532D068CE4}
EndGlobalSection
EndGlobal

View file

@ -0,0 +1,326 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// CPPEvalExtendedClient.cpp : Sample application using the extended evaluation interface from C++
//
#include <sys/stat.h>
#include <inttypes.h>
#include <algorithm>
#include <fstream>
#include <unordered_map>
#include "Eval.h"
#ifdef _WIN32
#include "Windows.h"
#endif
using namespace std;
using namespace Microsoft::MSR::CNTK;
// Used for retrieving the model appropriate for the element type (float / double)
template<typename ElemType>
using GetEvalProc = void(*)(IEvaluateModelExtended<ElemType>**);
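// Builds a word -> index vocabulary from a file with one token per line (0-based indices).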
std::unordered_map<std::string, size_t> buildVocab(std::string filePath)
{
std::ifstream ifs(filePath);
size_t idx = 0;
std::unordered_map<std::string, size_t> vocab;
std::string line;
while (std::getline(ifs, line))
{
vocab.insert(std::pair<std::string, size_t>(line, idx));
idx += 1;
}
ifs.close();
return vocab;
}
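// Builds the inverse index -> word vocabulary; note it starts at index 1, unlike buildVocab.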
std::unordered_map<size_t, std::string> buildInvVocab(std::string filePath)
{
std::ifstream ifs(filePath);
size_t idx = 1;
std::unordered_map<size_t, std::string> vocab;
std::string line;
while (std::getline(ifs, line))
{
vocab.insert(std::pair<size_t, std::string>(idx, line));
idx += 1;
}
ifs.close();
return vocab;
}
size_t word2idx(std::string word, std::unordered_map<std::string, size_t>& word2idxVocab)
{
std::unordered_map<std::string, size_t>::iterator iter = word2idxVocab.find(word);
if (iter == word2idxVocab.end())
{
throw std::runtime_error("word not found in source vocab");
}
return iter->second;
}
std::string idx2word(size_t idx, std::unordered_map<size_t, std::string>& idx2wordVocab)
{
std::unordered_map<size_t, std::string>::iterator iter = idx2wordVocab.find(idx);
if (iter == idx2wordVocab.end())
{
throw std::runtime_error("word index is not found in target vocab");
}
return iter->second;
}
void addOneHotWord(Values<float>& inputBuffers, size_t idx, VariableSchema& inputLayouts, size_t inputNode)
{
size_t inputDim = inputLayouts[inputNode].m_numElements;
for (size_t i = 0; i < inputDim; i++)
{
if (i == idx)
{
inputBuffers[inputNode].m_buffer.push_back(1);
}
else
{
inputBuffers[inputNode].m_buffer.push_back(0);
}
}
}
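// Converts a tagged input sentence into per-step one-hot features: input node 0 receives
// the current word, node 1 the next word, and node 2 the previous word.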
std::vector<std::string> feedInputVectors(std::string sentence, std::unordered_map<std::string, size_t>& word2idxVocab, Values<float>& inputBuffers, VariableSchema& inputLayouts)
{
std::vector<std::string> words;
// Split input sentence by space.
char delimiters = ' ';
size_t begin = 0;
size_t end = sentence.find_first_of(delimiters);
while (end != sentence.npos)
{
words.push_back(sentence.substr(begin, end - begin));
begin = end + 1;
end = sentence.find(delimiters, begin);
}
words.push_back(sentence.substr(begin));
// Convert words to ids.
std::vector<size_t> wordIds;
for (size_t i = 0; i < words.size(); i++)
{
size_t id = word2idx(words[i], word2idxVocab);
wordIds.push_back(id);
}
// Process the input words to construct network input vectors.
// As the sentence begins and ends with a special tag, we ignore the first and last word.
for (size_t i = 1; i < words.size() - 1; i++)
{
// Current word.
size_t cwIdx = wordIds[i];
addOneHotWord(inputBuffers, cwIdx, inputLayouts, 0);
// Next word.
size_t nwIdx = wordIds[i + 1];
addOneHotWord(inputBuffers, nwIdx, inputLayouts, 1);
// Previous word.
size_t pwIdx = wordIds[i - 1];
addOneHotWord(inputBuffers, pwIdx, inputLayouts, 2);
}
return words;
}
IEvaluateModelExtended<float>* SetupNetworkAndGetLayouts(std::string modelDefinition, VariableSchema& inputLayouts, VariableSchema& outputLayouts)
{
// Native model evaluation instance
IEvaluateModelExtended<float> *eval;
GetEvalExtendedF(&eval);
try
{
eval->CreateNetwork(modelDefinition);
}
catch (std::exception& ex)
{
fprintf(stderr, "%s\n", ex.what());
throw;
}
fflush(stderr);
// Get the model's layers dimensions
outputLayouts = eval->GetOutputSchema();
for (auto vl : outputLayouts)
{
fprintf(stderr, "Output dimension: %" PRIu64 "\n", vl.m_numElements);
fprintf(stderr, "Output name: %ls\n", vl.m_name.c_str());
}
eval->StartForwardEvaluation({ outputLayouts[0].m_name });
inputLayouts = eval->GetInputSchema();
outputLayouts = eval->GetOutputSchema();
return eval;
}
/// <summary>
/// Program demonstrating how to run model evaluations using the native extended evaluation interface,
/// and how to feed sequence vectors to an LSTM (RNN) network.
/// </summary>
/// <description>
/// This program is a native C++ client using the native extended evaluation interface
/// located in the <see cref="eval.h"/> file.
/// The CNTK evaluation library (EvalDLL.dll on Windows, LibEval.so on Linux) must be found through the system's path,
/// and Eval.h must be included.
/// In order to run this program, the model must already exist in the example. To create the model,
/// first run the example in <CNTK>/Examples/Text/ATIS. Once the model file ATIS.slot.lstm is created,
/// you can run this client.
/// This program demonstrates the usage of the Evaluate method, requiring the input and output layers as parameters.
/// </description>
int main(int argc, char* argv[])
{
// Get the binary path (current working directory)
argc = 0;
std::string app = argv[0];
std::string path;
size_t pos;
int ret;
#ifdef _WIN32
pos = app.rfind("\\");
path = (pos == std::string::npos) ? "." : app.substr(0, pos);
// This relative path assumes launching from CNTK's binary folder, e.g. x64\Release
const std::string modelBaseDir = path + "/../../Examples/Text/ATIS/";
#else // on Linux
pos = app.rfind("/");
path = (pos == std::string::npos) ? "." : app.substr(0, pos);
// This relative path assumes launching from CNTK's binary folder, e.g. build/cpu/release/bin/
const std::string modelBaseDir = path + "/../../../../Examples/Text/ATIS/";
#endif
const std::string modelWorkingDirectory = modelBaseDir + "work/";
const std::string modelFilePath = modelWorkingDirectory + "ATIS.slot.lstm";
try
{
struct stat statBuf;
if (stat(modelFilePath.c_str(), &statBuf) != 0)
{
fprintf(stderr, "Error: The model %s does not exist. Please follow instructions in README.md in <CNTK>/Examples/Text/ATIS to create the model.\n", modelFilePath.c_str());
return(1);
}
std::string networkConfiguration;
networkConfiguration += "modelPath=\"" + modelFilePath + "\"";
VariableSchema inputLayouts;
VariableSchema outputLayouts;
IEvaluateModelExtended<float> *eval;
eval = SetupNetworkAndGetLayouts(networkConfiguration, inputLayouts, outputLayouts);
vector<size_t> inputBufferSize;
for (size_t i = 0; i < inputLayouts.size(); i++)
{
fprintf(stdout, "Input node name: %ls\n", inputLayouts[i].m_name.c_str());
fprintf(stdout, "Input feature dimension: %" PRIu64 "\n", inputLayouts[i].m_numElements);
inputBufferSize.push_back(inputLayouts[i].m_numElements);
}
vector<size_t> outputBufferSize;
for (size_t i = 0; i < outputLayouts.size(); i++)
{
outputBufferSize.push_back(outputLayouts[i].m_numElements);
}
// Build source word vocab to id
const std::string sourceVocab = modelBaseDir + "/Data/ATIS.vocab";
if (stat(sourceVocab.c_str(), &statBuf) != 0)
{
fprintf(stderr, "Error: The file '%s' does not exist.\n", sourceVocab.c_str());
return(1);
}
std::unordered_map<std::string, size_t> word2idxVocab = buildVocab(sourceVocab);
// Build id to target word vocab
const std::string targetVocab = modelBaseDir + "/Data/ATIS.label";
if (stat(targetVocab.c_str(), &statBuf) != 0)
{
fprintf(stderr, "Error: The file '%s' does not exist.\n", targetVocab.c_str());
return(1);
}
std::unordered_map<size_t, std::string> idx2wordVocab = buildInvVocab(targetVocab);
// Use the following sentence as an input example.
// A single space is used as the word separator.
std::string inputSequences = "BOS i would like to find a flight from charlotte to las vegas that makes a stop in st. louis EOS";
Values<float> inputBuffers = inputLayouts.CreateBuffers<float>(inputBufferSize);
Values<float> outputBuffers = outputLayouts.CreateBuffers<float>(outputBufferSize);
// Feed input sequence vectors to network
std::vector<std::string> words = feedInputVectors(inputSequences, word2idxVocab, inputBuffers, inputLayouts);
// Forward propagation
eval->ForwardPass(inputBuffers, outputBuffers);
// Get output from output layer
auto buf = outputBuffers[0].m_buffer;
size_t bufSize = outputBuffers[0].m_buffer.size();
std::vector<std::string> outputs;
size_t outputDim = outputLayouts[0].m_numElements;
size_t outputStep = bufSize / outputDim;
auto iter = buf.begin();
for (size_t i = 0; i < outputStep; i++)
{
auto max_iter = std::max_element(iter, iter + outputDim);
auto index = max_iter - iter;
outputs.push_back(idx2word(index, idx2wordVocab));
iter += outputDim;
}
words.erase(words.begin());
words.pop_back();
fprintf(stdout, "Slot tag for sentence \"%s\" is as follows:\n", inputSequences.c_str());
for (size_t i = 0; i < outputs.size(); i++)
{
fprintf(stdout, "%10s -- %s\n", words[i].c_str(), outputs[i].c_str());
}
eval->Destroy();
// This pattern is used by End2EndTests to check whether the program runs to completion.
fprintf(stdout, "Evaluation complete.\n");
ret = 0;
}
catch (const std::exception& err)
{
fprintf(stderr, "Evaluation failed. EXCEPTION occurred: %s\n", err.what());
ret = 1;
}
catch (...)
{
fprintf(stderr, "Evaluation failed. Unknown ERROR occurred.\n");
ret = 1;
}
fflush(stdout);
fflush(stderr);
return ret;
}
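For quick reference, the extended-eval call sequence this client exercises condenses to the following sketch (a minimal, hedged outline using the same Eval.h declarations shown above; error handling and the vocabulary plumbing are omitted):

#include <string>
#include <vector>
#include "Eval.h"
using namespace Microsoft::MSR::CNTK;

// Minimal sketch of the extended evaluation flow demonstrated in main() above.
void EvalOnce(const std::string& networkConfiguration)
{
    IEvaluateModelExtended<float>* eval;
    GetEvalExtendedF(&eval);                   // obtain a float-precision evaluator
    eval->CreateNetwork(networkConfiguration); // e.g. modelPath="..."

    VariableSchema outputs = eval->GetOutputSchema();
    eval->StartForwardEvaluation({ outputs[0].m_name });
    VariableSchema inputs = eval->GetInputSchema();
    outputs = eval->GetOutputSchema();

    // One buffer per schema entry, sized by m_numElements.
    std::vector<size_t> inSizes, outSizes;
    for (const auto& v : inputs)  inSizes.push_back(v.m_numElements);
    for (const auto& v : outputs) outSizes.push_back(v.m_numElements);
    Values<float> inBufs  = inputs.CreateBuffers<float>(inSizes);
    Values<float> outBufs = outputs.CreateBuffers<float>(outSizes);

    // ... fill inBufs[n].m_buffer with one-hot frames, as feedInputVectors() does ...
    eval->ForwardPass(inBufs, outBufs);        // outBufs[0].m_buffer now holds the scores
    eval->Destroy();
}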

View file

@ -0,0 +1,66 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{93ECB70B-FDDD-44B4-BD6A-D63E094C704B}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>CPPEvalExtendedClient</RootNamespace>
<ProjectName>CPPEvalExtendedClient</ProjectName>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
<ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<LinkIncremental>false</LinkIncremental>
<OutDir>$(SolutionDir)..\..\$(Platform)\$(ProjectName).$(Configuration)\</OutDir>
</PropertyGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
<ClCompile>
<WarningLevel>Level4</WarningLevel>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<AdditionalIncludeDirectories>$(SolutionDir)..\..\Include</AdditionalIncludeDirectories>
<TreatWarningAsError>true</TreatWarningAsError>
<SDLCheck>true</SDLCheck>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<FloatingPointModel>Fast</FloatingPointModel>
<OpenMPSupport>true</OpenMPSupport>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalLibraryDirectories>$(SolutionDir)\..\..\cntk</AdditionalLibraryDirectories>
<AdditionalDependencies>EvalDll.lib;%(AdditionalDependencies)</AdditionalDependencies>
<OutputFile>$(OutDir)$(TargetName)$(TargetExt)</OutputFile>
<Profile>true</Profile>
</Link>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="CPPEvalExtendedClient.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>

View file

@ -0,0 +1,22 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
<Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
<Extensions>h;hh;hpp;hxx;hm;inl;inc;xsd</Extensions>
</Filter>
<Filter Include="Resource Files">
<UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
<Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="CPPEvalExtendedClient.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
</Project>

View file

@ -9,6 +9,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CSEvalClient", "CSEvalClien
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CPPEvalV2Client", "CPPEvalV2Client\CPPEvalV2Client.vcxproj", "{D771A06D-CC25-4582-B5CD-D2A4782BB005}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CPPEvalExtendedClient", "CPPEvalExtendedClient\CPPEvalExtendedClient.vcxproj", "{93ECB70B-FDDD-44B4-BD6A-D63E094C704B}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|x64 = Debug|x64
@ -25,6 +27,9 @@ Global
{D771A06D-CC25-4582-B5CD-D2A4782BB005}.Debug|x64.ActiveCfg = Release|x64
{D771A06D-CC25-4582-B5CD-D2A4782BB005}.Release|x64.ActiveCfg = Release|x64
{D771A06D-CC25-4582-B5CD-D2A4782BB005}.Release|x64.Build.0 = Release|x64
{93ECB70B-FDDD-44B4-BD6A-D63E094C704B}.Debug|x64.ActiveCfg = Release|x64
{93ECB70B-FDDD-44B4-BD6A-D63E094C704B}.Release|x64.ActiveCfg = Release|x64
{93ECB70B-FDDD-44B4-BD6A-D63E094C704B}.Release|x64.Build.0 = Release|x64
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE

View file

@ -1,8 +1,13 @@
# EvalClients
This folder contains examples using the CNTK evaluation library. Please note that only the 64-bit target is supported by the CNTK evaluation library.
-CPPEvalClient: demonstrate the use of the C++ CNTK eval lib. Only the release configuration is supported.
-CSEvalClient: demonstrate the use of the C# CNTK eval lib.
-EvalClients.sln: the VS2013 solution file to build examples. It creates two binaries in the directory $(SolutionDir)..\..\x64\:
* CPPEvalClient.$(Configuration)\CPPEvalClient.exe: the C++ example executable. To run the example, please first include the directory containing CNTK dependent dlls, usually $(SolutionDir)..\..\cntk, in the PATH environment variable.
* CSEvalClient.$(Configuration)\CSEvalClient.exe: the C# example executable.
- CPPEvalClient.$(Configuration)\CPPEvalClient.exe: the C++ example executable. To run the example, please first include the directory containing CNTK dependent dlls, usually $(SolutionDir)..\..\cntk, in the PATH environment variable.
- CSEvalClient.$(Configuration)\CSEvalClient.exe: the C# example executable.

View file

@ -13,6 +13,7 @@ modelPath = "$outputDir$/Models/ResNet_101"
stderr = "$outputDir$/ResNet_101_BS_out"
parallelTrain = true
hyperCompressMemory = true
TrainNetwork = {
action = "train"

View file

@ -13,6 +13,7 @@ modelPath = "$outputDir$/Models/ResNet_152"
stderr = "$outputDir$/ResNet_152_BS_out"
parallelTrain = true
hyperCompressMemory = true
TrainNetwork = {
action = "train"

View file

@ -2,7 +2,7 @@
# An LSTM model is built to tag each word in sentences with its semantic label.
WorkDir = work
DataDir = data
DataDir = Data
makeMode = false
modelPath = $WorkDir$/ATIS.slot.lstm
@ -96,9 +96,11 @@ Train = [
parallelizationMethod = "DataParallelSGD"
parallelizationStartEpoch = 2
distributedMBReading = true
dataParallelSGD = [
gradientBits = 1
]
# Uncomment the following lines if you want parallelTrain to use 1-bit-SGD.
# For that you also need CNTK binaries built with 1-bit-SGD enabled.
# dataParallelSGD = [
# gradientBits = 1
# ]
]
]

View file

@ -147,6 +147,14 @@ ifdef CUDA_PATH
LIBS_LIST += cudnn
COMMON_FLAGS +=-DUSE_CUDNN
endif
# Set up NCCL if needed
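# (Illustrative usage: set the variable on the make command line, e.g.
#  make NCCL_PATH=/usr/local/nccl, so that -lnccl and -DUSE_NCCL are picked up.)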
ifdef NCCL_PATH
INCLUDEPATH += $(NCCL_PATH)/include
LIBPATH += $(NCCL_PATH)/lib
LIBS_LIST += nccl
COMMON_FLAGS += -DUSE_NCCL
endif
else
DEVICE = cpu
@ -313,6 +321,7 @@ MATH_SRC =\
$(SOURCEDIR)/Math/DataTransferer.cpp \
$(SOURCEDIR)/Math/RNGHandle.cpp \
$(SOURCEDIR)/Math/TensorView.cpp \
$(SOURCEDIR)/Math/NcclComm.cpp \
ifdef SUPPORT_AVX2
MATH_SRC +=\
@ -406,7 +415,7 @@ CNTKLIBRARY_COMMON_SRC =\
$(SOURCEDIR)/CNTKv2LibraryDll/Utils.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/Value.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/Variable.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/Learner.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/Learner.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/Serialization.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/DistributedCommunicator.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/DataParallelDistributedTrainer.cpp \
@ -415,7 +424,6 @@ CNTKLIBRARY_COMMON_SRC =\
CNTKLIBRARY_SRC =\
$(SOURCEDIR)/CNTKv2LibraryDll/ComputeInputStatistics.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/MinibatchSource.cpp \
$(SOURCEDIR)/CNTKv2LibraryDll/Globals.cpp \
CNTKLIBRARY_SRC+=$(CNTKLIBRARY_COMMON_SRC)
CNTKLIBRARY_SRC+=$(CNTK_COMMON_SRC)
@ -510,7 +518,7 @@ SGDLIB_SRC=\
$(SOURCEDIR)/SGDLib/Profiler.cpp \
$(SOURCEDIR)/SGDLib/SGD.cpp \
$(SOURCEDIR)/SGDLib/PostComputingActions.cpp \
SGDLIB_SRC+=$(CNTKLIBRARY_COMMON_SRC)
EVAL_SRC=\
@ -538,31 +546,46 @@ EVAL_LIB:=$(LIBDIR)/lib$(EVAL).so
ALL_LIBS+=$(EVAL_LIB)
SRC+=$(EVAL_SRC)
$(EVAL_LIB): $(EVAL_OBJ) | $(CNTKMATH_LIB)
$(EVAL_LIB): $(EVAL_OBJ) | $(CNTKMATH_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo Building $(EVAL_LIB) for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH) $(lMULTIVERSO) $(PROTOBUF_PATH)/lib/libprotobuf.a
########################################
# Eval Sample client
# Eval Sample clients
########################################
EVAL_SAMPLE_CLIENT:=$(BINDIR)/cppevalclient
EVAL_CLIENT:=$(BINDIR)/cppevalclient
EVAL_SAMPLE_CLIENT_SRC=\
EVAL_CLIENT_SRC=\
$(SOURCEDIR)/../Examples/Evaluation/CPPEvalClient/CPPEvalClient.cpp
EVAL_SAMPLE_CLIENT_OBJ:=$(patsubst %.cpp, $(OBJDIR)/%.o, $(EVAL_SAMPLE_CLIENT_SRC))
EVAL_CLIENT_OBJ:=$(patsubst %.cpp, $(OBJDIR)/%.o, $(EVAL_CLIENT_SRC))
ALL+=$(EVAL_SAMPLE_CLIENT)
SRC+=$(EVAL_SAMPLE_CLIENT_SRC)
ALL+=$(EVAL_CLIENT)
SRC+=$(EVAL_CLIENT_SRC)
$(EVAL_SAMPLE_CLIENT): $(EVAL_SAMPLE_CLIENT_OBJ) | $(EVAL_LIB)
$(EVAL_CLIENT): $(EVAL_CLIENT_OBJ) | $(EVAL_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $(EVAL_SAMPLE_CLIENT) for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(EVAL) -l$(CNTKMATH) $(lMULTIVERSO)
@echo building $(EVAL_CLIENT) for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(EVAL) -l$(CNTKMATH) $(lMULTIVERSO)
EVAL_EXTENDED_CLIENT:=$(BINDIR)/cppevalextendedclient
EVAL_EXTENDED_CLIENT_SRC=\
$(SOURCEDIR)/../Examples/Evaluation/CPPEvalExtendedClient/CPPEvalExtendedClient.cpp
EVAL_EXTENDED_CLIENT_OBJ:=$(patsubst %.cpp, $(OBJDIR)/%.o, $(EVAL_EXTENDED_CLIENT_SRC))
ALL+=$(EVAL_EXTENDED_CLIENT)
SRC+=$(EVAL_EXTENDED_CLIENT_SRC)
$(EVAL_EXTENDED_CLIENT): $(EVAL_EXTENDED_CLIENT_OBJ) | $(EVAL_LIB)
@echo $(SEPARATOR)
@mkdir -p $(dir $@)
@echo building $(EVAL_EXTENDED_CLIENT) for $(ARCH) with build type $(BUILDTYPE)
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(EVAL) -l$(CNTKMATH)
########################################
# Eval V2 Sample client

View file

@ -10,7 +10,7 @@
# TODO cut down on logging
set -x -e -o pipefail
REPO_TAG=v2.0.beta2.0
REPO_TAG=v2.0.beta3.0
while [ $# -gt 0 ]; do
case "$1" in
@ -41,7 +41,7 @@ CNTK_DEP_LIB_PATH="$PWD/cntk/dependencies/lib"
CNTK_EXAMPLES_PATH="$PWD/Examples"
CNTK_BINARY="$CNTK_BIN_PATH/cntk"
CNTK_PY34_ENV_FILE="$SCRIPT_DIR/conda-linux-cntk-py34-environment.yml"
CNTK_WHEEL_PATH="cntk/python/cntk-2.0.beta2.0-cp34-cp34m-linux_x86_64.whl"
CNTK_WHEEL_PATH="cntk/python/cntk-2.0.beta3.0-cp34-cp34m-linux_x86_64.whl"
test -d "$CNTK_BIN_PATH" && test -d "$CNTK_LIB_PATH" && test -d "$CNTK_DEP_LIB_PATH" &&
test -d "$CNTK_EXAMPLES_PATH" && test -x "$CNTK_BINARY" &&
test -f "$CNTK_PY34_ENV_FILE" && test -f "$CNTK_WHEEL_PATH" || {

View file

@ -26,11 +26,7 @@ function ActionItem(
$expr = $func +' $item'
Write-Verbose "Calling Operation: [$func]"
$result = Invoke-Expression $expr
if (-not $result) {
return
}
return
Invoke-Expression $expr
}
@ -47,10 +43,14 @@ function InstallExe(
$processWait = $table["ProcessWait"]
$message = $table["message"]
$runAs = $table["runAs"]
$maxErrorLevel = $table["maxErrorLevel"]
if ($runAs -eq $null) {
$runAs = $true
}
if ($maxErrorLevel -eq $null) {
$maxErrorLevel = 0
}
if ($platform -ne $null) {
$runningOn = ((Get-WmiObject -class Win32_OperatingSystem).Caption).ToUpper()
$platform = ($platform.ToString()).ToUpper()
@ -65,10 +65,10 @@ function InstallExe(
}
if ($dir -eq $null) {
$ecode = DoProcess -command $cmd -param "$param" -requiresRunAs $runAs
DoProcess -command $cmd -param $param -requiresRunAs $runAs -maxErrorLevel $maxErrorLevel
}
else {
$ecode = DoProcess -command $cmd -param "$param" -requiresRunAs $runAs -workingDir "$dir"
DoProcess -command $cmd -param $param -requiresRunAs $runAs -workingDir $dir -maxErrorLevel $maxErrorLevel
}
if ( ($processWait -ne $null) -and ($Execute) -and ($false) ) {
@ -77,11 +77,44 @@ function InstallExe(
$pwait = Get-Process $processWait -ErrorAction SilentlyContinue
} while (-not ($pwait -eq $null))
}
}
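# Runs an application resolved via ResolveApplicationName (optionally searching the PATH);
# an exit code above maxErrorLevel (default 0) makes the step fail.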
function ExecuteApplication(
[Parameter(Mandatory = $true)][hashtable] $table)
{
FunctionIntro $table
if ($ecode -eq 0) { return $true }
return $false
$func = $table["Function"]
$appName = $table["AppName"]
$param= $table["Param"]
$appDir = $table["AppDir"]
$usePath = $table["UseEnvPath"]
$dir = $table["WorkDir"]
$maxErrorLevel = $table["maxErrorLevel"]
if ($appDir -eq $null) {
$appDir = ""
}
if ($usePath -eq $null) {
$usePath = $false
}
if ($maxErrorLevel -eq $null) {
$maxErrorLevel = 0
}
if ($Execute) {
$application = ResolveApplicationName $appName $appDir $usePath
if ($application.Length -eq 0) {
throw "ExecuteApplication: Couldn't resolve program [$appName] with location directory [$appDir] and usePath [$usePath]"
}
if ($dir -eq $null) {
DoProcess -command $application -param $param -maxErrorLevel $maxErrorLevel
}
else {
DoProcess -command $application -param $param -workingDir $dir -maxErrorLevel $maxErrorLevel
}
}
}
function InstallWheel(
@ -110,12 +143,12 @@ function InstallWheel(
$whl = $whlFile.FullName
$condaExe = Join-Path $BasePath 'Scripts\conda.exe'
$newPaths = Invoke-DosCommand $condaExe (Write-Output ..activate cmd.exe $EnvName)
$newPaths = Invoke-DosCommand $condaExe (Write-Output ..activate cmd.exe $EnvName) -maxErrorLevel 0
$oldPath = $env:PATH
$env:PATH = $newPaths + ';' + $env:PATH
Invoke-DosCommand pip (Write-Output install $whl)
Invoke-DosCommand pip (Write-Output install $whl) -maxErrorLevel 0
$env:PATH = $oldPath
return
}
@ -133,8 +166,6 @@ function MakeDirectory(
New-Item $path -type directory
}
}
return $true
}
function AddToPath(
@ -160,7 +191,6 @@ function AddToPath(
if ($pv.Contains("$ap")) {
Write-Verbose "AddToPath - path information already up-to-date"
return $true
}
Write-Host Adding [$dir] to environment [$env]
@ -173,7 +203,6 @@ function AddToPath(
if ($Execute) {
SetEnvVar -name $env -content "$pathvalue"
}
return $true
}
function ExtractAllFromZip(
@ -186,10 +215,10 @@ function ExtractAllFromZip(
$destinationFolder = $table["destinationFolder"]
if (-not (test-path -path $destinationFolder)) {
return $false
throw "$destinationFolder doesn't exist"
}
if (-not (test-path $zipFileName -PathType Leaf)) {
return $false
throw "$zipFileName doesn't exist"
}
if ($Execute) {
@ -199,7 +228,6 @@ function ExtractAllFromZip(
$destination.CopyHere($zipFile.Items())
}
return $true
}
function CreateBatch(
@ -237,7 +265,8 @@ function DoProcess(
[string] $command,
[string] $param,
[string] $workingDir = "",
[boolean] $requiresRunAs = $false)
[boolean] $requiresRunAs = $false,
[int] $maxErrorLevel)
{
$info = "start-process [$command] with [$param]"
@ -245,7 +274,7 @@ function DoProcess(
if (-not $Execute) {
Write-Host "** Running in DEMOMODE - setting Exit Code **: 0"
return 0
return
}
if ($workingDir.Length -eq 0) {
@ -266,15 +295,13 @@ function DoProcess(
}
}
$eCode = ($process.ExitCode)
if ($eCode -ne 0) {
Write-Host "$message ** Exit Code **:($eCode)"
} else {
Write-Verbose "$message ** Exit Code **:($eCode)"
if ($ecode -gt $maxErrorLevel) {
throw "Running 'start-process $commandString $param' failed with exit code [$ecode]"
}
return $eCode
return
}
@ -287,17 +314,15 @@ function SetEnvVar(
Write-Verbose "SetEnvVar [$name] with [$content]"
if ($Execute) {
# [environment]::SetEnvironmentVariable($name, $content, $location)
$commandString = "& { [environment]::SetEnvironmentVariable('"+$name+"', '"+$content+"', '"+$location+"') }"
RunPowershellCommand -command "$commandString" -elevated $true
RunPowershellCommand -command "$commandString" -elevated $true -maxErrorLevel 0
}
}
function RunPowershellCommand(
[string] $commandString,
[boolean] $elevated
[boolean] $elevated,
[int] $maxErrorLevel
)
{
$commandBytes = [System.Text.Encoding]::Unicode.GetBytes($commandString)
@ -310,8 +335,12 @@ function RunPowershellCommand(
else {
$process = Start-Process -PassThru -FilePath powershell.exe -ArgumentList $commandLine -wait
}
$eCode = ($process.ExitCode)
return ($ecode -eq 0)
if ($ecode -gt $maxErrorLevel) {
throw "Running 'powershell.exe $commandString' failed with exit code [$ecode]"
}
return
}
function Invoke-DosCommand {
@ -321,7 +350,7 @@ function Invoke-DosCommand {
[string] $Command,
[string[]] $Argument,
[string] [ValidateScript({ Test-Path -PathType Container $_ })] $WorkingDirectory,
[switch] $IgnoreNonZeroExitCode,
[int] $maxErrorLevel,
[switch] $SuppressOutput
)
Write-Verbose "Running '$Command $Argument'"
@ -336,7 +365,43 @@ function Invoke-DosCommand {
if ($WorkingDirectory) {
Pop-Location
}
if (($LASTEXITCODE -ne 0) -and -not $IgnoreNonZeroExitCode) {
if ($LASTEXITCODE -gt $maxErrorLevel) {
throw "Running '$Command $Argument' failed with exit code $LASTEXITCODE"
}
}
function ResolveApplicationName(
[string] $name,
[string] $directory,
[bool] $usePath)
{
$application = ""
if ($directory.Length -gt 0) {
$application = CallGetCommand (join-path $directory $name)
}
if ($application.Length -eq 0) {
if ($usePath) {
# We get here if we are supposed to check the PATH environment for a match because
# $directory was empty or the application couldn't be found in $directory.
$application = CallGetCommand $name
}
}
# $application will be an empty string if we couldn't resolve the name; otherwise we can execute $application
return $application
}
function CallGetCommand(
[string] $application)
{
try {
get-command $application -CommandType Application -ErrorAction Stop | Out-Null
return $application
}
catch {
# the application can't be found, so return empty string
return ""
}
}

View file

@ -82,6 +82,23 @@ function CheckPowershellVersion
return $false
}
function CheckOSVersion
{
$runningOn = (Get-WmiObject -class Win32_OperatingSystem).Caption
$isMatching = ($runningOn -match "^Microsoft Windows (8\.1|10|Server 2012 R2)")
if ($isMatching) {
return
}
Write-Host "
You are running this install script on [$runningOn].
The Microsoft Cognitive Toolkit is designed and tested on Windows 8.1, Windows 10,
and Windows Server 2012 R2.
"
return
}
function DisplayStart()
{
Write-Host $(DisplayStartMessage)
@ -90,6 +107,8 @@ function DisplayStart()
return $false
}
CheckOSVersion
if (-not $Execute) {
Write-Host $(DisplayWarningNoExecuteMessage)
}

View file

@ -4,6 +4,9 @@
#
$operations = @(
@{Name = "Scan System for installed programs"; ShortName = "SCANPROG"; Info = "Scan System for installed programs";
Verification = @( @{Function = "VerifyScanPrograms" } )
},
@{Name = "Verifying Installation contents"; ShortName = "INSTCONTENT"; Info = "Verifying Installation contents";
Verification = @( @{Function = "VerifyInstallationContent"; Path = "$cntkRootDir" } )
},
@ -45,8 +48,9 @@ $operations = @(
@{Function = "AddToPath"; Dir = "C:\Program Files\Git\cmd"; AtStart = $true; } )
},
@{Name = "Clone CNTK from Github"; ShortName = "CNTKCLONE"; Info = "Clone CNTK from Github repository";
Verification = @( @{Function = "VerifyDirectory"; Path = "$RepoLocation" } );
Verification = @( @{Function = "VerifyDirectory"; Path = $RepoLocation } );
Action = @( @{Function = "MakeDirectory"; Path = $repoDirectory },
@{Function = "InstallExe"; Command = "C:\Program Files\Git\bin\git.exe"; Param = "clone --branch $RepoTag --recursive https://github.com/Microsoft/CNTK/ $repoName"; WorkDir = "$repoDirectory"; Message="Cloning CNTK (branch $RepoTag) repository...." } )
@{Function = "ExecuteApplication"; AppName = "git.exe"; Param = "clone --branch $RepoTag --recursive https://github.com/Microsoft/CNTK/ $repoName"; AppDir = "C:\Program Files\Git\bin"; UseEnvPath = $true; WorkDir = $repoDirectory } )
}
)

View file

@ -58,6 +58,19 @@ function VerifyItem(
return $noInstallRequired
}
function VerifyScanPrograms(
[Parameter(Mandatory = $true)][hashtable] $table)
{
FunctionIntro $table
$func = $table["Function"]
$noInstallRequired = $true
# No actual work is performed here; this just initializes the script-local data structure
# holding the list of installed programs.
LoadWin32Product
return $noInstallRequired
}
function VerifyWin32ProductExists(
[Parameter(Mandatory = $true)][hashtable] $table)
{

View file

@ -60,7 +60,7 @@
Param(
[parameter(Mandatory=$false)] [string] $AnacondaBasePath = "C:\local\Anaconda3-4.1.1-Windows-x86_64",
[parameter(Mandatory=$false)] [switch] $Execute,
[parameter(Mandatory=$false)] [string] $RepoTag="v2.0.beta2.0",
[parameter(Mandatory=$false)] [string] $RepoTag="v2.0.beta3.0",
[parameter(Mandatory=$false)] [string] $RepoLocation="c:\repos\CNTK"
)

View file

@ -61,11 +61,6 @@
#define let const auto
#endif
// TODO: Temporary mechanism to enable memory sharing for
// node output value matrices. This will go away when the
// sharing is ready to be enabled by default
bool g_shareNodeValueMatrices = false;
using namespace std;
using namespace Microsoft::MSR;
using namespace Microsoft::MSR::CNTK;
@ -243,6 +238,9 @@ void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mp
ProgressTracing::SetStepOffset(fullEpochsOffset); // this is the epoch number that SGD will log relative to
}
if (Globals::ShouldEnableHyperCompressMemory())
Matrix<ElemType>::UseCachedResizeOrNot(true);
// determine the action to perform, and do it
for (int j = 0; j < action.size(); j++)
{
@ -563,7 +561,10 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
mpi = MPIWrapper::GetInstance(true /*create*/);
}
g_shareNodeValueMatrices = config(L"shareNodeValueMatrices", false);
if (config(L"shareNodeValueMatrices", false))
Globals::EnableShareNodeValueMatrices();
if (config(L"hyperCompressMemory", false))
Globals::EnableHyperCompressMemory();
TracingGPUMemoryAllocator::SetTraceLevel(config(L"traceGPUMemoryAllocations", 0));
@ -644,7 +645,7 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
static void PrintBanner(int argc, wchar_t* argv[], const string& timestamp)
{
fprintf(stderr, "CNTK 2.0.beta2.0+ (");
fprintf(stderr, "CNTK 2.0.beta3.0+ (");
#ifdef _GIT_EXIST
fprintf(stderr, "%s %.6s, ", _BUILDBRANCH_, _BUILDSHA1_);
#endif
@ -705,7 +706,10 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])
mpi = MPIWrapper::GetInstance(true /*create*/);
}
g_shareNodeValueMatrices = config(L"shareNodeValueMatrices", false);
if (config(L"shareNodeValueMatrices", false))
Globals::EnableShareNodeValueMatrices();
if (config(L"hyperCompressMemory", false))
Globals::EnableHyperCompressMemory();
TracingGPUMemoryAllocator::SetTraceLevel(config(L"traceGPUMemoryAllocations", 0));
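For reference, both switches are read as plain top-level booleans from the CNTK config (the ResNet configs above already set one of them), so enabling both is just two lines in a config file:

shareNodeValueMatrices = true
hyperCompressMemory = true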

View file

@ -2399,6 +2399,11 @@ namespace CNTK
///
CNTK_API static FunctionPtr LoadModel(DataType dataType, const std::wstring& modelFile, const DeviceDescriptor& computeDevice = DeviceDescriptor::UseDefaultDevice());
///
/// Prints the entire graph underlying this function to stderr
///
CNTK_API void PrintGraph() const;
private:
template <typename VariableType, typename FilterFunction>
@ -2694,6 +2699,16 @@ namespace CNTK
return TransposeTimes(leftOperand, rightOperand, /*outputRank =*/ 1, name);
}
///
/// Create an instance of the CNTK built-in operation to compute binary cross-entropy for specified input operands.
///
CNTK_API FunctionPtr BinaryCrossEntropy(const Variable& prediction, const Variable& targets, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in operation to compute weighted binary cross-entropy for specified input operands.
///
CNTK_API FunctionPtr WeightedBinaryCrossEntropy(const Variable& prediction, const Variable& targets, const Variable& weights, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in operation to compute squared-error for specified input operands.
///
@ -2899,6 +2914,13 @@ namespace CNTK
CNTK_API FunctionPtr IsFirst(const Variable& operand, const std::wstring& name = L"");
CNTK_API FunctionPtr IsLast(const Variable& operand, const std::wstring& name = L"");
CNTK_API FunctionPtr Slice(const Variable& operand, int beginIndex, int endIndex, const std::wstring& name = L"");
///
/// Create an instance of the CNTK built-in sum reduction operation on the specified tensor input operand along the operand's lone dynamic sequence axis
///
CNTK_API FunctionPtr ReduceSum(const Variable& operand, const std::wstring& name = L"");
CNTK_API FunctionPtr First(const Variable& operand, const std::wstring& name = L"");
CNTK_API FunctionPtr Last(const Variable& operand, const std::wstring& name = L"");

View file

@ -206,9 +206,11 @@ namespace CNTK
CNTK_API FunctionPtr GatherPacked(const Variable& operand, const Variable& packedIndex, const std::wstring& name = L"");
CNTK_API FunctionPtr ScatterPacked(const Variable& operand, const Variable& packedIndex, const Variable& condition, const std::wstring& name = L"");
CNTK_API FunctionPtr ZeroesWithDynamicAxesLike(const Variable& operand);
CNTK_API FunctionPtr Where(const Variable& condition, const std::vector<Axis>& newDynamicAxes, const std::wstring& name = L"");
CNTK_API FunctionPtr Gather(const Variable& operand, const Variable& condition, const std::vector<Axis>& newDynamicAxes, const std::wstring& name = L"");
CNTK_API FunctionPtr Scatter(const Variable& operand, const Variable& condition, const std::vector<Axis>& newDynamicAxes, const std::wstring& name = L"");
CNTK_API FunctionPtr Where(const Variable& condition, const std::pair<size_t, int>& newDerivedSequenceAxisScalingAndAdditiveFactor, const std::wstring& name = L"");
CNTK_API FunctionPtr Gather(const Variable& operand, const Variable& condition, const std::wstring& name = L"");
CNTK_API FunctionPtr Gather(const Variable& operand, const Variable& condition, const std::pair<size_t, int>& newDerivedSequenceAxisScalingAndAdditiveFactor, const std::wstring& name = L"");
CNTK_API FunctionPtr Scatter(const Variable& operand, const Variable& condition, const std::wstring& name = L"");
CNTK_API FunctionPtr Scatter(const Variable& operand, const Variable& condition, const std::pair<size_t, int>& newDerivedSequenceAxisScalingAndAdditiveFactor, const std::wstring& name = L"");
CNTK_API FunctionPtr Slice(const Variable& operand, const Axis& axis, int beginIndex, int endIndex, const std::wstring& name = L"");
CNTK_API FunctionPtr ReduceElements(const Variable& operand, const std::wstring& reductionOpName, const Axis& axis, const std::wstring& name = L"");
@ -236,7 +238,8 @@ namespace CNTK
CNTK_API void SetFixedRandomSeed(unsigned long fixedRandomSeed);
CNTK_API void SetForwardValuesSharing(bool enableSharing);
CNTK_API void EnableForwardValuesSharing();
CNTK_API void EnableHyperMemoryCompress();
CNTK_API bool AreEquivalent(const ::CNTK::FunctionPtr& f1, const ::CNTK::FunctionPtr& f2);
CNTK_API bool AreEquivalent(const ::CNTK::Variable& v1, const ::CNTK::Variable& v2, bool allowParameterAndConstantsEquivalence = false);

View file

@ -232,6 +232,8 @@ namespace CNTK
primitiveFunctionConfigParameters[PrimitiveFunction::AttributeNameOffset] = (size_t)node->As<FutureValueNode<ElementType>>()->TimeStep();
opType = PrimitiveOpType::FutureValue;
}
else if (node->OperationName() == OperationNameOf(LogisticNode))
opType = PrimitiveOpType::Logistic;
else if (node->OperationName() == OperationNameOf(SquareErrorNode))
opType = PrimitiveOpType::SquaredError;
else if (node->OperationName() == OperationNameOf(CrossEntropyWithSoftmaxNode))

View file

@ -39,10 +39,6 @@
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
<UseDebugLibraries>true</UseDebugLibraries>
</PropertyGroup>
<PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
<UseDebugLibraries>false</UseDebugLibraries>
<WholeProgramOptimization>true</WholeProgramOptimization>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
</ImportGroup>
@ -51,11 +47,10 @@
</ImportGroup>
<PropertyGroup Label="UserMacros" />
<PropertyGroup Condition="$(DebugBuild)">
<LinkIncremental>true</LinkIncremental>
<TargetName>CNTKLibrary-$(LibraryVersion)</TargetName>
<LinkIncremental>true</LinkIncremental>
<TargetName>CNTKLibrary-$(LibraryVersion)</TargetName>
</PropertyGroup>
<PropertyGroup Condition="$(ReleaseBuild)">
<LinkIncremental>false</LinkIncremental>
<TargetName>CNTKLibrary-$(LibraryVersion)</TargetName>
</PropertyGroup>
<PropertyGroup>
@ -100,9 +95,6 @@
<ClCompile>
<WarningLevel>Level4</WarningLevel>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>CNTKV2LIBRARYDLL;WIN32;NDEBUG;_WINDOWS;_USRDLL;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>true</SDLCheck>
<OpenMPSupport>false</OpenMPSupport>
@ -114,10 +106,7 @@
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>ComputationNetworkLib.lib; Math.lib; Common.lib; ReaderLib.lib; kernel32.lib; user32.lib; shell32.lib; SequenceTrainingLib.lib;$(ProtobufLib);%(AdditionalDependencies)</AdditionalDependencies>
<Profile>true</Profile>
<DelayLoadDLLs>Math.dll; nvml.dll; $(CudaRuntimeDll)</DelayLoadDLLs>
</Link>
</ItemDefinitionGroup>
@ -169,7 +158,6 @@
</PrecompiledHeader>
</ClCompile>
<ClCompile Include="Function.cpp" />
<ClCompile Include="Globals.cpp" />
<ClCompile Include="Learner.cpp" />
<ClCompile Include="MinibatchSource.cpp" />
<ClCompile Include="NDArrayView.cpp" />

View file

@ -21,7 +21,6 @@
</ClCompile>
<ClCompile Include="DistributedCommunicator.cpp" />
<ClCompile Include="DataParallelDistributedTrainer.cpp" />
<ClCompile Include="Globals.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="stdafx.h" />

View file

@ -60,9 +60,14 @@ namespace CNTK
return s_disableAutomaticUnpackingOfPackedValues.load();
}
void SetForwardValuesSharing(bool enableSharing)
void EnableForwardValuesSharing()
{
g_shareNodeValueMatrices = enableSharing;
Microsoft::MSR::CNTK::Globals::EnableShareNodeValueMatrices();
}
void EnableHyperMemoryCompress()
{
Microsoft::MSR::CNTK::Globals::EnableHyperCompressMemory();
}
bool AreEquivalent(const Variable& var1, const Variable& var2, bool allowParameterAndConstantsEquivalence)

View file

@ -544,6 +544,12 @@ namespace CNTK
return CompositeFunction::Deserialize(modelDictionary, device);
}
void Function::PrintGraph() const
{
CompositeFunction::Traverse(RootFunction(), [](const FunctionPtr& function) {
});
}
// Names for the reduction operations as used by the CNTK ReduceElementsNode
/*static*/ const std::wstring PrimitiveFunction::InternalSumReductionOpName = L"Sum";
/*static*/ const std::wstring PrimitiveFunction::InternalLogSumReductionOpName = L"LogSum";
@ -580,6 +586,8 @@ namespace CNTK
/*static*/ const std::wstring PrimitiveFunction::AttributeNameEpsilon = L"epsilon";
/*static*/ const std::wstring PrimitiveFunction::AttributeNameUseCuDNNEngine = L"useCuDNNEngine";
/*static*/ const std::wstring PrimitiveFunction::AttributeNameNewDynamicAxes = L"newDynamicAxes";
/*static*/ const std::wstring PrimitiveFunction::AttributeNameNewSequenceAxisLengthScalingFactor = L"newSequenceAxisLengthScalingFactor";
/*static*/ const std::wstring PrimitiveFunction::AttributeNameNewSequenceAxisLengthAdditiveFactor = L"newSequenceAxisLengthAdditiveFactor";
/*static*/ const std::wstring PrimitiveFunction::AttributeNameBeginIndex = L"beginIndex";
/*static*/ const std::wstring PrimitiveFunction::AttributeNameEndIndex = L"endIndex";
/*static*/ const std::wstring PrimitiveFunction::AttributeNameReductionOpName = L"reductionOpName";
@ -626,12 +634,47 @@ namespace CNTK
if (outputDataType == DataType::Unknown)
outputDataType = firstKnownInputDataType;
// We currently require that the inputs' dynamic axes if any match
// We currently require that the inputs' dynamic axes, if any, match
std::vector<Axis> outputDynamicAxes;
if ((op == PrimitiveOpType::SumAll) || (op == PrimitiveOpType::SquaredError) || (op == PrimitiveOpType::CrossEntropyWithSoftmax) || (op == PrimitiveOpType::ClassificationError))
if ((op == PrimitiveOpType::SumAll) ||
(op == PrimitiveOpType::SquaredError) ||
(op == PrimitiveOpType::CrossEntropyWithSoftmax) ||
(op == PrimitiveOpType::ClassificationError) ||
(op == PrimitiveOpType::Logistic))
{
outputDynamicAxes = std::vector<Axis>({});
}
else if (op == PrimitiveOpType::Where)
outputDynamicAxes = AsVector<Axis>(functionConfig[PrimitiveFunction::AttributeNameNewDynamicAxes].Value<std::vector<DictionaryValue>>());
{
if (functionConfig.Contains(PrimitiveFunction::AttributeNameNewDynamicAxes))
outputDynamicAxes = AsVector<Axis>(functionConfig[PrimitiveFunction::AttributeNameNewDynamicAxes].Value<std::vector<DictionaryValue>>());
else
{
if (inputs[0].DynamicAxes() == Axis::UnknownDynamicAxes())
outputDynamicAxes = Axis::UnknownDynamicAxes();
else
{
if (functionConfig.Contains(PrimitiveFunction::AttributeNameNewSequenceAxisLengthScalingFactor) &&
functionConfig.Contains(PrimitiveFunction::AttributeNameNewSequenceAxisLengthAdditiveFactor))
{
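// (Assumption, inferred from the Slice code in this file: the derived sequence axis
// length is newLen = scalingFactor * oldLen + additiveFactor, so a fixed-length slice
// passes scaling = 0 and additive = sliceLength.)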
size_t newSequenceAxisLengthScalingFactor = functionConfig[PrimitiveFunction::AttributeNameNewSequenceAxisLengthScalingFactor].Value<size_t>();
int newSequenceAxisLengthAdditiveFactor = functionConfig[PrimitiveFunction::AttributeNameNewSequenceAxisLengthAdditiveFactor].Value<int>();
auto derivedDynamicAxes = GetDerivedDynamicAxes(inputs[0].DynamicAxes()[0], newSequenceAxisLengthScalingFactor, newSequenceAxisLengthAdditiveFactor);
std::copy(derivedDynamicAxes.begin(), derivedDynamicAxes.end(), std::back_inserter(outputDynamicAxes));
}
else
{
outputDynamicAxes.push_back(Axis::NewUniqueDynamicAxis(L"whereNodeDynamicAxis"));
}
for (size_t i = 1; i < inputs[0].DynamicAxes().size(); ++i)
outputDynamicAxes.push_back(inputs[0].DynamicAxes()[i]);
functionConfig[PrimitiveFunction::AttributeNameNewDynamicAxes] = AsDictionaryValueVector(outputDynamicAxes);
}
}
}
else if (op == PrimitiveOpType::ScatterPacked)
outputDynamicAxes = inputs[2].DynamicAxes();
else if ((op == PrimitiveOpType::PackedIndex) || (op == PrimitiveOpType::GatherPacked))
@ -852,9 +895,9 @@ namespace CNTK
case PrimitiveOpType::Convolution:
{
assert(inputs.size() == 2);
auto& strides = functionConfig[PrimitiveFunction::AttributeNameStrides].Value<NDShape>();
auto& lowerPad = functionConfig[PrimitiveFunction::AttributeNameLowerPad].Value<NDShape>();
auto& upperPad = functionConfig[PrimitiveFunction::AttributeNameUpperPad].Value<NDShape>();
auto& strides = functionConfig[PrimitiveFunction::AttributeNameStrides].Value<NDShape>();
auto& lowerPad = functionConfig[PrimitiveFunction::AttributeNameLowerPad].Value<NDShape>();
auto& upperPad = functionConfig[PrimitiveFunction::AttributeNameUpperPad].Value<NDShape>();
auto sharing = AsVector<bool>(functionConfig[PrimitiveFunction::AttributeNameSharing].Value<std::vector<DictionaryValue>>());
auto autoPadding = AsVector<bool>(functionConfig[PrimitiveFunction::AttributeNameAutoPadding].Value<std::vector<DictionaryValue>>());
bool transpose = functionConfig[PrimitiveFunction::AttributeNameTranspose].Value<bool>();
@ -863,23 +906,24 @@ namespace CNTK
NDShape outputMapCount, kernelShape;
std::tie(outputMapCount, kernelShape) = GetConvolutionOutputMapCountAndKernelShape(inputs[0].Shape(), inputs[1].Shape());
auto originalKernelShape = kernelShape;
outputShape = ConvolutionOpOutputShape(op, inputs[1].Shape(), kernelShape, outputMapCount, strides, sharing, autoPadding, lowerPad, upperPad, transpose, inferDimensions);
if (originalKernelShape != kernelShape)
{
for (size_t i = 0; i < kernelShape.Rank(); ++i)
inputs[0].m_dataFields->m_shape[i] = kernelShape[i];
}
auto originalKernelShape = kernelShape;
outputShape = ConvolutionOpOutputShape(op, inputs[1].Shape(), kernelShape, outputMapCount, strides, sharing, autoPadding, lowerPad, upperPad, transpose, inferDimensions);
if (originalKernelShape != kernelShape)
{
for (size_t i = 0; i < kernelShape.Rank(); ++i)
inputs[0].m_dataFields->m_shape[i] = kernelShape[i];
}
functionConfig[PrimitiveFunction::AttributeNameSharing] = AsDictionaryValueVector(sharing);
functionConfig[PrimitiveFunction::AttributeNameAutoPadding] = AsDictionaryValueVector(autoPadding);
functionConfig[PrimitiveFunction::AttributeNameSharing] = AsDictionaryValueVector(sharing);
functionConfig[PrimitiveFunction::AttributeNameAutoPadding] = AsDictionaryValueVector(autoPadding);
break;
}
case PrimitiveOpType::Logistic:
case PrimitiveOpType::SquaredError:
case PrimitiveOpType::CrossEntropyWithSoftmax:
case PrimitiveOpType::ClassificationError:
{
if (op == PrimitiveOpType::ClassificationError)
if ((op == PrimitiveOpType::ClassificationError) || (op == PrimitiveOpType::Logistic))
assert(inputs.size() >= 2);
else
assert(inputs.size() == 2);
@ -892,9 +936,9 @@ namespace CNTK
if (predictionShape != labelsShape)
RuntimeError("Prediction output operand's shape %S is incompatible with label operand's shape %S for the %S operation", AsStringForErrorReporting(predictionShape).c_str(), AsStringForErrorReporting(labelsShape).c_str(), PrimitiveOpTypeName(op).c_str());
std::vector<int> reductionAxes;
for (int i = 0; i < (int)inputs[0].Shape().Rank(); ++i)
reductionAxes.push_back(i);
std::vector<int> reductionAxes;
for (int i = 0; i < (int)inputs[0].Shape().Rank(); ++i)
reductionAxes.push_back(i);
outputShape = ReductionOpOutputShape(op, predictionShape, reductionAxes, /*preserveReductionAxes =*/ false);
break;
@ -1098,7 +1142,7 @@ namespace CNTK
std::vector<FunctionPtr> topoSortedPrimitiveFunctions;
std::vector<Variable> inputs;
std::unordered_set<std::wstring> inputUids;
Traverse([&visitedFunctions, &inputs, &topoSortedPrimitiveFunctions, &inputUids](const FunctionPtr& function) {
Traverse(RootFunction(), [&visitedFunctions, &inputs, &topoSortedPrimitiveFunctions, &inputUids](const FunctionPtr& function) {
std::vector<Variable> functionInputs = function->Inputs();
for (const auto& input : functionInputs)
{
@ -1576,6 +1620,9 @@ namespace CNTK
computationNodePtr = New<ConvolutionNode<ElementType>>(network->GetDeviceId(), internalNodeName, AsTensorShape(kernelShape), AsTensorShape(outputMapCount), AsTensorShape(strides), sharing, autoPadding, AsTensorShape(lowerPad), AsTensorShape(upperPad), transpose, ImageLayoutKind::CHW, maxTempMemSizeInSamples);
break;
}
case PrimitiveOpType::Logistic:
computationNodePtr = New<LogisticNode<ElementType>>(network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::SquaredError:
computationNodePtr = New<SquareErrorNode<ElementType>>(network->GetDeviceId(), internalNodeName);
break;
@ -2585,7 +2632,7 @@ namespace CNTK
FunctionPtr Round(const Variable& operand, const std::wstring& name)
{
return Floor(Plus(operand, Constant::Scalar(operand.GetDataType(), 0.5)), name);
return Floor(Plus(operand, Constant::Scalar(0.5f)), name);
}
FunctionPtr Floor(const Variable& operand, const std::wstring& name)
@ -2633,11 +2680,9 @@ namespace CNTK
return TransposeAxes(operand, Axis(0), Axis(1), name);
}
FunctionPtr Slice(const Variable& operand, const Axis& axis, int beginIndex, int endIndex, const std::wstring& name)
{
if (axis == Axis::DefaultBatchAxis())
LogicError("Slice is currently unsupported along the batch axis");
if (axis.IsStaticAxis())
{
if ((endIndex - beginIndex) <= 0)
@ -2646,46 +2691,10 @@ namespace CNTK
return Internal::Slice(operand, axis, beginIndex, endIndex, name);
}
if ((beginIndex == 0) && (endIndex == 0))
return operand;
if (axis == Axis::DefaultBatchAxis())
LogicError("Slice is currently unsupported along the batch axis");
auto operandAxes = operand.DynamicAxes();
auto findAxis = std::find(operandAxes.begin(), operandAxes.end(), axis);
if (findAxis == operandAxes.end())
InvalidArgument("The specified dynamic axis named %S does not match any of the dynamic axes of the operand", axis.Name().c_str());
auto beginFlagsLambda = [beginIndex, operand]() {
return (beginIndex > 0) ? Minus(Constant::Scalar(operand.GetDataType(), 1.0), Internal::IsWithin(operand, beginIndex)) : Internal::IsWithin(operand, beginIndex);
};
auto endFlagsLambda = [endIndex, operand]() {
return (endIndex > 0) ? Internal::IsWithin(operand, endIndex) : Minus(Constant::Scalar(operand.GetDataType(), 1.0), Internal::IsWithin(operand, endIndex));
};
FunctionPtr flags;
if (beginIndex == 0)
flags = endFlagsLambda();
else if (endIndex == 0)
flags = beginFlagsLambda();
else
flags = ElementTimes(beginFlagsLambda(), endFlagsLambda());
// Since we are slicing along a dynamic axis, the output variable's dynamic axes will be different than the operand
std::vector<Axis> newDynamicAxes;
for (auto operandAxis : operandAxes)
{
if (operandAxis == axis)
{
int sliceLength = (endIndex - beginIndex);
size_t multiplicativeFactor = (sliceLength > 0) ? 0 : 1;
auto derivedDynamicAxes = GetDerivedDynamicAxes(operandAxis, multiplicativeFactor, sliceLength);
std::copy(derivedDynamicAxes.begin(), derivedDynamicAxes.end(), std::back_inserter(newDynamicAxes));
}
else
newDynamicAxes.push_back(operandAxis);
}
return Internal::Gather(operand, flags, newDynamicAxes, name);
LogicError("CNTK::Slice: Invalid axis argument provided. To slice a sequence along its ordered dynamic axis use Sequence::Slice.");
}
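// After this change CNTK::Slice accepts only static axes; slicing along the ordered
// dynamic (sequence) axis moves to Sequence::Slice (declared alongside Sequence::IsFirst /
// Sequence::IsLast in the header diff above). A hedged usage sketch, assuming a Variable x
// with a static axis and the usual dynamic axes:
//   FunctionPtr s1 = Slice(x, Axis(0), 0, 3);  // static axis: elements [0, 3)
//   FunctionPtr s2 = Sequence::Slice(x, 1, 0); // sequence axis: drop the first step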
FunctionPtr RandomSample(const Variable& operand, size_t numSamples, bool allowDuplicates, const std::wstring& name)
@ -2721,6 +2730,7 @@ namespace CNTK
return UnaryOp(PrimitiveOpType::Reshape, operand, std::move(additionalProperties), name);
}
FunctionPtr BinaryOp(PrimitiveOpType op, const Variable& leftOperand, const Variable& rightOperand, Dictionary&& opConfig, const std::wstring& name)
{
std::vector<Variable> operands = { leftOperand, rightOperand };
@ -2792,6 +2802,18 @@ namespace CNTK
return BinaryOp(PrimitiveOpType::TransposeTimes, leftOperand, rightOperand, std::move(additionalProperties), name);
}
FunctionPtr BinaryCrossEntropy(const Variable& prediction, const Variable& targets, const std::wstring& name)
{
std::vector<Variable> operands = { prediction, targets };
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Logistic, operands, Dictionary(), name), name);
}
FunctionPtr WeightedBinaryCrossEntropy(const Variable& prediction, const Variable& targets, const Variable& weights, const std::wstring& name)
{
std::vector<Variable> operands = { prediction, targets, weights };
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::Logistic, operands, Dictionary(), name), name);
}
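// Both entry points lower to the same Logistic primitive; the optional third operand carries
// the per-sample weights. Illustrative use (variable names hypothetical):
//   auto loss         = BinaryCrossEntropy(sigmoidOutput, labels, L"bce");
//   auto weightedLoss = WeightedBinaryCrossEntropy(sigmoidOutput, labels, sampleWeights, L"wbce");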
FunctionPtr SquaredError(const Variable& prediction, const Variable& targets, const std::wstring& name)
{
auto difference = Minus(prediction, targets);
@ -2815,14 +2837,14 @@ namespace CNTK
if (topN == 1)
{
if (axis == Axis(0))
return Minus(Constant::Scalar(prediction.GetDataType(), 1.0), TransposeTimes(labels, Hardmax(prediction)), name);
return Minus(Constant::Scalar(1.0f), TransposeTimes(labels, Hardmax(prediction)), name);
else
{
auto axMax = ReduceMax(prediction, axis);
auto pred = Equal(prediction, axMax);
auto wrongPred = NotEqual(labels, pred);
auto axErr = ReduceSum(wrongPred, axis);
auto capErr = GreaterEqual(axErr, Constant::Scalar(prediction.GetDataType(), 1.0));
auto capErr = GreaterEqual(axErr, Constant::Scalar(1.0f));
return ReduceMean(capErr, Axis::AllStaticAxes(), name);
}
}
@ -2831,7 +2853,7 @@ namespace CNTK
if (axis != Axis(0))
LogicError("ClassificationError along a specific axis does not support topN!");
std::vector<Variable> operands = { prediction, labels, Constant::Scalar(prediction.GetDataType(), (double)topN) };
std::vector<Variable> operands = { prediction, labels, Constant::Scalar((float)topN) };
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::ClassificationError, operands, Dictionary(), name), name);
}
}
@ -3011,75 +3033,113 @@ namespace CNTK
{
// TODO: This is a temporary and expensive hack until we have a real alias implementation
// that does not waste memory and compute cycles
return Plus(operand, Constant::Scalar(operand.GetDataType(), 0), name);
return Plus(operand, Constant::Scalar(0.0f), name);
}
namespace Sequence
{
void VerifyIsSequence(const Variable& operand)
{
// The operand must have at least one dynamic axis and its first dynamic axis must be ordered
if (operand.DynamicAxes().empty() || !operand.DynamicAxes()[0].IsOrdered())
// The operand must have at least one dynamic axis
if (operand.DynamicAxes().empty())
InvalidArgument("A sequence function can only be applied on operands with at least one dynamic axis and whose first dynamic axis is ordered");
}
FunctionPtr IsFirst(const Variable& operand, const std::wstring& name)
{
VerifyIsSequence(operand);
return Internal::IsWithin(operand, 1, name);
}
FunctionPtr IsLast(const Variable& operand, const std::wstring& name)
{
VerifyIsSequence(operand);
return Internal::IsWithin(operand, -1, name);
}
FunctionPtr Slice(const Variable& operand, int beginIndex, int endIndex, const std::wstring& name)
{
VerifyIsSequence(operand);
if ((beginIndex == 0) && (endIndex == 0))
return operand;
auto beginFlagsLambda = [beginIndex, operand]() {
return (beginIndex > 0) ? Minus(Constant::Scalar(1.0f), Internal::IsWithin(operand, beginIndex)) : Internal::IsWithin(operand, beginIndex);
};
auto endFlagsLambda = [endIndex, operand]() {
return (endIndex > 0) ? Internal::IsWithin(operand, endIndex) : Minus(Constant::Scalar(1.0f), Internal::IsWithin(operand, endIndex));
};
FunctionPtr flags;
if (beginIndex == 0)
flags = endFlagsLambda();
else if (endIndex == 0)
flags = beginFlagsLambda();
else
flags = ElementTimes(beginFlagsLambda(), endFlagsLambda());
int sliceLength = (endIndex - beginIndex);
size_t multiplicativeFactor = (sliceLength > 0) ? 0 : 1;
return Internal::Gather(operand, flags, { multiplicativeFactor, sliceLength }, name);
}
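// Worked example (illustration, not part of the commit): Sequence::Slice(x, 1, -1) keeps the
// interior elements [1, T-1) of each length-T sequence:
//   beginFlags = 1 - IsWithin(x, 1)   -> 0 at the first element, 1 afterwards
//   endFlags   = 1 - IsWithin(x, -1)  -> 1 everywhere except the last element
// flags = beginFlags .* endFlags selects exactly the interior elements, which Gather packs into
// a new sequence (here sliceLength = -2, so the factor pair is {1, -2}; see the interpretation
// of that pair further below).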
FunctionPtr First(const Variable& operand, const std::wstring& name)
{
VerifyIsSequence(operand);
return Slice(operand, operand.DynamicAxes()[0], 0, 1, name);
return Sequence::Slice(operand, 0, 1, name);
}
FunctionPtr Last(const Variable& operand, const std::wstring& name)
{
VerifyIsSequence(operand);
return Slice(operand, operand.DynamicAxes()[0], -1, 0, name);
}
std::vector<Axis> WhereOpDynamicAxes(const Variable& operand)
{
VerifyIsSequence(operand);
std::vector<Axis> newDynamicAxes = { Axis::NewUniqueDynamicAxis(L"whereNodeDynamicAxis") };
for (size_t i = 1; i < operand.DynamicAxes().size(); ++i)
newDynamicAxes.push_back(operand.DynamicAxes()[i]);
return newDynamicAxes;
return Sequence::Slice(operand, -1, 0, name);
}
FunctionPtr Where(const Variable& condition, const std::wstring& name)
{
return Internal::Where(condition, WhereOpDynamicAxes(condition), name);
return UnaryOp(PrimitiveOpType::Where, condition, Dictionary(), name);
}
FunctionPtr Gather(const Variable& operand, const Variable& condition, const std::wstring& name)
{
return Internal::Gather(operand, condition, WhereOpDynamicAxes(condition), name);
return Internal::Gather(operand, condition, name);
}
FunctionPtr Scatter(const Variable& operand, const Variable& condition, const std::wstring& name)
{
return Internal::Scatter(operand, condition, WhereOpDynamicAxes(condition), name);
return Internal::Scatter(operand, condition, name);
}
FunctionPtr BroadcastAs(const Variable& operand, const Variable& broadcastAs, const std::wstring& name)
{
auto dataPadded = Internal::Scatter(operand, Sequence::IsFirst(broadcastAs), operand.DynamicAxes());
auto dataPadded = Internal::Scatter(operand, Sequence::IsFirst(broadcastAs), std::make_pair<size_t, int>(0, 1));
auto placeHolderOutput = PlaceholderVariable(operand.Shape(), broadcastAs.DynamicAxes());
auto output = ElementSelect(Sequence::IsFirst(broadcastAs), dataPadded, PastValue(placeHolderOutput), name);
return output->ReplacePlaceholders({ { placeHolderOutput, output } });
}
FunctionPtr ReduceElements(const Variable& operand, const std::wstring& reductionOpName, const std::wstring& name)
{
using namespace std::placeholders;
std::function<FunctionPtr(const Variable& leftOperand, const Variable& rightOperand)> reductionFunctor;
if (reductionOpName == PrimitiveFunction::InternalSumReductionOpName)
reductionFunctor = std::bind(Plus, _1, _2, L"");
else
LogicError("%S reduction along dynamic axis is currently unsupported", reductionOpName.c_str());
// Reduction over a dynamic axis is currently implemented using a recurrence
auto cumulativeSumFunctionPlaceholder = PlaceholderVariable(operand.Shape());
auto prevAccumulatedValuesFunction = PastValue(cumulativeSumFunctionPlaceholder);
auto cumulativeSumFunction = reductionFunctor(prevAccumulatedValuesFunction, operand);
cumulativeSumFunction->ReplacePlaceholders({ { cumulativeSumFunctionPlaceholder, cumulativeSumFunction } });
return Sequence::Slice(cumulativeSumFunction, -1, 0, name);
}
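// Shape of the recurrence (illustration): for a sequence x_1..x_T,
//   s_t = s_(t-1) + x_t, with PastValue supplying s_(t-1) starting from 0,
// so s_T is the sum over the whole sequence, and Sequence::Slice(s, -1, 0) selects s_T.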
FunctionPtr ReduceSum(const Variable& operand, const std::wstring& name)
{
return ReduceElements(operand, PrimitiveFunction::InternalSumReductionOpName, name);
}
}
namespace Internal
@ -3092,9 +3152,9 @@ namespace CNTK
InvalidArgument("CNTK::Sequence::IsWithin: The offset must be positive");
if (offset > 0)
return PastValue(Internal::ZeroesWithDynamicAxesLike(operand), Constant::Scalar(operand.GetDataType(), 1.0), offset, name);
return PastValue(Internal::ZeroesWithDynamicAxesLike(operand), Constant::Scalar(1.0f), offset, name);
else
return FutureValue(Internal::ZeroesWithDynamicAxesLike(operand), Constant::Scalar(operand.GetDataType(), 1.0), -offset, name);
return FutureValue(Internal::ZeroesWithDynamicAxesLike(operand), Constant::Scalar(1.0f), -offset, name);
}
FunctionPtr PackedIndex(const Variable& operand, const Variable& index, const std::wstring& name)
@ -3131,21 +3191,32 @@ namespace CNTK
}
}
FunctionPtr Where(const Variable& condition, const std::vector<Axis>& newDynamicAxes, const std::wstring& name)
FunctionPtr Where(const Variable& condition, const std::pair<size_t, int>& newDerivedSequenceAxisScalingAndAdditiveFactor, const std::wstring& name)
{
auto additionalProperties = Dictionary();
additionalProperties[PrimitiveFunction::AttributeNameNewDynamicAxes] = AsDictionaryValueVector(newDynamicAxes);
additionalProperties[PrimitiveFunction::AttributeNameNewSequenceAxisLengthScalingFactor] = newDerivedSequenceAxisScalingAndAdditiveFactor.first;
additionalProperties[PrimitiveFunction::AttributeNameNewSequenceAxisLengthAdditiveFactor] = newDerivedSequenceAxisScalingAndAdditiveFactor.second;
return UnaryOp(PrimitiveOpType::Where, condition, std::move(additionalProperties), name);
}
FunctionPtr Gather(const Variable& operand, const Variable& condition, const std::vector<Axis>& newDynamicAxes, const std::wstring& name)
FunctionPtr Gather(const Variable& operand, const Variable& condition, const std::wstring& name)
{
return Internal::GatherPacked(operand, Internal::PackedIndex(/*layout of*/ operand, Where(condition, newDynamicAxes)), name);
return Internal::GatherPacked(operand, Internal::PackedIndex(/*layout of*/ operand, Sequence::Where(condition)), name);
}
FunctionPtr Scatter(const Variable& operand, const Variable& condition, const std::vector<Axis>& whereNodeDynamicAxes, const std::wstring& name)
FunctionPtr Gather(const Variable& operand, const Variable& condition, const std::pair<size_t, int>& newDerivedSequenceAxisScalingAndAdditiveFactor, const std::wstring& name)
{
return Internal::ScatterPacked(operand, Internal::PackedIndex(/*layout of*/ condition, Where(condition, whereNodeDynamicAxes)), /*layout of*/ condition, name);
return Internal::GatherPacked(operand, Internal::PackedIndex(/*layout of*/ operand, Where(condition, newDerivedSequenceAxisScalingAndAdditiveFactor)), name);
}
FunctionPtr Scatter(const Variable& operand, const Variable& condition, const std::wstring& name)
{
return Internal::ScatterPacked(operand, Internal::PackedIndex(/*layout of*/ condition, Sequence::Where(condition)), /*layout of*/ condition, name);
}
FunctionPtr Scatter(const Variable& operand, const Variable& condition, const std::pair<size_t, int>& newDerivedSequenceAxisScalingAndAdditiveFactor, const std::wstring& name)
{
return Internal::ScatterPacked(operand, Internal::PackedIndex(/*layout of*/ condition, Where(condition, newDerivedSequenceAxisScalingAndAdditiveFactor)), /*layout of*/ condition, name);
}
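// Interpretation of the (scaling, additive) factor pair, inferred from the call sites above:
// the derived sequence axis has length  scaling * inputLength + additive.  Sequence::Slice
// passes {0, sliceLength} for fixed-length slices and {1, sliceLength} for slices relative to
// the sequence end; BroadcastAs passes {0, 1} (every output sequence has length 1).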
FunctionPtr Slice(const Variable& operand, const Axis& axis, int beginIndex, int endIndex, const std::wstring& name)
@ -3160,8 +3231,6 @@ namespace CNTK
FunctionPtr ReduceElements(const Variable& operand, const std::wstring& reductionOpName, const Axis& axis, const std::wstring& name)
{
using namespace std::placeholders;
if (axis.IsStaticAxis() || (axis == Axis::AllStaticAxes()))
{
auto additionalProperties = Dictionary();
@ -3173,20 +3242,7 @@ namespace CNTK
if (axis == Axis::DefaultBatchAxis())
LogicError("Reduction is currently unsupported along the batch axis");
if (reductionOpName != PrimitiveFunction::InternalSumReductionOpName)
LogicError("%S reduction along dynamic axis is currently unsupported", reductionOpName.c_str());
std::function<FunctionPtr(const Variable& leftOperand, const Variable& rightOperand)> reductionFunctor;
if (reductionOpName == PrimitiveFunction::InternalSumReductionOpName)
reductionFunctor = std::bind(Plus, _1, _2, L"");
// We are reducing over a dynamic axis which is currently implemented using recurrence
auto cumulativeSumFunctionPlaceholder = PlaceholderVariable(operand.Shape());
auto prevAccumulatedValuesFunction = PastValue(cumulativeSumFunctionPlaceholder);
auto cumulativeSumFunction = reductionFunctor(prevAccumulatedValuesFunction, operand);
cumulativeSumFunction->ReplacePlaceholders({ { cumulativeSumFunctionPlaceholder, cumulativeSumFunction } });
return CNTK::Slice(cumulativeSumFunction, axis, -1, 0, name);
LogicError("CNTK::ReduceElements: Invalid axis argument provided. To reduce a sequence along its ordered dynamic axis use Sequence::ReduceElements.");
}
}
}

View file

@ -65,7 +65,7 @@ namespace CNTK
{PrimitiveOpType::Times, L"Times"},
{PrimitiveOpType::TransposeTimes, L"TransposeTimes"},
{PrimitiveOpType::Convolution, L"Convolution"},
{PrimitiveOpType::SquaredError, L"SquaredError"},
{ PrimitiveOpType::SquaredError, L"SquaredError" },
{PrimitiveOpType::CrossEntropyWithSoftmax, L"CrossEntropyWithSoftmax"},
{PrimitiveOpType::ClassificationError, L"ClassificationError"},
{PrimitiveOpType::PastValue, L"PastValue"},
@ -79,6 +79,7 @@ namespace CNTK
{PrimitiveOpType::RandomSample, L"RandomSample"},
{PrimitiveOpType::RandomSampleInclusionFrequency, L"RandomSampleInclusionFrequency"},
{PrimitiveOpType::ROIPooling, L"ROIPooling"},
{PrimitiveOpType::Logistic, L"Logistic"},
};
inline const std::wstring& PrimitiveOpTypeName(PrimitiveOpType opType)
@ -103,7 +104,15 @@ namespace CNTK
if (numFunctionInputs > 2)
indexMap.insert({2, 2});
}
else if ((op == PrimitiveOpType::CrossEntropyWithSoftmax) || (op == PrimitiveOpType::GatherPacked))
else if (op == PrimitiveOpType::Logistic)
{
indexMap = std::unordered_map<size_t, size_t>({ { 0, 1 }, { 1, 0 } });
if (numFunctionInputs > 2)
indexMap.insert({ 2, 2 });
}
else if (op == PrimitiveOpType::CrossEntropyWithSoftmax)
indexMap = std::unordered_map<size_t, size_t>({ { 0, 1 }, { 1, 0 } });
else if (op == PrimitiveOpType::GatherPacked)
indexMap = std::unordered_map<size_t, size_t>({ { 0, 1 }, { 1, 0 } });
else if (op == PrimitiveOpType::ScatterPacked)
indexMap = std::unordered_map<size_t, size_t>({ { 0, 2 }, { 1, 1 }, { 2, 0 } });
@ -187,6 +196,8 @@ namespace CNTK
static const std::wstring AttributeNameEpsilon;
static const std::wstring AttributeNameUseCuDNNEngine;
static const std::wstring AttributeNameNewDynamicAxes;
static const std::wstring AttributeNameNewSequenceAxisLengthScalingFactor;
static const std::wstring AttributeNameNewSequenceAxisLengthAdditiveFactor;
static const std::wstring AttributeNameBeginIndex;
static const std::wstring AttributeNameEndIndex;
static const std::wstring AttributeNameReductionOpName;
@ -699,22 +710,11 @@ namespace CNTK
return CompositeFunctionOpName;
}
private:
virtual void ReplacePlaceholdersInPlace(const std::unordered_map<Variable, Variable>& placeholderReplacements,
std::unordered_set<const Function*>& visitedFunctions,
std::unordered_set<Variable>& replacedPlaceholders) override;
CompositeFunction(const FunctionPtr& rootFunction, std::unordered_set<FunctionPtr>&& allPrimitiveFunctions, const std::wstring& name, const std::wstring& uid = Internal::GenerateUid(L"CompositeFunction"))
: Function({}, rootFunction->Outputs(), Dictionary(), rootFunction, name, uid),
m_allPrimitiveFunctions(std::move(allPrimitiveFunctions)), m_networkMatricesAllocated(false)
{}
template <typename FunctionType>
void Traverse(const FunctionType& functor) const
static void Traverse(const FunctionPtr& rootFunction, const FunctionType& functor)
{
const auto& root = RootFunction();
std::unordered_set<FunctionPtr> visitedFunctions;
Traverse(root, visitedFunctions, functor);
Traverse(rootFunction, visitedFunctions, functor);
}
// Recursively traverses the Function graph underlying the 'rootFunction' invoking the provided functor for all visited nodes in the graph.
@ -735,6 +735,16 @@ namespace CNTK
}
}
private:
virtual void ReplacePlaceholdersInPlace(const std::unordered_map<Variable, Variable>& placeholderReplacements,
std::unordered_set<const Function*>& visitedFunctions,
std::unordered_set<Variable>& replacedPlaceholders) override;
CompositeFunction(const FunctionPtr& rootFunction, std::unordered_set<FunctionPtr>&& allPrimitiveFunctions, const std::wstring& name, const std::wstring& uid = Internal::GenerateUid(L"CompositeFunction"))
: Function({}, rootFunction->Outputs(), Dictionary(), rootFunction, name, uid),
m_allPrimitiveFunctions(std::move(allPrimitiveFunctions)), m_networkMatricesAllocated(false)
{}
std::vector<Variable> DetermineInputs() const
{
const auto& root = RootFunction();

View file

@ -1,10 +0,0 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
// TODO: Currently there are some known issues with memory sharing for forward pass output matrices that
// need to be addressed before we can switch to using memory sharing by default here.
bool g_shareNodeValueMatrices = false;

View file

@ -57,6 +57,7 @@ namespace CNTK
RandomSample = 45,
RandomSampleInclusionFrequency = 46,
ROIPooling = 47,
Logistic = 48,
// New op types should only be appended to the end of this list.
};
}

View file

@ -62,6 +62,7 @@
</ClCompile>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="..\Math\NcclComm.cpp" />
<ClCompile Include="Config.cpp" />
<ClCompile Include="DataReader.cpp" />
<ClCompile Include="DataWriter.cpp" />
@ -76,4 +77,4 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" />
</Project>
</Project>

View file

@ -13,4 +13,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
std::atomic<bool> Globals::m_forceDeterministicAlgorithms(false);
std::atomic<bool> Globals::m_forceConstantRandomSeed(false);
}}}
std::atomic<bool> Globals::m_enableShareNodeValueMatrices(false);
std::atomic<bool> Globals::m_enableHyperCompressMemory(false);
}}}

View file

@ -22,8 +22,32 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// TODO: Currently the flag is set to false. Should be switched to true after more rigorous testing.
static bool UseV2Aggregator() { return false; }
static void EnableShareNodeValueMatrices()
{
m_enableShareNodeValueMatrices = true;
}
static bool ShouldEnableShareNodeValueMatrices()
{
return m_enableShareNodeValueMatrices;
}
static void EnableHyperCompressMemory()
{
m_enableHyperCompressMemory = true;
}
static bool ShouldEnableHyperCompressMemory()
{
return m_enableHyperCompressMemory;
}
private:
static std::atomic<bool> m_forceDeterministicAlgorithms;
// The global flag to enable sharing of node value matrices in forward and backward prop
static std::atomic<bool> m_enableShareNodeValueMatrices;
// The global flag to enable hyper memory compression
static std::atomic<bool> m_enableHyperCompressMemory;
static std::atomic<bool> m_forceConstantRandomSeed;
};
}}}

View file

@ -1,3 +1,9 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
// Please see https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#ms-mpi or
@ -71,6 +77,7 @@ class MPIWrapper : public std::enable_shared_from_this<MPIWrapper>
std::wstring m_myName;
int m_numMPINodes;
size_t m_numNodesInUse;
bool m_multiHost;
// MPI communicator that reflects the current subset selection
MPI_Comm m_currentComm;
@ -145,6 +152,7 @@ public:
MPI_Comm_rank(MPI_COMM_WORLD, &m_myRank);
MPI_Comm_size(MPI_COMM_WORLD, &m_numMPINodes);
m_numNodesInUse = m_numMPINodes;
m_multiHost = true;
// Verify that the environment variable used by GetTotalNumberOfMPINodes()
// matches what the MPI API says. There're actually two possible cases:
@ -305,6 +313,35 @@ private:
fflush(stderr);
}
Ping("requestnodes (after change)");
// If all ranks run on a single host, we can enable optimized communication
// paths (e.g. NCCL). To determine if a single machine is being used, we
// check that MPI_Get_processor_name matches for all ranks.
const int nameMax = MPI_MAX_PROCESSOR_NAME + 1;
char myName[nameMax] = {0};
int myNameLen = 0;
MPI_Get_processor_name(myName, &myNameLen) || MpiFail("requestnodes: MPI_Get_processor_name");
myName[myNameLen] = '\0';
std::vector<char> nameBuffer(m_numNodesInUse * nameMax);
char* allNames = nameBuffer.data();
MPI_Allgather(myName, nameMax, MPI_CHAR, allNames, nameMax, MPI_CHAR, m_currentComm)
|| MpiFail("requestnodes: MPI_Allgather");
m_multiHost = false;
for(size_t i=1; i<m_numNodesInUse; i++)
{
if (strcmp(allNames, allNames+i*nameMax) != 0)
{
m_multiHost = true;
break;
}
}
fprintf(stderr, "requestnodes [%s]: using %d out of %d MPI nodes on %s (%d requested); we (%d) are %s\n",
msg, (int) m_numNodesInUse, (int) m_numMPINodes, m_multiHost ? "multiple hosts" : "a single host",
(int) requestednodes, (int) CurrentNodeRank(), IsIdle() ? "out (idle)" : "in (participating)");
fflush(stderr);
}
public:
@ -360,6 +397,11 @@ public:
return 0;
}
bool IsMultiHost()
{
return m_multiHost;
}
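// Illustrative consumer (see NcclComm later in this commit): the NcclComm constructor returns
// early when mpi->IsMultiHost() is true, so NCCL-based aggregation is only attempted when all
// ranks share a single host:
//   if (mpi->IsMultiHost())
//       return; // fall back to MPI-based aggregation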
// -----------------------------------------------------------------------
// data-exchange functions (wrappers around MPI functions)
// -----------------------------------------------------------------------

View file

@ -1,6 +1,11 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "Include/Basics.h"
#include "Include/MPIWrapper.h"
using namespace Microsoft::MSR::CNTK;
int MPIWrapper::s_myRank = -1;
std::shared_ptr<MPIWrapper> Microsoft::MSR::CNTK::MPIWrapper::s_mpi = nullptr;
std::shared_ptr<MPIWrapper> Microsoft::MSR::CNTK::MPIWrapper::s_mpi = nullptr;

View file

@ -1002,7 +1002,7 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa
// Due to special topology, if a node is solely induced by parameters, its function value should not be shared
MarkValueNonSharableNodes();
bool performingBackPropagation = (trainRootNode != nullptr);
bool performingBackPropagation = (trainRootNode != nullptr) || (Globals::ShouldEnableHyperCompressMemory());
// Create a composite Eval order with the specified nodes as roots
// For each node determine parents and whether the output of the

View file

@ -61,7 +61,7 @@
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(ReleaseBuild)">
<ClCompile>
<AdditionalOptions>/d2Zi+ %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions>/d2Zi+ /bigobj %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(CpuOnlyBuild)">
@ -136,4 +136,4 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" />
</Project>
</Project>

View file

@ -12,6 +12,7 @@
#include "TensorShape.h"
#include "MatrixPool.h"
#include "ComputationEnvironment.h"
#include "Globals.h"
#include <unordered_set>
#include <map>
@ -46,8 +47,6 @@
#define CNTK_MODEL_VERSION_15 15 // add new nodes: LambdaRankNode and NDCG1Eval
#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_15
extern bool g_shareNodeValueMatrices;
// helper mode for debugging
// If TRACK_GAP_NANS is defined then initialize layout gaps to NaN and do NaN checks. Also do detailed logging of node computations.
// #define TRACK_GAP_NANS
@ -768,7 +767,11 @@ public:
virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const { return true; }
void SetOutputNeededDuringBackprop(bool f) { m_outputNeededDuringBackprop = f; }
bool IsOutputNeededDuringBackprop() const { return !g_shareNodeValueMatrices || m_outputNeededDuringBackprop; }
bool IsOutputNeededDuringBackprop() const
{
return (!Globals::ShouldEnableShareNodeValueMatrices() && !Globals::ShouldEnableHyperCompressMemory())
|| m_outputNeededDuringBackprop;
}
// -----------------------------------------------------------------------
// helpers for network traversal
@ -1631,6 +1634,20 @@ public:
#endif
// tracing
Trace();
// When HyperCompressMemory is active, any memory that is no longer needed can be resized to zero immediately. Since the
// memory is not physically released, the freed blocks are gathered into a memory pool; the next request is served with the best-fitting block.
if (Globals::ShouldEnableHyperCompressMemory())
{
for (auto& input : GetInputs())
{
if (!input->IsOutputNeededDuringBackprop())
{
auto inputNodePtr = DownCast(input);
inputNodePtr->Value().Resize(0, 0);
}
}
}
}
#if 0 // (keep it around in case we need to add stuff in the future)
@ -1640,9 +1657,9 @@ public:
}
#endif
#ifdef _DEBUG
virtual void /*IComputationNode::*/ EndBackprop() override
{
#ifdef _DEBUG
Base::EndBackprop();
#ifdef TRACK_GAP_NANS
for (size_t i = 0; i < m_inputs.size(); i++)
@ -1656,8 +1673,18 @@ public:
}
}
#endif
}
#endif
// Release the gradients of value-sharable nodes, and any memory generated in forward that is no longer used.
if (IsValueSharable() && Globals::ShouldEnableHyperCompressMemory())
{
if (GradientPtr())
Gradient().Resize(0, 0);
// the value was kept alive only for backprop; it can be dropped now that this node's backprop is done
if (IsOutputNeededDuringBackprop())
Value().Resize(0, 0);
}
}
// this is the entry point from Network; while it will call virtual BackpropTo() into the actual node implementation
// TODO: move to -Base (or -Network?)

View file

@ -395,7 +395,7 @@ public:
// If input data is sparse, then gradient is block sparse.
if (InputRef(1).Value().GetMatrixType() == SPARSE && InputRef(0).Gradient().GetMatrixType() == DENSE && Gradient().GetMatrixType() == DENSE)
{
// We need a sparse matrix for the gradient. However, we should allocate a new one instead of switching the type in place
// We need a sparse matrix for the gradient. We allocate a new one instead of switching the type in place
// since switching in place may affect other nodes who share this matrix due to memory sharing
auto& currentInput0GradientMatrixRef = InputRef(0).Gradient();
auto newInput0SparseGradientMatrix = std::make_shared<Matrix<ElemType>>(currentInput0GradientMatrixRef.GetNumRows(),
@ -556,7 +556,7 @@ public:
{
Input(0)->CreateGradientMatrixIfNull();
// We need a sparse matrix for the gradient. However, we should allocate a new one instead of switching the type in place
// We need a sparse matrix for the gradient. We allocate a new one instead of switching the type in place
// since switching in place may affect other nodes who share this matrix due to memory sharing
auto& currentInput0GradientMatrixRef = InputRef(0).Gradient();
if (currentInput0GradientMatrixRef.GetMatrixType() != SPARSE)

View file

@ -126,7 +126,7 @@ void RandomSampleNode<ElemType>::ForwardPropNonLooping()
if (ValueAsMatrix().GetMatrixType() != SPARSE)
{
// BUGBUG: matrix type should be configured during validation
// We should allocate a new one instead of switching the type in place since switching in place may
// Note: We allocate a new one instead of switching the type in place since switching in place may
// affect other nodes who share this matrix due to memory sharing
auto newSparseValueMatrix = std::make_shared<Matrix<ElemType>>(ValueAsMatrix().GetNumRows(), ValueAsMatrix().GetNumCols(), CPUDEVICE, SPARSE, matrixFormatSparseCSC);
#ifdef _MSC_VER
@ -140,10 +140,7 @@ void RandomSampleNode<ElemType>::ForwardPropNonLooping()
// TODO: Should we prepare the CSC data directly on the CPU and move it in one go?
// Currently the reader will place the data onto the GPU. It will then be pulled on-demand to the CPU once (and cached there).
valueMatrix.TransferToDeviceIfNotThere(CPUDEVICE, /*ismoved =*/ true/*means: BOTH state not ok */, /*emptyTransfer =*/ true, /*updatePreferredDevice =*/ false);
// BUGBUG: This is a no-op; was the intent to change the preferred device to CPU?
valueMatrix.SetDevice(CPUDEVICE);
valueMatrix.TransferToDeviceIfNotThere(CPUDEVICE, /*ismoved =*/ true/*means: BOTH state not ok */, /*emptyTransfer =*/ true, /*updatePreferredDevice =*/ true);
valueMatrix.Reset();
// Get vector with indices of randomly sampled classes

View file

@ -2506,7 +2506,7 @@ public:
if (expAvgFactor != 0 || blendFactor != 1)
m_samplesSeen += GetMBLayout()->GetActualNumSamples();
Base::EndBackprop();
Base::EndForwardProp();
}
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }

View file

@ -30,11 +30,6 @@
#include "latticearchive.h"
#include <limits>
// TODO: Temporary mechanism to enable memory sharing for
// node output value matrices. This will go away when the
// sharing is ready to be enabled by default
bool g_shareNodeValueMatrices = false;
namespace Microsoft { namespace MSR { namespace CNTK {
@ -44,7 +39,10 @@ void CNTKEvalBase<ElemType>::Init(const std::string& config)
m_config.Parse(config);
size_t nThreads = m_config("numCPUThreads", "1");
CPUMatrix<ElemType>::SetNumThreads(nThreads);
g_shareNodeValueMatrices = m_config(L"shareNodeValueMatrices", false);
if (m_config(L"shareNodeValueMatrices", false))
Globals::EnableShareNodeValueMatrices();
if (m_config(L"hyperCompressMemory", false))
Globals::EnableHyperCompressMemory();
}
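// Illustrative eval config fragment enabling the flags parsed above (values hypothetical):
//   numCPUThreads = 4
//   shareNodeValueMatrices = true
//   hyperCompressMemory = true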

View file

@ -14,9 +14,12 @@
#endif
#include "Basics.h"
#include "basetypes.h"
#include <string>
#include <stdint.h>
#include <memory>
#include <unordered_map>
#include <map>
#pragma warning( disable: 4251 )
typedef unsigned char byte;
@ -38,6 +41,8 @@ typedef unsigned char byte;
#define GPUSPARSE_INDEX_TYPE int // cuSparse only supports int array indexes
#define CPUSPARSE_INDEX_TYPE int // to be consistent with cuSparse but limited the possible size of the matrix.
#define MEM_MAX_LIMIT_TIMES 2 // The maximum ratio by which a cached memory block may exceed the requested size
namespace Microsoft { namespace MSR { namespace CNTK {
MATH_API void SetMathLibTraceLevel(int traceLevel);
@ -61,11 +66,13 @@ public:
template <typename AllocatedElemType>
static void Free(int deviceId, AllocatedElemType* bufferPtr, bool ignoreCUDARetCode = false);
// This is a public method so that the memory manager can check the total free memory and decide whether to physically
// release all the cached memory.
static std::pair<size_t, size_t> GetFreeAndTotalMemoryInMBs(int deviceId);
private:
template <typename AllocatedElemType>
static AllocatedElemType* AllocateNoTrace(int deviceId, size_t numElements);
static std::pair<size_t, size_t> GetFreeAndTotalMemoryInMBs(int deviceId);
};
// -----------------------------------------------------------------------
@ -205,6 +212,158 @@ enum MatrixFlags
matrixFlagSetValueOnDevice = 1 << bitPosSetValueOnDevice, // SetValue() call has a buffer that is already on the device
};
// -----------------------------------------------------------------------
// BufferManagement -- controls the allocation and release of memory
//
// 1. The goal of buffer management
// The best way to save memory is to release each buffer as soon as it is no longer needed within the
// mini-batch, but doing so adds memory-operation overhead and slows execution down. One way to solve
// this is to build static links between all nodes in a pre-computation pass and reuse the memory at
// runtime, known in CNTK as sharing node value matrices. The other option is a buffer pool that takes
// over all allocation and release requests: the physical memory operations are replaced by logical ones
// that cost nearly nothing. Since the second option, implemented as BufferManagement below, controls
// all memory operations (including incidental ones such as the convolution workspace), allocates purely
// by size, and makes new allocation algorithms easy to implement, it is usually more flexible and
// powerful than the first method.
// 2. How does it work?
// BufferManagement hooks into the Resize function: there, RequestBuffer and LogicalReleaseBuffer
// replace the original allocation and release calls. BufferManagement is a per-device singleton,
// obtained via GetManagerInstance. Resize also takes a growOnly flag, which normally reallocates only
// when the size increases, to save allocation cost; since allocations from the buffer pool cost
// nearly nothing, growOnly is disabled in BufferManagement mode.
// -----------------------------------------------------------------------
class BufferManagement
{
private:
BufferManagement() = default;
// Disable all the copy & move functions to keep the instance safely
DISABLE_COPY_AND_MOVE(BufferManagement);
public:
static BufferManagement& GetManagerInstance(DEVICEID_TYPE deviceId)
{
static std::mutex instanceLock;
// take the lock unconditionally: find() could otherwise race with a concurrent insert
std::lock_guard<std::mutex> lock(instanceLock);
auto instance = m_instances.find(deviceId);
if (instance == m_instances.end())
{
instance = m_instances.insert(std::make_pair(deviceId, std::unique_ptr<BufferManagement>(
new BufferManagement()))).first;
instance->second->m_deviceId = deviceId;
instance->second->m_totalManageSize = 0;
instance->second->m_totalAllocSize = 0;
}
return *(instance->second);
}
// To satisfy a request, search the buffer container first; if nothing suitable is found, allocate a new buffer.
// When a pooled buffer is returned, 'size' is updated to that buffer's actual size.
template<class ElemType>
ElemType* RequestBuffer(size_t& size)
{
ElemType* bufferPtr = nullptr;
auto& bufferContainer = BufferContainer<ElemType>();
// simple size-based lookup; a more efficient or sophisticated algorithm could be plugged in here
auto bufferHint = bufferContainer.lower_bound(size);
if (bufferHint != bufferContainer.end() && bufferHint->first < size * MEM_MAX_LIMIT_TIMES)
{
bufferPtr = bufferHint->second;
size = bufferHint->first;
m_totalManageSize -= size;
bufferContainer.erase(bufferHint);
return bufferPtr;
}
if (m_deviceId >= 0) {
#ifndef CPUONLY
auto deviceSize = TracingGPUMemoryAllocator::GetFreeAndTotalMemoryInMBs(m_deviceId);
float freeMemoryRatio = (float)deviceSize.first / deviceSize.second;
if (freeMemoryRatio < 0.05f || (deviceSize.first << 20) / sizeof(ElemType) < size)
{
PhysicalReleaseAllBuffer<ElemType>();
}
bufferPtr = TracingGPUMemoryAllocator::Allocate<ElemType>(m_deviceId, size);
m_totalAllocSize += size;
#endif
}
else
{
// first, try a no-throw allocation;
// if it fails, empty the buffer pool and retry with a throwing allocation;
// if that fails too, let the system throw bad_alloc
bufferPtr = new (std::nothrow) ElemType[size];
if (!bufferPtr)
{
PhysicalReleaseAllBuffer<ElemType>();
bufferPtr = new ElemType[size];
}
m_totalAllocSize += size;
}
return bufferPtr;
}
// record the released buffer in the buffer container, keyed by its size
template<class ElemType>
void LogicalReleaseBuffer(ElemType* buffer, size_t size)
{
auto& bufferContainer = BufferContainer<ElemType>();
bufferContainer.insert(std::make_pair(size, buffer));
m_totalManageSize += size;
}
// physically release the buffer
template<class ElemType>
void PhysicalReleaseBuffer(ElemType* buffer)
{
if (m_deviceId >= 0)
{
#ifndef CPUONLY
TracingGPUMemoryAllocator::Free<ElemType>(m_deviceId, buffer, false);
#endif
}
else {
delete[] buffer;
}
}
// physically release all cached buffers
template<class ElemType>
void PhysicalReleaseAllBuffer()
{
auto& bufferContainer = BufferContainer<ElemType>();
for (auto& iter : bufferContainer)
{
PhysicalReleaseBuffer<ElemType>(iter.second);
}
bufferContainer.clear();
m_totalManageSize = 0;
}
private:
static std::unordered_map<DEVICEID_TYPE, std::unique_ptr<BufferManagement>> m_instances;
template <class ElemType>
std::multimap<size_t, ElemType*>& BufferContainer();
DEVICEID_TYPE m_deviceId;
size_t m_totalManageSize;
size_t m_totalAllocSize;
// per-element-type maps storing the cached buffer pointers, keyed by size
std::multimap<size_t, float*> m_bufferFloatContainer;
std::multimap<size_t, double*> m_bufferDoubleContainer;
std::multimap<size_t, char*> m_bufferCharContainer;
std::multimap<size_t, short*> m_bufferShortContainer;
std::multimap<size_t, int*> m_bufferIntContainer;
};
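// Minimal usage sketch (illustration only; GPUMatrix::Resize below is the real call site):
//   size_t n = 1 << 20;                                        // requested element count
//   auto& mgr = BufferManagement::GetManagerInstance(deviceId);
//   float* buf = mgr.RequestBuffer<float>(n);                  // n may grow to the pooled buffer's size
//   /* ... use buf as scratch ... */
//   mgr.LogicalReleaseBuffer(buf, n);                          // back to the pool, no physical free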
// -----------------------------------------------------------------------
// BaseMatrixStorage -- base class for all matrix types (CPU, GPU) x (dense, sparse)
// -----------------------------------------------------------------------

View file

@ -247,6 +247,11 @@ protected:
if (CUDNN_STATUS_SUCCESS == err2)
err = CUDNN_STATUS_SUCCESS;
}
// NOTE: only effective when the MatrixPool is enabled; there is no need to keep the workspace around.
workspace.Resize(0, 0);
CUDNN_CALL(err);
}
@ -278,6 +283,7 @@ protected:
// Compute gradients with respect to the output tensor (data).
CUDNN_CALL(cudnnConvolutionBackwardData(*m_cudnn, &C::One, *m_kernelT, ptr(kernel), m_outT, ptr(srcGrad), *m_conv, m_backDataAlgo.Algo.algo,
ptr(workspace), m_backDataAlgo.Algo.memory, &C::One, m_inT, ptr(grad)));
workspace.Resize(0, 0);
}
void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*allowReuse*/, Mat& workspace) override
@ -308,6 +314,7 @@ protected:
// Compute gradients with respect to the output tensor (data).
CUDNN_CALL(cudnnConvolutionBackwardFilter(*m_cudnn, &C::One, m_inT, ptr(in), m_outT, ptr(srcGrad), *m_conv, m_backFiltAlgo.Algo.algo,
ptr(workspace), m_backFiltAlgo.Algo.memory, &C::One, *m_kernelT, ptr(kernelGrad)));
workspace.Resize(0, 0);
}
void EnsurePoolingInitialized() override

View file

@ -1505,32 +1505,43 @@ void GPUMatrix<ElemType>::Reshape(const size_t numRows, const size_t numCols)
}
template <class ElemType>
void GPUMatrix<ElemType>::RequireSize(const size_t numRows, const size_t numCols, bool growOnly)
void GPUMatrix<ElemType>::RequireSize(const size_t numRows, const size_t numCols, bool growOnly, bool cachedResize)
{
if (GetNumRows() != numRows || GetNumCols() != numCols)
Resize(numRows, numCols, growOnly);
Resize(numRows, numCols, growOnly, cachedResize);
}
template <class ElemType>
void GPUMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, bool growOnly)
void GPUMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, bool growOnly, bool cachedResize)
{
if (GetNumRows() == numRows && GetNumCols() == numCols)
return;
VerifyResizable(__func__);
bool isForceResize = (!growOnly) || cachedResize;
size_t numElements = numRows * numCols;
if (numElements > GetSizeAllocated() || // grow allocation
(!growOnly && numElements != GetSizeAllocated())) // shrink allocation if not growOnly
if (numElements > GetSizeAllocated() || // grow allocation
(isForceResize && numElements != GetSizeAllocated())) // shrink allocation if not growOnly
{
// reallocate buffer if numElements > 0
ElemType* pArray = nullptr;
if (numElements > 0)
pArray = TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), numRows, numCols);
{
if (cachedResize)
pArray = BufferManagement::GetManagerInstance(GetComputeDeviceId()).RequestBuffer<ElemType>(numElements);
else
pArray = TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), numRows, numCols);
}
// If the buffer exists, free it
if (Buffer())
TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), Buffer());
{
if(cachedResize)
BufferManagement::GetManagerInstance(GetComputeDeviceId()).LogicalReleaseBuffer<ElemType>(Buffer(), GetSizeAllocated());
else
TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), Buffer());
}
SetBuffer(pArray, numElements * sizeof(ElemType));
SetSizeAllocated(numElements);
@ -4559,8 +4570,8 @@ template GPUMatrix<char>::GPUMatrix(const GPUMatrix<char>&);
template GPUMatrix<char>::GPUMatrix(GPUMatrix<char>&&);
template char* GPUMatrix<char>::CopyToArray() const;
template void GPUMatrix<char>::ChangeDeviceTo(int);
template void GPUMatrix<char>::Resize(size_t, size_t, bool);
template void GPUMatrix<char>::RequireSize(size_t, size_t, bool);
template void GPUMatrix<char>::Resize(size_t, size_t, bool, bool);
template void GPUMatrix<char>::RequireSize(size_t, size_t, bool, bool);
template GPUMatrix<char>::~GPUMatrix();
template GPUMatrix<char> GPUMatrix<char>::ColumnSlice(size_t startColumn, size_t numCols) const;
@ -4584,8 +4595,8 @@ template GPUMatrix<short>::GPUMatrix(const GPUMatrix<short>&);
template GPUMatrix<short>::GPUMatrix(GPUMatrix<short>&&);
template short* GPUMatrix<short>::CopyToArray() const;
template void GPUMatrix<short>::ChangeDeviceTo(int);
template void GPUMatrix<short>::Resize(size_t, size_t, bool);
template void GPUMatrix<short>::RequireSize(size_t, size_t, bool);
template void GPUMatrix<short>::Resize(size_t, size_t, bool, bool);
template void GPUMatrix<short>::RequireSize(size_t, size_t, bool, bool);
template GPUMatrix<short>::~GPUMatrix();
template GPUMatrix<short> GPUMatrix<short>::ColumnSlice(size_t startColumn, size_t numCols) const;

View file

@ -232,12 +232,12 @@ public:
// RequireSize is now the new preferred method of ensuring the correct size inside of the Matrix class. Since Resize will fail if the storage object has
// multiple views, RequireSize will first check to see if Resize is required. If it is not, then it short-circuits and is a noop. Otherwise, RequireSize
// will call Resize, which may fail if the matrix has multiple views.
void RequireSize(const size_t numRows, const size_t numCols, bool growOnly = true); // by default we only reallocate if need to grow
void RequireSize(const GPUMatrix<ElemType>& like, bool growOnly = true) { RequireSize(like.GetNumRows(), like.GetNumCols(), growOnly); }
void RequireSize(const size_t numRows, const size_t numCols, bool growOnly = true, bool cachedResize = false); // by default we only reallocate if need to grow
void RequireSize(const GPUMatrix<ElemType>& like, bool growOnly = true, bool cachedResize = false) { RequireSize(like.GetNumRows(), like.GetNumCols(), growOnly, cachedResize); }
// Resize first checks to ensure that the caller has the authority to call Resize (i.e., it checks to ensure the underlying data is owned by only this matrix), and then
// actually resizes the underlying matrix, doing any allocation as required.
void Resize(const size_t numRows, const size_t numCols, bool growOnly = true); // by default we only reallocate if need to grow
void Resize(const size_t numRows, const size_t numCols, bool growOnly = true, bool cachedResize = false); // by default we only reallocate if need to grow
ElemType& operator()(const size_t /*row*/, const size_t /*col*/) { LogicError("GPUMatrix doesn't support operator(,) on the CPU."); }
const ElemType& operator()(const size_t /*row*/, const size_t /*col*/) const { LogicError("GPUMatrix doesn't support operator(,) on the CPU."); }

View file

@ -156,6 +156,23 @@ int GetMathLibTraceLevel()
MatrixBase::~MatrixBase() { }
#pragma region BufferManagement
std::unordered_map<DEVICEID_TYPE, std::unique_ptr<BufferManagement>> BufferManagement::m_instances;
template <>
std::multimap<size_t, float*>& BufferManagement::BufferContainer<float>() { return m_bufferFloatContainer; }
template <>
std::multimap<size_t, double*>& BufferManagement::BufferContainer<double>() { return m_bufferDoubleContainer; }
template <>
std::multimap<size_t, char*>& BufferManagement::BufferContainer<char>() { return m_bufferCharContainer; }
template <>
std::multimap<size_t, short*>& BufferManagement::BufferContainer<short>() { return m_bufferShortContainer; }
template <>
std::multimap<size_t, int*>& BufferManagement::BufferContainer<int>() { return m_bufferIntContainer; }
#pragma endregion
#pragma region Constructors, destructors and other static matrix builders
@ -165,6 +182,10 @@ MatrixBase::~MatrixBase() { }
// { GPU code },
// ...
// By default, the cached matrix buffer is disabled
template <class ElemType>
bool Matrix<ElemType>::m_useCachedResize = false;
// Initialize members
template <class ElemType>
void Matrix<ElemType>::Init(DEVICEID_TYPE deviceId)
@ -278,6 +299,9 @@ void Matrix<ElemType>::SetDataLocation(CurrentDataLocation location, MatrixType
LogicError("SetDataLocation: New m_baseMatrix must not be NULL.");
}
template <class ElemType>
void Matrix<ElemType>::UseCachedResizeOrNot(bool useCachedResize) { m_useCachedResize = useCachedResize; }
//this is a private constructor only used internally to initialize a blank matrix
template <class ElemType>
Matrix<ElemType>::Matrix(const MatrixFlags matrixFlags, const MatrixType matrixType, const MatrixFormat matrixFormat, DEVICEID_TYPE deviceID)
@ -1593,7 +1617,7 @@ void Matrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const
// TODO: should this function test whether the size is changing, and skip if it isn't? We have at least one explicit test for this code calling this (recurrent node)
DISPATCH_MATRIX_ON_FLAG_USEBOTH_4BOTH(this,
{ m_CPUMatrix->Resize(numRows, numCols, growOnly); },
{ m_GPUMatrix->Resize(numRows, numCols, growOnly); },
{ m_GPUMatrix->Resize(numRows, numCols, growOnly, m_useCachedResize); },
{ m_CPUSparseMatrix->RequireSizeAndAllocate(numRows, numCols, numNZElemToReserve, growOnly, false); },
{ m_GPUSparseMatrix->RequireSizeAndAllocate(numRows, numCols, numNZElemToReserve, growOnly, false); });
#ifdef _DEBUG

View file

@ -76,6 +76,9 @@ private:
mutable size_t m_numTimesDeviceChanged;
mutable size_t m_numTimesMatrixTypeChanged;
mutable int m_devicesTransferedTo[2]; // TODO: what is this for? Seems only diagnostics
// whether Resize() should use the cached memory pool
static bool m_useCachedResize;
// Moves matrix from device id_from to device with id_to. This method doesn't change preferred device Id
void _transferFromDeviceToDevice(int id_from, int id_to, bool isBeingMoved = true, bool emptyTransfer = false) const;
@ -130,6 +133,8 @@ public:
SetDataLocation(GetDeviceId() < 0 ? CurrentDataLocation::CPU : CurrentDataLocation::GPU, GetMatrixType());
}
static void UseCachedResizeOrNot(bool useCachedResize);
private:
Matrix(const MatrixFlags matrixFlags, const MatrixType matrixType, const MatrixFormat matrixFormat, DEVICEID_TYPE deviceID); // only used internally to initialize a blank matrix
Matrix(const MatrixFlags matrixFlags, const MatrixType matrixType, DEVICEID_TYPE deviceID); // only used internally to initialize a blank matrix

Source/Math/NcclComm.cpp (new file, 121 lines)
View file

@ -0,0 +1,121 @@
//
// Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "NcclComm.h"
#ifdef USE_NCCL
#include "GPUMatrix.h"
#include <nccl.h>
#include <cuda_runtime.h>
namespace Microsoft { namespace MSR { namespace CNTK {
// allows writing cudaFunction() || "error" (CUDA runtime)
static void operator||(cudaError_t rc, const char *msg)
{
if (rc != cudaSuccess)
RuntimeError("%s: %s (cuda error %d)", msg, cudaGetErrorString(rc), (int) rc);
}
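// Example: the idiom turns an unchecked CUDA call into a checked one, e.g.
//   cudaSetDevice(deviceId) || "NcclComm: cudaSetDevice failed";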
NcclComm::NcclComm(int deviceId, const MPIWrapperPtr& mpi)
: m_ncclComm(nullptr), m_stream(nullptr)
{
if (mpi->IsMultiHost())
return;
size_t numRanks = mpi->NumNodesInUse();
MPI_Comm mpiComm = mpi->Communicator();
std::vector<int> allDevs(numRanks);
MPI_Allgather(&deviceId, 1, MPI_INT, allDevs.data(), 1, MPI_INT, mpiComm)
|| MpiFail("NcclComm: MPI_Allgather");
for (size_t r = 0; r<numRanks; r++)
{
if (allDevs[r] == CPUDEVICE)
{
fprintf(stderr, "NcclComm: disabled, at least one rank using CPU device\n");
return;
}
for (size_t s = 0; s<r; s++)
if (allDevs[r] == allDevs[s])
{
fprintf(stderr, "NcclComm: disabled, same device used by more than one rank\n");
return;
}
}
ncclUniqueId ncclId;
ncclResult_t res;
res = ncclGetUniqueId(&ncclId);
if (res != ncclSuccess)
RuntimeError("NcclComm failed to obtain ncclUniqueId: %s", ncclGetErrorString(res));
MPI_Bcast(&ncclId, NCCL_UNIQUE_ID_BYTES, MPI_CHAR, 0, mpiComm)
|| MpiFail("NcclComm: MPI_Bcase");
PrepareDevice(deviceId);
res = ncclCommInitRank(&m_ncclComm, numRanks, ncclId, mpi->CurrentNodeRank());
if (res != ncclSuccess)
RuntimeError("NcclComm failed to initialize ncclComm_t: %s", ncclGetErrorString(res));
cudaStreamCreateWithFlags(&m_stream, cudaStreamNonBlocking)
|| "cudaStreamCreateWithFlags failed";
fprintf(stderr, "NcclComm: initialized\n");
}
NcclComm::~NcclComm()
{
if (m_stream != nullptr)
cudaStreamDestroy(m_stream);
if (m_ncclComm != nullptr)
ncclCommDestroy(m_ncclComm);
}
bool NcclComm::IsSupported()
{
return m_ncclComm != nullptr;
}
void NcclComm::AllReduceImpl(void* buffer, size_t count, DataType dtype)
{
ncclResult_t res;
if (dtype == DataType::FLOAT)
{
res = ncclAllReduce(buffer, buffer, count, ncclFloat, ncclSum, m_ncclComm, m_stream);
}
else
{
assert(dtype == DataType::DOUBLE);
res = ncclAllReduce(buffer, buffer, count, ncclDouble, ncclSum, m_ncclComm, m_stream);
}
if (res != ncclSuccess)
RuntimeError("NcclComm ncclAllReduce failed: %s", ncclGetErrorString(res));
}
void NcclComm::Sync()
{
cudaStreamSynchronize(m_stream) || "NcclComm: cudaStreamSynchronize failed";
}
}}} // end namespaces
#else // !USE_NCCL
namespace Microsoft { namespace MSR { namespace CNTK {
NcclComm::NcclComm(int /*deviceId*/, const MPIWrapperPtr& /*mpi*/) { }
NcclComm::~NcclComm() { }
bool NcclComm::IsSupported()
{
return false;
}
void NcclComm::Sync() { }
}}} // end namespaces
#endif

Source/Math/NcclComm.h (new file, 56 lines)
View file

@ -0,0 +1,56 @@
//
// Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// Encapsulates NCCL's dependencies
#pragma once
#include "Matrix.h"
#include "MPIWrapper.h"
#include <vector>
#include <type_traits>
// Forward declare CUDA stuff
typedef struct CUstream_st* cudaStream_t;
typedef struct ncclComm* ncclComm_t;
namespace Microsoft { namespace MSR { namespace CNTK {
class NcclComm
{
#ifdef USE_NCCL
private:
enum class DataType : int {FLOAT, DOUBLE};
void AllReduceImpl(void* buffer, size_t count, DataType dtype);
cudaStream_t m_stream;
ncclComm_t m_ncclComm;
#endif
public:
NcclComm(int deviceId, const MPIWrapperPtr& mpiComm);
~NcclComm();
bool IsSupported();
void Sync(); // waits for outstanding reductions to complete
template <typename ElemType>
void AllReduce(const std::vector<Matrix<ElemType>*>& grads)
{
#ifdef USE_NCCL
DataType dtype = DataType::FLOAT;
if (std::is_same<ElemType, double>::value)
dtype = DataType::DOUBLE;
else if (!std::is_same<ElemType, float>::value)
RuntimeError("NcclComm Unsupported reduction type");
for (size_t i=0; i<grads.size(); ++i)
{
AllReduceImpl(grads[i]->Data(), grads[i]->GetNumElements(), dtype);
}
#else
RuntimeError("NcclComm: CNTK was built without NCCL support.");
#endif
}
};
}}}
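// Minimal usage sketch (hypothetical driver, not part of the commit; it mirrors the
// SimpleDistGradAggregator changes further below):
//   NcclComm nccl(deviceId, mpi);          // becomes a no-op fallback for multi-host or CPU ranks
//   if (nccl.IsSupported())
//   {
//       nccl.AllReduce(gradientMatrices);  // std::vector<Matrix<float>*>; in-place sum across ranks
//       nccl.Sync();                       // wait for the asynchronous reductions to finish
//   }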

View file

@ -1067,12 +1067,12 @@ void GPUMatrix<ElemType>::Reshape(const size_t numRows, const size_t numCols)
}
template <class ElemType>
void GPUMatrix<ElemType>::RequireSize(const size_t numRows, const size_t numCols, bool growOnly)
void GPUMatrix<ElemType>::RequireSize(const size_t numRows, const size_t numCols, bool growOnly, bool cachedResize)
{
}
template <class ElemType>
void GPUMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, bool growOnly)
void GPUMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, bool growOnly, bool cachedResize)
{
}

View file

@ -368,7 +368,6 @@ typename TextParser<ElemType>::SequenceBuffer TextParser<ElemType>::LoadSequence
}
else
{
IncrementNumberOfErrorsOrDie();
if (ShouldWarn())
{
fprintf(stderr,
@ -378,6 +377,7 @@ typename TextParser<ElemType>::SequenceBuffer TextParser<ElemType>::LoadSequence
GetSequenceKey(sequenceDsc).c_str(),
GetFileInfo().c_str());
}
IncrementNumberOfErrorsOrDie();
}
if (!bytesToRead && numRowsRead < expectedRowCount)
@ -585,7 +585,6 @@ bool TextParser<ElemType>::TryReadSample(SequenceBuffer& sequence, size_t& bytes
size_t id;
if (!TryGetInputId(id, bytesToRead))
{
IncrementNumberOfErrorsOrDie();
return false;
}
@ -672,12 +671,16 @@ bool TextParser<ElemType>::TryGetInputId(size_t& id, size_t& bytesToRead)
if (ShouldWarn())
{
fprintf(stderr,
"WARNING: Invalid input ('%s') %ls. "
"WARNING: Unknown input ('%s') %ls. "
"Input name '%s' was not specified in the reader config section.\n",
name.c_str(), GetFileInfo().c_str(), name.c_str());
}
// return false here to skip this input, but do not call IncrementNumberOfErrorsOrDie()
return false;
}
else if (ShouldWarn())
if (ShouldWarn())
{
fprintf(stderr,
"WARNING: Input name prefix ('%c') is followed by"
@ -685,7 +688,7 @@ bool TextParser<ElemType>::TryGetInputId(size_t& id, size_t& bytesToRead)
NAME_PREFIX, c, GetFileInfo().c_str());
}
return false;
break;
}
else if (scratchIndex < (m_scratch.get() + m_maxAliasLength))
{
@ -702,19 +705,20 @@ bool TextParser<ElemType>::TryGetInputId(size_t& id, size_t& bytesToRead)
"WARNING: Did not find a valid input name %ls.\n",
GetFileInfo().c_str());
}
return false;
break;
}
++m_pos;
--bytesToRead;
}
if (ShouldWarn())
if (bytesToRead == 0 && ShouldWarn())
{
fprintf(stderr,
"WARNING: Exhausted all input expected for the current sequence"
" while reading an input name %ls.\n", GetFileInfo().c_str());
}
IncrementNumberOfErrorsOrDie();
return false;
}
@ -781,13 +785,13 @@ bool TextParser<ElemType>::TryReadDenseSample(vector<ElemType>& values, size_t s
++counter;
}
IncrementNumberOfErrorsOrDie();
if (ShouldWarn())
{
fprintf(stderr,
"WARNING: Exhausted all input expected for the current sequence"
" while reading a dense sample %ls.\n", GetFileInfo().c_str());
}
IncrementNumberOfErrorsOrDie();
return false;
}
@ -1135,8 +1139,13 @@ bool TextParser<ElemType>::TryReadRealNumber(ElemType& value, size_t& bytesToRea
}
break;
default:
LogicError("Reached an invalid state while reading a floating point value %ls.\n",
GetFileInfo().c_str());
if (ShouldWarn())
{
fprintf(stderr,
"WARNING: Reached an invalid state while reading a floating point value %ls.\n",
GetFileInfo().c_str());
}
return false;
}
++m_pos;

View file

@ -2,8 +2,6 @@
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// PostStat.cpp -- CNTK post statistics related actions
//
#include "PostComputingActions.h"
@ -118,7 +116,7 @@ void PostComputingActions<ElemType>::BatchNormalizationStatistics(IDataReader *
// push the statistics results of mean and variance of bn nodes into mpi updating vector
std::vector<Matrix<ElemType>*> learnParamsValues(2, nullptr);
SimpleDistGradAggregator<ElemType> distGradAgg(m_mpi, false /*useAsyncAggregation*/, 0 /*syncStatsTrace*/);
SimpleDistGradAggregator<ElemType> distGradAgg(m_mpi, false /*useAsyncAggregation*/, m_net->GetDeviceId(), 0 /*syncStatsTrace*/);
auto runMeanParameterPtr = node->Input(3);
auto runStdParameterPtr = node->Input(4);

View file

@ -1,4 +1,10 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// SGD.cpp -- implements SGD with all bells and whistles, parallelization, randomization, etc.
//
#define _CRT_SECURE_NO_WARNINGS // "secure" CRT not available on all platforms --add this at the top of all CPP files that give "function or variable may be unsafe" warnings
@ -327,7 +333,7 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
if (GetParallelizationMethod() == ParallelizationMethod::dataParallelSGD)
{
currentNumGradientBits = m_numGradientBits[startEpoch]; // remember so that we can detect a change
InitDistGradAgg(evaluationNodes.size(), currentNumGradientBits, m_traceLevel);
InitDistGradAgg(evaluationNodes.size(), currentNumGradientBits, net->GetDeviceId(), m_traceLevel);
}
else if (GetParallelizationMethod() == ParallelizationMethod::modelAveragingSGD ||
GetParallelizationMethod() == ParallelizationMethod::blockMomentumSGD)
@ -434,7 +440,7 @@ void SGD<ElemType>::TrainOrAdaptModel(int startEpoch, ComputationNetworkPtr net,
currentNumGradientBits != m_numGradientBits[i])
{
currentNumGradientBits = m_numGradientBits[i];
InitDistGradAgg(evaluationNodes.size(), currentNumGradientBits, m_traceLevel);
InitDistGradAgg(evaluationNodes.size(), currentNumGradientBits, net->GetDeviceId(), m_traceLevel);
}
Timer timer;
@ -2076,31 +2082,35 @@ void SGD<ElemType>::AttemptUtteranceDerivativeFeatures(ComputationNetworkPtr net
}
template <class ElemType>
void SGD<ElemType>::InitDistGradAgg(int numEvalNodes, int numGradientBits, int traceLevel)
void SGD<ElemType>::InitDistGradAgg(int numEvalNodes, int numGradientBits, int deviceId, int traceLevel)
{
assert(GetParallelizationMethod() == ParallelizationMethod::dataParallelSGD);
if (traceLevel > 0)
fprintf(stderr, "Initializing dataParallelSGD for %d-bit quantization.\n", numGradientBits);
#ifdef CNTK_PARALLEL_TRAINING_SUPPORT
if (Globals::UseV2Aggregator())
{
auto communicator = ::CNTK::QuantizedMPICommunicator(m_zeroThresholdFor1Bit, true, numGradientBits);
m_distGradAgg = std::make_shared<V2AllReduceDistGradAggregator<ElemType>>(communicator, m_bufferedAsyncGradientAggregation, traceLevel, m_syncStatsTrace);
}
else
m_distGradAgg = std::make_shared<AllReduceDistGradAggregator<ElemType>>(m_mpi, numGradientBits, m_zeroThresholdFor1Bit, true /*useQuantizationForSelfStripe*/, m_bufferedAsyncGradientAggregation, traceLevel, m_syncStatsTrace);
#else
if (numGradientBits != (8 * sizeof(ElemType)))
{
if (traceLevel > 0)
fprintf(stderr, "Initializing dataParallelSGD for %d-bit quantization.\n", numGradientBits);
#ifdef CNTK_PARALLEL_TRAINING_SUPPORT
if (Globals::UseV2Aggregator())
{
auto communicator = ::CNTK::QuantizedMPICommunicator(m_zeroThresholdFor1Bit, true, numGradientBits);
m_distGradAgg = std::make_shared<V2AllReduceDistGradAggregator<ElemType>>(communicator, m_bufferedAsyncGradientAggregation, traceLevel, m_syncStatsTrace);
}
else
m_distGradAgg = std::make_shared<AllReduceDistGradAggregator<ElemType>>(m_mpi, numGradientBits, m_zeroThresholdFor1Bit, true /*useQuantizationForSelfStripe*/, m_bufferedAsyncGradientAggregation, traceLevel, m_syncStatsTrace);
#else
RuntimeError("Gradient quantization is unsupported in CNTK binaries built without quantized gradient aggregation support!");
}
if (Globals::UseV2Aggregator()) // Currently used to check V2 against baselines.
m_distGradAgg = std::make_shared<V2SimpleDistGradAggregator<ElemType>>(m_mpi, m_bufferedAsyncGradientAggregation, m_syncStatsTrace, ::CNTK::MPICommunicator());
else
m_distGradAgg = std::make_shared<SimpleDistGradAggregator<ElemType>>(m_mpi, m_bufferedAsyncGradientAggregation, m_syncStatsTrace);
#endif // !CNTK_PARALLEL_TRAINING_SUPPORT
}
else
{
if (traceLevel > 0)
fprintf(stderr, "Initializing dataParallelSGD with FP%d aggregation.\n", numGradientBits);
if (Globals::UseV2Aggregator()) // Currently used to check V2 against baselines.
m_distGradAgg = std::make_shared<V2SimpleDistGradAggregator<ElemType>>(m_mpi, m_bufferedAsyncGradientAggregation, m_syncStatsTrace, ::CNTK::MPICommunicator());
else
m_distGradAgg = std::make_shared<SimpleDistGradAggregator<ElemType>>(m_mpi, m_bufferedAsyncGradientAggregation, deviceId, m_syncStatsTrace);
}
m_gradHeader.reset(DistGradHeader::Create(numEvalNodes), [](DistGradHeader* ptr) { DistGradHeader::Destroy(ptr); });
}
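For orientation, since the hunk above interleaves removed and added lines: after this change InitDistGradAgg chooses the aggregator in two branches keyed on the requested gradient precision, and the new deviceId parameter only feeds the full-precision SimpleDistGradAggregator. A condensed sketch of the resulting control flow, using only the names visible in the hunk (not compilable in isolation):

// Sketch of the post-change selection logic in InitDistGradAgg, condensed from the hunk above.
if (numGradientBits != (8 * sizeof(ElemType))) // quantized gradients requested
{
#ifdef CNTK_PARALLEL_TRAINING_SUPPORT
    if (Globals::UseV2Aggregator())
        m_distGradAgg = std::make_shared<V2AllReduceDistGradAggregator<ElemType>>(
            ::CNTK::QuantizedMPICommunicator(m_zeroThresholdFor1Bit, true, numGradientBits),
            m_bufferedAsyncGradientAggregation, traceLevel, m_syncStatsTrace);
    else
        m_distGradAgg = std::make_shared<AllReduceDistGradAggregator<ElemType>>(
            m_mpi, numGradientBits, m_zeroThresholdFor1Bit, true /*useQuantizationForSelfStripe*/,
            m_bufferedAsyncGradientAggregation, traceLevel, m_syncStatsTrace);
#else
    RuntimeError("Gradient quantization is unsupported in CNTK binaries built without quantized gradient aggregation support!");
#endif
}
else // full-precision path; this is where the new deviceId parameter lands
{
    if (Globals::UseV2Aggregator())
        m_distGradAgg = std::make_shared<V2SimpleDistGradAggregator<ElemType>>(
            m_mpi, m_bufferedAsyncGradientAggregation, m_syncStatsTrace, ::CNTK::MPICommunicator());
    else
        m_distGradAgg = std::make_shared<SimpleDistGradAggregator<ElemType>>(
            m_mpi, m_bufferedAsyncGradientAggregation, deviceId, m_syncStatsTrace);
}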

View file

@ -491,7 +491,7 @@ protected:
const std::string& prefixMsg = "",
const size_t maxNumberOfSamples = SIZE_MAX);
void InitDistGradAgg(int numEvalNodes, int numGradientBits, int traceLevel);
void InitDistGradAgg(int numEvalNodes, int numGradientBits, int deviceId, int traceLevel);
void InitModelAggregationHandler(int traceLevel, DEVICEID_TYPE devID);
public:
// UpdateWeights() - actual weight update, implementing various update rules

View file

@ -1,7 +1,14 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Copyright (c) 2016, NVIDIA CORPORATION. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "IDistGradAggregator.h"
#include "CUDAPageLockedMemAllocator.h"
#include "NcclComm.h"
#include <future>
#include "GPUDataTransferer.h"
#include "TimerUtility.h"
@ -15,8 +22,8 @@ class SimpleDistGradAggregator : public IDistGradAggregator<ElemType>
UsingIDistGradAggregatorMembers;
public:
SimpleDistGradAggregator(const MPIWrapperPtr& mpi, bool useAsyncAggregation, int syncStatsTrace)
: IDistGradAggregator<ElemType>(mpi), m_useAsyncAggregation(useAsyncAggregation), m_initialized(false), m_bufferedGradHeader(nullptr), m_syncStatsTrace(syncStatsTrace), m_iterationCount(0)
SimpleDistGradAggregator(const MPIWrapperPtr& mpi, bool useAsyncAggregation, int deviceId, int syncStatsTrace)
: IDistGradAggregator<ElemType>(mpi), m_useAsyncAggregation(useAsyncAggregation), m_initialized(false), m_bufferedGradHeader(nullptr), m_syncStatsTrace(syncStatsTrace), m_iterationCount(0), m_nccl(deviceId, mpi)
{}
~SimpleDistGradAggregator()
@ -136,7 +143,8 @@ private:
{
m_initialized = true;
int deviceId = gradients[0]->GetDeviceId();
if (deviceId != CPUDEVICE)
if (!m_nccl.IsSupported() && deviceId != CPUDEVICE)
m_allocator.reset(new CUDAPageLockedMemAllocator(deviceId));
for (size_t i = 0; i < gradients.size(); i++)
@ -145,7 +153,7 @@ private:
if (gradients[i]->GetMatrixType() != DENSE)
RuntimeError("Gradient aggregation for sparse gradient matrices is currently unsupported!");
if (deviceId != CPUDEVICE)
if (!m_nccl.IsSupported() && deviceId != CPUDEVICE)
{
m_gpuDataTransferers.push_back(std::make_unique<GPUDataTransferer>(deviceId, m_useAsyncAggregation));
m_intermediateCPUBuffers.push_back(AllocateIntermediateBuffer(deviceId, gradients[i]->GetNumElements()));
@ -216,7 +224,7 @@ private:
}
// Initiate transfer of the gradient matrices to the CPU if needed
if (deviceId >= 0)
if (!m_nccl.IsSupported() && deviceId >= 0)
{
for (size_t i = 0; i < numGradMatrices; ++i)
m_gpuDataTransferers[i]->CopyGPUToCPUAsync(gradients[i]->Data(), gradients[i]->GetNumElements(), m_intermediateCPUBuffers[i].get());
@ -239,20 +247,27 @@ private:
if (!m_mpi->IsMainNode())
MPI_Isend(headerCPU, headerCPU->Size(), MPI_CHAR, m_mpi->MainNodeRank(), numGradMatrices, m_mpi->Communicator(), &sendHeaderRequest) || MpiFail("MPI_Isend");
// Perform MPI async allreduce on the gradient data
// Perform async allreduce on the gradient data
std::vector<MPI_Request> allReduceRequests(numGradMatrices);
for (size_t i = 0; i < numGradMatrices; ++i)
if (!m_nccl.IsSupported())
{
ElemType* reductionBuffer = gradients[i]->Data();
if (deviceId >= 0)
for (size_t i = 0; i < numGradMatrices; ++i)
{
m_gpuDataTransferers[i]->WaitForCopyGPUToCPUAsync();
reductionBuffer = m_intermediateCPUBuffers[i].get();
}
ElemType* reductionBuffer = gradients[i]->Data();
if (deviceId >= 0)
{
m_gpuDataTransferers[i]->WaitForCopyGPUToCPUAsync();
reductionBuffer = m_intermediateCPUBuffers[i].get();
}
// On Windows this async MPI_Iallreduce call requires MS MPI v7 or higher to be installed
MPI_Iallreduce(MPI_IN_PLACE, reductionBuffer, gradients[i]->GetNumElements(), MPIWrapper::GetDataType(reductionBuffer), MPI_SUM, m_mpi->Communicator(), &allReduceRequests[i]) || MpiFail("MPI_Iallreduce");
// On Windows this async MPI_Iallreduce call requires MS MPI v7 or higher to be installed
MPI_Iallreduce(MPI_IN_PLACE, reductionBuffer, gradients[i]->GetNumElements(),
MPIWrapper::GetDataType(reductionBuffer), MPI_SUM,
m_mpi->Communicator(), &allReduceRequests[i]) || MpiFail("MPI_Iallreduce");
}
}
else
m_nccl.AllReduce(gradients);
// On the main node wait for the headers to arrive and aggregate
if (m_mpi->IsMainNode())
@ -293,11 +308,14 @@ private:
}
// Wait for the allreduce operations to finish and initiate transfer back to the GPU if needed
for (size_t i = 0; i < numGradMatrices; ++i)
if (!m_nccl.IsSupported())
{
MPI_Wait(&allReduceRequests[i], MPI_STATUSES_IGNORE) || MpiFail("MPI_Wait");
if (deviceId >= 0)
m_gpuDataTransferers[i]->CopyCPUToGPUAsync(m_intermediateCPUBuffers[i].get(), gradients[i]->GetNumElements(), gradients[i]->Data());
for (size_t i = 0; i < numGradMatrices; ++i)
{
MPI_Wait(&allReduceRequests[i], MPI_STATUSES_IGNORE) || MpiFail("MPI_Wait");
if (deviceId >= 0)
m_gpuDataTransferers[i]->CopyCPUToGPUAsync(m_intermediateCPUBuffers[i].get(), gradients[i]->GetNumElements(), gradients[i]->Data());
}
}
// Wait to receive aggregate header
@ -305,7 +323,9 @@ private:
MPI_Wait(&recvAggHeaderRequest, MPI_STATUSES_IGNORE) || MpiFail("MPI_Wait");
// Wait for all the transfers to finish
if (deviceId >= 0)
if (m_nccl.IsSupported())
m_nccl.Sync();
else if (deviceId >= 0)
{
for (size_t i = 0; i < numGradMatrices; ++i)
m_gpuDataTransferers[i]->WaitForCopyCPUToGPUAsync();
@ -349,5 +369,7 @@ private:
size_t m_iterationCount;
bool m_initialized;
NcclComm m_nccl;
};
} } }
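Taken together, the scattered hunks above gate every host-staging step on m_nccl.IsSupported(). A condensed sketch of the resulting aggregation flow, assuming the member names shown above (buffer setup and the MPI header exchange omitted; not compilable in isolation):

// With NCCL available, gradients never leave the GPU; otherwise the pre-existing
// copy-to-CPU + MPI_Iallreduce + copy-back path is used.
if (m_nccl.IsSupported())
{
    m_nccl.AllReduce(gradients); // in-place allreduce directly on the device buffers
    // ... the MPI header exchange proceeds unchanged ...
    m_nccl.Sync();               // wait for the NCCL reductions to finish
}
else
{
    for (size_t i = 0; i < numGradMatrices; ++i)
    {
        ElemType* reductionBuffer = gradients[i]->Data();
        if (deviceId >= 0) // GPU gradients are staged through page-locked CPU buffers
        {
            m_gpuDataTransferers[i]->WaitForCopyGPUToCPUAsync();
            reductionBuffer = m_intermediateCPUBuffers[i].get();
        }
        MPI_Iallreduce(MPI_IN_PLACE, reductionBuffer, gradients[i]->GetNumElements(),
                       MPIWrapper::GetDataType(reductionBuffer), MPI_SUM,
                       m_mpi->Communicator(), &allReduceRequests[i]) || MpiFail("MPI_Iallreduce");
    }
    // ... MPI_Wait on each request, then CopyCPUToGPUAsync back to the device ...
}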

View file

@ -2,6 +2,7 @@
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#include "V2SimpleDistGradAggregator.h"
@ -167,7 +168,7 @@ public:
if (Globals::UseV2Aggregator())
m_distGradAgg = make_shared<V2SimpleDistGradAggregator<ElemType>>(m_mpi, false /*useAsyncAggregation*/, 0 /*syncStatsTrace*/, ::CNTK::MPICommunicator());
else
m_distGradAgg = make_shared<SimpleDistGradAggregator<ElemType>>(m_mpi, false /*useAsyncAggregation*/, 0 /*syncStatsTrace*/);
m_distGradAgg = make_shared<SimpleDistGradAggregator<ElemType>>(m_mpi, false /*useAsyncAggregation*/, m_net->GetDeviceId(), 0 /*syncStatsTrace*/);
}
m_gradHeader->numEvalNode = evalNodes.size();

View file

@ -0,0 +1,113 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release|x64">
<Configuration>Release</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Debug_CpuOnly|x64">
<Configuration>Debug_CpuOnly</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
<ProjectConfiguration Include="Release_CpuOnly|x64">
<Configuration>Release_CpuOnly</Configuration>
<Platform>x64</Platform>
</ProjectConfiguration>
</ItemGroup>
<PropertyGroup Label="Globals">
<ProjectGuid>{5D29C76D-648A-456F-920D-48230F2FB3C8}</ProjectGuid>
<Keyword>Win32Proj</Keyword>
<RootNamespace>CPPEvalExtendedClientTest</RootNamespace>
<ProjectName>CPPEvalExtendedClientTest</ProjectName>
</PropertyGroup>
<Import Project="$(SolutionDir)\CNTK.Cpp.props" />
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
<UseIntelMKL>No</UseIntelMKL>
</PropertyGroup>
<PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
<UseIntelMKL>No</UseIntelMKL>
<UseIntelIPP>false</UseIntelIPP>
</PropertyGroup>
<!-- Importing the C++ defaults must occur after declaring the desired toolset above;
otherwise, the build may fall back to a previous toolset. -->
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings" />
<ImportGroup Label="PropertySheets">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
</ImportGroup>
<PropertyGroup>
<!-- TODO intentional for all? -->
<LinkIncremental>false</LinkIncremental>
<TargetName>CPPEvalExtendedClientTest</TargetName>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<WarningLevel>Level4</WarningLevel>
<AdditionalIncludeDirectories>$(SolutionDir)Source\Common\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<PreprocessorDefinitions>WIN32;UNICODE;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>true</SDLCheck>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<FloatingPointModel>Fast</FloatingPointModel>
<OpenMPSupport>true</OpenMPSupport>
<TreatWarningAsError>true</TreatWarningAsError>
</ClCompile>
<Link>
<AdditionalLibraryDirectories>$(OutDir)</AdditionalLibraryDirectories>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>EvalDLL.lib;%(AdditionalDependencies)</AdditionalDependencies>
<DelayLoadDLLs>%(DelayLoadDLLs)</DelayLoadDLLs>
<Profile>true</Profile>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(DebugBuild)">
<ClCompile>
<PreprocessorDefinitions>_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<Optimization>Disabled</Optimization>
<MinimalRebuild>false</MinimalRebuild>
</ClCompile>
<Link />
<ProjectReference>
<LinkLibraryDependencies>false</LinkLibraryDependencies>
</ProjectReference>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(ReleaseBuild)">
<ClCompile>
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<EnableParallelCodeGeneration>true</EnableParallelCodeGeneration>
<FloatingPointExceptions>false</FloatingPointExceptions>
<AdditionalOptions>/d2Zi+ %(AdditionalOptions)</AdditionalOptions>
<RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
</ClCompile>
<Link>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
</Link>
<ProjectReference>
<LinkLibraryDependencies>true</LinkLibraryDependencies>
</ProjectReference>
</ItemDefinitionGroup>
<ItemGroup>
<ClCompile Include="..\..\..\..\Examples\Evaluation\CPPEvalExtendedClient\CPPEvalExtendedClient.cpp" />
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" />
</Project>

View file

@ -0,0 +1,22 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<Filter Include="Source Files">
<UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
<Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
</Filter>
<Filter Include="Header Files">
<UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
<Extensions>h;hh;hpp;hxx;hm;inl;inc;xsd</Extensions>
</Filter>
<Filter Include="Resource Files">
<UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
<Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
</Filter>
</ItemGroup>
<ItemGroup>
<ClCompile Include="CPPEvalExtendedClient.cpp">
<Filter>Source Files</Filter>
</ClCompile>
</ItemGroup>
</Project>

View file

@ -0,0 +1,2 @@
This folder contains the VC++ project file for building CPPEvalExtendedClientTest.exe.
The C++ source code used by the project is in Examples\Evaluation\CPPEvalExtendedClient.

View file

@ -0,0 +1,114 @@
CPU info:
CPU Model Name: Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz
Hardware threads: 32
Total Memory: 33468508 kB
-------------------------------------------------------------------
+ [[ -z E:\CNTKTestData ]]
+ [[ ! -d E:\CNTKTestData ]]
+ '[' Windows_NT == Windows_NT ']'
++ cygpath -au 'E:\CNTKTestData'
+ TestDataDir=/cygdrive/e/CNTKTestData
+ ATISDir=/cygdrive/c/repos/cntk/Tests/EndToEndTests/../../Examples/Text/ATIS
+ DataDir=/cygdrive/c/repos/cntk/Tests/EndToEndTests/../../Examples/Text/ATIS/Data
+ OutputDir=/cygdrive/c/repos/cntk/Tests/EndToEndTests/../../Examples/Text/ATIS/Data
+ ConfigDir=/cygdrive/c/repos/cntk/Tests/EndToEndTests/../../Examples/Text/ATIS
+ DeleteModelsAfterTest=0
+ '[' -f /cygdrive/c/repos/cntk/Tests/EndToEndTests/../../Examples/Text/ATIS/ATIS.cntk ']'
+ cntkrun ATIS.cntk 'stderr=- command=Train Train=[SGD=[maxEpochs=1]]'
+ configFileName=ATIS.cntk
+ additionalCNTKArgs='stderr=- command=Train Train=[SGD=[maxEpochs=1]]'
+ '[' Windows_NT == Windows_NT ']'
++ cygpath -aw /cygdrive/c/repos/cntk/Tests/EndToEndTests/../../Examples/Text/ATIS
+ ConfigDir='C:\repos\cntk\Examples\Text\ATIS'
++ cygpath -aw /tmp/cntk-test-20161108174139.565799/EvalClientTests_CPPEvalExtendedClientTest@release_cpu
+ RunDir='C:\cygwin64\tmp\cntk-test-20161108174139.565799\EvalClientTests_CPPEvalExtendedClientTest@release_cpu'
++ cygpath -aw /cygdrive/c/repos/cntk/Tests/EndToEndTests/../../Examples/Text/ATIS/Data
+ DataDir='C:\repos\cntk\Examples\Text\ATIS\Data'
++ cygpath -aw /cygdrive/c/repos/cntk/Tests/EndToEndTests/../../Examples/Text/ATIS/Data
+ OutputDir='C:\repos\cntk\Examples\Text\ATIS\Data'
+ CNTKArgs='configFile=C:\repos\cntk\Examples\Text\ATIS/ATIS.cntk currentDirectory=C:\repos\cntk\Examples\Text\ATIS\Data RunDir=C:\cygwin64\tmp\cntk-test-20161108174139.565799\EvalClientTests_CPPEvalExtendedClientTest@release_cpu DataDir=C:\repos\cntk\Examples\Text\ATIS\Data ConfigDir=C:\repos\cntk\Examples\Text\ATIS OutputDir=C:\repos\cntk\Examples\Text\ATIS\Data DeviceId=-1 timestamping=true stderr=- command=Train Train=[SGD=[maxEpochs=1]]'
+ '[' '' '!=' '' ']'
+ modelsDir=/tmp/cntk-test-20161108174139.565799/EvalClientTests_CPPEvalExtendedClientTest@release_cpu/Models
+ [[ 1 == 1 ]]
+ '[' -d /tmp/cntk-test-20161108174139.565799/EvalClientTests_CPPEvalExtendedClientTest@release_cpu/Models ']'
+ mkdir -p /tmp/cntk-test-20161108174139.565799/EvalClientTests_CPPEvalExtendedClientTest@release_cpu/Models
+ [[ 0 == 0 ]]
+ run /cygdrive/c/repos/cntk/x64/release_CpuOnly/cntk.exe 'configFile=C:\repos\cntk\Examples\Text\ATIS/ATIS.cntk' 'currentDirectory=C:\repos\cntk\Examples\Text\ATIS\Data' 'RunDir=C:\cygwin64\tmp\cntk-test-20161108174139.565799\EvalClientTests_CPPEvalExtendedClientTest@release_cpu' 'DataDir=C:\repos\cntk\Examples\Text\ATIS\Data' 'ConfigDir=C:\repos\cntk\Examples\Text\ATIS' 'OutputDir=C:\repos\cntk\Examples\Text\ATIS\Data' DeviceId=-1 timestamping=true stderr=- command=Train 'Train=[SGD=[maxEpochs=1]]'
+ cmd=/cygdrive/c/repos/cntk/x64/release_CpuOnly/cntk.exe
+ shift
+ '[' '' == 1 ']'
+ echo === Running /cygdrive/c/repos/cntk/x64/release_CpuOnly/cntk.exe 'configFile=C:\repos\cntk\Examples\Text\ATIS/ATIS.cntk' 'currentDirectory=C:\repos\cntk\Examples\Text\ATIS\Data' 'RunDir=C:\cygwin64\tmp\cntk-test-20161108174139.565799\EvalClientTests_CPPEvalExtendedClientTest@release_cpu' 'DataDir=C:\repos\cntk\Examples\Text\ATIS\Data' 'ConfigDir=C:\repos\cntk\Examples\Text\ATIS' 'OutputDir=C:\repos\cntk\Examples\Text\ATIS\Data' DeviceId=-1 timestamping=true stderr=- command=Train 'Train=[SGD=[maxEpochs=1]]'
=== Running /cygdrive/c/repos/cntk/x64/release_CpuOnly/cntk.exe configFile=C:\repos\cntk\Examples\Text\ATIS/ATIS.cntk currentDirectory=C:\repos\cntk\Examples\Text\ATIS\Data RunDir=C:\cygwin64\tmp\cntk-test-20161108174139.565799\EvalClientTests_CPPEvalExtendedClientTest@release_cpu DataDir=C:\repos\cntk\Examples\Text\ATIS\Data ConfigDir=C:\repos\cntk\Examples\Text\ATIS OutputDir=C:\repos\cntk\Examples\Text\ATIS\Data DeviceId=-1 timestamping=true stderr=- command=Train Train=[SGD=[maxEpochs=1]]
+ /cygdrive/c/repos/cntk/x64/release_CpuOnly/cntk.exe 'configFile=C:\repos\cntk\Examples\Text\ATIS/ATIS.cntk' 'currentDirectory=C:\repos\cntk\Examples\Text\ATIS\Data' 'RunDir=C:\cygwin64\tmp\cntk-test-20161108174139.565799\EvalClientTests_CPPEvalExtendedClientTest@release_cpu' 'DataDir=C:\repos\cntk\Examples\Text\ATIS\Data' 'ConfigDir=C:\repos\cntk\Examples\Text\ATIS' 'OutputDir=C:\repos\cntk\Examples\Text\ATIS\Data' DeviceId=-1 timestamping=true stderr=- command=Train 'Train=[SGD=[maxEpochs=1]]'
CNTK 2.0.beta2.0+ (zhouwang/pr899 0b1214, Nov 8 2016 17:27:36) on ZHOUWANGDEV4 at 2016/11/08 16:41:40
C:\repos\cntk\x64\release_CpuOnly\cntk.exe configFile=C:\repos\cntk\Examples\Text\ATIS/ATIS.cntk currentDirectory=C:\repos\cntk\Examples\Text\ATIS\Data RunDir=C:\cygwin64\tmp\cntk-test-20161108174139.565799\EvalClientTests_CPPEvalExtendedClientTest@release_cpu DataDir=C:\repos\cntk\Examples\Text\ATIS\Data ConfigDir=C:\repos\cntk\Examples\Text\ATIS OutputDir=C:\repos\cntk\Examples\Text\ATIS\Data DeviceId=-1 timestamping=true stderr=- command=Train Train=[SGD=[maxEpochs=1]]
Changed current directory to C:\repos\cntk\Examples\Text\ATIS\Data
11/08/2016 16:41:40: Redirecting stderr to file -_Train.logrank0
CNTK 2.0.beta2.0+ (zhouwang/pr899 0b1214, Nov 8 2016 17:27:36) on ZHOUWANGDEV4 at 2016/11/08 16:41:40
C:\repos\cntk\x64\release_CpuOnly\cntk.exe configFile=C:\repos\cntk\Examples\Text\ATIS/ATIS.cntk currentDirectory=C:\repos\cntk\Examples\Text\ATIS\Data RunDir=C:\cygwin64\tmp\cntk-test-20161108174139.565799\EvalClientTests_CPPEvalExtendedClientTest@release_cpu DataDir=C:\repos\cntk\Examples\Text\ATIS\Data ConfigDir=C:\repos\cntk\Examples\Text\ATIS OutputDir=C:\repos\cntk\Examples\Text\ATIS\Data DeviceId=-1 timestamping=true stderr=- command=Train Train=[SGD=[maxEpochs=1]]
11/08/2016 16:41:40: ##############################################################################
11/08/2016 16:41:40: # #
11/08/2016 16:41:40: # Train command (train action) #
11/08/2016 16:41:40: # #
11/08/2016 16:41:40: ##############################################################################
Node 'lstmStack.layers[0].lstmState._.ot._.PlusArgs[0].PlusArgs[0].PlusArgs[1].TimesArgs[0]' (LearnableParameter operation) operation: Tensor shape was inferred as [300 x 150].
Node 'lstmStack.layers[0].lstmState._.ft._.PlusArgs[0].PlusArgs[0].PlusArgs[1].TimesArgs[0]' (LearnableParameter operation) operation: Tensor shape was inferred as [300 x 150].
Node 'lstmStack.layers[0].lstmState._.it._.PlusArgs[0].PlusArgs[0].PlusArgs[1].TimesArgs[0]' (LearnableParameter operation) operation: Tensor shape was inferred as [300 x 150].
Node 'lstmStack.layers[0].lstmState._.bit.ElementTimesArgs[1].z.PlusArgs[0].PlusArgs[1].TimesArgs[0]' (LearnableParameter operation) operation: Tensor shape was inferred as [300 x 150].
11/08/2016 16:41:40:
Model has 61 nodes. Using CPU.
11/08/2016 16:41:40: Training criterion: cr = CrossEntropyWithSoftmax
11/08/2016 16:41:40: Evaluation criterion: errs = ClassificationError
11/08/2016 16:41:40: Training 1005127 parameters in 18 parameter tensors.
11/08/2016 16:42:02: Finished Epoch[ 1 of 1]: [Training] cr = 0.40189165 * 36006; errs = 8.254% * 36006; totalSamplesSeen = 36006; learningRatePerSample = 0.0099999998; epochTime=22.2249s
11/08/2016 16:42:02: __COMPLETED__
+ return 0
+ local ExitCode=0
+ [[ 0 == 1 ]]
+ return 0
+ '[' -d 'C:\repos\cntk\Examples\Text\ATIS\Data/work' ']'
+ '[' -d /cygdrive/c/repos/cntk/Tests/EndToEndTests/../../Examples/Text/ATIS/work ']'
+ mv 'C:\repos\cntk\Examples\Text\ATIS\Data/work' /cygdrive/c/repos/cntk/Tests/EndToEndTests/../../Examples/Text/ATIS/
+ '[' Windows_NT == Windows_NT ']'
+ /cygdrive/c/repos/cntk/x64/release_CpuOnly/CPPEvalExtendedClientTest.exe
Input node name: featuresCW
Input feature dimension: 944
Input node name: featuresNW
Input feature dimension: 944
Input node name: featuresPW
Input feature dimension: 944
Slot tag for sentence "BOS i would like to find a flight from charlotte to las vegas that makes a stop in st. louis EOS" is as followings:
i -- I-transport_type
would -- I-transport_type
like -- I-transport_type
to -- I-transport_type
find -- I-transport_type
a -- I-transport_type
flight -- I-transport_type
from -- I-transport_type
charlotte -- B-fromloc.airport_name
to -- I-transport_type
las -- B-toloc.airport_name
vegas -- I-toloc.airport_name
that -- I-transport_type
makes -- I-transport_type
a -- I-transport_type
stop -- I-transport_type
in -- I-transport_type
st. -- B-stoploc.airport_name
louis -- I-state_name
Evaluation complete.
Output dimension: 127
Output name: outputs
+ ExitCode=0
+ '[' -d /cygdrive/c/repos/cntk/Tests/EndToEndTests/../../Examples/Text/ATIS/work ']'
+ rm -rf /cygdrive/c/repos/cntk/Tests/EndToEndTests/../../Examples/Text/ATIS/work
+ exit 0

View file

@ -0,0 +1,48 @@
#!/bin/bash
. $TEST_ROOT_DIR/run-test-common
set -x
# This test verifies that CPPEvalExtendedClient works with the same setup a user would have.
# To that end, the test creates the pre-trained model in the Examples directories, where CPPEvalExtendedClient expects it.
# These files are removed by Jenkins during workspace cleanup.
# The eval test uses pre-trained models that are not part of the CNTK repository itself;
# the dataset is taken from an external location specified via an environment variable.
if [[ -z "$CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY" || ! -d "$CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY" ]]; then
echo This test uses external data that is not part of the CNTK repository. Environment variable CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY must be set to point to the external test data location.
exit 1
fi
if [ "$OS" == "Windows_NT" ]; then
TestDataDir=`cygpath -au $CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY`
else
TestDataDir=$CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY
fi
ATISDir=$TEST_ROOT_DIR/../../Examples/Text/ATIS
DataDir=$ATISDir/Data
OutputDir=$ATISDir/Data
ConfigDir=$ATISDir
# Train model for evaluation
DeleteModelsAfterTest=0
[ -f $ConfigDir/ATIS.cntk ] || exit 1
cntkrun ATIS.cntk "stderr=- command=Train Train=[SGD=[maxEpochs=1]]" || exit $?
# The created model is saved under $DataDir/work, according to ATIS.cntk. Move it to $ATISDir/work.
[ -d $DataDir/work ] || exit $?
[ -d $ATISDir/work ] && rm -rf $ATISDir/work
mv $DataDir/work $ATISDir/ || exit $?
if [ "$OS" == "Windows_NT" ]; then
$TEST_BIN_DIR/CPPEvalExtendedClientTest.exe
else
$TEST_BIN_DIR/cppevalextendedclient
fi
ExitCode=$?
[ -d $ATISDir/work ] && rm -rf $ATISDir/work
exit $ExitCode

View file

@ -0,0 +1,92 @@
dataDir: .
tags:
- bvt-i (build_sku != '1bitsgd') and ((build_sku == 'cpu') or (device == 'gpu')) and (flavor == 'release')
# This test also runs in debug mode, as the debug version of EvalDll is also included in the NuGet package.
- nightly-i (build_sku != '1bitsgd') and ((build_sku == 'cpu') or (device == 'gpu'))
testCases:
Test run must be completed:
patterns:
- Evaluation complete
# Due to time limitations, the test can only train the model for 1 epoch, so the
# model is not accurate enough to produce correct results under some build flavors.
# Result checking is disabled for now.
#Test results Line 1:
#patterns:
# - i -- I-transport_type
#Test results Line 2:
# patterns:
# - would -- I-transport_type
#Test results Line 3:
# patterns:
# - like -- I-transport_type
#Test results Line 4:
# patterns:
# - to -- I-transport_type
#Test results Line 5:
# patterns:
# - find -- I-transport_type
#Test results Line 6:
# patterns:
# - a -- I-transport_type
#Test results Line 7:
# patterns:
# - flight -- I-transport_type
#Test results Line 8:
# patterns:
# - from -- I-transport_type
#Test results Line 9:
# patterns:
# - charlotte -- B-fromloc.airport_name
#Test results Line 10:
# patterns:
# - to -- I-transport_type
#Test results Line 11:
# patterns:
# - las -- B-toloc.airport_name
#Test results Line 12:
# patterns:
# - vegas -- I-toloc.airport_name
#Test results Line 13:
# patterns:
# - that -- I-transport_type
#Test results Line 14:
# patterns:
# - makes -- I-transport_type
#Test results Line 15:
# patterns:
# - a -- I-transport_type
#Test results Line 16:
# patterns:
# - stop -- I-transport_type
#Test results Line 17:
# patterns:
# - in -- I-transport_type
#Test results Line 18:
# patterns:
# - st. -- B-stoploc.airport_name
#Test results Line 19:
# patterns:
# - louis -- I-state_name

View file

@ -1229,6 +1229,9 @@ Test module "ReaderTests" has passed with:
Test case "ReaderTestSuite/CNTKTextFormatReader_Simple_dense" has passed with:
1 assertion out of 1 passed
Test case "ReaderTestSuite/CNTKTextFormatReader_Simple_dense_single_stream" has passed with:
1 assertion out of 1 passed
Test case "ReaderTestSuite/CNTKTextFormatReader_MNIST_dense" has passed with:
1 assertion out of 1 passed

View file

@ -3,7 +3,7 @@ set -x -e -o pipefail
USAGE="Usage: $0 <drops-to-test>"
REPO_TAG=v2.0.beta2.0
REPO_TAG=v2.0.beta3.0
while [ $# -gt 0 ]; do
case "$1" in
@ -52,9 +52,11 @@ for drop in $*; do
if [[ "$DROP_FILE" == *CPU* ]] || [[ "$DROP_FILE" == *cpu* ]]; then
TEST_DEVICE=cpu
DOCKER_TO_RUN=docker
DOCKERFILE_SUFFIX=CPU
else
TEST_DEVICE=gpu
DOCKER_TO_RUN=nvidia-docker
DOCKERFILE_SUFFIX=GPU
fi
rm -f "$DROP_RESERVED"
@ -63,7 +65,7 @@ for drop in $*; do
IMAGE=cntk:installtest
for base in Ubuntu16 Ubuntu14; do
docker build -t $IMAGE -f Dockerfile-$base-GPU --build-arg REPO_TAG=$REPO_TAG .
docker build -t $IMAGE -f Dockerfile-$base-$DOCKERFILE_SUFFIX --build-arg REPO_TAG=$REPO_TAG .
$DOCKER_TO_RUN run --rm $IMAGE su - testuser -c "./run-test.sh $TEST_DEVICE"
docker rmi $IMAGE
done

View file

@ -8,9 +8,4 @@
#define BOOST_TEST_MODULE BrainScriptTests
#include "stdafx.h"
// TODO: Temporary mechanism to enable memory sharing for
// node output value matrices. This will go away when the
// sharing is ready to be enabled by default
bool g_shareNodeValueMatrices = false;
#include "stdafx.h"

View file

@ -9,9 +9,4 @@
#include "MPIWrapper.h"
// TODO: Get rid of these globals
Microsoft::MSR::CNTK::MPIWrapper* g_mpi = nullptr;
// TODO: Temporary mechanism to enable memory sharing for
// node output value matrices. This will go away when the
// sharing is ready to be enabled by default
bool g_shareNodeValueMatrices = false;
Microsoft::MSR::CNTK::MPIWrapper* g_mpi = nullptr;

View file

@ -101,6 +101,23 @@ BOOST_AUTO_TEST_CASE(CNTKTextFormatReader_Simple_dense)
1);
};
BOOST_AUTO_TEST_CASE(CNTKTextFormatReader_Simple_dense_single_stream)
{
HelperRunReaderTest<float>(
testDataPath() + "/Config/CNTKTextFormatReader/dense.cntk",
testDataPath() + "/Control/CNTKTextFormatReader/Simple_dense_single_stream.txt",
testDataPath() + "/Control/CNTKTextFormatReader/Simple_dense_single_stream_Output.txt",
"Simple_single_stream",
"reader",
1000, // epoch size
250, // mb size
10, // num epochs
1,
0,
0,
1);
};
BOOST_AUTO_TEST_CASE(CNTKTextFormatReader_MNIST_dense)
{

View file

@ -219,6 +219,26 @@ Simple = [
]
]
Simple_single_stream = [
precision = "float"
reader = [
traceLevel = 0 # this will disable warnings triggered by unknown input names.
readerType = "CNTKTextFormatReader"
file = "Simple_dense.txt"
randomize = false
input = [
features = [
alias = "F"
dim = 2
format = "dense"
]
]
]
]
50x20_jagged_sequences = [
precision = "double"

The file diff is not shown due to its large size. (Load diff)

View file

@ -16,7 +16,7 @@ WARNING: Maximum per-input number of samples for sequence (id = 2) at offset 435
INFO: Finished loading sequence (id = 2) at offset 435 in the input file (invalid_inputs.txt), successfully read 14 out of expected 14 rows.
WARNING: Input name prefix ('|') is followed by an invalid character (' ') at offset 454 in the input file (invalid_inputs.txt).
WARNING: Input name prefix ('|') is followed by an invalid character (' ') at offset 483 in the input file (invalid_inputs.txt).
WARNING: Invalid input ('C') at offset 544 in the input file (invalid_inputs.txt). Input name 'C' was not specified in the reader config section.
WARNING: Unknown input ('C') at offset 544 in the input file (invalid_inputs.txt). Input name 'C' was not specified in the reader config section.
WARNING: Empty input row at offset 549 in the input file (invalid_inputs.txt).
WARNING: Could not read a row (# 9) while loading sequence (id = 3) at offset 549 in the input file (invalid_inputs.txt).
WARNING: Exhausted all input expected for the current sequence (id = 3) at offset 549 in the input file (invalid_inputs.txt), but only read 8 out of 9 expected rows.

View file

@ -200,7 +200,6 @@ inline CNTK::FunctionPtr Stabilize(const CNTK::Variable& x, const CNTK::DeviceDe
template <typename ElementType>
std::pair<CNTK::FunctionPtr, CNTK::FunctionPtr> LSTMPCellWithSelfStabilization(CNTK::Variable input, CNTK::Variable prevOutput, CNTK::Variable prevCellState, const CNTK::DeviceDescriptor& device)
{
size_t inputDim = input.Shape()[0];
size_t outputDim = prevOutput.Shape()[0];
size_t cellDim = prevCellState.Shape()[0];
@ -209,8 +208,8 @@ std::pair<CNTK::FunctionPtr, CNTK::FunctionPtr> LSTMPCellWithSelfStabilization(C
};
unsigned long seed = 1;
auto createProjectionParam = [device, &seed](size_t outputDim, size_t inputDim) {
return CNTK::Parameter({ outputDim, inputDim }, CNTK::AsDataType<ElementType>(), CNTK::GlorotUniformInitializer(1, 0, 1, seed++), device);
auto createProjectionParam = [device, &seed](size_t outputDim) {
return CNTK::Parameter({ outputDim, CNTK::NDShape::InferredDimension }, CNTK::AsDataType<ElementType>(), CNTK::GlorotUniformInitializer(1, 0, 1, seed++), device);
};
auto createDiagWeightParam = [device, &seed](size_t dim) {
@ -220,26 +219,26 @@ std::pair<CNTK::FunctionPtr, CNTK::FunctionPtr> LSTMPCellWithSelfStabilization(C
auto stabilizedPrevOutput = Stabilize<ElementType>(prevOutput, device);
auto stabilizedPrevCellState = Stabilize<ElementType>(prevCellState, device);
auto projectInput = [input, cellDim, inputDim, createBiasParam, createProjectionParam]() {
return createBiasParam(cellDim) + CNTK::Times(createProjectionParam(cellDim, inputDim), input);
auto projectInput = [input, cellDim, createBiasParam, createProjectionParam]() {
return createBiasParam(cellDim) + CNTK::Times(createProjectionParam(cellDim), input);
};
// Input gate
auto it = CNTK::Sigmoid(projectInput() + CNTK::Times(createProjectionParam(cellDim, outputDim), stabilizedPrevOutput) + CNTK::ElementTimes(createDiagWeightParam(cellDim), stabilizedPrevCellState));
auto bit = CNTK::ElementTimes(it, CNTK::Tanh(projectInput() + CNTK::Times(createProjectionParam(cellDim, outputDim), stabilizedPrevOutput)));
auto it = CNTK::Sigmoid(projectInput() + CNTK::Times(createProjectionParam(cellDim), stabilizedPrevOutput) + CNTK::ElementTimes(createDiagWeightParam(cellDim), stabilizedPrevCellState));
auto bit = CNTK::ElementTimes(it, CNTK::Tanh(projectInput() + CNTK::Times(createProjectionParam(cellDim), stabilizedPrevOutput)));
// Forget-me-not gate
auto ft = CNTK::Sigmoid(projectInput() + CNTK::Times(createProjectionParam(cellDim, outputDim), stabilizedPrevOutput) + CNTK::ElementTimes(createDiagWeightParam(cellDim), stabilizedPrevCellState));
auto ft = CNTK::Sigmoid(projectInput() + CNTK::Times(createProjectionParam(cellDim), stabilizedPrevOutput) + CNTK::ElementTimes(createDiagWeightParam(cellDim), stabilizedPrevCellState));
auto bft = CNTK::ElementTimes(ft, prevCellState);
auto ct = bft + bit;
// Output gate
auto ot = CNTK::Sigmoid(projectInput() + CNTK::Times(createProjectionParam(cellDim, outputDim), stabilizedPrevOutput) + CNTK::ElementTimes(createDiagWeightParam(cellDim), Stabilize<ElementType>(ct, device)));
auto ot = CNTK::Sigmoid(projectInput() + CNTK::Times(createProjectionParam(cellDim), stabilizedPrevOutput) + CNTK::ElementTimes(createDiagWeightParam(cellDim), Stabilize<ElementType>(ct, device)));
auto ht = CNTK::ElementTimes(ot, CNTK::Tanh(ct));
auto c = ct;
auto h = (outputDim != cellDim) ? CNTK::Times(createProjectionParam(outputDim, cellDim), Stabilize<ElementType>(ht, device)) : ht;
auto h = (outputDim != cellDim) ? CNTK::Times(createProjectionParam(outputDim), Stabilize<ElementType>(ht, device)) : ht;
return{ h, c };
}

View file

@ -99,18 +99,14 @@ void TestReduceSum(size_t sampleRank, const DeviceDescriptor& device)
// Test ReduceSum along a dynamic axis
{
auto testReduceSum = [&sequences, &sequenceLengths, inputShape, sequencesValue, device](const Axis& axis)
auto testReduceSum = [&sequences, &sequenceLengths, inputShape, sequencesValue, device]()
{
if (!axis.IsDynamicAxis())
RuntimeError("Called the dynamic axis ReduceSum test with a static axis");
size_t maxActualSequenceLength = sequencesValue->Shape()[inputShape.Rank()];
size_t numSequences = sequencesValue->Shape()[inputShape.Rank() + 1];
auto inputVar = InputVariable({ inputShape }, DataType::Float, L"input");
FunctionPtr reduceSumFunc = ReduceSum(inputVar, axis);
FunctionPtr reduceSumFunc = Sequence::ReduceSum(inputVar);
NDShape maskShape = { ((axis == Axis::DefaultBatchAxis()) ? maxActualSequenceLength : 1), ((axis == Axis::DefaultBatchAxis()) ? 1 : numSequences) };
NDShape maskShape = { 1, numSequences };
NDShape outputShape = reduceSumFunc->Output().Shape();
auto outputDataShape = outputShape.AppendShape(maskShape);
@ -130,10 +126,7 @@ void TestReduceSum(size_t sampleRank, const DeviceDescriptor& device)
for (size_t k = 0; k < inputShape.TotalSize(); ++k)
{
float value = sequences[i][(j * inputShape.TotalSize()) + k];
if (axis == Axis::DefaultBatchAxis())
expectedTotals[(j * inputShape.TotalSize()) + k] += value;
else
expectedTotals[(i * inputShape.TotalSize()) + k] += value;
expectedTotals[(i * inputShape.TotalSize()) + k] += value;
}
}
}
@ -141,7 +134,7 @@ void TestReduceSum(size_t sampleRank, const DeviceDescriptor& device)
FloatingPointVectorCompare(outputData, expectedTotals, "testReduceSum: Forward prop results do not match expected results");
};
testReduceSum(Axis::DefaultDynamicAxis());
testReduceSum();
}
}
@ -217,11 +210,8 @@ void TestSlice(size_t sampleRank, const DeviceDescriptor& device)
// Test slice along a dynamic axis
{
auto testDynamicAxisSlice = [&sequences, &sequenceLengths, inputShape, sequencesValue, device](const Axis& axis, int beginOffset, int endOffset)
auto testDynamicAxisSlice = [&sequences, &sequenceLengths, inputShape, sequencesValue, device](int beginOffset, int endOffset)
{
if (!axis.IsDynamicAxis())
RuntimeError("Called the dynamic axis slice test with a static axis");
size_t maxActualSequenceLength = sequencesValue->Shape()[inputShape.Rank()];
size_t numSequences = sequencesValue->Shape()[inputShape.Rank() + 1];
@ -229,11 +219,11 @@ void TestSlice(size_t sampleRank, const DeviceDescriptor& device)
size_t maxSliceLength = (endAndBeginOffsetDiff > 0) ? endAndBeginOffsetDiff : maxActualSequenceLength + endAndBeginOffsetDiff;
auto inputVar = InputVariable(inputShape, DataType::Float, L"input");
auto sliceFunc = Slice(inputVar, axis, beginOffset, endOffset);
auto sliceFunc = Sequence::Slice(inputVar, beginOffset, endOffset);
sliceFunc = sliceFunc + sliceFunc;
size_t outputSequenceAxisLength = (axis == Axis::DefaultDynamicAxis()) ? maxSliceLength : maxActualSequenceLength;
size_t outputBatchAxisLength = (axis == Axis::DefaultBatchAxis()) ? maxSliceLength : numSequences;
size_t outputSequenceAxisLength = maxSliceLength;
size_t outputBatchAxisLength = numSequences;
NDShape outputShape = sliceFunc->Output().Shape().AppendShape({ outputSequenceAxisLength, outputBatchAxisLength });
std::vector<float> outputData(outputShape.TotalSize(), 0);
NDMaskPtr mask;
@ -247,15 +237,15 @@ void TestSlice(size_t sampleRank, const DeviceDescriptor& device)
std::unordered_map<Variable, ValuePtr> outputs = { { sliceFunc->Output(), outputValue } };
sliceFunc->Forward({ { inputVar, sequencesValue } }, outputs, device);
size_t startSequenceIdx = (axis == Axis::DefaultBatchAxis()) ? ((beginOffset >= 0) ? beginOffset : (numSequences + beginOffset)) : 0;
size_t endSequenceIdx = (axis == Axis::DefaultBatchAxis()) ? ((endOffset > 0) ? endOffset : (numSequences + endOffset)) : numSequences;
size_t startSequenceIdx = 0;
size_t endSequenceIdx = numSequences;
std::vector<float> expectedOutputValues(inputShape.TotalSize() * outputSequenceAxisLength * outputBatchAxisLength);
for (size_t i = startSequenceIdx; i < endSequenceIdx; ++i)
{
size_t currentSequenceLength = sequenceLengths[i];
size_t startFrameIdx = (axis == Axis::DefaultDynamicAxis()) ? ((beginOffset >= 0) ? beginOffset : (currentSequenceLength + beginOffset)) : 0;
size_t endFrameIdx = (axis == Axis::DefaultDynamicAxis()) ? ((endOffset > 0) ? endOffset : (currentSequenceLength + endOffset)) : currentSequenceLength;
size_t startFrameIdx = ((beginOffset >= 0) ? beginOffset : (currentSequenceLength + beginOffset));
size_t endFrameIdx = ((endOffset > 0) ? endOffset : (currentSequenceLength + endOffset));
size_t j = startFrameIdx;
for (; j < endFrameIdx; ++j)
{
@ -272,12 +262,12 @@ void TestSlice(size_t sampleRank, const DeviceDescriptor& device)
FloatingPointVectorCompare(outputData, expectedOutputValues, "testDynamicAxisSlice: Forward prop results do not match expected results");
};
testDynamicAxisSlice(Axis::DefaultDynamicAxis(), 0, 1);
testDynamicAxisSlice(Axis::DefaultDynamicAxis(), 0, 2);
testDynamicAxisSlice(Axis::DefaultDynamicAxis(), -1, 0);
testDynamicAxisSlice(Axis::DefaultDynamicAxis(), -2, 0);
testDynamicAxisSlice(Axis::DefaultDynamicAxis(), 0, -1);
testDynamicAxisSlice(Axis::DefaultDynamicAxis(), 1, 0);
testDynamicAxisSlice(0, 1);
testDynamicAxisSlice(0, 2);
testDynamicAxisSlice(-1, 0);
testDynamicAxisSlice(-2, 0);
testDynamicAxisSlice(0, -1);
testDynamicAxisSlice(1, 0);
}
}
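The mechanical rewrite above tracks an API move: slicing and reduction along the default dynamic (sequence) axis now go through the CNTK::Sequence namespace rather than the Axis-parameterized free functions. A minimal before/after sketch, assuming a sequence-typed inputVar as in these tests:

// Before: generic ops taking a dynamic Axis argument.
auto reducedOld = ReduceSum(inputVar, Axis::DefaultDynamicAxis());
auto slicedOld  = Slice(inputVar, Axis::DefaultDynamicAxis(), /*beginOffset=*/1, /*endOffset=*/0);

// After: dedicated sequence-axis variants with no Axis argument.
auto reducedNew = Sequence::ReduceSum(inputVar);
auto slicedNew  = Sequence::Slice(inputVar, /*beginOffset=*/1, /*endOffset=*/0);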

View file

@ -6,7 +6,7 @@ using namespace CNTK;
using namespace std::placeholders;
void TrainSequenceToSequenceTranslator(const DeviceDescriptor& device, bool useSparseInputs, bool testSaveAndReLoad, bool testCheckpointing, bool addBeamSearchReorderingHook, bool testCloning)
void TrainSequenceToSequenceTranslator(const DeviceDescriptor& device, bool useSparseInputs, bool testSaveAndReLoad, bool testCheckpointing, bool addBeamSearchReorderingHook, bool testCloning, bool usePlaceholders)
{
using namespace std::placeholders;
@ -30,7 +30,7 @@ void TrainSequenceToSequenceTranslator(const DeviceDescriptor& device, bool useS
FunctionPtr inputSequence = Alias(rawInput, L"inputSequence");
// Drop the sentence start token from the label, for decoder training
auto labelSequence = Slice(rawLabels, labelDynamicAxes[0], 1, 0, L"labelSequenceWithStartTrimmed");
auto labelSequence = Sequence::Slice(rawLabels, 1, 0, L"labelSequenceWithStartTrimmed");
auto labelSentenceStart = Sequence::First(rawLabels, L"labelSequenceStart");
auto isFirstLabel = Sequence::IsFirst(labelSequence, L"isFirstLabel");
@ -38,8 +38,8 @@ void TrainSequenceToSequenceTranslator(const DeviceDescriptor& device, bool useS
bool forceEmbedding = useSparseInputs;
/* Embeddings */
auto inputEmbeddingWeights = Parameter({ inputEmbeddingDim, inputVocabDim }, DataType::Float, GlorotUniformInitializer(), device, L"inputEmbeddingWeights");
auto labelEmbeddingWeights = Parameter({ labelEmbeddingDim, labelVocabDim }, DataType::Float, GlorotUniformInitializer(), device, L"labelEmbeddingWeights");
auto inputEmbeddingWeights = Parameter({ inputEmbeddingDim, NDShape::InferredDimension }, DataType::Float, GlorotUniformInitializer(), device, L"inputEmbeddingWeights");
auto labelEmbeddingWeights = Parameter({ labelEmbeddingDim, NDShape::InferredDimension }, DataType::Float, GlorotUniformInitializer(), device, L"labelEmbeddingWeights");
auto inputEmbedding = Alias((!forceEmbedding && (inputVocabDim <= inputEmbeddingDim)) ? inputSequence : Times(inputEmbeddingWeights, inputSequence), L"inputEmbedding");
auto labelEmbedding = Alias((!forceEmbedding && (labelVocabDim <= labelEmbeddingDim)) ? labelSequence : Times(labelEmbeddingWeights, labelSequence), L"labelEmbedding");
@ -63,8 +63,20 @@ void TrainSequenceToSequenceTranslator(const DeviceDescriptor& device, bool useS
labelSentenceStartEmbeddedScattered = Reshape(labelSentenceStartEmbeddedScattered, labelSentenceStartEmbeddedScattered->Output().Shape().AppendShape({ 1 }), L"labelSentenceStartEmbeddedScattered");
}
auto thoughtVectorBroadcastH = Sequence::BroadcastAs(thoughtVectorH, labelEmbedding, L"thoughtVectorBroadcastH");
auto thoughtVectorBroadcastC = Sequence::BroadcastAs(thoughtVectorC, labelEmbedding, L"thoughtVectorBroadcastC");
auto actualThoughtVectorBroadcastH = Sequence::BroadcastAs(thoughtVectorH, labelEmbedding, L"thoughtVectorBroadcastH");
auto actualThoughtVectorBroadcastC = Sequence::BroadcastAs(thoughtVectorC, labelEmbedding, L"thoughtVectorBroadcastC");
Variable thoughtVectorBroadcastH, thoughtVectorBroadcastC;
if (usePlaceholders)
{
thoughtVectorBroadcastH = PlaceholderVariable();
thoughtVectorBroadcastC = PlaceholderVariable();
}
else
{
thoughtVectorBroadcastH = actualThoughtVectorBroadcastH;
thoughtVectorBroadcastC = actualThoughtVectorBroadcastC;
}
/* Decoder */
auto beamSearchReorderHook = Constant({ 1, 1 }, 1.0f, device);
@ -116,6 +128,10 @@ void TrainSequenceToSequenceTranslator(const DeviceDescriptor& device, bool useS
auto biasWeights = Parameter({ labelVocabDim }, 0.0f, device);
auto z = Plus(Times(outputLayerProjWeights, Stabilize<float>(decoderOutput, device)), biasWeights, L"classifierOutput");
if (usePlaceholders)
z->ReplacePlaceholders({ { thoughtVectorBroadcastH, actualThoughtVectorBroadcastH }, { thoughtVectorBroadcastC, actualThoughtVectorBroadcastC } });
auto ce = CrossEntropyWithSoftmax(z, labelSequence, L"lossFunction");
auto errs = ClassificationError(z, labelSequence, L"classificationError");
@ -218,8 +234,8 @@ void TrainSequenceToSequenceTranslator()
fprintf(stderr, "\nTrainSequenceToSequenceTranslator..\n");
// TODO: Also test with sparse input variables in the graph
TrainSequenceToSequenceTranslator(DeviceDescriptor::CPUDevice(), false, true, false, true, true);
TrainSequenceToSequenceTranslator(DeviceDescriptor::CPUDevice(), false, true, false, false, true, true);
if (IsGPUAvailable())
TrainSequenceToSequenceTranslator(DeviceDescriptor::GPUDevice(0), false, false, true, false, false);
TrainSequenceToSequenceTranslator(DeviceDescriptor::GPUDevice(0), false, false, true, true, false, false);
}
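The new usePlaceholders path exercises deferred graph binding: the decoder is wired against PlaceholderVariables, and the real thought vectors are substituted only after the classifier output z has been built. Stripped to the pattern (BuildDecoderOutput is a hypothetical stand-in for the decoder construction in the hunk above):

// Build part of the graph against placeholders...
Variable thoughtH = PlaceholderVariable();
Variable thoughtC = PlaceholderVariable();
auto z = BuildDecoderOutput(thoughtH, thoughtC); // hypothetical helper, not in the diff

// ...then bind the placeholders to the actual computed variables.
z->ReplacePlaceholders({ { thoughtH, actualThoughtVectorBroadcastH },
                         { thoughtC, actualThoughtVectorBroadcastC } });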

View file

@ -19,6 +19,8 @@
%rename(gpu_device) CNTK::DeviceDescriptor::GPUDevice;
%rename(cpu_device) CNTK::DeviceDescriptor::CPUDevice;
%rename(times_transpose) CNTK::TransposeTimes;
%rename(sequence_slice) CNTK::Sequence::Slice;
%rename(sequence_reduce_sum) CNTK::Sequence::ReduceSum;
%rename(momentum_as_time_constant_schedule) CNTK::MomentumAsTimeConstantSchedule;
@ -42,7 +44,6 @@
%template() std::vector<CNTK::Axis>;
%template() std::vector<CNTK::DeviceDescriptor>;
%template() std::vector<CNTK::StreamConfiguration>;
//%template() std::vector<CNTK::DictionaryValue>;
%template() std::vector<std::shared_ptr<CNTK::Function>>;
%template() std::vector<std::shared_ptr<CNTK::Learner>>;
%template() std::pair<size_t, double>;
@ -74,7 +75,7 @@
//
%feature("shadow") CNTK::Variable::DynamicAxes %{
def dynamic_axes(self):
return ($action(self))[::-1]
return tuple(reversed($action(self)))
%}
%fragment("NDShapeToTuple", "header")
@ -86,7 +87,7 @@ def dynamic_axes(self):
for (size_t i=0; i<rank; i++)
{
size_t dim = (&shape)->operator[](i);
PyTuple_SetItem(result, i, PyInt_FromLong(dim));
PyTuple_SetItem(result, rank-i-1, PyInt_FromLong(dim));
}
return result;
}
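Note on the reversed indexing above (and the reversed tuple-to-NDShape loop further down): both converters now flip dimension order, so the Python layer consistently sees shapes and dynamic axes in the opposite order from the C++ core. As a hypothetical round trip under this convention, the Python tuple (2, 3) arrives in C++ as CNTK::NDShape({3, 2}) and converts back to (2, 3).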
@ -160,6 +161,57 @@ def dynamic_axes(self):
}
}
//
// Converting Python list {DictionaryValue} to std::vector
//
%typecheck(1000) std::vector<CNTK::DictionaryValue>& {
// '1000' is the typecheck precedence code. It means: check after basic
// types, but before arrays. See: http://www.swig.org/Doc1.3/Typemaps.html#Typemaps_overloading
$1 = PyList_Check($input) ? 1 : 0;
}
%typemap(in) std::vector<CNTK::DictionaryValue>& {
if (PyList_Check($input)) {
std::vector<CNTK::DictionaryValue>* vec = new std::vector<CNTK::DictionaryValue>();
PyObject *item;
PyObject *iterator = PyObject_GetIter($input);
if (iterator == NULL) {
SWIG_exception_fail(SWIG_ValueError, "cannot convert list element to CNTK::DictionaryValue");
}
while ((item = PyIter_Next(iterator))) {
void *raw_var = 0 ;
int res1 = SWIG_ConvertPtr(item, &raw_var, SWIGTYPE_p_CNTK__DictionaryValue, 0);
if (!SWIG_IsOK(res1)) {
SWIG_exception_fail(SWIG_ArgError(res1), "cannot convert list element to CNTK::DictionaryValue");
}
if (!raw_var) {
SWIG_exception_fail(SWIG_ValueError, "invalid null reference when converting a list element to CNTK::DictionaryValue");
}
CNTK::DictionaryValue* var = reinterpret_cast<CNTK::DictionaryValue*>(raw_var);
vec->push_back(*var);
Py_DECREF(item);
}
Py_DECREF(iterator);
if (PyErr_Occurred()) {
SWIG_exception_fail(SWIG_ValueError, "cannot convert list element to CNTK::DictionaryValue");
}
$1 = vec;
} else {
SWIG_exception(SWIG_ValueError, "list expected");
}
}
%fragment("DictionaryValueToPy", "header", fragment="NDShapeToTuple", fragment="NDArrayViewToNumPy")
{
PyObject *DictionaryValueToPy(const CNTK::DictionaryValue& dictVal)
@ -340,10 +392,10 @@ fail:
%typemap(in) CNTK::NDShape const & {
if (PyTuple_Check($input)) {
std::vector<size_t> dimensions;
size_t rank = PyTuple_Size($input);
std::vector<size_t> dimensions(rank);
for (size_t i=0; i<rank; i++)
dimensions.push_back(PyLong_AsLong(PyTuple_GET_ITEM($input, i)));
dimensions[i] = PyLong_AsLong(PyTuple_GET_ITEM($input, rank-i-1));
$1 = new CNTK::NDShape(dimensions);
} else {
@ -405,97 +457,60 @@ fail:
//
// Converting Python dictionary {Variable: ValuePtr} to std::unordered_map
//
%typecheck(1000) const std::unordered_map<CNTK::Variable, const CNTK::ValuePtr>&, std::unordered_map<CNTK::Variable, CNTK::ValuePtr>& {
%define %unordered_map_conversion(KEY_TYPE, VALUE_TYPE, SWIG_KEY_TYPE, SWIG_VALUE_TYPE)
// '1000' is the typecheck precedence code. It means: check after basic
// types, but before arrays. See: http://www.swig.org/Doc1.3/Typemaps.html#Typemaps_overloading
$1 = PyDict_Check($input) ? 1 : 0;
}
%typecheck(1000) std::unordered_map<KEY_TYPE, VALUE_TYPE> const&,
const std::unordered_map<KEY_TYPE, VALUE_TYPE>&,
std::unordered_map<KEY_TYPE, VALUE_TYPE>&
{ $1 = PyDict_Check($input) ? 1 : 0; }
%typemap(in) const std::unordered_map<CNTK::Variable, const CNTK::ValuePtr>& (
std::unordered_map<CNTK::Variable, const CNTK::ValuePtr> args_map
) {
if (PyDict_Check($input)) {
%typemap(in) std::unordered_map<KEY_TYPE, VALUE_TYPE>& (
std::unordered_map<KEY_TYPE, VALUE_TYPE> args_map
) {
if (PyDict_Check($input)) {
PyObject *key, *value;
Py_ssize_t pos = 0;
PyObject *key, *value;
Py_ssize_t pos = 0;
while (PyDict_Next($input, &pos, &key, &value)) {
void *raw_var = 0 ;
int res1 = SWIG_ConvertPtr(key, &raw_var, SWIG_KEY_TYPE, 0);
if (!SWIG_IsOK(res1)) {
SWIG_exception_fail(SWIG_ArgError(res1), "cannot convert key of dictionary");
}
if (!raw_var) {
SWIG_exception_fail(SWIG_ValueError, "invalid null reference when converting key of dictionary");
}
KEY_TYPE* var = reinterpret_cast<KEY_TYPE*>(raw_var);
void *raw_value = 0;
int res2 = SWIG_ConvertPtr(value, &raw_value, SWIG_VALUE_TYPE, 0);
if (!SWIG_IsOK(res2)) {
SWIG_exception_fail(SWIG_ArgError(res2), "cannot convert value of dictionary");
}
VALUE_TYPE* value;
if (raw_value) {
value = reinterpret_cast<VALUE_TYPE*>(raw_value);
args_map.insert(std::make_pair(*var, *value));
} else {
// We got an empty VALUE_TYPE, which carries a nullptr.
// This is only used for ValuePtr
args_map.insert(std::make_pair(*var, VALUE_TYPE()));
}
while (PyDict_Next($input, &pos, &key, &value)) {
void *raw_var = 0 ;
int res1 = SWIG_ConvertPtr(key, &raw_var, SWIGTYPE_p_CNTK__Variable, 0);
if (!SWIG_IsOK(res1)) {
SWIG_exception_fail(SWIG_ArgError(res1), "cannot convert key of dictionary to CNTK::Variable");
}
if (!raw_var) {
SWIG_exception_fail(SWIG_ValueError, "invalid null reference when converting key of dictionary to CNTK::Variable");
}
CNTK::Variable* var = reinterpret_cast<CNTK::Variable*>(raw_var);
$1 = &args_map;
} else {
SWIG_exception(SWIG_TypeError, "dictionary expected");
}
}
%enddef
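The %enddef above closes a macro that parametrizes the dict-to-unordered_map typemaps which were previously hand-copied per key/value pair (one such copy, for {Parameter: NDArrayViewPtr}, is deleted further down). A hypothetical instantiation for the {Variable: ValuePtr} maps, not shown in this excerpt, would read: %unordered_map_conversion(CNTK::Variable, CNTK::ValuePtr, SWIGTYPE_p_CNTK__Variable, SWIGTYPE_p_std__shared_ptrT_CNTK__Value_t).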
void *raw_value = 0;
int res2 = SWIG_ConvertPtr(value, &raw_value, SWIGTYPE_p_std__shared_ptrT_CNTK__Value_t, 0);
if (!SWIG_IsOK(res2)) {
SWIG_exception_fail(SWIG_ArgError(res2), "cannot convert value of dictionary to CNTK::ValuePtr");
}
CNTK::ValuePtr* value;
if (raw_value) {
value = reinterpret_cast<CNTK::ValuePtr*>(raw_value);
args_map.insert(std::make_pair(*var, *value));
} else {
// We got an empty ValuePtr, which carries a nullptr.
args_map.insert(std::make_pair(*var, CNTK::ValuePtr()));
}
}
$1 = &args_map;
} else {
SWIG_exception(SWIG_TypeError, "dictionary expected");
}
}
// supporting the non-const version
%typemap(in) std::unordered_map<CNTK::Variable, CNTK::ValuePtr>& (
std::unordered_map<CNTK::Variable, CNTK::ValuePtr> args_map
) {
if (PyDict_Check($input)) {
PyObject *key, *value;
Py_ssize_t pos = 0;
while (PyDict_Next($input, &pos, &key, &value)) {
void *raw_var = 0 ;
int res1 = SWIG_ConvertPtr(key, &raw_var, SWIGTYPE_p_CNTK__Variable, 0);
if (!SWIG_IsOK(res1)) {
SWIG_exception_fail(SWIG_ArgError(res1), "cannot convert key of dictionary to CNTK::Variable");
}
if (!raw_var) {
SWIG_exception_fail(SWIG_ValueError, "invalid null reference when converting key of dictionary to CNTK::Variable");
}
CNTK::Variable* var = reinterpret_cast<CNTK::Variable*>(raw_var);
void *raw_value = 0;
int res2 = SWIG_ConvertPtr(value, &raw_value, SWIGTYPE_p_std__shared_ptrT_CNTK__Value_t, 0);
if (!SWIG_IsOK(res2)) {
SWIG_exception_fail(SWIG_ArgError(res2), "cannot convert value of dictionary to CNTK::ValuePtr");
}
CNTK::ValuePtr* value;
if (raw_value) {
value = reinterpret_cast<CNTK::ValuePtr*>(raw_value);
args_map.insert(std::make_pair(*var, *value));
} else {
// We got an empty ValuePtr, which carries a nullptr.
args_map.insert(std::make_pair(*var, CNTK::ValuePtr()));
}
}
$1 = &args_map;
} else {
SWIG_exception(SWIG_TypeError, "dictionary expected");
}
}
// For the output dict (the non-const unordered_map) we need to get the
// modified values and put them back into the dictionary. This is used, when
@ -727,368 +742,6 @@ fail:
}
}
//
// Converting Python dictionary {Parameter: NDArrayViewPtr} to std::unordered_map
//
%typecheck(1000) const std::unordered_map<CNTK::Parameter, CNTK::NDArrayViewPtr>& {
// '1000' is the typecheck precedence code. It means: check after basic
// types, but before arrays. See: http://www.swig.org/Doc1.3/Typemaps.html#Typemaps_overloading
$1 = PyDict_Check($input) ? 1 : 0;
}
%typemap(in) const std::unordered_map<CNTK::Parameter, CNTK::NDArrayViewPtr>& (
std::unordered_map<CNTK::Parameter, CNTK::NDArrayViewPtr> args_map
) {
if (PyDict_Check($input)) {
PyObject *key, *value;
Py_ssize_t pos = 0;
while (PyDict_Next($input, &pos, &key, &value)) {
void *raw_var = 0 ;
int res1 = SWIG_ConvertPtr(key, &raw_var, SWIGTYPE_p_CNTK__Parameter, 0);
if (!SWIG_IsOK(res1)) {
SWIG_exception_fail(SWIG_ArgError(res1), "cannot convert key of dictionary to CNTK::Parameter");
}
if (!raw_var) {
SWIG_exception_fail(SWIG_ValueError, "invalid null reference when converting key of dictionary to CNTK::Parameter");
}
CNTK::Parameter* var = reinterpret_cast<CNTK::Parameter*>(raw_var);
void *raw_value = 0;
int res2 = SWIG_ConvertPtr(value, &raw_value, SWIGTYPE_p_std__shared_ptrT_CNTK__NDArrayView_t, 0);
if (!SWIG_IsOK(res2)) {
SWIG_exception_fail(SWIG_ArgError(res2), "cannot convert value of dictionary to CNTK::NDArrayViewPtr");
}
CNTK::NDArrayViewPtr* value;
if (raw_value) {
value = reinterpret_cast<CNTK::NDArrayViewPtr*>(raw_value);
} else {
// We got an empty NDArrayViewPtr, which carries a nullptr.
value = new CNTK::NDArrayViewPtr();
}
args_map.insert(std::make_pair(*var, *value));
}
$1 = &args_map;
} else {
SWIG_exception(SWIG_TypeError, "dictionary expected");
}
}
//
// Converting Python list {DictionaryValue} to std::vector
//
%typecheck(1000) std::vector<CNTK::DictionaryValue>& {
// '1000' is the typecheck precedence code. It means: check after basic
// types, but before arrays. See: http://www.swig.org/Doc1.3/Typemaps.html#Typemaps_overloading
$1 = PyList_Check($input) ? 1 : 0;
}
%typemap(in) std::vector<CNTK::DictionaryValue>& {
if (PyList_Check($input)) {
std::vector<CNTK::DictionaryValue>* vec = new std::vector<CNTK::DictionaryValue>();
PyObject *item;
PyObject *iterator = PyObject_GetIter($input);
if (iterator == NULL) {
SWIG_exception_fail(SWIG_ValueError, "cannot convert list element to CNTK::DictionaryValue");
}
while ((item = PyIter_Next(iterator))) {
void *raw_var = 0 ;
int res1 = SWIG_ConvertPtr(item, &raw_var, SWIGTYPE_p_CNTK__DictionaryValue, 0);
if (!SWIG_IsOK(res1)) {
SWIG_exception_fail(SWIG_ArgError(res1), "cannot convert list element to CNTK::DictionaryValue");
}
if (!raw_var) {
SWIG_exception_fail(SWIG_ValueError, "invalid null reference when converting a list element to CNTK::DictionaryValue");
}
CNTK::DictionaryValue* var = reinterpret_cast<CNTK::DictionaryValue*>(raw_var);
vec->push_back(*var);
Py_DECREF(item);
}
Py_DECREF(iterator);
if (PyErr_Occurred()) {
SWIG_exception_fail(SWIG_ValueError, "cannot convert list element to CNTK::DictionaryValue");
}
$1 = vec;
} else {
SWIG_exception(SWIG_ValueError, "list expected");
}
}
// end of map conversion
// TODO: Parametrize the following four typemaps and unify set/list usage.
//
// Converting Python set {Variable} to std::unordered_set
//
%typecheck(1000) std::unordered_set<CNTK::Variable>& {
// '1000' is the typecheck precedence code. It means: check after basic
// types, but before arrays. See: http://www.swig.org/Doc1.3/Typemaps.html#Typemaps_overloading
$1 = PySet_Check($input) ? 1 : 0;
}
%typemap(in) std::unordered_set<CNTK::Variable>& (
std::unordered_set<CNTK::Variable> args_set
) {
if (PySet_Check($input)) {
PyObject *item;
PyObject *iterator = PyObject_GetIter($input);
if (iterator == NULL) {
SWIG_exception_fail(SWIG_ValueError, "cannot convert list element to CNTK::Variable");
}
while ((item = PyIter_Next(iterator))) {
void *raw_var = 0 ;
int res1 = SWIG_ConvertPtr(item, &raw_var, SWIGTYPE_p_CNTK__Variable, 0);
if (!SWIG_IsOK(res1)) {
SWIG_exception_fail(SWIG_ArgError(res1), "cannot convert set element to CNTK::Variable");
}
if (!raw_var) {
SWIG_exception_fail(SWIG_ValueError, "invalid null reference when converting a list element to CNTK::Variable");
}
CNTK::Variable* var = reinterpret_cast<CNTK::Variable*>(raw_var);
args_set.insert(*var);
Py_DECREF(item);
}
Py_DECREF(iterator);
if (PyErr_Occurred()) {
SWIG_exception_fail(SWIG_ValueError, "cannot convert set element to CNTK::Variable");
}
$1 = &args_set;
} else {
SWIG_exception(SWIG_ValueError, "set expected");
}
}
//
// Converting Python set {StreamInformation} to std::unordered_set
//
%typecheck(1000) std::unordered_set<CNTK::StreamInformation>& {
// '1000' is the typecheck precedence code. It means: check after basic
// types, but before arrays. See: http://www.swig.org/Doc1.3/Typemaps.html#Typemaps_overloading
$1 = PySet_Check($input) ? 1 : 0;
}
%typemap(in) std::unordered_set<CNTK::StreamInformation>& (
std::unordered_set<CNTK::StreamInformation> args_set
) {
if (PySet_Check($input)) {
PyObject *item;
PyObject *iterator = PyObject_GetIter($input);
if (iterator == NULL) {
SWIG_exception_fail(SWIG_ValueError, "cannot convert list element to CNTK::StreamInformation");
}
while ((item = PyIter_Next(iterator))) {
void *raw_var = 0 ;
int res1 = SWIG_ConvertPtr(item, &raw_var, SWIGTYPE_p_CNTK__StreamInformation, 0);
if (!SWIG_IsOK(res1)) {
SWIG_exception_fail(SWIG_ArgError(res1), "cannot convert set element to CNTK::StreamInformation");
}
if (!raw_var) {
SWIG_exception_fail(SWIG_ValueError, "invalid null reference when converting a set element to CNTK::StreamInformation");
}
CNTK::StreamInformation* var = reinterpret_cast<CNTK::StreamInformation*>(raw_var);
args_set.insert(*var);
Py_DECREF(item);
}
Py_DECREF(iterator);
if (PyErr_Occurred()) {
SWIG_exception_fail(SWIG_ValueError, "cannot convert set element to CNTK::StreamInformation");
}
$1 = &args_set;
} else {
SWIG_exception(SWIG_ValueError, "set expected");
}
}
//
// Converting Python list {Parameter} to std::unordered_set
//
%typecheck(1000) std::unordered_set<CNTK::Parameter>& {
// '1000' is the typecheck precedence code. It means: check after basic
// types, but before arrays. See: http://www.swig.org/Doc1.3/Typemaps.html#Typemaps_overloading
$1 = PyList_Check($input) ? 1 : 0;
}
%typemap(in) std::unordered_set<CNTK::Parameter>& (
std::unordered_set<CNTK::Parameter> args_set
) {
if (PyList_Check($input)) {
PyObject *item;
PyObject *iterator = PyObject_GetIter($input);
if (iterator == NULL) {
SWIG_exception_fail(SWIG_ValueError, "cannot convert list element to CNTK::Parameter");
}
while ((item = PyIter_Next(iterator))) {
void *raw_var = 0 ;
int res1 = SWIG_ConvertPtr(item, &raw_var, SWIGTYPE_p_CNTK__Parameter, 0);
if (!SWIG_IsOK(res1)) {
SWIG_exception_fail(SWIG_ArgError(res1), "cannot convert set element to CNTK::Parameter");
}
if (!raw_var) {
SWIG_exception_fail(SWIG_ValueError, "invalid null reference when converting a list element to CNTK::Parameter");
}
CNTK::Parameter* var = reinterpret_cast<CNTK::Parameter*>(raw_var);
args_set.insert(*var);
Py_DECREF(item);
}
Py_DECREF(iterator);
if (PyErr_Occurred()) {
SWIG_exception_fail(SWIG_ValueError, "cannot convert set element to CNTK::Parameter");
}
$1 = &args_set;
} else {
SWIG_exception(SWIG_ValueError, "list expected");
}
}
//
// Converting Python list {LearnerPtr} to std::unordered_set
//
%typecheck(1000) std::unordered_set<CNTK::LearnerPtr>& {
// '1000' is the typecheck precedence code. It means: check after basic
// types, but before arrays. See: http://www.swig.org/Doc1.3/Typemaps.html#Typemaps_overloading
$1 = PyList_Check($input) ? 1 : 0;
}
%typemap(in) std::unordered_set<CNTK::LearnerPtr>& (
std::unordered_set<CNTK::LearnerPtr> args_set
) {
if (PyList_Check($input)) {
PyObject *item;
PyObject *iterator = PyObject_GetIter($input);
if (iterator == NULL) {
SWIG_exception_fail(SWIG_ValueError, "cannot convert list element to CNTK::LearnerPtr");
}
while ((item = PyIter_Next(iterator))) {
void *raw_var = 0 ;
int res1 = SWIG_ConvertPtr(item, &raw_var, SWIGTYPE_p_std__shared_ptrT_CNTK__Learner_t, 0);
if (!SWIG_IsOK(res1)) {
SWIG_exception_fail(SWIG_ArgError(res1), "cannot convert list element to CNTK::LearnerPtr");
}
if (!raw_var) {
SWIG_exception_fail(SWIG_ValueError, "invalid null reference when converting a list element to CNTK::LearnerPtr");
}
CNTK::LearnerPtr* var = reinterpret_cast<CNTK::LearnerPtr*>(raw_var);
args_set.insert(*var);
Py_DECREF(item);
}
Py_DECREF(iterator);
if (PyErr_Occurred()) {
SWIG_exception_fail(SWIG_ValueError, "cannot convert list element to CNTK::LearnerPtr");
}
$1 = &args_set;
} else {
SWIG_exception(SWIG_ValueError, "list expected");
}
}
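// For illustration: the list typemap above is what lets Python callers pass a
// plain list of learners where the C++ API expects a
// std::unordered_set<CNTK::LearnerPtr>&. A sketch (z, ce and errs built as in
// the seq2seq example further down in this diff):
//
//     trainer = Trainer(z, ce, errs, [momentum_sgd(...)])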
%typecheck(1000) const std::unordered_map<CNTK::Variable, CNTK::Variable>& {
// '1000' is the typecheck precedence code. It means: check after basic
// types, but before arrays. See: http://www.swig.org/Doc1.3/Typemaps.html#Typemaps_overloading
$1 = PyDict_Check($input) ? 1 : 0;
}
%typemap(in) std::unordered_map<CNTK::Variable, CNTK::Variable>& (
std::unordered_map<CNTK::Variable, CNTK::Variable> args_map
) {
if (PyDict_Check($input)) {
PyObject *key, *value;
Py_ssize_t pos = 0;
while (PyDict_Next($input, &pos, &key, &value)) {
void *raw_var = 0 ;
int res1 = SWIG_ConvertPtr(key, &raw_var, SWIGTYPE_p_CNTK__Variable, 0);
if (!SWIG_IsOK(res1)) {
SWIG_exception_fail(SWIG_ArgError(res1), "cannot convert key of dictionary to CNTK::Variable");
}
if (!raw_var) {
SWIG_exception_fail(SWIG_ValueError, "invalid null reference when converting key of dictionary to CNTK::Variable");
}
CNTK::Variable* var = reinterpret_cast<CNTK::Variable*>(raw_var);
void *raw_value = 0;
int res2 = SWIG_ConvertPtr(value, &raw_value, SWIGTYPE_p_CNTK__Variable, 0);
if (!SWIG_IsOK(res2)) {
SWIG_exception_fail(SWIG_ArgError(res2), "cannot convert value of dictionary to CNTK::Variable");
}
// Rename to avoid shadowing the PyObject* 'value' from PyDict_Next above.
CNTK::Variable default_value;
CNTK::Variable* var_value;
if (raw_value) {
var_value = reinterpret_cast<CNTK::Variable*>(raw_value);
} else {
// We got an empty Variable, which carries a nullptr; use a default-constructed
// one on the stack instead of leaking a heap allocation.
var_value = &default_value;
}
args_map.insert(std::make_pair(*var, *var_value));
}
$1 = &args_map;
} else {
SWIG_exception(SWIG_TypeError, "dictionary expected");
}
}
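// One place this dict-of-Variables conversion surfaces in Python is variable
// substitution when cloning a Function. A sketch (variable names illustrative,
// method as exposed via cntk.ops.functions.CloneMethod):
//
//     z_shared = z.clone(CloneMethod.share, {old_input: new_input})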
//
// Converting std::unordered_set to Python list.
@ -1104,9 +757,9 @@ fail:
{
SWIG_exception(SWIG_RuntimeError, "error passing set to Python");
}
// *&$1 -> $1 is the returned result being converted (unordered_set<...>*),
// wrapped by SwigValueWrapper. So we need to unwrap it using '&',
// then access its value using '*'.
for (auto var : *&$1)
{
@ -1119,15 +772,58 @@ fail:
$result = container;
}
%enddef
%unordered_set_conversion(Variable, SWIGTYPE_p_CNTK__Variable)
%unordered_set_conversion(Constant, SWIGTYPE_p_CNTK__Constant)
%unordered_set_conversion(Parameter, SWIGTYPE_p_CNTK__Parameter)
%unordered_set_conversion(DistributedWorkerDescriptor, SWIGTYPE_p_CNTK__DistributedWorkerDescriptor)
%define %unordered_set_ref_conversion(DATA_TYPE, _SWIG_TYPE)
%typemap(out) std::unordered_set<CNTK::DATA_TYPE>& {
%typecheck(1000) std::unordered_set<DATA_TYPE>&, std::unordered_set<DATA_TYPE>const & {
// '1000' is the typecheck precedence code. It means: check after basic
// types, but before arrays. See: http://www.swig.org/Doc1.3/Typemaps.html#Typemaps_overloading
$1 = PySet_Check($input) || PyList_Check($input) ? 1 : 0;
}
%typemap(in) std::unordered_set<DATA_TYPE>& (
std::unordered_set<DATA_TYPE> args_set
) {
if (PySet_Check($input) || PyList_Check($input)) {
PyObject *item;
PyObject *iterator = PyObject_GetIter($input);
if (iterator == NULL) {
SWIG_exception_fail(SWIG_ValueError, "cannot convert element");
}
while ((item = PyIter_Next(iterator))) {
void *raw_var = 0 ;
int res1 = SWIG_ConvertPtr(item, &raw_var, _SWIG_TYPE, 0);
if (!SWIG_IsOK(res1)) {
SWIG_exception_fail(SWIG_ArgError(res1), "cannot convert set element");
}
if (!raw_var) {
SWIG_exception_fail(SWIG_ValueError, "invalid null reference");
}
DATA_TYPE* var = reinterpret_cast<DATA_TYPE*>(raw_var);
args_set.insert(*var);
Py_DECREF(item);
}
Py_DECREF(iterator);
if (PyErr_Occurred()) {
SWIG_exception_fail(SWIG_ValueError, "cannot convert set element");
}
$1 = &args_set;
} else {
SWIG_exception(SWIG_ValueError, "set expected");
}
}
%typemap(out) std::unordered_set<DATA_TYPE>& {
PyObject* container = PyList_New(0);
if (container == NULL)
{
@ -1136,7 +832,7 @@ fail:
for (auto var : *$1)
{
PyObject *item = SWIG_NewPointerObj(new CNTK::DATA_TYPE(var), _SWIG_TYPE, SWIG_POINTER_OWN );
PyObject *item = SWIG_NewPointerObj(new DATA_TYPE(var), _SWIG_TYPE, SWIG_POINTER_OWN );
// No error handling here, because the error will be passed directly to Python
PyList_Append(container, item);
Py_DECREF(item);
@ -1146,16 +842,23 @@ fail:
}
%enddef
%unordered_set_ref_conversion(StreamInformation, SWIGTYPE_p_CNTK__StreamInformation)
%unordered_set_ref_conversion(LearnerPtr, SWIGTYPE_p_std__shared_ptrT_CNTK__Learner_t)
%unordered_set_ref_conversion(Parameter, SWIGTYPE_p_CNTK__Parameter)
%unordered_set_ref_conversion(DistributedWorkerDescriptor, SWIGTYPE_p_CNTK__DistributedWorkerDescriptor)
%unordered_set_conversion(CNTK::Variable, SWIGTYPE_p_CNTK__Variable)
%unordered_set_conversion(CNTK::Constant, SWIGTYPE_p_CNTK__Constant)
%unordered_set_conversion(CNTK::Parameter, SWIGTYPE_p_CNTK__Parameter)
%unordered_set_conversion(CNTK::StreamInformation, SWIGTYPE_p_CNTK__StreamInformation)
%unordered_set_conversion(CNTK::DistributedWorkerDescriptor, SWIGTYPE_p_CNTK__DistributedWorkerDescriptor)
%unordered_set_ref_conversion(CNTK::Variable, SWIGTYPE_p_CNTK__Variable)
%unordered_set_ref_conversion(CNTK::Parameter, SWIGTYPE_p_CNTK__Parameter)
%unordered_set_ref_conversion(CNTK::StreamInformation, SWIGTYPE_p_CNTK__StreamInformation)
%unordered_set_ref_conversion(CNTK::LearnerPtr, SWIGTYPE_p_std__shared_ptrT_CNTK__Learner_t)
%unordered_set_ref_conversion(CNTK::DistributedWorkerDescriptor, SWIGTYPE_p_CNTK__DistributedWorkerDescriptor)
// Unordered map conversion
%define %unordered_map_ref_conversion(DATA_TYPE1, _SWIG_TYPE1, DATA_TYPE2, _SWIG_TYPE2)
%typemap(out) std::unordered_map<CNTK::DATA_TYPE1, CNTK::DATA_TYPE2>& {
%typemap(out) std::unordered_map<DATA_TYPE1, DATA_TYPE2>& {
PyObject* container = PyDict_New();
if (container == NULL)
{
@ -1167,8 +870,8 @@ fail:
// then access its value using '*'.
for (auto it : *$1)
{
PyObject *returned_var = SWIG_NewPointerObj(SWIG_as_voidptr(new CNTK::DATA_TYPE1(it.first)), _SWIG_TYPE1, SWIG_POINTER_OWN);
PyObject *returned_val = SWIG_NewPointerObj(SWIG_as_voidptr(new CNTK::DATA_TYPE2(it.second)), _SWIG_TYPE2, SWIG_POINTER_OWN);
PyObject *returned_var = SWIG_NewPointerObj(SWIG_as_voidptr(new DATA_TYPE1(it.first)), _SWIG_TYPE1, SWIG_POINTER_OWN);
PyObject *returned_val = SWIG_NewPointerObj(SWIG_as_voidptr(new DATA_TYPE2(it.second)), _SWIG_TYPE2, SWIG_POINTER_OWN);
PyDict_SetItem(container, returned_var, returned_val);
@ -1180,8 +883,15 @@ fail:
}
%enddef
%unordered_map_ref_conversion(StreamInformation, SWIGTYPE_p_CNTK__StreamInformation, MinibatchData, SWIGTYPE_p_CNTK__MinibatchData);
%unordered_map_ref_conversion(Parameter, SWIGTYPE_p_CNTK__Parameter, NDArrayViewPtr, SWIGTYPE_p_std__shared_ptrT_CNTK__NDArrayView);
%unordered_map_conversion(CNTK::Variable, const CNTK::ValuePtr, SWIGTYPE_p_CNTK__Variable, SWIGTYPE_p_std__shared_ptrT_CNTK__Value_t)
%unordered_map_conversion(CNTK::Variable, CNTK::ValuePtr, SWIGTYPE_p_CNTK__Variable, SWIGTYPE_p_std__shared_ptrT_CNTK__Value_t)
%unordered_map_conversion(CNTK::Variable, CNTK::Variable, SWIGTYPE_p_CNTK__Variable, SWIGTYPE_p_CNTK__Variable)
%unordered_map_conversion(CNTK::Parameter, const CNTK::NDArrayViewPtr, SWIGTYPE_p_CNTK__Parameter, SWIGTYPE_p_std__shared_ptrT_CNTK__NDArrayView_t)
%unordered_map_conversion(CNTK::Parameter, CNTK::NDArrayViewPtr, SWIGTYPE_p_CNTK__Parameter, SWIGTYPE_p_std__shared_ptrT_CNTK__NDArrayView_t)
%unordered_map_ref_conversion(CNTK::StreamInformation, SWIGTYPE_p_CNTK__StreamInformation, CNTK::MinibatchData, SWIGTYPE_p_CNTK__MinibatchData);
%unordered_map_ref_conversion(CNTK::Parameter, SWIGTYPE_p_CNTK__Parameter, CNTK::NDArrayViewPtr, SWIGTYPE_p_std__shared_ptrT_CNTK__NDArrayView);
%unordered_map_ref_conversion(CNTK::Variable, SWIGTYPE_p_CNTK__Variable, CNTK::Variable, SWIGTYPE_p_CNTK__Variable);
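// The out-typemaps above are why a C++ std::unordered_map return value
// arrives in Python as an ordinary dict; e.g. reading a minibatch yields a
// {StreamInformation: MinibatchData} dict. A sketch (attribute names
// illustrative):
//
//     mb = minibatch_source.next_minibatch(64)
//     for stream_info, data in mb.items():
//         print(stream_info.m_name, data.num_samples)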
%shared_ptr(CNTK::Function)
%shared_ptr(CNTK::NDArrayView)
@ -1206,7 +916,7 @@ fail:
%extend CNTK::NDMask {
PyObject* to_numpy() {
std::vector<size_t> cntk_dims = (*self).Shape().Dimensions();
static_assert(dims.size()==2, "mask requires exactly two dimensions");
static_assert(cntk_dims.size()==2, "mask requires exactly two dimensions");
std::vector<size_t> dimensions = {cntk_dims[1], cntk_dims[0]};
size_t num_elements = dimensions[0] * dimensions[1];
@ -1258,17 +968,17 @@ fail:
PyArrayObject* array = (PyArrayObject*)pyobj;
int rank = PyArray_NDIM(array);
npy_intp* np_shape = PyArray_SHAPE(array);
std::vector<size_t> shape;
std::vector<size_t> shape(rank);
npy_intp num_elements = 1;
// CNTK uses column major, thus we reverse the shape
for (int i=rank-1; i>=0; i--)
for (int i=0; i<rank; i++)
{
shape.push_back(np_shape[i]);
shape[rank-i-1] = np_shape[i];
num_elements *= np_shape[i];
}
int typecode = PyArray_TYPE(array);
@ -1342,7 +1052,7 @@ public:
// Setting up hash calculation so that __hash__ on Swig objects
// are redirected to the std::hash computation of the C++ API
//
%define %py_hash_for(DATA_TYPE, EQ)
%define %py_hash_for(DATA_TYPE)
%extend CNTK::DATA_TYPE {
const size_t __hash__() {
return std::hash<CNTK::DATA_TYPE>()(*$self);
@ -1357,14 +1067,16 @@ DATA_TYPE.__eq__ = lambda a,b: EQ(a,b)
%enddef
%py_eq_for(Variable, Variable_eq)
%py_eq_for(Constant, Variable_eq)
%py_eq_for(Parameter, Variable_eq)
%py_eq_for(NDShape, NDShape_eq)
%py_hash_for(Variable)
%py_hash_for(Variable, Variable_eq)
%py_hash_for(Constant, Variable_eq)
%py_hash_for(Parameter, Variable_eq)
%py_hash_for(NDShape, NDShape_eq)
%py_eq_for(Constant, Variable_eq)
%py_hash_for(Constant)
%py_eq_for(Parameter, Variable_eq)
%py_hash_for(Parameter)
%py_eq_for(NDShape, NDShape_eq)
%py_hash_for(NDShape)
%py_eq_for(DeviceDescriptor, DeviceDescriptor_eq)
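// With __eq__ and __hash__ wired through to the C++ comparisons, the wrapped
// objects behave as value types in Python; in particular a Variable can key
// the dicts used to feed data. A sketch:
//
//     x = input_variable(2)
//     arguments = {x: np.asarray([[1.0, 2.0]], dtype=np.float32)}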
@ -1395,4 +1107,3 @@ for klass in [Variable, Value, NDArrayView, NDMask]:
enable_reversing_tensor_shapes_in_error_messages()
%}

View file

@ -10,7 +10,7 @@ import numpy as np
abs_path = os.path.dirname(os.path.abspath(__file__))
def test_text_format():
def _test_text_format():
from cntk.io import text_format_minibatch_source, StreamConfiguration, MinibatchSource
# 0 |x 560 |y 1 0 0 0 0

View file

@ -59,9 +59,52 @@ def alias(x, name=''):
return alias(x, name)
##########################################################################
# evaluation ops
# loss and evaluation ops
##########################################################################
@typemap
def binary_cross_entropy(output, target, name=''):
r'''
This operation computes the binary cross entropy between the ``output`` and ``target``.
Example:
TBA
Args:
output: the computed posterior probability from the network
target: ground-truth label, 0 or 1
name (`str`, optional): the name of the Function instance in the network
Returns:
:class:`cntk.ops.functions.Function`
'''
from cntk.cntk_py import binary_cross_entropy
dtype = get_data_type(output, target)
output = sanitize_input(output, dtype)
target = sanitize_input(target, dtype)
return binary_cross_entropy(output, target, name)
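# A minimal usage sketch for the TBA example above (shapes illustrative; the
# computed quantity is the standard per-element
# -(target*log(output) + (1-target)*log(1-output))):
#
#     from cntk.ops import input_variable, binary_cross_entropy
#     o = input_variable(1)   # posterior probability in (0, 1)
#     t = input_variable(1)   # ground-truth label, 0 or 1
#     loss = binary_cross_entropy(o, t)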
@typemap
def weighted_binary_cross_entropy(output, target, weight, name=''):
r'''
This operation computes the weighted binary cross entropy between the ``output`` and ``target``.
Example:
TBA
Args:
output: the computed posterior probability from the network
target: ground-truth label, 0 or 1
weight: weight of each example
name (`str`, optional): the name of the Function instance in the network
Returns:
:class:`cntk.ops.functions.Function`
'''
from cntk.cntk_py import weighted_binary_cross_entropy
dtype = get_data_type(output, target, weight)
output = sanitize_input(output, dtype)
target = sanitize_input(target, dtype)
weight = sanitize_input(weight, dtype)
return weighted_binary_cross_entropy(output, target, weight, name)
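# Likewise for the weighted variant, a sketch (``weight`` scales each
# example's contribution to the loss):
#
#     w = input_variable(1)
#     loss = weighted_binary_cross_entropy(o, t, w)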
@typemap
def cross_entropy_with_softmax(output_vector, target_vector, axis=-1, name=''):
@ -185,17 +228,21 @@ def convolution(convolution_map, operand, strides=(1,), sharing=[True],
auto_padding=[True], lower_pad=(0,), upper_pad=(0,), transpose=False,
max_temp_mem_size_in_samples=0, name=''):
'''
Computes the convolution of a weight matrix with an image or tensor. This operation is used in image-processing applications
and language processing. It supports any dimensions, stride, sharing or padding.
Computes the convolution of ``convolution_map`` (typically a tensor of learnable parameters) with
``operand`` (commonly an image or output of a previous convolution/pooling operation).
This operation is used in image and language processing applications. It supports arbitrary
dimensions, strides, sharing, and padding.
This function operates on input tensors of the form [M1 x M2 x ... x Mn x inChannels]. This can be understood as a rank-n
object, where each entry consists of a inChannels-dimensional vector. For example, an RGB image would have dimensions
[W x H x 3], i.e. a [W x H]-sized structure, where each entry (pixel) consists of a 3-tuple (note, however, that the
memory-storage format is the concatenation of 3 planes of size [W x H]).
This function operates on input tensors with dimensions :math:`[C \\times M_1 \\times M_2 \\times \\ldots \\times M_n]`. This can be understood as a rank-n
object, where each entry consists of a :math:`C`-dimensional vector. For example, an RGB image would have dimensions
:math:`[3 \\times W \\times H]`, i.e. a :math:`[W \\times H]`-sized structure, where each entry (pixel) consists of a 3-tuple.
`convolution` convolves the input with n+1-dimensional filters, where the first n dimensions are the spatial extent of the
filter, and the last one must be equal to inChannels. There are outChannels filters. I.e. for each output position, a vector of
dimension outChannels is computed. Hence, the total number of filter parameters is (M1*M2*...*Mn) * inChannels * outChannels.
`convolution` convolves the input ``operand`` with a :math:`n+2` rank tensor of (typically learnable) filters called
``convolution_map`` of shape :math:`[O \\times I \\times m_1 \\times m_2 \\times \\ldots \\times m_n ]` (typically :math:`m_i \\ll M_i`).
The first dimension, :math:`O`, is the number of convolution filters (i.e. the number of
channels in the output). The second dimension, :math:`I`, must match the number of channels in the input.
The last n dimensions are the spatial extent of the filter. I.e. for each output position, a vector of
dimension :math:`O` is computed. Hence, the total number of filter parameters is :math:`O \\times I \\times m_1 \\times m_2 \\times \\ldots \\times m_n`
Example:
@ -210,12 +257,12 @@ def convolution(convolution_map, operand, strides=(1,), sharing=[True],
[ 36., 38., 40., 42.]]]]], dtype=float32)
Args:
convolution_map: convolution filter weights, stored as a tensor of dimensions [outChannels x M1 x M2 x ... x Mn],
where [M1 x M2 x ... x Mn] must be the kernel dimensions.
operand: convolution input. A tensor with dimensions [M1 x M2 x ... x Mn x inChannels].
strides (optional): stride dimensions. A stride > 1 means that only pixel positions that are multiples of the stride value are computed.
For example, a stride of 2 will lead to a halving of the dimensions. The last stride dimension that lines up with the number
of input channels must be equal to the number of input channels.
convolution_map: convolution filter weights, stored as a tensor of dimensions :math:`[O \\times I \\times m_1 \\times m_2 \\times \\ldots \\times m_n]`,
where :math:`[m_1 \\times m_2 \\times \\ldots \\times m_n]` must be the kernel dimensions (spatial extent of the filter).
operand: convolution input. A tensor with dimensions :math:`[I \\times M_1 \\times M_2 \\times \\ldots \\times M_n]`.
strides (`tuple`, optional): stride dimensions. If strides[i] > 1 then only pixel positions that are multiples of strides[i] are computed.
For example, a stride of 2 will lead to a halving of that dimension. The first stride dimension that lines up with the number
of input channels can be set to any non-zero value.
sharing (bool): sharing flags for each input dimension
auto_padding (bool): flags for each input dimension whether it should be padded automatically (that is,
symmetrically) or not padded at all. Padding means that the convolution kernel is applied to all pixel positions, where all
@ -235,9 +282,8 @@ def convolution(convolution_map, operand, strides=(1,), sharing=[True],
'''
from cntk.cntk_py import convolution
operand = sanitize_input(operand)
return convolution(convolution_map, operand, tuple(reversed(strides)), sharing, auto_padding,
tuple(reversed(lower_pad)), tuple(
reversed(upper_pad)), transpose,
return convolution(convolution_map, operand, tuple(strides), sharing, auto_padding,
tuple(lower_pad), tuple(upper_pad), transpose,
max_temp_mem_size_in_samples, name)
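# To make the filter-parameter count from the docstring concrete (a sketch
# with illustrative shapes): 64 filters over an RGB input with 5x5 kernels
# give a [64 x 3 x 5 x 5] convolution_map, i.e.
#
#     O, I, m1, m2 = 64, 3, 5, 5
#     num_filter_params = O * I * m1 * m2   # = 4800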
@ -333,7 +379,7 @@ def batch_normalization(operand, scale, bias, running_mean, running_inv_std, spa
spatial(`bool`): flag that indicates whether to compute mean/var for each feature in a minibatch
independently or, in case of convolutional layers, per feature map
normalization_time_constant(`float`, default 5000): time constant for computing running average of
mean and variance as a low-pass filtered version of the batch statistics.
blend_time_constant(`float`, default 0): constant for smoothing batch estimates with the running
statistics
epsilon: conditioner constant added to the variance when computing the inverse standard deviation
@ -1702,32 +1748,32 @@ def random_sample(weights, num_samples, allow_duplicates, name=''):
@typemap
def random_sample_inclusion_frequency(
weights,
num_samples,
allow_duplicates,
name=''):
'''
For weighted sampling with the specified sample size (`num_samples`)
this node computes the expected number of occurrences of each class
in the sampled set. In case of sampling without replacement
the result is only an estimate which might be quite rough in the
case of small sample sizes.
Intended uses are e.g. sampled softmax, noise contrastive
estimation etc.
This operation will typically be used together
with :func:`random_sample`.
Args:
weights: input vector of sampling weights which should be
non-negative numbers.
num_samples (`int`): number of expected samples
allow_duplicates (`bool`): whether sampling is done
with replacement (`True`) or without (`False`).
Examples:
>>> import numpy as np
>>> from cntk import *
>>> # weight vector with 100 '1000'-values followed
>>> # by 100 '1' values
>>> w1 = np.full((100),1000, dtype = np.float)
>>> w2 = np.full((100),1, dtype = np.float)
@ -1752,9 +1798,9 @@ def random_sample_inclusion_frequency(
weights = sanitize_input(weights)
return random_sample_inclusion_frequency(
weights,
num_samples,
allow_duplicates,
name)

View file

@ -63,6 +63,28 @@ def is_last(seq, name=''):
seq = sanitize_input(seq, get_data_type(seq))
return is_last(seq, name)
@typemap
def slice(seq, begin_index, end_index, name=''):
'''
Slice the input sequence.
Examples:
TBA
Args:
seq: sequence input tensor
begin_index (`int`): the index along sequence axis where the slicing starts
end_index (`int`): the index along sequence axis where the slicing ends
name (`str`, optional): the name of the Function instance in the network
See also:
Indexing in NumPy: http://docs.scipy.org/doc/numpy/reference/arrays.indexing.html
Returns:
:class:`cntk.ops.functions.Function`
'''
from cntk.cntk_py import sequence_slice
seq = sanitize_input(seq, get_data_type(seq))
return sequence_slice(seq, begin_index, end_index, name)
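# A sketch of the TBA example above, mirroring the seq2seq usage later in
# this diff (an end_index of 0 selects through the end of the sequence):
#
#     labels_no_start = sequence.slice(raw_labels, 1, 0)  # <s> A B C </s> --> A B C </s>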
@typemap
def first(seq, name=''):
@ -281,3 +303,21 @@ def broadcast_as(operand, broadcast_as_operand, name=''):
broadcast_as_operand = sanitize_input(
broadcast_as_operand, get_data_type(broadcast_as_operand))
return broadcast_as(operand, broadcast_as_operand, name)
@typemap
def reduce_sum(seq, name=''):
'''
Computes the sum of the input sequence's elements across the sequence axis.
Examples:
TBA
Args:
seq: sequence input tensor
name (`str`, optional): the name of the Function instance in the network
Returns:
:class:`cntk.ops.functions.Function`
'''
from cntk.cntk_py import sequence_reduce_sum
seq = sanitize_input(seq, get_data_type(seq))
return sequence_reduce_sum(seq, name)
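# A sketch of the TBA example above (``seq`` is any input with a sequence
# axis):
#
#     total = sequence.reduce_sum(seq)  # sum of each sequence's elements over the sequence axis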

View file

@ -19,8 +19,8 @@ TENSOR_PAIRS = [
([30.], [10.]),
([[10.]], [[30.]]),
([[1.5, 2.1]], [[10., 20.]]),
#([[100., 200.], [300., 400.], [10., 20.]],
# [[10., 20.], [30., 40.], [1., 2.]]),
([[100., 200.], [300., 400.], [10., 20.]],
[[10., 20.], [30., 40.], [1., 2.]]),
# Adding two 3x2 inputs of sequence length 1
([[30., 40.], [1., 2.], [0.1, 0.2]], [[10, 20], [3, 4], [-0.5, -0.4]]),
@ -175,6 +175,8 @@ NEGATE_TENSORS = [
([[100., 200.], [300., 400.], [10., 20.]]),
([[30, 40], [1, 2], [0.1, 0.2]])
]
@pytest.mark.parametrize("operand", NEGATE_TENSORS)
def test_op_negate(operand, device_id, precision):
t = -1 * AA(operand, dtype=PRECISION_TO_TYPE[precision])
@ -193,34 +195,41 @@ def test_op_negate(operand, device_id, precision):
_test_unary_op(precision, device_id, '-', operand,
expected_forward, expected_backward)
TIMES_PAIRS = [
# transpose_times currently only supports right operands of rank 1 or 2
TRANSPOSE_TIMES_PAIRS = [
([[30.]], [[10.]]),
([[1.5, 2.1]], [[10.], [20.]]),
([[100., 200.]], [[10.], [20.]]),
([[100., 200.]], [[-10.], [20.]]),
([[100., 200.], [300., 400.]], [[10.], [20.]]),
([[100., 200.], [300., 400.]], [[10., 20.], [20., 30.]])
([[100., 200.], [-300., 400.]], [[10., 20.], [20., 30.]]),
(np.reshape(np.arange(24), (4, 3, 2)),
np.array([[1, 3], [2, 4]])),
]
# TODO: Handle sparse matrices
# TODO: Handle sparse matrices (left_matrix_type, right_matrix_type)
# adding a rank 3 operand for times operation
TIMES_PAIRS = TRANSPOSE_TIMES_PAIRS + \
[(np.reshape(np.arange(8), (2, 2, 2)), np.reshape(np.arange(8), (2, 2, 2)))]
@pytest.mark.parametrize("left_operand, right_operand", TIMES_PAIRS)
def test_op_times(left_operand, right_operand, device_id, precision,
left_matrix_type, right_matrix_type):
def test_op_times(left_operand, right_operand, device_id, precision):
dt_precision = PRECISION_TO_TYPE[precision]
a = AA(left_operand, dtype=dt_precision)
b = AA(right_operand, dtype=dt_precision)
expected_forward = [[np.dot(a, b)]]
assert len(a.shape) == len(b.shape) == 2
expected_forward = [[np.tensordot(a, b, axes=len(b.shape) - 1)]]
left_backward = np.zeros_like(a)
left_backward[:, :] = b.sum(axis=1)
left_backward[...] = b.sum(axis=-1)
right_backward = np.zeros_like(b)
right_backward[:, :] = np.transpose([a.sum(axis=0)])
transpose_axes = list(np.roll(np.arange(len(b.shape)), -1))
sum_axes = tuple(np.arange(0, len(a.shape) - len(b.shape) + 1))
right_backward[...] = np.transpose(
AA([a.sum(axis=sum_axes)]), axes=transpose_axes)
expected_backward = {
'left_arg': [[left_backward]],
@ -231,3 +240,32 @@ def test_op_times(left_operand, right_operand, device_id, precision,
_test_binary_op(precision, device_id, times,
left_operand, right_operand, expected_forward, expected_backward)
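# The expected gradients above can be sanity-checked with plain NumPy: with an
# all-ones upstream gradient g, the gradient of sum(g * (a @ b)) w.r.t. a is
# g.dot(b.T), whose rows all equal b.sum(axis=-1), and w.r.t. b it is
# a.T.dot(g), whose columns all equal a.sum(axis=0). A self-contained sketch
# (not wired into the test suite):
def _check_times_backward_sketch():
    a = np.arange(6, dtype=np.float64).reshape(2, 3)
    b = np.arange(12, dtype=np.float64).reshape(3, 4)
    g = np.ones((2, 4))
    assert np.allclose(np.dot(g, b.T),
                       np.broadcast_to(b.sum(axis=-1), a.shape))
    assert np.allclose(np.dot(a.T, g),
                       np.broadcast_to(a.sum(axis=0)[:, None], b.shape))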
@pytest.mark.parametrize("left_operand, right_operand", TRANSPOSE_TIMES_PAIRS)
def test_op_transpose_times(left_operand, right_operand, device_id, precision):
dt_precision = PRECISION_TO_TYPE[precision]
# transpose right_operand to make the product possible
right_operand = np.transpose(right_operand).tolist()
a = AA(left_operand, dtype=dt_precision)
b = AA(right_operand, dtype=dt_precision)
expected_forward = [[np.dot(a, np.transpose(b))]]
left_backward = np.zeros_like(a)
left_backward[...] = b.sum(axis=tuple(range(len(b.shape) - 1)))
right_backward = np.zeros_like(b)
right_backward[...] = a.sum(axis=tuple(range(len(a.shape) - 1)))
expected_backward = {
'left_arg': [[left_backward]],
'right_arg': [[right_backward]]
}
from cntk import times_transpose
_test_binary_op(precision, device_id, times_transpose,
left_operand, right_operand, expected_forward, expected_backward)

View file

@ -166,8 +166,9 @@ def test_op_slice_sequence(input_data, slice_params, expected_result, device_id,
dynamic_axes=[Axis.default_batch_axis(), t],
name='a')
result = C.slice(a, axis=t, begin_index=slice_params[
0], end_index=slice_params[1])
result = C.sequence.slice(a,
begin_index=slice_params[0],
end_index=slice_params[1])
def grad_slice(x, beg_index, end_index):
res = np.zeros_like(x)
@ -176,8 +177,8 @@ def test_op_slice_sequence(input_data, slice_params, expected_result, device_id,
expected_gradient = grad_slice(np.asarray(input_data), *slice_params)
expected_forward = AA(
[expected_result], dtype=PRECISION_TO_TYPE[precision])
expected_forward = AA([expected_result],
dtype=PRECISION_TO_TYPE[precision])
expected_backward = {
a: [grad_slice(np.asarray(input_data), *slice_params)]
}

View file

@ -183,12 +183,9 @@ def get_temp_filename(directory=None):
def sanitize_shape(shape):
"""
If shape is scalar, it creates a tuple out of it and reverse it as cntk uses
column major.
If shape is scalar, it creates a tuple out of it.
"""
if np.isscalar(shape):
shape = (shape,)
return tuple(reversed(shape))
return _as_tuple(shape)
def sanitize_input(arg, fallback_dtype=np.float32, reshape=None):
@ -383,14 +380,15 @@ def sanitize_batch(var, batch, seq_starts=None, data_type=None, device=None):
'array and not "%s"' % type(batch))
from cntk.cntk_py import NDMask
mask = NDMask((max(seq_lens), num_seq), device)
mask = NDMask((num_seq, max(seq_lens)), device)
for idx, seq_len in enumerate(seq_lens):
if seq_starts is None:
mask.mark_sequence_begin((0, idx))
elif seq_starts[idx]:
if seq_starts is None or seq_starts[idx]:
mask.mark_sequence_begin((0, idx))
# The second parameter is specifying the rectangle of the mask that
# is invalid. As C++ is taking an NDShape, and we reverse the shape
# in the SWIG layer, we provide it here as row-major.
mask.invalidate_section((seq_len, idx),
(cntk_py.InferredDimension, 1))
(1, cntk_py.InferredDimension))
# Then we pad the batch to rectangular shape
if isinstance(batch, list):
@ -814,6 +812,17 @@ class _ClassFromDict(dict):
def Record(**kwargs):
return _ClassFromDict(kwargs)
# type-cast a shape given as a scalar into a tuple
def _as_tuple(x):
return x if (isinstance(x,tuple)) else (x,)
'''
Convert an argument to a tuple.
Args:
x: if scalar, it is wrapped as ``(x,)``; if iterable, it is
converted to a tuple.
Returns:
Tuple of ``x``.
'''
if np.isscalar(x):
x = (x,)
return tuple(x)
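# Behaviour sketch:
#
#     _as_tuple(3)       -> (3,)
#     _as_tuple((1, 2))  -> (1, 2)
#     _as_tuple([1, 2])  -> (1, 2)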

View file

@ -63,9 +63,9 @@ author = 'Microsoft'
# built documents.
#
# The short X.Y version.
version = '2.0.beta2.0'
version = '2.0.beta3.0'
# The full version, including alpha/beta/rc tags.
release = '2.0.beta2.0'
release = '2.0.beta3.0'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.

View file

@ -5,27 +5,27 @@ The best way to learn about the APIs currently is to look at the
following examples in the [CNTK clone root]/bindings/python/examples
directory:
- `MNIST <https://github.com/Microsoft/CNTK/blob/v2.0.beta2.0/bindings/python/examples/MNIST/SimpleMNIST.py>`__:
- `MNIST <https://github.com/Microsoft/CNTK/blob/v2.0.beta3.0/bindings/python/examples/MNIST/SimpleMNIST.py>`__:
A fully connected feed-forward model for classification of MNIST
images. (follow the instructions in
Examples/Image/DataSets/MNIST/README.md)
- `CifarResNet <https://github.com/Microsoft/CNTK/blob/v2.0.beta2.0/bindings/python/examples/CifarResNet/CifarResNet.py>`__:
- `CifarResNet <https://github.com/Microsoft/CNTK/blob/v2.0.beta3.0/bindings/python/examples/CifarResNet/CifarResNet.py>`__:
An image classification ResNet model for training on the CIFAR image
dataset. (follow the instructions in
Examples/Image/DataSets/CIFAR-10/README.md to get the CIFAR dataset
and convert it to the CNTK supported format)
- `SequenceClassification <https://github.com/Microsoft/CNTK/blob/v2.0.beta2.0/bindings/python/examples/SequenceClassification/SequenceClassification.py>`__:
- `SequenceClassification <https://github.com/Microsoft/CNTK/blob/v2.0.beta3.0/bindings/python/examples/SequenceClassification/SequenceClassification.py>`__:
An LSTM sequence classification model for text data.
- `Sequence2Sequence <https://github.com/Microsoft/CNTK/blob/v2.0.beta2.0/bindings/python/examples/Sequence2Sequence/Sequence2Sequence.py>`__:
- `Sequence2Sequence <https://github.com/Microsoft/CNTK/blob/v2.0.beta3.0/bindings/python/examples/Sequence2Sequence/Sequence2Sequence.py>`__:
A sequence to sequence grapheme to phoneme translation model that
trains on the CMUDict corpus.
- `NumpyInterop <https://github.com/Microsoft/CNTK/blob/v2.0.beta2.0/bindings/python/examples/NumpyInterop/FeedForwardNet.py>`__
- `NumpyInterop <https://github.com/Microsoft/CNTK/blob/v2.0.beta3.0/bindings/python/examples/NumpyInterop/FeedForwardNet.py>`__
- NumPy interoperability example showing how to train a simple feed-forward
network with training data fed using NumPy arrays.
- `LanguageUnderstanding <https://github.com/Microsoft/CNTK/blob/v2.0.beta2.0/bindings/python/examples/LanguageUnderstanding/LanguageUnderstanding.py>`__
- `LanguageUnderstanding <https://github.com/Microsoft/CNTK/blob/v2.0.beta3.0/bindings/python/examples/LanguageUnderstanding/LanguageUnderstanding.py>`__
- Language Understanding.

View file

@ -2,7 +2,7 @@
.. some aliases
.. _CNTK: http://cntk.ai/
Python API for CNTK (2.0.beta2.0)
Python API for CNTK (2.0.beta3.0)
=================================
CNTK_, the Microsoft Cognitive Toolkit, is a system for describing, training,
@ -12,7 +12,7 @@ neural networks (CNNs), recurrent neural networks (RNNs), long short term
memory (LSTM), logistic regression, and maximum entropy model. CNTK is an
implementation of computational networks that supports both CPU and GPU.
This page describes the Python API for CNTK_ version 2.0.beta2.0. This is an ongoing effort
This page describes the Python API for CNTK_ version 2.0.beta3.0. This is an ongoing effort
to expose such an API to the CNTK system, thus enabling the use of higher-level
tools such as IDEs to facilitate the definition of computational networks, to execute
them on sample data in real time.

View file

@ -1,23 +1,28 @@
Tutorials
===============
#. `Logistic Regression`_ with CNTK and NumPy
#. `Feed Forward Network`_ with CNTK and NumPy
#. Image 101 Feed Forward Classifier with MNIST data
#. CNTK 101: `Logistic Regression`_ with CNTK and NumPy
#. CNTK 102: `Feed Forward Network`_ with CNTK and NumPy
#. CNTK 103: Feed Forward image classifier with MNIST data
* Part A: `MNIST Data preparation`_
* Part A: `MNIST data preparation`_
* Part B: `Feed Forward Classifier`_
#. Image 201 ResNet Classifier with CIFAR-10 data
#. CNTK 201: Image classifiers with CIFAR-10 data
* Part A: `CIFAR-10 Data preparation`_
* Part B: `ResNet Classifier`_
.. _`Logistic Regression`: https://github.com/Microsoft/CNTK/tree/v2.0.beta2.0/bindings/python/tutorials/CNTK_101_LogisticRegression.ipynb
.. _`Feed Forward Network`: https://github.com/Microsoft/CNTK/tree/v2.0.beta2.0/bindings/python/tutorials/CNTK_102_FeedForward.ipynb
.. _`MNIST Data preparation`: https://github.com/Microsoft/CNTK/tree/v2.0.beta2.0/bindings/python/tutorials/CNTK_103A_MNIST_DataLoader.ipynb
.. _`Feed Forward Classifier`: https://github.com/Microsoft/CNTK/tree/v2.0.beta2.0/bindings/python/tutorials/CNTK_103B_MNIST_FeedForwardNetwork.ipynb
.. _`CIFAR-10 Data preparation`: https://github.com/Microsoft/CNTK/tree/v2.0.beta2.0/bindings/python/tutorials/CNTK_201A_CIFAR-10_DataLoader.ipynb
.. _`ResNet Classifier`: https://github.com/Microsoft/CNTK/tree/v2.0.beta2.0/bindings/python/tutorials/CNTK_201B_CIFAR-10_ImageHandsOn.ipynb
* Part B: `VGG and ResNet classifiers`_
#. CNTK 202: `Language understanding`_ with ATIS3 text data
#. CNTK 203: `Reinforcement learning basics`_ with OpenAI Gym data
.. _`Logistic Regression`: https://github.com/Microsoft/CNTK/tree/v2.0.beta3.0/bindings/python/tutorials/CNTK_101_LogisticRegression.ipynb
.. _`Feed Forward Network`: https://github.com/Microsoft/CNTK/tree/v2.0.beta3.0/bindings/python/tutorials/CNTK_102_FeedForward.ipynb
.. _`MNIST data preparation`: https://github.com/Microsoft/CNTK/tree/v2.0.beta3.0/bindings/python/tutorials/CNTK_103A_MNIST_DataLoader.ipynb
.. _`Feed Forward Classifier`: https://github.com/Microsoft/CNTK/tree/v2.0.beta3.0/bindings/python/tutorials/CNTK_103B_MNIST_FeedForwardNetwork.ipynb
.. _`CIFAR-10 Data preparation`: https://github.com/Microsoft/CNTK/tree/v2.0.beta3.0/bindings/python/tutorials/CNTK_201A_CIFAR-10_DataLoader.ipynb
.. _`VGG and ResNet classifiers`: https://github.com/Microsoft/CNTK/tree/v2.0.beta3.0/bindings/python/tutorials/CNTK_201B_CIFAR-10_ImageHandsOn.ipynb
.. _`Language understanding`: https://github.com/Microsoft/CNTK/blob/v2.0.beta3.0/bindings/python/tutorials/CNTK_202_Language_Understanding.ipynb
.. _`Reinforcement learning basics`: https://github.com/Microsoft/CNTK/blob/master/bindings/python/tutorials/CNTK_203_Reinforcement_Learning_Basics.ipynb

View file

@ -11,7 +11,7 @@ from cntk import Trainer, Axis, save_model, load_model #, text_format_minibatch_
from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs, INFINITELY_REPEAT, FULL_DATA_SWEEP
from cntk.device import cpu, set_default_device
from cntk.learner import momentum_sgd, momentum_as_time_constant_schedule
from cntk.ops import input_variable, cross_entropy_with_softmax, classification_error, sequence, slice, past_value, future_value, element_select, alias, hardmax
from cntk.ops import input_variable, cross_entropy_with_softmax, classification_error, sequence, past_value, future_value, element_select, alias, hardmax
from cntk.ops.functions import CloneMethod
abs_path = os.path.dirname(os.path.abspath(__file__))
@ -94,7 +94,7 @@ def sequence_to_sequence_translator(debug_output=False, run_test=False):
input_sequence = raw_input
# Drop the sentence start token from the label, for decoder training
label_sequence = slice(raw_labels, label_seq_axis, 1, 0) # <s> A B C </s> --> A B C </s>
label_sequence = sequence.slice(raw_labels, 1, 0) # <s> A B C </s> --> A B C </s>
label_sentence_start = sequence.first(raw_labels) # <s>
is_first_label = sequence.is_first(label_sequence) # <s> 0 0 0 ...
@ -239,7 +239,7 @@ def sequence_to_sequence_translator(debug_output=False, run_test=False):
z = load_model("seq2seq.dnn")
label_seq_axis = Axis('labelAxis')
label_sequence = slice(find_arg_by_name('raw_labels',z), label_seq_axis, 1, 0)
label_sequence = sequence.slice(find_arg_by_name('raw_labels',z), 1, 0)
ce = cross_entropy_with_softmax(z, label_sequence)
errs = classification_error(z, label_sequence)
trainer = Trainer(z, ce, errs, [momentum_sgd(

View file

@ -10,11 +10,11 @@ from cntk import Trainer, Axis #, text_format_minibatch_source, StreamConfigurat
from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs, INFINITELY_REPEAT, FULL_DATA_SWEEP
from cntk.device import cpu, set_default_device
from cntk.learner import sgd
from cntk.ops import input_variable, cross_entropy_with_softmax, classification_error
from cntk.ops import input_variable, cross_entropy_with_softmax, classification_error, sequence
abs_path = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.join(abs_path, "..", ".."))
from examples.common.nn import LSTMP_component_with_self_stabilization, embedding, linear_layer, select_last, print_training_progress
from examples.common.nn import LSTMP_component_with_self_stabilization, embedding, linear_layer, print_training_progress
# Creates the reader
def create_reader(path, is_training, input_dim, label_dim):
@ -28,7 +28,7 @@ def LSTM_sequence_classifer_net(input, num_output_classes, embedding_dim, LSTM_d
embedding_function = embedding(input, embedding_dim)
LSTM_function = LSTMP_component_with_self_stabilization(
embedding_function.output, LSTM_dim, cell_dim)[0]
thought_vector = select_last(LSTM_function)
thought_vector = sequence.last(LSTM_function)
return linear_layer(thought_vector, num_output_classes)

View file

@ -114,7 +114,6 @@ if IS_WINDOWS:
"/EHsc",
"/DEBUG",
"/Zi",
"/EHsc",
]
runtime_library_dirs = []
else:
@ -166,7 +165,7 @@ else:
kwargs = dict(package_data = package_data)
setup(name="cntk",
version="2.0.beta2.0",
version="2.0.beta3.0",
url="http://cntk.ai",
ext_modules=[cntk_module],
packages=packages,

View file

@ -10,7 +10,7 @@
"\n",
"This tutorial is targeted to individuals who are new to CNTK and to machine learning. In this tutorial, you will train a simple yet powerful machine learning model that is widely used in industry for a variety of applications. The model trained below scales to massive data sets in the most expeditious manner by harnessing computational scalability leveraging the computational resources you may have (one or more CPU cores, one or more GPUs, a cluster of CPUs or a cluster of GPUs), transparently via the CNTK library.\n",
"\n",
"The following notebook users Python APIs. If you are looking for this example in Brainscript, please look [here](https://github.com/Microsoft/CNTK/tree/v2.0.beta2.0/Examples/Tutorials/LogisticRegressionAndMultiClass). \n",
"The following notebook users Python APIs. If you are looking for this example in Brainscript, please look [here](https://github.com/Microsoft/CNTK/tree/v2.0.beta3.0/Examples/Tutorials/LogisticRegressionAndMultiClass). \n",
"\n",
"## Introduction\n",
"\n",

View file

@ -767,7 +767,7 @@
"\n",
"If you want to try running the tutorial from python command prompt. Please run the [FeedForwardNet.py][] example.\n",
"\n",
"[FeedForwardNet.py]: https://github.com/Microsoft/CNTK/blob/v2.0.beta2.0/bindings/python/examples/NumpyInterop/FeedForwardNet.py"
"[FeedForwardNet.py]: https://github.com/Microsoft/CNTK/blob/v2.0.beta3.0/bindings/python/examples/NumpyInterop/FeedForwardNet.py"
]
},
{

View file

@ -12,7 +12,7 @@
"\n",
"CNTK 103 tutorial is divided into two parts:\n",
"- Part A: Familiarize with the [MNIST][] database that will be used later in the tutorial\n",
"- [Part B](https://github.com/Microsoft/CNTK/blob/v2.0.beta2.0/bindings/python/tutorials/CNTK_103A_MNIST_DataLoader.ipynb): We will use the feedforward classifier used in CNTK 102 to classify digits in MNIST data set.\n",
"- [Part B](https://github.com/Microsoft/CNTK/blob/v2.0.beta3.0/bindings/python/tutorials/CNTK_103A_MNIST_DataLoader.ipynb): We will use the feedforward classifier used in CNTK 102 to classify digits in MNIST data set.\n",
"\n",
"[MNIST]: http://yann.lecun.com/exdb/mnist/\n",
"\n"

View file

@ -12,7 +12,7 @@
"\n",
"We assume that you have successfully completed CNTK 103 Part A.\n",
"\n",
"In this tutorial we will train a fully connected network on MNIST data. This notebook provides the recipe using Python APIs. If you are looking for this example in Brainscript, please look [here](https://github.com/Microsoft/CNTK/tree/v2.0.beta2.0/Examples/Image/GettingStarted)\n",
"In this tutorial we will train a fully connected network on MNIST data. This notebook provides the recipe using Python APIs. If you are looking for this example in Brainscript, please look [here](https://github.com/Microsoft/CNTK/tree/v2.0.beta3.0/Examples/Image/GettingStarted)\n",
"\n",
"## Introduction\n",
"\n",
@ -765,7 +765,7 @@
"source": [
"#### Code link\n",
"\n",
"If you want to try running the tutorial from python command prompt. Please run the [SimpleMNIST.py](https://github.com/Microsoft/CNTK/tree/v2.0.beta2.0/bindings/python/examples/MNIST) example."
"If you want to try running the tutorial from python command prompt. Please run the [SimpleMNIST.py](https://github.com/Microsoft/CNTK/tree/v2.0.beta3.0/bindings/python/examples/MNIST) example."
]
},
{

configure (vendored)
View file

@ -16,6 +16,11 @@ enable_cuda=
enable_python=
# NCCL communication library
have_nccl=no
nccl_path=
nccl_check=include/nccl.h
# CNTK Custom MKL Version
cntk_custom_mkl_version=2
@ -99,6 +104,7 @@ default_boost="boost-1.60.0"
# NOTE: Will get compilation errors with cuda-6.0
default_cudas="cuda-7.5 cuda-7.0 cuda-6.5"
default_nccls="nccl"
default_kaldis="kaldi-trunk kaldi-c024e8aa"
default_gdk_includes="include/nvidia/gdk"
default_gdk_nvml_libs="src/gdk/nvml/lib"
@ -165,6 +171,11 @@ function find_protobuf ()
find_dir "$default_protobuf" "$protobuf_check"
}
function find_nccl ()
{
find_dir "$default_nccls" "$nccl_check"
}
function find_cuda ()
{
find_dir "$default_cudas" "$cuda_check"
@ -322,6 +333,7 @@ function show_help ()
echo " --with-gdk-include[=directory] $(show_default $(find_gdk_include))"
echo " --with-gdk-nvml-lib[=directory] $(show_default $(find_gdk_nvml_lib))"
echo " --with-cudnn[=directory] $(show_default $(find_cudnn))"
echo " --with-nccl[=directory] $(show_default $(find_nccl))"
echo " --with-mkl[=directory] $(show_default $(find_mkl))"
echo " --with-mkl-sequential[=directory] $(show_default $(find_mkl))"
echo " --with-openblas[=directory] (experimental) $(show_default $(find_openblas))"
@ -603,6 +615,28 @@ do
fi
fi
;;
--with-nccl*)
have_nccl=yes
if test x$optarg = x
then
nccl_path=$(find_nccl)
if test x$nccl_path = x
then
echo "Cannot find NCCL directory."
echo "Please specify a value for --with-nccl"
echo "NCCL can be downloaded from https://github.com/NVIDIA/nccl"
exit 1
fi
else
if test $(check_dir $optarg $nccl_check) = yes
then
nccl_path=$optarg
else
echo "Invalid NCCL directory $optarg"
exit 1
fi
fi
;;
--with-mkl*)
have_mkl=yes
mathlib=mkl
@ -898,6 +932,14 @@ then
done
fi
if test $enable_cuda = yes && test x$nccl_path = x
then
nccl_path=$(find_nccl)
if test x$nccl_path != x; then
echo Found NCCL at $nccl_path
fi
fi
if test x$opencv_path = x
then
opencv_path=$(find_opencv)
@ -978,6 +1020,7 @@ if test $enable_cuda = yes ; then
echo GDK_NVML_LIB_PATH=$gdk_nvml_lib_path >> $config
echo CUB_PATH=$cub_path >> $config
echo CUDNN_PATH=$cudnn_path >> $config
[-z "$nccl_path"] || echo NCCL_PATH=$nccl_path >> $config
fi
if test $enable_python = yes ; then
echo PYTHON_SUPPORT=true >> $config