Merge remote-tracking branch 'origin/master' into mahilleb/pr/874
Commit 88065644f7
@@ -158,7 +158,7 @@
  </Link>
</ItemDefinitionGroup>

<ItemDefinitionGroup Condition="'$(ConfigurationType)' == 'StaticLibrary' And $(ReleaseBuild) And !$(NoOptBuild)">
<ItemDefinitionGroup Condition="$(ReleaseBuild) And !$(NoOptBuild)">
  <ClCompile>
    <Optimization>MaxSpeed</Optimization>
    <FunctionLevelLinking>true</FunctionLevelLinking>
@@ -180,8 +180,16 @@
    <IntrinsicFunctions>false</IntrinsicFunctions>
  </ClCompile>
  <Link>
    <EnableCOMDATFolding>false</EnableCOMDATFolding>
    <OptimizeReferences>false</OptimizeReferences>
    <Profile>false</Profile>
  </Link>
</ItemDefinitionGroup>

<PropertyGroup Condition="$(NoOptBuild)" Label="Configuration">
  <UseDebugLibraries>false</UseDebugLibraries>
  <WholeProgramOptimization>false</WholeProgramOptimization>
  <LinkIncremental>true</LinkIncremental>
</PropertyGroup>

</Project>
CNTK.sln (28 changes)
@@ -1285,6 +1285,8 @@ Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "V2LibraryDistributionTests"
        {E5606ECE-48CA-4464-BB12-09D81D02B9EF} = {E5606ECE-48CA-4464-BB12-09D81D02B9EF}
    EndProjectSection
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CPPEvalExtendedClientTest", "Tests\EndToEndTests\EvalClientTests\CPPEvalExtendedClientTest\CPPEvalExtendedClientTest.vcxproj", "{5D29C76D-648A-456F-920D-48230F2FB3C8}"
EndProject
Global
    GlobalSection(SolutionConfigurationPlatforms) = preSolution
        Debug_CpuOnly|Any CPU = Debug_CpuOnly|Any CPU
@@ -2240,6 +2242,31 @@ Global
        {F4CCAAB2-0DB2-4281-929A-2E68E30F0F6E}.Release|Mixed Platforms.Build.0 = Release|x64
        {F4CCAAB2-0DB2-4281-929A-2E68E30F0F6E}.Release|x64.ActiveCfg = Release|x64
        {F4CCAAB2-0DB2-4281-929A-2E68E30F0F6E}.Release|x64.Build.0 = Release|x64
        {5D29C76D-648A-456F-920D-48230F2FB3C8}.Debug_CpuOnly|Any CPU.ActiveCfg = Debug_CpuOnly|x64
        {5D29C76D-648A-456F-920D-48230F2FB3C8}.Debug_CpuOnly|Mixed Platforms.ActiveCfg = Debug_CpuOnly|x64
        {5D29C76D-648A-456F-920D-48230F2FB3C8}.Debug_CpuOnly|Mixed Platforms.Build.0 = Debug_CpuOnly|x64
        {5D29C76D-648A-456F-920D-48230F2FB3C8}.Debug_CpuOnly|x64.ActiveCfg = Debug_CpuOnly|x64
        {5D29C76D-648A-456F-920D-48230F2FB3C8}.Debug_CpuOnly|x64.Build.0 = Debug_CpuOnly|x64
        {5D29C76D-648A-456F-920D-48230F2FB3C8}.Debug|Any CPU.ActiveCfg = Debug|x64
        {5D29C76D-648A-456F-920D-48230F2FB3C8}.Debug|Mixed Platforms.ActiveCfg = Debug|x64
        {5D29C76D-648A-456F-920D-48230F2FB3C8}.Debug|Mixed Platforms.Build.0 = Debug|x64
        {5D29C76D-648A-456F-920D-48230F2FB3C8}.Debug|x64.ActiveCfg = Debug|x64
        {5D29C76D-648A-456F-920D-48230F2FB3C8}.Debug|x64.Build.0 = Debug|x64
        {5D29C76D-648A-456F-920D-48230F2FB3C8}.Release_CpuOnly|Any CPU.ActiveCfg = Release_CpuOnly|x64
        {5D29C76D-648A-456F-920D-48230F2FB3C8}.Release_CpuOnly|Mixed Platforms.ActiveCfg = Release_CpuOnly|x64
        {5D29C76D-648A-456F-920D-48230F2FB3C8}.Release_CpuOnly|Mixed Platforms.Build.0 = Release_CpuOnly|x64
        {5D29C76D-648A-456F-920D-48230F2FB3C8}.Release_CpuOnly|x64.ActiveCfg = Release_CpuOnly|x64
        {5D29C76D-648A-456F-920D-48230F2FB3C8}.Release_CpuOnly|x64.Build.0 = Release_CpuOnly|x64
        {5D29C76D-648A-456F-920D-48230F2FB3C8}.Release_NoOpt|Any CPU.ActiveCfg = Release_CpuOnly|x64
        {5D29C76D-648A-456F-920D-48230F2FB3C8}.Release_NoOpt|Mixed Platforms.ActiveCfg = Release_CpuOnly|x64
        {5D29C76D-648A-456F-920D-48230F2FB3C8}.Release_NoOpt|Mixed Platforms.Build.0 = Release_CpuOnly|x64
        {5D29C76D-648A-456F-920D-48230F2FB3C8}.Release_NoOpt|x64.ActiveCfg = Release_CpuOnly|x64
        {5D29C76D-648A-456F-920D-48230F2FB3C8}.Release_NoOpt|x64.Build.0 = Release_CpuOnly|x64
        {5D29C76D-648A-456F-920D-48230F2FB3C8}.Release|Any CPU.ActiveCfg = Release|x64
        {5D29C76D-648A-456F-920D-48230F2FB3C8}.Release|Mixed Platforms.ActiveCfg = Release|x64
        {5D29C76D-648A-456F-920D-48230F2FB3C8}.Release|Mixed Platforms.Build.0 = Release|x64
        {5D29C76D-648A-456F-920D-48230F2FB3C8}.Release|x64.ActiveCfg = Release|x64
        {5D29C76D-648A-456F-920D-48230F2FB3C8}.Release|x64.Build.0 = Release|x64
    EndGlobalSection
    GlobalSection(SolutionProperties) = preSolution
        HideSolutionNode = FALSE
@@ -2417,5 +2444,6 @@ Global
        {E844AB9A-A48F-4A99-9625-F528C5C46D83} = {8656B71D-E24C-4AC2-8BE4-C07B415A3E15}
        {CD721536-CFD3-413E-A3D7-FB0FAF989635} = {DD043083-71A4-409A-AA91-F9C548DCF7EC}
        {F4CCAAB2-0DB2-4281-929A-2E68E30F0F6E} = {6F19321A-65E7-4829-B00C-3886CD6C6EDE}
        {5D29C76D-648A-456F-920D-48230F2FB3C8} = {05E45AF7-C069-4057-BC16-0A532D068CE4}
    EndGlobalSection
EndGlobal
@@ -0,0 +1,326 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
// CPPEvalExtendedClient.cpp : Sample application using the extended evaluation interface from C++
//

#include <sys/stat.h>
#include <inttypes.h>
#include <algorithm>
#include <fstream>
#include <unordered_map>

#include "Eval.h"
#ifdef _WIN32
#include "Windows.h"
#endif

using namespace std;
using namespace Microsoft::MSR::CNTK;

// Used for retrieving the model appropriate for the element type (float / double)
template<typename ElemType>
using GetEvalProc = void(*)(IEvaluateModelExtended<ElemType>**);

std::unordered_map<std::string, size_t> buildVocab(std::string filePath)
{
    std::ifstream ifs(filePath);
    size_t idx = 0;

    std::unordered_map<std::string, size_t> vocab;
    std::string line;
    while (std::getline(ifs, line))
    {
        vocab.insert(std::pair<std::string, size_t>(line, idx));
        idx += 1;
    }

    ifs.close();
    return vocab;
}

std::unordered_map<size_t, std::string> buildInvVocab(std::string filePath)
{
    std::ifstream ifs(filePath);
    size_t idx = 1;

    std::unordered_map<size_t, std::string> vocab;
    std::string line;
    while (std::getline(ifs, line))
    {
        vocab.insert(std::pair<size_t, std::string>(idx, line));
        idx += 1;
    }

    ifs.close();
    return vocab;
}

size_t word2idx(std::string word, std::unordered_map<std::string, size_t>& word2idxVocab)
{
    std::unordered_map<std::string, size_t>::iterator iter = word2idxVocab.find(word);
    if (iter == word2idxVocab.end())
    {
        throw std::runtime_error("word not found in source vocab");
    }

    return iter->second;
}


std::string idx2word(size_t idx, std::unordered_map<size_t, std::string>& idx2wordVocab)
{
    std::unordered_map<size_t, std::string>::iterator iter = idx2wordVocab.find(idx);
    if (iter == idx2wordVocab.end())
    {
        throw std::runtime_error("word index is not found in target vocab");
    }

    return iter->second;
}

void addOneHotWord(Values<float>& inputBuffers, size_t idx, VariableSchema& inputLayouts, size_t inputNode)
{
    size_t inputDim = inputLayouts[inputNode].m_numElements;
    for (size_t i = 0; i < inputDim; i++)
    {
        if (i == idx)
        {
            inputBuffers[inputNode].m_buffer.push_back(1);
        }
        else
        {
            inputBuffers[inputNode].m_buffer.push_back(0);
        }
    }
}

std::vector<std::string> feedInputVectors(std::string sentence, std::unordered_map<std::string, size_t>& word2idxVocab, Values<float>& inputBuffers, VariableSchema& inputLayouts)
{
    std::vector<std::string> words;

    // Split input sentence by space.
    char delimiters = ' ';
    size_t begin = 0;
    size_t end = sentence.find_first_of(delimiters);
    while (end != sentence.npos)
    {
        words.push_back(sentence.substr(begin, end - begin));
        begin = end + 1;
        end = sentence.find(delimiters, begin);
    }

    words.push_back(sentence.substr(begin));

    // Convert words to ids.
    std::vector<size_t> wordIds;
    for (size_t i = 0; i < words.size(); i++)
    {
        size_t id = word2idx(words[i], word2idxVocab);
        wordIds.push_back(id);
    }

    // Process the input words to construct network input vectors.
    // As the sentence begins and ends with a special tag, we ignore the first and last word.
    for (size_t i = 1; i < words.size() - 1; i++)
    {
        // Current word.
        size_t cwIdx = wordIds[i];
        addOneHotWord(inputBuffers, cwIdx, inputLayouts, 0);

        // Next word.
        size_t nwIdx = wordIds[i + 1];
        addOneHotWord(inputBuffers, nwIdx, inputLayouts, 1);

        // Previous word.
        size_t pwIdx = wordIds[i - 1];
        addOneHotWord(inputBuffers, pwIdx, inputLayouts, 2);
    }

    return words;
}

IEvaluateModelExtended<float>* SetupNetworkAndGetLayouts(std::string modelDefinition, VariableSchema& inputLayouts, VariableSchema& outputLayouts)
{
    // Native model evaluation instance
    IEvaluateModelExtended<float> *eval;

    GetEvalExtendedF(&eval);

    try
    {
        eval->CreateNetwork(modelDefinition);
    }
    catch (std::exception& ex)
    {
        fprintf(stderr, "%s\n", ex.what());
        throw;
    }
    fflush(stderr);

    // Get the model's layers dimensions
    outputLayouts = eval->GetOutputSchema();

    for (auto vl : outputLayouts)
    {
        fprintf(stderr, "Output dimension: %" PRIu64 "\n", vl.m_numElements);
        fprintf(stderr, "Output name: %ls\n", vl.m_name.c_str());
    }

    eval->StartForwardEvaluation({ outputLayouts[0].m_name });
    inputLayouts = eval->GetInputSchema();
    outputLayouts = eval->GetOutputSchema();

    return eval;
}
/// <summary>
/// Program demonstrating how to run model evaluations using the native extended evaluation interface,
/// and how to feed sequence vectors to an LSTM (RNN) network.
/// </summary>
/// <description>
/// This program is a native C++ client using the native extended evaluation interface
/// located in the <see cref="eval.h"/> file.
/// The CNTK evaluation library (EvalDLL.dll on Windows, and LibEval.so on Linux) must be found through the system's path.
/// The other requirement is that Eval.h be included.
/// In order to run this program, the model must already exist in the example. To create the model,
/// first run the example in <CNTK>/Examples/Text/ATIS. Once the model file ATIS.slot.lstm is created,
/// you can run this client.
/// This program demonstrates the usage of the Evaluate method requiring the input and output layers as parameters.
int main(int argc, char* argv[])
{
    // Get the binary path (current working directory)
    argc = 0;
    std::string app = argv[0];
    std::string path;
    size_t pos;
    int ret;

#ifdef _WIN32
    pos = app.rfind("\\");
    path = (pos == std::string::npos) ? "." : app.substr(0, pos);

    // This relative path assumes launching from CNTK's binary folder, e.g. x64\Release
    const std::string modelBaseDir = path + "/../../Examples/Text/ATIS/";

#else // on Linux
    pos = app.rfind("/");
    path = (pos == std::string::npos) ? "." : app.substr(0, pos);

    // This relative path assumes launching from CNTK's binary folder, e.g. build/cpu/release/bin/
    const std::string modelBaseDir = path + "/../../../../Examples/Text/ATIS/";
#endif
    const std::string modelWorkingDirectory = modelBaseDir + "work/";

    const std::string modelFilePath = modelWorkingDirectory + "ATIS.slot.lstm";

    try
    {
        struct stat statBuf;
        if (stat(modelFilePath.c_str(), &statBuf) != 0)
        {
            fprintf(stderr, "Error: The model %s does not exist. Please follow instructions in README.md in <CNTK>/Examples/Text/ATIS to create the model.\n", modelFilePath.c_str());
            return(1);
        }

        std::string networkConfiguration;
        networkConfiguration += "modelPath=\"" + modelFilePath + "\"";

        VariableSchema inputLayouts;
        VariableSchema outputLayouts;
        IEvaluateModelExtended<float> *eval;
        eval = SetupNetworkAndGetLayouts(networkConfiguration, inputLayouts, outputLayouts);

        vector<size_t> inputBufferSize;
        for (size_t i = 0; i < inputLayouts.size(); i++)
        {
            fprintf(stdout, "Input node name: %ls\n", inputLayouts[i].m_name.c_str());
            fprintf(stdout, "Input feature dimension: %" PRIu64 "\n", inputLayouts[i].m_numElements);
            inputBufferSize.push_back(inputLayouts[i].m_numElements);
        }

        vector<size_t> outputBufferSize;
        for (size_t i = 0; i < outputLayouts.size(); i++)
        {
            outputBufferSize.push_back(outputLayouts[i].m_numElements);
        }

        // Build source word vocab to id
        const std::string sourceVocab = modelBaseDir + "/Data/ATIS.vocab";
        if (stat(sourceVocab.c_str(), &statBuf) != 0)
        {
            fprintf(stderr, "Error: The file '%s' does not exist.\n", sourceVocab.c_str());
            return(1);
        }
        std::unordered_map<std::string, size_t> word2idxVocab = buildVocab(sourceVocab);

        // Build id to target word vocab
        const std::string targetVocab = modelBaseDir + "/Data/ATIS.label";
        if (stat(targetVocab.c_str(), &statBuf) != 0)
        {
            fprintf(stderr, "Error: The file '%s' does not exist.\n", targetVocab.c_str());
            return(1);
        }
        std::unordered_map<size_t, std::string> idx2wordVocab = buildInvVocab(targetVocab);

        // Use the following sentence as an input example.
        // A single space is used as the word separator.
        std::string inputSequences = "BOS i would like to find a flight from charlotte to las vegas that makes a stop in st. louis EOS";

        Values<float> inputBuffers = inputLayouts.CreateBuffers<float>(inputBufferSize);
        Values<float> outputBuffers = outputLayouts.CreateBuffers<float>(outputBufferSize);

        // Feed input sequence vectors to the network
        std::vector<std::string> words = feedInputVectors(inputSequences, word2idxVocab, inputBuffers, inputLayouts);

        // Forward propagation
        eval->ForwardPass(inputBuffers, outputBuffers);

        // Get output from the output layer
        auto buf = outputBuffers[0].m_buffer;
        size_t bufSize = outputBuffers[0].m_buffer.size();

        std::vector<std::string> outputs;
        size_t outputDim = outputLayouts[0].m_numElements;
        size_t outputStep = bufSize / outputDim;

        auto iter = buf.begin();
        for (size_t i = 0; i < outputStep; i++)
        {
            auto max_iter = std::max_element(iter, iter + outputDim);
            auto index = max_iter - iter;
            outputs.push_back(idx2word(index, idx2wordVocab));
            iter += outputDim;
        }

        words.erase(words.begin());
        words.pop_back();
        fprintf(stdout, "Slot tag for sentence \"%s\" is as follows:\n", inputSequences.c_str());
        for (size_t i = 0; i < outputs.size(); i++)
        {
            fprintf(stdout, "%10s -- %s\n", words[i].c_str(), outputs[i].c_str());
        }

        eval->Destroy();

        // This pattern is used by End2EndTests to check whether the program runs to completion.
        fprintf(stdout, "Evaluation complete.\n");
        ret = 0;
    }
    catch (const std::exception& err)
    {
        fprintf(stderr, "Evaluation failed. EXCEPTION occurred: %s\n", err.what());
        ret = 1;
    }
    catch (...)
    {
        fprintf(stderr, "Evaluation failed. Unknown ERROR occurred.\n");
        ret = 1;
    }

    fflush(stdout);
    fflush(stderr);
    return ret;
}
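For quick reference, the extended-eval call sequence that the sample above uses is short. The following is a minimal sketch, not part of the commit: the function name EvalOnce is invented, error handling is omitted, and the one-hot input encoding is left as a comment (see feedInputVectors above); every API call appears verbatim in the sample.

// Minimal sketch of the call sequence used by CPPEvalExtendedClient above.
#include <string>
#include <vector>
#include "Eval.h"

using namespace Microsoft::MSR::CNTK;

void EvalOnce(const std::string& networkConfiguration) // e.g. "modelPath=\"...\""
{
    IEvaluateModelExtended<float>* eval;
    GetEvalExtendedF(&eval);                    // obtain an extended evaluator instance
    eval->CreateNetwork(networkConfiguration);  // load and compile the model

    VariableSchema outputLayouts = eval->GetOutputSchema();
    eval->StartForwardEvaluation({ outputLayouts[0].m_name });
    VariableSchema inputLayouts = eval->GetInputSchema();
    outputLayouts = eval->GetOutputSchema();

    // One buffer per input/output node, sized by the schema dimensions.
    std::vector<size_t> inputSizes, outputSizes;
    for (size_t i = 0; i < inputLayouts.size(); i++)
        inputSizes.push_back(inputLayouts[i].m_numElements);
    for (size_t i = 0; i < outputLayouts.size(); i++)
        outputSizes.push_back(outputLayouts[i].m_numElements);

    Values<float> inputBuffers = inputLayouts.CreateBuffers<float>(inputSizes);
    Values<float> outputBuffers = outputLayouts.CreateBuffers<float>(outputSizes);

    // ... push one-hot word vectors into inputBuffers here, per time step ...
    eval->ForwardPass(inputBuffers, outputBuffers);
    // outputBuffers[0].m_buffer now holds the output activations.

    eval->Destroy();
}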
@@ -0,0 +1,66 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Release|x64">
      <Configuration>Release</Configuration>
      <Platform>x64</Platform>
    </ProjectConfiguration>
  </ItemGroup>
  <PropertyGroup Label="Globals">
    <ProjectGuid>{93ECB70B-FDDD-44B4-BD6A-D63E094C704B}</ProjectGuid>
    <Keyword>Win32Proj</Keyword>
    <RootNamespace>CPPEvalExtendedClient</RootNamespace>
    <ProjectName>CPPEvalExtendedClient</ProjectName>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <PlatformToolset>v120</PlatformToolset>
    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>Unicode</CharacterSet>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
  </ImportGroup>
  <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'" Label="PropertySheets">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <LinkIncremental>false</LinkIncremental>
    <OutDir>$(SolutionDir)..\..\$(Platform)\$(ProjectName).$(Configuration)\</OutDir>
  </PropertyGroup>
  <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
    <ClCompile>
      <WarningLevel>Level4</WarningLevel>
      <PrecompiledHeader>NotUsing</PrecompiledHeader>
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;_LIB;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <AdditionalIncludeDirectories>$(SolutionDir)..\..\Include</AdditionalIncludeDirectories>
      <TreatWarningAsError>true</TreatWarningAsError>
      <SDLCheck>true</SDLCheck>
      <MultiProcessorCompilation>true</MultiProcessorCompilation>
      <FloatingPointModel>Fast</FloatingPointModel>
      <OpenMPSupport>true</OpenMPSupport>
    </ClCompile>
    <Link>
      <SubSystem>Console</SubSystem>
      <GenerateDebugInformation>true</GenerateDebugInformation>
      <EnableCOMDATFolding>true</EnableCOMDATFolding>
      <OptimizeReferences>true</OptimizeReferences>
      <AdditionalLibraryDirectories>$(SolutionDir)\..\..\cntk</AdditionalLibraryDirectories>
      <AdditionalDependencies>EvalDll.lib;%(AdditionalDependencies)</AdditionalDependencies>
      <OutputFile>$(OutDir)$(TargetName)$(TargetExt)</OutputFile>
      <Profile>true</Profile>
    </Link>
  </ItemDefinitionGroup>
  <ItemGroup>
    <ClCompile Include="CPPEvalExtendedClient.cpp" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets">
  </ImportGroup>
</Project>
@@ -0,0 +1,22 @@
<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup>
    <Filter Include="Source Files">
      <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
      <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
    </Filter>
    <Filter Include="Header Files">
      <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
      <Extensions>h;hh;hpp;hxx;hm;inl;inc;xsd</Extensions>
    </Filter>
    <Filter Include="Resource Files">
      <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
      <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
    </Filter>
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="CPPEvalExtendedClient.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
  </ItemGroup>
</Project>
@@ -9,6 +9,8 @@ Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "CSEvalClient", "CSEvalClien
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CPPEvalV2Client", "CPPEvalV2Client\CPPEvalV2Client.vcxproj", "{D771A06D-CC25-4582-B5CD-D2A4782BB005}"
EndProject
Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "CPPEvalExtendedClient", "CPPEvalExtendedClient\CPPEvalExtendedClient.vcxproj", "{93ECB70B-FDDD-44B4-BD6A-D63E094C704B}"
EndProject
Global
    GlobalSection(SolutionConfigurationPlatforms) = preSolution
        Debug|x64 = Debug|x64
@@ -25,6 +27,9 @@ Global
        {D771A06D-CC25-4582-B5CD-D2A4782BB005}.Debug|x64.ActiveCfg = Release|x64
        {D771A06D-CC25-4582-B5CD-D2A4782BB005}.Release|x64.ActiveCfg = Release|x64
        {D771A06D-CC25-4582-B5CD-D2A4782BB005}.Release|x64.Build.0 = Release|x64
        {93ECB70B-FDDD-44B4-BD6A-D63E094C704B}.Debug|x64.ActiveCfg = Release|x64
        {93ECB70B-FDDD-44B4-BD6A-D63E094C704B}.Release|x64.ActiveCfg = Release|x64
        {93ECB70B-FDDD-44B4-BD6A-D63E094C704B}.Release|x64.Build.0 = Release|x64
    EndGlobalSection
    GlobalSection(SolutionProperties) = preSolution
        HideSolutionNode = FALSE
@@ -1,8 +1,13 @@
#EvalClients

The folder contains some examples using the CNTK evaluation library. Please note that only the 64-bit target is supported by the CNTK evaluation library.

-CPPEvalClient: demonstrates the use of the C++ CNTK eval lib. Only the release configuration is supported.

-CSEvalClient: demonstrates the use of the C# CNTK eval lib.

-EvalClients.sln: the VS2013 solution file to build the examples. It creates two binaries in the directory $(SolutionDir)..\..\x64\:
* CPPEvalClient.$(Configuration)\CPPEvalClient.exe: the C++ example executable. To run the example, please first include the directory containing the CNTK dependent DLLs, usually $(SolutionDir)..\..\cntk, in the PATH environment variable.
* CSEvalClient.$(Configuration)\CSEvalClient.exe: the C# example executable.

- CPPEvalClient.$(Configuration)\CPPEvalClient.exe: the C++ example executable. To run the example, please first include the directory containing the CNTK dependent DLLs, usually $(SolutionDir)..\..\cntk, in the PATH environment variable.

- CSEvalClient.$(Configuration)\CSEvalClient.exe: the C# example executable.
@@ -13,6 +13,7 @@ modelPath = "$outputDir$/Models/ResNet_101"
stderr = "$outputDir$/ResNet_101_BS_out"

parallelTrain = true
hyperCompressMemory = true

TrainNetwork = {
    action = "train"
@@ -13,6 +13,7 @@ modelPath = "$outputDir$/Models/ResNet_152"
stderr = "$outputDir$/ResNet_152_BS_out"

parallelTrain = true
hyperCompressMemory = true

TrainNetwork = {
    action = "train"
@@ -2,7 +2,7 @@
# An LSTM model is built to tag each word in sentences with its semantic label.

WorkDir = work
DataDir = data
DataDir = Data

makeMode = false
modelPath = $WorkDir$/ATIS.slot.lstm
@@ -96,9 +96,11 @@ Train = [
    parallelizationMethod = "DataParallelSGD"
    parallelizationStartEpoch = 2
    distributedMBReading = true
    dataParallelSGD = [
        gradientBits = 1
    ]
    # Uncomment the following lines if you want to enable parallelTrain to use 1-bit-SGD.
    # For that you also need CNTK binaries built with 1-bit-SGD enabled.
    # dataParallelSGD = [
    #     gradientBits = 1
    # ]
    ]
]
Makefile (32 changes)
@@ -424,7 +424,6 @@ CNTKLIBRARY_COMMON_SRC =\
CNTKLIBRARY_SRC =\
	$(SOURCEDIR)/CNTKv2LibraryDll/ComputeInputStatistics.cpp \
	$(SOURCEDIR)/CNTKv2LibraryDll/MinibatchSource.cpp \
	$(SOURCEDIR)/CNTKv2LibraryDll/Globals.cpp \

CNTKLIBRARY_SRC+=$(CNTKLIBRARY_COMMON_SRC)
CNTKLIBRARY_SRC+=$(CNTK_COMMON_SRC)
@@ -553,24 +552,39 @@ $(EVAL_LIB): $(EVAL_OBJ) | $(CNTKMATH_LIB)
	$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKMATH) $(PROTOBUF_PATH)/lib/libprotobuf.a

########################################
# Eval Sample client
# Eval Sample clients
########################################
EVAL_SAMPLE_CLIENT:=$(BINDIR)/cppevalclient
EVAL_CLIENT:=$(BINDIR)/cppevalclient

EVAL_SAMPLE_CLIENT_SRC=\
EVAL_CLIENT_SRC=\
	$(SOURCEDIR)/../Examples/Evaluation/CPPEvalClient/CPPEvalClient.cpp

EVAL_SAMPLE_CLIENT_OBJ:=$(patsubst %.cpp, $(OBJDIR)/%.o, $(EVAL_SAMPLE_CLIENT_SRC))
EVAL_CLIENT_OBJ:=$(patsubst %.cpp, $(OBJDIR)/%.o, $(EVAL_CLIENT_SRC))

ALL+=$(EVAL_SAMPLE_CLIENT)
SRC+=$(EVAL_SAMPLE_CLIENT_SRC)
ALL+=$(EVAL_CLIENT)
SRC+=$(EVAL_CLIENT_SRC)

$(EVAL_SAMPLE_CLIENT): $(EVAL_SAMPLE_CLIENT_OBJ) | $(EVAL_LIB)
$(EVAL_CLIENT): $(EVAL_CLIENT_OBJ) | $(EVAL_LIB)
	@echo $(SEPARATOR)
	@mkdir -p $(dir $@)
	@echo building $(EVAL_SAMPLE_CLIENT) for $(ARCH) with build type $(BUILDTYPE)
	@echo building $(EVAL_CLIENT) for $(ARCH) with build type $(BUILDTYPE)
	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(EVAL) -l$(CNTKMATH)

EVAL_EXTENDED_CLIENT:=$(BINDIR)/cppevalextendedclient

EVAL_EXTENDED_CLIENT_SRC=\
	$(SOURCEDIR)/../Examples/Evaluation/CPPEvalExtendedClient/CPPEvalExtendedClient.cpp

EVAL_EXTENDED_CLIENT_OBJ:=$(patsubst %.cpp, $(OBJDIR)/%.o, $(EVAL_EXTENDED_CLIENT_SRC))

ALL+=$(EVAL_EXTENDED_CLIENT)
SRC+=$(EVAL_EXTENDED_CLIENT_SRC)

$(EVAL_EXTENDED_CLIENT): $(EVAL_EXTENDED_CLIENT_OBJ) | $(EVAL_LIB)
	@echo $(SEPARATOR)
	@mkdir -p $(dir $@)
	@echo building $(EVAL_EXTENDED_CLIENT) for $(ARCH) with build type $(BUILDTYPE)
	$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(EVAL) -l$(CNTKMATH)

########################################
# Eval V2 Sample client
@@ -26,11 +26,7 @@ function ActionItem(
    $expr = $func +' $item'

    Write-Verbose "Calling Operation: [$func]"
    $result = Invoke-Expression $expr
    if (-not $result) {
        return
    }
    return
    Invoke-Expression $expr
}


@@ -47,10 +43,14 @@ function InstallExe(
    $processWait = $table["ProcessWait"]
    $message = $table["message"]
    $runAs = $table["runAs"]
    $maxErrorLevel = $table["maxErrorLevel"]

    if ($runAs -eq $null) {
        $runAs = $true
    }
    if ($maxErrorLevel -eq $null) {
        $maxErrorLevel = 0
    }
    if ($platform -ne $null) {
        $runningOn = ((Get-WmiObject -class Win32_OperatingSystem).Caption).ToUpper()
        $platform = ($platform.ToString()).ToUpper()
@@ -65,10 +65,10 @@ function InstallExe(
    }

    if ($dir -eq $null) {
        $ecode = DoProcess -command $cmd -param "$param" -requiresRunAs $runAs
        DoProcess -command $cmd -param $param -requiresRunAs $runAs -maxErrorLevel $maxErrorLevel
    }
    else {
        $ecode = DoProcess -command $cmd -param "$param" -requiresRunAs $runAs -workingDir "$dir"
        DoProcess -command $cmd -param $param -requiresRunAs $runAs -workingDir $dir -maxErrorLevel $maxErrorLevel
    }

    if ( ($processWait -ne $null) -and ($Execute) -and ($false) ) {
@@ -77,11 +77,42 @@ function InstallExe(
        $pwait = Get-Process $processWait -ErrorAction SilentlyContinue
    } while (-not ($pwait -eq $null))
    }
}

function ExecuteApplication(
    [Parameter(Mandatory = $true)][hashtable] $table)
{
    FunctionIntro $table

    if ($ecode -eq 0) { return $true }

    return $false
    $func = $table["Function"]
    $appName = $table["AppName"]
    $param= $table["Param"]
    $appDir = $table["AppDir"]
    $usePath = $table["UseEnvPath"]
    $dir = $table["WorkDir"]
    $maxErrorLevel = $table["maxErrorLevel"]

    if ($appDir -eq $null) {
        $appDir = ""
    }
    if ($usePath -eq $null) {
        $usePath = $false
    }
    if ($maxErrorLevel -eq $null) {
        $maxErrorLevel = 0
    }

    $application = ResolveApplicationName $appName $appDir $usePath
    if ($application.Length -eq 0) {
        throw "ExecuteApplication: Couldn't resolve program [$appName] with location directory [$appDir] and usePath [$usePath]"
    }

    if ($dir -eq $null) {
        DoProcess -command $application -param $param -maxErrorLevel $maxErrorLevel
    }
    else {
        DoProcess -command $application -param $param -workingDir $dir -maxErrorLevel $maxErrorLevel
    }
}

function InstallWheel(
@@ -110,12 +141,12 @@ function InstallWheel(
    $whl = $whlFile.FullName

    $condaExe = Join-Path $BasePath 'Scripts\conda.exe'
    $newPaths = Invoke-DosCommand $condaExe (Write-Output ..activate cmd.exe $EnvName)
    $newPaths = Invoke-DosCommand $condaExe (Write-Output ..activate cmd.exe $EnvName) -maxErrorLevel 0

    $oldPath = $env:PATH
    $env:PATH = $newPaths + ';' + $env:PATH

    Invoke-DosCommand pip (Write-Output install $whl)
    Invoke-DosCommand pip (Write-Output install $whl) -maxErrorLevel 0
    $env:PATH = $oldPath
    return
}
@@ -237,7 +268,8 @@ function DoProcess(
    [string] $command,
    [string] $param,
    [string] $workingDir = "",
    [boolean] $requiresRunAs = $false)
    [boolean] $requiresRunAs = $false,
    [int] $maxErrorLevel)
{
    $info = "start-process [$command] with [$param]"

@@ -266,15 +298,13 @@ function DoProcess(
    }
    }

    $eCode = ($process.ExitCode)

    if ($eCode -ne 0) {
        Write-Host "$message ** Exit Code **:($eCode)"
    } else {
        Write-Verbose "$message ** Exit Code **:($eCode)"
    if ($ecode -gt $maxErrorLevel) {
        throw "Running 'start-process $commandString $param' failed with exit code [$ecode]"
    }
    return $eCode

    return
}


@@ -287,17 +317,15 @@ function SetEnvVar(
    Write-Verbose "SetEnvVar [$name] with [$content]"

    if ($Execute) {
        # [environment]::SetEnvironmentVariable($name, $content, $location)

        $commandString = "& { [environment]::SetEnvironmentVariable('"+$name+"', '"+$content+"', '"+$location+"') }"

        RunPowershellCommand -command "$commandString" -elevated $true
        RunPowershellCommand -command "$commandString" -elevated $true -maxErrorLevel 0
    }
}

function RunPowershellCommand(
    [string] $commandString,
    [boolean] $elevated
    [boolean] $elevated,
    [int] $maxErrorLevel
)
{
    $commandBytes = [System.Text.Encoding]::Unicode.GetBytes($commandString)
@@ -310,8 +338,12 @@ function RunPowershellCommand(
    else {
        $process = Start-Process -PassThru -FilePath powershell.exe -ArgumentList $commandLine -wait
    }

    $eCode = ($process.ExitCode)
    return ($ecode -eq 0)
    if ($ecode -gt $maxErrorLevel) {
        throw "Running 'powershell.exe $commandString' failed with exit code [$ecode]"
    }
    return
}

function Invoke-DosCommand {
@@ -321,7 +353,7 @@ function Invoke-DosCommand {
    [string] $Command,
    [string[]] $Argument,
    [string] [ValidateScript({ Test-Path -PathType Container $_ })] $WorkingDirectory,
    [switch] $IgnoreNonZeroExitCode,
    [int] $maxErrorLevel,
    [switch] $SuppressOutput
    )
    Write-Verbose "Running '$Command $Argument'"
@@ -336,7 +368,43 @@ function Invoke-DosCommand {
    if ($WorkingDirectory) {
        Pop-Location
    }
    if (($LASTEXITCODE -ne 0) -and -not $IgnoreNonZeroExitCode) {
    if ($LASTEXITCODE -gt $maxErrorLevel) {
        throw "Running '$Command $Argument' failed with exit code $LASTEXITCODE"
    }
    }
}

function ResolveApplicationName(
    [string] $name,
    [string] $directory,
    [bool] $usePath)
{
    $application = ""

    if ($directory.Length -gt 0) {
        $application = CallGetCommand (join-path $directory $name)
    }
    if ($application.Length -eq 0) {
        if ($usePath) {
            # we are at this point if we are supposed to check in the path environment for a match and
            # $directory was empty or we couldn't find it in the $directory

            $application = CallGetCommand $name
        }
    }
    # application will be an empty string if we couldn't resolve the name, otherwise we can execute $application

    return $application
}

function CallGetCommand(
    [string] $application)
{
    try {
        get-command $application -CommandType Application -ErrorAction Stop | Out-Null
        return $application
    }
    catch {
        # the application can't be found, so return empty string
        return ""
    }
}
@@ -82,6 +82,23 @@ function CheckPowershellVersion
    return $false
}

function CheckOSVersion
{
    $runningOn = (Get-WmiObject -class Win32_OperatingSystem).Caption
    $isMatching = ($runningOn -match "^Microsoft Windows (8\.1|10|Server 2012 R2)")

    if ($isMatching) {
        return
    }

    Write-Host "
You are running this install script on [$runningOn].
The Microsoft Cognitive Toolkit is designed and tested on Windows 8.1, Windows 10,
and Windows Server 2012 R2.
"
    return
}

function DisplayStart()
{
    Write-Host $(DisplayStartMessage)
@@ -90,6 +107,8 @@ function DisplayStart()
        return $false
    }

    CheckOSVersion

    if (-not $Execute) {
        Write-Host $(DisplayWarningNoExecuteMessage)
    }
@@ -4,6 +4,9 @@
#

$operations = @(
    @{Name = "Scan System for installed programs"; ShortName = "SCANPROG"; Info = "Scan System for installed programs";
      Verification = @( @{Function = "VerifyScanPrograms" } )
    },
    @{Name = "Verifying Installation contents"; ShortName = "INSTCONTENT"; Info = "Verifying Installation contents";
      Verification = @( @{Function = "VerifyInstallationContent"; Path = "$cntkRootDir" } )
    },
@@ -45,8 +48,9 @@ $operations = @(
                  @{Function = "AddToPath"; Dir = "C:\Program Files\Git\cmd"; AtStart = $true; } )
    },
    @{Name = "Clone CNTK from Github"; ShortName = "CNTKCLONE"; Info = "Clone CNTK from Github repository";
      Verification = @( @{Function = "VerifyDirectory"; Path = "$RepoLocation" } );
      Verification = @( @{Function = "VerifyDirectory"; Path = $RepoLocation } );
      Action = @( @{Function = "MakeDirectory"; Path = $repoDirectory },
                  @{Function = "InstallExe"; Command = "C:\Program Files\Git\bin\git.exe"; Param = "clone --branch $RepoTag --recursive https://github.com/Microsoft/CNTK/ $repoName"; WorkDir = "$repoDirectory"; Message="Cloning CNTK (branch $RepoTag) repository...." } )
                  @{Function = "ExecuteApplication"; AppName = "git.exe"; Param = "clone --branch $RepoTag --recursive https://github.com/Microsoft/CNTK/ $repoName"; AppDir = "C:\Program Files\Git"; UseEnvPath = $true; WorkDir = $repoDirectory } )
    }
)
@@ -58,6 +58,19 @@ function VerifyItem(
    return $noInstallRequired
}

function VerifyScanPrograms(
    [Parameter(Mandatory = $true)][hashtable] $table)
{
    FunctionIntro $table
    $func = $table["Function"]
    $noInstallRequired = $true

    # no actual work is being performed, just the script-local data structure with the list
    # of installed programs is being initialized
    LoadWin32Product
    return $noInstallRequired
}

function VerifyWin32ProductExists(
    [Parameter(Mandatory = $true)][hashtable] $table)
{
@@ -61,11 +61,6 @@
#define let const auto
#endif

// TODO: Temporary mechanism to enable memory sharing for
// node output value matrices. This will go away when the
// sharing is ready to be enabled by default
bool g_shareNodeValueMatrices = false;

using namespace std;
using namespace Microsoft::MSR;
using namespace Microsoft::MSR::CNTK;
@@ -243,6 +238,9 @@ void DoCommands(const ConfigParameters& config, const shared_ptr<MPIWrapper>& mp
        ProgressTracing::SetStepOffset(fullEpochsOffset); // this is the epoch number that SGD will log relative to
    }

    if (Globals::ShouldEnableHyperCompressMemory())
        Matrix<ElemType>::UseCachedResizeOrNot(true);

    // determine the action to perform, and do it
    for (int j = 0; j < action.size(); j++)
    {
@@ -560,7 +558,10 @@ int wmainWithBS(int argc, wchar_t* argv[]) // called from wmain which is a wrapp
        mpi = MPIWrapper::GetInstance(true /*create*/);
    }

    g_shareNodeValueMatrices = config(L"shareNodeValueMatrices", false);
    if (config(L"shareNodeValueMatrices", false))
        Globals::EnableShareNodeValueMatrices();
    if (config(L"hyperCompressMemory", false))
        Globals::EnableHyperCompressMemory();

    TracingGPUMemoryAllocator::SetTraceLevel(config(L"traceGPUMemoryAllocations", 0));

@@ -702,7 +703,10 @@ int wmainOldCNTKConfig(int argc, wchar_t* argv[])
        mpi = MPIWrapper::GetInstance(true /*create*/);
    }

    g_shareNodeValueMatrices = config(L"shareNodeValueMatrices", false);
    if (config(L"shareNodeValueMatrices", false))
        Globals::EnableShareNodeValueMatrices();
    if (config(L"hyperCompressMemory", false))
        Globals::EnableHyperCompressMemory();

    TracingGPUMemoryAllocator::SetTraceLevel(config(L"traceGPUMemoryAllocations", 0));
@@ -2399,6 +2399,11 @@ namespace CNTK
    ///
    CNTK_API static FunctionPtr LoadModel(DataType dataType, const std::wstring& modelFile, const DeviceDescriptor& computeDevice = DeviceDescriptor::UseDefaultDevice());

    ///
    /// Prints the entire graph underlying this function to stderr
    ///
    CNTK_API void PrintGraph() const;

private:

    template <typename VariableType, typename FilterFunction>
@@ -2899,6 +2904,13 @@ namespace CNTK
    CNTK_API FunctionPtr IsFirst(const Variable& operand, const std::wstring& name = L"");
    CNTK_API FunctionPtr IsLast(const Variable& operand, const std::wstring& name = L"");

    CNTK_API FunctionPtr Slice(const Variable& operand, int beginIndex, int endIndex, const std::wstring& name = L"");

    ///
    /// Create an instance of the CNTK built-in sum reduction operation on the specified tensor input operand along the operand's lone dynamic sequence axis
    ///
    CNTK_API FunctionPtr ReduceSum(const Variable& operand, const std::wstring& name = L"");

    CNTK_API FunctionPtr First(const Variable& operand, const std::wstring& name = L"");
    CNTK_API FunctionPtr Last(const Variable& operand, const std::wstring& name = L"");
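A hedged usage sketch of the new public declarations above, assuming the v2 header CNTKLibrary.h: the model path and the use of Arguments()[0] as the sequence input are illustrative assumptions, and the First/Last equivalences follow the Function.cpp changes later in this commit, where First and Last are reimplemented as Sequence::Slice(operand, 0, 1) and Sequence::Slice(operand, -1, 0).

// Sketch only: exercising the new API declared above.
#include "CNTKLibrary.h"
using namespace CNTK;

void InspectModel(const std::wstring& modelFile) // modelFile is a placeholder path
{
    auto model = Function::LoadModel(DataType::Float, modelFile);
    model->PrintGraph(); // new in this change: dumps the graph underlying the function to stderr

    // Assume the first argument is a variable with a dynamic sequence axis.
    Variable seq = model->Arguments()[0];
    auto first = Sequence::First(seq);     // equivalent to Sequence::Slice(seq, 0, 1)
    auto last  = Sequence::Last(seq);      // equivalent to Sequence::Slice(seq, -1, 0)
    auto total = Sequence::ReduceSum(seq); // sum reduction along the sequence axis
}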
@@ -206,9 +206,11 @@ namespace CNTK
    CNTK_API FunctionPtr GatherPacked(const Variable& operand, const Variable& packedIndex, const std::wstring& name = L"");
    CNTK_API FunctionPtr ScatterPacked(const Variable& operand, const Variable& packedIndex, const Variable& condition, const std::wstring& name = L"");
    CNTK_API FunctionPtr ZeroesWithDynamicAxesLike(const Variable& operand);
    CNTK_API FunctionPtr Where(const Variable& condition, const std::vector<Axis>& newDynamicAxes, const std::wstring& name = L"");
    CNTK_API FunctionPtr Gather(const Variable& operand, const Variable& condition, const std::vector<Axis>& newDynamicAxes, const std::wstring& name = L"");
    CNTK_API FunctionPtr Scatter(const Variable& operand, const Variable& condition, const std::vector<Axis>& newDynamicAxes, const std::wstring& name = L"");
    CNTK_API FunctionPtr Where(const Variable& condition, const std::pair<size_t, int>& newDerivedSequenceAxisScalingAndAdditiveFactor, const std::wstring& name = L"");
    CNTK_API FunctionPtr Gather(const Variable& operand, const Variable& condition, const std::wstring& name = L"");
    CNTK_API FunctionPtr Gather(const Variable& operand, const Variable& condition, const std::pair<size_t, int>& newDerivedSequenceAxisScalingAndAdditiveFactor, const std::wstring& name = L"");
    CNTK_API FunctionPtr Scatter(const Variable& operand, const Variable& condition, const std::wstring& name = L"");
    CNTK_API FunctionPtr Scatter(const Variable& operand, const Variable& condition, const std::pair<size_t, int>& newDerivedSequenceAxisScalingAndAdditiveFactor, const std::wstring& name = L"");
    CNTK_API FunctionPtr Slice(const Variable& operand, const Axis& axis, int beginIndex, int endIndex, const std::wstring& name = L"");
    CNTK_API FunctionPtr ReduceElements(const Variable& operand, const std::wstring& reductionOpName, const Axis& axis, const std::wstring& name = L"");

@@ -236,7 +238,8 @@ namespace CNTK

    CNTK_API void SetFixedRandomSeed(unsigned long fixedRandomSeed);

    CNTK_API void SetForwardValuesSharing(bool enableSharing);
    CNTK_API void EnableForwardValuesSharing();
    CNTK_API void EnableHyperMemoryCompress();

    CNTK_API bool AreEquivalent(const ::CNTK::FunctionPtr& f1, const ::CNTK::FunctionPtr& f2);
    CNTK_API bool AreEquivalent(const ::CNTK::Variable& v1, const ::CNTK::Variable& v2, bool allowParameterAndConstantsEquivalence = false);
@@ -39,10 +39,6 @@
  <PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
    <UseDebugLibraries>true</UseDebugLibraries>
  </PropertyGroup>
  <PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
    <UseDebugLibraries>false</UseDebugLibraries>
    <WholeProgramOptimization>true</WholeProgramOptimization>
  </PropertyGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings">
  </ImportGroup>
@@ -51,11 +47,10 @@
  </ImportGroup>
  <PropertyGroup Label="UserMacros" />
  <PropertyGroup Condition="$(DebugBuild)">
    <LinkIncremental>true</LinkIncremental>
    <TargetName>CNTKLibrary-$(LibraryVersion)</TargetName>
    <LinkIncremental>true</LinkIncremental>
    <TargetName>CNTKLibrary-$(LibraryVersion)</TargetName>
  </PropertyGroup>
  <PropertyGroup Condition="$(ReleaseBuild)">
    <LinkIncremental>false</LinkIncremental>
    <TargetName>CNTKLibrary-$(LibraryVersion)</TargetName>
  </PropertyGroup>
  <PropertyGroup>
@@ -100,9 +95,6 @@
    <ClCompile>
      <WarningLevel>Level4</WarningLevel>
      <PrecompiledHeader>NotUsing</PrecompiledHeader>
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>CNTKV2LIBRARYDLL;WIN32;NDEBUG;_WINDOWS;_USRDLL;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <SDLCheck>true</SDLCheck>
      <OpenMPSupport>false</OpenMPSupport>
@@ -114,10 +106,7 @@
    <Link>
      <SubSystem>Console</SubSystem>
      <GenerateDebugInformation>true</GenerateDebugInformation>
      <EnableCOMDATFolding>true</EnableCOMDATFolding>
      <OptimizeReferences>true</OptimizeReferences>
      <AdditionalDependencies>ComputationNetworkLib.lib; Math.lib; Common.lib; ReaderLib.lib; kernel32.lib; user32.lib; shell32.lib; SequenceTrainingLib.lib;$(ProtobufLib);%(AdditionalDependencies)</AdditionalDependencies>
      <Profile>true</Profile>
      <DelayLoadDLLs>Math.dll; nvml.dll; $(CudaRuntimeDll)</DelayLoadDLLs>
    </Link>
  </ItemDefinitionGroup>
@@ -169,7 +158,6 @@
      </PrecompiledHeader>
    </ClCompile>
    <ClCompile Include="Function.cpp" />
    <ClCompile Include="Globals.cpp" />
    <ClCompile Include="Learner.cpp" />
    <ClCompile Include="MinibatchSource.cpp" />
    <ClCompile Include="NDArrayView.cpp" />
@@ -21,7 +21,6 @@
    </ClCompile>
    <ClCompile Include="DistributedCommunicator.cpp" />
    <ClCompile Include="DataParallelDistributedTrainer.cpp" />
    <ClCompile Include="Globals.cpp" />
  </ItemGroup>
  <ItemGroup>
    <ClInclude Include="stdafx.h" />
@@ -60,9 +60,14 @@ namespace CNTK
        return s_disableAutomaticUnpackingOfPackedValues.load();
    }

    void SetForwardValuesSharing(bool enableSharing)
    void EnableForwardValuesSharing()
    {
        g_shareNodeValueMatrices = enableSharing;
        Microsoft::MSR::CNTK::Globals::EnableShareNodeValueMatrices();
    }

    void EnableHyperMemoryCompress()
    {
        Microsoft::MSR::CNTK::Globals::EnableHyperCompressMemory();
    }

    bool AreEquivalent(const Variable& var1, const Variable& var2, bool allowParameterAndConstantsEquivalence)
@ -544,6 +544,12 @@ namespace CNTK
|
|||
return CompositeFunction::Deserialize(modelDictionary, device);
|
||||
}
|
||||
|
||||
void Function::PrintGraph() const
|
||||
{
|
||||
CompositeFunction::Traverse(RootFunction(), [](const FunctionPtr& function) {
|
||||
});
|
||||
}
|
||||
|
||||
// Names for the reduction operations as used by the CNTK ReduceElementsNode
|
||||
/*static*/ const std::wstring PrimitiveFunction::InternalSumReductionOpName = L"Sum";
|
||||
/*static*/ const std::wstring PrimitiveFunction::InternalLogSumReductionOpName = L"LogSum";
|
||||
|
@ -580,6 +586,8 @@ namespace CNTK
|
|||
/*static*/ const std::wstring PrimitiveFunction::AttributeNameEpsilon = L"epsilon";
|
||||
/*static*/ const std::wstring PrimitiveFunction::AttributeNameUseCuDNNEngine = L"useCuDNNEngine";
|
||||
/*static*/ const std::wstring PrimitiveFunction::AttributeNameNewDynamicAxes = L"newDynamicAxes";
|
||||
/*static*/ const std::wstring PrimitiveFunction::AttributeNameNewSequenceAxisLengthScalingFactor = L"newSequenceAxisLengthScalingFactor";
|
||||
/*static*/ const std::wstring PrimitiveFunction::AttributeNameNewSequenceAxisLengthAdditiveFactor = L"newSequenceAxisLengthAdditiveFactor";
|
||||
/*static*/ const std::wstring PrimitiveFunction::AttributeNameBeginIndex = L"beginIndex";
|
||||
/*static*/ const std::wstring PrimitiveFunction::AttributeNameEndIndex = L"endIndex";
|
||||
/*static*/ const std::wstring PrimitiveFunction::AttributeNameReductionOpName = L"reductionOpName";
|
||||
|
@ -631,7 +639,36 @@ namespace CNTK
|
|||
if ((op == PrimitiveOpType::SumAll) || (op == PrimitiveOpType::SquaredError) || (op == PrimitiveOpType::CrossEntropyWithSoftmax) || (op == PrimitiveOpType::ClassificationError))
|
||||
outputDynamicAxes = std::vector<Axis>({});
|
||||
else if (op == PrimitiveOpType::Where)
|
||||
outputDynamicAxes = AsVector<Axis>(functionConfig[PrimitiveFunction::AttributeNameNewDynamicAxes].Value<std::vector<DictionaryValue>>());
|
||||
{
|
||||
if (functionConfig.Contains(PrimitiveFunction::AttributeNameNewDynamicAxes))
|
||||
outputDynamicAxes = AsVector<Axis>(functionConfig[PrimitiveFunction::AttributeNameNewDynamicAxes].Value<std::vector<DictionaryValue>>());
|
||||
else
|
||||
{
|
||||
if (inputs[0].DynamicAxes() == Axis::UnknownDynamicAxes())
|
||||
outputDynamicAxes = Axis::UnknownDynamicAxes();
|
||||
else
|
||||
{
|
||||
if (functionConfig.Contains(PrimitiveFunction::AttributeNameNewSequenceAxisLengthScalingFactor) &&
|
||||
functionConfig.Contains(PrimitiveFunction::AttributeNameNewSequenceAxisLengthAdditiveFactor))
|
||||
{
|
||||
size_t newSequenceAxisLengthScalingFactor = functionConfig[PrimitiveFunction::AttributeNameNewSequenceAxisLengthScalingFactor].Value<size_t>();
|
||||
int newSequenceAxisLengthAdditiveFactor = functionConfig[PrimitiveFunction::AttributeNameNewSequenceAxisLengthAdditiveFactor].Value<int>();
|
||||
|
||||
auto derivedDynamicAxes = GetDerivedDynamicAxes(inputs[0].DynamicAxes()[0], newSequenceAxisLengthScalingFactor, newSequenceAxisLengthAdditiveFactor);
|
||||
std::copy(derivedDynamicAxes.begin(), derivedDynamicAxes.end(), std::back_inserter(outputDynamicAxes));
|
||||
}
|
||||
else
|
||||
{
|
||||
outputDynamicAxes.push_back(Axis::NewUniqueDynamicAxis(L"whereNodeDynamicAxis"));
|
||||
}
|
||||
|
||||
for (size_t i = 1; i < inputs[0].DynamicAxes().size(); ++i)
|
||||
outputDynamicAxes.push_back(inputs[0].DynamicAxes()[i]);
|
||||
|
||||
functionConfig[PrimitiveFunction::AttributeNameNewDynamicAxes] = AsDictionaryValueVector(outputDynamicAxes);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (op == PrimitiveOpType::ScatterPacked)
|
||||
outputDynamicAxes = inputs[2].DynamicAxes();
|
||||
else if ((op == PrimitiveOpType::PackedIndex) || (op == PrimitiveOpType::GatherPacked))
|
||||
|
@ -1098,7 +1135,7 @@ namespace CNTK
|
|||
std::vector<FunctionPtr> topoSortedPrimitiveFunctions;
|
||||
std::vector<Variable> inputs;
|
||||
std::unordered_set<std::wstring> inputUids;
|
||||
Traverse([&visitedFunctions, &inputs, &topoSortedPrimitiveFunctions, &inputUids](const FunctionPtr& function) {
|
||||
Traverse(RootFunction(), [&visitedFunctions, &inputs, &topoSortedPrimitiveFunctions, &inputUids](const FunctionPtr& function) {
|
||||
std::vector<Variable> functionInputs = function->Inputs();
|
||||
for (const auto& input : functionInputs)
|
||||
{
|
||||
|
@ -2585,7 +2622,7 @@ namespace CNTK
|
|||
|
||||
FunctionPtr Round(const Variable& operand, const std::wstring& name)
|
||||
{
|
||||
return Floor(Plus(operand, Constant::Scalar(operand.GetDataType(), 0.5)), name);
|
||||
return Floor(Plus(operand, Constant::Scalar(0.5f)), name);
|
||||
}
|
||||
|
||||
FunctionPtr Floor(const Variable& operand, const std::wstring& name)
|
||||
|
@ -2633,11 +2670,9 @@ namespace CNTK
|
|||
|
||||
return TransposeAxes(operand, Axis(0), Axis(1), name);
|
||||
}
|
||||
|
||||
FunctionPtr Slice(const Variable& operand, const Axis& axis, int beginIndex, int endIndex, const std::wstring& name)
|
||||
{
|
||||
if (axis == Axis::DefaultBatchAxis())
|
||||
LogicError("Slice is currently unsupported along the batch axis");
|
||||
|
||||
if (axis.IsStaticAxis())
|
||||
{
|
||||
if ((endIndex - beginIndex) <= 0)
|
||||
|
@ -2646,46 +2681,10 @@ namespace CNTK
|
|||
return Internal::Slice(operand, axis, beginIndex, endIndex, name);
|
||||
}
|
||||
|
||||
if ((beginIndex == 0) && (endIndex == 0))
|
||||
return operand;
|
||||
if (axis == Axis::DefaultBatchAxis())
|
||||
LogicError("Slice is currently unsupported along the batch axis");
|
||||
|
||||
auto operandAxes = operand.DynamicAxes();
|
||||
auto findAxis = std::find(operandAxes.begin(), operandAxes.end(), axis);
|
||||
if (findAxis == operandAxes.end())
|
||||
InvalidArgument("The specified dynamic axis named %S does not match any of the dynamic axes of the operand", axis.Name().c_str());
|
||||
|
||||
auto beginFlagsLambda = [beginIndex, operand]() {
|
||||
return (beginIndex > 0) ? Minus(Constant::Scalar(operand.GetDataType(), 1.0), Internal::IsWithin(operand, beginIndex)) : Internal::IsWithin(operand, beginIndex);
|
||||
};
|
||||
|
||||
auto endFlagsLambda = [endIndex, operand]() {
|
||||
return (endIndex > 0) ? Internal::IsWithin(operand, endIndex) : Minus(Constant::Scalar(operand.GetDataType(), 1.0), Internal::IsWithin(operand, endIndex));
|
||||
};
|
||||
|
||||
FunctionPtr flags;
|
||||
if (beginIndex == 0)
|
||||
flags = endFlagsLambda();
|
||||
else if (endIndex == 0)
|
||||
flags = beginFlagsLambda();
|
||||
else
|
||||
flags = ElementTimes(beginFlagsLambda(), endFlagsLambda());
|
||||
|
||||
// Since we are slicing along a dynamic axis, the output variable's dynamic axes will be different than the operand
|
||||
std::vector<Axis> newDynamicAxes;
|
||||
for (auto operandAxis : operandAxes)
|
||||
{
|
||||
if (operandAxis == axis)
|
||||
{
|
||||
int sliceLength = (endIndex - beginIndex);
|
||||
size_t multiplicativeFactor = (sliceLength > 0) ? 0 : 1;
|
||||
auto derivedDynamicAxes = GetDerivedDynamicAxes(operandAxis, multiplicativeFactor, sliceLength);
|
||||
std::copy(derivedDynamicAxes.begin(), derivedDynamicAxes.end(), std::back_inserter(newDynamicAxes));
|
||||
}
|
||||
else
|
||||
newDynamicAxes.push_back(operandAxis);
|
||||
}
|
||||
|
||||
return Internal::Gather(operand, flags, newDynamicAxes, name);
|
||||
LogicError("CNTK::Slice: Invalid axis argument provided. To slice a sequence along its ordered dynamic axis use Sequence::Slice.");
|
||||
}
|
||||
|
||||
FunctionPtr RandomSample(const Variable& operand, size_t numSamples, bool allowDuplicates, const std::wstring& name)
|
||||
|
@ -2721,6 +2720,7 @@ namespace CNTK
|
|||
|
||||
return UnaryOp(PrimitiveOpType::Reshape, operand, std::move(additionalProperties), name);
|
||||
}
|
||||
|
||||
FunctionPtr BinaryOp(PrimitiveOpType op, const Variable& leftOperand, const Variable& rightOperand, Dictionary&& opConfig, const std::wstring& name)
|
||||
{
|
||||
std::vector<Variable> operands = { leftOperand, rightOperand };
|
||||
|
@ -2815,14 +2815,14 @@ namespace CNTK
|
|||
if (topN == 1)
|
||||
{
|
||||
if (axis == Axis(0))
|
||||
return Minus(Constant::Scalar(prediction.GetDataType(), 1.0), TransposeTimes(labels, Hardmax(prediction)), name);
|
||||
return Minus(Constant::Scalar(1.0f), TransposeTimes(labels, Hardmax(prediction)), name);
|
||||
else
|
||||
{
|
||||
auto axMax = ReduceMax(prediction, axis);
|
||||
auto pred = Equal(prediction, axMax);
|
||||
auto wrongPred = NotEqual(labels, pred);
|
||||
auto axErr = ReduceSum(wrongPred, axis);
|
||||
auto capErr = GreaterEqual(axErr, Constant::Scalar(prediction.GetDataType(), 1.0));
|
||||
auto capErr = GreaterEqual(axErr, Constant::Scalar(1.0f));
|
||||
return ReduceMean(capErr, Axis::AllStaticAxes(), name);
|
||||
}
|
||||
}
|
||||
|
@ -2831,7 +2831,7 @@ namespace CNTK
|
|||
if (axis != Axis(0))
|
||||
LogicError("ClassificationError along a specific axis does not support topN!");
|
||||
|
||||
std::vector<Variable> operands = { prediction, labels, Constant::Scalar(prediction.GetDataType(), (double)topN) };
|
||||
std::vector<Variable> operands = { prediction, labels, Constant::Scalar((float)topN) };
|
||||
return CompositeFunction::Create(MakeSharedObject<PrimitiveFunction>(PrimitiveOpType::ClassificationError, operands, Dictionary(), name), name);
|
||||
}
|
||||
}
|
||||
|
@ -3011,75 +3011,113 @@ namespace CNTK

    {
        // TODO: This is a temporary and expensive hack until we have a real alias implementation
        // that does not waste memory and compute cycles
        return Plus(operand, Constant::Scalar(operand.GetDataType(), 0), name);
        return Plus(operand, Constant::Scalar(0.0f), name);
    }

    namespace Sequence
    {
        void VerifyIsSequence(const Variable& operand)
        {
            // The operand must have at least one dynamic axis and its first dynamic axis must be ordered
            if (operand.DynamicAxes().empty() || !operand.DynamicAxes()[0].IsOrdered())
            // The operand must have at least one dynamic axis
            if (operand.DynamicAxes().empty())
                InvalidArgument("A sequence function can only be applied on operands with at least one dynamic axis and whose first dynamic axis is ordered");
        }

        FunctionPtr IsFirst(const Variable& operand, const std::wstring& name)
        {
            VerifyIsSequence(operand);
            return Internal::IsWithin(operand, 1, name);
        }

        FunctionPtr IsLast(const Variable& operand, const std::wstring& name)
        {
            VerifyIsSequence(operand);
            return Internal::IsWithin(operand, -1, name);
        }

        FunctionPtr Slice(const Variable& operand, int beginIndex, int endIndex, const std::wstring& name)
        {
            VerifyIsSequence(operand);

            if ((beginIndex == 0) && (endIndex == 0))
                return operand;

            auto beginFlagsLambda = [beginIndex, operand]() {
                return (beginIndex > 0) ? Minus(Constant::Scalar(1.0f), Internal::IsWithin(operand, beginIndex)) : Internal::IsWithin(operand, beginIndex);
            };

            auto endFlagsLambda = [endIndex, operand]() {
                return (endIndex > 0) ? Internal::IsWithin(operand, endIndex) : Minus(Constant::Scalar(1.0f), Internal::IsWithin(operand, endIndex));
            };

            FunctionPtr flags;
            if (beginIndex == 0)
                flags = endFlagsLambda();
            else if (endIndex == 0)
                flags = beginFlagsLambda();
            else
                flags = ElementTimes(beginFlagsLambda(), endFlagsLambda());

            int sliceLength = (endIndex - beginIndex);
            size_t multiplicativeFactor = (sliceLength > 0) ? 0 : 1;

            return Internal::Gather(operand, flags, { multiplicativeFactor, sliceLength }, name);
        }

        FunctionPtr First(const Variable& operand, const std::wstring& name)
        {
            VerifyIsSequence(operand);
            return Slice(operand, operand.DynamicAxes()[0], 0, 1, name);
            return Sequence::Slice(operand, 0, 1, name);
        }

        FunctionPtr Last(const Variable& operand, const std::wstring& name)
        {
            VerifyIsSequence(operand);
            return Slice(operand, operand.DynamicAxes()[0], -1, 0, name);
        }

        std::vector<Axis> WhereOpDynamicAxes(const Variable& operand)
        {
            VerifyIsSequence(operand);

            std::vector<Axis> newDynamicAxes = { Axis::NewUniqueDynamicAxis(L"whereNodeDynamicAxis") };
            for (size_t i = 1; i < operand.DynamicAxes().size(); ++i)
                newDynamicAxes.push_back(operand.DynamicAxes()[i]);

            return newDynamicAxes;
            return Sequence::Slice(operand, -1, 0, name);
        }

        FunctionPtr Where(const Variable& condition, const std::wstring& name)
        {
            return Internal::Where(condition, WhereOpDynamicAxes(condition), name);
            return UnaryOp(PrimitiveOpType::Where, condition, Dictionary(), name);
        }

        FunctionPtr Gather(const Variable& operand, const Variable& condition, const std::wstring& name)
        {
            return Internal::Gather(operand, condition, WhereOpDynamicAxes(condition), name);
            return Internal::Gather(operand, condition, name);
        }

        FunctionPtr Scatter(const Variable& operand, const Variable& condition, const std::wstring& name)
        {
            return Internal::Scatter(operand, condition, WhereOpDynamicAxes(condition), name);
            return Internal::Scatter(operand, condition, name);
        }

        FunctionPtr BroadcastAs(const Variable& operand, const Variable& broadcastAs, const std::wstring& name)
        {
            auto dataPadded = Internal::Scatter(operand, Sequence::IsFirst(broadcastAs), operand.DynamicAxes());
            auto dataPadded = Internal::Scatter(operand, Sequence::IsFirst(broadcastAs), std::make_pair<size_t, int>(0, 1));
            auto placeHolderOutput = PlaceholderVariable(operand.Shape(), broadcastAs.DynamicAxes());
            auto output = ElementSelect(Sequence::IsFirst(broadcastAs), dataPadded, PastValue(placeHolderOutput), name);
            return output->ReplacePlaceholders({ { placeHolderOutput, output } });
        }

        FunctionPtr ReduceElements(const Variable& operand, const std::wstring& reductionOpName, const std::wstring& name)
        {
            using namespace std::placeholders;

            std::function<FunctionPtr(const Variable& leftOperand, const Variable& rightOperand)> reductionFunctor;
            if (reductionOpName == PrimitiveFunction::InternalSumReductionOpName)
                reductionFunctor = std::bind(Plus, _1, _2, L"");
            else
                LogicError("%S reduction along dynamic axis is currently unsupported", reductionOpName.c_str());

            // We are reducing over a dynamic axis which is currently implemented using recurrence
            auto cumulativeSumFunctionPlaceholder = PlaceholderVariable(operand.Shape());
            auto prevAccumulatedValuesFunction = PastValue(cumulativeSumFunctionPlaceholder);
            auto cumulativeSumFunction = reductionFunctor(prevAccumulatedValuesFunction, operand);
            cumulativeSumFunction->ReplacePlaceholders({ { cumulativeSumFunctionPlaceholder, cumulativeSumFunction } });

            return Sequence::Slice(cumulativeSumFunction, -1, 0, name);
        }

        FunctionPtr ReduceSum(const Variable& operand, const std::wstring& name)
        {
            return ReduceElements(operand, PrimitiveFunction::InternalSumReductionOpName, name);
        }
    }

    namespace Internal
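Note (not part of the commit): the flag algebra in Sequence::Slice above is compact, so here is a minimal standalone sketch that replays it on a plain vector to show which frames survive. Nothing below is CNTK API; isWithin only mimics the contract of Internal::IsWithin, and Gather would keep the positions whose flag is 1.

    // Standalone illustration of the Sequence::Slice mask algebra on a toy sequence.
    #include <iostream>
    #include <vector>

    // Mimics Internal::IsWithin: 1 for the first 'offset' steps if offset > 0,
    // 1 for the last '-offset' steps if offset < 0, all zeros if offset == 0.
    std::vector<int> isWithin(size_t len, int offset)
    {
        std::vector<int> m(len, 0);
        if (offset > 0)
            for (int t = 0; t < offset && t < (int)len; ++t)
                m[t] = 1;
        else
            for (int t = 0; t < -offset && t < (int)len; ++t)
                m[len - 1 - t] = 1;
        return m;
    }

    int main()
    {
        const size_t len = 6;
        const int beginIndex = 1, endIndex = -1; // like Slice(x, 1, -1): drop first and last frame

        std::vector<int> begin(len), end(len), flags(len);
        for (size_t t = 0; t < len; ++t)
        {
            begin[t] = (beginIndex > 0) ? 1 - isWithin(len, beginIndex)[t] : isWithin(len, beginIndex)[t];
            end[t]   = (endIndex > 0)   ? isWithin(len, endIndex)[t]       : 1 - isWithin(len, endIndex)[t];
            if (beginIndex == 0)    flags[t] = end[t];         // endFlagsLambda only
            else if (endIndex == 0) flags[t] = begin[t];       // beginFlagsLambda only
            else                    flags[t] = begin[t] * end[t]; // ElementTimes of both masks
        }

        for (int f : flags)
            std::cout << f << ' '; // prints: 0 1 1 1 1 0
        std::cout << '\n';
    }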
@ -3092,9 +3130,9 @@ namespace CNTK

            InvalidArgument("CNTK::Sequence::IsWithin: The offset must be positive");

        if (offset > 0)
            return PastValue(Internal::ZeroesWithDynamicAxesLike(operand), Constant::Scalar(operand.GetDataType(), 1.0), offset, name);
            return PastValue(Internal::ZeroesWithDynamicAxesLike(operand), Constant::Scalar(1.0f), offset, name);
        else
            return FutureValue(Internal::ZeroesWithDynamicAxesLike(operand), Constant::Scalar(operand.GetDataType(), 1.0), -offset, name);
            return FutureValue(Internal::ZeroesWithDynamicAxesLike(operand), Constant::Scalar(1.0f), -offset, name);
    }

    FunctionPtr PackedIndex(const Variable& operand, const Variable& index, const std::wstring& name)
@ -3131,21 +3169,32 @@ namespace CNTK

        }
    }

    FunctionPtr Where(const Variable& condition, const std::vector<Axis>& newDynamicAxes, const std::wstring& name)
    FunctionPtr Where(const Variable& condition, const std::pair<size_t, int>& newDerivedSequenceAxisScalingAndAdditiveFactor, const std::wstring& name)
    {
        auto additionalProperties = Dictionary();
        additionalProperties[PrimitiveFunction::AttributeNameNewDynamicAxes] = AsDictionaryValueVector(newDynamicAxes);
        additionalProperties[PrimitiveFunction::AttributeNameNewSequenceAxisLengthScalingFactor] = newDerivedSequenceAxisScalingAndAdditiveFactor.first;
        additionalProperties[PrimitiveFunction::AttributeNameNewSequenceAxisLengthAdditiveFactor] = newDerivedSequenceAxisScalingAndAdditiveFactor.second;
        return UnaryOp(PrimitiveOpType::Where, condition, std::move(additionalProperties), name);
    }

    FunctionPtr Gather(const Variable& operand, const Variable& condition, const std::vector<Axis>& newDynamicAxes, const std::wstring& name)
    FunctionPtr Gather(const Variable& operand, const Variable& condition, const std::wstring& name)
    {
        return Internal::GatherPacked(operand, Internal::PackedIndex(/*layout of*/ operand, Where(condition, newDynamicAxes)), name);
        return Internal::GatherPacked(operand, Internal::PackedIndex(/*layout of*/ operand, Sequence::Where(condition)), name);
    }

    FunctionPtr Scatter(const Variable& operand, const Variable& condition, const std::vector<Axis>& whereNodeDynamicAxes, const std::wstring& name)
    FunctionPtr Gather(const Variable& operand, const Variable& condition, const std::pair<size_t, int>& newDerivedSequenceAxisScalingAndAdditiveFactor, const std::wstring& name)
    {
        return Internal::ScatterPacked(operand, Internal::PackedIndex(/*layout of*/ condition, Where(condition, whereNodeDynamicAxes)), /*layout of*/ condition, name);
        return Internal::GatherPacked(operand, Internal::PackedIndex(/*layout of*/ operand, Where(condition, newDerivedSequenceAxisScalingAndAdditiveFactor)), name);
    }

    FunctionPtr Scatter(const Variable& operand, const Variable& condition, const std::wstring& name)
    {
        return Internal::ScatterPacked(operand, Internal::PackedIndex(/*layout of*/ condition, Sequence::Where(condition)), /*layout of*/ condition, name);
    }

    FunctionPtr Scatter(const Variable& operand, const Variable& condition, const std::pair<size_t, int>& newDerivedSequenceAxisScalingAndAdditiveFactor, const std::wstring& name)
    {
        return Internal::ScatterPacked(operand, Internal::PackedIndex(/*layout of*/ condition, Where(condition, newDerivedSequenceAxisScalingAndAdditiveFactor)), /*layout of*/ condition, name);
    }

    FunctionPtr Slice(const Variable& operand, const Axis& axis, int beginIndex, int endIndex, const std::wstring& name)
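Note (not part of the commit): the attribute names suggest how the new pair replaces the explicit dynamic-axis vectors. A hedged reading, consistent with the pairs the code above actually passes, is that the derived sequence-axis length follows derivedLength = scaling * inputLength + additive; the real evaluation lives elsewhere in CNTK, so treat this sketch as an assumption, not the library's definition.

    // Standalone sketch of the assumed {scaling, additive} relation.
    // Matches the pairs passed above: {0, sliceLength} for a fixed window,
    // {1, sliceLength} (negative) for an open-ended slice, {0, 1} in BroadcastAs.
    #include <cstddef>
    #include <iostream>

    std::size_t DerivedSequenceLength(std::size_t inputLength, std::size_t scaling, int additive)
    {
        return static_cast<std::size_t>(scaling * inputLength + additive);
    }

    int main()
    {
        std::cout << DerivedSequenceLength(20, 0, 3)  << '\n'; // 3:  fixed window, as in Slice(x, 0, 3)
        std::cout << DerivedSequenceLength(20, 1, -2) << '\n'; // 18: drop two frames, as in Slice(x, 2, 0)
        std::cout << DerivedSequenceLength(20, 0, 1)  << '\n'; // 1:  single frame, as in BroadcastAs padding
    }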
@ -3160,8 +3209,6 @@ namespace CNTK

    FunctionPtr ReduceElements(const Variable& operand, const std::wstring& reductionOpName, const Axis& axis, const std::wstring& name)
    {
        using namespace std::placeholders;

        if (axis.IsStaticAxis() || (axis == Axis::AllStaticAxes()))
        {
            auto additionalProperties = Dictionary();
@ -3173,20 +3220,7 @@ namespace CNTK

        if (axis == Axis::DefaultBatchAxis())
            LogicError("Reduction is currently unsupported along the batch axis");

        if (reductionOpName != PrimitiveFunction::InternalSumReductionOpName)
            LogicError("%S reduction along dynamic axis is currently unsupported", reductionOpName.c_str());

        std::function<FunctionPtr(const Variable& leftOperand, const Variable& rightOperand)> reductionFunctor;
        if (reductionOpName == PrimitiveFunction::InternalSumReductionOpName)
            reductionFunctor = std::bind(Plus, _1, _2, L"");

        // We are reducing over a dynamic axis which is currently implemented using recurrence
        auto cumulativeSumFunctionPlaceholder = PlaceholderVariable(operand.Shape());
        auto prevAccumulatedValuesFunction = PastValue(cumulativeSumFunctionPlaceholder);
        auto cumulativeSumFunction = reductionFunctor(prevAccumulatedValuesFunction, operand);
        cumulativeSumFunction->ReplacePlaceholders({ { cumulativeSumFunctionPlaceholder, cumulativeSumFunction } });

        return CNTK::Slice(cumulativeSumFunction, axis, -1, 0, name);
        LogicError("CNTK::ReduceElements: Invalid axis argument provided. To reduce a sequence along its ordered dynamic axis use Sequence::ReduceElements.");
    }
}
}
@ -187,6 +187,8 @@ namespace CNTK

        static const std::wstring AttributeNameEpsilon;
        static const std::wstring AttributeNameUseCuDNNEngine;
        static const std::wstring AttributeNameNewDynamicAxes;
        static const std::wstring AttributeNameNewSequenceAxisLengthScalingFactor;
        static const std::wstring AttributeNameNewSequenceAxisLengthAdditiveFactor;
        static const std::wstring AttributeNameBeginIndex;
        static const std::wstring AttributeNameEndIndex;
        static const std::wstring AttributeNameReductionOpName;
@ -699,22 +701,11 @@ namespace CNTK

            return CompositeFunctionOpName;
        }

    private:
        virtual void ReplacePlaceholdersInPlace(const std::unordered_map<Variable, Variable>& placeholderReplacements,
                                                std::unordered_set<const Function*>& visitedFunctions,
                                                std::unordered_set<Variable>& replacedPlaceholders) override;

        CompositeFunction(const FunctionPtr& rootFunction, std::unordered_set<FunctionPtr>&& allPrimitiveFunctions, const std::wstring& name, const std::wstring& uid = Internal::GenerateUid(L"CompositeFunction"))
            : Function({}, rootFunction->Outputs(), Dictionary(), rootFunction, name, uid),
              m_allPrimitiveFunctions(std::move(allPrimitiveFunctions)), m_networkMatricesAllocated(false)
        {}

        template <typename FunctionType>
        void Traverse(const FunctionType& functor) const
        static void Traverse(const FunctionPtr& rootFunction, const FunctionType& functor)
        {
            const auto& root = RootFunction();
            std::unordered_set<FunctionPtr> visitedFunctions;
            Traverse(root, visitedFunctions, functor);
            Traverse(rootFunction, visitedFunctions, functor);
        }

        // Recursively traverses the Function graph underlying the 'rootFunction' invoking the provided functor for all visited nodes in the graph.
@ -735,6 +726,16 @@ namespace CNTK

            }
        }

    private:
        virtual void ReplacePlaceholdersInPlace(const std::unordered_map<Variable, Variable>& placeholderReplacements,
                                                std::unordered_set<const Function*>& visitedFunctions,
                                                std::unordered_set<Variable>& replacedPlaceholders) override;

        CompositeFunction(const FunctionPtr& rootFunction, std::unordered_set<FunctionPtr>&& allPrimitiveFunctions, const std::wstring& name, const std::wstring& uid = Internal::GenerateUid(L"CompositeFunction"))
            : Function({}, rootFunction->Outputs(), Dictionary(), rootFunction, name, uid),
              m_allPrimitiveFunctions(std::move(allPrimitiveFunctions)), m_networkMatricesAllocated(false)
        {}

        std::vector<Variable> DetermineInputs() const
        {
            const auto& root = RootFunction();
@ -1,10 +0,0 @@

//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//

#include "stdafx.h"

// TODO: Currently there are some known issues with memory sharing for forward pass output matrices that
// need to be addressed before we can switch to using memory sharing by default here.
bool g_shareNodeValueMatrices = false;
@ -13,4 +13,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {

    std::atomic<bool> Globals::m_forceDeterministicAlgorithms(false);
    std::atomic<bool> Globals::m_forceConstantRandomSeed(false);

}}}
    std::atomic<bool> Globals::m_enableShareNodeValueMatrices(false);
    std::atomic<bool> Globals::m_enableHyperCompressMemory(false);

}}}
@ -22,8 +22,32 @@ namespace Microsoft { namespace MSR { namespace CNTK {

    // TODO: Currently the flag is set to false. Should be switched to true after more rigorous testing.
    static bool UseV2Aggregator() { return false; }

    static void EnableShareNodeValueMatrices()
    {
        m_enableShareNodeValueMatrices = true;
    }

    static bool ShouldEnableShareNodeValueMatrices()
    {
        return m_enableShareNodeValueMatrices;
    }

    static void EnableHyperCompressMemory()
    {
        m_enableHyperCompressMemory = true;
    }

    static bool ShouldEnableHyperCompressMemory()
    {
        return m_enableHyperCompressMemory;
    }

private:
    static std::atomic<bool> m_forceDeterministicAlgorithms;
    // The global flag that enables sharing of node value matrices between forward and backward prop
    static std::atomic<bool> m_enableShareNodeValueMatrices;
    // The global flag that enables hyper memory compression
    static std::atomic<bool> m_enableHyperCompressMemory;
    static std::atomic<bool> m_forceConstantRandomSeed;
};
}}}
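Note (not part of the commit): the two new accessors form a one-way switch: the Enable* calls flip an atomic once at startup and the Should* queries are read on the hot path. A minimal sketch of the intended call pattern; the surrounding main() is hypothetical, only the Globals members come from this change.

    // Hypothetical startup sketch: flip the switches once, before any network
    // is built, then only query them from compute code.
    #include "Globals.h" // assumed include path

    using namespace Microsoft::MSR::CNTK;

    int main()
    {
        // In practice driven by config (see CNTKEvalBase::Init further below).
        Globals::EnableShareNodeValueMatrices();
        Globals::EnableHyperCompressMemory();

        // Hot-path code only reads the atomics; there is no way to switch back off.
        bool share    = Globals::ShouldEnableShareNodeValueMatrices();
        bool compress = Globals::ShouldEnableHyperCompressMemory();
        return (share && compress) ? 0 : 1;
    }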
@ -1002,7 +1002,7 @@ void ComputationNetwork::AllocateAllMatrices(const std::vector<ComputationNodeBa

    // Due to special topology, if a node is solely induced by parameters, its function value should not be shared
    MarkValueNonSharableNodes();

    bool performingBackPropagation = (trainRootNode != nullptr);
    bool performingBackPropagation = (trainRootNode != nullptr) || (Globals::ShouldEnableHyperCompressMemory());

    // Create a composite Eval order with the specified nodes as roots
    // For each node determine parents and whether the output of the
@ -61,7 +61,7 @@

  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="$(ReleaseBuild)">
    <ClCompile>
      <AdditionalOptions>/d2Zi+ %(AdditionalOptions)</AdditionalOptions>
      <AdditionalOptions>/d2Zi+ /bigobj %(AdditionalOptions)</AdditionalOptions>
    </ClCompile>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="$(CpuOnlyBuild)">

@ -136,4 +136,4 @@

  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets" />
</Project>
</Project>
@ -12,6 +12,7 @@

#include "TensorShape.h"
#include "MatrixPool.h"
#include "ComputationEnvironment.h"
#include "Globals.h"

#include <unordered_set>
#include <map>

@ -46,8 +47,6 @@

#define CNTK_MODEL_VERSION_15 15 // add new nodes: LambdaRankNode and NDCG1Eval
#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_15

extern bool g_shareNodeValueMatrices;

// helper mode for debugging
// If TRACK_GAP_NANS is defined then initialize layout gaps to NaN and do NaN checks. Also do detailed logging of node computations.
// #define TRACK_GAP_NANS
@ -768,7 +767,11 @@ public:

    virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const { return true; }

    void SetOutputNeededDuringBackprop(bool f) { m_outputNeededDuringBackprop = f; }
    bool IsOutputNeededDuringBackprop() const { return !g_shareNodeValueMatrices || m_outputNeededDuringBackprop; }
    bool IsOutputNeededDuringBackprop() const
    {
        return (!Globals::ShouldEnableShareNodeValueMatrices() && !Globals::ShouldEnableHyperCompressMemory())
            || m_outputNeededDuringBackprop;
    }

    // -----------------------------------------------------------------------
    // helpers for network traversal
@ -1631,6 +1634,20 @@ public:

#endif
        // tracing
        Trace();

        // When HyperCompressMemory is active, any memory that is no longer needed can be resized to zero immediately. The memory is not
        // actually released: the blocks are gathered into a memory pool, and the best-fitting block is handed out for the next request.
        if (Globals::ShouldEnableHyperCompressMemory())
        {
            for (auto& input : GetInputs())
            {
                if (!input->IsOutputNeededDuringBackprop())
                {
                    auto inputNodePtr = DownCast(input);
                    inputNodePtr->Value().Resize(0, 0);
                }
            }
        }
    }

#if 0 // (keep it around in case we need to add stuff in the future)
@ -1640,9 +1657,9 @@ public:

    }
#endif

#ifdef _DEBUG
    virtual void /*IComputationNode::*/ EndBackprop() override
    {
#ifdef _DEBUG
        Base::EndBackprop();
#ifdef TRACK_GAP_NANS
        for (size_t i = 0; i < m_inputs.size(); i++)

@ -1656,8 +1673,18 @@ public:

            }
        }
#endif
    }
#endif
        // We can release the gradients of value-sharable nodes, along with any no-longer-used memory produced in forward prop.
        if (IsValueSharable() && Globals::ShouldEnableHyperCompressMemory())
        {
            if (GradientPtr())
                Gradient().Resize(0, 0);

            // canceling the graph dependency
            if (IsOutputNeededDuringBackprop())
                Value().Resize(0, 0);
        }
    }

    // this is the entry point from Network; while it will call virtual BackpropTo() into the actual node implementation
    // TODO: move to -Base (or -Network?)
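Note (not part of the commit): in both hooks above, "release" is just Resize(0, 0). A toy model of why that is a cheap logical free once a pool sits underneath; Pool and Buf are hypothetical stand-ins for BufferManagement and Matrix, not CNTK types.

    #include <cstddef>
    #include <map>

    struct Pool
    {
        std::multimap<std::size_t, float*> cache; // size -> spare buffer

        // 'n' is updated to the real block size, mirroring RequestBuffer below
        float* request(std::size_t& n)
        {
            auto it = cache.lower_bound(n);
            if (it != cache.end())
            {
                float* p = it->second;
                n = it->first;
                cache.erase(it);
                return p; // reuse, no allocation
            }
            return new float[n];
        }

        void release(float* p, std::size_t n) { cache.emplace(n, p); } // no delete
        ~Pool() { for (auto& kv : cache) delete[] kv.second; }
    };

    struct Buf
    {
        Pool& pool;
        float* data = nullptr;
        std::size_t n = 0;

        explicit Buf(Pool& pl) : pool(pl) {}
        void resize(std::size_t m)
        {
            if (m == n) return;
            if (data) pool.release(data, n);   // logical free: just a map insert
            data = nullptr;
            n = m;
            if (m > 0) data = pool.request(n); // may hand back a cached block
        }
        ~Buf() { resize(0); }
    };

    int main()
    {
        Pool pool;
        Buf v(pool);
        v.resize(1024); // allocates
        v.resize(0);    // returns the block to the pool; nothing is deleted
        v.resize(512);  // reuses the cached 1024-element block
    }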
@ -2506,7 +2506,7 @@ public:

        if (expAvgFactor != 0 || blendFactor != 1)
            m_samplesSeen += GetMBLayout()->GetActualNumSamples();

        Base::EndBackprop();
        Base::EndForwardProp();
    }

    virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
@ -30,11 +30,6 @@

#include "latticearchive.h"
#include <limits>

// TODO: Temporary mechanism to enable memory sharing for
// node output value matrices. This will go away when the
// sharing is ready to be enabled by default
bool g_shareNodeValueMatrices = false;

namespace Microsoft { namespace MSR { namespace CNTK {

@ -44,7 +39,10 @@ void CNTKEvalBase<ElemType>::Init(const std::string& config)

    m_config.Parse(config);
    size_t nThreads = m_config("numCPUThreads", "1");
    CPUMatrix<ElemType>::SetNumThreads(nThreads);
    g_shareNodeValueMatrices = m_config(L"shareNodeValueMatrices", false);
    if (m_config(L"shareNodeValueMatrices", false))
        Globals::EnableShareNodeValueMatrices();
    if (m_config(L"hyperCompressMemory", false))
        Globals::EnableHyperCompressMemory();
}
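Note (not part of the commit): with this change the two memory knobs become plain boolean entries in the eval configuration. A hedged sketch of what a caller might pass; only shareNodeValueMatrices and hyperCompressMemory are introduced by this change (both default to false), and numCPUThreads is shown merely because Init reads it above.

    # Hypothetical eval configuration fragment
    numCPUThreads=4
    shareNodeValueMatrices=true
    hyperCompressMemory=true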
@ -14,9 +14,12 @@

#endif

#include "Basics.h"
#include "basetypes.h"
#include <string>
#include <stdint.h>
#include <memory>
#include <unordered_map>
#include <map>

#pragma warning( disable: 4251 )
typedef unsigned char byte;

@ -38,6 +41,8 @@ typedef unsigned char byte;

#define GPUSPARSE_INDEX_TYPE int // cuSparse only supports int array indexes
#define CPUSPARSE_INDEX_TYPE int // to be consistent with cuSparse but limited the possible size of the matrix.

#define MEM_MAX_LIMIT_TIMES 2 // the maximum factor by which a cached memory block may exceed the requested size and still be handed out

namespace Microsoft { namespace MSR { namespace CNTK {

MATH_API void SetMathLibTraceLevel(int traceLevel);
@ -61,11 +66,13 @@ public:

    template <typename AllocatedElemType>
    static void Free(int deviceId, AllocatedElemType* bufferPtr, bool ignoreCUDARetCode = false);

    // Made public so that the memory manager can check the total free memory and decide whether to physically
    // release all the cached memory.
    static std::pair<size_t, size_t> GetFreeAndTotalMemoryInMBs(int deviceId);

private:
    template <typename AllocatedElemType>
    static AllocatedElemType* AllocateNoTrace(int deviceId, size_t numElements);

    static std::pair<size_t, size_t> GetFreeAndTotalMemoryInMBs(int deviceId);
};

// -----------------------------------------------------------------------
@ -205,6 +212,158 @@ enum MatrixFlags

    matrixFlagSetValueOnDevice = 1 << bitPosSetValueOnDevice, // SetValue() call has a buffer that is already on the device
};


// -----------------------------------------------------------------------
// BufferManagement -- controls the allocation and release of memory
//
// 1. The goal of buffer management
// The most memory is saved by releasing each buffer as soon as the rest of the mini-batch no longer needs it, but doing so
// with physical allocations adds memory-operation overhead and slows execution down. One way around this is to statically
// link all nodes during a pre-computation pass and reuse their memory at runtime, known in CNTK as shared node value
// matrices. The other is a buffer pool that takes over allocation and release requests: unlike physical memory operations,
// these logical operations cost nearly nothing. Since the second option, implemented as BufferManagement below, controls
// every memory operation, including small ones such as the convolution workspace, allocates by size, and makes it easy to
// plug in new allocation strategies, it is usually the more powerful of the two.
// 2. How it works
// BufferManagement is hooked into the Resize function: Resize calls RequestBuffer and LogicalReleaseBuffer in place of the
// original allocation and release calls. BufferManagement is a per-device singleton, obtained via GetManagerInstance.
// Resize also has a growOnly flag, which normally avoids reallocation unless the size increases; because pool allocation is
// nearly free, growOnly is effectively disabled in BufferManagement mode.
// -----------------------------------------------------------------------
class BufferManagement
{
private:
    BufferManagement() = default;

    // Disable copy & move so the per-device singleton instances stay unique
    DISABLE_COPY_AND_MOVE(BufferManagement);

public:
    static BufferManagement& GetManagerInstance(DEVICEID_TYPE deviceId)
    {
        static std::mutex instanceLock;
        auto instance = m_instances.find(deviceId);
        if (instance == m_instances.end())
        {
            std::lock_guard<std::mutex> lock(instanceLock);
            instance = m_instances.find(deviceId); // re-check under the lock; another thread may have created the instance meanwhile
            if (instance == m_instances.end())
            {
                instance = m_instances.insert(std::make_pair(deviceId, std::unique_ptr<BufferManagement>(
                    new BufferManagement()))).first;
                instance->second->m_deviceId = deviceId;
                instance->second->m_totalManageSize = 0;
                instance->second->m_totalAllocSize = 0;
            }
        }
        return *(instance->second);
    }

    // To serve a request, look in the buffer container first; if nothing fits, allocate a new buffer.
    // When serving from the cache, 'size' is updated to the real size of the returned buffer.
    template<class ElemType>
    ElemType* RequestBuffer(size_t& size)
    {
        ElemType* bufferPtr = nullptr;
        auto& bufferContainer = BufferContainer<ElemType>();

        // simple best-fit by size; a more efficient or elaborate algorithm could be implemented here
        auto bufferHint = bufferContainer.lower_bound(size);
        if (bufferHint != bufferContainer.end() && bufferHint->first < size * MEM_MAX_LIMIT_TIMES)
        {
            bufferPtr = bufferHint->second;
            size = bufferHint->first;
            m_totalManageSize -= size;
            bufferContainer.erase(bufferHint);
            return bufferPtr;
        }

        if (m_deviceId >= 0)
        {
#ifndef CPUONLY
            auto deviceSize = TracingGPUMemoryAllocator::GetFreeAndTotalMemoryInMBs(m_deviceId);
            float freeMemoryRatio = (float)deviceSize.first / deviceSize.second;
            if (freeMemoryRatio < 0.05f || (deviceSize.first << 20) / sizeof(ElemType) < size)
            {
                PhysicalReleaseAllBuffer<ElemType>();
            }
            bufferPtr = TracingGPUMemoryAllocator::Allocate<ElemType>(m_deviceId, size);
            m_totalAllocSize += size;
#endif
        }
        else
        {
            // first, try a no-throw allocation;
            // if it fails, empty the cache and re-try with a throwing allocation;
            // if that fails again, let the system throw the bad_alloc exception
            bufferPtr = new (std::nothrow) ElemType[size];
            if (!bufferPtr)
            {
                PhysicalReleaseAllBuffer<ElemType>();
                bufferPtr = new ElemType[size];
            }
            m_totalAllocSize += size;
        }

        return bufferPtr;
    }

    // return the buffer to the buffer container, keyed by its size
    template<class ElemType>
    void LogicalReleaseBuffer(ElemType* buffer, size_t size)
    {
        auto& bufferContainer = BufferContainer<ElemType>();
        bufferContainer.insert(std::make_pair(size, buffer));
        m_totalManageSize += size;
    }

    // physically release the buffer
    template<class ElemType>
    void PhysicalReleaseBuffer(ElemType* buffer)
    {
        if (m_deviceId >= 0)
        {
#ifndef CPUONLY
            TracingGPUMemoryAllocator::Free<ElemType>(m_deviceId, buffer, false);
#endif
        }
        else
        {
            delete[] buffer;
        }
    }

    // physically release all cached buffers
    template<class ElemType>
    void PhysicalReleaseAllBuffer()
    {
        auto& bufferContainer = BufferContainer<ElemType>();

        for (auto& iter : bufferContainer)
        {
            PhysicalReleaseBuffer<ElemType>(iter.second);
        }

        bufferContainer.clear();
        m_totalManageSize = 0;
    }

private:
    static std::unordered_map<DEVICEID_TYPE, std::unique_ptr<BufferManagement>> m_instances;

    template <class ElemType>
    std::multimap<size_t, ElemType*>& BufferContainer();

    DEVICEID_TYPE m_deviceId;
    size_t m_totalManageSize;
    size_t m_totalAllocSize;

    // maps storing the cached buffer pointers, keyed by size
    std::multimap<size_t, float*> m_bufferFloatContainer;
    std::multimap<size_t, double*> m_bufferDoubleContainer;
    std::multimap<size_t, char*> m_bufferCharContainer;
    std::multimap<size_t, short*> m_bufferShortContainer;
    std::multimap<size_t, int*> m_bufferIntContainer;
};


// -----------------------------------------------------------------------
// BaseMatrixStorage -- base class for all matrix types (CPU, GPU) x (dense, sparse)
// -----------------------------------------------------------------------
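Note (not part of the commit): a minimal usage sketch of the pool contract above. The key point is the round trip: the size handed back by RequestBuffer must be remembered and passed to LogicalReleaseBuffer, since the pool may return a larger cached block. The function name is hypothetical; only the BufferManagement calls come from this change.

    // Hypothetical caller of BufferManagement (mirrors what GPUMatrix::Resize
    // does further below when cachedResize == true).
    void CachedResizeExample(DEVICEID_TYPE deviceId)
    {
        auto& pool = BufferManagement::GetManagerInstance(deviceId);

        size_t size = 1024;                              // requested element count
        float* buffer = pool.RequestBuffer<float>(size); // 'size' may grow to the real block size

        // ... use buffer for up to 'size' floats ...

        pool.LogicalReleaseBuffer<float>(buffer, size);  // cheap: just re-inserts into the multimap
    }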
@ -247,6 +247,11 @@ protected:

        if (CUDNN_STATUS_SUCCESS == err2)
            err = CUDNN_STATUS_SUCCESS;
    }

    // Only takes effect when the MatrixPool is enabled.
    // NOTE: it's unnecessary to keep the workspace.
    workspace.Resize(0, 0);

    CUDNN_CALL(err);
}

@ -278,6 +283,7 @@ protected:

    // Compute gradients with respect to the output tensor (data).
    CUDNN_CALL(cudnnConvolutionBackwardData(*m_cudnn, &C::One, *m_kernelT, ptr(kernel), m_outT, ptr(srcGrad), *m_conv, m_backDataAlgo.Algo.algo,
                                            ptr(workspace), m_backDataAlgo.Algo.memory, &C::One, m_inT, ptr(grad)));
    workspace.Resize(0, 0);
}

void BackwardKernelCore(const Mat& srcGrad, const Mat& in, Mat& kernelGrad, bool /*allowReuse*/, Mat& workspace) override

@ -308,6 +314,7 @@ protected:

    // Compute gradients with respect to the kernel (filter) tensor.
    CUDNN_CALL(cudnnConvolutionBackwardFilter(*m_cudnn, &C::One, m_inT, ptr(in), m_outT, ptr(srcGrad), *m_conv, m_backFiltAlgo.Algo.algo,
                                              ptr(workspace), m_backFiltAlgo.Algo.memory, &C::One, *m_kernelT, ptr(kernelGrad)));
    workspace.Resize(0, 0);
}

void EnsurePoolingInitialized() override
|
|||
}
|
||||
|
||||
template <class ElemType>
|
||||
void GPUMatrix<ElemType>::RequireSize(const size_t numRows, const size_t numCols, bool growOnly)
|
||||
void GPUMatrix<ElemType>::RequireSize(const size_t numRows, const size_t numCols, bool growOnly, bool cachedResize)
|
||||
{
|
||||
if (GetNumRows() != numRows || GetNumCols() != numCols)
|
||||
Resize(numRows, numCols, growOnly);
|
||||
Resize(numRows, numCols, growOnly, cachedResize);
|
||||
}
|
||||
|
||||
template <class ElemType>
|
||||
void GPUMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, bool growOnly)
|
||||
void GPUMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, bool growOnly, bool cachedResize)
|
||||
{
|
||||
if (GetNumRows() == numRows && GetNumCols() == numCols)
|
||||
return;
|
||||
|
||||
VerifyResizable(__func__);
|
||||
bool isForceResize = (!growOnly) || cachedResize;
|
||||
|
||||
size_t numElements = numRows * numCols;
|
||||
if (numElements > GetSizeAllocated() || // grow allocation
|
||||
(!growOnly && numElements != GetSizeAllocated())) // shrink allocation if not growOnly
|
||||
if (numElements > GetSizeAllocated() || // grow allocation
|
||||
(isForceResize && numElements != GetSizeAllocated())) // shrink allocation if not growOnly
|
||||
{
|
||||
// reallocate buffer if numElements > 0
|
||||
ElemType* pArray = nullptr;
|
||||
if (numElements > 0)
|
||||
pArray = TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), numRows, numCols);
|
||||
{
|
||||
if (cachedResize)
|
||||
pArray = BufferManagement::GetManagerInstance(GetComputeDeviceId()).RequestBuffer<ElemType>(numElements);
|
||||
else
|
||||
pArray = TracingGPUMemoryAllocator::Allocate<ElemType>(GetComputeDeviceId(), numRows, numCols);
|
||||
}
|
||||
|
||||
// If the buffer exists, free it
|
||||
if (Buffer())
|
||||
TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), Buffer());
|
||||
{
|
||||
if(cachedResize)
|
||||
BufferManagement::GetManagerInstance(GetComputeDeviceId()).LogicalReleaseBuffer<ElemType>(Buffer(), GetSizeAllocated());
|
||||
else
|
||||
TracingGPUMemoryAllocator::Free<ElemType>(GetComputeDeviceId(), Buffer());
|
||||
}
|
||||
|
||||
SetBuffer(pArray, numElements * sizeof(ElemType));
|
||||
SetSizeAllocated(numElements);
|
||||
|
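Note (not part of the commit): the new reallocation predicate is worth spelling out. A standalone restatement of the decision above, with the same variable names; cachedResize forces shrinks because swapping blocks through the pool is nearly free.

    #include <cassert>
    #include <cstddef>

    bool NeedsRealloc(size_t numElements, size_t sizeAllocated, bool growOnly, bool cachedResize)
    {
        bool isForceResize = (!growOnly) || cachedResize;
        return numElements > sizeAllocated ||                   // grow allocation
               (isForceResize && numElements != sizeAllocated); // shrink allocation if forced
    }

    int main()
    {
        assert( NeedsRealloc(2048, 1024, true,  false)); // grow: always reallocates
        assert(!NeedsRealloc( 512, 1024, true,  false)); // shrink with growOnly: keeps the block
        assert( NeedsRealloc( 512, 1024, true,  true )); // shrink with cachedResize: swap via the pool
        assert( NeedsRealloc( 512, 1024, false, false)); // shrink with growOnly off: reallocates
    }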
@ -4559,8 +4570,8 @@ template GPUMatrix<char>::GPUMatrix(const GPUMatrix<char>&);

template GPUMatrix<char>::GPUMatrix(GPUMatrix<char>&&);
template char* GPUMatrix<char>::CopyToArray() const;
template void GPUMatrix<char>::ChangeDeviceTo(int);
template void GPUMatrix<char>::Resize(size_t, size_t, bool);
template void GPUMatrix<char>::RequireSize(size_t, size_t, bool);
template void GPUMatrix<char>::Resize(size_t, size_t, bool, bool);
template void GPUMatrix<char>::RequireSize(size_t, size_t, bool, bool);

template GPUMatrix<char>::~GPUMatrix();
template GPUMatrix<char> GPUMatrix<char>::ColumnSlice(size_t startColumn, size_t numCols) const;

@ -4584,8 +4595,8 @@ template GPUMatrix<short>::GPUMatrix(const GPUMatrix<short>&);

template GPUMatrix<short>::GPUMatrix(GPUMatrix<short>&&);
template short* GPUMatrix<short>::CopyToArray() const;
template void GPUMatrix<short>::ChangeDeviceTo(int);
template void GPUMatrix<short>::Resize(size_t, size_t, bool);
template void GPUMatrix<short>::RequireSize(size_t, size_t, bool);
template void GPUMatrix<short>::Resize(size_t, size_t, bool, bool);
template void GPUMatrix<short>::RequireSize(size_t, size_t, bool, bool);

template GPUMatrix<short>::~GPUMatrix();
template GPUMatrix<short> GPUMatrix<short>::ColumnSlice(size_t startColumn, size_t numCols) const;
@ -232,12 +232,12 @@ public:

    // RequireSize is now the new preferred method of ensuring the correct size inside of the Matrix class. Since Resize will fail if the storage object has
    // multiple views, RequireSize will first check to see if Resize is required. If it is not, then it short-circuits and is a noop. Otherwise, RequireSize
    // will call Resize, which may fail if the matrix has multiple views.
    void RequireSize(const size_t numRows, const size_t numCols, bool growOnly = true); // by default we only reallocate if need to grow
    void RequireSize(const GPUMatrix<ElemType>& like, bool growOnly = true) { RequireSize(like.GetNumRows(), like.GetNumCols(), growOnly); }
    void RequireSize(const size_t numRows, const size_t numCols, bool growOnly = true, bool cachedResize = false); // by default we only reallocate if need to grow
    void RequireSize(const GPUMatrix<ElemType>& like, bool growOnly = true, bool cachedResize = false) { RequireSize(like.GetNumRows(), like.GetNumCols(), growOnly, cachedResize); }

    // Resize first checks to ensure that the caller has the authority to call Resize (i.e., it checks to ensure the underlying data is owned by only this matrix), and then
    // actually resizes the underlying matrix, doing any allocation as required.
    void Resize(const size_t numRows, const size_t numCols, bool growOnly = true); // by default we only reallocate if need to grow
    void Resize(const size_t numRows, const size_t numCols, bool growOnly = true, bool cachedResize = false); // by default we only reallocate if need to grow

    ElemType& operator()(const size_t /*row*/, const size_t /*col*/) { LogicError("GPUMatrix doesn't support operator(,) on the CPU."); }
    const ElemType& operator()(const size_t /*row*/, const size_t /*col*/) const { LogicError("GPUMatrix doesn't support operator(,) on the CPU."); }
@ -156,6 +156,23 @@ int GetMathLibTraceLevel()

MatrixBase::~MatrixBase() { }

#pragma region BufferManagement

std::unordered_map<DEVICEID_TYPE, std::unique_ptr<BufferManagement>> BufferManagement::m_instances;

template <>
std::multimap<size_t, float*>& BufferManagement::BufferContainer<float>() { return m_bufferFloatContainer; }
template <>
std::multimap<size_t, double*>& BufferManagement::BufferContainer<double>() { return m_bufferDoubleContainer; }
template <>
std::multimap<size_t, char*>& BufferManagement::BufferContainer<char>() { return m_bufferCharContainer; }
template <>
std::multimap<size_t, short*>& BufferManagement::BufferContainer<short>() { return m_bufferShortContainer; }
template <>
std::multimap<size_t, int*>& BufferManagement::BufferContainer<int>() { return m_bufferIntContainer; }

#pragma endregion

#pragma region Constructors, destructors and other static matrix builders
@ -165,6 +182,10 @@ MatrixBase::~MatrixBase() { }

//     { GPU code },
//     ...

// By default, the cached matrix buffer is disabled
template <class ElemType>
bool Matrix<ElemType>::m_useCachedResize = false;

// Initialize members
template <class ElemType>
void Matrix<ElemType>::Init(DEVICEID_TYPE deviceId)
@ -278,6 +299,9 @@ void Matrix<ElemType>::SetDataLocation(CurrentDataLocation location, MatrixType

    LogicError("SetDataLocation: New m_baseMatrix must not be NULL.");
}

template <class ElemType>
void Matrix<ElemType>::UseCachedResizeOrNot(bool useCachedResize) { m_useCachedResize = useCachedResize; }

// this is a private constructor only used internally to initialize a blank matrix
template <class ElemType>
Matrix<ElemType>::Matrix(const MatrixFlags matrixFlags, const MatrixType matrixType, const MatrixFormat matrixFormat, DEVICEID_TYPE deviceID)
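Note (not part of the commit): the switch is static per element type, so it flips the behavior for every Matrix of that type at once. A hedged sketch of a call site; nothing in this change wires this up automatically.

    // Hypothetical opt-in to cached resizes; should run before the matrices
    // whose reallocations are to be pooled get created or resized.
    Matrix<float>::UseCachedResizeOrNot(true);
    Matrix<double>::UseCachedResizeOrNot(true);
    // From now on, Matrix<ElemType>::Resize routes GPU reallocation through
    // BufferManagement (see the DISPATCH in the hunk just below).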

@ -1593,7 +1617,7 @@ void Matrix<ElemType>::Resize(const size_t numRows, const size_t numCols, const

    // TODO: should this function test whether the size is changing, and skip if it isn't? We have at least one explicit test for this code calling this (recurrent node)
    DISPATCH_MATRIX_ON_FLAG_USEBOTH_4BOTH(this,
        { m_CPUMatrix->Resize(numRows, numCols, growOnly); },
        { m_GPUMatrix->Resize(numRows, numCols, growOnly); },
        { m_GPUMatrix->Resize(numRows, numCols, growOnly, m_useCachedResize); },
        { m_CPUSparseMatrix->RequireSizeAndAllocate(numRows, numCols, numNZElemToReserve, growOnly, false); },
        { m_GPUSparseMatrix->RequireSizeAndAllocate(numRows, numCols, numNZElemToReserve, growOnly, false); });
#ifdef _DEBUG

@ -76,6 +76,9 @@ private:

    mutable size_t m_numTimesDeviceChanged;
    mutable size_t m_numTimesMatrixTypeChanged;
    mutable int m_devicesTransferedTo[2]; // TODO: what is this for? Seems only diagnostics

    // whether to use cached memory Resize() or not
    static bool m_useCachedResize;

    // Moves matrix from device id_from to device with id_to. This method doesn't change preferred device Id
    void _transferFromDeviceToDevice(int id_from, int id_to, bool isBeingMoved = true, bool emptyTransfer = false) const;

@ -130,6 +133,8 @@ public:

        SetDataLocation(GetDeviceId() < 0 ? CurrentDataLocation::CPU : CurrentDataLocation::GPU, GetMatrixType());
    }

    static void UseCachedResizeOrNot(bool useCachedResize);

private:
    Matrix(const MatrixFlags matrixFlags, const MatrixType matrixType, const MatrixFormat matrixFormat, DEVICEID_TYPE deviceID); // only used internally to initialize a blank matrix
    Matrix(const MatrixFlags matrixFlags, const MatrixType matrixType, DEVICEID_TYPE deviceID); // only used internally to initialize a blank matrix

@ -1067,12 +1067,12 @@ void GPUMatrix<ElemType>::Reshape(const size_t numRows, const size_t numCols)

}

template <class ElemType>
void GPUMatrix<ElemType>::RequireSize(const size_t numRows, const size_t numCols, bool growOnly)
void GPUMatrix<ElemType>::RequireSize(const size_t numRows, const size_t numCols, bool growOnly, bool cachedResize)
{
}

template <class ElemType>
void GPUMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, bool growOnly)
void GPUMatrix<ElemType>::Resize(const size_t numRows, const size_t numCols, bool growOnly, bool cachedResize)
{
}

@ -0,0 +1,113 @@

<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup Label="ProjectConfigurations">
    <ProjectConfiguration Include="Debug|x64">
      <Configuration>Debug</Configuration>
      <Platform>x64</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Release|x64">
      <Configuration>Release</Configuration>
      <Platform>x64</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Debug_CpuOnly|x64">
      <Configuration>Debug_CpuOnly</Configuration>
      <Platform>x64</Platform>
    </ProjectConfiguration>
    <ProjectConfiguration Include="Release_CpuOnly|x64">
      <Configuration>Release_CpuOnly</Configuration>
      <Platform>x64</Platform>
    </ProjectConfiguration>
  </ItemGroup>
  <PropertyGroup Label="Globals">
    <ProjectGuid>{5D29C76D-648A-456F-920D-48230F2FB3C8}</ProjectGuid>
    <Keyword>Win32Proj</Keyword>
    <RootNamespace>CPPEvalExtendedClientTest</RootNamespace>
    <ProjectName>CPPEvalExtendedClientTest</ProjectName>
  </PropertyGroup>
  <Import Project="$(SolutionDir)\CNTK.Cpp.props" />
  <PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>true</UseDebugLibraries>
    <PlatformToolset>v120</PlatformToolset>
    <CharacterSet>Unicode</CharacterSet>
    <UseIntelMKL>No</UseIntelMKL>
  </PropertyGroup>
  <PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
    <ConfigurationType>Application</ConfigurationType>
    <UseDebugLibraries>false</UseDebugLibraries>
    <PlatformToolset>v120</PlatformToolset>
    <WholeProgramOptimization>true</WholeProgramOptimization>
    <CharacterSet>Unicode</CharacterSet>
    <UseIntelMKL>No</UseIntelMKL>
    <UseIntelIPP>false</UseIntelIPP>
  </PropertyGroup>
  <!-- Importing CPP defaults must occur after declaring the desired toolset above.
       Otherwise, the build may default back to a previous toolset -->
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
  <ImportGroup Label="ExtensionSettings" />
  <ImportGroup Label="PropertySheets">
    <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
  </ImportGroup>
  <PropertyGroup>
    <!-- TODO intentional for all? -->
    <LinkIncremental>false</LinkIncremental>
    <TargetName>CPPEvalExtendedClientTest</TargetName>
  </PropertyGroup>
  <ItemDefinitionGroup>
    <ClCompile>
      <PrecompiledHeader>NotUsing</PrecompiledHeader>
      <WarningLevel>Level4</WarningLevel>
      <AdditionalIncludeDirectories>$(SolutionDir)Source\Common\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
      <PreprocessorDefinitions>WIN32;UNICODE;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <SDLCheck>true</SDLCheck>
      <MultiProcessorCompilation>true</MultiProcessorCompilation>
      <FloatingPointModel>Fast</FloatingPointModel>
      <OpenMPSupport>true</OpenMPSupport>
      <TreatWarningAsError>true</TreatWarningAsError>
    </ClCompile>
    <Link>
      <AdditionalLibraryDirectories>$(OutDir)</AdditionalLibraryDirectories>
      <SubSystem>Console</SubSystem>
      <GenerateDebugInformation>true</GenerateDebugInformation>
      <AdditionalDependencies>EvalDLL.lib;%(AdditionalDependencies)</AdditionalDependencies>
      <DelayLoadDLLs>%(DelayLoadDLLs)</DelayLoadDLLs>
      <Profile>true</Profile>
    </Link>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="$(DebugBuild)">
    <ClCompile>
      <PreprocessorDefinitions>_DEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <Optimization>Disabled</Optimization>
      <MinimalRebuild>false</MinimalRebuild>
    </ClCompile>
    <Link />
    <ProjectReference>
      <LinkLibraryDependencies>false</LinkLibraryDependencies>
    </ProjectReference>
  </ItemDefinitionGroup>
  <ItemDefinitionGroup Condition="$(ReleaseBuild)">
    <ClCompile>
      <Optimization>MaxSpeed</Optimization>
      <FunctionLevelLinking>true</FunctionLevelLinking>
      <IntrinsicFunctions>true</IntrinsicFunctions>
      <PreprocessorDefinitions>NDEBUG;%(PreprocessorDefinitions)</PreprocessorDefinitions>
      <EnableParallelCodeGeneration>true</EnableParallelCodeGeneration>
      <FloatingPointExceptions>false</FloatingPointExceptions>
      <AdditionalOptions>/d2Zi+ %(AdditionalOptions)</AdditionalOptions>
      <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
    </ClCompile>
    <Link>
      <EnableCOMDATFolding>true</EnableCOMDATFolding>
      <OptimizeReferences>true</OptimizeReferences>
    </Link>
    <ProjectReference>
      <LinkLibraryDependencies>true</LinkLibraryDependencies>
    </ProjectReference>
  </ItemDefinitionGroup>
  <ItemGroup>
    <ClCompile Include="..\..\..\..\Examples\Evaluation\CPPEvalExtendedClient\CPPEvalExtendedClient.cpp" />
  </ItemGroup>
  <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
  <ImportGroup Label="ExtensionTargets" />
</Project>
@ -0,0 +1,22 @@

<?xml version="1.0" encoding="utf-8"?>
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
  <ItemGroup>
    <Filter Include="Source Files">
      <UniqueIdentifier>{4FC737F1-C7A5-4376-A066-2A32D752A2FF}</UniqueIdentifier>
      <Extensions>cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx</Extensions>
    </Filter>
    <Filter Include="Header Files">
      <UniqueIdentifier>{93995380-89BD-4b04-88EB-625FBE52EBFB}</UniqueIdentifier>
      <Extensions>h;hh;hpp;hxx;hm;inl;inc;xsd</Extensions>
    </Filter>
    <Filter Include="Resource Files">
      <UniqueIdentifier>{67DA6AB6-F800-4c08-8B7A-83BB121AAD01}</UniqueIdentifier>
      <Extensions>rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms</Extensions>
    </Filter>
  </ItemGroup>
  <ItemGroup>
    <ClCompile Include="CPPEvalExtendedClient.cpp">
      <Filter>Source Files</Filter>
    </ClCompile>
  </ItemGroup>
</Project>
@ -0,0 +1,2 @@

This folder contains the VC++ project file for building CPPEvalExtendedClientTest.exe.
The C++ source code used by the project is in Examples\Evaluation\CPPEvalExtendedClient.
@ -0,0 +1,114 @@

CPU info:
    CPU Model Name: Intel(R) Xeon(R) CPU E5-2630 v3 @ 2.40GHz
    Hardware threads: 32
    Total Memory: 33468508 kB
-------------------------------------------------------------------
+ [[ -z E:\CNTKTestData ]]
+ [[ ! -d E:\CNTKTestData ]]
+ '[' Windows_NT == Windows_NT ']'
++ cygpath -au 'E:\CNTKTestData'
+ TestDataDir=/cygdrive/e/CNTKTestData
+ ATISDir=/cygdrive/c/repos/cntk/Tests/EndToEndTests/../../Examples/Text/ATIS
+ DataDir=/cygdrive/c/repos/cntk/Tests/EndToEndTests/../../Examples/Text/ATIS/Data
+ OutputDir=/cygdrive/c/repos/cntk/Tests/EndToEndTests/../../Examples/Text/ATIS/Data
+ ConfigDir=/cygdrive/c/repos/cntk/Tests/EndToEndTests/../../Examples/Text/ATIS
+ DeleteModelsAfterTest=0
+ '[' -f /cygdrive/c/repos/cntk/Tests/EndToEndTests/../../Examples/Text/ATIS/ATIS.cntk ']'
+ cntkrun ATIS.cntk 'stderr=- command=Train Train=[SGD=[maxEpochs=1]]'
+ configFileName=ATIS.cntk
+ additionalCNTKArgs='stderr=- command=Train Train=[SGD=[maxEpochs=1]]'
+ '[' Windows_NT == Windows_NT ']'
++ cygpath -aw /cygdrive/c/repos/cntk/Tests/EndToEndTests/../../Examples/Text/ATIS
+ ConfigDir='C:\repos\cntk\Examples\Text\ATIS'
++ cygpath -aw /tmp/cntk-test-20161108174139.565799/EvalClientTests_CPPEvalExtendedClientTest@release_cpu
+ RunDir='C:\cygwin64\tmp\cntk-test-20161108174139.565799\EvalClientTests_CPPEvalExtendedClientTest@release_cpu'
++ cygpath -aw /cygdrive/c/repos/cntk/Tests/EndToEndTests/../../Examples/Text/ATIS/Data
+ DataDir='C:\repos\cntk\Examples\Text\ATIS\Data'
++ cygpath -aw /cygdrive/c/repos/cntk/Tests/EndToEndTests/../../Examples/Text/ATIS/Data
+ OutputDir='C:\repos\cntk\Examples\Text\ATIS\Data'
+ CNTKArgs='configFile=C:\repos\cntk\Examples\Text\ATIS/ATIS.cntk currentDirectory=C:\repos\cntk\Examples\Text\ATIS\Data RunDir=C:\cygwin64\tmp\cntk-test-20161108174139.565799\EvalClientTests_CPPEvalExtendedClientTest@release_cpu DataDir=C:\repos\cntk\Examples\Text\ATIS\Data ConfigDir=C:\repos\cntk\Examples\Text\ATIS OutputDir=C:\repos\cntk\Examples\Text\ATIS\Data DeviceId=-1 timestamping=true stderr=- command=Train Train=[SGD=[maxEpochs=1]]'
+ '[' '' '!=' '' ']'
+ modelsDir=/tmp/cntk-test-20161108174139.565799/EvalClientTests_CPPEvalExtendedClientTest@release_cpu/Models
+ [[ 1 == 1 ]]
+ '[' -d /tmp/cntk-test-20161108174139.565799/EvalClientTests_CPPEvalExtendedClientTest@release_cpu/Models ']'
+ mkdir -p /tmp/cntk-test-20161108174139.565799/EvalClientTests_CPPEvalExtendedClientTest@release_cpu/Models
+ [[ 0 == 0 ]]
+ run /cygdrive/c/repos/cntk/x64/release_CpuOnly/cntk.exe 'configFile=C:\repos\cntk\Examples\Text\ATIS/ATIS.cntk' 'currentDirectory=C:\repos\cntk\Examples\Text\ATIS\Data' 'RunDir=C:\cygwin64\tmp\cntk-test-20161108174139.565799\EvalClientTests_CPPEvalExtendedClientTest@release_cpu' 'DataDir=C:\repos\cntk\Examples\Text\ATIS\Data' 'ConfigDir=C:\repos\cntk\Examples\Text\ATIS' 'OutputDir=C:\repos\cntk\Examples\Text\ATIS\Data' DeviceId=-1 timestamping=true stderr=- command=Train 'Train=[SGD=[maxEpochs=1]]'
+ cmd=/cygdrive/c/repos/cntk/x64/release_CpuOnly/cntk.exe
+ shift
+ '[' '' == 1 ']'
+ echo === Running /cygdrive/c/repos/cntk/x64/release_CpuOnly/cntk.exe 'configFile=C:\repos\cntk\Examples\Text\ATIS/ATIS.cntk' 'currentDirectory=C:\repos\cntk\Examples\Text\ATIS\Data' 'RunDir=C:\cygwin64\tmp\cntk-test-20161108174139.565799\EvalClientTests_CPPEvalExtendedClientTest@release_cpu' 'DataDir=C:\repos\cntk\Examples\Text\ATIS\Data' 'ConfigDir=C:\repos\cntk\Examples\Text\ATIS' 'OutputDir=C:\repos\cntk\Examples\Text\ATIS\Data' DeviceId=-1 timestamping=true stderr=- command=Train 'Train=[SGD=[maxEpochs=1]]'
=== Running /cygdrive/c/repos/cntk/x64/release_CpuOnly/cntk.exe configFile=C:\repos\cntk\Examples\Text\ATIS/ATIS.cntk currentDirectory=C:\repos\cntk\Examples\Text\ATIS\Data RunDir=C:\cygwin64\tmp\cntk-test-20161108174139.565799\EvalClientTests_CPPEvalExtendedClientTest@release_cpu DataDir=C:\repos\cntk\Examples\Text\ATIS\Data ConfigDir=C:\repos\cntk\Examples\Text\ATIS OutputDir=C:\repos\cntk\Examples\Text\ATIS\Data DeviceId=-1 timestamping=true stderr=- command=Train Train=[SGD=[maxEpochs=1]]
+ /cygdrive/c/repos/cntk/x64/release_CpuOnly/cntk.exe 'configFile=C:\repos\cntk\Examples\Text\ATIS/ATIS.cntk' 'currentDirectory=C:\repos\cntk\Examples\Text\ATIS\Data' 'RunDir=C:\cygwin64\tmp\cntk-test-20161108174139.565799\EvalClientTests_CPPEvalExtendedClientTest@release_cpu' 'DataDir=C:\repos\cntk\Examples\Text\ATIS\Data' 'ConfigDir=C:\repos\cntk\Examples\Text\ATIS' 'OutputDir=C:\repos\cntk\Examples\Text\ATIS\Data' DeviceId=-1 timestamping=true stderr=- command=Train 'Train=[SGD=[maxEpochs=1]]'
CNTK 2.0.beta2.0+ (zhouwang/pr899 0b1214, Nov 8 2016 17:27:36) on ZHOUWANGDEV4 at 2016/11/08 16:41:40

C:\repos\cntk\x64\release_CpuOnly\cntk.exe configFile=C:\repos\cntk\Examples\Text\ATIS/ATIS.cntk currentDirectory=C:\repos\cntk\Examples\Text\ATIS\Data RunDir=C:\cygwin64\tmp\cntk-test-20161108174139.565799\EvalClientTests_CPPEvalExtendedClientTest@release_cpu DataDir=C:\repos\cntk\Examples\Text\ATIS\Data ConfigDir=C:\repos\cntk\Examples\Text\ATIS OutputDir=C:\repos\cntk\Examples\Text\ATIS\Data DeviceId=-1 timestamping=true stderr=- command=Train Train=[SGD=[maxEpochs=1]]
Changed current directory to C:\repos\cntk\Examples\Text\ATIS\Data
11/08/2016 16:41:40: Redirecting stderr to file -_Train.logrank0
CNTK 2.0.beta2.0+ (zhouwang/pr899 0b1214, Nov 8 2016 17:27:36) on ZHOUWANGDEV4 at 2016/11/08 16:41:40

C:\repos\cntk\x64\release_CpuOnly\cntk.exe configFile=C:\repos\cntk\Examples\Text\ATIS/ATIS.cntk currentDirectory=C:\repos\cntk\Examples\Text\ATIS\Data RunDir=C:\cygwin64\tmp\cntk-test-20161108174139.565799\EvalClientTests_CPPEvalExtendedClientTest@release_cpu DataDir=C:\repos\cntk\Examples\Text\ATIS\Data ConfigDir=C:\repos\cntk\Examples\Text\ATIS OutputDir=C:\repos\cntk\Examples\Text\ATIS\Data DeviceId=-1 timestamping=true stderr=- command=Train Train=[SGD=[maxEpochs=1]]

11/08/2016 16:41:40: ##############################################################################
11/08/2016 16:41:40: #                                                                            #
11/08/2016 16:41:40: # Train command (train action)                                               #
11/08/2016 16:41:40: #                                                                            #
11/08/2016 16:41:40: ##############################################################################

Node 'lstmStack.layers[0].lstmState._.ot._.PlusArgs[0].PlusArgs[0].PlusArgs[1].TimesArgs[0]' (LearnableParameter operation) operation: Tensor shape was inferred as [300 x 150].
Node 'lstmStack.layers[0].lstmState._.ft._.PlusArgs[0].PlusArgs[0].PlusArgs[1].TimesArgs[0]' (LearnableParameter operation) operation: Tensor shape was inferred as [300 x 150].
Node 'lstmStack.layers[0].lstmState._.it._.PlusArgs[0].PlusArgs[0].PlusArgs[1].TimesArgs[0]' (LearnableParameter operation) operation: Tensor shape was inferred as [300 x 150].
Node 'lstmStack.layers[0].lstmState._.bit.ElementTimesArgs[1].z.PlusArgs[0].PlusArgs[1].TimesArgs[0]' (LearnableParameter operation) operation: Tensor shape was inferred as [300 x 150].
11/08/2016 16:41:40:
Model has 61 nodes. Using CPU.

11/08/2016 16:41:40: Training criterion: cr = CrossEntropyWithSoftmax
11/08/2016 16:41:40: Evaluation criterion: errs = ClassificationError

11/08/2016 16:41:40: Training 1005127 parameters in 18 parameter tensors.

11/08/2016 16:42:02: Finished Epoch[ 1 of 1]: [Training] cr = 0.40189165 * 36006; errs = 8.254% * 36006; totalSamplesSeen = 36006; learningRatePerSample = 0.0099999998; epochTime=22.2249s

11/08/2016 16:42:02: __COMPLETED__
+ return 0
+ local ExitCode=0
+ [[ 0 == 1 ]]
+ return 0
+ '[' -d 'C:\repos\cntk\Examples\Text\ATIS\Data/work' ']'
+ '[' -d /cygdrive/c/repos/cntk/Tests/EndToEndTests/../../Examples/Text/ATIS/work ']'
+ mv 'C:\repos\cntk\Examples\Text\ATIS\Data/work' /cygdrive/c/repos/cntk/Tests/EndToEndTests/../../Examples/Text/ATIS/
+ '[' Windows_NT == Windows_NT ']'
+ /cygdrive/c/repos/cntk/x64/release_CpuOnly/CPPEvalExtendedClientTest.exe
Input node name: featuresCW
Input feature dimension: 944
Input node name: featuresNW
Input feature dimension: 944
Input node name: featuresPW
Input feature dimension: 944
Slot tag for sentence "BOS i would like to find a flight from charlotte to las vegas that makes a stop in st. louis EOS" is as followings:
i -- I-transport_type
would -- I-transport_type
like -- I-transport_type
to -- I-transport_type
find -- I-transport_type
a -- I-transport_type
flight -- I-transport_type
from -- I-transport_type
charlotte -- B-fromloc.airport_name
to -- I-transport_type
las -- B-toloc.airport_name
vegas -- I-toloc.airport_name
that -- I-transport_type
makes -- I-transport_type
a -- I-transport_type
stop -- I-transport_type
in -- I-transport_type
st. -- B-stoploc.airport_name
louis -- I-state_name
Evaluation complete.
Output dimension: 127
Output name: outputs
+ ExitCode=0
+ '[' -d /cygdrive/c/repos/cntk/Tests/EndToEndTests/../../Examples/Text/ATIS/work ']'
+ rm -rf /cygdrive/c/repos/cntk/Tests/EndToEndTests/../../Examples/Text/ATIS/work
+ exit 0
@@ -0,0 +1,48 @@
#!/bin/bash

. $TEST_ROOT_DIR/run-test-common

set -x

# This test checks that CPPEvalExtendedClient works with the same setup as users have.
# For that purpose, the test needs to create the pre-trained model in the Examples directories as expected by CPPEvalExtendedClient.
# These files are removed by Jenkins during workspace cleanup.

# The eval test uses some pretrained models which are not part of the CNTK repository itself.
# We use the dataset from an external location specified using an environment variable.
if [[ -z "$CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY" || ! -d "$CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY" ]]; then
  echo This test uses external data that is not part of the CNTK repository. Environment variable CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY must be set to point to the external test data location.
  exit 1
fi

if [ "$OS" == "Windows_NT" ]; then
  TestDataDir=`cygpath -au $CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY`
else
  TestDataDir=$CNTK_EXTERNAL_TESTDATA_SOURCE_DIRECTORY
fi

ATISDir=$TEST_ROOT_DIR/../../Examples/Text/ATIS
DataDir=$ATISDir/Data
OutputDir=$ATISDir/Data
ConfigDir=$ATISDir

# Train model for evaluation
DeleteModelsAfterTest=0
[ -f $ConfigDir/ATIS.cntk ] || exit 1
cntkrun ATIS.cntk "stderr=- command=Train Train=[SGD=[maxEpochs=1]]" || exit $?

# The created model is saved under $DataDir/work, according to ATIS.cntk. Move it to $ATISDir/work.
[ -d $DataDir/work ] || exit $?
[ -d $ATISDir/work ] && rm -rf $ATISDir/work
mv $DataDir/work $ATISDir/ || exit $?

if [ "$OS" == "Windows_NT" ]; then
  $TEST_BIN_DIR/CPPEvalExtendedClientTest.exe
else
  $TEST_BIN_DIR/cppevalextendedclient
fi
ExitCode=$?

[ -d $ATISDir/work ] && rm -rf $ATISDir/work

exit $ExitCode
@@ -0,0 +1,92 @@
dataDir: .

tags:
  - bvt-i (build_sku != '1bitsgd') and ((build_sku == 'cpu') or (device == 'gpu')) and (flavor == 'release')
  # This test also runs in debug mode, as the debug version of EvalDll is also included in the NuGet package.
  - nightly-i (build_sku != '1bitsgd') and ((build_sku == 'cpu') or (device == 'gpu'))

testCases:
  Test run must be completed:
    patterns:
      - Evaluation complete

# Due to time limitations, the test only trains the model for 1 epoch, so the
# model is not accurate enough to produce correct results under some build flavors.
# Checking results is disabled for now.

#Test results Line 1:
# patterns:
#  - i -- I-transport_type

#Test results Line 2:
# patterns:
#  - would -- I-transport_type

#Test results Line 3:
# patterns:
#  - like -- I-transport_type

#Test results Line 4:
# patterns:
#  - to -- I-transport_type

#Test results Line 5:
# patterns:
#  - find -- I-transport_type

#Test results Line 6:
# patterns:
#  - a -- I-transport_type

#Test results Line 7:
# patterns:
#  - flight -- I-transport_type

#Test results Line 8:
# patterns:
#  - from -- I-transport_type

#Test results Line 9:
# patterns:
#  - charlotte -- B-fromloc.airport_name

#Test results Line 10:
# patterns:
#  - to -- I-transport_type

#Test results Line 11:
# patterns:
#  - las -- B-toloc.airport_name

#Test results Line 12:
# patterns:
#  - vegas -- I-toloc.airport_name

#Test results Line 13:
# patterns:
#  - that -- I-transport_type

#Test results Line 14:
# patterns:
#  - makes -- I-transport_type

#Test results Line 15:
# patterns:
#  - a -- I-transport_type

#Test results Line 16:
# patterns:
#  - stop -- I-transport_type

#Test results Line 17:
# patterns:
#  - in -- I-transport_type

#Test results Line 18:
# patterns:
#  - st. -- B-stoploc.airport_name

#Test results Line 19:
# patterns:
#  - louis -- I-state_name
@@ -52,9 +52,11 @@ for drop in $*; do
if [[ "$DROP_FILE" == *CPU* ]] || [[ "$DROP_FILE" == *cpu* ]]; then
TEST_DEVICE=cpu
DOCKER_TO_RUN=docker
DOCKERFILE_SUFFIX=CPU
else
TEST_DEVICE=gpu
DOCKER_TO_RUN=nvidia-docker
DOCKERFILE_SUFFIX=GPU
fi

rm -f "$DROP_RESERVED"

@@ -63,7 +65,7 @@ for drop in $*; do

IMAGE=cntk:installtest
for base in Ubuntu16 Ubuntu14; do
docker build -t $IMAGE -f Dockerfile-$base-GPU --build-arg REPO_TAG=$REPO_TAG .
docker build -t $IMAGE -f Dockerfile-$base-$DOCKERFILE_SUFFIX --build-arg REPO_TAG=$REPO_TAG .
$DOCKER_TO_RUN run --rm $IMAGE su - testuser -c "./run-test.sh $TEST_DEVICE"
docker rmi $IMAGE
done
@@ -8,9 +8,4 @@

#define BOOST_TEST_MODULE BrainScriptTests

#include "stdafx.h"

// TODO: Temporary mechanism to enable memory sharing for
// node output value matrices. This will go away when the
// sharing is ready to be enabled by default
bool g_shareNodeValueMatrices = false;
#include "stdafx.h"

@@ -9,9 +9,4 @@
#include "MPIWrapper.h"

// TODO: Get rid of these globals
Microsoft::MSR::CNTK::MPIWrapper* g_mpi = nullptr;

// TODO: Temporary mechanism to enable memory sharing for
// node output value matrices. This will go away when the
// sharing is ready to be enabled by default
bool g_shareNodeValueMatrices = false;
Microsoft::MSR::CNTK::MPIWrapper* g_mpi = nullptr;
@@ -200,7 +200,6 @@ inline CNTK::FunctionPtr Stabilize(const CNTK::Variable& x, const CNTK::DeviceDe
template <typename ElementType>
std::pair<CNTK::FunctionPtr, CNTK::FunctionPtr> LSTMPCellWithSelfStabilization(CNTK::Variable input, CNTK::Variable prevOutput, CNTK::Variable prevCellState, const CNTK::DeviceDescriptor& device)
{
size_t inputDim = input.Shape()[0];
size_t outputDim = prevOutput.Shape()[0];
size_t cellDim = prevCellState.Shape()[0];

@@ -209,8 +208,8 @@ std::pair<CNTK::FunctionPtr, CNTK::FunctionPtr> LSTMPCellWithSelfStabilization(C
};

unsigned long seed = 1;
auto createProjectionParam = [device, &seed](size_t outputDim, size_t inputDim) {
return CNTK::Parameter({ outputDim, inputDim }, CNTK::AsDataType<ElementType>(), CNTK::GlorotUniformInitializer(1, 0, 1, seed++), device);
auto createProjectionParam = [device, &seed](size_t outputDim) {
return CNTK::Parameter({ outputDim, CNTK::NDShape::InferredDimension }, CNTK::AsDataType<ElementType>(), CNTK::GlorotUniformInitializer(1, 0, 1, seed++), device);
};

auto createDiagWeightParam = [device, &seed](size_t dim) {

@@ -220,26 +219,26 @@ std::pair<CNTK::FunctionPtr, CNTK::FunctionPtr> LSTMPCellWithSelfStabilization(C
auto stabilizedPrevOutput = Stabilize<ElementType>(prevOutput, device);
auto stabilizedPrevCellState = Stabilize<ElementType>(prevCellState, device);

auto projectInput = [input, cellDim, inputDim, createBiasParam, createProjectionParam]() {
return createBiasParam(cellDim) + CNTK::Times(createProjectionParam(cellDim, inputDim), input);
auto projectInput = [input, cellDim, createBiasParam, createProjectionParam]() {
return createBiasParam(cellDim) + CNTK::Times(createProjectionParam(cellDim), input);
};

// Input gate
auto it = CNTK::Sigmoid(projectInput() + CNTK::Times(createProjectionParam(cellDim, outputDim), stabilizedPrevOutput) + CNTK::ElementTimes(createDiagWeightParam(cellDim), stabilizedPrevCellState));
auto bit = CNTK::ElementTimes(it, CNTK::Tanh(projectInput() + CNTK::Times(createProjectionParam(cellDim, outputDim), stabilizedPrevOutput)));
auto it = CNTK::Sigmoid(projectInput() + CNTK::Times(createProjectionParam(cellDim), stabilizedPrevOutput) + CNTK::ElementTimes(createDiagWeightParam(cellDim), stabilizedPrevCellState));
auto bit = CNTK::ElementTimes(it, CNTK::Tanh(projectInput() + CNTK::Times(createProjectionParam(cellDim), stabilizedPrevOutput)));

// Forget-me-not gate
auto ft = CNTK::Sigmoid(projectInput() + CNTK::Times(createProjectionParam(cellDim, outputDim), stabilizedPrevOutput) + CNTK::ElementTimes(createDiagWeightParam(cellDim), stabilizedPrevCellState));
auto ft = CNTK::Sigmoid(projectInput() + CNTK::Times(createProjectionParam(cellDim), stabilizedPrevOutput) + CNTK::ElementTimes(createDiagWeightParam(cellDim), stabilizedPrevCellState));
auto bft = CNTK::ElementTimes(ft, prevCellState);

auto ct = bft + bit;

// Output gate
auto ot = CNTK::Sigmoid(projectInput() + CNTK::Times(createProjectionParam(cellDim, outputDim), stabilizedPrevOutput) + CNTK::ElementTimes(createDiagWeightParam(cellDim), Stabilize<ElementType>(ct, device)));
auto ot = CNTK::Sigmoid(projectInput() + CNTK::Times(createProjectionParam(cellDim), stabilizedPrevOutput) + CNTK::ElementTimes(createDiagWeightParam(cellDim), Stabilize<ElementType>(ct, device)));
auto ht = CNTK::ElementTimes(ot, CNTK::Tanh(ct));

auto c = ct;
auto h = (outputDim != cellDim) ? CNTK::Times(createProjectionParam(outputDim, cellDim), Stabilize<ElementType>(ht, device)) : ht;
auto h = (outputDim != cellDim) ? CNTK::Times(createProjectionParam(outputDim), Stabilize<ElementType>(ht, device)) : ht;

return{ h, c };
}
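Editor's note: the change above leans on CNTK's shape inference. A projection parameter now declares its input axis as NDShape::InferredDimension, and the open dimension is filled in the first time Times sees the operand's actual shape — which is exactly what the "Tensor shape was inferred as [300 x 150]" lines in the test log record. A minimal plain-Python sketch of that inference rule (not the CNTK API; INFERRED and the helper names are made up for illustration):

import numpy as np

INFERRED = -1  # stand-in for NDShape::InferredDimension

def make_projection_param(output_dim):
    # Declared as [output_dim x INFERRED]; no storage allocated yet.
    return {"shape": [output_dim, INFERRED], "value": None}

def times(param, operand):
    # First use: resolve the inferred axis from the operand and allocate.
    if param["shape"][1] == INFERRED:
        param["shape"][1] = operand.shape[0]
        param["value"] = np.random.uniform(-0.5, 0.5, size=param["shape"])
    return param["value"] @ operand

x = np.ones(150)              # operand with known dimension 150
W = make_projection_param(300)
y = times(W, x)               # W resolves to [300 x 150], as in the log above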
@@ -99,18 +99,14 @@ void TestReduceSum(size_t sampleRank, const DeviceDescriptor& device)

// Test ReduceSum along a dynamic axis
{
auto testReduceSum = [&sequences, &sequenceLengths, inputShape, sequencesValue, device](const Axis& axis)
auto testReduceSum = [&sequences, &sequenceLengths, inputShape, sequencesValue, device]()
{
if (!axis.IsDynamicAxis())
RuntimeError("Called the dynamic axis ReduceSum test with a static axis");

size_t maxActualSequenceLength = sequencesValue->Shape()[inputShape.Rank()];
size_t numSequences = sequencesValue->Shape()[inputShape.Rank() + 1];

auto inputVar = InputVariable({ inputShape }, DataType::Float, L"input");
FunctionPtr reduceSumFunc = ReduceSum(inputVar, axis);
FunctionPtr reduceSumFunc = Sequence::ReduceSum(inputVar);

NDShape maskShape = { ((axis == Axis::DefaultBatchAxis()) ? maxActualSequenceLength : 1), ((axis == Axis::DefaultBatchAxis()) ? 1 : numSequences) };
NDShape maskShape = { 1, numSequences };
NDShape outputShape = reduceSumFunc->Output().Shape();
auto outputDataShape = outputShape.AppendShape(maskShape);

@@ -130,10 +126,7 @@ void TestReduceSum(size_t sampleRank, const DeviceDescriptor& device)
for (size_t k = 0; k < inputShape.TotalSize(); ++k)
{
float value = sequences[i][(j * inputShape.TotalSize()) + k];
if (axis == Axis::DefaultBatchAxis())
expectedTotals[(j * inputShape.TotalSize()) + k] += value;
else
expectedTotals[(i * inputShape.TotalSize()) + k] += value;
expectedTotals[(i * inputShape.TotalSize()) + k] += value;
}
}
}

@@ -141,7 +134,7 @@ void TestReduceSum(size_t sampleRank, const DeviceDescriptor& device)
FloatingPointVectorCompare(outputData, expectedTotals, "testReduceSum: Forward prop results do not match expected results");
};

testReduceSum(Axis::DefaultDynamicAxis());
testReduceSum();
}
}
@@ -217,11 +210,8 @@ void TestSlice(size_t sampleRank, const DeviceDescriptor& device)

// Test slice along a dynamic axis
{
auto testDynamicAxisSlice = [&sequences, &sequenceLengths, inputShape, sequencesValue, device](const Axis& axis, int beginOffset, int endOffset)
auto testDynamicAxisSlice = [&sequences, &sequenceLengths, inputShape, sequencesValue, device](int beginOffset, int endOffset)
{
if (!axis.IsDynamicAxis())
RuntimeError("Called the dynamic axis slice test with a static axis");

size_t maxActualSequenceLength = sequencesValue->Shape()[inputShape.Rank()];
size_t numSequences = sequencesValue->Shape()[inputShape.Rank() + 1];

@@ -229,11 +219,11 @@ void TestSlice(size_t sampleRank, const DeviceDescriptor& device)
size_t maxSliceLength = (endAndBeginOffsetDiff > 0) ? endAndBeginOffsetDiff : maxActualSequenceLength + endAndBeginOffsetDiff;

auto inputVar = InputVariable(inputShape, DataType::Float, L"input");
auto sliceFunc = Slice(inputVar, axis, beginOffset, endOffset);
auto sliceFunc = Sequence::Slice(inputVar, beginOffset, endOffset);
sliceFunc = sliceFunc + sliceFunc;

size_t outputSequenceAxisLength = (axis == Axis::DefaultDynamicAxis()) ? maxSliceLength : maxActualSequenceLength;
size_t outputBatchAxisLength = (axis == Axis::DefaultBatchAxis()) ? maxSliceLength : numSequences;
size_t outputSequenceAxisLength = maxSliceLength;
size_t outputBatchAxisLength = numSequences;
NDShape outputShape = sliceFunc->Output().Shape().AppendShape({ outputSequenceAxisLength, outputBatchAxisLength });
std::vector<float> outputData(outputShape.TotalSize(), 0);
NDMaskPtr mask;

@@ -247,15 +237,15 @@ void TestSlice(size_t sampleRank, const DeviceDescriptor& device)
std::unordered_map<Variable, ValuePtr> outputs = { { sliceFunc->Output(), outputValue } };
sliceFunc->Forward({ { inputVar, sequencesValue } }, outputs, device);

size_t startSequenceIdx = (axis == Axis::DefaultBatchAxis()) ? ((beginOffset >= 0) ? beginOffset : (numSequences + beginOffset)) : 0;
size_t endSequenceIdx = (axis == Axis::DefaultBatchAxis()) ? ((endOffset > 0) ? endOffset : (numSequences + endOffset)) : numSequences;
size_t startSequenceIdx = 0;
size_t endSequenceIdx = numSequences;

std::vector<float> expectedOutputValues(inputShape.TotalSize() * outputSequenceAxisLength * outputBatchAxisLength);
for (size_t i = startSequenceIdx; i < endSequenceIdx; ++i)
{
size_t currentSequenceLength = sequenceLengths[i];
size_t startFrameIdx = (axis == Axis::DefaultDynamicAxis()) ? ((beginOffset >= 0) ? beginOffset : (currentSequenceLength + beginOffset)) : 0;
size_t endFrameIdx = (axis == Axis::DefaultDynamicAxis()) ? ((endOffset > 0) ? endOffset : (currentSequenceLength + endOffset)) : currentSequenceLength;
size_t startFrameIdx = ((beginOffset >= 0) ? beginOffset : (currentSequenceLength + beginOffset));
size_t endFrameIdx = ((endOffset > 0) ? endOffset : (currentSequenceLength + endOffset));
size_t j = startFrameIdx;
for (; j < endFrameIdx; ++j)
{

@@ -272,12 +262,12 @@ void TestSlice(size_t sampleRank, const DeviceDescriptor& device)
FloatingPointVectorCompare(outputData, expectedOutputValues, "testDynamicAxisSlice: Forward prop results do not match expected results");
};

testDynamicAxisSlice(Axis::DefaultDynamicAxis(), 0, 1);
testDynamicAxisSlice(Axis::DefaultDynamicAxis(), 0, 2);
testDynamicAxisSlice(Axis::DefaultDynamicAxis(), -1, 0);
testDynamicAxisSlice(Axis::DefaultDynamicAxis(), -2, 0);
testDynamicAxisSlice(Axis::DefaultDynamicAxis(), 0, -1);
testDynamicAxisSlice(Axis::DefaultDynamicAxis(), 1, 0);
testDynamicAxisSlice(0, 1);
testDynamicAxisSlice(0, 2);
testDynamicAxisSlice(-1, 0);
testDynamicAxisSlice(-2, 0);
testDynamicAxisSlice(0, -1);
testDynamicAxisSlice(1, 0);
}
}
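Editor's note: the offsets accepted by Sequence::Slice follow the sign convention the expected-value loop above encodes — a non-negative beginOffset (and a positive endOffset) counts from the front of each sequence, otherwise from its back. A plain-Python restatement of just that index arithmetic, mirroring the startFrameIdx/endFrameIdx lines in the test:

def resolve_slice_window(sequence_length, begin_offset, end_offset):
    # Offsets >= 0 (resp. > 0) count from the front, otherwise from the back.
    start = begin_offset if begin_offset >= 0 else sequence_length + begin_offset
    end = end_offset if end_offset > 0 else sequence_length + end_offset
    return start, end

# For a sequence of length 5:
assert resolve_slice_window(5, 0, 1) == (0, 1)    # first element
assert resolve_slice_window(5, -1, 0) == (4, 5)   # last element
assert resolve_slice_window(5, 1, 0) == (1, 5)    # drop the first element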
@@ -6,7 +6,7 @@ using namespace CNTK;

using namespace std::placeholders;

void TrainSequenceToSequenceTranslator(const DeviceDescriptor& device, bool useSparseInputs, bool testSaveAndReLoad, bool testCheckpointing, bool addBeamSearchReorderingHook, bool testCloning)
void TrainSequenceToSequenceTranslator(const DeviceDescriptor& device, bool useSparseInputs, bool testSaveAndReLoad, bool testCheckpointing, bool addBeamSearchReorderingHook, bool testCloning, bool usePlaceholders)
{
using namespace std::placeholders;

@@ -30,7 +30,7 @@ void TrainSequenceToSequenceTranslator(const DeviceDescriptor& device, bool useS
FunctionPtr inputSequence = Alias(rawInput, L"inputSequence");

// Drop the sentence start token from the label, for decoder training
auto labelSequence = Slice(rawLabels, labelDynamicAxes[0], 1, 0, L"labelSequenceWithStartTrimmed");
auto labelSequence = Sequence::Slice(rawLabels, 1, 0, L"labelSequenceWithStartTrimmed");
auto labelSentenceStart = Sequence::First(rawLabels, L"labelSequenceStart");

auto isFirstLabel = Sequence::IsFirst(labelSequence, L"isFirstLabel");

@@ -38,8 +38,8 @@ void TrainSequenceToSequenceTranslator(const DeviceDescriptor& device, bool useS
bool forceEmbedding = useSparseInputs;

/* Embeddings */
auto inputEmbeddingWeights = Parameter({ inputEmbeddingDim, inputVocabDim }, DataType::Float, GlorotUniformInitializer(), device, L"inputEmbeddingWeights");
auto labelEmbeddingWeights = Parameter({ labelEmbeddingDim, labelVocabDim }, DataType::Float, GlorotUniformInitializer(), device, L"labelEmbeddingWeights");
auto inputEmbeddingWeights = Parameter({ inputEmbeddingDim, NDShape::InferredDimension }, DataType::Float, GlorotUniformInitializer(), device, L"inputEmbeddingWeights");
auto labelEmbeddingWeights = Parameter({ labelEmbeddingDim, NDShape::InferredDimension }, DataType::Float, GlorotUniformInitializer(), device, L"labelEmbeddingWeights");

auto inputEmbedding = Alias((!forceEmbedding && (inputVocabDim <= inputEmbeddingDim)) ? inputSequence : Times(inputEmbeddingWeights, inputSequence), L"inputEmbedding");
auto labelEmbedding = Alias((!forceEmbedding && (labelVocabDim <= labelEmbeddingDim)) ? labelSequence : Times(labelEmbeddingWeights, labelSequence), L"labelEmbedding");

@@ -63,8 +63,20 @@ void TrainSequenceToSequenceTranslator(const DeviceDescriptor& device, bool useS
labelSentenceStartEmbeddedScattered = Reshape(labelSentenceStartEmbeddedScattered, labelSentenceStartEmbeddedScattered->Output().Shape().AppendShape({ 1 }), L"labelSentenceStartEmbeddedScattered");
}

auto thoughtVectorBroadcastH = Sequence::BroadcastAs(thoughtVectorH, labelEmbedding, L"thoughtVectorBroadcastH");
auto thoughtVectorBroadcastC = Sequence::BroadcastAs(thoughtVectorC, labelEmbedding, L"thoughtVectorBroadcastC");
auto actualThoughtVectorBroadcastH = Sequence::BroadcastAs(thoughtVectorH, labelEmbedding, L"thoughtVectorBroadcastH");
auto actualThoughtVectorBroadcastC = Sequence::BroadcastAs(thoughtVectorC, labelEmbedding, L"thoughtVectorBroadcastC");

Variable thoughtVectorBroadcastH, thoughtVectorBroadcastC;
if (usePlaceholders)
{
thoughtVectorBroadcastH = PlaceholderVariable();
thoughtVectorBroadcastC = PlaceholderVariable();
}
else
{
thoughtVectorBroadcastH = actualThoughtVectorBroadcastH;
thoughtVectorBroadcastC = actualThoughtVectorBroadcastC;
}

/* Decoder */
auto beamSearchReorderHook = Constant({ 1, 1 }, 1.0f, device);

@@ -116,6 +128,10 @@ void TrainSequenceToSequenceTranslator(const DeviceDescriptor& device, bool useS
auto biasWeights = Parameter({ labelVocabDim }, 0.0f, device);

auto z = Plus(Times(outputLayerProjWeights, Stabilize<float>(decoderOutput, device)), biasWeights, L"classifierOutput");

if (usePlaceholders)
z->ReplacePlaceholders({ { thoughtVectorBroadcastH, actualThoughtVectorBroadcastH }, { thoughtVectorBroadcastC, actualThoughtVectorBroadcastC } });

auto ce = CrossEntropyWithSoftmax(z, labelSequence, L"lossFunction");
auto errs = ClassificationError(z, labelSequence, L"classificationError");

@@ -218,8 +234,8 @@ void TrainSequenceToSequenceTranslator()
fprintf(stderr, "\nTrainSequenceToSequenceTranslator..\n");

// TODO: Also test with sparse input variables in the graph
TrainSequenceToSequenceTranslator(DeviceDescriptor::CPUDevice(), false, true, false, true, true);
TrainSequenceToSequenceTranslator(DeviceDescriptor::CPUDevice(), false, true, false, false, true, true);

if (IsGPUAvailable())
TrainSequenceToSequenceTranslator(DeviceDescriptor::GPUDevice(0), false, false, true, false, false);
TrainSequenceToSequenceTranslator(DeviceDescriptor::GPUDevice(0), false, false, true, true, false, false);
}
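Editor's note: the usePlaceholders path exercises a two-step composition idiom — build part of the graph against placeholder stand-ins, then patch in the actual variables with ReplacePlaceholders once they exist. A plain-Python sketch of that substitution mechanic (an illustrative toy expression graph, not the CNTK API):

class Node:
    def __init__(self, op, *inputs):
        self.op, self.inputs = op, list(inputs)

    def replace_placeholders(self, mapping):
        # Walk the graph and splice the real nodes in, as
        # z->ReplacePlaceholders(...) does in the test above.
        for i, inp in enumerate(self.inputs):
            if inp.op == "placeholder" and inp in mapping:
                self.inputs[i] = mapping[inp]
            else:
                inp.replace_placeholders(mapping)

ph_h = Node("placeholder")             # stands in for thoughtVectorBroadcastH
decoder = Node("plus", Node("input"), ph_h)

actual_h = Node("broadcast_as")        # built later, from the encoder side
decoder.replace_placeholders({ph_h: actual_h})
assert decoder.inputs[1] is actual_h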
@@ -19,6 +19,8 @@
%rename(gpu_device) CNTK::DeviceDescriptor::GPUDevice;
%rename(cpu_device) CNTK::DeviceDescriptor::CPUDevice;
%rename(times_transpose) CNTK::TransposeTimes;
%rename(sequence_slice) CNTK::Sequence::Slice;
%rename(sequence_reduce_sum) CNTK::Sequence::ReduceSum;

%rename(momentum_as_time_constant_schedule) CNTK::MomentumAsTimeConstantSchedule;

@@ -42,7 +44,6 @@
%template() std::vector<CNTK::Axis>;
%template() std::vector<CNTK::DeviceDescriptor>;
%template() std::vector<CNTK::StreamConfiguration>;
//%template() std::vector<CNTK::DictionaryValue>;
%template() std::vector<std::shared_ptr<CNTK::Function>>;
%template() std::vector<std::shared_ptr<CNTK::Learner>>;
%template() std::pair<size_t, double>;

@@ -86,7 +87,7 @@ def dynamic_axes(self):
for (size_t i=0; i<rank; i++)
{
size_t dim = (&shape)->operator[](i);
PyTuple_SetItem(result, i, PyInt_FromLong(dim));
PyTuple_SetItem(result, rank-i-1, PyInt_FromLong(dim));
}
return result;
}

@@ -160,6 +161,57 @@ def dynamic_axes(self):
}
}

//
// Converting Python list {DictionaryValue} to std::vector
//
%typecheck(1000) std::vector<CNTK::DictionaryValue>& {
// '1000' is the typecheck precedence code. It means: check after basic
// types, but before arrays. See: http://www.swig.org/Doc1.3/Typemaps.html#Typemaps_overloading
$1 = PyList_Check($input) ? 1 : 0;
}

%typemap(in) std::vector<CNTK::DictionaryValue>& {
if (PyList_Check($input)) {
std::vector<CNTK::DictionaryValue>* vec = new std::vector<CNTK::DictionaryValue>();

PyObject *item;

PyObject *iterator = PyObject_GetIter($input);
if (iterator == NULL) {
SWIG_exception_fail(SWIG_ValueError, "cannot convert list element to CNTK::DictionaryValue");
}

while ((item = PyIter_Next(iterator))) {
void *raw_var = 0 ;
int res1 = SWIG_ConvertPtr(item, &raw_var, SWIGTYPE_p_CNTK__DictionaryValue, 0);
if (!SWIG_IsOK(res1)) {
SWIG_exception_fail(SWIG_ArgError(res1), "cannot convert list element to CNTK::DictionaryValue");
}
if (!raw_var) {
SWIG_exception_fail(SWIG_ValueError, "invalid null reference when converting a list element to CNTK::DictionaryValue");
}

CNTK::DictionaryValue* var = reinterpret_cast<CNTK::DictionaryValue*>(raw_var);

vec->push_back(*var);

Py_DECREF(item);
}

Py_DECREF(iterator);

if (PyErr_Occurred()) {
SWIG_exception_fail(SWIG_ValueError, "cannot convert list element to CNTK::DictionaryValue");
}

$1 = vec;

} else {
SWIG_exception(SWIG_ValueError, "list expected");
}
}


%fragment("DictionaryValueToPy", "header", fragment="NDShapeToTuple", fragment="NDArrayViewToNumPy")
{
PyObject *DictionaryValueToPy(const CNTK::DictionaryValue& dictVal)
@@ -340,10 +392,10 @@ fail:

%typemap(in) CNTK::NDShape const & {
if (PyTuple_Check($input)) {
std::vector<size_t> dimensions;
size_t rank = PyTuple_Size($input);
std::vector<size_t> dimensions(rank);
for (size_t i=0; i<rank; i++)
dimensions.push_back(PyLong_AsLong(PyTuple_GET_ITEM($input, i)));
dimensions[i] = PyLong_AsLong(PyTuple_GET_ITEM($input, rank-i-1));

$1 = new CNTK::NDShape(dimensions);
} else {
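Editor's note: this typemap and the NDShapeToTuple change earlier apply one convention consistently — CNTK shapes are column-major, so a row-major Python shape tuple is reversed on the way in and restored on the way out. In NumPy terms:

import numpy as np

a = np.arange(24).reshape(2, 3, 4)   # row-major Python shape (2, 3, 4)

# What the typemap above does for element i of the tuple:
# dimensions[i] = py_shape[rank - i - 1], i.e. a plain reversal.
cntk_dims = tuple(reversed(a.shape))
assert cntk_dims == (4, 3, 2)        # the NDShape CNTK sees: [4 x 3 x 2]

# Reversing again on the way out restores the Python ordering.
assert tuple(reversed(cntk_dims)) == a.shape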
@@ -405,97 +457,60 @@ fail:
//
// Converting Python dictionary {Variable: ValuePtr} to std::unordered_map
//
%typecheck(1000) const std::unordered_map<CNTK::Variable, const CNTK::ValuePtr>&, std::unordered_map<CNTK::Variable, CNTK::ValuePtr>& {

%define %unordered_map_conversion(KEY_TYPE, VALUE_TYPE, SWIG_KEY_TYPE, SWIG_VALUE_TYPE)
// '1000' is the typecheck precedence code. It means: check after basic
// types, but before arrays. See: http://www.swig.org/Doc1.3/Typemaps.html#Typemaps_overloading
$1 = PyDict_Check($input) ? 1 : 0;
}
%typecheck(1000) std::unordered_map<KEY_TYPE, VALUE_TYPE> const&,
const std::unordered_map<KEY_TYPE, VALUE_TYPE>&,
std::unordered_map<KEY_TYPE, VALUE_TYPE>&
{ $1 = PyDict_Check($input) ? 1 : 0; }

%typemap(in) const std::unordered_map<CNTK::Variable, const CNTK::ValuePtr>& (
std::unordered_map<CNTK::Variable, const CNTK::ValuePtr> args_map
) {
if (PyDict_Check($input)) {
%typemap(in) std::unordered_map<KEY_TYPE, VALUE_TYPE>& (
std::unordered_map<KEY_TYPE, VALUE_TYPE> args_map
) {
if (PyDict_Check($input)) {

PyObject *key, *value;
Py_ssize_t pos = 0;
PyObject *key, *value;
Py_ssize_t pos = 0;

while (PyDict_Next($input, &pos, &key, &value)) {
void *raw_var = 0 ;
int res1 = SWIG_ConvertPtr(key, &raw_var, SWIG_KEY_TYPE, 0);
if (!SWIG_IsOK(res1)) {
SWIG_exception_fail(SWIG_ArgError(res1), "cannot convert key of dictionary");
}
if (!raw_var) {
SWIG_exception_fail(SWIG_ValueError, "invalid null reference when converting key of dictionary");
}

KEY_TYPE* var = reinterpret_cast<KEY_TYPE*>(raw_var);

void *raw_value = 0;
int res2 = SWIG_ConvertPtr(value, &raw_value, SWIG_VALUE_TYPE, 0);
if (!SWIG_IsOK(res2)) {
SWIG_exception_fail(SWIG_ArgError(res2), "cannot convert value of dictionary");
}

VALUE_TYPE* value;
if (raw_value) {
value = reinterpret_cast<VALUE_TYPE*>(raw_value);
args_map.insert(std::make_pair(*var, *value));
} else {
// We got an empty VALUE_TYPE, which carries a nullptr.
// This is only used for ValuePtr
args_map.insert(std::make_pair(*var, VALUE_TYPE()));
}

while (PyDict_Next($input, &pos, &key, &value)) {
void *raw_var = 0 ;
int res1 = SWIG_ConvertPtr(key, &raw_var, SWIGTYPE_p_CNTK__Variable, 0);
if (!SWIG_IsOK(res1)) {
SWIG_exception_fail(SWIG_ArgError(res1), "cannot convert key of dictionary to CNTK::Variable");
}
if (!raw_var) {
SWIG_exception_fail(SWIG_ValueError, "invalid null reference when converting key of dictionary to CNTK::Variable");
}

CNTK::Variable* var = reinterpret_cast<CNTK::Variable*>(raw_var);
$1 = &args_map;
} else {
SWIG_exception(SWIG_TypeError, "dictionary expected");
}
}
%enddef

void *raw_value = 0;
int res2 = SWIG_ConvertPtr(value, &raw_value, SWIGTYPE_p_std__shared_ptrT_CNTK__Value_t, 0);
if (!SWIG_IsOK(res2)) {
SWIG_exception_fail(SWIG_ArgError(res2), "cannot convert value of dictionary to CNTK::ValuePtr");
}

CNTK::ValuePtr* value;
if (raw_value) {
value = reinterpret_cast<CNTK::ValuePtr*>(raw_value);
args_map.insert(std::make_pair(*var, *value));
} else {
// We got an empty ValuePtr, which carries a nullptr.
args_map.insert(std::make_pair(*var, CNTK::ValuePtr()));
}

}

$1 = &args_map;
} else {
SWIG_exception(SWIG_TypeError, "dictionary expected");
}
}

// supporting the non-const version
%typemap(in) std::unordered_map<CNTK::Variable, CNTK::ValuePtr>& (
std::unordered_map<CNTK::Variable, CNTK::ValuePtr> args_map
) {
if (PyDict_Check($input)) {

PyObject *key, *value;
Py_ssize_t pos = 0;

while (PyDict_Next($input, &pos, &key, &value)) {
void *raw_var = 0 ;
int res1 = SWIG_ConvertPtr(key, &raw_var, SWIGTYPE_p_CNTK__Variable, 0);
if (!SWIG_IsOK(res1)) {
SWIG_exception_fail(SWIG_ArgError(res1), "cannot convert key of dictionary to CNTK::Variable");
}
if (!raw_var) {
SWIG_exception_fail(SWIG_ValueError, "invalid null reference when converting key of dictionary to CNTK::Variable");
}

CNTK::Variable* var = reinterpret_cast<CNTK::Variable*>(raw_var);

void *raw_value = 0;
int res2 = SWIG_ConvertPtr(value, &raw_value, SWIGTYPE_p_std__shared_ptrT_CNTK__Value_t, 0);
if (!SWIG_IsOK(res2)) {
SWIG_exception_fail(SWIG_ArgError(res2), "cannot convert value of dictionary to CNTK::ValuePtr");
}

CNTK::ValuePtr* value;
if (raw_value) {
value = reinterpret_cast<CNTK::ValuePtr*>(raw_value);
args_map.insert(std::make_pair(*var, *value));
} else {
// We got an empty ValuePtr, which carries a nullptr.
args_map.insert(std::make_pair(*var, CNTK::ValuePtr()));
}
}

$1 = &args_map;
} else {
SWIG_exception(SWIG_TypeError, "dictionary expected");
}
}

// For the output dict (the non-const unordered_map) we need to get the
// modified values and put them back into the dictionary. This is used, when
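Editor's note: once %unordered_map_conversion is instantiated for a key/value pair (see the instantiation list further down), every wrapped C++ entry point taking that std::unordered_map accepts a plain Python dict, with keys and values converted element-wise through SWIG_ConvertPtr. A sketch of what that buys on the Python side — the forward call and argument names here are illustrative assumptions, not a confirmed API:

# A C++ signature like Forward(const unordered_map<Variable, ValuePtr>&, ...)
# becomes callable with an ordinary dict. Per the typemap above, a None
# value converts to an empty ValuePtr (a nullptr on the C++ side), which
# is how an output slot can be left for CNTK to allocate.
outputs = {output_var: None}
func.forward({input_var: input_value}, outputs)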
@@ -727,368 +742,6 @@ fail:
}
}

//
// Converting Python dictionary {Parameter: NDArrayViewPtr} to std::unordered_map
//
%typecheck(1000) const std::unordered_map<CNTK::Parameter, CNTK::NDArrayViewPtr>& {
// '1000' is the typecheck precedence code. It means: check after basic
// types, but before arrays. See: http://www.swig.org/Doc1.3/Typemaps.html#Typemaps_overloading
$1 = PyDict_Check($input) ? 1 : 0;
}

%typemap(in) const std::unordered_map<CNTK::Parameter, CNTK::NDArrayViewPtr>& (
std::unordered_map<CNTK::Parameter, CNTK::NDArrayViewPtr> args_map
) {
if (PyDict_Check($input)) {

PyObject *key, *value;
Py_ssize_t pos = 0;

while (PyDict_Next($input, &pos, &key, &value)) {
void *raw_var = 0 ;
int res1 = SWIG_ConvertPtr(key, &raw_var, SWIGTYPE_p_CNTK__Parameter, 0);
if (!SWIG_IsOK(res1)) {
SWIG_exception_fail(SWIG_ArgError(res1), "cannot convert key of dictionary to CNTK::Parameter");
}
if (!raw_var) {
SWIG_exception_fail(SWIG_ValueError, "invalid null reference when converting key of dictionary to CNTK::Parameter");
}

CNTK::Parameter* var = reinterpret_cast<CNTK::Parameter*>(raw_var);

void *raw_value = 0;
int res2 = SWIG_ConvertPtr(value, &raw_value, SWIGTYPE_p_std__shared_ptrT_CNTK__NDArrayView_t, 0);
if (!SWIG_IsOK(res2)) {
SWIG_exception_fail(SWIG_ArgError(res2), "cannot convert value of dictionary to CNTK::NDArrayViewPtr");
}

CNTK::NDArrayViewPtr* value;
if (raw_value) {
value = reinterpret_cast<CNTK::NDArrayViewPtr*>(raw_value);
} else {
// We got an empty NDArrayViewPtr, which carries a nullptr.
value = new CNTK::NDArrayViewPtr();
}

args_map.insert(std::make_pair(*var, *value));
}

$1 = &args_map;
} else {
SWIG_exception(SWIG_TypeError, "dictionary expected");
}
}

//
// Converting Python list {DictionaryValue} to std::vector
//
%typecheck(1000) std::vector<CNTK::DictionaryValue>& {
// '1000' is the typecheck precedence code. It means: check after basic
// types, but before arrays. See: http://www.swig.org/Doc1.3/Typemaps.html#Typemaps_overloading
$1 = PyList_Check($input) ? 1 : 0;
}

%typemap(in) std::vector<CNTK::DictionaryValue>& {
if (PyList_Check($input)) {
std::vector<CNTK::DictionaryValue>* vec = new std::vector<CNTK::DictionaryValue>();

PyObject *item;

PyObject *iterator = PyObject_GetIter($input);
if (iterator == NULL) {
SWIG_exception_fail(SWIG_ValueError, "cannot convert list element to CNTK::DictionaryValue");
}

while ((item = PyIter_Next(iterator))) {
void *raw_var = 0 ;
int res1 = SWIG_ConvertPtr(item, &raw_var, SWIGTYPE_p_CNTK__DictionaryValue, 0);
if (!SWIG_IsOK(res1)) {
SWIG_exception_fail(SWIG_ArgError(res1), "cannot convert list element to CNTK::DictionaryValue");
}
if (!raw_var) {
SWIG_exception_fail(SWIG_ValueError, "invalid null reference when converting a list element to CNTK::DictionaryValue");
}

CNTK::DictionaryValue* var = reinterpret_cast<CNTK::DictionaryValue*>(raw_var);

vec->push_back(*var);

Py_DECREF(item);
}

Py_DECREF(iterator);

if (PyErr_Occurred()) {
SWIG_exception_fail(SWIG_ValueError, "cannot convert list element to CNTK::DictionaryValue");
}

$1 = vec;

} else {
SWIG_exception(SWIG_ValueError, "list expected");
}
}

// end of map conversion

// TODO: Parametrize the following four typemaps and unify set/list usage.

//
// Converting Python set {Variable} to std::unordered_set
//
%typecheck(1000) std::unordered_set<CNTK::Variable>& {
// '1000' is the typecheck precedence code. It means: check after basic
// types, but before arrays. See: http://www.swig.org/Doc1.3/Typemaps.html#Typemaps_overloading
$1 = PySet_Check($input) ? 1 : 0;
}

%typemap(in) std::unordered_set<CNTK::Variable>& (
std::unordered_set<CNTK::Variable> args_set
) {
if (PySet_Check($input)) {

PyObject *item;

PyObject *iterator = PyObject_GetIter($input);
if (iterator == NULL) {
SWIG_exception_fail(SWIG_ValueError, "cannot convert list element to CNTK::Variable");
}

while ((item = PyIter_Next(iterator))) {
void *raw_var = 0 ;
int res1 = SWIG_ConvertPtr(item, &raw_var, SWIGTYPE_p_CNTK__Variable, 0);
if (!SWIG_IsOK(res1)) {
SWIG_exception_fail(SWIG_ArgError(res1), "cannot convert set element to CNTK::Variable");
}
if (!raw_var) {
SWIG_exception_fail(SWIG_ValueError, "invalid null reference when converting a list element to CNTK::Variable");
}

CNTK::Variable* var = reinterpret_cast<CNTK::Variable*>(raw_var);

args_set.insert(*var);

Py_DECREF(item);
}

Py_DECREF(iterator);

if (PyErr_Occurred()) {
SWIG_exception_fail(SWIG_ValueError, "cannot convert set element to CNTK::Variable");
}

$1 = &args_set;

} else {
SWIG_exception(SWIG_ValueError, "set expected");
}
}

//
// Converting Python set {StreamInformation} to std::unordered_set
//
%typecheck(1000) std::unordered_set<CNTK::StreamInformation>& {
// '1000' is the typecheck precedence code. It means: check after basic
// types, but before arrays. See: http://www.swig.org/Doc1.3/Typemaps.html#Typemaps_overloading
$1 = PySet_Check($input) ? 1 : 0;
}

%typemap(in) std::unordered_set<CNTK::StreamInformation>& (
std::unordered_set<CNTK::StreamInformation> args_set
) {
if (PySet_Check($input)) {

PyObject *item;

PyObject *iterator = PyObject_GetIter($input);
if (iterator == NULL) {
SWIG_exception_fail(SWIG_ValueError, "cannot convert list element to CNTK::StreamInformation");
}

while ((item = PyIter_Next(iterator))) {
void *raw_var = 0 ;
int res1 = SWIG_ConvertPtr(item, &raw_var, SWIGTYPE_p_CNTK__StreamInformation, 0);
if (!SWIG_IsOK(res1)) {
SWIG_exception_fail(SWIG_ArgError(res1), "cannot convert set element to CNTK::StreamInformation");
}
if (!raw_var) {
SWIG_exception_fail(SWIG_ValueError, "invalid null reference when converting a set element to CNTK::StreamInformation");
}

CNTK::StreamInformation* var = reinterpret_cast<CNTK::StreamInformation*>(raw_var);

args_set.insert(*var);

Py_DECREF(item);
}

Py_DECREF(iterator);

if (PyErr_Occurred()) {
SWIG_exception_fail(SWIG_ValueError, "cannot convert set element to CNTK::StreamInformation");
}

$1 = &args_set;

} else {
SWIG_exception(SWIG_ValueError, "set expected");
}
}

//
// Converting Python list {Parameter} to std::unordered_set
//
%typecheck(1000) std::unordered_set<CNTK::Parameter>& {
// '1000' is the typecheck precedence code. It means: check after basic
// types, but before arrays. See: http://www.swig.org/Doc1.3/Typemaps.html#Typemaps_overloading
$1 = PyList_Check($input) ? 1 : 0;
}

%typemap(in) std::unordered_set<CNTK::Parameter>& (
std::unordered_set<CNTK::Parameter> args_set
) {
if (PyList_Check($input)) {

PyObject *item;

PyObject *iterator = PyObject_GetIter($input);
if (iterator == NULL) {
SWIG_exception_fail(SWIG_ValueError, "cannot convert list element to CNTK::Parameter");
}

while ((item = PyIter_Next(iterator))) {
void *raw_var = 0 ;
int res1 = SWIG_ConvertPtr(item, &raw_var, SWIGTYPE_p_CNTK__Parameter, 0);
if (!SWIG_IsOK(res1)) {
SWIG_exception_fail(SWIG_ArgError(res1), "cannot convert set element to CNTK::Parameter");
}
if (!raw_var) {
SWIG_exception_fail(SWIG_ValueError, "invalid null reference when converting a list element to CNTK::Parameter");
}

CNTK::Parameter* var = reinterpret_cast<CNTK::Parameter*>(raw_var);

args_set.insert(*var);

Py_DECREF(item);
}

Py_DECREF(iterator);

if (PyErr_Occurred()) {
SWIG_exception_fail(SWIG_ValueError, "cannot convert set element to CNTK::Parameter");
}

$1 = &args_set;

} else {
SWIG_exception(SWIG_ValueError, "list expected");
}
}


//
// Converting Python list {LearnerPtr} to std::unordered_set
//
%typecheck(1000) std::unordered_set<CNTK::LearnerPtr>& {
// '1000' is the typecheck precedence code. It means: check after basic
// types, but before arrays. See: http://www.swig.org/Doc1.3/Typemaps.html#Typemaps_overloading
$1 = PyList_Check($input) ? 1 : 0;
}

%typemap(in) std::unordered_set<CNTK::LearnerPtr>& (
std::unordered_set<CNTK::LearnerPtr> args_set
) {
if (PyList_Check($input)) {

PyObject *item;

PyObject *iterator = PyObject_GetIter($input);
if (iterator == NULL) {
SWIG_exception_fail(SWIG_ValueError, "cannot convert list element to CNTK::LearnerPtr");
}

while ((item = PyIter_Next(iterator))) {
void *raw_var = 0 ;
int res1 = SWIG_ConvertPtr(item, &raw_var, SWIGTYPE_p_std__shared_ptrT_CNTK__Learner_t, 0);
if (!SWIG_IsOK(res1)) {
SWIG_exception_fail(SWIG_ArgError(res1), "cannot convert list element to CNTK::LearnerPtr");
}
if (!raw_var) {
SWIG_exception_fail(SWIG_ValueError, "invalid null reference when converting a list element to CNTK::LearnerPtr");
}

CNTK::LearnerPtr* var = reinterpret_cast<CNTK::LearnerPtr*>(raw_var);

args_set.insert(*var);

Py_DECREF(item);
}

Py_DECREF(iterator);

if (PyErr_Occurred()) {
SWIG_exception_fail(SWIG_ValueError, "cannot convert list element to CNTK::LearnerPtr");
}

$1 = &args_set;

} else {
SWIG_exception(SWIG_ValueError, "list expected");
}
}

%typecheck(1000) const std::unordered_map<CNTK::Variable, CNTK::Variable>& {
// '1000' is the typecheck precedence code. It means: check after basic
// types, but before arrays. See: http://www.swig.org/Doc1.3/Typemaps.html#Typemaps_overloading
$1 = PyDict_Check($input) ? 1 : 0;
}


%typemap(in) std::unordered_map<CNTK::Variable, CNTK::Variable>& (
std::unordered_map<CNTK::Variable, CNTK::Variable> args_map
) {
if (PyDict_Check($input)) {

PyObject *key, *value;
Py_ssize_t pos = 0;

while (PyDict_Next($input, &pos, &key, &value)) {
void *raw_var = 0 ;
int res1 = SWIG_ConvertPtr(key, &raw_var, SWIGTYPE_p_CNTK__Variable, 0);
if (!SWIG_IsOK(res1)) {
SWIG_exception_fail(SWIG_ArgError(res1), "cannot convert key of dictionary to CNTK::Variable");
}
if (!raw_var) {
SWIG_exception_fail(SWIG_ValueError, "invalid null reference when converting key of dictionary to CNTK::Variable");
}

CNTK::Variable* var = reinterpret_cast<CNTK::Variable*>(raw_var);

void *raw_value = 0;
int res2 = SWIG_ConvertPtr(value, &raw_value, SWIGTYPE_p_CNTK__Variable, 0);
if (!SWIG_IsOK(res2)) {
SWIG_exception_fail(SWIG_ArgError(res2), "cannot convert value of dictionary to CNTK::Variable");
}

CNTK::Variable* value;
if (raw_value) {
value = reinterpret_cast<CNTK::Variable*>(raw_value);
} else {
// We got an empty Variable, which carries a nullptr.
value = new CNTK::Variable();
}

args_map.insert(std::make_pair(*var, *value));
}

$1 = &args_map;
} else {
SWIG_exception(SWIG_TypeError, "dictionary expected");
}
}



//
// Converting std::unordered_set to Python list.
@@ -1104,9 +757,9 @@ fail:
{
SWIG_exception(SWIG_RuntimeError, "error passing set to Python");
}

// *&$1 -> $1 is the returned result being converted (unordered_set<...>*),
// wrapped by SwigValueWrapper. So we need to unwrap it using '&',
// wrapped by SwigValueWrapper. So we need to unwrap it using '&',
// then access its value using '*'.
for (auto var : *&$1)
{

@@ -1119,15 +772,58 @@ fail:
$result = container;
}
%enddef

%unordered_set_conversion(Variable, SWIGTYPE_p_CNTK__Variable)
%unordered_set_conversion(Constant, SWIGTYPE_p_CNTK__Constant)
%unordered_set_conversion(Parameter, SWIGTYPE_p_CNTK__Parameter)
%unordered_set_conversion(DistributedWorkerDescriptor, SWIGTYPE_p_CNTK__DistributedWorkerDescriptor)


%define %unordered_set_ref_conversion(DATA_TYPE, _SWIG_TYPE)

%typemap(out) std::unordered_set<CNTK::DATA_TYPE>& {
%typecheck(1000) std::unordered_set<DATA_TYPE>&, std::unordered_set<DATA_TYPE>const & {
// '1000' is the typecheck precedence code. It means: check after basic
// types, but before arrays. See: http://www.swig.org/Doc1.3/Typemaps.html#Typemaps_overloading
$1 = PySet_Check($input) || PyList_Check($input) ? 1 : 0;
}

%typemap(in) std::unordered_set<DATA_TYPE>& (
std::unordered_set<DATA_TYPE> args_set
) {
if (PySet_Check($input) || PyList_Check($input)) {

PyObject *item;

PyObject *iterator = PyObject_GetIter($input);
if (iterator == NULL) {
SWIG_exception_fail(SWIG_ValueError, "cannot convert element");
}

while ((item = PyIter_Next(iterator))) {
void *raw_var = 0 ;
int res1 = SWIG_ConvertPtr(item, &raw_var, _SWIG_TYPE, 0);
if (!SWIG_IsOK(res1)) {
SWIG_exception_fail(SWIG_ArgError(res1), "cannot convert set element");
}
if (!raw_var) {
SWIG_exception_fail(SWIG_ValueError, "invalid null reference");
}

DATA_TYPE* var = reinterpret_cast<DATA_TYPE*>(raw_var);

args_set.insert(*var);

Py_DECREF(item);
}

Py_DECREF(iterator);

if (PyErr_Occurred()) {
SWIG_exception_fail(SWIG_ValueError, "cannot convert set element");
}

$1 = &args_set;

} else {
SWIG_exception(SWIG_ValueError, "set expected");
}
}

%typemap(out) std::unordered_set<DATA_TYPE>& {
PyObject* container = PyList_New(0);
if (container == NULL)
{

@@ -1136,7 +832,7 @@ fail:

for (auto var : *$1)
{
PyObject *item = SWIG_NewPointerObj(new CNTK::DATA_TYPE(var), _SWIG_TYPE, SWIG_POINTER_OWN );
PyObject *item = SWIG_NewPointerObj(new DATA_TYPE(var), _SWIG_TYPE, SWIG_POINTER_OWN );
// No error handling here, because the error will be passed directly to Python
PyList_Append(container, item);
Py_DECREF(item);

@@ -1146,16 +842,23 @@ fail:
}
%enddef

%unordered_set_ref_conversion(StreamInformation, SWIGTYPE_p_CNTK__StreamInformation)
%unordered_set_ref_conversion(LearnerPtr, SWIGTYPE_p_std__shared_ptrT_CNTK__Learner_t)
%unordered_set_ref_conversion(Parameter, SWIGTYPE_p_CNTK__Parameter)
%unordered_set_ref_conversion(DistributedWorkerDescriptor, SWIGTYPE_p_CNTK__DistributedWorkerDescriptor)
%unordered_set_conversion(CNTK::Variable, SWIGTYPE_p_CNTK__Variable)
%unordered_set_conversion(CNTK::Constant, SWIGTYPE_p_CNTK__Constant)
%unordered_set_conversion(CNTK::Parameter, SWIGTYPE_p_CNTK__Parameter)
%unordered_set_conversion(CNTK::StreamInformation, SWIGTYPE_p_CNTK__StreamInformation)
%unordered_set_conversion(CNTK::DistributedWorkerDescriptor, SWIGTYPE_p_CNTK__DistributedWorkerDescriptor)

%unordered_set_ref_conversion(CNTK::Variable, SWIGTYPE_p_CNTK__Variable)
%unordered_set_ref_conversion(CNTK::Parameter, SWIGTYPE_p_CNTK__Parameter)
%unordered_set_ref_conversion(CNTK::StreamInformation, SWIGTYPE_p_CNTK__StreamInformation)
%unordered_set_ref_conversion(CNTK::LearnerPtr, SWIGTYPE_p_std__shared_ptrT_CNTK__Learner_t)
%unordered_set_ref_conversion(CNTK::DistributedWorkerDescriptor, SWIGTYPE_p_CNTK__DistributedWorkerDescriptor)

// Unordered map conversion

%define %unordered_map_ref_conversion(DATA_TYPE1, _SWIG_TYPE1, DATA_TYPE2, _SWIG_TYPE2)

%typemap(out) std::unordered_map<CNTK::DATA_TYPE1, CNTK::DATA_TYPE2>& {
%typemap(out) std::unordered_map<DATA_TYPE1, DATA_TYPE2>& {
PyObject* container = PyDict_New();
if (container == NULL)
{

@@ -1167,8 +870,8 @@ fail:
// then access its value using '*'.
for (auto it : *$1)
{
PyObject *returned_var = SWIG_NewPointerObj(SWIG_as_voidptr(new CNTK::DATA_TYPE1(it.first)), _SWIG_TYPE1, SWIG_POINTER_OWN);
PyObject *returned_val = SWIG_NewPointerObj(SWIG_as_voidptr(new CNTK::DATA_TYPE2(it.second)), _SWIG_TYPE2, SWIG_POINTER_OWN);
PyObject *returned_var = SWIG_NewPointerObj(SWIG_as_voidptr(new DATA_TYPE1(it.first)), _SWIG_TYPE1, SWIG_POINTER_OWN);
PyObject *returned_val = SWIG_NewPointerObj(SWIG_as_voidptr(new DATA_TYPE2(it.second)), _SWIG_TYPE2, SWIG_POINTER_OWN);

PyDict_SetItem(container, returned_var, returned_val);


@@ -1180,8 +883,15 @@ fail:
}
%enddef

%unordered_map_ref_conversion(StreamInformation, SWIGTYPE_p_CNTK__StreamInformation, MinibatchData, SWIGTYPE_p_CNTK__MinibatchData);
%unordered_map_ref_conversion(Parameter, SWIGTYPE_p_CNTK__Parameter, NDArrayViewPtr, SWIGTYPE_p_std__shared_ptrT_CNTK__NDArrayView);
%unordered_map_conversion(CNTK::Variable, const CNTK::ValuePtr, SWIGTYPE_p_CNTK__Variable, SWIGTYPE_p_std__shared_ptrT_CNTK__Value_t)
%unordered_map_conversion(CNTK::Variable, CNTK::ValuePtr, SWIGTYPE_p_CNTK__Variable, SWIGTYPE_p_std__shared_ptrT_CNTK__Value_t)
%unordered_map_conversion(CNTK::Variable, CNTK::Variable, SWIGTYPE_p_CNTK__Variable, SWIGTYPE_p_CNTK__Variable)
%unordered_map_conversion(CNTK::Parameter, const CNTK::NDArrayViewPtr, SWIGTYPE_p_CNTK__Parameter, SWIGTYPE_p_std__shared_ptrT_CNTK__NDArrayView_t)
%unordered_map_conversion(CNTK::Parameter, CNTK::NDArrayViewPtr, SWIGTYPE_p_CNTK__Parameter, SWIGTYPE_p_std__shared_ptrT_CNTK__NDArrayView_t)

%unordered_map_ref_conversion(CNTK::StreamInformation, SWIGTYPE_p_CNTK__StreamInformation, CNTK::MinibatchData, SWIGTYPE_p_CNTK__MinibatchData);
%unordered_map_ref_conversion(CNTK::Parameter, SWIGTYPE_p_CNTK__Parameter, CNTK::NDArrayViewPtr, SWIGTYPE_p_std__shared_ptrT_CNTK__NDArrayView);
%unordered_map_ref_conversion(CNTK::Variable, SWIGTYPE_p_CNTK__Variable, CNTK::Variable, SWIGTYPE_p_CNTK__Variable);

%shared_ptr(CNTK::Function)
%shared_ptr(CNTK::NDArrayView)
@@ -1206,7 +916,7 @@ fail:
%extend CNTK::NDMask {
PyObject* to_numpy() {
std::vector<size_t> cntk_dims = (*self).Shape().Dimensions();
static_assert(dims.size()==2, "mask requires exactly two dimensions");
static_assert(cntk_dims.size()==2, "mask requires exactly two dimensions");
std::vector<size_t> dimensions = {cntk_dims[1], cntk_dims[0]};

size_t num_elements = dimensions[0] * dimensions[1];

@@ -1258,17 +968,17 @@ fail:

PyArrayObject* array = (PyArrayObject*)pyobj;

int rank = PyArray_NDIM(array);

npy_intp* np_shape = PyArray_SHAPE(array);
std::vector<size_t> shape;
int rank = PyArray_NDIM(array);

npy_intp* np_shape = PyArray_SHAPE(array);
std::vector<size_t> shape(rank);

npy_intp num_elements = 1;
// CNTK uses column major, thus we reverse the shape
for (int i=rank-1; i>=0; i--)
for (int i=0; i<rank; i++)
{
shape.push_back(np_shape[i]);
num_elements *= np_shape[i];
shape[rank-i-1] = np_shape[i];
num_elements *= np_shape[i];
}

int typecode = PyArray_TYPE(array);
@@ -1342,7 +1052,7 @@ public:
// Setting up hash calculation so that __hash__ on Swig objects
// are redirected to the std::hash computation of the C++ API
//
%define %py_hash_for(DATA_TYPE, EQ)
%define %py_hash_for(DATA_TYPE)
%extend CNTK::DATA_TYPE {
const size_t __hash__() {
return std::hash<CNTK::DATA_TYPE>()(*$self);

@@ -1357,14 +1067,16 @@ DATA_TYPE.__eq__ = lambda a,b: EQ(a,b)
%enddef

%py_eq_for(Variable, Variable_eq)
%py_eq_for(Constant, Variable_eq)
%py_eq_for(Parameter, Variable_eq)
%py_eq_for(NDShape, NDShape_eq)
%py_hash_for(Variable)

%py_hash_for(Variable, Variable_eq)
%py_hash_for(Constant, Variable_eq)
%py_hash_for(Parameter, Variable_eq)
%py_hash_for(NDShape, NDShape_eq)
%py_eq_for(Constant, Variable_eq)
%py_hash_for(Constant)

%py_eq_for(Parameter, Variable_eq)
%py_hash_for(Parameter)

%py_eq_for(NDShape, NDShape_eq)
%py_hash_for(NDShape)

%py_eq_for(DeviceDescriptor, DeviceDescriptor_eq)
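Editor's note: pairing %py_eq_for with %py_hash_for per type matters because Python requires the two dunders to agree for objects used as dict keys or set members — exactly how Variable and Parameter are used in the typemaps above. A plain-Python illustration of the contract:

class Var:
    def __init__(self, uid):
        self.uid = uid

    # Both must be defined consistently: equal objects must hash equally,
    # or dict/set lookups silently miss.
    def __eq__(self, other):
        return isinstance(other, Var) and self.uid == other.uid

    def __hash__(self):
        return hash(self.uid)

a, b = Var("x"), Var("x")
assert a == b and hash(a) == hash(b)
assert len({a: 1, b: 2}) == 1   # b replaces a, as key equality intends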
@@ -1395,4 +1107,3 @@ for klass in [Variable, Value, NDArrayView, NDMask]:

enable_reversing_tensor_shapes_in_error_messages()
%}
@@ -10,7 +10,7 @@ import numpy as np

abs_path = os.path.dirname(os.path.abspath(__file__))

def test_text_format():
def _test_text_format():
    from cntk.io import text_format_minibatch_source, StreamConfiguration, MinibatchSource

    # 0 |x 560 |y 1 0 0 0 0

@@ -239,9 +239,8 @@ def convolution(convolution_map, operand, strides=(1,), sharing=[True],
    '''
    from cntk.cntk_py import convolution
    operand = sanitize_input(operand)
    return convolution(convolution_map, operand, tuple(reversed(strides)), sharing, auto_padding,
                       tuple(reversed(lower_pad)), tuple(
                           reversed(upper_pad)), transpose,
    return convolution(convolution_map, operand, tuple(strides), sharing, auto_padding,
                       tuple(lower_pad), tuple(upper_pad), transpose,
                       max_temp_mem_size_in_samples, name)
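Editor's note: with the NDShape typemap now reversing shape tuples centrally (see the SWIG change above), the Python convolution wrapper no longer pre-reverses its tuple arguments. A quick before/after of what reaches the binding layer, under the assumption that the typemap applies the same reversal to every tuple-to-NDShape conversion:

strides = (2, 3)

# Before: the wrapper reversed explicitly, and the binding passed it through.
explicit = tuple(reversed(strides))   # (3, 2)

# After: the wrapper passes (2, 3) unmodified; the NDShape typemap performs
# the row-major-to-column-major reversal once, for every conversion.
centralized = tuple(strides)          # (2, 3) at the Python boundary
assert tuple(reversed(centralized)) == explicit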
@ -63,6 +63,28 @@ def is_last(seq, name=''):
|
|||
seq = sanitize_input(seq, get_data_type(seq))
|
||||
return is_last(seq, name)
|
||||
|
||||
@typemap
|
||||
def slice(seq, begin_index, end_index, name=''):
|
||||
'''
|
||||
Slice the input sequence.
|
||||
|
||||
Examples:
|
||||
TBA
|
||||
Args:
|
||||
seq: sequence input tensor
|
||||
begin_index (`int`): the index along sequence axis where the slicing starts
|
||||
end_index (`int`): the index along sequence axis where the slicing ends
|
||||
name (`str`, optional): the name of the Function instance in the network
|
||||
|
||||
See also:
|
||||
Indexing in NumPy: http://docs.scipy.org/doc/numpy/reference/arrays.indexing.html
|
||||
|
||||
Returns:
|
||||
:class:`cntk.ops.functions.Function`
|
||||
'''
|
||||
from cntk.cntk_py import sequence_slice
|
||||
seq = sanitize_input(seq, get_data_type(seq))
|
||||
return sequence_slice(seq, begin_index, end_index, name)
|
||||
|
||||
@typemap
|
||||
def first(seq, name=''):
|
||||
|
@@ -281,3 +303,21 @@ def broadcast_as(operand, broadcast_as_operand, name=''):
    broadcast_as_operand = sanitize_input(
        broadcast_as_operand, get_data_type(broadcast_as_operand))
    return broadcast_as(operand, broadcast_as_operand, name)

@typemap
def reduce_sum(seq, name=''):
    '''
    Computes the sum of the input sequence's elements across the sequence axis.

    Examples:
        TBA
    Args:
        seq: sequence input tensor
        name (`str`, optional): the name of the Function instance in the network

    Returns:
        :class:`cntk.ops.functions.Function`
    '''
    from cntk.cntk_py import sequence_reduce_sum
    seq = sanitize_input(seq, get_data_type(seq))
    return sequence_reduce_sum(seq, name)
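A hedged sketch of the new sequence.reduce_sum; it folds the sequence axis, yielding one value per sequence. The input shape is illustrative:

from cntk.ops import input_variable, sequence

x = input_variable(1)            # scalar elements along the sequence axis
total = sequence.reduce_sum(x)   # sequence axis collapsed to a single sum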
@@ -19,8 +19,8 @@ TENSOR_PAIRS = [
    ([30.], [10.]),
    ([[10.]], [[30.]]),
    ([[1.5, 2.1]], [[10., 20.]]),
    #([[100., 200.], [300., 400.], [10., 20.]],
    # [[10., 20.], [30., 40.], [1., 2.]]),
    ([[100., 200.], [300., 400.], [10., 20.]],
     [[10., 20.], [30., 40.], [1., 2.]]),

    # Adding two 3x2 inputs of sequence length 1
    ([[30., 40.], [1., 2.], [0.1, 0.2]], [[10, 20], [3, 4], [-0.5, -0.4]]),

@@ -175,6 +175,8 @@ NEGATE_TENSORS = [
    ([[100., 200.], [300., 400.], [10., 20.]]),
    ([[30, 40], [1, 2], [0.1, 0.2]])
]


@pytest.mark.parametrize("operand", NEGATE_TENSORS)
def test_op_negate(operand, device_id, precision):
    t = -1 * AA(operand, dtype=PRECISION_TO_TYPE[precision])
@@ -193,34 +195,41 @@ def test_op_negate(operand, device_id, precision):
    _test_unary_op(precision, device_id, '-', operand,
                   expected_forward, expected_backward)

TIMES_PAIRS = [
# transpose_times currently only supports right operands of rank 1 or 2
TRANSPOSE_TIMES_PAIRS = [
    ([[30.]], [[10.]]),
    ([[1.5, 2.1]], [[10.], [20.]]),
    ([[100., 200.]], [[10.], [20.]]),
    ([[100., 200.]], [[-10.], [20.]]),
    ([[100., 200.], [300., 400.]], [[10.], [20.]]),
    ([[100., 200.], [300., 400.]], [[10., 20.], [20., 30.]])
    ([[100., 200.], [-300., 400.]], [[10., 20.], [20., 30.]]),
    (np.reshape(np.arange(24), (4, 3, 2)),
     np.array([[1, 3], [2, 4]])),
]

# TODO: Handle sparse matrices
# TODO: Handle sparse matrices (left_matrix_type, right_matrix_type)

# adding a rank 3 operand for the times operation
TIMES_PAIRS = TRANSPOSE_TIMES_PAIRS + \
    [(np.reshape(np.arange(8), (2, 2, 2)), np.reshape(np.arange(8), (2, 2, 2)))]


@pytest.mark.parametrize("left_operand, right_operand", TIMES_PAIRS)
def test_op_times(left_operand, right_operand, device_id, precision,
                  left_matrix_type, right_matrix_type):
def test_op_times(left_operand, right_operand, device_id, precision):
    dt_precision = PRECISION_TO_TYPE[precision]

    a = AA(left_operand, dtype=dt_precision)
    b = AA(right_operand, dtype=dt_precision)

    expected_forward = [[np.dot(a, b)]]

    assert len(a.shape) == len(b.shape) == 2
    expected_forward = [[np.tensordot(a, b, axes=len(b.shape) - 1)]]

    left_backward = np.zeros_like(a)
    left_backward[:, :] = b.sum(axis=1)
    left_backward[...] = b.sum(axis=-1)

    right_backward = np.zeros_like(b)
    right_backward[:, :] = np.transpose([a.sum(axis=0)])
    transpose_axes = list(np.roll(np.arange(len(b.shape)), -1))
    sum_axes = tuple(np.arange(0, len(a.shape) - len(b.shape) + 1))
    right_backward[...] = np.transpose(
        AA([a.sum(axis=sum_axes)]), axes=transpose_axes)

    expected_backward = {
        'left_arg': [[left_backward]],
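The tensordot expectation above generalizes the old rank-2 np.dot to rank-3 left operands. A quick pure-NumPy check of the contraction, using the rank-3 pair from TRANSPOSE_TIMES_PAIRS:

import numpy as np

a = np.reshape(np.arange(24), (4, 3, 2))
b = np.array([[1, 3], [2, 4]])
out = np.tensordot(a, b, axes=len(b.shape) - 1)  # contracts a's last axis with b's first
assert out.shape == (4, 3, 2)
assert np.array_equal(out[0, 0], np.dot(a[0, 0], b))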
@@ -231,3 +240,32 @@ def test_op_times(left_operand, right_operand, device_id, precision,

    _test_binary_op(precision, device_id, times,
                    left_operand, right_operand, expected_forward, expected_backward)


@pytest.mark.parametrize("left_operand, right_operand", TRANSPOSE_TIMES_PAIRS)
def test_op_transpose_times(left_operand, right_operand, device_id, precision):
    dt_precision = PRECISION_TO_TYPE[precision]

    # transpose right_operand to make the product possible
    right_operand = np.transpose(right_operand).tolist()

    a = AA(left_operand, dtype=dt_precision)
    b = AA(right_operand, dtype=dt_precision)

    expected_forward = [[np.dot(a, np.transpose(b))]]

    left_backward = np.zeros_like(a)
    left_backward[...] = b.sum(axis=tuple(range(len(b.shape) - 1)))

    right_backward = np.zeros_like(b)
    right_backward[...] = a.sum(axis=tuple(range(len(a.shape) - 1)))

    expected_backward = {
        'left_arg': [[left_backward]],
        'right_arg': [[right_backward]]
    }

    from cntk import times_transpose

    _test_binary_op(precision, device_id, times_transpose,
                    left_operand, right_operand, expected_forward, expected_backward)
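For concreteness, a pure-NumPy rendering of the forward value this test expects from times_transpose, using one of the rank-2 pairs above:

import numpy as np

a = np.array([[100., 200.], [300., 400.]])
b = np.array([[10., 20.], [20., 30.]])   # plays the already-transposed right operand
expected = np.dot(a, np.transpose(b))    # what times_transpose(a, b) should produce
assert expected.shape == (2, 2)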
@@ -166,8 +166,9 @@ def test_op_slice_sequence(input_data, slice_params, expected_result, device_id,
                       dynamic_axes=[Axis.default_batch_axis(), t],
                       name='a')

    result = C.slice(a, axis=t, begin_index=slice_params[
        0], end_index=slice_params[1])
    result = C.sequence.slice(a,
                              begin_index=slice_params[0],
                              end_index=slice_params[1])

    def grad_slice(x, beg_index, end_index):
        res = np.zeros_like(x)

@@ -176,8 +177,8 @@ def test_op_slice_sequence(input_data, slice_params, expected_result, device_id,

    expected_gradient = grad_slice(np.asarray(input_data), *slice_params)

    expected_forward = AA(
        [expected_result], dtype=PRECISION_TO_TYPE[precision])
    expected_forward = AA([expected_result],
                          dtype=PRECISION_TO_TYPE[precision])
    expected_backward = {
        a: [grad_slice(np.asarray(input_data), *slice_params)]
    }
@@ -183,12 +183,9 @@ def get_temp_filename(directory=None):

def sanitize_shape(shape):
    """
    If shape is scalar, it creates a tuple out of it and reverses it, as CNTK uses
    column major.
    If shape is scalar, it creates a tuple out of it.
    """
    if np.isscalar(shape):
        shape = (shape,)
    return tuple(reversed(shape))
    return _as_tuple(shape)


def sanitize_input(arg, fallback_dtype=np.float32, reshape=None):
@@ -383,14 +380,15 @@ def sanitize_batch(var, batch, seq_starts=None, data_type=None, device=None):
                        'array and not "%s"' % type(batch))

    from cntk.cntk_py import NDMask
    mask = NDMask((max(seq_lens), num_seq), device)
    mask = NDMask((num_seq, max(seq_lens)), device)
    for idx, seq_len in enumerate(seq_lens):
        if seq_starts is None:
            mask.mark_sequence_begin((0, idx))
        elif seq_starts[idx]:
        if seq_starts is None or seq_starts[idx]:
            mask.mark_sequence_begin((0, idx))
        # The second parameter specifies the rectangle of the mask that
        # is invalid. As C++ takes an NDShape, and we reverse the shape
        # in the SWIG layer, we provide it here as row-major.
        mask.invalidate_section((seq_len, idx),
                                (cntk_py.InferredDimension, 1))
                                (1, cntk_py.InferredDimension))

    # Then we pad the batch to rectangular shape
    if isinstance(batch, list):
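A hedged NumPy illustration of the layout implied by the new NDMask shape: row-major (num_seq, max_seq_len), one row per sequence, padded tails invalid. This is an analogy for intuition, not the NDMask API itself:

import numpy as np

seq_lens = [3, 1, 2]
num_seq, max_len = len(seq_lens), max(seq_lens)
mask = np.zeros((num_seq, max_len), dtype=np.int8)   # 0 marks padded/invalid slots
for idx, seq_len in enumerate(seq_lens):
    mask[idx, :seq_len] = 1                          # 1 marks valid entries
# mask == [[1, 1, 1],
#          [1, 0, 0],
#          [1, 1, 0]]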
@@ -814,6 +812,17 @@ class _ClassFromDict(dict):
def Record(**kwargs):
    return _ClassFromDict(kwargs)

# type-cast a shape given as a scalar into a tuple
def _as_tuple(x):
    return x if (isinstance(x, tuple)) else (x,)
    '''
    Convert an argument to a tuple.

    Args:
        x: if scalar, it returns ``(x,)``. If iterable, it converts it to
           tuple.

    Returns:
        Tuple of ``x``.
    '''
    if np.isscalar(x):
        x = (x,)
    return tuple(x)
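Quick checks of the reworked helper's contract, given the definition above:

assert _as_tuple(3) == (3,)        # scalar wrapped into a 1-tuple
assert _as_tuple([1, 2]) == (1, 2) # iterable converted to tuple
assert _as_tuple((4, 5)) == (4, 5) # tuple passed through unchanged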
@@ -1,23 +1,28 @@
Tutorials
===============

#. `Logistic Regression`_ with CNTK and NumPy
#. `Feed Forward Network`_ with CNTK and NumPy
#. Image 101 Feed Forward Classifier with MNIST data
#. CNTK 101: `Logistic Regression`_ with CNTK and NumPy
#. CNTK 102: `Feed Forward Network`_ with CNTK and NumPy
#. CNTK 103: Feed Forward image classifier with MNIST data

   * Part A: `MNIST Data preparation`_
   * Part A: `MNIST data preparation`_
   * Part B: `Feed Forward Classifier`_

#. Image 201 ResNet Classifier with CIFAR-10 data
#. CNTK 201: Image classifiers with CIFAR-10 data

   * Part A: `CIFAR-10 Data preparation`_
   * Part B: `ResNet Classifier`_
   * Part B: `VGG and ResNet classifiers`_

#. CNTK 202: `Language understanding`_ with ATIS3 text data

#. CNTK 203: `Reinforcement learning basics`_ with OpenAI Gym data

.. _`Logistic Regression`: https://github.com/Microsoft/CNTK/tree/v2.0.beta2.0/bindings/python/tutorials/CNTK_101_LogisticRegression.ipynb
.. _`Feed Forward Network`: https://github.com/Microsoft/CNTK/tree/v2.0.beta2.0/bindings/python/tutorials/CNTK_102_FeedForward.ipynb
.. _`MNIST Data preparation`: https://github.com/Microsoft/CNTK/tree/v2.0.beta2.0/bindings/python/tutorials/CNTK_103A_MNIST_DataLoader.ipynb
.. _`MNIST data preparation`: https://github.com/Microsoft/CNTK/tree/v2.0.beta2.0/bindings/python/tutorials/CNTK_103A_MNIST_DataLoader.ipynb
.. _`Feed Forward Classifier`: https://github.com/Microsoft/CNTK/tree/v2.0.beta2.0/bindings/python/tutorials/CNTK_103B_MNIST_FeedForwardNetwork.ipynb
.. _`CIFAR-10 Data preparation`: https://github.com/Microsoft/CNTK/tree/v2.0.beta2.0/bindings/python/tutorials/CNTK_201A_CIFAR-10_DataLoader.ipynb
.. _`ResNet Classifier`: https://github.com/Microsoft/CNTK/tree/v2.0.beta2.0/bindings/python/tutorials/CNTK_201B_CIFAR-10_ImageHandsOn.ipynb

.. _`VGG and ResNet classifiers`: https://github.com/Microsoft/CNTK/tree/v2.0.beta2.0/bindings/python/tutorials/CNTK_201B_CIFAR-10_ImageHandsOn.ipynb
.. _`Language understanding`: https://github.com/Microsoft/CNTK/blob/v2.0.beta2.0/bindings/python/tutorials/CNTK_202_Language_Understanding.ipynb
.. _`Reinforcement learning basics`: https://github.com/Microsoft/CNTK/blob/master/bindings/python/tutorials/CNTK_203_Reinforcement_Learning_Basics.ipynb
@@ -11,7 +11,7 @@ from cntk import Trainer, Axis, save_model, load_model #, text_format_minibatch_
from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs, INFINITELY_REPEAT, FULL_DATA_SWEEP
from cntk.device import cpu, set_default_device
from cntk.learner import momentum_sgd, momentum_as_time_constant_schedule
from cntk.ops import input_variable, cross_entropy_with_softmax, classification_error, sequence, slice, past_value, future_value, element_select, alias, hardmax
from cntk.ops import input_variable, cross_entropy_with_softmax, classification_error, sequence, past_value, future_value, element_select, alias, hardmax
from cntk.ops.functions import CloneMethod

abs_path = os.path.dirname(os.path.abspath(__file__))

@@ -94,7 +94,7 @@ def sequence_to_sequence_translator(debug_output=False, run_test=False):
    input_sequence = raw_input

    # Drop the sentence start token from the label, for decoder training
    label_sequence = slice(raw_labels, label_seq_axis, 1, 0) # <s> A B C </s> --> A B C </s>
    label_sequence = sequence.slice(raw_labels, 1, 0) # <s> A B C </s> --> A B C </s>
    label_sentence_start = sequence.first(raw_labels)  # <s>

    is_first_label = sequence.is_first(label_sequence)  # <s> 0 0 0 ...

@@ -239,7 +239,7 @@ def sequence_to_sequence_translator(debug_output=False, run_test=False):
    z = load_model("seq2seq.dnn")

    label_seq_axis = Axis('labelAxis')
    label_sequence = slice(find_arg_by_name('raw_labels',z), label_seq_axis, 1, 0)
    label_sequence = sequence.slice(find_arg_by_name('raw_labels',z), 1, 0)
    ce = cross_entropy_with_softmax(z, label_sequence)
    errs = classification_error(z, label_sequence)
    trainer = Trainer(z, ce, errs, [momentum_sgd(
@@ -10,11 +10,11 @@ from cntk import Trainer, Axis #, text_format_minibatch_source, StreamConfigurat
from cntk.io import MinibatchSource, CTFDeserializer, StreamDef, StreamDefs, INFINITELY_REPEAT, FULL_DATA_SWEEP
from cntk.device import cpu, set_default_device
from cntk.learner import sgd
from cntk.ops import input_variable, cross_entropy_with_softmax, classification_error
from cntk.ops import input_variable, cross_entropy_with_softmax, classification_error, sequence

abs_path = os.path.dirname(os.path.abspath(__file__))
sys.path.append(os.path.join(abs_path, "..", ".."))
from examples.common.nn import LSTMP_component_with_self_stabilization, embedding, linear_layer, select_last, print_training_progress
from examples.common.nn import LSTMP_component_with_self_stabilization, embedding, linear_layer, print_training_progress

# Creates the reader
def create_reader(path, is_training, input_dim, label_dim):

@@ -28,7 +28,7 @@ def LSTM_sequence_classifer_net(input, num_output_classes, embedding_dim, LSTM_d
    embedding_function = embedding(input, embedding_dim)
    LSTM_function = LSTMP_component_with_self_stabilization(
        embedding_function.output, LSTM_dim, cell_dim)[0]
    thought_vector = select_last(LSTM_function)
    thought_vector = sequence.last(LSTM_function)

    return linear_layer(thought_vector, num_output_classes)
@@ -114,7 +114,6 @@ if IS_WINDOWS:
        "/EHsc",
        "/DEBUG",
        "/Zi",
        "/EHsc",
    ]
    runtime_library_dirs = []
else: