Merge branch 'master' of https://github.com/Microsoft/CNTK into amitaga/cntkv2Library

This commit is contained in:
Amit Agarwal 2016-06-15 11:11:37 -07:00
Родитель 0ec5b27404 0f9e9514dd
Коммит a9f769884a
60 изменённых файлов: 1450 добавлений и 255 удалений

4
.gitignore поставляемый
Просмотреть файл

@ -190,3 +190,7 @@ Source/CNTK/buildinfo.h$$
# Unit test output
Tests/UnitTests/ReaderTests/Control/**/*_Output.txt
Tests/UnitTests/NetworkTests/Output/
Dependencies/CNTKCustomMKL/Publish
Dependencies/CNTKCustomMKL/CNTKCustomMKL-Linux-*.tgz
Dependencies/CNTKCustomMKL/CNTKCustomMKL-Windows-*.zip

Просмотреть файл

@ -29,6 +29,47 @@
</PropertyGroup>
<Choose>
<When Condition="Exists('$(ACML_PATH)')">
<PropertyGroup>
<MathLibraryName>ACML</MathLibraryName>
<MathIncludePath>$(ACML_PATH)\include</MathIncludePath>
<MathLibraryPath>$(ACML_PATH)\lib</MathLibraryPath>
<MathLinkLibrary>libacml_mp_dll.lib</MathLinkLibrary>
<MathDelayLoad>libacml_mp_dll.dll</MathDelayLoad>
<MathPostBuildCopyPattern>$(ACML_PATH)\lib\*.dll</MathPostBuildCopyPattern>
<UnitTestDlls>$(OutDir)libacml_mp_dll.dll;$(OutDir)libifcoremd.dll;$(OutDir)libifportmd.dll;$(OutDir)libiomp*.dll;$(OutDir)libmmd.dll;$(OutDir)svml_dispmd.dll;</UnitTestDlls>
<MathDefine>USE_ACML</MathDefine>
</PropertyGroup>
</When>
<!-- See https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#optional-mkl on how to configure to build CNTK with MKL -->
<When Condition="'$(CNTK_MKL)' == '1'">
<PropertyGroup>
<CNTKCustomMKLVersion>1</CNTKCustomMKLVersion>
<CNTKCustomMKLPath>$(CNTK_MKL_PATH)\$(CNTKCustomMKLVersion)</CNTKCustomMKLPath>
<MathIncludePath>$(CNTKCustomMKLPath)\include</MathIncludePath>
<MathDefine>USE_MKL</MathDefine>
</PropertyGroup>
<PropertyGroup Condition="'$(CNTK_MKL_SEQUENTIAL)' != '1'">
<MathLibraryName>CNTK custom MKL Parallel (Version: $(CNTKCustomMKLVersion))</MathLibraryName>
<MathLibraryPath>$(CNTKCustomMKLPath)\x64\parallel</MathLibraryPath>
<MathLinkLibrary>mkl_cntk_p.lib</MathLinkLibrary>
<MathDelayLoad>mkl_cntk_p.dll</MathDelayLoad>
<MathPostBuildCopyPattern>$(MathLibraryPath)\*.dll</MathPostBuildCopyPattern>
<UnitTestDlls>$(OutDir)mkl_cntk_p.dll;$(OutDir)libiomp5md.dll;</UnitTestDlls>
</PropertyGroup>
<PropertyGroup Condition="'$(CNTK_MKL_SEQUENTIAL)' == '1'">
<MathLibraryName>CNTK custom MKL Sequential (Version: $(CNTKCustomMKLVersion))</MathLibraryName>
<MathLibraryPath>$(CNTKCustomMKLPath)\x64\sequential</MathLibraryPath>
<MathLinkLibrary>mkl_cntk_s.lib</MathLinkLibrary>
<MathDelayLoad>mkl_cntk_s.dll</MathDelayLoad>
<MathPostBuildCopyPattern>$(MathLibraryPath)\*.dll</MathPostBuildCopyPattern>
<UnitTestDlls>$(OutDir)mkl_cntk_s.dll;</UnitTestDlls>
</PropertyGroup>
</When>
</Choose>
<PropertyGroup Condition="'$(CudaVersion)' == '7.5'">
<CudaPath>$(CUDA_PATH_V7_5)</CudaPath>
<CudaRuntimeDll>cudart64_75.dll</CudaRuntimeDll>

8
Dependencies/CNTKCustomMKL/README-for-redistributable.txt поставляемый Normal file
Просмотреть файл

@ -0,0 +1,8 @@
This archive contains header files as well as redistributable components of
the Intel (r) Math Kernel Library (Intel (r) MKL).
Included is also a custom library created out of Intel (r) Math Kernel Library
(Intel (r) MKL) required for building the Microsoft Computational Network
Toolkit (CNTK). More details on CNTK can be found here: http://www.cntk.ai
Please see LICENSE.md for full license information.

23
Dependencies/CNTKCustomMKL/README.md поставляемый Normal file
Просмотреть файл

@ -0,0 +1,23 @@
# CNTK custom MKL
This directory contains the necessary files to create a custom Intel® Math Kernel Library (MKL)
for usage by CNTK ("CNTK custom MKL" for short).
By default, a CNTK binary with Intel® MKL support includes a prebuilt CNTK
custom MKL.
If you want to build CNTK with Intel® MKL support yourself, you can install a
prebuilt CNTK custom MKL, available for download from the [CNTK web site](https://www.cntk.ai/mkl).
See [CNTK's setup instructions](https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-your-machine)
for more details.
If you want to add new Intel® MKL functions to be used by CNTK you will have to
build your own CNTK custom MKL.
This requires you to install the [Intel MKL SDK](https://software.intel.com/en-us/intel-mkl/) for your platform.
Then, in this directory,
* extend the file `headers.txt` to expose new headers,
* extend the file `functions.txt` to expose new functions, and
* use `build-linux.sh` or `build-windows.cmd` to build for your platform.
For further documentation please see the Developer Guide for the Intel® MKL, in particular
[Building Custom Shared Objects (Linux)](https://software.intel.com/en-us/node/528533) and
[Building Custom Dynamic-link Libraries (Windows)](https://software.intel.com/en-us/node/528362).

46
Dependencies/CNTKCustomMKL/build-linux.sh поставляемый Executable file
Просмотреть файл

@ -0,0 +1,46 @@
#!/bin/bash
#
# Copyright (c) Microsoft. All rights reserved.
#
# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================
#
# Builds the CNTK custom MKL redistributable for Linux.
# Produces Publish/<version>/{x64/{parallel,sequential},include,...} and the
# release tarball CNTKCustomMKL-Linux-<version>.tgz in this directory.
#
# Requires the Intel MKL SDK; its location can be overridden through the
# MKLROOT environment variable.

# Stop on error (including failures inside pipelines), error on unset
# variables, trace commands.
set -euo pipefail
set -x

# Enter directory the script is located in
cd "$( dirname "${BASH_SOURCE[0]}" )"

# MKL install root: honor a caller-provided MKLROOT, else use the default
# SDK install location.
MKLROOT="${MKLROOT:-/opt/intel/compilers_and_libraries_2016.2.181/linux/mkl}"
MKLBUILDERROOT="$MKLROOT/tools/builder"

# Version of the CNTK custom MKL being produced (single line in version.txt)
CNTKCUSTOMMKLVERSION=$(cat version.txt)

# Start from a clean publish tree
rm -rf Publish
mkdir -p "Publish/$CNTKCUSTOMMKLVERSION/x64"

for THREADING in parallel sequential
do
    # Library name carries the first letter of the threading model (p/s)
    LIBBASENAME="libmkl_cntk_${THREADING:0:1}"
    make -f "$MKLBUILDERROOT/makefile" libintel64 \
        export=functions.txt \
        threading="$THREADING" \
        name="$LIBBASENAME" \
        MKLROOT="$MKLROOT"
    mkdir "Publish/$CNTKCUSTOMMKLVERSION/x64/$THREADING"
    mv "$LIBBASENAME.so" "Publish/$CNTKCUSTOMMKLVERSION/x64/$THREADING"
done

# The parallel flavor additionally needs the Intel OpenMP runtime
cp -p "$MKLROOT/../compiler/lib/intel64_lin/libiomp5.so" "Publish/$CNTKCUSTOMMKLVERSION/x64/parallel"

# Publish only the headers whitelisted in headers.txt
rsync -av --files-from=headers.txt "$MKLROOT/include" "Publish/$CNTKCUSTOMMKLVERSION/include"

cp -p README-for-redistributable.txt "Publish/$CNTKCUSTOMMKLVERSION/README.txt"
cp -p ../../LICENSE.md "Publish/$CNTKCUSTOMMKLVERSION"

# Package the versioned directory into the release tarball
cd Publish
tar -czf "../CNTKCustomMKL-Linux-$CNTKCUSTOMMKLVERSION.tgz" "$CNTKCUSTOMMKLVERSION"
cd ..

156
Dependencies/CNTKCustomMKL/build-windows.cmd поставляемый Normal file
Просмотреть файл

@ -0,0 +1,156 @@
@echo off
REM
REM Copyright (c) Microsoft. All rights reserved.
REM
REM Licensed under the MIT license. See LICENSE.md file in the project root
REM for full license information.
REM ==============================================================================
REM
REM Builds the CNTK custom MKL redistributable for Windows. Produces
REM Publish\<version>\x64\{parallel,sequential} DLL/import-library pairs,
REM the whitelisted include files, README and license.
echo.
echo This batch file will build a custom MKL dynamic link library for usage by CNTK.
echo.
echo Requirements:
echo - Intel MKL SDK installed on the machine
echo - MKLROOT environment variable is set to the MKL directory inside the Intel MKL SDK
echo - Visual Studio 2013 installed and included in the path
echo.
REM Delayed expansion (!var!) is needed for variables assigned inside the
REM for-loop body below.
setlocal enableextensions enabledelayedexpansion
REM Run relative to the directory this script lives in.
pushd "%~dp0"
if errorlevel 1 (
echo Could not change directory to script location.
exit /b 1
)
REM --- Validate prerequisites: MKL SDK location and build tools on PATH ---
if not defined MKLROOT (
echo Error: Environment variable MKLROOT is undefined.
exit /b 1
)
if not exist "%MKLROOT%" (
echo Error: Directory doesn't exist: "%MKLROOT%".
exit /b 1
)
set MKLBUILDERROOT=%MKLROOT%\tools\builder
if not exist "%MKLBUILDERROOT%" (
echo Error: Directory doesn't exist: "%MKLBUILDERROOT%".
exit /b 1
)
where /q nmake.exe
if errorlevel 1 (
echo Error: NMAKE.EXE not in path.
exit /b 1
)
where /q link.exe
if errorlevel 1 (
echo Error: LINK.EXE not in path.
exit /b 1
)
REM Version of the CNTK custom MKL being produced (first line of version.txt)
set /p CNTKCUSTOMMKLVERSION=<version.txt
if not defined CNTKCUSTOMMKLVERSION (
echo Cannot determine CNTK custom MKL version.
exit /b 1
)
REM --- Start from a clean working (lib) and output (Publish) tree ---
if exist lib rmdir /s /q lib
if errorlevel 1 exit /b 1
if exist Publish rmdir /s /q Publish
if errorlevel 1 exit /b 1
mkdir Publish\%CNTKCUSTOMMKLVERSION%\x64
echo.
echo Copying "%MKLBUILDERROOT%\lib".
REM The MKL custom-DLL builder expects its lib directory next to the makefile
REM invocation; copy it into the working tree.
xcopy /s /e /y /i "%MKLBUILDERROOT%\lib" lib
if errorlevel 1 (
exit /b 1
)
echo.
echo Compiling and copying libraries.
REM Build one custom DLL per threading model; the library name carries the
REM first letter of the model (mkl_cntk_p / mkl_cntk_s).
for %%t in (
parallel
sequential
) do (
set TFIRSTCHAR=%%t
set TFIRSTCHAR=!TFIRSTCHAR:~0,1!
set LIBBASENAME=mkl_cntk_!TFIRSTCHAR!
echo.
echo Calling NMAKE libintel64 export=functions.txt threading=%%t name=!LIBBASENAME! MKLROOT="%MKLROOT%".
NMAKE /f "%MKLBUILDERROOT%\makefile" ^
libintel64 ^
export=functions.txt ^
threading=%%t ^
name=!LIBBASENAME! ^
MKLROOT="%MKLROOT%"
if errorlevel 1 (
echo Error: NMAKE.exe for threading=%%t failed.
exit /b 1
)
mkdir Publish\%CNTKCUSTOMMKLVERSION%\x64\%%t
if errorlevel 1 exit /b 1
move !LIBBASENAME!.dll Publish\%CNTKCUSTOMMKLVERSION%\x64\%%t
if errorlevel 1 exit /b 1
move !LIBBASENAME!.lib Publish\%CNTKCUSTOMMKLVERSION%\x64\%%t
if errorlevel 1 exit /b 1
REM Remove remaining build byproducts (e.g. .exp) for this library.
del !LIBBASENAME!*
if errorlevel 1 exit /b 1
@REM TODO manifest?
)
echo.
echo Copying libiomp5md.dll.
REM Only the parallel flavor needs the Intel OpenMP runtime.
copy "%MKLROOT%\..\redist\intel64_win\compiler\libiomp5md.dll" Publish\%CNTKCUSTOMMKLVERSION%\x64\parallel
if errorlevel 1 (
exit /b 1
)
echo.
echo Removing LIB directory.
rmdir /s /q lib
if errorlevel 1 exit /b 1
echo.
echo Copying include files to Publish\%CNTKCUSTOMMKLVERSION%\include.
REM Publish only the headers whitelisted in headers.txt.
mkdir Publish\%CNTKCUSTOMMKLVERSION%\include
for /f %%h in (headers.txt) do (
copy "%MKLROOT%\include\%%h" Publish\%CNTKCUSTOMMKLVERSION%\include
if errorlevel 1 (
echo Failed to copy "%MKLROOT%\include\%%h".
exit /b 1
)
)
copy README-for-redistributable.txt Publish\%CNTKCUSTOMMKLVERSION%\README.txt
if errorlevel 1 (
echo Failed to copy README.
exit /b 1
)
copy ..\..\LICENSE.md Publish\%CNTKCUSTOMMKLVERSION%
if errorlevel 1 (
echo Failed to copy LICENSE.md.
exit /b 1
)
popd

17
Dependencies/CNTKCustomMKL/functions.txt поставляемый Normal file
Просмотреть файл

@ -0,0 +1,17 @@
cblas_dgemm
cblas_dasum
cblas_daxpy
cblas_dcopy
cblas_ddot
cblas_dnrm2
cblas_dscal
cblas_sasum
cblas_saxpy
cblas_scopy
cblas_sgemm
cblas_sscal
cblas_sdot
cblas_snrm2
dgesvd
sgesvd
MKL_Set_Num_Threads

31
Dependencies/CNTKCustomMKL/headers.txt поставляемый Normal file
Просмотреть файл

@ -0,0 +1,31 @@
mkl_blas.h
mkl_cblas.h
mkl_df_defines.h
mkl_df_functions.h
mkl_df_types.h
mkl_df.h
mkl_dfti.h
mkl_direct_call.h
mkl_dss.h
mkl_lapack.h
mkl_lapacke.h
mkl_pardiso.h
mkl_poisson.h
mkl_rci.h
mkl_service.h
mkl_solvers_ee.h
mkl_sparse_handle.h
mkl_spblas.h
mkl_trans.h
mkl_trig_transforms.h
mkl_types.h
mkl_version.h
mkl_vml_defines.h
mkl_vml_functions.h
mkl_vml_types.h
mkl_vml.h
mkl_vsl_defines.h
mkl_vsl_functions.h
mkl_vsl_types.h
mkl_vsl.h
mkl.h

1
Dependencies/CNTKCustomMKL/version.txt поставляемый Normal file
Просмотреть файл

@ -0,0 +1 @@
1

Просмотреть файл

@ -106,7 +106,7 @@ train = [
test = [
action = "test"
minibatchSize = 1024
minibatchSize = 1024 # reduce this if you run out of memory
evalNodeNames = ce:errs:top5Errs

Просмотреть файл

@ -0,0 +1,101 @@
# Parameters can be overwritten on the command line
# for example: cntk configFile=myConfigFile RootDir=../..
# For running from Visual Studio add
# currentDirectory=$(SolutionDir)/<path to corresponding data folder>
RootDir = ".."
ConfigDir = "$RootDir$/Config"
DataDir = "$RootDir$/Data"
OutputDir = "$RootDir$/Output"
ModelDir = "$OutputDir$/Models"
deviceId = 0
imageLayout = "cudnn"
# Override the above as follows when running on CPU:
# deviceId = -1
command = train:test
precision = "float"
modelPath = "$ModelDir$/04_DeConv"
ndlMacros = "$ConfigDir$/Macros.ndl"
# uncomment the following line to write logs to a file
# stderr = "$OutputDir$/04_DeConv_out"
traceLevel=1
numMBsToShowResult=500
prefetch=true
# If set to true, always initialize the network on CPU, making initialization consistent across CPU and GPU targets (for testing).
initOnCPUOnly=true
#######################################
# TRAINING CONFIG #
#######################################
train = [
action = "train"
NDLNetworkBuilder = [
networkDescription = "$ConfigDir$/04_DeConv.ndl"
]
SGD = [
epochSize = 60000
minibatchSize = 32
learningRatesPerMB = 0.001
momentumPerMB = 0.9
maxEpochs = 10
]
# Note: this reader crashes if randomization is turned on.
reader = [
readerType = "UCIFastReader"
# To get the data (Train-28x28.txt) please run `python mnist_convert.py`
# from the 'AdditionalFiles' folder. See README.md for details.
file = "$DataDir$/Train-28x28.txt"
features = [
dim = 784
start = 1
]
labels = [
dim = 1
start = 0
labelDim = 10
labelMappingFile = "$DataDir$/labelsmap.txt"
]
]
]
#######################################
# TEST CONFIG #
#######################################
test = [
action = test
minibatchSize = 16
NDLNetworkBuilder = [
networkDescription = "$ConfigDir$/04_DeConv.ndl"
]
reader = [
readerType = "UCIFastReader"
file = "$DataDir$/Test-28x28.txt"
features = [
dim = 784
start = 1
]
labels = [
dim = 1
start = 0
labelDim = 10
labelMappingFile = "$DataDir$/labelsmap.txt"
]
]
]

Просмотреть файл

@ -0,0 +1,60 @@
# macros to include
load = ndlMnistMacros
# the actual NDL that defines the network
run = DNN
ndlMnistMacros = [
imageW = 28
imageH = 28
imageC = 1
labelDim = 10
features = ImageInput(imageW, imageH, imageC, imageLayout=$imageLayout$)
featScale = Constant(0.00390625)
featScaled = Scale(featScale, features)
labels = InputValue(labelDim)
]
DNN=[
# conv1
kW1 = 5
kH1 = 5
cMap1 = 16
hStride1 = 2
vStride1 = 2
wScale1 = 0.1
bValue1 = 0
# weight[cMap1, kW1 * kH1 * inputChannels]
# Conv2DReLULayer is defined in Macros.ndl
conv1 = Conv2DReLULayer(featScaled, cMap1, 25, kW1, kH1, hStride1, vStride1, wScale1, bValue1)
# pool1
pool1W = 2
pool1H = 2
pool1hStride = 2
pool1vStride = 2
# MaxPooling is a standard NDL node.
pool1 = MaxPooling(conv1, pool1W, pool1H, pool1hStride, pool1vStride, imageLayout=$imageLayout$)
#unpool1
unpool1 = MaxUnpool(pool1, conv1, pool1W, pool1H, pool1hStride, pool1vStride)
# deconv1
lpad1 = 2
upad1 = 1
# weight[cMap1, kW1 * kH1 * inputChannels]
# DeconvReLULayer is defined in Macros.ndl
deconv1 = DeconvReLULayer(unpool1, kW1, kH1, imageC, 25, cMap1, hStride1, vStride1, lpad1, upad1, wScale1, bValue1)
mse = SquareError(featScaled, deconv1)
#err = ErrorPrediction(labels, ol)
# Special Nodes
FeatureNodes = (features)
#LabelNodes = (labels)
CriterionNodes = (mse)
#EvalNodes = (err)
#OutputNodes = (deconv1)
]

Просмотреть файл

@ -48,6 +48,10 @@ ConvND(w, inp, kW, kH, inMap, outMap, hStride, vStride) = [
c = Convolution(w, inp, {kW, kH, inMap}, mapCount=outMap, stride={hStride, vStride, inMap}, sharing={true, true, true}, autoPadding={true, true, false}, lowerPad=0, upperPad=0, imageLayout=$imageLayout$)
]
DeConv(w, inp, kW, kH, inMap, outMap, hStride, vStride, lpad, upad) = [
c = Convolution(w, inp, {kW, kH, inMap}, mapCount=outMap, stride={hStride, vStride, inMap}, sharing={true, true, true}, autoPadding=false, lowerPad={lpad, lpad, 0}, upperPad={upad, upad, 0}, transpose=1, imageLayout=$imageLayout$)
]
Conv2DReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue) = [
w = ConvW(outMap, inWCount, wScale)
b = ConvB(outMap, bValue)
@ -84,6 +88,17 @@ ConvBNReLULayer(inp, outMap, inWCount, kW, kH, hStride, vStride, wScale, bValue,
y = RectifiedLinear(c)
]
DeconvReLULayer(inp, kW, kH, inMap, inWCount, outMap, hStride, vStride, lpad, upad, wScale, bValue) = [
# No bias here.
w = ConvW(outMap, inWCount, wScale)
act = RectifiedLinear(inp)
out = DeConv(w, act, kW, kH, inMap, outMap, hStride, vStride, lpad, upad)
]
MaxNDPooling(inp, kW, kH, hStride, vStride) = [
p = Pooling(inp, "max", {kW, kH, 1}, stride={hStride, vStride, 1}, autoPadding={true, true, false}, lowerPad=0, upperPad=0, imageLayout=$imageLayout$)
]
p = Pooling(inp, "max", {kW, kH, 1}, stride={hStride, vStride, 1}, autoPadding={true, true, false}, lowerPad=0, upperPad=0, imageLayout=$imageLayout$)
]
MaxUnpool(inp, poolInp, kW, kH, hStride, vStride) = [
up = MaxUnpooling(inp, poolInp, {kW, kH, 1}, stride={hStride, vStride, 1}, autoPadding={false, false, false}, lowerPad=0, upperPad=0, imageLayout=$imageLayout$)
]

Просмотреть файл

@ -0,0 +1,69 @@
'
</s>
<s/>
<s>
A
B
C
D
E
F
G
H
I
J
K
L
M
N
O
P
Q
R
S
T
U
V
W
X
Y
Z
~AA
~AE
~AH
~AO
~AW
~AY
~B
~CH
~D
~DH
~EH
~ER
~EY
~F
~G
~HH
~IH
~IY
~JH
~K
~L
~M
~N
~NG
~OW
~OY
~P
~R
~S
~SH
~T
~TH
~UH
~UW
~V
~W
~Y
~Z
~ZH

Просмотреть файл

@ -12,7 +12,11 @@ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLI
This project is based on or incorporates material from the projects listed below (Third Party IP). The original copyright notice and the license under which Microsoft received such Third Party IP, are set forth below. Such licenses and notices are provided for informational purposes only. Where permitted, Microsoft licenses the Third Party IP to you under the licensing terms for the Microsoft product. Microsoft reserves all other rights not expressly granted under this agreement, whether by implication, estoppel or otherwise.
### a. BOOST C++ LIBRARIES
### a. INTEL (R) MATH KERNEL LIBRARY (INTEL (R) MKL)
CNTK distribution contains Redistributable components of Intel (r) Math Kernel Library (Intel (r) MKL)
### b. BOOST C++ LIBRARIES
Copyright Beman Dawes, David Abrahams, 1998-2005.
Copyright Rene Rivera 2004-2007.
@ -27,7 +31,7 @@ The copyright notices in the Software and this entire statement, including the a
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
### b. ATIS DATASETS
### c. ATIS DATASETS
CNTK distribution contains a subset of ATIS Datasets:
@ -39,23 +43,23 @@ Dahl, Deborah, et al. ATIS3 Test Data LDC95S26. Web Download. Philadelphia: Ling
Dahl, Deborah, et al. ATIS3 Training Data LDC94S19. Web Download. Philadelphia: Linguistic Data Consortium, 1994.
### c. TIMIT ACOUSTIC-PHONETIC CONTINUOUS SPEECH CORPUS
### d. TIMIT ACOUSTIC-PHONETIC CONTINUOUS SPEECH CORPUS
CNTK distribution contains a subset of TIMIT Acoustic-Phonetic Continuous Speech Corpus:
Garofolo, John, et al. TIMIT Acoustic-Phonetic Continuous Speech Corpus LDC93S1. Web Download. Philadelphia: Linguistic Data Consortium, 1993.
### d. THE PENN TREEBANK PROJECT
### e. THE PENN TREEBANK PROJECT
CNTK distribution contains a subset of the data of The Penn Treebank Project:
Marcus, Mitchell, Beatrice Santorini, and Mary Ann Marcinkiewicz. Treebank-2 LDC95T7. Web Download. Philadelphia: Linguistic Data Consortium, 1995.
### e. THE CMU AUDIO DATABASES
### f. THE CMU AUDIO DATABASES
CNTK distribution contains a subset of the CMU Audio Databases
Copyright (c) 1991-2005 Carnegie Mellon University. All rights reserved.
### f. THE MNIST DATABASE OF HANDWRITTEN DIGITS
### g. THE MNIST DATABASE OF HANDWRITTEN DIGITS
CNTK distribution contains a subset of the MNIST Database of Handwritten Digits
CNTK distribution contains a subset of the MNIST Database of Handwritten Digits

Просмотреть файл

@ -11,7 +11,11 @@
# defaults to release
# ACML_PATH= path to ACML library installation
# only needed if MATHLIB=acml
# MKL_PATH= path to MKL library installation
# MKL_PATH= path to CNTK custom MKL installation
# only needed if MATHLIB=mkl
# CNTK_CUSTOM_MKL_VERSION=2
# version for the CNTK custom MKL installation
# MKL_THREADING=parallel|sequential
# only needed if MATHLIB=mkl
# GDK_PATH= path to cuda gdk installation, so $(GDK_PATH)/include/nvidia/gdk/nvml.h exists
# defaults to /usr
@ -131,9 +135,15 @@ ifeq ("$(MATHLIB)","acml")
endif
ifeq ("$(MATHLIB)","mkl")
INCLUDEPATH += $(MKL_PATH)/mkl/include
LIBPATH += $(MKL_PATH)/compiler/lib/intel64 $(MKL_PATH)/mkl/lib/intel64 $(MKL_PATH)/compiler/lib/mic $(MKL_PATH)/mkl/lib/mic
LIBS += -lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -lm -liomp5 -lpthread
INCLUDEPATH += $(MKL_PATH)/$(CNTK_CUSTOM_MKL_VERSION)/include
LIBS += -lm
ifeq ("$(MKL_THREADING)","sequential")
LIBPATH += $(MKL_PATH)/$(CNTK_CUSTOM_MKL_VERSION)/x64/sequential
LIBS += -lmkl_cntk_s
else
LIBPATH += $(MKL_PATH)/$(CNTK_CUSTOM_MKL_VERSION)/x64/parallel
LIBS += -lmkl_cntk_p -liomp5 -lpthread
endif
COMMON_FLAGS += -DUSE_MKL
endif
@ -168,7 +178,7 @@ GENCODE_SM50 := -gencode arch=compute_50,code=\"sm_50,compute_50\"
# Use GCOV_PREFIX and GCOV_PREFIX_STRIP if relocating:
# For example, if the object file /user/build/foo.o was built with -fprofile-arcs, the final executable will try to create the data file
# /user/build/foo.gcda when running on the target system. This will fail if the corresponding directory does not exist and it is unable
# to create it. This can be overcome by, for example, setting the environment as GCOV_PREFIX=/target/run and GCOV_PREFIX_STRIP=1.
# to create it. This can be overcome by, for example, setting the environment as 'GCOV_PREFIX=/target/run' and 'GCOV_PREFIX_STRIP=1'.
# Such a setting will name the data file /target/run/build/foo.gcda
ifdef CNTK_CODE_COVERAGE
CXXFLAGS += -fprofile-arcs -ftest-coverage

Просмотреть файл

@ -1,6 +1,9 @@
# CNTK
## Latest news
*2016-06-15.* CNTK now supports building against a custom Intel® Math Kernel Library (MKL).
See [setup instructions](https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-your-machine) on how to set this up for your platform.
*2016-06-10.* See CNTK v.1.5 binary release announcement in the official [Microsoft Research Blog](https://blogs.msdn.microsoft.com/msr_er/2016/06/10/microsoft-improves-programming-flexibility-of-its-ai-toolkit/)
*2016-06-08.* V 1.5 Binary release
@ -10,8 +13,6 @@ CNTK v.1.5 binaries are on the [CNTK Releases page](https://github.com/Microsoft
*2016-05-19.* A 1-hour talk describing CNTK, how to use it, and how it works, has been posted at [Presentations](https://github.com/Microsoft/CNTK/wiki/Presentations).
*2016-05-16.* An example illustrating [Using CNTK with ResNet](https://github.com/Microsoft/CNTK/tree/master/Examples/Image/Miscellaneous/ImageNet/ResNet) is added to the codebase. The example contains some pre-trained models that can be used in various applications.
See [all news](https://github.com/Microsoft/CNTK/wiki/News).
## What is CNTK

Просмотреть файл

@ -38,6 +38,7 @@
<ItemDefinitionGroup>
<ClCompile>
<AdditionalIncludeDirectories>$(SolutionDir)Source\SequenceTrainingLib;$(SolutionDir)Source\SGDLib;$(SolutionDir)Source\ComputationNetworkLib;$(SolutionDir)Source\CNTK;$(SolutionDir)Source\Math;$(SolutionDir)Source\Common\Include;$(SolutionDir)Source\CNTK\BrainScript;$(MSMPI_INC);$(NvmlInclude)</AdditionalIncludeDirectories>
<DisableSpecificWarnings>4819</DisableSpecificWarnings>
</ClCompile>
<Link>
<AdditionalLibraryDirectories>$(MSMPI_LIB64);$(OutDir);$(NvmlLibPath)</AdditionalLibraryDirectories>

Просмотреть файл

@ -292,24 +292,33 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
nodePtr = builder.FutureValue(NULL, defaultHiddenActivity, rows, timeStep, name);
}
}
else if (cnNodeType == OperationNameOf(ConvolutionNode) || cnNodeType == OperationNameOf(PoolingNode))
else if (cnNodeType == OperationNameOf(ConvolutionNode) ||
cnNodeType == OperationNameOf(PoolingNode) ||
cnNodeType == OperationNameOf(MaxUnpoolingNode))
{
if (parameter.size() != 3 && parameter.size() != 7)
if (parameter.size() != 2 && parameter.size() != 3 && parameter.size() != 7)
{
if (cnNodeType == OperationNameOf(ConvolutionNode))
{
RuntimeError("%ls: unexpected parameter count. %ls supports 2 modes: \n"
"1. 2D convolution which takes 7 fixed parameters [weightNodeName, inputValueNodeName, kernelWidth, kernelHeight, outputChannels,horizontalSubsample, verticalSubsample] \n"
"1. 2D convolution which takes 7 fixed parameters [weightNodeName, inputValueNodeName, kernelWidth, kernelHeight, outputChannels, horizontalSubsample, verticalSubsample] \n"
"and two optional parameters [zeroPadding = [false|yourvalue], maxTempMemSizeInSamples = [0|yourvalue], imageLayout = \"HWC\"|\"cudnn\"]. \n"
"2. ND convolution which takes 3 fixed parameters [weightNodeName, inputValueNodeName, kernelShape] and \n"
"9 optional parameters [mapCount = [1|yourvalue], stride = [1|yourvalue], sharing = [true|yourvalue], autoPadding = [true|yourvalue], lowerPad = [0|yourvalue], upperPad = [0|yourvalue], maxTempMemSizeInSamples = [0|yourvalue], imageLayout = \"cudnn\"|\"HWC\"]. \n"
"10 optional parameters [mapCount = [1|yourvalue], stride = [1|yourvalue], sharing = [true|yourvalue], autoPadding = [true|yourvalue], lowerPad = [0|yourvalue], upperPad = [0|yourvalue], bool transpose = [false|yourvalue], maxTempMemSizeInSamples = [0|yourvalue], imageLayout = \"cudnn\"|\"HWC\"]. \n"
"For ND convolution, parameters kernelShape, mapCount, stride, sharing, autoPadding, lowerPad, upperPad can be arrays, e.g. kernelShape={5, 5, 3}",
cnNodeType.c_str(), cnNodeType.c_str());
}
else
else if (cnNodeType == OperationNameOf(PoolingNode))
{
RuntimeError("%ls: unexpected parameter count. %ls 3 fixed parameters [inputValueNodeName, poolKind, kernelShape] and \n"
"5 optional parameters stride = [1|yourvalue], autoPadding = [true|yourvalue], lowerPad = [0|yourvalue], upperPad = [0|yourvalue], imageLayout = \"cudnn\"|\"HWC\"]. \n"
"5 optional parameters stride = [1|yourvalue], autoPadding = [true|yourvalue], lowerPad = [0|yourvalue], upperPad = [0|yourvalue], imageLayout = \"cudnn\"]. \n"
"Parameters kernelShape, stride, autoPadding, lowerPad, upperPad can be arrays, e.g. kernelShape={5, 5, 3}",
cnNodeType.c_str(), cnNodeType.c_str());
}
else if (cnNodeType == OperationNameOf(MaxUnpoolingNode))
{
RuntimeError("%ls: unexpected parameter count. %ls 3 fixed parameters [inputValueNodeName, mask, kernelShape] and \n"
"5 optional parameters stride = [1|yourvalue], autoPadding = [true|yourvalue], lowerPad = [0|yourvalue], upperPad = [0|yourvalue], imageLayout = \"cudnn\"]. \n"
"Parameters kernelShape, stride, autoPadding, lowerPad, upperPad can be arrays, e.g. kernelShape={5, 5, 3}",
cnNodeType.c_str(), cnNodeType.c_str());
}
@ -317,11 +326,13 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
// setup the parameter position of children so we can hook them up later
nodeParamStart = 0;
nodeParamCount = cnNodeType == OperationNameOf(ConvolutionNode) ? 2 : 1;
nodeParamCount = (cnNodeType == OperationNameOf(ConvolutionNode) || cnNodeType == OperationNameOf(MaxUnpoolingNode))
? 2
: 1;
if (pass == ndlPassInitial)
{
if (parameter.size() == 3)
if (parameter.size() == 2 || parameter.size() == 3)
{
auto reqParams = node->GetParameters(false);
auto optParams = node->GetParameters(true);
@ -378,21 +389,19 @@ void NDLNodeEvaluatorImpl<ElemType>::Evaluate(NDLNode<ElemType>* node, const wst
ImageLayoutKind imageLayout = ImageLayoutKindFrom(node->GetOptionalParameter("imageLayout", "CHW"));
size_t maxTempMemSizeInSamples = node->GetOptionalParameter("maxTempMemSizeInSamples", "0");
auto pool = PoolKind::None;
if (cnNodeType == OperationNameOf(PoolingNode))
if (cnNodeType == OperationNameOf(MaxUnpoolingNode))
nodePtr = builder.MaxUnpooling(NULL, NULL, kernelShape, stride, autoPad, lowerPad, upperPad, imageLayout, name);
else if (cnNodeType == OperationNameOf(PoolingNode))
{
auto parm = node->GetParentScript()->ParseVariable(reqParams[1]->GetValue(), false);
pool = PoolKindFrom(wstring(parm->GetValue()));
}
if (pool == PoolKind::None)
{
nodePtr = builder.Convolution(NULL, NULL, kernelShape, mapCount, stride, sharing,
autoPad, lowerPad, upperPad, imageLayout, maxTempMemSizeInSamples, name);
auto pool = PoolKindFrom(wstring(parm->GetValue()));
nodePtr = builder.Pooling(NULL, pool, kernelShape, stride, autoPad, lowerPad, upperPad, imageLayout, name);
}
else
{
nodePtr = builder.Pooling(NULL, pool, kernelShape, stride, autoPad, lowerPad, upperPad, imageLayout, name);
bool transpose = node->GetOptionalParameter("transpose", "false");
nodePtr = builder.Convolution(NULL, NULL, kernelShape, mapCount, stride, sharing,
autoPad, lowerPad, upperPad, transpose, imageLayout, maxTempMemSizeInSamples, name);
}
}

Просмотреть файл

@ -197,6 +197,7 @@ bool CheckFunction(std::string& p_nodeType, bool* allowUndeterminedVariable)
else if (EqualInsensitive(nodeType, OperationNameOf(MatrixL1RegNode), L"L1Reg")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(MatrixL2RegNode), L"L2Reg")) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(MaxPoolingNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(MaxUnpoolingNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(MeanNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(MinusNode))) ret = true;
else if (EqualInsensitive(nodeType, OperationNameOf(NegateNode))) ret = true;

Просмотреть файл

@ -216,8 +216,11 @@ WeightedLogistic(label, probability, instanceWeight, tag='') = new ComputationNo
ReconcileDynamicAxis(dataInput, layoutInput, tag='') = new ComputationNode [ operation = 'ReconcileDynamicAxis' ; inputs = (dataInput : layoutInput) /*plus the function args*/ ]
ReconcileMBLayout = ReconcileDynamicAxis # back compat
CastAs (type, data) = ReconcileDynamicAxis (data, type) # read as CastAs<type>(data) where the cast may consist of rearranging the data w.r.t. MBLayout or broadcasting across sequence items
Convolution(weightNode, inputValueNode, kernelDims, mapDims = 1, stride = 1, sharing = true, autoPadding = true, lowerPad = 0, upperPad = 0, imageLayout='CHW', maxTempMemSizeInSamples = 0, tag='') = new ComputationNode [ operation = 'Convolution' ; inputs = (weightNode : inputValueNode); kernelShape = new TensorShape [ dims = kernelDims ] ; mapCount = new TensorShape [ dims = mapDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimSharing = new BoolVector [ items = sharing ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
Convolution(weightNode, inputValueNode, kernelDims, mapDims = 1, stride = 1, sharing = true, autoPadding = true, lowerPad = 0, upperPad = 0, transpose=false, imageLayout='CHW', maxTempMemSizeInSamples = 0, tag='') = new ComputationNode [ operation = 'Convolution' ; inputs = (weightNode : inputValueNode); kernelShape = new TensorShape [ dims = kernelDims ] ; mapCount = new TensorShape [ dims = mapDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimSharing = new BoolVector [ items = sharing ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
# ND pooling/unpooling
Pooling(input, poolKind/*'max'|'average'*/, kernelDims, stride=1, autoPadding = true, lowerPad = 0, upperPad = 0, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'Pooling' ; inputs = (input); pool = poolKind ; kernelShape = new TensorShape [ dims = kernelDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
MaxUnpooling(unpoolInput, poolInput, kernelDims, stride=1, autoPadding = true, lowerPad = 0, upperPad = 0, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'MaxUnpooling' ; inputs = (unpoolInput : poolInput); kernelShape = new TensorShape [ dims = kernelDims ] ; strideShape = new TensorShape [ dims = stride ] ; dimPadding = new BoolVector [ items = autoPadding ] ; dimPadLower = new TensorShape [ dims = lowerPad ] ; dimPadUpper = new TensorShape [ dims = upperPad ] /*plus the function args*/ ]
# 2D pooling
MaxPooling(input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'MaxPooling' ; inputs = input /*plus the function args*/ ]
AveragePooling(input, windowWidth, windowHeight, horizontalSubsample, verticalSubsample, imageLayout='CHW', tag='') = new ComputationNode [ operation = 'AveragePooling' ; inputs = input /*plus the function args*/ ]
ColumnwiseCrossProduct = KhatriRaoProduct // deprecated

Просмотреть файл

@ -55,6 +55,7 @@
<ItemDefinitionGroup>
<ClCompile>
<AdditionalIncludeDirectories>$(SolutionDir)Source\Readers\ReaderLib;$(SolutionDir)Source\ActionsLib;$(SolutionDir)Source\SequenceTrainingLib;$(SolutionDir)Source\SGDLib;$(SolutionDir)Source\ComputationNetworkLib;$(SolutionDir)Source\Math;$(SolutionDir)Source\Common\Include;$(SolutionDir)Source\CNTK\BrainScript;$(MSMPI_INC);$(NvmlInclude)</AdditionalIncludeDirectories>
<DisableSpecificWarnings>4819</DisableSpecificWarnings>
</ClCompile>
<Link>
<AdditionalLibraryDirectories>$(MSMPI_LIB64);$(OutDir);$(NvmlLibPath)</AdditionalLibraryDirectories>

Просмотреть файл

@ -1,6 +1,12 @@
@echo off
setlocal enableDelayedexpansion
::: Copyright (c) Microsoft. All rights reserved.
:::
::: Licensed under the MIT license. See LICENSE.md file in the project root
::: for full license information.
::: ==============================================================================
:::
::: This is called as a pre-build step for the CNTK executable.
::: It receives the build's configuration, $(Configuration), as first paramter.
::: It creates buildinfo.h, which makes version information available to the executable itself.
@ -28,9 +34,14 @@ if not errorlevel 1 (
)
)
:: For now, math lib is basically hardwired
if exist ACML_PATH (
echo #define _MATHLIB_ "acml">> buildinfo.h$$
if "%CNTK_MKL%" == "1" (
if "%CNTK_MKL_SEQUENTIAL%" == "1" (
echo #define _MATHLIB_ "mkl-sequential">> buildinfo.h$$
) else (
echo #define _MATHLIB_ "mkl">> buildinfo.h$$
)
) else (
echo #define _MATHLIB_ "acml">> buildinfo.h$$
)
echo #define _BUILDER_ "%USERNAME%" >> buildinfo.h$$

Просмотреть файл

@ -151,7 +151,7 @@ void File::Init(const wchar_t* filename, int fileOptions)
#ifdef _WIN32
// Win32 accepts forward slashes, but it seems that PathRemoveFileSpec() does not
// TODO:
// "PathCchCanonicalize does the / to \ conversion as a part of the canonicalization, its
// "PathCchCanonicalize does the / to \ conversion as a part of the canonicalization, it's
// probably a good idea to do that anyway since I suspect that the '..' characters might
// confuse the other PathCch functions" [Larry Osterman]
// "Consider GetFullPathName both for canonicalization and last element finding." [Jay Krell]

Просмотреть файл

@ -498,9 +498,9 @@ public:
// check for custom separator character
// If the opening brace is immediately followed by any of the customSeparators,
// change m_separator (inside seps) to that character.
// The parser lets you change the default separator to something else. For example the default separator for an array is usually the : (I think)
// The parser lets you change the default separator to something else. For example the default separator for an array is usually the ':' (I think)
// (12:45:23:46)
// However if you are using strings, and one of those strings contains a :, you might want to change the separator to something else:
// However if you are using strings, and one of those strings contains a ':', you might want to change the separator to something else:
// (;this;is;a;path:;c:\mydir\stuff)
//
// This will fail for

Просмотреть файл

@ -1,12 +1,20 @@
#pragma once
// This uses mpi.h which requires the Microsoft MPI SDK to be installed on Windows
// [cf. https://msdn.microsoft.com/en-us/library/bb524831(v=vs.85).aspx]
// download msmpisdk.msi at https://www.microsoft.com/en-us/download/details.aspx?id=49926 and run it
// and the MPI dev package on Linux (sudo apt-get install libopenmpi-dev openmpi-bin openmpi-doc)
// Please see https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#ms-mpi or
// https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Linux#open-mpi for setup instructions
// of an MPI implementation on your platform.
#ifdef _MSC_VER
// Suppress warning for non-ASCII characters in MS-MPI headers
#pragma warning(push)
#pragma warning(disable : 4819) // The file contains a character that cannot be represented in the current code page (...). Save the file in Unicode format to prevent data loss
#include "mpi.h"
#pragma warning(pop)
#else
#include "mpi.h"
#endif
#pragma comment(lib, "msmpi.lib")
#include <string>
#include <array>
#include <vector>

Просмотреть файл

@ -130,6 +130,7 @@ static shared_ptr<ComputationNode<ElemType>> CreateStandardNode(const std::wstri
#if 1
else if (nodeType == OperationNameOf(LegacyReshapeNode)) return New<LegacyReshapeNode<ElemType>>(forward<_Types>(_Args)...);
#endif
else if (nodeType == OperationNameOf(MaxUnpoolingNode)) return New<MaxUnpoolingNode<ElemType>>(forward<_Types>(_Args)...);
else InvalidArgument("Attempted to instantiate undefined operation %ls.", nodeType.c_str());
}
@ -249,12 +250,12 @@ template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateConvolutionNode(const std::wstring& nodeName, const TensorShape& kernelShape, const TensorShape& mapCount,
const TensorShape& strideShape, const std::vector<bool>& sharing,
const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples)
bool transpose, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples)
{
return net.AddNodeToNetWithElemType(New<ConvolutionNode<ElemType>>(net.GetDeviceId(), nodeName,
kernelShape, mapCount, strideShape,
sharing, autoPadding, lowerPad, upperPad,
imageLayout, maxTempMemSizeInSamples));
transpose, imageLayout, maxTempMemSizeInSamples));
}
template <class ElemType>
@ -314,13 +315,13 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Convo
const TensorShape& kernelShape, const TensorShape& mapCount,
const TensorShape& strideShape, const std::vector<bool>& sharing,
const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples,
bool transpose, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<ConvolutionNode<ElemType>>(net.GetDeviceId(), nodeName,
kernelShape, mapCount, strideShape,
sharing, autoPadding, lowerPad, upperPad,
imageLayout, maxTempMemSizeInSamples),
transpose, imageLayout, maxTempMemSizeInSamples),
{ weight, inputValues });
}
@ -336,6 +337,19 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Pooli
{ inputValues });
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::MaxUnpooling(const ComputationNodePtr unpoolInputValues,
const ComputationNodePtr poolInputValues,
const TensorShape& kernelShape, const TensorShape& strideShape,
const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout,
const std::wstring nodeName)
{
return net.AddNodeToNetAndAttachInputs(New<MaxUnpoolingNode<ElemType>>(net.GetDeviceId(), nodeName,
kernelShape, strideShape, autoPadding, lowerPad, upperPad, imageLayout),
{ unpoolInputValues, poolInputValues });
}
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::MaxPooling(const ComputationNodePtr inputValues,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind,

Просмотреть файл

@ -54,7 +54,7 @@ public:
ComputationNodePtr CreateSparseInputNode(const std::wstring& inputName, const TensorShape& sampleLayout, const wstring& dynamicAxisName = L"");
ComputationNodePtr CreateConvolutionNode(const std::wstring& nodeName, const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& strideShape,
const std::vector<bool>& sharing, const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples);
bool transpose, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples);
ComputationNodePtr CreateConvolutionNode(const std::wstring& nodeName, const size_t kernelWidth, const size_t kernelHeight, const size_t outputChannels,
const size_t horizontalSubsample, const size_t verticalSubsample,
ImageLayoutKind imageLayoutKind, const bool zeroPadding = false, const size_t maxTempMemSizeInSamples = 0);
@ -81,13 +81,19 @@ public:
const ComputationNodePtr inputValues,
const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& strideShape,
const std::vector<bool>& sharing, const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples,
bool transpose, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples,
const std::wstring nodeName = L"");
ComputationNodePtr Pooling(const ComputationNodePtr inputValues,
PoolKind poolKind, const TensorShape& kernelShape, const TensorShape& strideShape,
const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout,
const std::wstring nodeName = L"");
ComputationNodePtr MaxUnpooling(const ComputationNodePtr unpoolInputValues,
const ComputationNodePtr poolInputValues,
const TensorShape& kernelShape, const TensorShape& strideShape,
const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout,
const std::wstring nodeName = L"");
ComputationNodePtr MaxPooling(const ComputationNodePtr inputValues,
const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind,
const std::wstring nodeName = L"");

Просмотреть файл

@ -41,6 +41,7 @@
<PrecompiledHeader>
</PrecompiledHeader>
<PreprocessorDefinitions>WIN32;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<DisableSpecificWarnings>4819</DisableSpecificWarnings>
</ClCompile>
<Link>
<AdditionalLibraryDirectories>$(MSMPI_LIB64);$(OutDir);$(NvmlLib)</AdditionalLibraryDirectories>

Просмотреть файл

@ -37,7 +37,8 @@
#define CNTK_MODEL_VERSION_6 6 // Batch norm blending
#define CNTK_MODEL_VERSION_7 7 // ElemType tag in model file
#define CNTK_MODEL_VERSION_8 8 // DynamicAxis for inputs
#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_8
#define CNTK_MODEL_VERSION_9 9 // Transpose flag in ConvolutionNode to support deconvolution.
#define CURRENT_CNTK_MODEL_VERSION CNTK_MODEL_VERSION_9
extern bool g_shareNodeValueMatrices;

Просмотреть файл

@ -52,15 +52,15 @@ class ConvolutionNodeBase : public ComputationNode<ElemType>
public:
ConvolutionNodeBase(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name), m_poolKind(PoolKind::None), m_maxTempMemSizeInSamples(0)
: Base(deviceId, name), m_poolKind(PoolKind::None), m_transpose(false), m_maxTempMemSizeInSamples(0)
{
}
ConvolutionNodeBase(DEVICEID_TYPE deviceId, const wstring& name, const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& strideShape,
const std::vector<bool>& sharing, const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
PoolKind poolKind, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples)
: Base(deviceId, name), m_kernelShape(kernelShape), m_mapCount(mapCount), m_stride(strideShape), m_sharing(sharing),
m_autoPad(autoPadding), m_lowerPad(lowerPad), m_upperPad(upperPad), m_poolKind(poolKind),
m_imageLayout(imageLayout), m_maxTempMemSizeInSamples(maxTempMemSizeInSamples)
const std::vector<bool>& sharing, const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
PoolKind poolKind, bool transpose, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples)
: Base(deviceId, name), m_kernelShape(kernelShape), m_mapCount(mapCount), m_stride(strideShape), m_sharing(sharing),
m_autoPad(autoPadding), m_lowerPad(lowerPad), m_upperPad(upperPad), m_poolKind(poolKind), m_transpose(transpose),
m_imageLayout(imageLayout), m_maxTempMemSizeInSamples(maxTempMemSizeInSamples)
{
}
@ -79,6 +79,7 @@ public:
fstream << (int32_t)m_poolKind;
fstream << (int32_t)m_imageLayout;
fstream << m_maxTempMemSizeInSamples;
fstream << m_transpose;
}
void Load(File& fstream, size_t modelVersion) override
@ -102,7 +103,11 @@ public:
fstream >> layout;
m_imageLayout = (ImageLayoutKind)layout;
fstream >> m_maxTempMemSizeInSamples;
}
}
if (modelVersion >= CNTK_MODEL_VERSION_9)
{
fstream >> m_transpose;
}
}
void CopyTo(ComputationNodeBasePtr nodeP, const std::wstring& newName, const CopyNodeFlags flags) const override
@ -119,64 +124,12 @@ public:
node->m_lowerPad = m_lowerPad;
node->m_upperPad = m_upperPad;
node->m_poolKind = m_poolKind;
node->m_transpose = m_transpose;
node->m_imageLayout = m_imageLayout;
node->m_maxTempMemSizeInSamples = m_maxTempMemSizeInSamples;
}
}
void BackpropTo(const size_t inputIndex, const FrameRange& fr) override
{
auto sliceOutputGrad = GradientFor(fr);
if (m_poolKind == PoolKind::None)
{
if (inputIndex == 0) // derivative with respect to the weight matrix
{
auto& grad = Input(0)->GradientAsMatrix();
auto sliceInput1Value = Input(1)->ValueFor(fr);
m_convEng->BackwardKernel(sliceOutputGrad, sliceInput1Value, grad, fr.IsAllFrames(), *m_tempMatrix);
}
else if (inputIndex == 1) // derivative with respect to the input feature
{
auto& input0 = Input(0)->ValueAsMatrix();
auto sliceInput1Grad = Input(1)->GradientFor(fr);
m_convEng->BackwardData(sliceOutputGrad, input0, sliceInput1Grad, *m_tempMatrix);
}
}
else
{
Matrix<ElemType> sliceInput0Grad = Input(0)->GradientFor(fr);
Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
m_convEng->BackwardPooling(sliceOutputValue, sliceOutputGrad, sliceInput0Value, sliceInput0Grad);
}
}
bool OutputUsedInComputingInputNodesGradients() const override
{
// The ConvolutionNode requires output values only for max pooling.
return m_poolKind == PoolKind::Max;
}
void ForwardProp(const FrameRange& fr) override
{
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
if (m_poolKind == PoolKind::None)
{
const Matrix<ElemType>& input0 = Input(0)->ValueAsMatrix();
Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
m_convEng->Forward(sliceInput1Value, input0, sliceOutputValue, *m_tempMatrix);
}
else
{
const Matrix<ElemType>& input0 = Input(0)->ValueFor(fr);
m_convEng->ForwardPooling(input0, sliceOutputValue);
}
}
void DumpNodeInfo(const bool printValues, const bool printMetadata, File& fstream) const override
{
Base::DumpNodeInfo(printValues, printMetadata, fstream);
@ -195,6 +148,7 @@ protected:
TensorShape m_lowerPad;
TensorShape m_upperPad;
PoolKind m_poolKind;
bool m_transpose;
ImageLayoutKind m_imageLayout;
size_t m_maxTempMemSizeInSamples;
@ -214,6 +168,7 @@ protected: \
using Base::m_lowerPad; \
using Base::m_upperPad; \
using Base::m_poolKind; \
using Base::m_transpose; \
using Base::m_imageLayout; \
using Base::m_maxTempMemSizeInSamples; \
using Base::m_tempMatrix; \
@ -241,8 +196,8 @@ public:
}
ConvolutionNode(DEVICEID_TYPE deviceId, const wstring& name, const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& strideShape,
const std::vector<bool>& sharing, const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples)
: Base(deviceId, name, kernelShape, mapCount, strideShape, sharing, autoPadding, lowerPad, upperPad, PoolKind::None, imageLayout, maxTempMemSizeInSamples),
bool transpose, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples)
: Base(deviceId, name, kernelShape, mapCount, strideShape, sharing, autoPadding, lowerPad, upperPad, PoolKind::None, transpose, imageLayout, maxTempMemSizeInSamples),
m_convolution2D(false)
{
}
@ -250,16 +205,16 @@ public:
const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayout,
bool zeroPadding, size_t maxTempMemSizeInSamples)
: ConvolutionNode(deviceId, name, TensorShape(kernelWidth, kernelHeight, 1), TensorShape(1, 1, outputChannels),
TensorShape(horizontalSubsample, verticalSubsample, 1), vector<bool>{true},
TensorShape(horizontalSubsample, verticalSubsample, 1), vector<bool>{true},
vector<bool>{zeroPadding}, TensorShape(0), TensorShape(0),
imageLayout, maxTempMemSizeInSamples)
false, imageLayout, maxTempMemSizeInSamples)
{
m_convolution2D = true;
}
ConvolutionNode(const ScriptableObjects::IConfigRecordPtr configp)
: ConvolutionNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"kernelShape"), configp->Get(L"mapCount"), configp->Get(L"strideShape"),
configp->Get(L"dimSharing"), configp->Get(L"dimPadding"), configp->Get(L"dimPadLower"), configp->Get(L"dimPadUpper"),
ImageLayoutKindFrom(configp->Get(L"imageLayout")), configp->Get(L"maxTempMemSizeInSamples"))
configp->Get(L"transpose"), ImageLayoutKindFrom(configp->Get(L"imageLayout")), configp->Get(L"maxTempMemSizeInSamples"))
{
AttachInputsFromConfig(configp, GetExpectedNumInputs());
}
@ -317,6 +272,48 @@ public:
}
}
void ForwardProp(const FrameRange& fr) override
{
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
const Matrix<ElemType>& input0 = Input(0)->ValueAsMatrix();
Matrix<ElemType> sliceInput1Value = Input(1)->ValueFor(fr);
if (!m_transpose)
m_convEng->Forward(sliceInput1Value, input0, sliceOutputValue, *m_tempMatrix);
else
{
// BackwardData adds results to the output so need to zero them out first.
// REVIEW alexeyk: should be rolled into BackwardData itself.
sliceOutputValue.SetValue(0);
m_convEng->BackwardData(sliceInput1Value, input0, sliceOutputValue, *m_tempMatrix);
}
}
void BackpropTo(const size_t inputIndex, const FrameRange& fr) override
{
auto sliceOutputGrad = GradientFor(fr);
if (inputIndex == 0) // derivative with respect to the weight matrix
{
auto& grad = Input(0)->GradientAsMatrix();
auto sliceInput1Value = Input(1)->ValueFor(fr);
if (!m_transpose)
m_convEng->BackwardKernel(sliceOutputGrad, sliceInput1Value, grad, fr.IsAllFrames(), *m_tempMatrix);
else
m_convEng->BackwardKernel(sliceInput1Value, sliceOutputGrad, grad, fr.IsAllFrames(), *m_tempMatrix);
}
else if (inputIndex == 1) // derivative with respect to the input feature
{
auto& input0 = Input(0)->ValueAsMatrix();
auto sliceInput1Grad = Input(1)->GradientFor(fr);
if (!m_transpose)
m_convEng->BackwardData(sliceOutputGrad, input0, sliceInput1Grad, *m_tempMatrix);
else
{
// REVIEW alexeyk: Forward overwrites values in sliceInput1Grad. Should handle correctly instead.
m_convEng->Forward(sliceOutputGrad, input0, sliceInput1Grad, *m_tempMatrix);
}
}
}
void Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
@ -324,6 +321,8 @@ public:
size_t inputIdx = GetExpectedNumInputs() - 1;
TensorShape inputShape;
TensorShape outputShape;
// If 2D convolution syntax is used then some of the tensor dimensions need to be inferred.
if (m_convolution2D)
{
// Need to update some tensors with correct input dims.
@ -346,38 +345,42 @@ public:
if (isFinalValidationPass && (Input(0)->GetAsMatrixNumCols() != weightCols || Input(0)->GetAsMatrixNumRows() != mapCount))
{
LogicError("Convolution weight matrix %ls should have dimension [%d, %d] which is [outputChannels, kernelWidth * kernelHeight * inputChannels]",
LogicError("Convolution weight matrix %ls should have dimension [%d, %d] which is [outputChannels, kernelWidth * kernelHeight * inputChannels]",
Input(0)->NodeName().c_str(), (int)mapCount, (int)weightCols);
}
auto outDims = ConvolveGeometry::ComputeOutputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
// ConvolveGeometry always uses CHW.
SetDims(ImageDimensions(outDims, ImageLayoutKind::CHW).AsTensorShape(m_imageLayout), HasMBLayout());
outputShape = ConvolveGeometry::ComputeOutputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
}
else
{
if (m_imageLayout != ImageLayoutKind::CHW)
{
InvalidArgument(
"%ls %ls supports only cuDNN (CHW) data layout. "
"Please specify imageLayout=\"cudnn\" in %ls node in your script "
"and make sure input data layout is CHW", NodeName().c_str(), OperationName().c_str(), NodeName().c_str());
}
inputShape = GetInputSampleLayout(inputIdx);
auto outDims = ConvolveGeometry::ComputeOutputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
SetDims(outDims, HasMBLayout());
if (!m_transpose)
{
outputShape = ConvolveGeometry::ComputeOutputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
}
else
{
// In case of transpose (deconvolution), node input (inputShape) is really the output of the convolution
// and node output (outDims) is convolution input. ConvolveGeometry does not care about deconvolutions (it does not have to).
outputShape = ConvolveGeometry::ComputeInputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
}
}
// ConvolveGeometry always uses CHW.
SetDims(ImageDimensions(outputShape, ImageLayoutKind::CHW).AsTensorShape(m_imageLayout), HasMBLayout());
if (isFinalValidationPass)
{
if (m_convEng == nullptr)
{
auto geometry = std::make_shared<ConvolveGeometry>(inputShape, m_kernelShape, m_mapCount, m_stride,
auto geometry = std::make_shared<ConvolveGeometry>(!m_transpose ? inputShape : outputShape,
m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
m_convEng = ConvolutionEngine<ElemType>::Create(geometry, m_deviceId, m_imageLayout,
m_maxTempMemSizeInSamples, m_poolKind);
m_maxTempMemSizeInSamples, m_poolKind,
ConvolutionEngineKind::All, NodeName());
}
if (Input(0)->GetAsMatrixNumCols() != m_kernelShape.GetNumElements() ||
@ -409,11 +412,13 @@ public:
}
protected:
// Flag that indicates whether the node is created using 2D-syntax.
bool m_convolution2D;
};
// -----------------------------------------------------------------------
// PoolingNode (inputFeature)
// Performs max or average ND pooling.
// -----------------------------------------------------------------------
template <class ElemType>
@ -432,9 +437,9 @@ public:
{
}
PoolingNode(DEVICEID_TYPE deviceId, const wstring& name, PoolKind pool, const TensorShape& kernelShape, const TensorShape& strideShape,
const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout)
: Base(deviceId, name, kernelShape, TensorShape(1), strideShape, vector<bool>{true}, autoPadding, lowerPad, upperPad, pool, imageLayout, 0)
const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout)
: Base(deviceId, name, kernelShape, TensorShape(1), strideShape, vector<bool>{true}, autoPadding, lowerPad, upperPad, pool, false, imageLayout, 0)
{
}
PoolingNode(const ScriptableObjects::IConfigRecordPtr configp)
@ -447,6 +452,129 @@ public:
}
public:
void ForwardProp(const FrameRange& fr) override
{
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
const Matrix<ElemType>& input0 = Input(0)->ValueFor(fr);
m_convEng->ForwardPooling(input0, sliceOutputValue);
}
void BackpropTo(const size_t inputIndex, const FrameRange& fr) override
{
auto sliceOutputGrad = GradientFor(fr);
Matrix<ElemType> sliceInput0Grad = Input(0)->GradientFor(fr);
Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
m_convEng->BackwardPooling(sliceOutputValue, sliceOutputGrad, sliceInput0Value, sliceInput0Grad);
}
bool OutputUsedInComputingInputNodesGradients() const override
{
// The PoolingNode requires output values only for max pooling.
return m_poolKind == PoolKind::Max;
}
void Validate(bool isFinalValidationPass) override
{
auto inputShape = GetInputSampleLayout(0);
ValidatePooling(inputShape, isFinalValidationPass);
if (isFinalValidationPass)
{
if (m_convEng == nullptr)
{
auto geometry = std::make_shared<ConvolveGeometry>(inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
m_convEng = ConvolutionEngine<ElemType>::Create(geometry, m_deviceId, m_imageLayout,
m_maxTempMemSizeInSamples, m_poolKind,
ConvolutionEngineKind::All, NodeName());
}
}
}
protected:
void ValidatePooling(const TensorShape& inputShape, bool isFinalValidationPass)
{
Base::Validate(isFinalValidationPass);
InferMBLayoutFromInputsForStandardCase(isFinalValidationPass);
if (m_imageLayout != ImageLayoutKind::CHW)
{
InvalidArgument(
"%ls %ls supports only cuDNN (CHW) data layout. "
"Please specify imageLayout=\"cudnn\" in %ls node in your script "
"and make sure input data layout is CHW", NodeName().c_str(), OperationName().c_str(), NodeName().c_str());
}
auto outDims = ConvolveGeometry::ComputeOutputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
SetDims(outDims, HasMBLayout());
}
};
// -----------------------------------------------------------------------
// MaxUnpoolingNode (unpoolInputValues, poolInputValues)
// Performs "max unpooling" operation. Max unpooling mirrors the operation
// performed by max pooling node and depends on the values provided to
// the max pooling node (so unlike deconvolution operation, it is not
// completely independent). Unpooling takes 2 inputs: features to be unpooled,
// which tensor has the same shape as corresponding max pooling node output
// and inputs for the original pooling node. Unpooling node
// produces an output which has the same dimensions as input to the
// corresponding max pooling node (i.e. poolInputValues).
// TODO: need to add support for other pooling types, for example,
// average unpooling. Note that in this case, generic unpooling operation
// will take different number of inputs depending on pooling type.
// -----------------------------------------------------------------------
template <class ElemType>
class MaxUnpoolingNode : public ConvolutionNodeBase<ElemType>, public NumInputs<2>
{
typedef ConvolutionNodeBase<ElemType> Base;
UsingConvolutionNodeBaseMembers;
static const std::wstring TypeName() { return L"MaxUnpooling"; }
public:
MaxUnpoolingNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name)
{
}
MaxUnpoolingNode(DEVICEID_TYPE deviceId, const wstring& name, const TensorShape& kernelShape, const TensorShape& strideShape,
const std::vector<bool>& autoPadding, const TensorShape& lowerPad, const TensorShape& upperPad,
ImageLayoutKind imageLayout)
: Base(deviceId, name, kernelShape, TensorShape(1), strideShape, vector<bool>{true}, autoPadding, lowerPad, upperPad, PoolKind::Max, true, imageLayout, 0)
{
}
MaxUnpoolingNode(const ScriptableObjects::IConfigRecordPtr configp)
: MaxUnpoolingNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"kernelShape"),
configp->Get(L"strideShape"), configp->Get(L"dimPadding"), configp->Get(L"dimPadLower"), configp->Get(L"dimPadUpper"),
ImageLayoutKindFrom(configp->Get(L"imageLayout")))
{
AttachInputsFromConfig(configp, GetExpectedNumInputs());
}
public:
void ForwardProp(const FrameRange& fr) override
{
const Matrix<ElemType>& unpoolInput = Input(0)->ValueFor(fr);
const Matrix<ElemType>& poolInput = Input(1)->ValueFor(fr);
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
m_convEng->MaxUnpooling(unpoolInput, poolInput, sliceOutputValue);
}
void BackpropTo(const size_t inputIndex, const FrameRange& fr) override
{
if (inputIndex != 0)
return;
auto sliceOutputGrad = GradientFor(fr);
Matrix<ElemType> sliceInput0Grad = Input(0)->GradientFor(fr);
// BUGBUG: ForwardPooling overwrites values in sliceInput1Grad. Should handle correctly instead.
m_convEng->ForwardPooling(sliceOutputGrad, sliceInput0Grad);
}
bool OutputUsedInComputingInputNodesGradients() const override { return false; }
void Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
@ -461,18 +589,22 @@ public:
}
auto inputShape = GetInputSampleLayout(0);
auto outDims = ConvolveGeometry::ComputeOutputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
SetDims(outDims, HasMBLayout());
// Same as in case of deconvolution, node input (inputShape) is really the output of the max pooling
// and node output (outDims) is pooling input.
auto outputShape = ConvolveGeometry::ComputeInputShape(inputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
SetDims(outputShape, HasMBLayout());
if (isFinalValidationPass)
{
if (m_convEng == nullptr)
{
auto geometry = std::make_shared<ConvolveGeometry>(inputShape, m_kernelShape, m_mapCount, m_stride,
auto geometry = std::make_shared<ConvolveGeometry>(outputShape, m_kernelShape, m_mapCount, m_stride,
m_sharing, m_autoPad, m_lowerPad, m_upperPad);
// Create reference engine as it's the only engine that implements unpooling.
m_convEng = ConvolutionEngine<ElemType>::Create(geometry, m_deviceId, m_imageLayout,
m_maxTempMemSizeInSamples, m_poolKind);
m_maxTempMemSizeInSamples, m_poolKind,
ConvolutionEngineKind::Reference,
NodeName());
}
}
}
@ -491,20 +623,20 @@ class PoolingNodeBase : public ComputationNode<ElemType>, public NumInputs<1>
public:
PoolingNodeBase(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name),
m_windowWidth(SIZE_MAX),
m_windowHeight(SIZE_MAX),
m_horizontalSubsample(SIZE_MAX),
m_verticalSubsample(SIZE_MAX),
m_imageLayoutKind(ImageLayoutKind::HWC)
m_windowWidth(SIZE_MAX),
m_windowHeight(SIZE_MAX),
m_horizontalSubsample(SIZE_MAX),
m_verticalSubsample(SIZE_MAX),
m_imageLayoutKind(ImageLayoutKind::HWC)
{
}
PoolingNodeBase(DEVICEID_TYPE deviceId, const wstring& name, const size_t windowWidth, const size_t windowHeight, const size_t horizontalSubsample, const size_t verticalSubsample, ImageLayoutKind imageLayoutKind)
: Base(deviceId, name),
m_windowWidth(windowWidth),
m_windowHeight(windowHeight),
m_horizontalSubsample(horizontalSubsample),
m_verticalSubsample(verticalSubsample),
m_imageLayoutKind(imageLayoutKind)
m_windowWidth(windowWidth),
m_windowHeight(windowHeight),
m_horizontalSubsample(horizontalSubsample),
m_verticalSubsample(verticalSubsample),
m_imageLayoutKind(imageLayoutKind)
{
}
PoolingNodeBase(const ScriptableObjects::IConfigRecordPtr configp)
@ -517,8 +649,8 @@ public:
void Save(File& fstream) const override
{
Base::Save(fstream);
uint32_t imageLayoutKind = (uint32_t) m_imageLayoutKind;
uint32_t windowWidth = (uint32_t) m_windowWidth;
uint32_t imageLayoutKind = (uint32_t)m_imageLayoutKind;
uint32_t windowWidth = (uint32_t)m_windowWidth;
fstream << windowWidth << imageLayoutKind << m_windowHeight << m_horizontalSubsample << m_verticalSubsample;
}
@ -551,6 +683,14 @@ public:
}
}
void ForwardProp(const FrameRange& fr) override
{
Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
m_convEng->ForwardPooling(sliceInput0Value, sliceOutputValue);
}
void BackpropTo(const size_t /*inputIndex*/, const FrameRange& fr) override
{
Matrix<ElemType> sliceInput0Grad = Input(0)->GradientFor(fr);
@ -562,14 +702,6 @@ public:
m_convEng->BackwardPooling(sliceOutputValue, sliceOutputGrad, sliceInput0Value, sliceInput0Grad);
}
void ForwardProp(const FrameRange& fr) override
{
Matrix<ElemType> sliceInput0Value = Input(0)->ValueFor(fr);
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
m_convEng->ForwardPooling(sliceInput0Value, sliceOutputValue);
}
void Validate(bool isFinalValidationPass) override
{
Base::Validate(isFinalValidationPass);
@ -686,7 +818,11 @@ public:
{
Base::Validate(isFinalValidationPass);
if (isFinalValidationPass && m_convEng == nullptr)
m_convEng = ConvolutionEngine<ElemType>::Create(m_geometry, m_deviceId, m_imageLayoutKind, 0, PoolKind::Max);
{
m_convEng = ConvolutionEngine<ElemType>::Create(m_geometry, m_deviceId, m_imageLayoutKind,
0, PoolKind::Max,
ConvolutionEngineKind::All, NodeName());
}
}
};
@ -722,7 +858,11 @@ public:
{
Base::Validate(isFinalValidationPass);
if (isFinalValidationPass && m_convEng == nullptr)
m_convEng = ConvolutionEngine<ElemType>::Create(m_geometry, m_deviceId, m_imageLayoutKind, 0, PoolKind::Average);
{
m_convEng = ConvolutionEngine<ElemType>::Create(m_geometry, m_deviceId, m_imageLayoutKind,
0, PoolKind::Average,
ConvolutionEngineKind::All, NodeName());
}
}
};

Просмотреть файл

@ -83,7 +83,7 @@ protected:
// - ranges of neighbor frames as a secondary tensor dimension (i.e. can be used to implement a rolling window)
// - full support/efficiency of non-recurrent use (in which case the range can be from negative to positive, e.g. a symmetric rolling window)
// - denoting which tensor dimension to loop over (this may not be completed, but I will plant a seed)
// - support for Yongqiangs sub-minibatching with truncated BPTT (export/import state)
// - support for Yongqiang's sub-minibatching with truncated BPTT (export/import state)
// - more efficient storage of carried-over state (only store the needed frames, not a full copy of the previous MB as currently; which will on the other hand also allow windows that reach back beyond a minibatch)
// -----------------------------------------------------------------------

Просмотреть файл

@ -90,7 +90,7 @@ template <class ElemType>
// "LogPlus": softmax
// f(x) = log(sum_i exp x_i), hence gradient is:
// df / dx_i = 1 / (sum_j exp x_j) * exp x_i = (Softmax(x))_i = exp(x_i ReduceLogPlus(x))
// df / dx_i = 1 / (sum_j exp x_j) * exp x_i = (Softmax(x))_i = exp(x_i - ReduceLogPlus(x))
// targetGradient = gradientFromTop .* Exp (inputValue - outputValue) --TODO: verify
// i.e. compute dfference if input and output, then Exp in-place. No, would need temp memory. So needs its own opcode AddScaledExpOfDiff(). Ternary.

Просмотреть файл

@ -55,8 +55,9 @@
<TargetName>EvalDll</TargetName>
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<ClCompile>
<AdditionalIncludeDirectories>$(SolutionDir)Source\Readers\ReaderLib;$(SolutionDir)Source\SGDLib;$(SolutionDir)Source\ComputationNetworkLib;$(SolutionDir)Source\SequenceTrainingLib;$(SolutionDir)Source\Math;$(SolutionDir)Source\Common\Include;$(SolutionDir)Source\CNTK\BrainScript;$(SolutionDir)Source\ActionsLib;$(MSMPI_INC);$(NvmlInclude)</AdditionalIncludeDirectories>
<DisableSpecificWarnings>4819</DisableSpecificWarnings>
</ClCompile>
<Link>
<AdditionalLibraryDirectories>$(SolutionDir)Source\ComputationNetworkLib;$(SolutionDir)Source\Math;$(MSMPI_LIB64);$(SolutionDir)$(Platform)\$(Configuration);$(NvmlLibPath)</AdditionalLibraryDirectories>
@ -153,4 +154,4 @@
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>
</Project>

Просмотреть файл

@ -4315,6 +4315,51 @@ void CPUMatrix<ElemType>::MaxPoolingBackward(const CPUMatrix<ElemType>& out, con
}
}
template <class ElemType>
void CPUMatrix<ElemType>::MaxUnpooling(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices,
const CPUMatrix<int>& indices, const CPUMatrix<ElemType>& poolInput,
CPUMatrix<ElemType>& input) const
{
#pragma omp parallel for
for (int64_t sample = 0; sample < (int64_t)GetNumCols(); sample++)
{
for (size_t row = 0; row < GetNumRows(); row++)
{
int colBase = mpRowCol(row, 0);
assert(0 <= colBase && colBase < input.GetNumRows());
int i0 = mpRowIndices(row, 0);
int size = indices(i0++, 0);
assert(size > 0);
ElemType curMax = poolInput(colBase + indices(i0, 0), sample);
ElemType prevMax = curMax;
int imax = 0;
for (int i = 1; i < size; i++)
{
int dcol = indices(i0 + i, 0);
assert(0 <= colBase + dcol && colBase + dcol < poolInput.GetNumRows());
curMax = std::max(curMax, poolInput(colBase + dcol, sample));
if (curMax > prevMax)
{
prevMax = curMax;
imax = i;
}
}
int dcol = indices(i0 + imax, 0);
assert(0 <= colBase + dcol && colBase + dcol < input.GetNumRows());
input(colBase + dcol, sample) = (*this)(row, sample);
//int i = (int)poolIn(row, sample);
//assert(0 <= i && i < size);
//int dcol = indices(i0 + i, 0);
//assert(0 <= colBase + dcol && colBase + dcol < input.GetNumRows());
//input(colBase + dcol, sample) = (*this)(row, sample);
}
}
}
template <class ElemType>
void CPUMatrix<ElemType>::AveragePoolingForward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices, CPUMatrix<ElemType>& output) const
{

Просмотреть файл

@ -367,6 +367,7 @@ public:
void MaxPoolingBackward(const CPUMatrix<ElemType>& out, const CPUMatrix<ElemType>& in,
const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices,
CPUMatrix<ElemType>& grad) const;
void MaxUnpooling(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices, const CPUMatrix<ElemType>& poolInput, CPUMatrix<ElemType>& input) const;
void AveragePoolingForward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices, CPUMatrix<ElemType>& output) const;
void AveragePoolingBackward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices,

Просмотреть файл

@ -11,6 +11,16 @@
namespace Microsoft { namespace MSR { namespace CNTK {
// -----------------------------------------------------------------------
// The file contains CUDA kernels that are used in reference convolution
// engine. All these kernels look very similar as they use the same
// idea of precomputed maps described in ConvolveGeometry.h
// That is, 'mpRowCol' maps each convolution output to the start of the
// input. 'mpRowIwht', 'mpRowRun' and 'runs' provide maps that allow
// to get indices of the active weight when applying the convolution.
// See ConvolveGeometry.h (MpRowCol, MpRowIwht etc) for more details.
// -----------------------------------------------------------------------
template <typename ElemType>
__global__ void kConvolutionForward(int batchSize, const ElemType* __restrict__ kernel,
const int* mpRowCol, const int* mpRowIwht,
@ -203,6 +213,53 @@ __global__ void kMaxPoolingBackward(int batchSize, const ElemType* out, const El
}
}
template <typename ElemType>
__global__ void kMaxUnpooling(int batchSize, const int* mpRowCol, const int* mpRowIndices, const int* indices,
const ElemType* __restrict__ src, const ElemType* poolIn, int srcVecSize,
ElemType* dst, int dstVecSize)
{
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= srcVecSize)
return;
src += blockIdx.y * srcVecSize;
poolIn += blockIdx.y * dstVecSize;
dst += blockIdx.y * dstVecSize;
for (int sample = blockIdx.y; sample < batchSize; sample += gridDim.y)
{
int colBase = mpRowCol[row];
assert(0 <= colBase && colBase < dstVecSize);
int i0 = mpRowIndices[row];
int size = indices[i0++];
ElemType curMax = poolIn[colBase + indices[i0]];
ElemType prevMax = curMax;
int imax = 0;
for (int i = 1; i < size; i++)
{
int dcol = indices[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < dstVecSize);
curMax = max(curMax, poolIn[colBase + dcol]);
if (curMax > prevMax)
{
prevMax = curMax;
imax = i;
}
}
int dcol = indices[i0 + imax];
assert(0 <= colBase + dcol && colBase + dcol < dstVecSize);
dst[colBase + dcol] = src[row];
src += blockIdx.y * srcVecSize;
poolIn += blockIdx.y * dstVecSize;
dst += blockIdx.y * dstVecSize;
}
}
template <typename ElemType>
__global__ void kAveragePoolingForward(int batchSize, const int* mpRowCol, const int* mpRowIndices, const int* indices,
const ElemType* __restrict__ src, int srcVecSize,

Просмотреть файл

@ -107,6 +107,26 @@ void ConvolutionEngine<ElemType>::BackwardPooling(const Mat& out, const Mat& src
BackwardPoolingCore(out, srcGrad, in, grad);
}
template <class ElemType>
void ConvolutionEngine<ElemType>::MaxUnpooling(const Mat& out, const Mat& poolIn, Mat& in)
{
const auto& g = *m_geometry;
assert(g.InputShape().GetNumElements() == in.GetNumRows());
assert(g.InputShape().GetNumElements() == poolIn.GetNumRows());
assert(g.OutputShape().GetNumElements() == out.GetNumRows());
size_t batchSize = in.GetNumCols();
assert(batchSize == out.GetNumCols());
assert(batchSize == poolIn.GetNumCols());
#ifdef NDEBUG
UNUSED(g);
UNUSED(batchSize);
#endif
EnsureCompatible();
EnsurePoolingInitialized();
MaxUnpoolingCore(out, poolIn, in);
}
//------------------------------------------------------------------
// Reference convolution engine implementation.
// This engine supports arbitrary convolution geometry but does not provide efficient implementation.
@ -210,6 +230,11 @@ protected:
InvalidArgument("Pooling type %d is not supported.", (int)m_poolKind);
}
void MaxUnpoolingCore(const Mat& out, const Mat& poolIn, Mat& in) override
{
out.MaxUnpooling(m_mpRowCol, *m_mpRowIndices, *m_indices, poolIn, in);
}
protected:
static bool IsGpu(DEVICEID_TYPE deviceId)
{
@ -500,6 +525,15 @@ protected:
InvalidArgument("Pooling type %d is not supported.", (int)m_poolKind);
}
void MaxUnpoolingCore(const Mat& out, const Mat& poolIn, Mat& in) override
{
UNUSED(out);
UNUSED(poolIn);
UNUSED(in);
// Not implemented but potentially can make a fallback to reference engine.
LogicError("MaxUnpooling is not implemented for legacy engine.");
}
private:
ImageDimensions m_inT;
ImageDimensions m_outT;
@ -816,8 +850,11 @@ public:
template <class ElemType>
std::unique_ptr<ConvolutionEngine<ElemType>> ConvolutionEngine<ElemType>::Create(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, PoolKind poolKind,
ConvolutionEngineKind enabledEngines)
ConvolutionEngineKind enabledEngines, std::wstring logPrefix)
{
if (!logPrefix.empty())
logPrefix += L": ";
auto isEnabled = [=](ConvolutionEngineKind eng) { return ((int)enabledEngines & (int)eng) != 0; };
// Note: in some cases do not throw exception even if parameters do not match as Create
// can be called from places like MEL with default parameters and never be used.
@ -829,7 +866,7 @@ std::unique_ptr<ConvolutionEngine<ElemType>> ConvolutionEngine<ElemType>::Create
if (!isEnabled(ConvolutionEngineKind::Legacy))
RuntimeError("Trying to use Legacy convolution engine when it's disabled.");
// REVIEW alexeyk: should honor m_traceLevel here.
fprintf(stderr, "\nUsing legacy convolution engine for geometry: %s.\n", engStr.c_str());
fprintf(stderr, "\n%lsusing legacy convolution engine for geometry: %s.\n", logPrefix.c_str(), engStr.c_str());
return std::make_unique<LegacyConvolutionEngine<ElemType>>(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind);
}
@ -837,19 +874,19 @@ std::unique_ptr<ConvolutionEngine<ElemType>> ConvolutionEngine<ElemType>::Create
if (isEnabled(ConvolutionEngineKind::CuDnn) &&
CuDnnConvolutionEngineFactory<ElemType>::IsSupported(deviceId, geometry, poolKind))
{
fprintf(stderr, "\nUsing cuDNN convolution engine for geometry: %s.\n", engStr.c_str());
fprintf(stderr, "\n%lsusing cuDNN convolution engine for geometry: %s.\n", logPrefix.c_str(), engStr.c_str());
return CuDnnConvolutionEngineFactory<ElemType>::Create(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind);
}
if (isEnabled(ConvolutionEngineKind::Gemm) && GemmConvolutionEngine<ElemType>::IsSupported(deviceId, geometry))
{
fprintf(stderr, "\nUsing GEMM convolution engine for geometry: %s.\n", engStr.c_str());
fprintf(stderr, "\n%lsusing GEMM convolution engine for geometry: %s.\n", logPrefix.c_str(), engStr.c_str());
return std::make_unique<GemmConvolutionEngine<ElemType>>(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind);
}
if (!isEnabled(ConvolutionEngineKind::Reference))
RuntimeError("Reference convolution is disabled and no other engine supports such configuratin (or disabled).");
fprintf(stderr, "\nUsing reference convolution engine for geometry: %s.\n", engStr.c_str());
fprintf(stderr, "\n%lsusing reference convolution engine for geometry: %s.\n", logPrefix.c_str(), engStr.c_str());
return std::make_unique<ReferenceConvolutionEngine<ElemType>>(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind);
}

Просмотреть файл

@ -55,10 +55,14 @@ public:
void BackwardPooling(const Mat& out, const Mat& srcGrad, const Mat& in, Mat& grad);
void MaxUnpooling(const Mat& out, const Mat& poolIn, Mat& in);
std::shared_ptr<const ConvolveGeometry> Geometry() const { return m_geometry; }
static std::unique_ptr<ConvolutionEngine<ElemType>> Create(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout,
size_t maxTempMemSizeInSamples, PoolKind poolKind = PoolKind::None, ConvolutionEngineKind enabledEngines = ConvolutionEngineKind::All);
size_t maxTempMemSizeInSamples, PoolKind poolKind = PoolKind::None,
ConvolutionEngineKind enabledEngines = ConvolutionEngineKind::All,
std::wstring logPrefix = L"");
DISABLE_COPY_AND_MOVE(ConvolutionEngine);
@ -91,6 +95,8 @@ protected:
virtual void BackwardPoolingCore(const Mat& out, const Mat& srcGrad, const Mat& in, Mat& grad) = 0;
virtual void MaxUnpoolingCore(const Mat& out, const Mat& poolIn, Mat& in) = 0;
protected:
ConvolveGeometryPtr m_geometry;
DEVICEID_TYPE m_deviceId;

Просмотреть файл

@ -21,6 +21,7 @@ namespace Microsoft { namespace MSR { namespace CNTK {
// arbitrary configurations and dimensions. In such case the generic implementation becomes very simple and invariant
// wrt convolution configuration and dimensionality. For specific cases like 2D/3D convolutions and full sharing,
// highly optimized implementations (e.g. cuDNN) are used.
// TODO: rename to ConvolutionGeometry
class ConvolveGeometry final
{
public:
@ -426,6 +427,7 @@ public:
return -(center - (kernSize - 1) / 2);
}
// Computes output shape given input shape and other convolution parameters.
static TensorShape ComputeOutputShape(const TensorShape& inputShape, const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& stride,
const BoolVec& sharing, const BoolVec& autoPad, const TensorShape& lowerPad, const TensorShape& upperPad)
{
@ -492,6 +494,69 @@ public:
return dimsOut;
}
// Computes input shape given output shape and other convolution parameters.
// Used in deconvolution operation.
static TensorShape ComputeInputShape(const TensorShape& outputShape, const TensorShape& kernelShape, const TensorShape& mapCount, const TensorShape& stride,
const BoolVec& sharing, const BoolVec& autoPad, const TensorShape& lowerPad, const TensorShape& upperPad)
{
if (outputShape.GetRank() != kernelShape.GetRank())
InvalidArgument("Convolution output and kernel tensors must have the same rank.");
if (mapCount.GetRank() != 1 && outputShape.GetRank() != mapCount.GetRank())
InvalidArgument("Convolution map tensor must have rank 1 or the same as the output tensor.");
if (stride.GetRank() != 1 && outputShape.GetRank() != stride.GetRank())
InvalidArgument("Convolution stride tensor must have rank 1 or the same as the output tensor.");
if (sharing.size() != 1 && outputShape.GetRank() != sharing.size())
InvalidArgument("Convolution sharing tensor must have rank 1 or the same as the output tensor.");
if (autoPad.size() != 1 && outputShape.GetRank() != autoPad.size())
InvalidArgument("Convolution padding tensor must have rank 1 or the same as the output tensor.");
if (lowerPad.GetRank() != 1 && outputShape.GetRank() != lowerPad.GetRank())
InvalidArgument("Convolution lower pad tensor must have rank 1 or the same as the output tensor.");
if (upperPad.GetRank() != 1 && outputShape.GetRank() != upperPad.GetRank())
InvalidArgument("Convolution upper pad tensor must have rank 1 or the same as the output tensor.");
SmallVector<size_t> dimsInput(outputShape.GetRank());
for (size_t i = 0; i < outputShape.GetRank(); i++)
{
assert(outputShape[i] >= 1);
size_t delta = stride[stride.GetRank() == 1 ? 0 : i];
size_t dim = outputShape[i];
// Input dimension does not include output map count.
size_t curMapCount = 1;
if (mapCount.size() > 1)
curMapCount = mapCount[i];
else if (i == outputShape.GetRank() - 1)
curMapCount = mapCount[0];
assert((dim % curMapCount) == 0);
dim /= curMapCount;
bool autoPadCur = autoPad[autoPad.size() == 1 ? 0 : i];
size_t lo = lowerPad[lowerPad.size() == 1 ? 0 : i];
size_t hi = upperPad[upperPad.size() == 1 ? 0 : i];
size_t dimIn = (dim - 1) * delta;
// We need to be able to restore any input size from the output, not just the one
// that does not require padding. For example, if output is 14, stride 2 and
// desired input is 28 then padded input will be 31. In this case if autopadding is enabled,
// the input will 27 as (27 - 1) / 2 + 1 == 14.
if (autoPadCur)
dimIn += 1;
else
dimIn += (int64_t)kernelShape[i] - (lo + hi);
// When LowerPad and/or UpperPad are specified (i.e. > 0), we insist that the kernel applications
// fill the entire space.
if (!autoPadCur && (lo > 0 || hi > 0))
{
size_t size = (dimIn - kernelShape[i] + lo + hi) / delta + 1;
if (size != dim)
InvalidArgument("Convolution requires that kernel fills the entire space if auto-padding is disabled.");
}
dimsInput[i] = dimIn;
}
return TensorShape(dimsInput);
}
// Used in unit tests and during debugging.
operator std::string() const
{

Просмотреть файл

@ -306,6 +306,15 @@ protected:
m_inT, ptr(in), &C::One, m_inT, ptr(grad)));
}
void MaxUnpoolingCore(const Mat& out, const Mat& poolIn, Mat& in) override
{
UNUSED(out);
UNUSED(poolIn);
UNUSED(in);
// Not implemented but potentially can make a fallback to reference engine.
LogicError("MaxUnpooling is not implemented for cuDNN engine.");
}
private:
using C = Consts<ElemType>;

Просмотреть файл

@ -3074,6 +3074,17 @@ void GPUMatrix<ElemType>::MaxPoolingBackward(const GPUMatrix<ElemType>& out, con
Data(), (int)GetNumRows(), grad.Data(), (int)grad.GetNumRows());
}
template <class ElemType>
void GPUMatrix<ElemType>::MaxUnpooling(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, const GPUMatrix<ElemType>& poolInput, GPUMatrix<ElemType>& input) const
{
const int BlockSize = 128;
auto gdim = dim3((GetNumRows() + BlockSize - 1)/ BlockSize, std::min((int)GetNumCols(), 65535));
PrepareDevice();
SyncGuard syncGuard;
kMaxUnpooling<<<gdim, BlockSize, 0, t_stream>>>((int)GetNumCols(), mpRowCol.Data(), mpRowIndices.Data(), indices.Data(),
Data(), poolInput.Data(), (int)GetNumRows(), input.Data(), (int)input.GetNumRows());
}
template <class ElemType>
void GPUMatrix<ElemType>::AveragePoolingForward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& output) const
{
@ -3137,6 +3148,7 @@ void GPUMatrix<ElemType>::BatchNormalizationForward(const GPUMatrix<ElemType>& s
if (blendFactor > 0)
{
// REVIEW alexeyk: can be rolled into NormalizeBatchTraining to save bandwidth.
// TODO: add a 'beta' parameter to ScaleAndAdd()
Scale((ElemType)(1 - blendFactor), saveMean);
ScaleAndAdd((ElemType)blendFactor, runMean, saveMean);
Scale((ElemType)(1 - blendFactor), saveInvStdDev);

Просмотреть файл

@ -445,6 +445,7 @@ public:
void MaxPoolingBackward(const GPUMatrix<ElemType>& out, const GPUMatrix<ElemType>& in,
const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices,
GPUMatrix<ElemType>& grad) const;
void MaxUnpooling(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, const GPUMatrix<ElemType>& poolInput, GPUMatrix<ElemType>& input) const;
void AveragePoolingForward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& output) const;
void AveragePoolingBackward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& grad) const;

Просмотреть файл

@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" InitialTargets="CheckDependencies" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
@ -31,7 +31,6 @@
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v120</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
<UseIntelMKL>No</UseIntelMKL>
</PropertyGroup>
<PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType>
@ -39,8 +38,6 @@
<PlatformToolset>v120</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
<UseIntelMKL>No</UseIntelMKL>
<UseIntelIPP>false</UseIntelIPP>
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings" />
@ -55,17 +52,18 @@
</PropertyGroup>
<ItemDefinitionGroup>
<ClCompile>
<AdditionalIncludeDirectories>$(ACML_PATH)\include;$(SolutionDir)Source\Common\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<AdditionalIncludeDirectories>$(MathIncludePath);$(SolutionDir)Source\Common\include;%(AdditionalIncludeDirectories)</AdditionalIncludeDirectories>
<DisableSpecificWarnings>4819</DisableSpecificWarnings>
</ClCompile>
<Link>
<AdditionalLibraryDirectories>$(ACML_PATH)\lib;$(OutDir)</AdditionalLibraryDirectories>
<AdditionalLibraryDirectories>$(MathLibraryPath);$(OutDir)</AdditionalLibraryDirectories>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(DebugBuild)">
<ClCompile>
<PrecompiledHeader>NotUsing</PrecompiledHeader>
<WarningLevel>Level4</WarningLevel>
<PreprocessorDefinitions>USE_ACML; NO_SYNC; WIN32; _DEBUG; _WINDOWS; _USRDLL; MATH_EXPORTS; %(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions>$(MathDefine); NO_SYNC; WIN32; _DEBUG; _WINDOWS; _USRDLL; MATH_EXPORTS; %(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>true</SDLCheck>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<Optimization>Disabled</Optimization>
@ -77,13 +75,13 @@
<Link>
<SubSystem>Console</SubSystem>
<GenerateDebugInformation>true</GenerateDebugInformation>
<AdditionalDependencies>libacml_mp_dll.lib;Common.lib;%(AdditionalDependencies)</AdditionalDependencies>
<DelayLoadDLLs>libacml_mp_dll.dll; $(CudaDlls); %(DelayLoadDLLs)</DelayLoadDLLs>
<AdditionalDependencies>$(MathLinkLibrary);Common.lib;%(AdditionalDependencies)</AdditionalDependencies>
<DelayLoadDLLs>$(MathDelayLoad); $(CudaDlls); %(DelayLoadDLLs)</DelayLoadDLLs>
<Profile>true</Profile>
</Link>
<PostBuildEvent>
<Command>xcopy /D /I /Y "$(ACML_PATH)\lib\*.dll" "$(OutputPath)"</Command>
<Message>Copying ACML DLLs</Message>
<Command>xcopy /D /I /Y "$(MathPostBuildCopyPattern)" "$(OutDir)"</Command>
<Message>Copying $(MathLibraryName) DLLs</Message>
</PostBuildEvent>
<CudaCompile>
<TargetMachinePlatform>64</TargetMachinePlatform>
@ -103,7 +101,7 @@
<Optimization>MaxSpeed</Optimization>
<FunctionLevelLinking>true</FunctionLevelLinking>
<IntrinsicFunctions>true</IntrinsicFunctions>
<PreprocessorDefinitions>USE_ACML; NO_SYNC; WIN32; NDEBUG; _WINDOWS; _USRDLL; MATH_EXPORTS; %(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions>$(MathDefine); NO_SYNC; WIN32; NDEBUG; _WINDOWS; _USRDLL; MATH_EXPORTS; %(PreprocessorDefinitions)</PreprocessorDefinitions>
<SDLCheck>true</SDLCheck>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<FloatingPointModel>Fast</FloatingPointModel>
@ -119,13 +117,13 @@
<GenerateDebugInformation>true</GenerateDebugInformation>
<EnableCOMDATFolding>true</EnableCOMDATFolding>
<OptimizeReferences>true</OptimizeReferences>
<AdditionalDependencies>libacml_mp_dll.lib;Common.lib;%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>$(MathLinkLibrary);Common.lib;%(AdditionalDependencies)</AdditionalDependencies>
<DelayLoadDLLs>$(MathDelayLoad); $(CudaDlls); %(DelayLoadDLLs)</DelayLoadDLLs>
<Profile>true</Profile>
<DelayLoadDLLs>libacml_mp_dll.dll; $(CudaDlls); %(DelayLoadDLLs)</DelayLoadDLLs>
</Link>
<PostBuildEvent>
<Command>xcopy /D /I /Y "$(ACML_PATH)\lib\*.dll" "$(OutputPath)"</Command>
<Message>Copying ACML DLLs</Message>
<Command>xcopy /D /I /Y "$(MathPostBuildCopyPattern)" "$(OutDir)"</Command>
<Message>Copying $(MathLibraryName) DLLs</Message>
</PostBuildEvent>
<CudaCompile>
<FastMath>true</FastMath>
@ -155,7 +153,7 @@
<PreprocessorDefinitions>CPUONLY;%(PreprocessorDefinitions)</PreprocessorDefinitions>
</ClCompile>
<Link>
<DelayLoadDLLs>libacml_mp_dll.dll</DelayLoadDLLs>
<DelayLoadDLLs>$(MathDelayLoad)</DelayLoadDLLs>
</Link>
</ItemDefinitionGroup>
<ItemGroup>
@ -167,9 +165,9 @@
<ClInclude Include="ConvolutionEngine.h" />
<ClInclude Include="ConvolveGeometry.h" />
<ClInclude Include="CPUMatrix.h" />
<ClInclude Include="CPURNGHandle.h" />
<ClInclude Include="CPURNGHandle.h" />
<ClInclude Include="MatrixQuantizerImpl.h" />
<ClInclude Include="RNGHandle.h" />
<ClInclude Include="RNGHandle.h" />
<ClInclude Include="TensorOps.h" />
<ClInclude Include="TensorView.h" />
<None Include="GPUWatcher.cu" />
@ -193,7 +191,7 @@
<ItemGroup>
<ClCompile Include="BatchNormalizationEngine.cpp" />
<ClCompile Include="ConvolutionEngine.cpp" />
<ClCompile Include="CPURNGHandle.cpp" />
<ClCompile Include="CPURNGHandle.cpp" />
<ClCompile Include="CPUSparseMatrix.cpp" />
<ClCompile Include="CUDAPageLockedMemAllocator.cpp" />
<ClCompile Include="dllmain.cpp">
@ -207,7 +205,7 @@
<ClCompile Include="NoGPU.cpp" />
<ClCompile Include="Matrix.cpp" />
<ClCompile Include="QuantizedMatrix.cpp" />
<ClCompile Include="RNGHandle.cpp" />
<ClCompile Include="RNGHandle.cpp" />
<ClCompile Include="stdafx.cpp">
<PrecompiledHeader>Create</PrecompiledHeader>
</ClCompile>
@ -218,4 +216,8 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" />
</Project>
<Target Name="CheckDependencies">
<Error Condition="'$(CNTK_MKL)' == '1' And !Exists('$(CNTKCustomMKLPath)')" Text="CNTK custom MKL not found. See https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#optional-mkl for instructions." />
<Error Condition="'$(CNTK_MKL)' != '1' And !Exists('$(ACML_PATH)')" Text="ACML not found. See https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#acml for instructions." />
</Target>
</Project>

Просмотреть файл

@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" InitialTargets="CheckDependencies" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
@ -65,6 +65,7 @@
<AdditionalIncludeDirectories>$(SolutionDir)Source\Common\include;$(CudaInclude);$(CUB_PATH);$(CuDnnIncPath)</AdditionalIncludeDirectories>
<MultiProcessorCompilation>true</MultiProcessorCompilation>
<FloatingPointModel>Fast</FloatingPointModel>
<DisableSpecificWarnings>4819</DisableSpecificWarnings>
</ClCompile>
<Link>
<AdditionalDependencies>$(CudaLibs);%(AdditionalDependencies)</AdditionalDependencies>
@ -82,7 +83,7 @@
<FastMath>true</FastMath>
<GPUDebugInfo>false</GPUDebugInfo>
<GPUDebugInfo Condition="'$(CNTK_CUDA_DEVICE_DEBUGINFO)'=='1'">true</GPUDebugInfo>
<AdditionalOptions>-Xcudafe "--diag_suppress=field_without_dll_interface" %(AdditionalOptions)</AdditionalOptions>
<AdditionalOptions>-Xcudafe "--diag_suppress=field_without_dll_interface" -Xcompiler "/wd 4819" %(AdditionalOptions)</AdditionalOptions>
</CudaCompile>
<PostBuildEvent>
<Command>for %%l in ($(CudaDlls)) do if exist "$(CudaPath)\bin\%%l" xcopy /D /Y "$(CudaPath)\bin\%%l*" "$(OutputPath)"
@ -198,4 +199,4 @@ if exist "$(CuDnnDll)" xcopy /D /Y "$(CuDnnDll)" "$(OutputPath)"
<Error Condition="!Exists('$(CUB_PATH)')" Text="CNTK requires the NVIDIA CUB library to build. Please see https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#cuda for installation instructions." />
<Error Condition="!Exists('$(CUDNN_PATH)')" Text="CNTK requires the NVIDIA cuDNN library to build. Please see https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#cuda for installation instructions." />
</Target>
</Project>
</Project>

Просмотреть файл

@ -4207,6 +4207,31 @@ void Matrix<ElemType>::MaxPoolingBackward(const Matrix<ElemType>& out, const Mat
NOT_IMPLEMENTED);
}
template <class ElemType>
void Matrix<ElemType>::MaxUnpooling(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, const Matrix<ElemType>& poolInput, Matrix<ElemType>& input) const
{
assert(mpRowCol.GetNumCols() == 1);
assert(mpRowIndices.GetNumCols() == 1);
assert(indices.GetNumCols() == 1);
DecideAndMoveToRightDevice(*this, input);
// REVIEW alexeyk: setting values to zero may cause inconsistency when negative values are unpooled.
// To see why, let's assume we have just one input with negative value and output of, for example, 2x2.
// As a result of unpooling, there will be 3 zero values and one negative. If we now apply max pooling
// operation to the output then we get 0 as the output, not the original negative value.
// In practice this will not happen as pooling layers usually go right after ReLU layer.
input.SetValue(0);
// REVIEW alexeyk: add sparse version.
DISPATCH_MATRIX_ON_FLAG(this,
this,
m_CPUMatrix->MaxUnpooling(*(mpRowCol.m_CPUMatrix), *(mpRowIndices.m_CPUMatrix), *(indices.m_CPUMatrix), *(poolInput.m_CPUMatrix), *(input.m_CPUMatrix)),
m_GPUMatrix->MaxUnpooling(*(mpRowCol.m_GPUMatrix), *(mpRowIndices.m_GPUMatrix), *(indices.m_GPUMatrix), *(poolInput.m_GPUMatrix), *(input.m_GPUMatrix)),
NOT_IMPLEMENTED,
NOT_IMPLEMENTED);
}
template <class ElemType>
void Matrix<ElemType>::AveragePoolingForward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& output) const
{

Просмотреть файл

@ -495,6 +495,7 @@ public:
void MaxPoolingBackward(const Matrix<ElemType>& out, const Matrix<ElemType>& in,
const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices,
Matrix<ElemType>& grad) const;
void MaxUnpooling(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, const Matrix<ElemType>& poolInput, Matrix<ElemType>& input) const;
void AveragePoolingForward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& output) const;
void AveragePoolingBackward(const Matrix<int>& mpRowCol, const Matrix<int>& mpRowIndices, const Matrix<int>& indices, Matrix<ElemType>& grad) const;

Просмотреть файл

@ -1809,6 +1809,11 @@ void GPUMatrix<ElemType>::MaxPoolingBackward(const GPUMatrix<ElemType>& out, con
{
}
template <class ElemType>
void GPUMatrix<ElemType>::MaxUnpooling(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, const GPUMatrix<ElemType>& poolInput, GPUMatrix<ElemType>& input) const
{
}
template <class ElemType>
void GPUMatrix<ElemType>::AveragePoolingForward(const GPUMatrix<int>& mpRowCol, const GPUMatrix<int>& mpRowIndices, const GPUMatrix<int>& indices, GPUMatrix<ElemType>& output) const
{

Просмотреть файл

@ -71,6 +71,7 @@
<SDLCheck>true</SDLCheck>
<TreatWarningAsError>true</TreatWarningAsError>
<OpenMPSupport>true</OpenMPSupport>
<DisableSpecificWarnings>4819</DisableSpecificWarnings>
</ClCompile>
<Link>
<SubSystem>Windows</SubSystem>
@ -146,4 +147,4 @@ if "$(UseZip)" == "true" if exist "$(ZLIB_PATH)\bin\zlib1.dll" (xcopy /I /D /Y "
<Warning Condition="!$(HasOpenCV)" Text="ImageReader requires the OpenCV library to build. Please see https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#opencv for installation instructions." />
<Warning Condition="!$(UseZip)" Text="zlib and libzip libraries were not found, ImageReader will be built without zip container support. Please see https://github.com/Microsoft/CNTK/wiki/Setup-CNTK-on-Windows#libzip for installation instructions." />
</Target>
</Project>
</Project>

Просмотреть файл

@ -8,6 +8,7 @@
#include "SpecialPurposeNodes.h" // for SequenceWithSoftmaxNode
#include "DataReaderHelpers.h"
#include "MatrixQuantizerImpl.h"
#ifdef CNTK_PARALLEL_TRAINING_SUPPORT
//static inline bool operator==(const std::pair<double,size_t>& a, double b) { assert(b==0); return a.first == b; }
// ^^ workaround until this line in AggregateGradientsImpl() gets updated: assert(headerCPU->evalErrors[i] == 0);
@ -2355,7 +2356,7 @@ SGDParams::SGDParams(const ConfigRecordType& configSGD, size_t sizeofElemType)
m_minibatchSearchCriterionErrorMargin = configAALR(L"minibatchSearchCriterionErrorMargin", (size_t) 1);
// the number of minibatches used to search
// the learning rate. It’s typically set to 10-20% of
// the learning rate. It's typically set to 10-20% of
// the total minibatches in an epoch.
m_numMiniBatch4LRSearch = configAALR(L"numMiniBatch4LRSearch", ConfigRecordType::Array(intargvector(vector<int>{500})));

Просмотреть файл

@ -43,6 +43,7 @@
</PrecompiledHeader>
<PreprocessorDefinitions>WIN32;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<PreprocessorDefinitions Condition="'$(CNTK_ENABLE_1BitSGD)'=='true'">QUANTIZED_GRADIENT_AGGREGATION;%(PreprocessorDefinitions)</PreprocessorDefinitions>
<DisableSpecificWarnings>4819</DisableSpecificWarnings>
</ClCompile>
<Link>
<AdditionalLibraryDirectories>$(MSMPI_LIB64);$(OutDir);$(NvmlLibPath)</AdditionalLibraryDirectories>

Просмотреть файл

@ -52,6 +52,7 @@
<ItemDefinitionGroup>
<ClCompile>
<AdditionalIncludeDirectories>$(SolutionDir)Source\Readers\ReaderLib;$(SolutionDir)Source\Math;$(SolutionDir)Source\Common\Include</AdditionalIncludeDirectories>
<DisableSpecificWarnings>4819</DisableSpecificWarnings>
</ClCompile>
<Link>
<AdditionalLibraryDirectories>$(OutDir)</AdditionalLibraryDirectories>
@ -109,4 +110,4 @@
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
</ImportGroup>
</Project>
</Project>

Просмотреть файл

@ -68,6 +68,7 @@
<ItemDefinitionGroup>
<ClCompile>
<AdditionalIncludeDirectories>$(BOOST_INCLUDE_PATH);$(SolutionDir)Source\Common\Include</AdditionalIncludeDirectories>
<DisableSpecificWarnings>4819</DisableSpecificWarnings>
</ClCompile>
<Link>
<AdditionalLibraryDirectories>$(OutDir)..;$(BOOST_LIB_PATH)</AdditionalLibraryDirectories>
@ -158,13 +159,13 @@
<CuDnnDll Condition="$(GpuBuild) And Exists('$(OutDir)..\cudnn64_4.dll')">$(OutDir)..\cudnn64_4.dll</CuDnnDll>
</PropertyGroup>
<ItemGroup>
<UnitTestDependencies Include="$(OutDir)CNTK.Core.BS;$(OutDir)..\evaldll.dll;$(OutDir)..\Math.dll;$(OutDir)..\libacml_mp_dll.dll;$(OutDir)..\libifcoremd.dll;$(OutDir)..\libifportmd.dll;$(OutDir)..\libiomp*.dll;$(OutDir)..\libmmd.dll;$(OutDir)..\svml_dispmd.dll;" />
<UnitTestDependencies Include="$(OutDir)CNTK.Core.BS;$(OutDir)..\evaldll.dll;$(OutDir)..\Math.dll;$(UnitTestDlls)" />
</ItemGroup>
<ItemGroup Condition="$(GpuBuild)">
<UnitTestDependencies Include="$(OutDir)CNTK.Core.BS;$(OutDir)..\evaldll.dll;$(OutDir)..\cuda*.dll;$(OutDir)..\svml_dispmd.dll;$(CuDnnDll);$(UnitTestDependencies)" />
<UnitTestDependencies Include="$(OutDir)CNTK.Core.BS;$(OutDir)..\evaldll.dll;$(OutDir)..\cuda*.dll;$(CuDnnDll);$(UnitTestDependencies)" />
</ItemGroup>
<Copy SourceFiles="@(UnitTestDependencies)" DestinationFolder="$(OutDir)" SkipUnchangedFiles="true">
<Output TaskParameter="DestinationFiles" ItemName="NewFileWrites" />
</Copy>
</Target>
</Project>
</Project>

Просмотреть файл

@ -175,7 +175,7 @@ BOOST_AUTO_TEST_CASE(ConvolutionForward)
std::generate(begin(data) + r * c, begin(data) + 2 * r * c, [&] { return nd(rng); });
buf.SetValue(r, 3 * c, buf.GetDeviceId(), data.data());
// Get center slice.
return buf.ColumnSlice(c, c).DeepClone();
return buf.ColumnSlice(c, c);
};
int baseDeviceId = 0;
@ -224,7 +224,7 @@ BOOST_AUTO_TEST_CASE(ConvolutionForward)
std::string emsg;
BOOST_REQUIRE_MESSAGE(!out.HasNan("out"), "out" << msgNan);
BOOST_REQUIRE_MESSAGE(CheckEqual(out, outB, emsg, relErr * 4, absErr * 8), "out" << msg << ". " << emsg);
BOOST_REQUIRE_MESSAGE(CheckEqual(out, outB, emsg, relErr * 4, absErr * 9), "out" << msg << ". " << emsg);
BOOST_REQUIRE_MESSAGE(CountNans(outBuf) == crowOut * 2 * n, "out" << msgNotNan);
}
}
@ -243,7 +243,7 @@ BOOST_AUTO_TEST_CASE(ConvolutionBackwardData)
std::generate(begin(data) + r * c, begin(data) + 2 * r * c, [&] { return nd(rng); });
buf.SetValue(r, 3 * c, buf.GetDeviceId(), data.data());
// Get center slice.
return buf.ColumnSlice(c, c).DeepClone();
return buf.ColumnSlice(c, c);
};
int baseDeviceId = 0;
@ -380,7 +380,7 @@ BOOST_AUTO_TEST_CASE(PoolingForward)
std::generate(begin(data) + r * c, begin(data) + 2 * r * c, [&] { return nd(rng); });
buf.SetValue(r, 3 * c, buf.GetDeviceId(), data.data());
// Get center slice.
return buf.ColumnSlice(c, c).DeepClone();
return buf.ColumnSlice(c, c);
};
int baseDeviceId = 0;
@ -499,6 +499,87 @@ BOOST_AUTO_TEST_CASE(PoolingBackward)
}
}
// Validates MaxUnpooling two ways for every generated pooling geometry:
//   1) CPU and GPU reference engines must produce identical unpooled outputs.
//   2) Re-pooling the unpooled tensor must reproduce the original pooling output
//      (max pooling is a left inverse of max unpooling for the max positions).
// Buffers are allocated 3x wide with NaN padding so over/underflow writes are detectable.
BOOST_AUTO_TEST_CASE(MaxUnpooling)
{
// NOTE(review): IntMatrix appears unused in this test body — confirm before removing.
using IntMatrix = Matrix<int>;
// Fixed seed keeps the test deterministic across runs.
std::mt19937 rng(0);
std::uniform_int_distribution<> batchSizeG(1, 8);
// Using uniform distribution with positive values to avoid issues with
// unpooling negative values.
std::uniform_real_distribution<float> nd(0, 1);
// Fills 'buf' as [NaN block | random block | NaN block] and returns a view of the
// center column slice; the NaN sentinels on both sides catch out-of-bounds writes.
auto initMat = [&](SingleMatrix& buf, size_t r, size_t c, vec& data) -> SingleMatrix
{
data.resize(r * 3 * c);
std::fill(begin(data), end(data), std::numeric_limits<float>::quiet_NaN());
std::generate(begin(data) + r * c, begin(data) + 2 * r * c, [&] { return nd(rng); });
buf.SetValue(r, 3 * c, buf.GetDeviceId(), data.data());
// Get center slice.
return buf.ColumnSlice(c, c);
};
// -1 selects the CPU device, 0 the first GPU.
int cpuDeviceId = -1;
int gpuDeviceId = 0;
for (const auto& g : GeneratePoolTestConfigs())
{
// cpuEng and gpuEng are used to compare results against each other.
auto cpuEng = ConvEng::Create(g, cpuDeviceId, ImageLayoutKind::CHW, 0, PoolKind::Max, ConvolutionEngineKind::Reference);
auto gpuEng = ConvEng::Create(g, gpuDeviceId, ImageLayoutKind::CHW, 0, PoolKind::Max, ConvolutionEngineKind::Reference);
size_t n = batchSizeG(rng);
vec buf;
buf.resize(g->InputShape().GetNumElements() * n);
std::generate(begin(buf), end(buf), [&] { return nd(rng); });
// Same random input data on both devices.
SingleMatrix inC(g->InputShape().GetNumElements(), n, buf.data(), cpuDeviceId, matrixFlagNormal);
SingleMatrix inG(g->InputShape().GetNumElements(), n, buf.data(), gpuDeviceId, matrixFlagNormal);
// First, compute max pooling output and corresponding mask.
SingleMatrix outC(g->OutputShape().GetNumElements(), n, cpuDeviceId);
SingleMatrix outG(g->OutputShape().GetNumElements(), n, gpuDeviceId);
cpuEng->ForwardPooling(inC, outC);
gpuEng->ForwardPooling(inG, outG);
// Second, do the unpooling.
size_t crowIn = g->InputShape().GetNumElements();
SingleMatrix inUBufC(cpuDeviceId);
SingleMatrix inUC = initMat(inUBufC, crowIn, n, buf);
// GPU buffer starts from an identical copy of the CPU buffer so both sides unpool into the same state.
SingleMatrix inUBufG(inUBufC.DeepClone(), gpuDeviceId);
SingleMatrix inUG = initMat(inUBufG, crowIn, n, buf);
cpuEng->MaxUnpooling(outC, inC, inUC);
gpuEng->MaxUnpooling(outG, inG, inUG);
// Check that CPU/GPU results are the same.
std::stringstream tmsg;
tmsg << "Geometry: " << (std::string)(*g) << ", Batch: " << n;
std::string msg = " are not equal, " + tmsg.str();
std::string msgNan = " has NaNs, " + tmsg.str();
std::string msgNotNan = " has buffer overflow/underflow, " + tmsg.str();
// Zero tolerances: unpooling only moves values around, so results must match exactly.
float relErr = 0;
float absErr = 0;
std::string emsg;
BOOST_REQUIRE_MESSAGE(!inUC.HasNan("inUC"), "inUC" << msgNan);
BOOST_REQUIRE_MESSAGE(!inUG.HasNan("inUG"), "inUG" << msgNan);
BOOST_REQUIRE_MESSAGE(CheckEqual(inUC, inUG, emsg, relErr, absErr), "inU" << msg << ". " << emsg);
// The two NaN sentinel blocks (left and right of the center slice) must be untouched.
BOOST_REQUIRE_MESSAGE(CountNans(inUBufC) == crowIn * 2 * n, "inUBufC" << msgNotNan);
BOOST_REQUIRE_MESSAGE(CountNans(inUBufG) == crowIn * 2 * n, "inUBufG" << msgNotNan);
// Now do the pooling from unpooled source and compare with original pooling.
SingleMatrix outC_2(g->OutputShape().GetNumElements(), n, cpuDeviceId);
SingleMatrix outG_2(g->OutputShape().GetNumElements(), n, gpuDeviceId);
cpuEng->ForwardPooling(inUC, outC_2);
gpuEng->ForwardPooling(inUG, outG_2);
BOOST_REQUIRE_MESSAGE(CheckEqual(outC_2, outC, emsg, relErr, absErr), "outC_2" << msg << ". " << emsg);
BOOST_REQUIRE_MESSAGE(CheckEqual(outG_2, outG, emsg, relErr, absErr), "outG_2" << msg << ". " << emsg);
}
}
BOOST_AUTO_TEST_SUITE_END()
} } } }

Просмотреть файл

@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" InitialTargets="CheckDependencies" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
@ -67,6 +67,7 @@
<ItemDefinitionGroup>
<ClCompile>
<AdditionalIncludeDirectories>$(BOOST_INCLUDE_PATH);$(SolutionDir)Source\Common\Include</AdditionalIncludeDirectories>
<DisableSpecificWarnings>4819</DisableSpecificWarnings>
</ClCompile>
<Link>
<AdditionalLibraryDirectories>$(OutDir)..;$(BOOST_LIB_PATH)</AdditionalLibraryDirectories>
@ -168,13 +169,13 @@
<CuDnnDll Condition="$(GpuBuild) And Exists('$(OutDir)..\cudnn64_4.dll')">$(OutDir)..\cudnn64_4.dll</CuDnnDll>
</PropertyGroup>
<ItemGroup>
<UnitTestDependencies Include="$(OutDir)..\Math.dll;$(OutDir)..\libacml_mp_dll.dll;$(OutDir)..\libifcoremd.dll;$(OutDir)..\libifportmd.dll;$(OutDir)..\libiomp*.dll;$(OutDir)..\libmmd.dll;$(OutDir)..\svml_dispmd.dll;" />
<UnitTestDependencies Include="$(OutDir)..\Math.dll;$(UnitTestDlls);" />
</ItemGroup>
<ItemGroup Condition="$(GpuBuild)">
<UnitTestDependencies Include="$(OutDir)..\cuda*.dll;$(OutDir)..\svml_dispmd.dll;$(CuDnnDll);$(UnitTestDependencies)" />
<UnitTestDependencies Include="$(OutDir)..\cuda*.dll;$(CuDnnDll);$(UnitTestDependencies)" />
</ItemGroup>
<Copy SourceFiles="@(UnitTestDependencies)" DestinationFolder="$(OutDir)" SkipUnchangedFiles="true">
<Output TaskParameter="DestinationFiles" ItemName="NewFileWrites" />
</Copy>
</Target>
</Project>
</Project>

Просмотреть файл

@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" InitialTargets="CheckDependencies" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
@ -58,6 +58,7 @@
<UseFullPaths>true</UseFullPaths>
<OpenMPSupport>true</OpenMPSupport>
<AdditionalIncludeDirectories>$(MSMPI_INC);$(SolutionDir)Source\Readers\ReaderLib;$(SolutionDir)Source\Common\Include;$(SolutionDir)Source\Math;$(SolutionDir)Source\ActionsLib;$(SolutionDir)Source\ComputationNetworkLib;$(SolutionDir)Source\CNTK\BrainScript;$(BOOST_INCLUDE_PATH)</AdditionalIncludeDirectories>
<DisableSpecificWarnings>4819</DisableSpecificWarnings>
</ClCompile>
<Link>
<SubSystem>Console</SubSystem>
@ -134,10 +135,10 @@
</Target>
<Target Name="CopyUnitTestDependencies" AfterTargets="Build">
<ItemGroup>
<UnitTestDependencies Include="$(OutDir)..\Math.dll;$(OutDir)..\libacml_mp_dll.dll;$(OutDir)..\libifcoremd.dll;$(OutDir)..\libifportmd.dll;$(OutDir)..\libiomp*.dll;$(OutDir)..\libmmd.dll;$(OutDir)..\svml_dispmd.dll;" />
<UnitTestDependencies Include="$(OutDir)..\Math.dll;$(UnitTestDlls);" />
</ItemGroup>
<Copy SourceFiles="@(UnitTestDependencies)" DestinationFolder="$(OutDir)" SkipUnchangedFiles="true">
<Output TaskParameter="DestinationFiles" ItemName="NewFileWrites" />
</Copy>
</Target>
</Project>
</Project>

Просмотреть файл

@ -1,4 +1,4 @@
<?xml version="1.0" encoding="utf-8"?>
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" InitialTargets="CheckDependencies" ToolsVersion="12.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
@ -262,7 +262,7 @@
</PropertyGroup>
<Target Name="CopyUnitTestDependencies" AfterTargets="Build">
<ItemGroup>
<UnitTestDependencies Include="$(OutDir)..\Math.dll;$(OutDir)..\ucifastreader.dll;$(OutDir)..\cntktextformatreader.dll;$(OutDir)..\htkmlfreader.dll;$(OutDir)..\HTKDeserializers.dll;$(OutDir)..\compositedatareader.dll;$(OutDir)..\libacml_mp_dll.dll;$(OutDir)..\libifcoremd.dll;$(OutDir)..\libifportmd.dll;$(OutDir)..\libiomp*.dll;$(OutDir)..\libmmd.dll;$(OutDir)..\svml_dispmd.dll;$(ImageReaderDependencies);" />
<UnitTestDependencies Include="$(OutDir)..\Math.dll;$(OutDir)..\ucifastreader.dll;$(OutDir)..\cntktextformatreader.dll;$(OutDir)..\htkmlfreader.dll;$(OutDir)..\HTKDeserializers.dll;$(OutDir)..\compositedatareader.dll;$(UnitTestDlls);$(ImageReaderDependencies);" />
</ItemGroup>
<Copy SourceFiles="@(UnitTestDependencies)" DestinationFolder="$(OutDir)" SkipUnchangedFiles="true">
<Output TaskParameter="DestinationFiles" ItemName="NewFileWrites" />

Просмотреть файл

@ -1,4 +1,12 @@
#!/bin/bash
#
# Copyright (c) Microsoft. All rights reserved.
#
# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================
#
# TODO --math-library support on Windows
# Setting some default values
BUILD=1
@ -9,6 +17,7 @@ RANDOM_OUTPUT=0
CODE_COVERAGE=no
FLAVORS="debug:release"
TARGETS="cpu:gpu"
MATH_LIBRARY="acml"
TESTTARGETS="cpu:gpu"
# parsing command line arguments:
@ -25,6 +34,7 @@ case $key in
echo " -b|--build-only - just build, do not run"
echo " -f|--flavors <flavor1:flavor2...> - which flavor to build (by default $FLAVORS)"
echo " -t|--targets <target1:target2...> - which target to build (by default $TARGETS)"
echo " -m|--math-library <mathlibrary> - which math library to build with (by default $MATH_LIBRARY)"
echo " -tt|--test-targets <testtarget1:testtarget2...> - which target to test (by default $TESTTARGETS)"
echo " -cc|--code-coverage - build with support for code coverage (gcov)"
echo " -cb|--clean-build - clean up the enlistment binaries before build"
@ -64,6 +74,24 @@ case $key in
TARGETS="${2,,}"
shift # past argument
;;
-m|--math-library)
# Remember the requested library (lowercased, matching how other options
# normalize $2) so diagnostics report what the user actually asked for.
MATH_LIBRARY="${2,,}"
case $MATH_LIBRARY in
acml)
MATH_LIBRARY_OPTION="--with-acml=$ACML_PATH"
;;
mkl)
MATH_LIBRARY_OPTION="--with-mkl=$MKL_PATH"
;;
mkl-sequential)
MATH_LIBRARY_OPTION="--with-mkl-sequential=$MKL_PATH"
;;
*)
# Fix: previously this echoed $MATH_LIBRARY while it still held the
# default ("acml"), hiding the invalid value the user passed.
echo "Unknown math library $MATH_LIBRARY"
exit 1
;;
esac
shift # past argument
;;
-tt|--test-targets)
TESTTARGETS="${2,,}"
shift # past argument
@ -76,7 +104,7 @@ case $key in
shift # past argument
;;
*)
echo Unkown option $key
echo Unknown option $key
exit 1
;;
esac
@ -86,7 +114,7 @@ done
# Step 0 -- Validate all necessary prerequisites and check for incompatible options
# It is possible to use this script on Windows to build CNTK
# from Cygwin window with Visual C++ environment loaded.
# In that case OS environment variable will be set and we
# In that case OS environment variable will be set and we
# can use it to differentiate from Linux.
if [[ $CLEAN_BEFORE == 1 && $RUN == 1 && $BUILD == 0 ]]; then
echo "============ ERROR: Incompatible options RUN and CLEAN_BEFORE set without BUILD ============"
@ -99,7 +127,7 @@ if [[ $OS == "Windows_NT" && $OSTYPE == "cygwin" ]]; then
PREFIX_DIR=x64
BIN_NAME=CNTK.exe
BUILD_OS="windows"
if [[ $VS120COMNTOOLS == "" ]]; then
echo "============ Visual Studio 12.0 environment not properly setup or VS not installed ============"
echo "============ Please find and run the appropriate vcvarsall.bat script ============"
@ -224,7 +252,7 @@ if [[ $BUILD == 1 ]]; then
OneBitSGDOPT=yes
fi
fi
./configure --with-build-top=$BUILD_DIR --with-acml=$ACML_PATH --with-buildtype=$FLAVOR --cuda=$CUDAOPT --with-code-coverage=$CODE_COVERAGE --1bitsgd=$OneBitSGDOPT
./configure --with-build-top=$BUILD_DIR ${MATH_LIBRARY_OPTION} --with-buildtype=$FLAVOR --cuda=$CUDAOPT --with-code-coverage=$CODE_COVERAGE --1bitsgd=$OneBitSGDOPT
if [[ $CLEAN_BEFORE == 1 ]]; then
make -C $BUILD_DIR -f $MAKEFILE clean 1>&6 2>&7 || exit $?
fi
@ -278,7 +306,7 @@ if [[ $RUN == 1 ]]; then
fi
OUT_FILE="$RUN_FILE.$FLAVOR.$TARGET.$TESTTARGET.out"
BIN_PATH=$CNTK_ROOT/$PREFIX_DIR/$FLAVOR_DIR/$BIN_NAME
BIN_PATH=$CNTK_ROOT/$PREFIX_DIR/$FLAVOR_DIR/$BIN_NAME
if ! [[ -f $BIN_PATH ]]; then
echo "============ ERROR: CNTK did not build properly for $TARGET/$FLAVOR ============"
echo "Missing file: $BIN_PATH"

Просмотреть файл

@ -1,5 +1,11 @@
#!/bin/bash
#
# Copyright (c) Microsoft. All rights reserved.
#
# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================
#
# Description: this script is used to generated buildinfo.h in Source/CNTK
# which will contain the following information to be displayed at runtime:
# BUILDTYPE (release/debug)
@ -24,7 +30,7 @@ usage ()
echo "This script assumes git can be used"
echo "This script assumes Config.make has been made"
echo "-------------------------------------------------------------------"
if [ ! -z "$1" ] ; then
if [ ! -z "$1" ] ; then
echo "ERROR message: $1"
fi
exit 1
@ -60,20 +66,20 @@ makebuildinfo()
printf "#define _MATHLIB_ \"%s\"\n" "$MATHLIB"
printf "#define _BUILDSHA1_ \"%s\"\n" "$GIT_COMMIT"
printf "#define _BUILDBRANCH_ \"%s\"\n" "$GIT_BRANCH"
if [ -z "$CUDA_PATH" ]; then
if [ -z "$CUDA_PATH" ]; then
printf "#define _BUILDTARGET_ \"CPU-only\"\n"
else
printf "#define _BUILDTARGET_ \"GPU\"\n"
printf "#define _CUDA_PATH_ \"%s\"\n" "$CUDA_PATH"
fi
if [ ! -z "$CUB_PATH" ]; then
if [ ! -z "$CUB_PATH" ]; then
printf "#define _CUB_PATH_ \"%s\"\n" "$CUB_PATH"
fi
if [ ! -z "$CUDNN_PATH" ]; then
if [ ! -z "$CUDNN_PATH" ]; then
printf "#define _CUDNN_PATH_ \"%s\"\n" $CUDNN_PATH
fi
printf "#define _BUILDTYPE_ \"%s\"\n" "$BUILDTYPE"
if [ ! -z "$WITH_1BITSGD" ]; then
if [ ! -z "$WITH_1BITSGD" ]; then
printf "#define _WITH_1BITSGD_ \"yes\"\n"
else
printf "#define _WITH_1BITSGD_ \"no\"\n"
@ -87,48 +93,53 @@ makebuildinfo()
#//////////////////////////////////////////////////////#
# main function #
#//////////////////////////////////////////////////////#
if [ $# -ne 1 ]; then
usage
#//////////////////////////////////////////////////////#
if [ $# -ne 1 ]; then
usage
fi
config=$1
# 1. check whether we have git and what is the sha-1 value
# Check whether we have git and what is the SHA-1 value
if Has_Git; then has_git=1; else has_git=0; usage "git does not exist"; fi
GIT_STATUS=' (modified)'
git diff --quiet && git diff --cached --quiet && GIT_STATUS=''
GIT_COMMIT=`git rev-parse HEAD`$GIT_STATUS
GIT_BRANCH=`git rev-parse --abbrev-ref HEAD`
# 2. looking into Config.make
if [ ! -e $config ] ; then
# Looking into Config.make
if [ ! -e $config ] ; then
usage "Config.make not exists"
fi
source $config
# 3. whether we have CUDA_PATH
if [ -z "${CUDA_PATH+x}" ]; then
# Whether we have CUDA_PATH
if [ -z "${CUDA_PATH+x}" ]; then
CUDAPATH=""
else
CUDAPATH=$CUDA_PATH
fi
# 4. whether we have CUB_PATH
if [ -z "${CUB_PATH+x}" ]; then
# Whether we have CUB_PATH
if [ -z "${CUB_PATH+x}" ]; then
CUBPATH=""
else
CUBPATH=$CUB_PATH
fi
# 5. Build machine info
# Identify MKL variant being used
if [ "$MATHLIB" = "mkl" -a "$MKL_THREADING" = "sequential" ]; then
MATHLIB=mkl-sequential
fi
# Build machine info
BUILDER=$USER
BUILDMACHINE=`hostname`
BUILDPATH=`pwd`
# 6. make buildinfo.h (only update if changed)
# Make buildinfo.h (only update if changed)
target=Source/CNTK/buildinfo.h
if [ ! -d Source ] ; then
if [ ! -d Source ] ; then
usage
fi

37
configure поставляемый
Просмотреть файл

@ -1,4 +1,11 @@
#!/bin/bash
#
# Copyright (c) Microsoft. All rights reserved.
#
# Licensed under the MIT license. See LICENSE.md file in the project root
# for full license information.
# ==============================================================================
#
configure=$0
build_top=$PWD
@ -12,9 +19,12 @@ have_acml=no
acml_path=
acml_check=include/acml.h
# CNTK Custom MKL Version
cntk_custom_mkl_version=1
have_mkl=no
mkl_path=
mkl_check=mkl/include/mkl.h
mkl_check=$cntk_custom_mkl_version/include/mkl.h
# Experimental OpenBLAS support.
have_openblas=no
@ -23,7 +33,7 @@ openblas_check=include/openblas_config.h
have_kaldi=no
kaldi_path=
kaldi_check=src/kaldi.mk
kaldi_check=src/kaldi.mk
have_buildtype=no
buildtype=
@ -58,11 +68,11 @@ default_use_code_coverage=no
enable_code_coverage=$default_use_code_coverage
# List from best to worst choice
default_path_list="/usr /usr/local /opt /opt/local /opt/intel"
default_path_list="/usr /usr/local /opt /opt/local"
# List from best to worst choice
default_acmls="acml5.3.1/ifort64_mp"
default_mkls=""
default_mkls="CNTKCustomMKL"
default_openblas=""
# NOTE: Will get compilation errors with cuda-6.0
@ -171,7 +181,7 @@ function is_hardlinked ()
echo $r
}
function default_use_cuda ()
function default_use_cuda ()
{
if test x$(find_cuda) = x || test x$(find_gdk) = x
then
@ -182,7 +192,7 @@ function default_use_cuda ()
}
enable_cuda=$(default_use_cuda)
function show_default ()
function show_default ()
{
if test x$1 = x
then
@ -207,6 +217,7 @@ function show_help ()
echo " --with-cudnn[=directory] $(show_default $(find_cudnn))"
echo " --with-acml[=directory] $(show_default $(find_acml))"
echo " --with-mkl[=directory] $(show_default $(find_mkl))"
echo " --with-mkl-sequential[=directory] $(show_default $(find_mkl))"
echo " --with-openblas[=directory] (experimental) $(show_default $(find_openblas))"
echo " --with-buildtype=(debug|release) $(show_default $default_buildtype)"
echo " --with-kaldi[=directory] $(show_default $(find_kaldi))"
@ -392,12 +403,16 @@ do
--with-mkl*)
have_mkl=yes
mathlib=mkl
mkl_threading=parallel
case $key in
--with-mkl-sequential*) mkl_threading=sequential ;;
esac
if test x$optarg = x
then
mkl_path=$(find_mkl)
if test x$mkl_path = x
then
echo "Cannot find mkl directory"
echo "Cannot find CNTK custom MKL directory"
echo "Please specify a value for --with-mkl"
exit 1
fi
@ -406,7 +421,7 @@ do
then
mkl_path=$optarg
else
echo "Invalid mkl directory $optarg"
echo "Invalid CNTK custom MKL directory $optarg"
exit 1
fi
fi
@ -534,7 +549,7 @@ then
if test x$mkl_path = x
then
echo "Cannot find a CPU math library."
echo "Please specify --with-acml, --with-mkl, --with-openblas with a path."
echo "Please specify --with-acml, --with-mkl, --with-mkl-sequential, --with-openblas with a path."
exit 1
else
mathlib=mkl
@ -637,6 +652,8 @@ case $mathlib in
;;
mkl)
echo MKL_PATH=$mkl_path >> $config
echo MKL_THREADING=$mkl_threading >> $config
echo CNTK_CUSTOM_MKL_VERSION=$cntk_custom_mkl_version >> $config
;;
openblas)
echo OPENBLAS_PATH=$openblas_path >> $config
@ -678,6 +695,6 @@ then
echo all clean : >> $makefile
printf '\t$(MAKE) -C $(dir) BUILD_TOP=$(BUILD_TOP) $@\n' >> $makefile
fi
echo run
echo run
echo '>make -j all'
echo to build