CNTK now supports CUDA 9/cuDNN 7. This requires an update of the build environment to Ubuntu 16/GCC 5 for Linux, and Visual Studio 2017/VCTools 14.11 for Windows. With CUDA 9, CNTK also adds a preview of 16-bit floating-point (a.k.a. FP16) computation.

Please check out the FP16 example for ResNet50 at /Examples/Image/Classification/ResNet/Python/TrainResNet_ImageNet_Distributed.py

Notes on the FP16 preview:
* The FP16 implementation on CPU is not optimized and is not intended for direct CPU inference. Convert the model to 32-bit floating point before running it on the CPU.
* The loss/criterion for FP16 training needs to be 32-bit so that it can accumulate without overflow; use the cast function (see the example above and the sketch after these notes).
* Readers do not produce FP16 output; unless you feed data through numpy, a cast from FP32 to FP16 is needed. Please check the example above.
* FP16 gradient aggregation is currently implemented only on GPUs using NCCL2. Distributed FP16 training with MPI is not supported.
* FP16 math is a subset of the current FP32 implementation. Some models may raise a Feature Not Implemented exception when using FP16.
* FP16 is currently not supported in BrainScript. Please use Python for FP16.
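
The casting pattern used in the example above, condensed into a minimal sketch (the small Dense model, input shapes, and variable names here are illustrative assumptions, not taken from the example script):

    import numpy as np
    import cntk as C

    # Readers and numpy feeds stay in FP32; declare the graph inputs as usual.
    input_var = C.input_variable((784,), name='features')   # illustrative shape
    label_var = C.input_variable((10,), name='labels')

    # Cast the inputs to FP16 and build the model under an FP16 default dtype.
    graph_input = C.cast(input_var, dtype=np.float16)
    graph_label = C.cast(label_var, dtype=np.float16)
    with C.default_options(dtype=np.float16):
        z = C.layers.Dense(10)(graph_input)   # stand-in for a real network such as ResNet

    # Cast the loss/metric back to FP32 so accumulation does not overflow.
    ce = C.cast(C.cross_entropy_with_softmax(z, graph_label), dtype=np.float32)
    pe = C.cast(C.classification_error(z, graph_label), dtype=np.float32)

The cast loss/metric are then passed to the trainer as usual, as in the modified ResNet scripts in this commit.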

To set up the build and runtime environment on Windows:
* Install [Visual Studio 2017](https://www.visualstudio.com/downloads/) with the following workloads and components. From the command line (using the Community edition installer as an example):
    vs_community.exe --add Microsoft.VisualStudio.Workload.NativeDesktop --add Microsoft.VisualStudio.Workload.ManagedDesktop --add Microsoft.VisualStudio.Workload.Universal --add Microsoft.Component.PythonTools --add Microsoft.VisualStudio.Component.VC.Tools.14.11
* Install [NVIDIA CUDA 9](https://developer.nvidia.com/cuda-90-download-archive?target_os=Windows&target_arch=x86_64)
* From PowerShell, run:
    /Tools/devInstall/Windows/DevInstall.ps1
* Start the VCTools 14.11 command line by running:
    cmd /k "%VS2017INSTALLDIR%\VC\Auxiliary\Build\vcvarsall.bat" x64 --vcvars_ver=14.11
* Open /CNTK.sln from the VCTools 14.11 command line. Note that opening CNTK.sln from anywhere other than the VCTools 14.11 command line causes a CUDA 9 [build error](https://developercommunity.visualstudio.com/content/problem/163758/vs-2017-155-doesnt-support-cuda-9.html).

To set up the build and runtime environment on Linux using Docker, please build an Ubuntu 16.04 Docker image using the Dockerfiles under /Tools/docker. For other Linux systems, please refer to the Dockerfiles to set up the dependent libraries for CNTK.
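
Once the environment is built, a quick sanity check from Python (a minimal convenience sketch using the standard CNTK device API) confirms that the library and the GPU are visible:

    import cntk as C

    # Print the installed CNTK version and every device the library can see
    # (the CPU plus any GPUs exposed by the CUDA 9 runtime).
    print(C.__version__)
    for dev in C.device.all_devices():
        print(dev.type(), dev.id())
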
Author: KeDengMS, 2018-01-22 16:58:56 -08:00
Parent: 3765da9ae7
Commit: 3cf3af5df6
297 changed files: 154699 additions and 135575 deletions

.gitattributes (vendored): 1 change
View File

@ -65,6 +65,7 @@ Makefile text
*.asax text
*.h text
*.hpp text
*.cpp text
*.cc text
*.cu text

View File

@ -3,19 +3,10 @@
<Import Project="$(SolutionDir)\CNTK.Common.props" />
<PropertyGroup>
<CudaVersion />
<CudaVersion Condition="Exists('$(CUDA_PATH_V8_0)') And '$(CudaVersion)' == ''">8.0</CudaVersion>
<CudaVersion Condition="Exists('$(CUDA_PATH_V7_5)') And '$(CudaVersion)' == ''">7.5</CudaVersion>
<NvmlInclude />
<NvmlInclude Condition="'$(CudaVersion)' == '7.5'">"c:\Program Files\NVIDIA Corporation\GDK\gdk_win7_amd64_release\nvml\include"</NvmlInclude>
<NvmlInclude Condition="'$(CudaVersion)' == '8.0'" />
<NvmlLibPath />
<NvmlLibPath Condition="'$(CudaVersion)' == '7.5'">"c:\Program Files\NVIDIA Corporation\GDK\gdk_win7_amd64_release\nvml\lib"</NvmlLibPath>
<NvmlLibPath Condition="'$(CudaVersion)' == '8.0'" />
<CudaVersion Condition="Exists('$(CUDA_PATH_V9_0)') And '$(CudaVersion)' == ''">9.0</CudaVersion>
<NvmlDll>%ProgramW6432%\NVIDIA Corporation\NVSMI\nvml.dll</NvmlDll>
<NvmlDll Condition="Exists('c:\local\bindrop\NVSMI\nvml.dll')">c:\local\bindrop\NVSMI\nvml.dll</NvmlDll>
<NvmlDll Condition="Exists('c:\local\nvsmi9\NVSMI\nvml.dll')">c:\local\nvsmi9\NVSMI\nvml.dll</NvmlDll>
<HasOpenCv>false</HasOpenCv>
<HasOpenCv Condition="Exists('$(OPENCV_PATH)') Or Exists('$(OPENCV_PATH_V31)')">true</HasOpenCv>
@ -65,16 +56,20 @@
<PropertyGroup Condition="!$(IsUWP)">
<MathLibrary>MKL</MathLibrary>
<MathIncludePath>$(MKLML_PATH)\include</MathIncludePath>
<MathIncludePath>$(MKL_PATH)\include</MathIncludePath>
<MathDefine>USE_MKL</MathDefine>
<!-- Only non-UWP configurations consume PerformanceProfiler -->
<ReaderLibs>Cntk.PerformanceProfiler-$(CntkComponentVersion).lib;$(ReaderLibs)</ReaderLibs>
<MathLibraryName>MKL-ML Library</MathLibraryName>
<MathLibraryPath>$(MKLML_PATH)\lib</MathLibraryPath>
<MathLibraryName>MKL Library</MathLibraryName>
<MathLibraryPath>$(MKL_PATH)\lib</MathLibraryPath>
<MathLinkLibrary>mklml.lib</MathLinkLibrary>
<MathDelayLoad>mklml.dll</MathDelayLoad>
<MathPostBuildCopyPattern>$(MathLibraryPath)\*.dll</MathPostBuildCopyPattern>
<UnitTestDlls>$(OutDir)mklml.lib;$(OutDir)libiomp5md.dll;</UnitTestDlls>
<HasMklDnn>false</HasMklDnn>
<HasMklDnn Condition="Exists('$(MKL_PATH)\include\mkldnn.h')">true</HasMklDnn>
<MathDefine Condition="$(HasMklDnn)">$(MathDefine);USE_MKLDNN</MathDefine>
<MathLinkLibrary Condition="$(HasMklDnn)">$(MathLinkLibrary);mkldnn.lib</MathLinkLibrary>
<MathDelayLoad Condition="$(HasMklDnn)">$(MathDelayLoad);mkldnn.dll</MathDelayLoad>
</PropertyGroup>
<PropertyGroup Condition="$(UseZip)">
<ZipInclude>$(ZLIB_PATH)\include;$(ZLIB_PATH)\lib\libzip\include;</ZipInclude>
@ -109,31 +104,19 @@
<ProtobufLib Condition="$(DebugBuild)">libprotobufd.lib</ProtobufLib>
</PropertyGroup>
<PropertyGroup Condition="'$(CudaVersion)' == '8.0'">
<CudaPath>$(CUDA_PATH_V8_0)</CudaPath>
<CudaRuntimeDll>cudart64_80.dll</CudaRuntimeDll>
<CudaDlls>cublas64_80.dll;cusparse64_80.dll;curand64_80.dll;$(CudaRuntimeDll)</CudaDlls>
<PropertyGroup Condition="'$(CudaVersion)' == '9.0'">
<CudaPath>$(CUDA_PATH_V9_0)</CudaPath>
<CudaRuntimeDll>cudart64_90.dll</CudaRuntimeDll>
<CudaDlls>cublas64_90.dll;cusparse64_90.dll;curand64_90.dll;$(CudaRuntimeDll)</CudaDlls>
<!-- Use NvidiaCompute to define nvcc target architectures (will generate code to support them all, i.e. fat-binary, in release mode)
In debug mode we only include cubin/PTX for 30 and rely on PTX / JIT to generate the required native cubin format
http://docs.nvidia.com/cuda/pascal-compatibility-guide/index.html#building-applications-with-pascal-support -->
<NvidiaCompute Condition="$(DebugBuild)">$(CNTK_CUDA_CODEGEN_DEBUG)</NvidiaCompute>
<NvidiaCompute Condition="$(DebugBuild) And '$(NvidiaCompute)'==''">compute_30,sm_30</NvidiaCompute>
<NvidiaCompute Condition="$(ReleaseBuild)">$(CNTK_CUDA_CODEGEN_RELEASE)</NvidiaCompute>
<NvidiaCompute Condition="$(ReleaseBuild) And '$(NvidiaCompute)'==''">compute_30,sm_30;compute_35,sm_35;compute_50,sm_50;compute_60,sm_60;compute_61,sm_61</NvidiaCompute>
</PropertyGroup>
<PropertyGroup Condition="'$(CudaVersion)' == '7.5'">
<CudaPath>$(CUDA_PATH_V7_5)</CudaPath>
<CudaRuntimeDll>cudart64_75.dll</CudaRuntimeDll>
<CudaDlls>cublas64_75.dll;cusparse64_75.dll;curand64_75.dll;$(CudaRuntimeDll)</CudaDlls>
<NvidiaCompute Condition="$(DebugBuild)">$(CNTK_CUDA_CODEGEN_DEBUG)</NvidiaCompute>
<NvidiaCompute Condition="$(DebugBuild) And '$(NvidiaCompute)'==''">compute_30,sm_30</NvidiaCompute>
<NvidiaCompute Condition="$(ReleaseBuild)">$(CNTK_CUDA_CODEGEN_RELEASE)</NvidiaCompute>
<NvidiaCompute Condition="$(ReleaseBuild) And '$(NvidiaCompute)'==''">compute_30,sm_30;compute_35,sm_35;compute_50,sm_50</NvidiaCompute>
<NvidiaCompute Condition="$(ReleaseBuild) And '$(NvidiaCompute)'==''">compute_30,sm_30;compute_35,sm_35;compute_50,sm_50;compute_60,sm_60;compute_61,sm_61;compute_70,sm_70</NvidiaCompute>
</PropertyGroup>
<PropertyGroup>
@ -144,11 +127,14 @@
<CudaMsbuildPath Condition="'$(CudaMsbuildPath)' == ''">$(VCTargetsPath)\BuildCustomizations</CudaMsbuildPath>
</PropertyGroup>
<PropertyGroup>
<PlatformToolset>v141</PlatformToolset>
</PropertyGroup>
<!-- TODO warn if ConfigurationType not (yet) defined -->
<PropertyGroup Condition="'$(ConfigurationType)' == 'StaticLibrary'">
<UseDebugLibraries>$(DebugBuild)</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
<WholeProgramOptimization>$(ReleaseBuild)</WholeProgramOptimization>
<LinkIncremental>$(DebugBuild)</LinkIncremental>
@ -159,6 +145,7 @@
<PreprocessorDefinitions>CNTK_COMPONENT_VERSION="$(CntkComponentVersion)"</PreprocessorDefinitions>
<!-- UWP does not use MPI -->
<PreprocessorDefinitions Condition="!$(IsUWP)">%(PreprocessorDefinitions);HAS_MPI=1</PreprocessorDefinitions>
<PreprocessorDefinitions Condition="'$(CudaVersion)' == '9.0'">%(PreprocessorDefinitions);CUDA_NO_HALF;__CUDA_NO_HALF_OPERATORS__</PreprocessorDefinitions>
</ClCompile>
</ItemDefinitionGroup>

CNTK.sln: 780 changes
The diff for this file is not shown because it is too large.

View File

@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
@ -39,13 +39,11 @@
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>

View File

@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
@ -39,13 +39,11 @@
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>

View File

@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
@ -39,13 +39,11 @@
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>

View File

@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
@ -42,13 +42,11 @@
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>

View File

@ -56,7 +56,7 @@ def create_image_mb_source(map_file, mean_file, train, total_number_of_samples):
# Train and evaluate the network.
def train_and_evaluate(reader_train, reader_test, network_name, epoch_size, max_epochs, profiler_dir=None,
model_dir=None, log_dir=None, tensorboard_logdir=None, gen_heartbeat=False):
model_dir=None, log_dir=None, tensorboard_logdir=None, gen_heartbeat=False, fp16=False):
set_computation_network_trace_level(0)
@ -64,19 +64,32 @@ def train_and_evaluate(reader_train, reader_test, network_name, epoch_size, max_
input_var = C.input_variable((num_channels, image_height, image_width), name='features')
label_var = C.input_variable((num_classes))
# create model, and configure learning parameters
if network_name == 'resnet20':
z = create_cifar10_model(input_var, 3, num_classes)
lr_per_mb = [1.0]*80 + [0.1]*40 + [0.01]
elif network_name == 'resnet110':
z = create_cifar10_model(input_var, 18, num_classes)
lr_per_mb = [0.1]*1 + [1.0]*80 + [0.1]*40 + [0.01]
dtype = np.float16 if fp16 else np.float32
if fp16:
graph_input = C.cast(input_var, dtype=np.float16)
graph_label = C.cast(label_var, dtype=np.float16)
else:
raise RuntimeError("Unknown model name!")
graph_input = input_var
graph_label = label_var
# loss and metric
ce = cross_entropy_with_softmax(z, label_var)
pe = classification_error(z, label_var)
with C.default_options(dtype=dtype):
# create model, and configure learning parameters
if network_name == 'resnet20':
z = create_cifar10_model(graph_input, 3, num_classes)
lr_per_mb = [1.0]*80 + [0.1]*40 + [0.01]
elif network_name == 'resnet110':
z = create_cifar10_model(graph_input, 18, num_classes)
lr_per_mb = [0.1]*1 + [1.0]*80 + [0.1]*40 + [0.01]
else:
raise RuntimeError("Unknown model name!")
# loss and metric
ce = cross_entropy_with_softmax(z, graph_label)
pe = classification_error(z, graph_label)
if fp16:
ce = C.cast(ce, dtype=np.float32)
pe = C.cast(pe, dtype=np.float32)
# shared training parameters
minibatch_size = 128
@ -168,6 +181,7 @@ if __name__=='__main__':
parser.add_argument('-outputdir', '--outputdir', help='Output directory for checkpoints and models', required=False, default=None)
parser.add_argument('-logdir', '--logdir', help='Log file', required=False, default=None)
parser.add_argument('-genheartbeat', '--genheartbeat', help="Turn on heart-beat for philly", action='store_true', default=False)
parser.add_argument('-fp16', '--fp16', help="use float16", action='store_true', default=False)
args = vars(parser.parse_args())
epochs = args['epochs']
@ -184,4 +198,4 @@ if __name__=='__main__':
reader_test = create_image_mb_source(os.path.join(data_path, 'test_map.txt'), os.path.join(data_path, 'CIFAR-10_mean.xml'), False, total_number_of_samples=C.io.FULL_DATA_SWEEP)
train_and_evaluate(reader_train, reader_test, network_name, epoch_size, epochs, args['profiler_dir'], model_dir,
args['logdir'], args['tensorboard_logdir'], gen_heartbeat=args['genheartbeat'])
args['logdir'], args['tensorboard_logdir'], gen_heartbeat=args['genheartbeat'], fp16=args['fp16'])

View File

@ -37,22 +37,35 @@ num_classes = 10
model_name = "ResNet_CIFAR10_DataAug.model"
# Create network
def create_resnet_network(network_name):
def create_resnet_network(network_name, fp16):
# Input variables denoting the features and label data
input_var = C.input_variable((num_channels, image_height, image_width))
label_var = C.input_variable((num_classes))
# create model, and configure learning parameters
if network_name == 'resnet20':
z = create_cifar10_model(input_var, 3, num_classes)
elif network_name == 'resnet110':
z = create_cifar10_model(input_var, 18, num_classes)
dtype = np.float16 if fp16 else np.float32
if fp16:
graph_input = C.cast(input_var, dtype=np.float16)
graph_label = C.cast(label_var, dtype=np.float16)
else:
return RuntimeError("Unknown model name!")
graph_input = input_var
graph_label = label_var
# loss and metric
ce = cross_entropy_with_softmax(z, label_var)
pe = classification_error(z, label_var)
with C.default_options(dtype=dtype):
# create model, and configure learning parameters
if network_name == 'resnet20':
z = create_cifar10_model(graph_input, 3, num_classes)
elif network_name == 'resnet110':
z = create_cifar10_model(graph_input, 18, num_classes)
else:
return RuntimeError("Unknown model name!")
# loss and metric
ce = cross_entropy_with_softmax(z, graph_label)
pe = classification_error(z, graph_label)
if fp16:
ce = C.cast(ce, dtype=np.float32)
pe = C.cast(pe, dtype=np.float32)
return {
'name' : network_name,
@ -120,7 +133,7 @@ def train_and_test(network, trainer, train_source, test_source, minibatch_size,
# Train and evaluate the network.
def resnet_cifar10(train_data, test_data, mean_data, network_name, epoch_size, num_quantization_bits=32, block_size=None, warm_up=0,
max_epochs=160, restore=True, log_to_file=None, num_mbs_per_log=None, gen_heartbeat=False, scale_up=False, profiling=False):
max_epochs=160, restore=True, log_to_file=None, num_mbs_per_log=None, gen_heartbeat=False, scale_up=False, profiling=False, fp16=False):
set_computation_network_trace_level(0)
@ -138,7 +151,7 @@ def resnet_cifar10(train_data, test_data, mean_data, network_name, epoch_size, n
gen_heartbeat=gen_heartbeat,
num_epochs=max_epochs)
network = create_resnet_network(network_name)
network = create_resnet_network(network_name, fp16)
trainer = create_trainer(network, minibatch_size, epoch_size, num_quantization_bits, block_size, warm_up, progress_printer)
train_source = create_image_mb_source(train_data, mean_data, train=True, total_number_of_samples=max_epochs * epoch_size)
test_source = create_image_mb_source(test_data, mean_data, train=False, total_number_of_samples=C.io.FULL_DATA_SWEEP)
@ -162,6 +175,7 @@ if __name__=='__main__':
parser.add_argument('-r', '--restart', help='Indicating whether to restart from scratch (instead of restart from checkpoint file by default)', action='store_true')
parser.add_argument('-device', '--device', type=int, help="Force to run the script on a specified device", required=False, default=None)
parser.add_argument('-profile', '--profile', help="Turn on profiling", action='store_true', default=False)
parser.add_argument('-fp16', '--fp16', help="use float16", action='store_true', default=False)
args = vars(parser.parse_args())
@ -201,7 +215,8 @@ if __name__=='__main__':
restore=not args['restart'],
scale_up=scale_up,
log_to_file=args['logdir'],
profiling=args['profile'])
profiling=args['profile'],
fp16=args['fp16'])
# Must call MPI finalize when process exit without exceptions
Communicator.finalize()

View File

@ -63,32 +63,46 @@ def create_image_mb_source(map_file, mean_file, train, total_number_of_samples):
multithreaded_deserializer=True)
# Create network
def create_resnet_network(network_name):
def create_resnet_network(network_name, fp16):
# Input variables denoting the features and label data
input_var = C.input_variable((num_channels, image_height, image_width))
label_var = C.input_variable((num_classes))
stride1x1 = (1, 1)
stride3x3 = (2, 2)
# create model, and configure learning parameters
if network_name == 'resnet18':
z = create_imagenet_model_basic(input_var, [2, 1, 1, 2], num_classes)
elif network_name == 'resnet34':
z = create_imagenet_model_basic(input_var, [3, 3, 5, 2], num_classes)
elif network_name == 'resnet50':
z = create_imagenet_model_bottleneck(input_var, [2, 3, 5, 2], num_classes, stride1x1, stride3x3)
elif network_name == 'resnet101':
z = create_imagenet_model_bottleneck(input_var, [2, 3, 22, 2], num_classes, stride1x1, stride3x3)
elif network_name == 'resnet152':
z = create_imagenet_model_bottleneck(input_var, [2, 7, 35, 2], num_classes, stride1x1, stride3x3)
dtype = np.float16 if fp16 else np.float32
if fp16:
graph_input = C.cast(input_var, dtype=np.float16)
graph_label = C.cast(label_var, dtype=np.float16)
else:
return RuntimeError("Unknown model name!")
graph_input = input_var
graph_label = label_var
# loss and metric
ce = cross_entropy_with_softmax(z, label_var)
errs = classification_error(z, label_var, topN=1)
top5Errs = classification_error(z, label_var, topN=5)
with C.default_options(dtype=dtype):
stride1x1 = (1, 1)
stride3x3 = (2, 2)
# create model, and configure learning parameters
if network_name == 'resnet18':
z = create_imagenet_model_basic(graph_input, [2, 1, 1, 2], num_classes)
elif network_name == 'resnet34':
z = create_imagenet_model_basic(graph_input, [3, 3, 5, 2], num_classes)
elif network_name == 'resnet50':
z = create_imagenet_model_bottleneck(graph_input, [2, 3, 5, 2], num_classes, stride1x1, stride3x3)
elif network_name == 'resnet101':
z = create_imagenet_model_bottleneck(graph_input, [2, 3, 22, 2], num_classes, stride1x1, stride3x3)
elif network_name == 'resnet152':
z = create_imagenet_model_bottleneck(graph_input, [2, 7, 35, 2], num_classes, stride1x1, stride3x3)
else:
return RuntimeError("Unknown model name!")
# loss and metric
ce = cross_entropy_with_softmax(z, graph_label)
errs = classification_error(z, graph_label, topN=1)
top5Errs = classification_error(z, graph_label, topN=5)
if fp16:
ce = C.cast(ce, dtype=np.float32)
errs = C.cast(errs, dtype=np.float32)
top5Errs = C.cast(top5Errs, dtype=np.float32)
return {
'name' : network_name,
@ -156,7 +170,7 @@ def train_and_test(network, trainer, train_source, test_source, minibatch_size,
# Train and evaluate the network.
def resnet_imagenet(train_data, test_data, mean_data, network_name, epoch_size, num_quantization_bits=32, block_size=None, warm_up=0,
max_epochs=90, restore=True, log_to_file=None, num_mbs_per_log=100, gen_heartbeat=False, scale_up=False, profiling=False):
max_epochs=90, restore=True, log_to_file=None, num_mbs_per_log=100, gen_heartbeat=False, scale_up=False, profiling=False, fp16=False):
set_computation_network_trace_level(0)
@ -174,7 +188,7 @@ def resnet_imagenet(train_data, test_data, mean_data, network_name, epoch_size,
gen_heartbeat=gen_heartbeat,
num_epochs=max_epochs)
network = create_resnet_network(network_name)
network = create_resnet_network(network_name, fp16)
trainer = create_trainer(network, minibatch_size, epoch_size, num_quantization_bits, block_size, warm_up, progress_printer)
train_source = create_image_mb_source(train_data, mean_data, train=True, total_number_of_samples=max_epochs * epoch_size)
test_source = create_image_mb_source(test_data, mean_data, train=False, total_number_of_samples=C.io.FULL_DATA_SWEEP)
@ -198,6 +212,7 @@ if __name__=='__main__':
parser.add_argument('-r', '--restart', help='Indicating whether to restart from scratch (instead of restart from checkpoint file by default)', action='store_true', default=False)
parser.add_argument('-device', '--device', type=int, help="Force to run the script on a specified device", required=False, default=None)
parser.add_argument('-profile', '--profile', help="Turn on profiling", action='store_true', default=False)
parser.add_argument('-fp16', '--fp16', help="use float16", action='store_true', default=False)
args = vars(parser.parse_args())
@ -237,7 +252,8 @@ if __name__=='__main__':
restore=not args['restart'],
scale_up=scale_up,
log_to_file=args['logdir'],
profiling=args['profile'])
profiling=args['profile'],
fp16=args['fp16'])
# Must call MPI finalize when process exit without exceptions
Communicator.finalize()

View File

@ -11,8 +11,6 @@
<OutputPath>.</OutputPath>
<ProjectTypeGuids>{888888a0-9f3d-457c-b088-3a5042f75d52}</ProjectTypeGuids>
<LaunchProvider>Standard Python launcher</LaunchProvider>
<InterpreterId>{4aa09c20-bd30-41b0-b691-0beca389326c}</InterpreterId>
<InterpreterVersion>3.5</InterpreterVersion>
<EnableNativeCodeDebugging>False</EnableNativeCodeDebugging>
<IsWindowsApplication>False</IsWindowsApplication>
</PropertyGroup>
@ -20,7 +18,6 @@
<PropertyGroup Condition="'$(Configuration)' == 'Release'" />
<PropertyGroup>
<VisualStudioVersion Condition=" '$(VisualStudioVersion)' == '' ">10.0</VisualStudioVersion>
<PtvsTargetsFile>$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets</PtvsTargetsFile>
</PropertyGroup>
<ItemGroup>
<Compile Include="..\bindings\python\cntk\ops\tests\block_test.py">
@ -280,6 +277,9 @@
<ItemGroup>
<Folder Include="1stSteps\" />
<Folder Include="common\" />
<Folder Include="dToEndTests\" />
<Folder Include="dToEndTests\cntkv2Python\" />
<Folder Include="dToEndTests\cntkv2Python\Examples\" />
<Folder Include="Image\" />
<Folder Include="Image\Classification\" />
<Folder Include="Image\Classification\ConvNet\" />
@ -343,10 +343,6 @@
<Folder Include="ToEndTests\cntkv2Python\" />
<Folder Include="ToEndTests\cntkv2Python\Examples\" />
</ItemGroup>
<ItemGroup>
<InterpreterReference Include="{4aa09c20-bd30-41b0-b691-0beca389326c}\3.5" />
<InterpreterReference Include="{c2abc240-2da9-4409-8b09-0ee30a5b56e2}\3.4" />
</ItemGroup>
<ItemGroup>
<Content Include="Text\CharacterLM\data\tinyshakespeare.txt" />
<Content Include="Text\CharacterLM\README.md" />
@ -355,6 +351,5 @@
<Content Include="Text\WordLMWithSampledSoftmax\test\text.txt" />
<Content Include="Text\WordLMWithSampledSoftmax\test\token2id.txt" />
</ItemGroup>
<Import Project="$(PtvsTargetsFile)" Condition="Exists($(PtvsTargetsFile))" />
<Import Project="$(MSBuildToolsPath)\Microsoft.Common.targets" Condition="!Exists($(PtvsTargetsFile))" />
</Project>
<Import Project="$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets" />
</Project>

View File

@ -45,7 +45,7 @@
# defaults to /usr/local/mpi
# These can be overridden on the command line, e.g. make BUILDTYPE=debug
# TODO: Build static libraries for common dependencies that are shared by multiple
# TODO: Build static libraries for common dependencies that are shared by multiple
# targets, e.g. eval and CNTK.
ARCH=$(shell uname)
@ -90,8 +90,8 @@ SOURCEDIR:= Source
INCLUDEPATH:= $(addprefix $(SOURCEDIR)/, Common/Include CNTKv2LibraryDll CNTKv2LibraryDll/API CNTKv2LibraryDll/proto ../Examples/Extensibility/CPP Math CNTK ActionsLib ComputationNetworkLib SGDLib SequenceTrainingLib CNTK/BrainScript Readers/ReaderLib PerformanceProfilerDll)
INCLUDEPATH+=$(PROTOBUF_PATH)/include
# COMMON_FLAGS include settings that are passed both to NVCC and C++ compilers.
COMMON_FLAGS:= -DHAS_MPI=$(HAS_MPI) -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K -std=c++11
CPPFLAGS:=
COMMON_FLAGS:= -DHAS_MPI=$(HAS_MPI) -D_POSIX_SOURCE -D_XOPEN_SOURCE=600 -D__USE_XOPEN2K -std=c++11 -DCUDA_NO_HALF -D__CUDA_NO_HALF_OPERATORS__
CPPFLAGS:=
CXXFLAGS:= $(SSE_FLAGS) -std=c++0x -fopenmp -fpermissive -fPIC -Werror -fcheck-new
LIBPATH:=
LIBS_LIST:=
@ -168,10 +168,10 @@ endif
ifeq ("$(MATHLIB)","mkl")
INCLUDEPATH += $(MKL_PATH)/include
LIBS_LIST += m iomp5 pthread mklml_intel
LIBS_LIST += m iomp5 pthread mklml_intel mkldnn
MKL_LIB_PATH := $(MKL_PATH)/lib
LIBPATH += $(MKL_LIB_PATH)
COMMON_FLAGS += -DUSE_MKL
COMMON_FLAGS += -DUSE_MKL -DUSE_MKLDNN
endif
ifeq ($(CUDA_GDR),1)
@ -212,6 +212,7 @@ GENCODE_SM50 := -gencode arch=compute_50,code=\"sm_50,compute_50\"
GENCODE_SM52 := -gencode arch=compute_52,code=\"sm_52,compute_52\"
GENCODE_SM60 := -gencode arch=compute_60,code=\"sm_60,compute_60\"
GENCODE_SM61 := -gencode arch=compute_61,code=\"sm_61,compute_61\"
GENCODE_SM70 := -gencode arch=compute_70,code=\"sm_70,compute_70\"
# Should we relocate *.gcno and *.gcda files using -fprofile-dir option?
# Use GCOV_PREFIX and GCOV_PREFIX_STRIP if relocating:
@ -241,7 +242,7 @@ ifeq ("$(BUILDTYPE)","release")
ifdef CNTK_CUDA_CODEGEN_RELEASE
GENCODE_FLAGS := $(CNTK_CUDA_CODEGEN_RELEASE)
else
GENCODE_FLAGS := $(GENCODE_SM30) $(GENCODE_SM35) $(GENCODE_SM50) $(GENCODE_SM60) $(GENCODE_SM61)
GENCODE_FLAGS := $(GENCODE_SM30) $(GENCODE_SM35) $(GENCODE_SM50) $(GENCODE_SM60) $(GENCODE_SM61) $(GENCODE_SM70)
endif
CXXFLAGS += -g -O4
@ -365,6 +366,10 @@ MATH_SRC =\
$(SOURCEDIR)/Math/CUDAPageLockedMemAllocator.cpp \
$(SOURCEDIR)/Math/CPUMatrixFloat.cpp \
$(SOURCEDIR)/Math/CPUMatrixDouble.cpp \
$(SOURCEDIR)/Math/CPUMatrixHalf.cpp \
$(SOURCEDIR)/Math/CPUMatrixTensorFloat.cpp \
$(SOURCEDIR)/Math/CPUMatrixTensorDouble.cpp \
$(SOURCEDIR)/Math/CPUMatrixTensorHalf.cpp \
$(SOURCEDIR)/Math/CPURNGHandle.cpp \
$(SOURCEDIR)/Math/CPUSparseMatrix.cpp \
$(SOURCEDIR)/Math/ConvolutionEngine.cpp \
@ -415,7 +420,7 @@ $(CNTKMATH_LIB): $(MATH_OBJ) | $(PERF_PROFILER_LIB)
$(CXX) $(LDFLAGS) -shared $(patsubst %,-L%, $(LIBPATH) $(LIBDIR) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -fopenmp -l$(PERF_PROFILER)
# Any executable using Common or ReaderLib needs to link these libraries.
# Any executable using Common or ReaderLib needs to link these libraries.
READER_LIBS := $(CNTKMATH_LIB) $(PERF_PROFILER_LIB)
L_READER_LIBS := -l$(CNTKMATH) -l$(PERF_PROFILER)
@ -711,7 +716,7 @@ $(CNTKLIBRARY_CPP_EVAL_EXAMPLES): $(CNTKLIBRARY_CPP_EVAL_EXAMPLES_OBJ) | $(CNTKL
$(CXX) $(LDFLAGS) $(patsubst %,-L%, $(LIBDIR) $(LIBPATH) $(GDK_NVML_LIB_PATH)) $(patsubst %,$(RPATH)%, $(ORIGINLIBDIR) $(LIBPATH)) -o $@ $^ $(LIBS) -l$(CNTKLIBRARY) $(L_READER_LIBS)
########################################
# Eval V2 Sample test
# Eval V2 Sample test
########################################
CNTKLIBRARY_CPP_EVAL_TEST:=$(BINDIR)/CNTKLibraryCPPEvalExamplesTest
@ -1047,13 +1052,13 @@ ifeq (,$(wildcard Source/1BitSGD/*.h))
$(error Build with 1bit-SGD was requested but cannot find the code. Please check https://docs.microsoft.com/en-us/cognitive-toolkit/Enabling-1bit-SGD for instructions)
endif
INCLUDEPATH += $(SOURCEDIR)/1BitSGD
INCLUDEPATH += $(SOURCEDIR)/1BitSGD
COMMON_FLAGS += -DCNTK_PARALLEL_TRAINING_SUPPORT
# temporarily adding to 1bit, need to work with others to fix it
endif
########################################
# ASGD(multiverso) setup
########################################
@ -1081,7 +1086,7 @@ MULTIVERSO_CMAKE_BUILDTYPE=Debug
endif
# TODO need to align Multiverso OpenMP with the one we use (libiomp). For now, disabled.
$(MULTIVERSO_LIB):
$(MULTIVERSO_LIB):
@echo "Build Multiverso lib"
@mkdir -p $(LIBDIR)
@mkdir -p $(BINDIR)
@ -1322,6 +1327,7 @@ UNITTEST_MATH_SRC = \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/MatrixLearnerTests.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/stdafx.cpp \
$(SOURCEDIR)/../Tests/UnitTests/MathTests/HalfGPUTests.cpp \
UNITTEST_MATH_SRC += $(CNTK_COMMON_SRC)
UNITTEST_MATH_OBJ := $(patsubst %.cpp, $(OBJDIR)/%.o, $(UNITTEST_MATH_SRC))
@ -1530,13 +1536,13 @@ $(OBJDIR)/%.o : %.cu $(BUILD_CONFIGURATION)
@mkdir -p $(dir $@)
$(NVCC) -c $< -o $@ $(COMMON_FLAGS) $(CUFLAGS) $(INCLUDEPATH:%=-I%) -Xcompiler "-fPIC -Werror"
$(OBJDIR)/%.pb.o : %.pb.cc $(BUILD_CONFIGURATION)
$(OBJDIR)/%.pb.o : %.pb.cc $(BUILD_CONFIGURATION)
@echo $(SEPARATOR)
@echo creating $@ for $(ARCH) with build type $(BUILDTYPE)
@mkdir -p $(dir $@)
$(CXX) -c $< -o $@ $(COMMON_FLAGS) $(CPPFLAGS) $(CXXFLAGS) $(INCLUDEPATH:%=-I%) -MD -MP -MF ${@:.o=.d}
$(OBJDIR)/%.o : %.cpp $(BUILD_CONFIGURATION)
$(OBJDIR)/%.o : %.cpp $(BUILD_CONFIGURATION)
@echo $(SEPARATOR)
@echo creating $@ for $(ARCH) with build type $(BUILDTYPE)
@mkdir -p $(dir $@)

View File

@ -2,6 +2,32 @@
## Latest news
***2018-01-22.*** CNTK support for CUDA 9
CNTK now supports CUDA 9/cuDNN 7. This requires an update of the build environment to Ubuntu 16/GCC 5 for Linux, and Visual Studio 2017/VCTools 14.11 for Windows. With CUDA 9, CNTK also adds a preview of 16-bit floating-point (a.k.a. FP16) computation.
Please check out the FP16 example for ResNet50 [here](./Examples/Image/Classification/ResNet/Python/TrainResNet_ImageNet_Distributed.py)
Notes on the FP16 preview:
* The FP16 implementation on CPU is not optimized and is not intended for direct CPU inference. Convert the model to 32-bit floating point before running it on the CPU.
* The loss/criterion for FP16 training needs to be 32-bit so that it can accumulate without overflow; use the cast function. Please check the example above.
* Readers do not produce FP16 output; unless you feed data through numpy, a cast from FP32 to FP16 is needed. Please check the example above.
* FP16 gradient aggregation is currently implemented only on GPUs using NCCL2. Distributed FP16 training with MPI is not supported.
* FP16 math is a subset of the current FP32 implementation. Some models may raise a Feature Not Implemented exception when using FP16.
* FP16 is currently not supported in BrainScript. Please use Python for FP16.
To set up the build and runtime environment on Windows:
* Install [Visual Studio 2017](https://www.visualstudio.com/downloads/) with the following workloads and components. From the command line (using the Community edition installer as an example):
vs_community.exe --add Microsoft.VisualStudio.Workload.NativeDesktop --add Microsoft.VisualStudio.Workload.ManagedDesktop --add Microsoft.VisualStudio.Workload.Universal --add Microsoft.Component.PythonTools --add Microsoft.VisualStudio.Component.VC.Tools.14.11
* Install [NVIDIA CUDA 9](https://developer.nvidia.com/cuda-90-download-archive?target_os=Windows&target_arch=x86_64)
* From PowerShell, run:
[DevInstall.ps1](./Tools/devInstall/Windows/DevInstall.ps1)
* Start the VCTools 14.11 command line by running:
cmd /k "%VS2017INSTALLDIR%\VC\Auxiliary\Build\vcvarsall.bat" x64 --vcvars_ver=14.11
* Open [CNTK.sln](./CNTK.sln) from the VCTools 14.11 command line. Note that opening CNTK.sln from anywhere other than the VCTools 14.11 command line causes a CUDA 9 [build error](https://developercommunity.visualstudio.com/content/problem/163758/vs-2017-155-doesnt-support-cuda-9.html).
To set up the build and runtime environment on Linux using Docker, please build an Ubuntu 16.04 Docker image using the Dockerfiles [here](./Tools/docker). For other Linux systems, please refer to the Dockerfiles to set up the dependent libraries for CNTK.
***2017-12-05.* CNTK 2.3.1**
Release of Cognitive Toolkit v.2.3.1.

@ -1 +1 @@
Subproject commit 70d0c096a11e3b370d1b529f098f9711a7966090
Subproject commit fb72ecb8c4d4e1d2e59b200ad1092331a372edea

View File

@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
@ -140,4 +140,4 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets" />
</Project>
</Project>

View File

@ -291,7 +291,7 @@ public:
// map all to lowercase
std::wstring lvalue = std::wstring(value.begin(), value.end());
std::transform(lvalue.begin(), lvalue.end(), lvalue.begin(), ::tolower); // note: may crash for chars >127. Don't use those.
std::transform(lvalue.begin(), lvalue.end(), lvalue.begin(), [](wchar_t c) { return (wchar_t)::tolower(c); }); // note: may crash for chars >127. Don't use those.
// add to the respective node group
m_net->AddToNodeGroup(lvalue, compNode);

View File

@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
@ -33,13 +33,11 @@
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
<ConfigurationType>Application</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>

View File

@ -31,6 +31,7 @@
#endif
#include "CNTKLibraryInternals.h"
#include "HalfConverter.hpp"
// undef max in the rest of the file to avoid conflicts with the max macro defined in windows.h.
#pragma push_macro("max")
@ -38,6 +39,43 @@
namespace CNTK
{
class float16
{
protected:
unsigned short __x;
public:
float16() = default;
float16(const float16& other) { __x = other.__x; }
#ifndef SWIG
// construction from build-in types
float16(float f) { floatToFloat16(&f, &__x); }
float16(double d) : float16((float)d) {}
float16(int i) : float16((float)i) {}
float16(size_t u) : float16((float)u) {}
// cast to build-in types
operator float() const { float f; float16ToFloat(&__x, &f); return f; }
// compare functions
inline bool operator==(const float16& rhs) const { return (__x == rhs.__x); }
inline bool operator!=(const float16& rhs) const { return (__x != rhs.__x); }
#endif
static float16 create(float f)
{
float16 v;
floatToFloat16(&f, &v.__x);
return v;
}
static float16 create(double d)
{
return create((float)d);
}
};
///
/// Enumeration type denoting data type of symbolic data entities or actual data.
///
@ -47,6 +85,7 @@ namespace CNTK
Float = 1,
Double = 2,
UChar = 3, // So far only used internally in deserializers.
Float16 = 4,
/* TODO:
Bit,
@ -74,6 +113,8 @@ namespace CNTK
return DataType::Float;
else if (std::is_same<ElementType, double>())
return DataType::Double;
else if (std::is_same<ElementType, float16>())
return DataType::Float16;
else
NOT_IMPLEMENTED;
}
@ -84,6 +125,8 @@ namespace CNTK
return "Float";
else if (dataType == DataType::Double)
return "Double";
else if (dataType == DataType::Float16)
return "Float16";
else
LogicError("Unknown DataType.");
}
@ -94,6 +137,8 @@ namespace CNTK
return sizeof(float);
else if (dataType == DataType::Double)
return sizeof(double);
else if (dataType == DataType::Float16)
return sizeof(float16);
else
LogicError("Unknown DataType.");
}
@ -621,8 +666,7 @@ namespace CNTK
/// Construct a NDArrayView with newly allocated sparse storage in SparseCSC format on the specified 'device' and initialize its contents
/// with the specified Sparse CSC format data.
///
template <typename ElementType>
CNTK_API NDArrayView(const NDShape& viewShape, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const ElementType* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly = false);
CNTK_API NDArrayView(::CNTK::DataType dataType, const NDShape& viewShape, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const void* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly = false);
///
/// Construct a NDArrayView over newly allocated storage in the specified format on the specified 'device'.
@ -707,6 +751,9 @@ namespace CNTK
case DataType::Double:
SetValue(value);
break;
case DataType::Float16:
SetValue(float16::create(value));
break;
default:
LogicError("Unsupported DataType %s.", DataTypeName(m_dataType));
break;
@ -801,6 +848,11 @@ namespace CNTK
///
CNTK_API void SetValue(double value);
///
/// Fill 'this' NDArrayView with the specified value. The underlying DataType of 'this' view should be DataType::Double.
///
CNTK_API void SetValue(float16 value);
///
/// Creates a new NDArrayView with newly allocated storage on the specified device and copies 'this' view's contents into the newly allocated view.
///
@ -876,6 +928,25 @@ namespace CNTK
// Disallow copy and move construction and assignment
NDArrayView(const NDArrayView&) = delete; NDArrayView& operator=(const NDArrayView&) = delete; NDArrayView& operator=(NDArrayView&&) = delete; NDArrayView(NDArrayView&& other) = delete;
// template functions connecting V1ElemType and ElementType
template <typename ElementType, typename V1ElemType>
const ElementType* _DataBuffer() const;
template <typename ElementType, typename V1ElemType>
std::tuple<const ElementType *, const SparseIndexType*, const SparseIndexType*, size_t> _SparseCSCDataBuffers() const;
template <typename ElementType, typename V1ElemType>
std::tuple<const void*, const SparseIndexType*, const SparseIndexType*, size_t, size_t, size_t> _SparseBlockColumnDataBuffers() const;
template <typename ElementType, typename V1ElemType>
static NDArrayViewPtr _RandomNormal(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device);
template <typename ElementType, typename V1ElemType>
static NDArrayViewPtr _RandomUniform(const NDShape& shape, double rangeStart, double rangeEnd, unsigned long seed, const DeviceDescriptor& device);
template<typename ElementType, typename V1ElemType>
ElementType _AsScalar() const;
private:
static const size_t AutoSelectRowColSplitPoint = SIZE_MAX;
@ -891,6 +962,10 @@ namespace CNTK
template <typename ElementType>
std::shared_ptr<Microsoft::MSR::CNTK::Matrix<ElementType>> GetWritableMatrix(size_t rowColSplitPoint = AutoSelectRowColSplitPoint);
std::shared_ptr<const Microsoft::MSR::CNTK::MatrixBase> GetMatrixBase(size_t rowColSplitPoint = AutoSelectRowColSplitPoint) const;
std::shared_ptr<Microsoft::MSR::CNTK::MatrixBase> GetWritableMatrixBase(size_t rowColSplitPoint = AutoSelectRowColSplitPoint);
template <typename ElementType>
const Microsoft::MSR::CNTK::TensorView<ElementType>* GetTensorView() const;
@ -2836,6 +2911,10 @@ namespace CNTK
{
CopyVariableValueToVector<double>(outputVariable, sequences);
}
else if (dataType == DataType::Float16)
{
CopyVariableValueToVector<float16>(outputVariable, sequences);
}
}
///
@ -6385,6 +6464,16 @@ namespace CNTK
///
CNTK_API FunctionPtr Crop(const Variable& nodeInput, const Variable& nodeReferent, const Variable& ancestorInput, const Variable& ancestorReferent, const std::wstring& name = L"");
///
/// Creates an instance of crop node with automatically computed crop offsets and specified ancestor nodes.
/// This is used in cases when input nodes do not have common ancestor in the network.
/// nodeInput: input node to be cropped.
/// nodeReferent: input node which determines the spatial size of output.
/// ancestorInput: ancestor of nodeInput.
/// ancestorReferent: ancestor of nodeReferent which is treated as equal to ancestorInput for the purpose of computing crop offsets.
///
CNTK_API FunctionPtr Cast(const Variable& nodeInput, DataType outputType, const std::wstring& name = L"");
#endif // !CNTK_HEADERONLY_DEFINITIONS
}

View File

@ -41,6 +41,8 @@
// Forward declarations
namespace Microsoft { namespace MSR { namespace CNTK {
struct MatrixBase;
template <typename ElemType>
class Matrix;

View File

@ -0,0 +1,94 @@
//This code is shared between V1 and V2
#pragma once
namespace CNTK {
// Host functions for converting between FP32 and FP16 formats
// TODO: use f16c instructions if available
inline void float16ToFloat(const unsigned short* src, float* res)
{
unsigned h = *src;
unsigned sign = ((h >> 15) & 1);
unsigned exponent = ((h >> 10) & 0x1f);
unsigned mantissa = ((h & 0x3ff) << 13);
if (exponent == 0x1f) { /* NaN or Inf */
mantissa = (mantissa ? (sign = 0, 0x7fffff) : 0);
exponent = 0xff;
}
else if (!exponent) { /* Denorm or Zero */
if (mantissa) {
unsigned int msb;
exponent = 0x71;
do {
msb = (mantissa & 0x400000);
mantissa <<= 1; /* normalize */
--exponent;
} while (!msb);
mantissa &= 0x7fffff; /* 1.mantissa is implicit */
}
}
else {
exponent += 0x70;
}
*(unsigned*)res = ((sign << 31) | (exponent << 23) | mantissa);
}
inline void floatToFloat16(float* src, unsigned short* dest)
{
unsigned x = *(unsigned*)src;
unsigned u = (x & 0x7fffffff), remainder, shift, lsb, lsb_s1, lsb_m1;
unsigned short sign;
unsigned exponent, mantissa;
// Get rid of +NaN/-NaN case first.
if (u > 0x7f800000) {
*dest = 0x7fffU;
return;
}
sign = (unsigned short)((x >> 16) & 0x8000);
// Get rid of +Inf/-Inf, +0/-0.
if (u > 0x477fefff) {
*dest = sign | 0x7c00U;
return;
}
if (u < 0x33000001) {
*dest = (sign | 0x0000);
return;
}
exponent = ((u >> 23) & 0xff);
mantissa = (u & 0x7fffff);
if (exponent > 0x70) {
shift = 13;
exponent -= 0x70;
}
else {
shift = 0x7e - exponent;
exponent = 0;
mantissa |= 0x800000;
}
lsb = (1 << shift);
lsb_s1 = (lsb >> 1);
lsb_m1 = (lsb - 1);
// Round to nearest even.
remainder = (mantissa & lsb_m1);
mantissa >>= shift;
if (remainder > lsb_s1 || (remainder == lsb_s1 && (mantissa & 0x1))) {
++mantissa;
if (!(mantissa & 0x3ff)) {
++exponent;
mantissa = 0;
}
}
*dest = (sign | (unsigned short)((exponent << 10) | mantissa));
}
}

View File

@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" InitialTargets="CheckDependencies" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Project DefaultTargets="Build" InitialTargets="CheckDependencies" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
@ -40,7 +40,6 @@
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
<PropertyGroup Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType>
<PlatformToolset>v140</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
@ -161,6 +160,7 @@
<ClInclude Include="API\CNTKLibraryC.h" />
<ClInclude Include="API\CNTKLibraryExperimental.h" />
<ClInclude Include="API\CNTKLibraryInternals.h" />
<ClInclude Include="API\HalfConverter.hpp" />
<ClInclude Include="BackCompat.h" />
<ClInclude Include="BlockFunction.h" />
<ClInclude Include="CompositeFunction.h" />

View File

@ -177,6 +177,9 @@
<ClInclude Include="proto\onnx\core\utils.h">
<Filter>proto\onnx\core</Filter>
</ClInclude>
<ClInclude Include="API\HalfConverter.hpp">
<Filter>API</Filter>
</ClInclude>
<ClInclude Include="proto\onnx\core\record.h">
<Filter>proto\onnx\core</Filter>
</ClInclude>
@ -249,4 +252,4 @@
<Filter>proto\onnx\protobuf</Filter>
</Proto>
</ItemGroup>
</Project>
</Project>

View File

@ -966,7 +966,7 @@ namespace CNTK
#endif
#ifdef _CUDA_PATH_
int cudaVersion = 0;
if (cudaDriverGetVersion(&cudaVersion) == cudaSuccess)
if (cudaRuntimeGetVersion(&cudaVersion) == cudaSuccess)
{
int major = 0, minor = 0, patchLevel = 0;
ExtractCUDAVersion(cudaVersion, major, minor, patchLevel);

View File

@ -407,6 +407,36 @@ namespace CNTK
}
}
template <typename ElementType>
/*static*/ Microsoft::MSR::CNTK::ComputationNodeBasePtr CompositeFunction::CreateLearnableParameterFromVariable(const Variable& variable, Microsoft::MSR::CNTK::ComputationNetworkBuilder<ElementType>& builder, const NDShape& shape, const std::wstring& name)
{
switch (variable.GetDataType())
{
case DataType::Float:
return builder.template TypedCreateLearnableParameter<float>(name, AsTensorShape(shape));
case DataType::Double:
return builder.template TypedCreateLearnableParameter<double>(name, AsTensorShape(shape));
case DataType::Float16:
return builder.template TypedCreateLearnableParameter<half>(name, AsTensorShape(shape));
default:
return builder.CreateLearnableParameter(name, AsTensorShape(shape));
}
}
/*static*/ void CompositeFunction::CastAssignNodeValue(ComputationNodeBasePtr node, DataType dataType, std::shared_ptr<const MatrixBase> matrix)
{
switch (dataType)
{
case DataType::Float:
return (dynamic_cast<ComputationNode<float>*>(&*node))->Value().CastAssignValuesOf(*matrix);
case DataType::Double:
return (dynamic_cast<ComputationNode<double>*>(&*node))->Value().CastAssignValuesOf(*matrix);
case DataType::Float16:
return (dynamic_cast<ComputationNode<half>*>(&*node))->Value().CastAssignValuesOf(*matrix);
default:
LogicError("Unsupported data type");
}
}
// Recursively create a sub-network of ComputationNode instances corresponding to the graph of Functions
// underlying the specified 'variable' and return the ComputationNode instance that corresponds to the
@ -441,33 +471,77 @@ namespace CNTK
// Lets add a null entry in the map for this variable, to break infinite recursion when processing recurrent graphs
variableToNodeMap[variable] = nullptr;
std::shared_ptr<ComputationNode<ElementType>> computationNodePtr;
std::shared_ptr<ComputationNodeBase> computationNodePtr;
auto internalNodeName = CNTKInternalNodeNameFromUidAndName(variable.Uid(), variable.Name(), useMangledNamesForComputationNodes);
if (variable.IsParameter() || variable.IsConstant())
{
if (variable.Shape().HasInferredDimension())
InvalidArgument("Parameter or Constant '%S' with unresolved shape %S found when compiling the Function graph.", variable.AsString().c_str(), variable.Shape().AsString().c_str());
computationNodePtr = builder.CreateLearnableParameter(internalNodeName, AsTensorShape(variable.Shape()));
computationNodePtr = CreateLearnableParameterFromVariable(variable, builder, variable.Shape(), internalNodeName);
network->InitLearnableParameters(computationNodePtr, L"fixedValue", 0); // must call this to follow protocol; can overwrite later
if (!variable.NeedsGradient() || (inputsToExcludeGradientsFor.find(variable) != inputsToExcludeGradientsFor.end()))
computationNodePtr->SetLearningRateMultiplier(0.0);
NDArrayViewPtr value = variable.IsConstant() ? Constant(variable).Value() : Parameter(variable).Value();
std::shared_ptr<const Matrix<ElementType>> valueMatrix = variable.IsConstant() ? value->GetMatrix<ElementType>() : value->GetWritableMatrix<ElementType>();
std::shared_ptr<const MatrixBase> valueMatrix = variable.IsConstant() ? value->GetMatrixBase() : value->GetWritableMatrixBase();
if (variable.IsParameter() || (valueMatrix->GetDeviceId() == network->GetDeviceId()))
computationNodePtr->Value() = valueMatrix->AsReference();
{
// shallow copy from parameter value to computation node value to link them together
switch (variable.GetDataType())
{
case DataType::Float:
std::dynamic_pointer_cast<ComputationNode<float>>(computationNodePtr)->Value() = std::dynamic_pointer_cast<const Matrix<float>>(valueMatrix)->AsReference();
break;
case DataType::Double:
std::dynamic_pointer_cast<ComputationNode<double>>(computationNodePtr)->Value() = std::dynamic_pointer_cast<const Matrix<double>>(valueMatrix)->AsReference();
break;
case DataType::Float16:
std::dynamic_pointer_cast<ComputationNode<half>>(computationNodePtr)->Value() = std::dynamic_pointer_cast<const Matrix<half>>(valueMatrix)->AsReference();
break;
default:
LogicError("Unsupported data type");
}
}
else // Constant: if initialized data lives on wrong device, make a copy to the right one (copy is OK since it's constant)
{
// TODO: the following two lines are a workaround for a bug in the Math library
// (AssignValuesOf throws when source and destination matrices reside on different GPU devices).
// Once this bug is fixed, change to
// Matrix<ElementType> clonedMatrix(valueMatrix->GetNumRows(), valueMatrix->GetNumCols(), network->GetDeviceId(), valueMatrix->GetMatrixType(), valueMatrix->GetFormat());
Matrix<ElementType> clonedMatrix(network->GetDeviceId());
clonedMatrix.SwitchToMatrixType(valueMatrix->GetMatrixType(), valueMatrix->GetFormat(), false);
clonedMatrix.AssignValuesOf(*valueMatrix);
computationNodePtr->Value() = std::move(clonedMatrix);
switch (variable.GetDataType())
{
case DataType::Float:
{
Matrix<float>& nodeValue = dynamic_cast<ComputationNode<float>*>(&*computationNodePtr)->Value();
Matrix<float> clonedMatrix(nodeValue.GetNumRows(), nodeValue.GetNumCols(), valueMatrix->GetDeviceId(), nodeValue.GetMatrixType(), nodeValue.GetFormat());
clonedMatrix.CastAssignValuesOf(*valueMatrix);
clonedMatrix.TransferToDeviceIfNotThere(network->GetDeviceId(), true);
nodeValue = std::move(clonedMatrix);
break;
}
case DataType::Double:
{
Matrix<double>& nodeValue = dynamic_cast<ComputationNode<double>*>(&*computationNodePtr)->Value();
Matrix<double> clonedMatrix(nodeValue.GetNumRows(), nodeValue.GetNumCols(), valueMatrix->GetDeviceId(), nodeValue.GetMatrixType(), nodeValue.GetFormat());
clonedMatrix.CastAssignValuesOf(*valueMatrix);
clonedMatrix.TransferToDeviceIfNotThere(network->GetDeviceId(), true);
nodeValue = std::move(clonedMatrix);
break;
}
case DataType::Float16:
{
Matrix<half>& nodeValue = dynamic_cast<ComputationNode<half>*>(&*computationNodePtr)->Value();
Matrix<half> clonedMatrix(nodeValue.GetNumRows(), nodeValue.GetNumCols(), valueMatrix->GetDeviceId(), nodeValue.GetMatrixType(), nodeValue.GetFormat());
clonedMatrix.CastAssignValuesOf(*valueMatrix);
clonedMatrix.TransferToDeviceIfNotThere(network->GetDeviceId(), true);
nodeValue = std::move(clonedMatrix);
break;
}
default:
LogicError("Unsupported data type");
}
}
}
else if (variable.IsInput())
@ -515,7 +589,7 @@ namespace CNTK
}
else
{
computationNodePtr = builder.CreateLearnableParameter(internalNodeName, AsTensorShape(fullyDefinedArgumentVar.Shape()));
computationNodePtr = CreateLearnableParameterFromVariable(variable, builder, fullyDefinedArgumentVar.Shape(), internalNodeName);
network->InitLearnableParameters(computationNodePtr, L"fixedValue", 0); // must call this to follow protocol; can overwrite later
if (!variable.NeedsGradient() || (inputsToExcludeGradientsFor.find(variable) != inputsToExcludeGradientsFor.end()))
computationNodePtr->SetLearningRateMultiplier(0.0);
@ -538,7 +612,7 @@ namespace CNTK
// Can be null in case of loops with f.output == f.input.
// Such loops cannot be handled, so we leave nullptr as computational node.
if (outputVariableNode)
computationNodePtr = outputVariableNode->template As<ComputationNode<ElementType>>()->shared_from_this();
computationNodePtr = outputVariableNode->template As<ComputationNodeBase>()->shared_from_this();
else
computationNodePtr = nullptr;
}
@ -589,10 +663,10 @@ namespace CNTK
return mappingVariable;
}
template <typename ElementType>
template<typename ElementType>
/*static*/ ComputationNodeBasePtr CompositeFunction::CreateComputationNode(const Variable& variable,
Function* function,
const std::vector<std::shared_ptr<ComputationNode<ElementType>>>& inputNodes,
const std::vector<std::shared_ptr<ComputationNodeBase>>& inputNodes,
Microsoft::MSR::CNTK::ComputationNetworkPtr& network,
std::unordered_map<Variable, ComputationNodeBasePtr>& variableToNodeMap,
bool useMangledNamesForComputationNodes)
@ -609,6 +683,40 @@ namespace CNTK
for (auto inputNode : inputNodes)
inputNodesBasePtrs.push_back(inputNode);
// take the dataType from the first input, if not specified (i.e. placeholder) then use default
// node like BatchNormalization may have inputs with different precision,
// and that validation is done in specific node constructor
DataType inputNodeType = AsDataType<ElementType>();
if (inputNodes.size() > 0)
{
if (std::dynamic_pointer_cast<ComputationNode<float>, ComputationNodeBase>(inputNodes[0]))
inputNodeType = DataType::Float;
else if (std::dynamic_pointer_cast<ComputationNode<double>, ComputationNodeBase>(inputNodes[0]))
inputNodeType = DataType::Double;
else if (std::dynamic_pointer_cast<ComputationNode<half>, ComputationNodeBase>(inputNodes[0]))
inputNodeType = DataType::Float16;
}
#define ASSIGN_NEW_NODE(nodeClass, ...) \
do { \
if (inputNodeType == DataType::Float) \
computationNodePtr = New<nodeClass<float>>(__VA_ARGS__); \
else if (inputNodeType == DataType::Double) \
computationNodePtr = New<nodeClass<double>>(__VA_ARGS__); \
else if (inputNodeType == DataType::Float16) \
computationNodePtr = New<nodeClass<half>>(__VA_ARGS__); \
} while(0)
#define ASSIGN_NEW_NODE2(nodeClass, dtype, ...) \
do { \
if (inputNodeType == DataType::Float) \
computationNodePtr = New<nodeClass<dtype, float>>(__VA_ARGS__); \
else if (inputNodeType == DataType::Double) \
computationNodePtr = New<nodeClass<dtype, double>>(__VA_ARGS__); \
else if (inputNodeType == DataType::Float16) \
computationNodePtr = New<nodeClass<dtype, half>>(__VA_ARGS__); \
} while(0)
auto outputs = function->RawOutputs();
if (variable == outputs[0])
{
@ -621,76 +729,76 @@ namespace CNTK
switch (op)
{
case PrimitiveOpType::Negate:
computationNodePtr = New<NegateNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(NegateNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Sigmoid:
computationNodePtr = New<SigmoidNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(SigmoidNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Atanh:
computationNodePtr = New<AtanhNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(AtanhNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Tanh:
computationNodePtr = New<TanhNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(TanhNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Acos:
computationNodePtr = New<AcosNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(AcosNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Cos:
computationNodePtr = New<CosineNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(CosineNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Asin:
computationNodePtr = New<AsinNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(AsinNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Sin:
computationNodePtr = New<SinNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(SinNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Cosh:
computationNodePtr = New<CoshNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(CoshNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Asinh:
computationNodePtr = New<AsinhNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(AsinhNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Sinh:
computationNodePtr = New<SinhNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(SinhNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::ReLU:
computationNodePtr = New<RectifiedLinearNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(RectifiedLinearNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Exp:
computationNodePtr = New<ExpNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(ExpNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Log:
computationNodePtr = New<LogNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(LogNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Sqrt:
computationNodePtr = New<SqrtNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(SqrtNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::ELU:
computationNodePtr = New<ExponentialLinearUnitNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(ExponentialLinearUnitNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Floor:
computationNodePtr = New<FloorNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(FloorNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Abs:
computationNodePtr = New<AbsNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(AbsNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Reciprocal:
computationNodePtr = New<ReciprocalNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(ReciprocalNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Softmax:
computationNodePtr = New<SoftmaxNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(SoftmaxNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Hardmax:
computationNodePtr = New<HardmaxNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(HardmaxNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::TopK:
{
auto k = functionConfig[PrimitiveFunction::AttributeNameNumItems].Value<size_t>();
computationNodePtr = New<TopKNode<ElementType>>(network->GetDeviceId(), internalNodeName, k);
ASSIGN_NEW_NODE(TopKNode, network->GetDeviceId(), internalNodeName, k);
break;
}
case PrimitiveOpType::StableSigmoid:
computationNodePtr = New<StableSigmoidNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(StableSigmoidNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::TransposeAxes:
{
@ -699,7 +807,7 @@ namespace CNTK
auto perm = AsVector<Axis>(functionConfig[PrimitiveFunction::AttributeNameAxisVec].Value<std::vector<DictionaryValue>>());
for (auto& p : perm)
p = NormalizeStaticAxis(p, perm.size());
computationNodePtr = New<TransposeDimensionsNode<ElementType>>(network->GetDeviceId(), internalNodeName, AsCNTKInternalAxisIdx(perm));
ASSIGN_NEW_NODE(TransposeDimensionsNode, network->GetDeviceId(), internalNodeName, AsCNTKInternalAxisIdx(perm));
}
else
{
@ -707,7 +815,7 @@ namespace CNTK
auto axis2 = functionConfig[PrimitiveFunction::AttributeNameAxis2].Value<Axis>();
// The axis ids passed to the internal CNTK TransposeDimensionsNode are 1 based instead of 0 based
computationNodePtr = New<TransposeDimensionsNode<ElementType>>(network->GetDeviceId(), internalNodeName, AsCNTKInternalAxisIdx(axis1), AsCNTKInternalAxisIdx(axis2));
ASSIGN_NEW_NODE(TransposeDimensionsNode, network->GetDeviceId(), internalNodeName, AsCNTKInternalAxisIdx(axis1), AsCNTKInternalAxisIdx(axis2));
}
break;
}
@ -715,24 +823,24 @@ namespace CNTK
{
auto dynamicAxes = variable.DynamicAxes();
auto internalCNTKWhereNodeDynamicAxisName = InternalDynamicAxisNameFromDynamicAxes(dynamicAxes);
computationNodePtr = New<WhereNode<ElementType>>(network->GetDeviceId(), internalNodeName, internalCNTKWhereNodeDynamicAxisName);
ASSIGN_NEW_NODE(WhereNode, network->GetDeviceId(), internalNodeName, internalCNTKWhereNodeDynamicAxisName);
break;
}
case PrimitiveOpType::ToSequence:
{
auto dynamicAxes = variable.DynamicAxes();
auto internalCNTKDynamicAxisName = InternalDynamicAxisNameFromDynamicAxes(dynamicAxes);
computationNodePtr = New<ToSequenceNode<ElementType>>(network->GetDeviceId(), internalNodeName, internalCNTKDynamicAxisName);
ASSIGN_NEW_NODE(ToSequenceNode, network->GetDeviceId(), internalNodeName, internalCNTKDynamicAxisName);
break;
}
case PrimitiveOpType::ToSequenceLike:
computationNodePtr = New<ToSequenceLikeNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(ToSequenceLikeNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::UnpackSequence:
{
auto paddingValue = functionConfig[PrimitiveFunction::AttributeNameSequenceUnpackPaddingValue].Value<double>();
auto suppressMaskOutput = functionConfig[PrimitiveFunction::AttributeNameSequenceUnpackSuppressMaskOutput].Value<bool>();
computationNodePtr = New<UnpackSequenceNode<ElementType>>(network->GetDeviceId(), internalNodeName, (ElementType)paddingValue, suppressMaskOutput);
ASSIGN_NEW_NODE(UnpackSequenceNode, network->GetDeviceId(), internalNodeName, paddingValue, suppressMaskOutput);
break;
}
case PrimitiveOpType::Slice:
@ -768,28 +876,28 @@ namespace CNTK
RuntimeError("Failed to create computation node: Slice operation with inconsistent attributes");
}
// Internal CNTK SliceNode takes 1 based axis indices instead of 0 based
computationNodePtr = New<SliceNode<ElementType>>(network->GetDeviceId(), internalNodeName, beginIndex, endIndex, AsCNTKInternalAxisIdx(axis), strides);
ASSIGN_NEW_NODE(SliceNode, network->GetDeviceId(), internalNodeName, beginIndex, endIndex, AsCNTKInternalAxisIdx(axis), strides);
break;
}
case PrimitiveOpType::RandomSample:
{
auto numSamples = functionConfig[PrimitiveFunction::AttributeNameNumSamples].Value<size_t>();
auto allowDuplicates = functionConfig[PrimitiveFunction::AttributeNameAllowDuplicates].Value<bool>();
computationNodePtr = New<RandomSampleNode<ElementType>>(network->GetDeviceId(), internalNodeName, numSamples, allowDuplicates);
ASSIGN_NEW_NODE(RandomSampleNode, network->GetDeviceId(), internalNodeName, numSamples, allowDuplicates);
break;
}
case PrimitiveOpType::RandomSampleInclusionFrequency:
{
auto numSamples = functionConfig[PrimitiveFunction::AttributeNameNumSamples].Value<size_t>();
auto allowDuplicates = functionConfig[PrimitiveFunction::AttributeNameAllowDuplicates].Value<bool>();
computationNodePtr = New<RandomSampleInclusionFrequencyNode<ElementType>>(network->GetDeviceId(), internalNodeName, numSamples, allowDuplicates);
ASSIGN_NEW_NODE(RandomSampleInclusionFrequencyNode, network->GetDeviceId(), internalNodeName, numSamples, allowDuplicates);
break;
}
case PrimitiveOpType::Dropout:
{
auto dropoutRate = functionConfig[PrimitiveFunction::AttributeNameDropoutRate].Value<double>();
computationNodePtr = New<DropoutNode<ElementType>>(network->GetDeviceId(), internalNodeName);
computationNodePtr->As<DropoutNode<ElementType>>()->SetDropoutRate(dropoutRate);
ASSIGN_NEW_NODE(DropoutNode, network->GetDeviceId(), internalNodeName);
SMART_NODE_INVOKE(DropoutNode, computationNodePtr, SetDropoutRate, dropoutRate);
break;
}
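SMART_NODE_INVOKE is not defined in this hunk; from its call sites it presumably downcasts the node to the matching element type and forwards the member call. A hedged sketch of such a helper, purely illustrative and not the macro this commit actually defines:

    // Illustrative only: dispatch a member call across the three element types.
    // The real SMART_NODE_INVOKE is defined elsewhere in this commit and may differ.
    #define SKETCH_NODE_INVOKE(nodeClass, nodePtr, func, ...)                         \
        do {                                                                          \
            if (auto fp = std::dynamic_pointer_cast<nodeClass<float>>(nodePtr))       \
                fp->func(__VA_ARGS__);                                                \
            else if (auto dp = std::dynamic_pointer_cast<nodeClass<double>>(nodePtr)) \
                dp->func(__VA_ARGS__);                                                \
            else if (auto hp = std::dynamic_pointer_cast<nodeClass<half>>(nodePtr))   \
                hp->func(__VA_ARGS__);                                                \
        } while (0)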
case PrimitiveOpType::RandomDistribution:
@ -805,11 +913,11 @@ namespace CNTK
if (functionConfig.Contains(PrimitiveFunction::AttributeNameNewShape))
{
auto shape = functionConfig[PrimitiveFunction::AttributeNameNewShape].Value<NDShape>();
computationNodePtr = New<RandomDistributionNode<ElementType>>(network->GetDeviceId(), internalNodeName, rvtype, randomDistributionArgs, AsTensorShape(shape));
ASSIGN_NEW_NODE(RandomDistributionNode, network->GetDeviceId(), internalNodeName, rvtype, randomDistributionArgs, AsTensorShape(shape));
}
else
computationNodePtr = New<RandomDistributionNode<ElementType>>(network->GetDeviceId(), internalNodeName, rvtype, randomDistributionArgs);
computationNodePtr->As<RandomDistributionNode<ElementType>>()->SetRngState(seed, offset);
ASSIGN_NEW_NODE(RandomDistributionNode, network->GetDeviceId(), internalNodeName, rvtype, randomDistributionArgs);
SMART_NODE_INVOKE(RandomDistributionNode, computationNodePtr, SetRngState, seed, offset);
break;
}
case PrimitiveOpType::Reshape:
@ -829,7 +937,7 @@ namespace CNTK
replacementShape[i] = 0;
}
computationNodePtr = New<ReshapeNode<ElementType>>(network->GetDeviceId(), internalNodeName, AsTensorShape(replacementShape), AsCNTKInternalAxisIdx(beginAxis), AsCNTKInternalAxisIdx(endAxis));
ASSIGN_NEW_NODE(ReshapeNode, network->GetDeviceId(), internalNodeName, AsTensorShape(replacementShape), AsCNTKInternalAxisIdx(beginAxis), AsCNTKInternalAxisIdx(endAxis));
break;
}
case PrimitiveOpType::Squeeze:
@ -853,7 +961,7 @@ namespace CNTK
PoolingType poolingType = (PoolingType)(functionConfig[PrimitiveFunction::AttributeNamePoolingType].Value<size_t>());
auto roiOutputShape = functionConfig[PrimitiveFunction::AttributeNameROIOutputShape].Value<NDShape>();
auto spatialScale = functionConfig[PrimitiveFunction::AttributeNameSpatialScale].Value<double>();
computationNodePtr = New<ROIPoolingNode<ElementType>>(network->GetDeviceId(), internalNodeName, AsCNTKPoolKind(poolingType), AsTensorShape(roiOutputShape), spatialScale);
ASSIGN_NEW_NODE(ROIPoolingNode, network->GetDeviceId(), internalNodeName, AsCNTKPoolKind(poolingType), AsTensorShape(roiOutputShape), spatialScale);
break;
}
case PrimitiveOpType::Pooling:
@ -874,7 +982,7 @@ namespace CNTK
{
includePad = functionConfig[PrimitiveFunction::AttributeNameIncludePad].Value<bool>();
}
computationNodePtr = New<PoolingNode<ElementType>>(network->GetDeviceId(), internalNodeName, AsCNTKPoolKind(poolingType), AsTensorShape(poolingWindowsShape), AsTensorShape(strides), autoPadding, AsTensorShape(lowerPad), AsTensorShape(upperPad), ceilOutDim, includePad, ImageLayoutKind::CHW);
ASSIGN_NEW_NODE(PoolingNode, network->GetDeviceId(), internalNodeName, AsCNTKPoolKind(poolingType), AsTensorShape(poolingWindowsShape), AsTensorShape(strides), autoPadding, AsTensorShape(lowerPad), AsTensorShape(upperPad), ceilOutDim, includePad, ImageLayoutKind::CHW);
break;
}
case PrimitiveOpType::Unpooling:
@ -885,77 +993,77 @@ namespace CNTK
auto upperPad = functionConfig[PrimitiveFunction::AttributeNameUpperPad].Value<NDShape>();
auto autoPadding = AsVector<bool>(functionConfig[PrimitiveFunction::AttributeNameAutoPadding].Value<std::vector<DictionaryValue>>());
//We only get here after validation so it is safe to assume unpooling is max
computationNodePtr = New<MaxUnpoolingNode<ElementType>>(network->GetDeviceId(), internalNodeName, AsTensorShape(unpoolingWindowShape), AsTensorShape(strides), autoPadding, AsTensorShape(lowerPad), AsTensorShape(upperPad), ImageLayoutKind::CHW);
ASSIGN_NEW_NODE(MaxUnpoolingNode, network->GetDeviceId(), internalNodeName, AsTensorShape(unpoolingWindowShape), AsTensorShape(strides), autoPadding, AsTensorShape(lowerPad), AsTensorShape(upperPad), ImageLayoutKind::CHW);
break;
}
case PrimitiveOpType::SumAll:
computationNodePtr = New<SumElementsNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(SumElementsNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::OneHot:
{
auto numClass = functionConfig[PrimitiveFunction::AttributeNameNumClass].Value<size_t>();
auto is_sparse = functionConfig[PrimitiveFunction::AttributeNameOneHotOutputSparse].Value<bool>();
auto axis = functionConfig[PrimitiveFunction::AttributeNameOneHotAxis].Value<Axis>();
computationNodePtr = New<OneHotNode<ElementType>>(network->GetDeviceId(), numClass, is_sparse, axis.StaticAxisIndex(), internalNodeName);
ASSIGN_NEW_NODE(OneHotNode, network->GetDeviceId(), numClass, is_sparse, axis.StaticAxisIndex(), internalNodeName);
break;
}
case PrimitiveOpType::Gather:
computationNodePtr = New<GatherNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(GatherNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::ToBatch:
{
computationNodePtr = New<ToBatchAxisNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(ToBatchAxisNode, network->GetDeviceId(), internalNodeName);
break;
}
case PrimitiveOpType::UnpackBatch:
{
computationNodePtr = New<UnpackBatchAxisNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(UnpackBatchAxisNode, network->GetDeviceId(), internalNodeName);
break;
}
case PrimitiveOpType::Plus:
computationNodePtr = New<PlusNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(PlusNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::LogPlus:
computationNodePtr = New<LogPlusNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(LogPlusNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Pow:
computationNodePtr = New<PowNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(PowNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Minus:
computationNodePtr = New<MinusNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(MinusNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::ElementTimes:
computationNodePtr = New<ElementTimesNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(ElementTimesNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Equal:
computationNodePtr = New<EqualNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(EqualNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::NotEqual:
computationNodePtr = New<NotEqualNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(NotEqualNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Less:
computationNodePtr = New<LessNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(LessNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::LessEqual:
computationNodePtr = New<LessEqualNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(LessEqualNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Greater:
computationNodePtr = New<GreaterNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(GreaterNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::GreaterEqual:
computationNodePtr = New<GreaterEqualNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(GreaterEqualNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Times:
{
size_t outputRank = functionConfig[PrimitiveFunction::AttributeNameOutputRank].Value<size_t>();
auto inferInputRankToMap = functionConfig[PrimitiveFunction::AttributeNameInferInputRankToMap].Value<int>();
computationNodePtr = New<TimesNode<ElementType>>(network->GetDeviceId(), internalNodeName, outputRank, inferInputRankToMap);
ASSIGN_NEW_NODE(TimesNode, network->GetDeviceId(), internalNodeName, outputRank, inferInputRankToMap);
break;
}
case PrimitiveOpType::TransposeTimes:
{
size_t outputRank = functionConfig[PrimitiveFunction::AttributeNameOutputRank].Value<size_t>();
computationNodePtr = New<TransposeTimesNode<ElementType>>(network->GetDeviceId(), internalNodeName, outputRank);
ASSIGN_NEW_NODE(TransposeTimesNode, network->GetDeviceId(), internalNodeName, outputRank);
break;
}
case PrimitiveOpType::Convolution:
@ -975,7 +1083,7 @@ namespace CNTK
if (functionConfig.Contains(PrimitiveFunction::AttributeNameOutputShape))
outputShape = functionConfig[PrimitiveFunction::AttributeNameOutputShape].Value<NDShape>();
auto maxTempMemSizeInSamples = functionConfig[PrimitiveFunction::AttributeNameMaxTempMemSizeInSamples].Value<size_t>();
computationNodePtr = New<ConvolutionNode<ElementType>>(network->GetDeviceId(), internalNodeName,
ASSIGN_NEW_NODE(ConvolutionNode, network->GetDeviceId(), internalNodeName,
AsTensorShape(kernelShape), AsTensorShape(outputMapCount), AsTensorShape(strides),
sharing, autoPadding, AsTensorShape(lowerPad), AsTensorShape(upperPad), transpose,
outputShape.IsUnknown() ? TensorShape(0) : AsTensorShape(outputShape),
@ -983,22 +1091,22 @@ namespace CNTK
break;
}
case PrimitiveOpType::CosDistance:
computationNodePtr = New<CosDistanceNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(CosDistanceNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::CosDistanceWithNegativeSamples:
computationNodePtr = New<CosDistanceWithNegativeSamplesNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(CosDistanceWithNegativeSamplesNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Logistic:
computationNodePtr = New<LogisticNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(LogisticNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::SquaredError:
computationNodePtr = New<SquareErrorNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(SquareErrorNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::CrossEntropyWithSoftmax:
computationNodePtr = New<CrossEntropyWithSoftmaxNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(CrossEntropyWithSoftmaxNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::ClassificationError:
computationNodePtr = New<ClassificationErrorNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(ClassificationErrorNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::EditDistanceError:
{
@ -1007,7 +1115,7 @@ namespace CNTK
auto insPen = functionConfig[PrimitiveFunction::AttributeNameInsertionPenalty].Value<float>();
auto squashInputs = functionConfig[PrimitiveFunction::AttributeNameSquashInputs].Value<bool>();
auto tokensToIgnore = AsVector<size_t>(functionConfig[PrimitiveFunction::AttributeNameTokensToIgnore].Value<std::vector<DictionaryValue>>());
computationNodePtr = New<EditDistanceErrorNode<ElementType>>(network->GetDeviceId(), internalNodeName, subPen, delPen, insPen, squashInputs, tokensToIgnore);
ASSIGN_NEW_NODE(EditDistanceErrorNode, network->GetDeviceId(), internalNodeName, subPen, delPen, insPen, squashInputs, tokensToIgnore);
break;
}
case PrimitiveOpType::LatticeSequenceWithSoftmax:
@ -1033,14 +1141,14 @@ namespace CNTK
{
auto delayContraint = functionConfig[PrimitiveFunction::AttributeNameDelayConstraint].Value<int>();
auto blankTokenId = functionConfig[PrimitiveFunction::AttributeNameBlankTokenId].Value<size_t>();
computationNodePtr = New<ForwardBackwardNode<ElementType>>(network->GetDeviceId(), internalNodeName, blankTokenId, delayContraint);
ASSIGN_NEW_NODE(ForwardBackwardNode, network->GetDeviceId(), internalNodeName, blankTokenId, delayContraint);
break;
}
case PrimitiveOpType::LambdaRank:
computationNodePtr = New<LambdaRankNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(LambdaRankNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::NDCG:
computationNodePtr = New<NDCG1EvalNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(NDCG1EvalNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::PastValue:
case PrimitiveOpType::FutureValue:
@ -1050,9 +1158,9 @@ namespace CNTK
size_t offset = primitiveFunction->Attributes()[PrimitiveFunction::AttributeNameOffset].Value<size_t>();
if (op == PrimitiveOpType::PastValue)
computationNodePtr = New<PastValueNode<ElementType>>(network->GetDeviceId(), internalNodeName, AsTensorShape(inputOperandVar.Shape()), offset);
ASSIGN_NEW_NODE(PastValueNode, network->GetDeviceId(), internalNodeName, AsTensorShape(inputOperandVar.Shape()), offset);
else
computationNodePtr = New<FutureValueNode<ElementType>>(network->GetDeviceId(), internalNodeName, AsTensorShape(inputOperandVar.Shape()), offset);
ASSIGN_NEW_NODE(FutureValueNode, network->GetDeviceId(), internalNodeName, AsTensorShape(inputOperandVar.Shape()), offset);
break;
}
@ -1080,7 +1188,7 @@ namespace CNTK
);
}
computationNodePtr = New<ReduceElementsNode<ElementType>>(network->GetDeviceId(), internalNodeName, reductionOpName, AsCNTKInternalAxisIdx(reductionAxis), keepDimensions);
ASSIGN_NEW_NODE(ReduceElementsNode, network->GetDeviceId(), internalNodeName, reductionOpName, AsCNTKInternalAxisIdx(reductionAxis), keepDimensions);
break;
}
case PrimitiveOpType::BatchNormalization:
@ -1097,7 +1205,7 @@ namespace CNTK
disableRegularization = functionConfig[PrimitiveFunction::AttributeNameDisableRegularization].Value<bool>();
}
computationNodePtr = New<BatchNormalizationNode<ElementType>>(network->GetDeviceId(), internalNodeName, spatial, normalizationTimeConstant, blendTimeConstant, epsilon, !useCuDNNEngine, disableRegularization, ImageLayoutKind::CHW);
ASSIGN_NEW_NODE(BatchNormalizationNode, network->GetDeviceId(), internalNodeName, spatial, normalizationTimeConstant, blendTimeConstant, epsilon, !useCuDNNEngine, disableRegularization, ImageLayoutKind::CHW);
break;
}
case PrimitiveOpType::Combine:
@ -1106,24 +1214,24 @@ namespace CNTK
computationNodePtr = variableToNodeMap[variable];
break;
case PrimitiveOpType::PackedIndex:
computationNodePtr = New<PackedIndexNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(PackedIndexNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::GatherPacked:
computationNodePtr = New<GatherPackedNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(GatherPackedNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::ScatterPacked:
computationNodePtr = New<ScatterPackedNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(ScatterPackedNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Clip:
computationNodePtr = New<ClipNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(ClipNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Select:
computationNodePtr = New<IfNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(IfNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Splice:
{
Axis spliceAxis = functionConfig[PrimitiveFunction::AttributeNameAxis].Value<Axis>();
computationNodePtr = New<RowStackNode<ElementType>>(network->GetDeviceId(), internalNodeName, AsCNTKInternalAxisIdx(spliceAxis));
ASSIGN_NEW_NODE(RowStackNode, network->GetDeviceId(), internalNodeName, AsCNTKInternalAxisIdx(spliceAxis));
break;
}
case PrimitiveOpType::Pad:
@ -1132,7 +1240,7 @@ namespace CNTK
auto foot = AsVector<size_t>(functionConfig[PrimitiveFunction::AttributeNamePaddingFoot].Value<std::vector<DictionaryValue>>());
auto mode = functionConfig[PrimitiveFunction::AttributeNamePaddingMode].Value<size_t>();
auto constantValue = functionConfig[PrimitiveFunction::AttributeNamePaddingConstantValue].Value<double>();
computationNodePtr = New<PaddingNode<ElementType>>(network->GetDeviceId(), internalNodeName, head, foot, (PaddingType)mode, (ElementType)constantValue);
ASSIGN_NEW_NODE(PaddingNode, network->GetDeviceId(), internalNodeName, head, foot, (PaddingType)mode, constantValue);
break;
}
case PrimitiveOpType::OptimizedRNNStack:
@ -1142,31 +1250,31 @@ namespace CNTK
auto hiddenSize = functionConfig[PrimitiveFunction::AttributeNameHiddenSize].Value<size_t>();
auto recurrentOp = functionConfig[PrimitiveFunction::AttributeNameRecurrentOp].Value<std::wstring>();
computationNodePtr = New<OptimizedRNNStackNode<ElementType>>(network->GetDeviceId(), internalNodeName, bidirectional, numLayers, hiddenSize, recurrentOp);
ASSIGN_NEW_NODE(OptimizedRNNStackNode, network->GetDeviceId(), internalNodeName, bidirectional, numLayers, hiddenSize, recurrentOp);
break;
}
case PrimitiveOpType::ReconcileDynamicAxis:
{
computationNodePtr = New<ReconcileDynamicAxisNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(ReconcileDynamicAxisNode, network->GetDeviceId(), internalNodeName);
break;
}
case PrimitiveOpType::LogSoftmax:
{
//This can be implemented as x => x - ReduceLogSum(x). How to do this here?
computationNodePtr = New<LogSoftmaxNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(LogSoftmaxNode, network->GetDeviceId(), internalNodeName);
break;
}
case PrimitiveOpType::Pass:
computationNodePtr = New<PassNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(PassNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::LabelsToGraph:
computationNodePtr = New<LabelsToGraphNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(LabelsToGraphNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::StopGradient:
computationNodePtr = New<StopGradientNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(StopGradientNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Assign:
computationNodePtr = New<AssignNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(AssignNode, network->GetDeviceId(), internalNodeName);
break;
case PrimitiveOpType::Crop:
if (functionInputs.size() == 2)
@ -1179,24 +1287,41 @@ namespace CNTK
{
CNTK::LogicError("Vector of crop offsets must have size 2.");
}
computationNodePtr = New<CropNode<ElementType>>(offsets[0], offsets[1], network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(CropNode, offsets[0], offsets[1], network->GetDeviceId(), internalNodeName);
}
else
{
// Crop with two inputs and automatic offset computation.
computationNodePtr = New<CropNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(CropNode, network->GetDeviceId(), internalNodeName);
}
}
else if (functionInputs.size() == 4)
{
// Crop with four inputs and automatic offset computation.
computationNodePtr = New<CropNode<ElementType>>(network->GetDeviceId(), internalNodeName);
ASSIGN_NEW_NODE(CropNode, network->GetDeviceId(), internalNodeName);
}
else
{
CNTK::LogicError("Crop node must have 2 or 4 node inputs.");
}
break;
case PrimitiveOpType::Cast:
{
DataType outputType = (DataType)functionConfig[PrimitiveFunction::AttributeNameNewDataType].Value<int>();
switch (outputType)
{
case DataType::Float:
ASSIGN_NEW_NODE2(CastNode, float, network->GetDeviceId(), internalNodeName);
break;
case DataType::Double:
ASSIGN_NEW_NODE2(CastNode, double, network->GetDeviceId(), internalNodeName);
break;
case DataType::Float16:
ASSIGN_NEW_NODE2(CastNode, half, network->GetDeviceId(), internalNodeName);
break;
}
break;
}
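For the Cast case the two-parameter helper matters: the first template argument (the output element type) comes from the stored NewDataType attribute, while the input element type is still chosen from inputNodeType. For example, ASSIGN_NEW_NODE2(CastNode, half, ...) expands to roughly the following (a sketch of the macro expansion; the CastNode template-parameter order is inferred from the macro definition above):

    if (inputNodeType == DataType::Float)
        computationNodePtr = New<CastNode<half, float>>(network->GetDeviceId(), internalNodeName);
    else if (inputNodeType == DataType::Double)
        computationNodePtr = New<CastNode<half, double>>(network->GetDeviceId(), internalNodeName);
    else if (inputNodeType == DataType::Float16)
        computationNodePtr = New<CastNode<half, half>>(network->GetDeviceId(), internalNodeName);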
default:
CNTK::LogicError("Specified op %S not yet supported", PrimitiveOpTypeName(op).c_str());
break;
@ -1224,7 +1349,7 @@ namespace CNTK
}
else
{
computationNodePtr = New<UserDefinedV2FunctionNode<ElementType>>(network->GetDeviceId(), internalNodeName, function->shared_from_this());
ASSIGN_NEW_NODE(UserDefinedV2FunctionNode, network->GetDeviceId(), internalNodeName, function->shared_from_this());
// For user defined functions, we only attach unique inputs in the internal computation network since, the UDF
// backward implementations directly compute aggregate gradient values for unique inputs
@ -1244,7 +1369,7 @@ namespace CNTK
while (outputs[i] != variable) i++;
assert(i < outputs.size());
computationNodePtr = New<OutputMultiplexerNode<ElementType>>(network->GetDeviceId(), CNTKInternalNodeNameFromUidAndName(variable.Uid(), variable.Name(), useMangledNamesForComputationNodes), i);
ASSIGN_NEW_NODE(OutputMultiplexerNode, network->GetDeviceId(), CNTKInternalNodeNameFromUidAndName(variable.Uid(), variable.Name(), useMangledNamesForComputationNodes), i);
inputNodesBasePtrs = { variableToNodeMap[outputs[0]] };
}
@ -1279,15 +1404,16 @@ namespace CNTK
}
// Create the nodes corresponding to the inputs
std::vector<std::shared_ptr<ComputationNode<ElementType>>> inputNodes;
std::vector<std::shared_ptr<ComputationNodeBase>> inputNodes;
for (auto& inputVar : functionInputs)
{
// If the inputVar is a constant and not the right DataType let's coerce it to the right type
if (inputVar.IsConstant() && (nonConstInputDataType != DataType::Unknown) && (inputVar.GetDataType() != nonConstInputDataType))
// except for FP16, where the mismatch is intentional (e.g. BatchNorm running stats need to stay FP32 even when the inputs are FP16)
if (inputVar.IsConstant() && (nonConstInputDataType != DataType::Unknown) && (nonConstInputDataType != DataType::Float16) && (inputVar.GetDataType() != nonConstInputDataType))
inputVar = Constant(inputVar).CloneAs(nonConstInputDataType);
auto baseNodePtr = GetNode(inputVar, network, builder, fullyDefinedArgumentsMap, variableToNodeMap, isVariableRootMap, inputsToExcludeGradientsFor, useMangledNamesForComputationNodes);
inputNodes.push_back((baseNodePtr != nullptr) ? baseNodePtr->template As<ComputationNode<ElementType>>()->shared_from_this() : nullptr);
inputNodes.push_back((baseNodePtr != nullptr) ? baseNodePtr : nullptr);
}
BlockFunction* blockFunction = dynamic_cast<BlockFunction*>(function);
@ -1303,7 +1429,7 @@ namespace CNTK
return GetNode(variable.BlockFunctionVariableMapping(), network, builder, fullyDefinedArgumentsMap, variableToNodeMap, isVariableRootMap, inputsToExcludeGradientsFor, useMangledNamesForComputationNodes);
}
else
computationNodePtr = CreateComputationNode(variable, function, inputNodes, network, variableToNodeMap, useMangledNamesForComputationNodes);
computationNodePtr = CreateComputationNode<ElementType>(variable, function, inputNodes, network, variableToNodeMap, useMangledNamesForComputationNodes);
PrimitiveFunction* primitiveFunction = dynamic_cast<PrimitiveFunction*>(function);
if (!primitiveFunction || (primitiveFunction->OpType() != PrimitiveOpType::Combine))
@ -1732,6 +1858,9 @@ namespace CNTK
case DataType::Double:
PopulateComputationNodeValue<double>({ argument, argumentValue }, argumentComputationNode, layoutsPopulated);
break;
case DataType::Float16:
PopulateComputationNodeValue<half>({ argument, argumentValue }, argumentComputationNode, layoutsPopulated);
break;
default:
LogicError("Function '%S' Forward: Unsupported DataType %s.", AsString().c_str(), DataTypeName(argumentValue->GetDataType()));
break;
@ -1775,6 +1904,9 @@ namespace CNTK
case DataType::Double:
PopulateComputationNodeGradient<double>(gradientVarValuePair, outputComputationNode);
break;
case DataType::Float16:
PopulateComputationNodeGradient<half>(gradientVarValuePair, outputComputationNode);
break;
default:
LogicError("Function '%S' Backward: Unsupported DataType %s.", AsString().c_str(), DataTypeName(gradientValue->GetDataType()));
break;
@ -1816,6 +1948,15 @@ namespace CNTK
nodeValue = Utils::GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(var, computationNode, matrix, layout);
break;
}
case DataType::Float16:
{
auto& matrix = getGradient ? computationNode->As<ComputationNode<half>>()->Gradient() : computationNode->As<ComputationNode<half>>()->Value();
if (varValue == nullptr)
nodeValue = MakeSharedObject<PackedValue>(varShape, var.DynamicAxes(), std::make_shared<Matrix<half>>(matrix.AsReference()), layout, /*readOnly =*/ false);
else
nodeValue = Utils::GetValueObjectFromCNTKImplMatrixAndMBLayout<half>(var, computationNode, matrix, layout);
break;
}
default:
CNTK::LogicError("CompositeFunction::Forward/Backward: Unsupported DataType %s", DataTypeName(var.GetDataType()));
break;
@ -1973,6 +2114,8 @@ namespace CNTK
GetComputationNetwork<float>(computeDevice, outputsToRetainBackwardStateFor, requestedOutputVariables, inputsToExcludeGradientsFor, true);
else if (dataType == DataType::Double)
GetComputationNetwork<double>(computeDevice, outputsToRetainBackwardStateFor, requestedOutputVariables, inputsToExcludeGradientsFor, true);
else if (dataType == DataType::Float16)
GetComputationNetwork<half>(computeDevice, outputsToRetainBackwardStateFor, requestedOutputVariables, inputsToExcludeGradientsFor, true);
else
InvalidArgument("Unsupported DataType %s", DataTypeName(dataType));


@ -286,7 +286,7 @@ namespace CNTK
template <typename ElementType>
static Microsoft::MSR::CNTK::ComputationNodeBasePtr CreateComputationNode(const Variable& variable,
Function* function,
const std::vector<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNode<ElementType>>>& inputNodes,
const std::vector<std::shared_ptr<Microsoft::MSR::CNTK::ComputationNodeBase>>& inputNodes,
Microsoft::MSR::CNTK::ComputationNetworkPtr& network,
std::unordered_map<Variable, Microsoft::MSR::CNTK::ComputationNodeBasePtr>& variableToNodeMap,
bool useMangledNamesForComputationNodes);
@ -359,6 +359,11 @@ namespace CNTK
refVar.IsParameter() ? Parameter(refVar).RecordValueUpdate() : Constant(refVar).RecordValueUpdate();
}
template <typename ElementType>
static Microsoft::MSR::CNTK::ComputationNodeBasePtr CreateLearnableParameterFromVariable(const Variable& variable, Microsoft::MSR::CNTK::ComputationNetworkBuilder<ElementType>& builder, const NDShape& shape, const std::wstring& name);
static void CastAssignNodeValue(Microsoft::MSR::CNTK::ComputationNodeBasePtr node, DataType dataType, std::shared_ptr<const Microsoft::MSR::CNTK::MatrixBase> matrix);
private:
// Set of all primitive functions in the graph underlying 'this' Function. Also keeps the primitive Function objects alive


@ -133,7 +133,7 @@ namespace CNTK
#endif
if (info.IsEmpty())
PrepaireZeroGradients(gradientValues, info);
PrepaireZeroGradients(gradientValues);
// sorts gradient buffers according to parameter uid, and perform sparse to dense conversion
// if !UseSparseGradientAggregationInDataParallelSGD()


@ -64,7 +64,9 @@ namespace CNTK
return viewPtr->WritableDataBuffer<float>();
if (viewPtr->GetDataType() == DataType::Double)
return viewPtr->WritableDataBuffer<double>();
if (viewPtr->GetDataType() == DataType::Float16)
return viewPtr->WritableDataBuffer<float16>();
LogicError("Unknown DataType");
return nullptr; // Make compiler happy.
}
@ -398,6 +400,11 @@ namespace CNTK
AllReduceData(static_cast<double*>(inputData), static_cast<double*>(outputData), numElements,
&allReduceRequests, (inputValue->Device() == DeviceDescriptor::CPUDevice()));
}
else if (dataType == DataType::Float16)
{
AllReduceDataHalf(static_cast<half*>(inputData), static_cast<half*>(outputData), numElements,
&allReduceRequests, (inputValue->Device() == DeviceDescriptor::CPUDevice()));
}
else
LogicError("MPICommunicator: Unknown DataType.");
}
@ -581,8 +588,10 @@ namespace CNTK
if (sbc->GetDataType() == DataType::Float)
AllReduceData<float>((float*)nz, (float*)nz, requiredElements, nullptr, aggregateOnCPU, MPI_SUM, true);
else
else if (sbc->GetDataType() == DataType::Double)
AllReduceData<double>((double*)nz, (double*)nz, requiredElements, nullptr, aggregateOnCPU, MPI_SUM, true);
else if (sbc->GetDataType() == DataType::Float16)
AllReduceDataHalf((half*)nz, (half*)nz, requiredElements, nullptr, aggregateOnCPU, MPI_SUM, true);
if (aggregateOnCPU)
{
@ -719,4 +728,17 @@ namespace CNTK
else
m_mpi->AllReduceAsync(inputData, outputData, numElements, &(pAllReduceRequests->back()), op);
}
void MPICommunicatorImpl::AllReduceDataHalf(half* inputData, half* outputData, size_t numElements, std::vector<MPI_Request>* pAllReduceRequests, bool dataOnCPU, MPI_Op op, bool forceSync)
{
if (m_nccl->IsSupported() && !dataOnCPU)
{
m_nccl->AllReduce(inputData, outputData, numElements, op);
return;
}
// Half-precision aggregation is only supported through NCCL.
NOT_IMPLEMENTED;
}
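So FP16 gradients only aggregate when NCCL handles the transfer; every other path lands on NOT_IMPLEMENTED. If a CPU/MPI fallback were ever wanted, one possible approach, sketched here under the assumption that widening copies are acceptable and that the half type converts to and from float (this is not what the commit implements), would be to reuse the existing templated float path:

    // Hypothetical fallback, not part of this commit: widen half -> float, run the
    // existing templated all-reduce synchronously, then narrow back. Requires <vector>.
    std::vector<float> in(numElements), out(numElements);
    for (size_t i = 0; i < numElements; ++i)
        in[i] = (float)inputData[i];
    AllReduceData<float>(in.data(), out.data(), numElements, nullptr, dataOnCPU, op, /*forceSync=*/true);
    for (size_t i = 0; i < numElements; ++i)
        outputData[i] = (half)out[i];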
}


@ -143,5 +143,7 @@ namespace CNTK
template <typename ElemType>
void AllReduceData(ElemType* inputData, ElemType* outputData, size_t numElements, std::vector<MPI_Request>* pAllReduceRequests, bool dataOnCPU, MPI_Op op = MPI_SUM, bool forceSync = false);
void AllReduceDataHalf(half* inputData, half* outputData, size_t numElements, std::vector<MPI_Request>* pAllReduceRequests, bool dataOnCPU, MPI_Op op = MPI_SUM, bool forceSync = false);
};
}


@ -36,7 +36,7 @@ namespace CNTK
m_sampleCount = checkpoint[L"totalNumberOfSamplesSeen"].Value<size_t>();
}
void DistributedLearnerBase::PrepaireZeroGradients(std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, MinibatchInfo& info)
void DistributedLearnerBase::PrepaireZeroGradients(std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues)
{
// Need to initialize gradients to 0 in case when it is an empty minibatch.
for (auto& g : gradientValues)
@ -44,10 +44,6 @@ namespace CNTK
auto weights = g.first.Value();
g.second = MakeSharedObject<NDArrayView>(0, weights->GetDataType(), weights->Shape(), weights->Device());
}
auto dataType = gradientValues.begin()->first.GetDataType();
info.evalCriterionValue = MakeSharedObject<NDArrayView>(0, dataType, NDShape{}, DeviceDescriptor::UseDefaultDevice());
info.trainingLossValue = MakeSharedObject<NDArrayView>(0, dataType, NDShape{}, DeviceDescriptor::UseDefaultDevice());
}
void DistributedLearnerBase::ConvertToOrdered(const std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, std::vector<std::pair<Parameter, NDArrayViewPtr>>& result, std::unordered_map<Parameter, NDArrayViewPtr>* convertedGradientValues)


@ -22,7 +22,7 @@ namespace CNTK
protected:
DistributedLearnerBase(DistributedCommunicatorPtr communicator, LearnerPtr learner, size_t distributeAfterSamples, bool convertSparseToDense=true);
static void PrepaireZeroGradients(std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, MinibatchInfo& info);
static void PrepaireZeroGradients(std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues);
void ConvertToOrdered(const std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, std::vector<std::pair<Parameter, NDArrayViewPtr>>& result, std::unordered_map<Parameter, NDArrayViewPtr>* convertedGradientValues = nullptr);
std::vector<std::pair<Parameter, NDArrayViewPtr>> m_gradientBuffer;


@ -44,7 +44,7 @@ namespace CNTK
m_aggregatedEvaluationFunction = m_evaluationFunction;
m_testSampleCountVar = m_evaluationFunction->RootFunction()->Inputs()[0];
}
if(initializeCombined)
m_combinedEvalFunction = Combine(GetCombinedEvalFunctionArgs());
}


@ -2098,14 +2098,14 @@ namespace CNTK
FunctionPtr classificationErrorComposite;
if (axis == Axis(0))
classificationErrorComposite = Minus(Constant::Scalar(1.0f), TransposeTimes(labelPlaceholder, Hardmax(predictionPlaceholder)));
classificationErrorComposite = Minus(Constant::Scalar(prediction.GetDataType(), 1.0), TransposeTimes(labelPlaceholder, Hardmax(predictionPlaceholder)));
else
{
auto axMax = ReduceMax(predictionPlaceholder, axis);
auto pred = Equal(predictionPlaceholder, axMax);
auto wrongPred = NotEqual(labelPlaceholder, pred);
auto axErr = ReduceSum(wrongPred, axis);
auto capErr = GreaterEqual(axErr, Constant::Scalar(1.0f));
auto capErr = GreaterEqual(axErr, Constant::Scalar(prediction.GetDataType(), 1.0));
classificationErrorComposite = ReduceMean(capErr, Axis::AllStaticAxes());
}
@ -2938,6 +2938,17 @@ namespace CNTK
operands, Dictionary(), name), name);
}
FunctionPtr Cast(const Variable& nodeInput, DataType outputType, const std::wstring& name)
{
std::vector<Variable> operands = { nodeInput };
Dictionary additionalAttributes;
additionalAttributes.Add(
PrimitiveFunction::AttributeNameNewDataType, static_cast<int>(outputType));
return AsComposite(MakeSharedObject<PrimitiveFunction>(
PrimitiveOpType::Cast,
operands, std::move(additionalAttributes), name), name);
}
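A quick usage sketch of the new factory from the C++ API; the variables z and labels below are placeholders for an FP16 model output and its labels, and the typical pattern is to cast up to FP32 before a loss-style reduction:

    // Illustrative only: cast FP16 outputs up to FP32 before computing a criterion.
    auto z32      = CNTK::Cast(z, CNTK::DataType::Float, L"z32");
    auto labels32 = CNTK::Cast(labels, CNTK::DataType::Float, L"labels32");
    auto loss     = CNTK::CrossEntropyWithSoftmax(z32, labels32);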
namespace Internal
{
FunctionPtr IsWithin(const Variable& operand, int offset, const std::wstring& name)


@ -10,7 +10,7 @@
#include "Serialization.h"
#define DISPATCH_TO_TYPED_UPDATE_FUNCTION \
switch (smoothedGradientValue->GetDataType()) \
switch (gradientValue->GetDataType()) \
{ \
case DataType::Float: \
Update<float>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount); \
@ -18,6 +18,9 @@
case DataType::Double: \
Update<double>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount); \
break; \
case DataType::Float16: \
Update<half>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount); \
break; \
default: \
NOT_IMPLEMENTED; \
}
@ -203,10 +206,10 @@ namespace CNTK
LearnerBase::LearnerBase(const vector<Parameter>& parameters,
const LearningRateSchedule& learningRateSchedule,
AdditionalLearningOptions additionalOptions,
bool allocateSmoothGradients /* = true */)
AdditionalLearningOptions additionalOptions)
: Learner(parameters, learningRateSchedule, additionalOptions),
m_noiseInjectionSeed(Internal::GenerateRandomSeed())
m_noiseInjectionSeed(Internal::GenerateRandomSeed()),
m_masterParameterUpdated(false)
{
if (parameters.empty())
InvalidArgument("The parameters list specified to a Learner must not be empty.");
@ -215,22 +218,40 @@ namespace CNTK
if (uniqueParameters.size() != parameters.size())
InvalidArgument("Learner's parameters list must not contain duplicates.");
}
if (allocateSmoothGradients)
void LearnerBase::AllocateSmoothedGradients(const std::vector<Parameter>& parameters, size_t factor, size_t fp16Factor)
{
for (const auto& parameter : parameters)
{
for (const auto& parameter : parameters)
{
NDArrayViewPtr view = AllocateNDArrayView(parameter, parameter.Shape());
m_smoothedGradientValues.emplace(parameter, view);
}
NDArrayViewPtr view = AllocateSmoothedGradientFor(parameter, factor, fp16Factor);
m_smoothedGradientValues.emplace(parameter, view);
}
}
/*static*/ NDArrayViewPtr LearnerBase::AllocateNDArrayView(const Parameter& parameter, const NDShape& shape)
/*static*/ NDArrayViewPtr LearnerBase::AllocateSmoothedGradientFor(const Parameter& parameter, size_t factor, size_t fp16Factor)
{
if (parameter.GetDataType() == DataType::Float)
// A float16 parameter needs an extra buffer slice for the FP32 master copy of its weights.
if (parameter.GetDataType() == DataType::Float16) factor += fp16Factor;
const auto paramShape = GetMatrixShape(parameter);
NDShape shape;
if (factor == 0)
{
return MakeSharedObject<NDArrayView>(float(0.0), shape, parameter.Value()->Device());
shape = NDShape({});
}
else
{
if (factor == 1)
shape = parameter.Shape();
else
shape = NDShape({ paramShape[0], factor * paramShape[1] });
}
if (parameter.GetDataType() != DataType::Double)
{
// Both float and half parameters keep their smoothed gradients in float.
return MakeSharedObject<NDArrayView>(0.0f, shape, parameter.Value()->Device());
}
else
{
@ -245,11 +266,16 @@ namespace CNTK
auto matrix = GetMatrix<float>(parameter.Value());
return{ matrix->GetNumRows(), matrix->GetNumCols() };
}
else
else if (parameter.GetDataType() == DataType::Double)
{
auto matrix = GetMatrix<double>(parameter.Value());
return{ matrix->GetNumRows(), matrix->GetNumCols() };
}
else
{
auto matrix = GetMatrix<half>(parameter.Value());
return{ matrix->GetNumRows(), matrix->GetNumCols() };
}
}
/*virtual*/ bool LearnerBase::Update(unordered_map<Parameter, NDArrayViewPtr>& gradientValues, size_t trainingSampleCount, bool sweepEnd) /*override*/
@ -267,10 +293,22 @@ namespace CNTK
UpdateOnMinibatch(trainingSampleCount);
bool needUpdateMasterParameter = !m_masterParameterUpdated;
for (const auto& parameter : Parameters())
{
const auto& smoothedGradientValue = m_smoothedGradientValues.at(parameter);
const auto& gradientValue = gradientValues.at(parameter);
if (needUpdateMasterParameter && parameter.GetDataType() == DataType::Float16)
{
// convert fp16 parameter to fp32
auto sg = smoothedGradientValue->GetWritableMatrix<float>();
auto pv16 = parameter.Value()->GetWritableMatrix<half>();
size_t factor = sg->GetNumCols() / pv16->GetNumCols();
auto pv = sg->ColumnSlice(pv16->GetNumCols() * (factor - 1), pv16->GetNumCols());
pv.CastAssignValuesOf(*pv16);
}
// TODO: make this a runtime parameter.
#if DUMPOUTPUT
LOGPRINTF(stderr, "Update_%ls\n", parameter.Uid().c_str());
@ -303,6 +341,11 @@ namespace CNTK
LogicError("%ls has NaNs in parameter values after parameter update.", parameter.Uid().c_str());
#endif
}
if (needUpdateMasterParameter)
{
m_masterParameterUpdated = true;
}
m_sampleCount += trainingSampleCount;
m_minibatchCount++;
if (sweepEnd)
@ -319,7 +362,19 @@ namespace CNTK
{
const auto& parameterValue = parameter.Value();
PreProcess<ElementType>(parameterValue, gradientValue, trainingSampleCount);
Update(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
if (parameter.GetDataType() == DataType::Float16)
{
// convert fp32 parameter to fp16 after update
auto sg = smoothedGradientValue->GetWritableMatrix<float>();
auto pv16 = parameterValue->GetWritableMatrix<half>();
size_t factor = sg->GetNumCols() / pv16->GetNumCols();
auto pv = sg->ColumnSlice(pv16->GetNumCols() * (factor - 1), pv16->GetNumCols());
pv16->CastAssignValuesOf(pv);
}
PostProcess<ElementType>(parameter, gradientValue, trainingSampleCount);
auto paramRef = parameter;
@ -344,6 +399,7 @@ namespace CNTK
checkpoint[sweepCountKey] = m_sweepCount;
checkpoint[learningRateScheduleKey] = m_learningRateSchedule.Serialize();
checkpoint[noiseInjectionSeedKey] = m_noiseInjectionSeed;
checkpoint[masterParameterUpdatedKey] = m_masterParameterUpdated;
// TODO: should we also save momentum schedule into the checkpoint?
// If that is the case, need to be able to override this method in subclasses.
@ -390,6 +446,11 @@ namespace CNTK
m_noiseInjectionSeed = checkpoint[noiseInjectionSeedKey].Value<size_t>();
}
if (checkpoint.Contains(masterParameterUpdatedKey))
{
m_masterParameterUpdated = checkpoint[masterParameterUpdatedKey].Value<bool>();
}
// TODO: which learning rate schedule should take precedence here?
// The one given at construction time or the one loaded from a checkpoint?
m_learningRateSchedule = TrainingParameterSchedule<double>::Deserialize(checkpoint[learningRateScheduleKey].Value<Dictionary>());
@ -464,23 +525,6 @@ namespace CNTK
}
}
LearnerSGD::LearnerSGD(const std::vector<Parameter>& parameters,
const LearningRateSchedule& learningRateSchedule,
AdditionalLearningOptions additionalOptions,
bool allocateSmoothGradients)
: LearnerBase(parameters, learningRateSchedule, additionalOptions, allocateSmoothGradients)
{
if (!allocateSmoothGradients)
{
// the vanilla sgd does not need the smooth gradients per se,
// insert dummy nd views instead.
for (const auto& parameter : parameters)
{
m_smoothedGradientValues.emplace(parameter, AllocateNDArrayView(parameter, {}));
}
}
}
/*virtual*/ void LearnerSGD::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) /*override*/
{
@ -514,7 +558,20 @@ namespace CNTK
{
ReportTrainingParameterValue(m_momentumSchedule, L"Momentum");
DISPATCH_TO_TYPED_UPDATE_FUNCTION;
switch (gradientValue->GetDataType())
{
case DataType::Float:
Update<float>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
break;
case DataType::Double:
Update<double>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
break;
case DataType::Float16:
UpdateHalf(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
break;
default:
NOT_IMPLEMENTED;
}
}
template <typename ElementType>
@ -555,10 +612,42 @@ namespace CNTK
learningRate, momentum, unitGainFactor);
}
void LearnerMomentumSGD::UpdateHalf(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
{
const auto& compoundMatrix = GetWritableMatrix<float>(smoothedGradientValue);
const auto& gradientMatrix = GetWritableMatrix<half>(gradientValue);
auto smoothedGradientMatrix = compoundMatrix->ColumnSlice(0, gradientMatrix->GetNumCols());
auto tempGradientMatrix = compoundMatrix->ColumnSlice(gradientMatrix->GetNumCols(), gradientMatrix->GetNumCols());
auto parameterMatrix = compoundMatrix->ColumnSlice(2 * gradientMatrix->GetNumCols(), gradientMatrix->GetNumCols());
tempGradientMatrix.CastAssignValuesOf(*gradientMatrix);
const auto learningRate = float(LearningRate(trainingSampleCount));
const auto momentum = float(MomentumValueForMB(trainingSampleCount));
const auto unitGainFactor = UnitGainFactor<float>(trainingSampleCount);
parameterMatrix.MomentumSGDUpdate(tempGradientMatrix, smoothedGradientMatrix,
learningRate, momentum, unitGainFactor);
}
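UpdateHalf spells out the memory layout behind the FP16 path: for a half parameter whose gradient has N columns, the float buffer returned by AllocateSmoothedGradientFor is carved into three N-column slices (the extra slices come from the fp16Factor added above). Roughly, as implied by the ColumnSlice calls in this function, not code taken from the commit:

    const auto N = gradientMatrix->GetNumCols();
    // columns [0,  N)  : smoothed (momentum) gradient, kept in float
    // columns [N, 2N)  : scratch copy of the incoming half gradient, widened to float
    // columns [2N,3N)  : FP32 master copy of the weights, cast back to half after the update
    auto smoothed = compoundMatrix->ColumnSlice(0,     N);
    auto scratch  = compoundMatrix->ColumnSlice(N,     N);
    auto master   = compoundMatrix->ColumnSlice(2 * N, N);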
/*virtual*/ void LearnerNesterov::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) /*override*/
{
DISPATCH_TO_TYPED_UPDATE_FUNCTION;
switch (gradientValue->GetDataType())
{
case DataType::Float:
Update<float>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
break;
case DataType::Double:
Update<double>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
break;
case DataType::Float16:
UpdateHalf(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
break;
default:
NOT_IMPLEMENTED;
}
}
template <typename ElementType>
@ -575,11 +664,30 @@ namespace CNTK
learningRate, momentum, unitGainFactor);
}
void LearnerNesterov::UpdateHalf(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const
{
const auto& compoundMatrix = GetWritableMatrix<float>(smoothedGradientValue);
const auto& gradientMatrix = GetWritableMatrix<half>(gradientValue);
auto smoothedGradientMatrix = compoundMatrix->ColumnSlice(0, gradientMatrix->GetNumCols());
auto tempGradientMatrix = compoundMatrix->ColumnSlice(gradientMatrix->GetNumCols(), gradientMatrix->GetNumCols());
auto parameterMatrix = compoundMatrix->ColumnSlice(2 * gradientMatrix->GetNumCols(), gradientMatrix->GetNumCols());
tempGradientMatrix.CastAssignValuesOf(*gradientMatrix);
const auto learningRate = float(LearningRate(trainingSampleCount));
const auto momentum = float(MomentumValueForMB(trainingSampleCount));
const auto unitGainFactor = UnitGainFactor<float>(trainingSampleCount);
parameterMatrix.NesterovAcceleratedMomentumSGDUpdate(tempGradientMatrix, smoothedGradientMatrix,
learningRate, momentum, unitGainFactor);
}
LearnerAdaGrad::LearnerAdaGrad(const std::vector<Parameter>& parameters,
const LearningRateSchedule& learningRateSchedule,
bool needAveMultiplier,
AdditionalLearningOptions additionalOptions)
: LearnerBase(parameters, learningRateSchedule, additionalOptions, /*allocateSmoothGradients*/ false),
: LearnerBase(parameters, learningRateSchedule, additionalOptions),
m_needAveMultiplier(needAveMultiplier)
{
for (const auto& parameter : parameters)
@ -591,8 +699,7 @@ namespace CNTK
factor = 2;
}
const auto shape = GetMatrixShape(parameter);
NDArrayViewPtr view = AllocateNDArrayView(parameter, { shape[0], factor * shape[1] });
NDArrayViewPtr view = AllocateSmoothedGradientFor(parameter, factor);
m_smoothedGradientValues.emplace(parameter, view);
}
@ -621,21 +728,29 @@ namespace CNTK
const LearningRateSchedule& learningRateSchedule,
double rho, double epsilon,
AdditionalLearningOptions additionalOptions)
: LearnerBase(parameters, learningRateSchedule, additionalOptions, /*allocateSmoothGradients*/ false),
: LearnerBase(parameters, learningRateSchedule, additionalOptions),
m_rho(rho), m_epsilon(epsilon)
{
for (const auto& parameter : parameters)
{
const auto shape = GetMatrixShape(parameter);
NDArrayViewPtr view = AllocateNDArrayView(parameter, { shape[0], 2 * shape[1] });
m_smoothedGradientValues.emplace(parameter, view);
}
AllocateSmoothedGradients(parameters, 2);
}
/*virtual*/ void LearnerAdaDelta::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) /*override*/
{
DISPATCH_TO_TYPED_UPDATE_FUNCTION;
switch (gradientValue->GetDataType())
{
case DataType::Float:
Update<float, float>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
break;
case DataType::Double:
Update<double, double>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
break;
case DataType::Float16:
Update<half, float>(parameter, gradientValue, smoothedGradientValue, trainingSampleCount);
break;
default:
NOT_IMPLEMENTED;
}
}
// When the gradients are sparse, we update the corresponding internal buffers of adadelta in a sparse way
@ -645,11 +760,16 @@ namespace CNTK
// TODO: consider exposing this somehow so that it is easy to test by setting it to small value.
/* static */ const int LearnerAdaDelta::s_SyncInterval = 1 << 20;
template <typename ElementType>
template <typename GradType, typename AccumType>
void LearnerAdaDelta::Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue,
const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount)
{
GET_WRITABLE_MATRICES
const auto& gradientMatrix = GetWritableMatrix<GradType>(gradientValue);
const auto& smoothedGradientMatrix = GetWritableMatrix<AccumType>(smoothedGradientValue);
// For fp16 gradients the parameter is accumulated in fp32 in the master copy (allocated as the last slice of smoothedGradient).
auto parameterMatrix = (std::is_same<GradType, half>::value) ?
smoothedGradientMatrix->ColumnSlice(smoothedGradientMatrix->GetNumCols() - gradientMatrix->GetNumCols(), gradientMatrix->GetNumCols()) :
GetWritableMatrix<AccumType>(parameter.Value())->ColumnSlice(0, gradientMatrix->GetNumCols());
const auto learningRate = LearningRate(trainingSampleCount);
@ -685,14 +805,14 @@ namespace CNTK
if (currentTimestamp >= LearnerAdaDelta::s_SyncInterval)
{
// Once in a while sync the state and reset the timestamps and current time to 0
smoothedGradientMatrix->AdaDeltaFlushState(numCols, (ElementType)m_rho, timestamps, currentTimestamp);
smoothedGradientMatrix->AdaDeltaFlushState(numCols, (AccumType)m_rho, timestamps, currentTimestamp);
m_currentTime[parameter] = currentTimestamp = 0;
}
currentTimestamp += 1;
m_currentTime[parameter] = currentTimestamp;
}
smoothedGradientMatrix->AdaDeltaUpdate(*gradientMatrix, *parameterMatrix, (ElementType)learningRate, (ElementType)m_rho, (ElementType)m_epsilon, timestamps, currentTimestamp);
smoothedGradientMatrix->template AdaDeltaUpdate<GradType>(*gradientMatrix, parameterMatrix, (AccumType)learningRate, (AccumType)m_rho, (AccumType)m_epsilon, timestamps, currentTimestamp);
}
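The AdaDelta kernel is now parameterized on two element types: GradType is the precision the gradient arrives in, AccumType the precision of the accumulators (and, for half, of the master weights), so the half case is instantiated as Update<half, float>. A tiny self-contained sketch of the same mixed-precision accumulation idea (not the AdaDelta rule itself):

    #include <cstddef>

    // Keep running state in AccumType even when gradients arrive in a narrower GradType.
    template <typename GradType, typename AccumType>
    void ExpMovingAverage(AccumType* state, const GradType* grad, std::size_t n, AccumType rho)
    {
        for (std::size_t i = 0; i < n; ++i)
            state[i] = rho * state[i] + (AccumType(1) - rho) * (AccumType)grad[i];
    }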
/*virtual*/ Dictionary LearnerAdaDelta::CreateCheckpoint() /*override*/
@ -713,12 +833,15 @@ namespace CNTK
const auto& smoothedGradientMatrix = GetWritableMatrix<float>(smoothedGradientValue);
smoothedGradientMatrix->AdaDeltaFlushState(numCols, (float)m_rho, timestamps, currentTimestamp);
}
else
else if (parameter.GetDataType() == CNTK::DataType::Double)
{
const auto numCols = GetMatrix<double>(parameter.Value())->GetNumCols();
const auto& smoothedGradientMatrix = GetWritableMatrix<double>(smoothedGradientValue);
smoothedGradientMatrix->AdaDeltaFlushState(numCols, (double)m_rho, timestamps, currentTimestamp);
}
else
LogicError("Unexpected parameter data type");
m_currentTime[parameter] = 0;
}
return LearnerBase::CreateCheckpoint();
@ -748,16 +871,10 @@ namespace CNTK
const MomentumSchedule& varianceMomentumSchedule,
AdditionalLearningOptions additionalOptions)
: LearnerMomentumSGD(parameters, learningRateSchedule, momentumSchedule,
unitGain, additionalOptions, /*allocateSmoothGradients*/ false),
unitGain, additionalOptions, 2),
m_varianceMomentumSchedule(varianceMomentumSchedule),
m_smoothedCount(0.0)
{
for (const auto& parameter : parameters)
{
const auto shape = GetMatrixShape(parameter);
NDArrayViewPtr view = AllocateNDArrayView(parameter, { shape[0], 2 * shape[1] });
m_smoothedGradientValues.emplace(parameter, view);
}
}
/*virtual*/ Dictionary LearnerFSAdaGrad::CreateCheckpoint() /*override*/
@ -823,7 +940,7 @@ namespace CNTK
bool adamax,
AdditionalLearningOptions additionalOptions)
: LearnerMomentumSGD(parameters, learningRateSchedule, momentumSchedule,
unitGain, additionalOptions, /*allocateSmoothGradients*/ false),
unitGain, additionalOptions, 2),
m_varianceMomentumSchedule(varianceMomentumSchedule), m_epsilon(epsilon),
m_adamax(adamax)
{
@ -833,12 +950,8 @@ namespace CNTK
InvalidArgument("Epsilon should be non-negative. You are trying to set it to %g.", m_epsilon);
}
for (const auto& parameter : parameters)
{
const auto shape = GetMatrixShape(parameter);
NDArrayViewPtr view = AllocateNDArrayView(parameter, {shape[0], 2 * shape[1]});
m_smoothedGradientValues.emplace(parameter, view);
}
AllocateSmoothedGradients(parameters, 2);
m_smoothedCount = 0.0;
}
@ -893,7 +1006,7 @@ namespace CNTK
double gamma, double inc, double dec, double max, double min,
bool needAveMultiplier,
AdditionalLearningOptions additionalOptions)
: LearnerBase(parameters, learningRateSchedule, additionalOptions, /*allocateSmoothGradients*/ false),
: LearnerBase(parameters, learningRateSchedule, additionalOptions),
m_gamma(gamma), m_inc(inc), m_dec(dec), m_max(max), m_min(min), m_needAveMultiplier(needAveMultiplier)
{
// validation of learner settings
@ -922,7 +1035,7 @@ namespace CNTK
}
const auto shape = GetMatrixShape(parameter);
NDArrayViewPtr view = AllocateNDArrayView(parameter, { shape[0], factor * shape[1] });
NDArrayViewPtr view = AllocateSmoothedGradientFor(parameter, factor);
m_smoothedGradientValues.emplace(parameter, view);
}
@ -996,7 +1109,7 @@ namespace CNTK
bool unitGain,
AdditionalLearningOptions additionalOptions /*= AdditionalLearningOptions()*/)
{
return MakeSharedObject<LearnerMomentumSGD>(parameters, learningRateSchedule, momentumSchedule, unitGain, additionalOptions);
return MakeSharedObject<LearnerMomentumSGD>(parameters, learningRateSchedule, momentumSchedule, unitGain, additionalOptions, 1);
}
LearnerPtr NesterovLearner(const vector<Parameter>& parameters,
@ -1059,7 +1172,7 @@ namespace CNTK
LearnerUniversal::LearnerUniversal(const std::vector<Parameter>& parameters, const ParameterUpdateFunctor& func)
: LearnerBase(parameters, LearningRateSchedule(1.0), AdditionalLearningOptions(), /*allocateSmoothGradients*/ false)
: LearnerBase(parameters, LearningRateSchedule(1.0), AdditionalLearningOptions())
{
std::vector<Variable> gradients;
std::vector<FunctionPtr> functions;
@ -1083,7 +1196,7 @@ namespace CNTK
}
LearnerUniversal::LearnerUniversal(const std::vector<Parameter>& parameters, const std::vector<Variable>& gradients, FunctionPtr updateFunc)
: LearnerBase(parameters, LearningRateSchedule(1.0), AdditionalLearningOptions(), /*allocateSmoothGradients*/ false)
: LearnerBase(parameters, LearningRateSchedule(1.0), AdditionalLearningOptions())
{
ValidateInput(parameters, gradients, updateFunc);
}
@ -1107,7 +1220,7 @@ namespace CNTK
fprintf(stderr, "WARNING: Update function does not contain the gradient for parameter %ls in its computation\n", param.AsString().c_str());
m_parameter_gradient_map.insert({parameters[i], gradients[i]});
}
AllocateDummySmoothedGradients(parameters);
AllocateSmoothedGradients(parameters, 0);
m_update_func = updateFunc;
}


@ -30,14 +30,11 @@ namespace CNTK
virtual void ResetSmoothedGradients() override;
protected:
// allocateSmoothGradients flag specifies whether NDArrayViews for smoothed gradients can be allocated
// in the base class constructor (in which case they are allocated with the shapes identical to the shapes of
// the corresponding parameters) or if the allocation should be deferred to the subclass constructor (which
// performs allocation that is specific to the particular learner, see FSAdaGrad and RMSProp).
LearnerBase(const std::vector<Parameter>& parameters,
const LearningRateSchedule& learningRateSchedule,
AdditionalLearningOptions additionalOptions,
bool allocateSmoothGradients = true);
AdditionalLearningOptions additionalOptions);
void AllocateSmoothedGradients(const std::vector<Parameter>& parameters, size_t factor, size_t fp16Factor = 1);
virtual void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) = 0;
@ -82,6 +79,8 @@ namespace CNTK
std::unordered_map<Parameter, NDArrayViewPtr> m_smoothedGradientValues;
bool m_masterParameterUpdated; // whether the master copy of parameters are updated
mutable size_t m_noiseInjectionSeed;
// The following four static protected methods expose private methods of NDArrayView class
@ -113,7 +112,7 @@ namespace CNTK
// Returns an NDArrayView with the required shape, with the same data type as parameter value
// and allocated on the same device.
static NDArrayViewPtr AllocateNDArrayView(const Parameter& parameter, const NDShape& shape);
static NDArrayViewPtr AllocateSmoothedGradientFor(const Parameter& parameter, size_t factor, size_t fp16Factor = 1);
// Retrieves the shape of the matrix corresponding to the parameter value.
static NDShape GetMatrixShape(const Parameter& parameter);
@ -142,8 +141,11 @@ namespace CNTK
public:
LearnerSGD(const std::vector<Parameter>& parameters,
const LearningRateSchedule& learningRateSchedule,
AdditionalLearningOptions additionalOptions,
bool allocateSmoothGradients = false);
AdditionalLearningOptions additionalOptions)
: LearnerBase(parameters, learningRateSchedule, additionalOptions)
{
AllocateSmoothedGradients(parameters, 0);
}
protected:
@ -162,11 +164,13 @@ namespace CNTK
const MomentumSchedule& momentumSchedule,
bool unitGain,
AdditionalLearningOptions additionalOptions,
bool allocateSmoothGradients = true)
: LearnerBase(parameters, learningRateSchedule, additionalOptions, allocateSmoothGradients),
size_t smoothGradientFactor)
: LearnerBase(parameters, learningRateSchedule, additionalOptions),
m_momentumSchedule(momentumSchedule),
m_unitGain(unitGain)
{ }
{
AllocateSmoothedGradients(parameters, smoothGradientFactor, 2);
}
// returns current per-minibatch momentum value.
virtual double MomentumValueForMB(size_t minibatchSize) const
@ -177,9 +181,11 @@ namespace CNTK
protected:
virtual void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) override;
template <typename ElementType>
template <typename ElemType>
void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;
void UpdateHalf(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;
// returns current per-minibatch momentum value from the provided schedule.
double MomentumValueForMB(const MomentumSchedule& schedule, size_t minibatchSize) const;
@ -216,7 +222,7 @@ namespace CNTK
const MomentumSchedule& momentumSchedule,
bool unitGain,
AdditionalLearningOptions additionalOptions)
: LearnerMomentumSGD(parameters, learningRateSchedule, momentumSchedule, unitGain, additionalOptions, /*allocateSmoothGradients*/ true)
: LearnerMomentumSGD(parameters, learningRateSchedule, momentumSchedule, unitGain, additionalOptions, 1)
{}
protected:
@ -224,6 +230,7 @@ namespace CNTK
template <typename ElementType>
void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;
void UpdateHalf(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) const;
};
class LearnerAdaGrad : public LearnerBase
@ -268,7 +275,7 @@ namespace CNTK
virtual void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount) override;
template <typename ElementType>
template <typename GradType, typename AccumType>
void Update(const Parameter& parameter, const NDArrayViewPtr& gradientValue, const NDArrayViewPtr& smoothedGradientValue, size_t trainingSampleCount);
virtual Dictionary CreateCheckpoint() override;
@ -404,14 +411,6 @@ namespace CNTK
virtual bool Update(std::unordered_map<Parameter, NDArrayViewPtr>& gradientValues, size_t trainingSampleCount, bool sweepEnd) override;
private:
void AllocateDummySmoothedGradients(const std::vector<Parameter>& parameters)
{
for (const auto& parameter : parameters)
{
m_smoothedGradientValues.emplace(parameter, AllocateNDArrayView(parameter, {}));
}
}
void ValidateInput(const std::vector<Parameter>& parameters, const std::vector<Variable>& gradients, FunctionPtr updateFunc);


@ -17,8 +17,20 @@ using namespace Microsoft::MSR::CNTK;
namespace CNTK
{
template <typename ElementType>
static TensorView<ElementType>* AllocateTensorView(const NDShape& viewShape,
template<typename ElemType>
inline ElemType quiet_NaN()
{
return std::numeric_limits<ElemType>::quiet_NaN();
}
template<>
inline float16 quiet_NaN<float16>()
{
return float16(std::numeric_limits<float>::quiet_NaN());
}
template <typename V1ElemType>
static TensorView<V1ElemType>* AllocateTensorView(const NDShape& viewShape,
const DeviceDescriptor& device,
void* dataBuffer,
size_t bufferSizeInBytes)
@ -26,13 +38,13 @@ namespace CNTK
if (dataBuffer == nullptr)
InvalidArgument("Cannot create a NDArrayView over a null data buffer.");
if (bufferSizeInBytes < (viewShape.TotalSize() * sizeof(ElementType)))
if (bufferSizeInBytes < (viewShape.TotalSize() * sizeof(V1ElemType)))
InvalidArgument("Size (%d) of the specified buffer for creating the NDArrayView is smaller than the specified view shape '%S'.",
(int)bufferSizeInBytes, viewShape.AsString().c_str());
auto matrixDims = GetMatrixDimensions(viewShape);
std::shared_ptr<Matrix<ElementType>> matrix = std::make_shared<Matrix<ElementType>>(matrixDims.first, matrixDims.second, (ElementType*)dataBuffer, AsCNTKImplDeviceId(device), matrixFlagDontOwnBuffer);
return new TensorView<ElementType>(matrix, AsTensorViewShape(viewShape));
std::shared_ptr<Matrix<V1ElemType>> matrix = std::make_shared<Matrix<V1ElemType>>(matrixDims.first, matrixDims.second, (V1ElemType*)dataBuffer, AsCNTKImplDeviceId(device), matrixFlagDontOwnBuffer);
return new TensorView<V1ElemType>(matrix, AsTensorViewShape(viewShape));
}
static void* AllocateTensorView(CNTK::DataType dataType,
@ -47,26 +59,28 @@ namespace CNTK
return AllocateTensorView<float>(viewShape, device, dataBuffer, bufferSizeInBytes);
case DataType::Double:
return AllocateTensorView<double>(viewShape, device, dataBuffer, bufferSizeInBytes);
case DataType::Float16:
return AllocateTensorView<half>(viewShape, device, dataBuffer, bufferSizeInBytes);
default:
LogicError("Unsupported DataType %s", DataTypeName(dataType));
break;
}
}
template <typename ElementType>
static TensorView<ElementType>* AllocateTensorView(const NDShape& viewShape,
CNTK::StorageFormat storageType,
const DeviceDescriptor& device,
size_t numNonZeroValues = 0)
template<typename V1ElemType>
static TensorView<V1ElemType>* AllocateTensorView(const NDShape& viewShape,
CNTK::StorageFormat storageType,
const DeviceDescriptor& device,
size_t numNonZeroValues = 0)
{
auto matrixDims = GetMatrixDimensions(viewShape);
std::shared_ptr<Matrix<ElementType>> matrix = std::make_shared<Matrix<ElementType>>(matrixDims.first,
matrixDims.second,
AsCNTKImplDeviceId(device),
IsSparseStorageFormat(storageType) ? MatrixType::SPARSE : MatrixType::DENSE,
AsCNTKImplMatrixFormat(storageType),
numNonZeroValues);
return new TensorView<ElementType>(matrix, AsTensorViewShape(viewShape));
std::shared_ptr<Matrix<V1ElemType>> matrix = std::make_shared<Matrix<V1ElemType>>(matrixDims.first,
matrixDims.second,
AsCNTKImplDeviceId(device),
IsSparseStorageFormat(storageType) ? MatrixType::SPARSE : MatrixType::DENSE,
AsCNTKImplMatrixFormat(storageType),
numNonZeroValues);
return new TensorView<V1ElemType>(matrix, AsTensorViewShape(viewShape));
}
static void* AllocateTensorView(CNTK::DataType dataType,
@ -81,6 +95,8 @@ namespace CNTK
return AllocateTensorView<float>(viewShape, storageType, device, numNonZeroValues);
case DataType::Double:
return AllocateTensorView<double>(viewShape, storageType, device, numNonZeroValues);
case DataType::Float16:
return AllocateTensorView<half>(viewShape, storageType, device, numNonZeroValues);
default:
LogicError("Unsupported DataType %s", DataTypeName(dataType));
break;
@ -92,17 +108,37 @@ namespace CNTK
{
}
template <typename ElementType>
NDArrayView::NDArrayView(const NDShape& viewShape, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const ElementType* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly/* = false*/)
: NDArrayView(AsDataType<ElementType>(), device, StorageFormat::SparseCSC, viewShape, false, AllocateTensorView<ElementType>(viewShape, StorageFormat::SparseCSC, device, numNonZeroValues))
NDArrayView::NDArrayView(CNTK::DataType dataType, const NDShape& viewShape, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const void* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly/* = false*/)
: NDArrayView(dataType, device, StorageFormat::SparseCSC, viewShape, false, AllocateTensorView(dataType, StorageFormat::SparseCSC, viewShape, device, numNonZeroValues * DataTypeSize(dataType)))
{
if ((colStarts == nullptr) || (rowIndices == nullptr) || (nonZeroValues == nullptr) || (numNonZeroValues == 0) || (numNonZeroValues > viewShape.TotalSize()))
InvalidArgument("Invalid sparse CSC format data specified for construction of NDArrayView with shape '%S'; "
"either one of the specified buffers is null or the count (%d) of non-zero values is invalid.",
viewShape.AsString().c_str(), (int)numNonZeroValues);
auto sparseMatrix = GetWritableMatrix<ElementType>(1);
sparseMatrix->SetMatrixFromCSCFormat(colStarts, rowIndices, nonZeroValues, numNonZeroValues, sparseMatrix->GetNumRows(), sparseMatrix->GetNumCols());
switch (dataType)
{
case DataType::Float:
{
auto sparseMatrix = GetWritableMatrix<float>(1);
sparseMatrix->SetMatrixFromCSCFormat(colStarts, rowIndices, (const float*)nonZeroValues, numNonZeroValues, sparseMatrix->GetNumRows(), sparseMatrix->GetNumCols());
break;
}
case DataType::Double:
{
auto sparseMatrix = GetWritableMatrix<double>(1);
sparseMatrix->SetMatrixFromCSCFormat(colStarts, rowIndices, (const double*)nonZeroValues, numNonZeroValues, sparseMatrix->GetNumRows(), sparseMatrix->GetNumCols());
break;
}
case DataType::Float16:
{
auto sparseMatrix = GetWritableMatrix<half>(1);
sparseMatrix->SetMatrixFromCSCFormat(colStarts, rowIndices, (const half*)nonZeroValues, numNonZeroValues, sparseMatrix->GetNumRows(), sparseMatrix->GetNumCols());
break;
}
default:
LogicError("Unsupported DataType %s", DataTypeName(dataType));
break;
}
m_isReadOnly = readOnly;
}
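The constructor now dispatches on a runtime DataType over an untyped value buffer. A hedged usage sketch building a small FP16 CSC view with the new signature (illustrative values; whether FP16 sparse storage is actually supported on a given device follows the FP16 preview caveats):

    #include <vector>
    #include "CNTKLibrary.h"

    void BuildSparseFp16View()
    {
        using namespace CNTK;
        // 2x2 matrix, one non-zero per column; colStarts has one entry per column
        // plus the terminating total count.
        std::vector<SparseIndexType> colStarts  = { 0, 1, 2 };
        std::vector<SparseIndexType> rowIndices = { 0, 1 };
        std::vector<float16>         nonZeros   = { float16(1.0f), float16(2.0f) };

        auto view = MakeSharedObject<NDArrayView>(
            DataType::Float16, NDShape({ 2, 2 }),
            colStarts.data(), rowIndices.data(),
            static_cast<const void*>(nonZeros.data()), nonZeros.size(),
            DeviceDescriptor::CPUDevice(), /*readOnly=*/ false);
        // Per the FP16 preview caveats, actual FP16 sparse support depends on the
        // device and build; the call mainly illustrates the new signature.
    }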
@ -118,6 +154,9 @@ namespace CNTK
case DataType::Double:
delete GetTensorView<double>();
break;
case DataType::Float16:
delete GetTensorView<half>();
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(m_dataType));
break;
@ -136,6 +175,8 @@ namespace CNTK
{
if (GetDataType() == DataType::Double)
SetValue((double)value);
else if (GetDataType() == DataType::Float16)
SetValue((float16)value);
else
{
if (IsSparse())
@ -167,12 +208,25 @@ namespace CNTK
auto currentMatrix = GetMatrix<double>();
return currentMatrix->IsView();
}
case DataType::Float16:
{
auto currentMatrix = GetMatrix<half>();
return currentMatrix->IsView();
}
}
return false;
}
template <typename ElementType>
/*static*/ std::shared_ptr<Matrix<ElementType>> NDArrayView::GetMatrixImpl(const TensorView<ElementType>* tensorView, size_t rowColSplitPoint)
void NDArrayView::SetValue(float16 value)
{
if (IsSparse())
LogicError("NDArrayView::SetValue: Setting a NDArrayView contents to a scalar is only allowed for objects with dense storage format.");
GetWritableMatrix<half>()->SetValue(*reinterpret_cast<half*>(&value));
}
template <typename V1ElemType>
/*static*/ std::shared_ptr<Matrix<V1ElemType>> NDArrayView::GetMatrixImpl(const TensorView<V1ElemType>* tensorView, size_t rowColSplitPoint)
{
auto tensorShape = tensorView->GetShape();
@ -213,34 +267,66 @@ namespace CNTK
return tensorView->Reshaped(tensorShape).AsMatrix();
}
template <typename ElementType>
std::shared_ptr<const Matrix<ElementType>> NDArrayView::GetMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const
template <typename V1ElemType>
std::shared_ptr<const Matrix<V1ElemType>> NDArrayView::GetMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const
{
return GetMatrixImpl<ElementType>(GetTensorView<ElementType>(), rowColSplitPoint);
return GetMatrixImpl<V1ElemType>(GetTensorView<V1ElemType>(), rowColSplitPoint);
}
template <typename ElementType>
std::shared_ptr<Matrix<ElementType>> NDArrayView::GetWritableMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/)
template <typename V1ElemType>
std::shared_ptr<Matrix<V1ElemType>> NDArrayView::GetWritableMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/)
{
return GetMatrixImpl<ElementType>(GetWritableTensorView<ElementType>(), rowColSplitPoint);
return GetMatrixImpl<V1ElemType>(GetWritableTensorView<V1ElemType>(), rowColSplitPoint);
}
template <typename ElementType>
const TensorView<ElementType>* NDArrayView::GetTensorView() const
std::shared_ptr<const MatrixBase> NDArrayView::GetMatrixBase(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const
{
if (AsDataType<ElementType>() != m_dataType)
LogicError("NDArrayView::GetTensorView: The specified ElementType %s does not match the DataType %s", typeid(ElementType).name(), DataTypeName(m_dataType));
return (const TensorView<ElementType>*)(m_tensorView.get());
switch (m_dataType)
{
case DataType::Float:
return GetMatrixImpl<float>(GetTensorView<float>(), rowColSplitPoint);
case DataType::Double:
return GetMatrixImpl<double>(GetTensorView<double>(), rowColSplitPoint);
case DataType::Float16:
return GetMatrixImpl<half>(GetTensorView<half>(), rowColSplitPoint);
default:
LogicError("Unknown m_dataType %d", (int)m_dataType);
}
return nullptr;
}
template <typename ElementType>
TensorView<ElementType>* NDArrayView::GetWritableTensorView()
std::shared_ptr<MatrixBase> NDArrayView::GetWritableMatrixBase(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/)
{
switch (m_dataType)
{
case DataType::Float:
return GetMatrixImpl<float>(GetWritableTensorView<float>(), rowColSplitPoint);
case DataType::Double:
return GetMatrixImpl<double>(GetWritableTensorView<double>(), rowColSplitPoint);
case DataType::Float16:
return GetMatrixImpl<half>(GetWritableTensorView<half>(), rowColSplitPoint);
default:
LogicError("Unknown m_dataType %d", (int)m_dataType);
}
return nullptr;
}
template <typename V1ElemType>
const TensorView<V1ElemType>* NDArrayView::GetTensorView() const
{
if (AsDataType<V1ElemType>() != m_dataType)
LogicError("NDArrayView::GetTensorView: The specified ElementType %s does not match the DataType %s", typeid(V1ElemType).name(), DataTypeName(m_dataType));
return (const TensorView<V1ElemType>*)(m_tensorView.get());
}
template <typename V1ElemType>
TensorView<V1ElemType>* NDArrayView::GetWritableTensorView()
{
if (IsReadOnly())
InvalidArgument("NDArrayView::GetWritableTensorView: Cannot get a writable TensorView from a read-only NDArrayView.");
return const_cast<TensorView<ElementType>*>(GetTensorView<ElementType>());
return const_cast<TensorView<V1ElemType>*>(GetTensorView<V1ElemType>());
}
NDArrayViewPtr NDArrayView::DeepClone(const DeviceDescriptor& device, bool readOnly/* = false*/) const
@ -262,6 +348,13 @@ namespace CNTK
newMatrix->AssignValuesOf(*thisMatrix);
break;
}
case DataType::Float16:
{
auto newMatrix = newView->GetWritableMatrix<half>();
auto thisMatrix = GetMatrix<half>();
newMatrix->AssignValuesOf(*thisMatrix);
break;
}
default:
LogicError("NDArrayView::DeepClone: Unsupported DataType %s", DataTypeName(m_dataType));
break;
@ -296,6 +389,13 @@ namespace CNTK
destMatrix->AssignValuesOf(*sourceMatrix);
break;
}
case DataType::Float16:
{
auto sourceMatrix = source.GetMatrix<half>();
auto destMatrix = GetWritableMatrix<half>();
destMatrix->AssignValuesOf(*sourceMatrix);
break;
}
default:
LogicError("NDArrayView::CopyFrom: Unsupported DataType %s", DataTypeName(m_dataType));
break;
@ -313,6 +413,9 @@ namespace CNTK
case DataType::Double:
tensorView = new TensorView<double>(*(GetTensorView<double>()));
break;
case DataType::Float16:
tensorView = new TensorView<half>(*(GetTensorView<half>()));
break;
default:
LogicError("NDArrayView::Alias: Unsupported DataType %s", DataTypeName(m_dataType));
break;
@ -385,6 +488,19 @@ namespace CNTK
tensorView = new TensorView<double>(slicedMatrixView, AsTensorViewShape(sliceViewShape));
break;
}
case DataType::Float16:
{
auto currentMatrix = GetMatrix<half>();
std::pair<size_t, size_t> currentMatrixDims = { currentMatrix->GetNumRows(), currentMatrix->GetNumCols() };
std::shared_ptr<Matrix<half>> slicedMatrixView;
if (sliceViewMatrixDims.first != currentMatrixDims.first)
slicedMatrixView = make_shared<Matrix<half>>(currentMatrix->Reshaped(1, currentMatrix->GetNumElements()).ColumnSlice(flatBufferOffset, sliceViewShape.TotalSize()));
else
slicedMatrixView = make_shared<Matrix<half>>(currentMatrix->ColumnSlice(sliceMatrixColumnOffset, sliceViewMatrixDims.second));
tensorView = new TensorView<half>(slicedMatrixView, AsTensorViewShape(sliceViewShape));
break;
}
default:
LogicError("NDArrayView::SliceView: Unsupported DataType %s", DataTypeName(m_dataType));
break;
@ -412,6 +528,9 @@ namespace CNTK
case DataType::Double:
tensorView = new TensorView<double>(*(GetTensorView<double>()), newTensorShape);
break;
case DataType::Float16:
tensorView = new TensorView<half>(*(GetTensorView<half>()), newTensorShape);
break;
default:
LogicError("NDArrayView::AsShape: Unsupported DataType %s", DataTypeName(m_dataType));
break;
@ -430,9 +549,21 @@ namespace CNTK
return const_cast<ElementType*>(DataBuffer<ElementType>());
}
// TODO: This could actually be strided?
template <typename ElementType>
const ElementType* NDArrayView::DataBuffer() const
{
return const_cast<ElementType*>(_DataBuffer<ElementType, ElementType>());
}
template<>
const float16* NDArrayView::DataBuffer<float16>() const
{
return const_cast<float16*>(_DataBuffer<float16, half>());
}
// TODO: This could actually be strided?
template <typename ElementType, typename V1ElemType>
const ElementType* NDArrayView::_DataBuffer() const
{
if (AsDataType<ElementType>() != m_dataType)
InvalidArgument("NDArrayView::DataBuffer: The specified ElementType '%s' does not match this NDArrayView's DataType '%s'.", typeid(ElementType).name(), DataTypeName(m_dataType));
@ -441,13 +572,25 @@ namespace CNTK
InvalidArgument("The stroage format of 'this' NDArrayView is sparse. Please use SparseDataBuffers().");
// First make sure that the underlying matrix is on the right device
auto matrix = GetMatrix<ElementType>();
auto matrix = GetMatrix<V1ElemType>();
matrix->TransferToDeviceIfNotThere(AsCNTKImplDeviceId(m_device), true);
return matrix->Data();
return reinterpret_cast<ElementType*>(matrix->Data());
}
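The public DataBuffer&lt;float16&gt;() overload above returns CNTK::float16 pointers while the underlying V1 matrix stores half; the reinterpret_cast is sound only because the two types share size and layout. A standalone sketch of that bridging pattern (assumed type names, not the actual CNTK definitions):

    #include <cstdint>

    // API-facing and storage-facing 16-bit wrappers with identical layout.
    struct ApiHalf     { std::uint16_t bits; };   // plays the role of CNTK::float16
    struct StorageHalf { std::uint16_t bits; };   // plays the role of the V1 'half' type

    static_assert(sizeof(ApiHalf) == sizeof(StorageHalf),
                  "pointer re-typing is only valid when size and layout match");

    // Storage hands out StorageHalf*; the public accessor re-types it for callers.
    const ApiHalf* AsApiBuffer(const StorageHalf* storageData)
    {
        return reinterpret_cast<const ApiHalf*>(storageData);
    }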
template <typename ElementType>
std::tuple<const ElementType *, const SparseIndexType*, const SparseIndexType*, size_t> NDArrayView::SparseCSCDataBuffers() const
{
return _SparseCSCDataBuffers<ElementType, ElementType>();
}
template <>
std::tuple<const float16 *, const SparseIndexType*, const SparseIndexType*, size_t> NDArrayView::SparseCSCDataBuffers<float16>() const
{
return _SparseCSCDataBuffers<float16, half>();
}
template <typename ElementType, typename V1ElemType>
std::tuple<const ElementType *, const SparseIndexType*, const SparseIndexType*, size_t> NDArrayView::_SparseCSCDataBuffers() const
{
if (AsDataType<ElementType>() != m_dataType)
InvalidArgument("NDArrayView::SparseDataBuffers: The specified ElementType '%s' does not match this NDArrayView's DataType '%s'.", typeid(ElementType).name(), DataTypeName(m_dataType));
@ -458,7 +601,7 @@ namespace CNTK
if(GetStorageFormat() != StorageFormat::SparseCSC)
RuntimeError("The SparseCSCDataBuffers() method only supports CSC sparse format.");
std::shared_ptr<const Matrix<ElementType>> matrix = GetMatrix<ElementType>();
std::shared_ptr<const Matrix<V1ElemType>> matrix = GetMatrix<V1ElemType>();
auto matrixDims = GetMatrixDimensions(Shape());
if (matrix->GetNumRows() != matrixDims.first)
LogicError("The number of rows of the underlying matrix does not match the shape.");
@ -470,16 +613,16 @@ namespace CNTK
RuntimeError("NDArrayView::SparseDataBuffers: The underlying matrix of 'this' NDArrayView is not in the CSC sparse format.");
size_t numNonZeroValues;
ElementType* nonZeroValues;
V1ElemType* nonZeroValues;
SparseIndexType* colStarts;
SparseIndexType* rowIndices;
if (m_device.Type() == DeviceKind::CPU)
{
if (sizeof(CPUSPARSE_INDEX_TYPE) != sizeof(SparseIndexType))
LogicError("Inconsistent data type for sparse index in 'this' Value and the underlying matrix on CPU.");
std::shared_ptr<Microsoft::MSR::CNTK::CPUSparseMatrix<ElementType>> sparseMatrix = matrix->m_CPUSparseMatrix;
std::shared_ptr<Microsoft::MSR::CNTK::CPUSparseMatrix<V1ElemType>> sparseMatrix = matrix->m_CPUSparseMatrix;
numNonZeroValues = sparseMatrix->NzCount();
nonZeroValues = static_cast<ElementType *>(sparseMatrix->NzValues());
nonZeroValues = static_cast<V1ElemType *>(sparseMatrix->NzValues());
colStarts = static_cast<SparseIndexType *>(sparseMatrix->ColLocation());
rowIndices = static_cast<SparseIndexType *>(sparseMatrix->RowLocation());
}
@ -487,9 +630,9 @@ namespace CNTK
{
if (sizeof(GPUSPARSE_INDEX_TYPE) != sizeof(SparseIndexType))
LogicError("Inconsistent data type for sparse index in 'this' Value and the underlying matrix on GPU.");
std::shared_ptr<Microsoft::MSR::CNTK::GPUSparseMatrix<ElementType>> sparseMatrix = matrix->m_GPUSparseMatrix;
std::shared_ptr<Microsoft::MSR::CNTK::GPUSparseMatrix<V1ElemType>> sparseMatrix = matrix->m_GPUSparseMatrix;
numNonZeroValues = sparseMatrix->NzCount();
nonZeroValues = static_cast<ElementType *>(sparseMatrix->NzValues());
nonZeroValues = static_cast<V1ElemType *>(sparseMatrix->NzValues());
colStarts = static_cast<SparseIndexType *>(sparseMatrix->ColLocation());
rowIndices = static_cast<SparseIndexType *>(sparseMatrix->RowLocation());
}
@ -498,11 +641,23 @@ namespace CNTK
RuntimeError("NDArrayView::SparseDataBuffers: The device %S is currently not supported.",DeviceKindName(m_device.Type()));
}
return std::tuple<ElementType *, SparseIndexType *, SparseIndexType *, size_t>(nonZeroValues, colStarts, rowIndices, numNonZeroValues);
return std::tuple<ElementType *, SparseIndexType *, SparseIndexType *, size_t>(reinterpret_cast<ElementType*>(nonZeroValues), colStarts, rowIndices, numNonZeroValues);
}
template <typename ElementType>
std::tuple<const void *, const SparseIndexType*, const SparseIndexType*, size_t, size_t, size_t> NDArrayView::SparseBlockColumnDataBuffers() const
{
return _SparseBlockColumnDataBuffers<ElementType, ElementType>();
}
template <>
std::tuple<const void *, const SparseIndexType*, const SparseIndexType*, size_t, size_t, size_t> NDArrayView::SparseBlockColumnDataBuffers<float16>() const
{
return _SparseBlockColumnDataBuffers<float16, half>();
}
template <typename ElementType, typename V1ElemType>
std::tuple<const void *, const SparseIndexType*, const SparseIndexType*, size_t, size_t, size_t> NDArrayView::_SparseBlockColumnDataBuffers() const
{
if (AsDataType<ElementType>() != m_dataType)
InvalidArgument("NDArrayView::SparseBlockColumnDataBuffers: The specified ElementType '%s' does not match this NDArrayView's DataType '%s'.", typeid(ElementType).name(), DataTypeName(m_dataType));
@ -513,23 +668,23 @@ namespace CNTK
if (GetStorageFormat() != StorageFormat::SparseBlockCol)
RuntimeError("The SparseBlockColumnDataBuffers() method only supports sparse block column format.");
std::shared_ptr<const Matrix<ElementType>> matrix = GetMatrix<ElementType>();
std::shared_ptr<const Matrix<V1ElemType>> matrix = GetMatrix<V1ElemType>();
size_t numBlocks;
size_t numRows;
size_t numCols;
ElementType* blockValues;
V1ElemType* blockValues;
SparseIndexType* blockId2Col;
SparseIndexType* col2BlockId;
if (m_device.Type() == DeviceKind::GPU)
{
if (sizeof(GPUSPARSE_INDEX_TYPE) != sizeof(SparseIndexType))
LogicError("Inconsistent data type for sparse index in 'this' Value and the underlying matrix on GPU.");
std::shared_ptr<Microsoft::MSR::CNTK::GPUSparseMatrix<ElementType>> sparseMatrix = matrix->m_GPUSparseMatrix;
std::shared_ptr<Microsoft::MSR::CNTK::GPUSparseMatrix<V1ElemType>> sparseMatrix = matrix->m_GPUSparseMatrix;
numBlocks = sparseMatrix->GetBlockSize();
numRows = sparseMatrix->GetNumRows();
numCols = sparseMatrix->GetNumCols();
blockValues = static_cast<ElementType *>(sparseMatrix->NzValues());
blockValues = static_cast<V1ElemType *>(sparseMatrix->NzValues());
blockId2Col = static_cast<SparseIndexType *>(sparseMatrix->BlockId2ColOrRow());
col2BlockId = static_cast<SparseIndexType *>(sparseMatrix->ColOrRow2BlockId());
}
@ -539,7 +694,7 @@ namespace CNTK
RuntimeError("NDArrayView::SparseBlockColumnDataBuffers: The device %S is currently not supported.", DeviceKindName(m_device.Type()));
}
return std::tuple<ElementType *, SparseIndexType *, SparseIndexType *, size_t, size_t, size_t>(blockValues, blockId2Col, col2BlockId, numBlocks, numRows, numCols);
return std::tuple<ElementType *, SparseIndexType *, SparseIndexType *, size_t, size_t, size_t>(reinterpret_cast<ElementType*>(blockValues), blockId2Col, col2BlockId, numBlocks, numRows, numCols);
}
void NDArrayView::AdjustSparseBlockColumn(const SparseIndexType* cpuCol2BlockId, size_t numBlocks, bool useBlockId2Col)
@ -585,6 +740,13 @@ namespace CNTK
matrix->CollapseDataLocation();
break;
}
case DataType::Float16:
{
auto matrix = GetMatrix<half>();
matrix->TransferFromDeviceToDevice(matrix->GetDeviceId(), AsCNTKImplDeviceId(device), /*isBeingMoved = */ true, /*emptyTransfer =*/ false, /*updatePreferredDevice =*/ true);
matrix->CollapseDataLocation();
break;
}
default:
LogicError("NDArrayView::ChangeDevice: Unsupported DataType %s", DataTypeName(m_dataType));
break;
@ -594,33 +756,69 @@ namespace CNTK
}
template <typename ElementType>
/*static*/ NDArrayViewPtr NDArrayView::RandomNormal(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device /*= DeviceDescriptor::UseDefaultDevice()*/)
/*static*/ NDArrayViewPtr NDArrayView::RandomNormal(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device)
{
return NDArrayView::_RandomNormal<ElementType, ElementType>(shape, mean, stdDev, seed, device);
}
template <>
/*static*/ NDArrayViewPtr NDArrayView::RandomNormal<float16>(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device)
{
return NDArrayView::_RandomNormal<float16, half>(shape, mean, stdDev, seed, device);
}
template <typename ElementType, typename V1ElemType>
/*static*/ NDArrayViewPtr NDArrayView::_RandomNormal(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device /*= DeviceDescriptor::UseDefaultDevice()*/)
{
auto matrixDims = GetMatrixDimensions(shape);
auto randomNormalMatrix = std::make_shared<Matrix<ElementType>>(Matrix<ElementType>::RandomGaussian(matrixDims.first, matrixDims.second, AsCNTKImplDeviceId(device), (ElementType)mean, (ElementType)stdDev, seed));
auto tensorView = new TensorView<ElementType>(randomNormalMatrix, AsTensorViewShape(shape));
auto randomNormalMatrix = std::make_shared<Matrix<V1ElemType>>(Matrix<V1ElemType>::RandomGaussian(matrixDims.first, matrixDims.second, AsCNTKImplDeviceId(device), (V1ElemType)mean, (V1ElemType)stdDev, seed));
auto tensorView = new TensorView<V1ElemType>(randomNormalMatrix, AsTensorViewShape(shape));
return MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), device, StorageFormat::Dense, shape, false, tensorView);
}
template <typename ElementType>
/*static*/ NDArrayViewPtr NDArrayView::RandomUniform(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::UseDefaultDevice()*/)
/*static*/ NDArrayViewPtr NDArrayView::RandomUniform(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device)
{
return NDArrayView::_RandomUniform<ElementType, ElementType>(shape, rangeBegin, rangeEnd, seed, device);
}
template <>
/*static*/ NDArrayViewPtr NDArrayView::RandomUniform<float16>(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device)
{
return NDArrayView::_RandomUniform<float16, half>(shape, rangeBegin, rangeEnd, seed, device);
}
template <typename ElementType, typename V1ElemType>
/*static*/ NDArrayViewPtr NDArrayView::_RandomUniform(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::UseDefaultDevice()*/)
{
auto matrixDims = GetMatrixDimensions(shape);
auto randomUniformMatrix = std::make_shared<Matrix<ElementType>>(Matrix<ElementType>::RandomUniform(matrixDims.first, matrixDims.second, AsCNTKImplDeviceId(device), (ElementType)rangeBegin, (ElementType)rangeEnd, seed));
auto tensorView = new TensorView<ElementType>(randomUniformMatrix, AsTensorViewShape(shape));
auto randomUniformMatrix = std::make_shared<Matrix<V1ElemType>>(Matrix<V1ElemType>::RandomUniform(matrixDims.first, matrixDims.second, AsCNTKImplDeviceId(device), (V1ElemType)rangeBegin, (V1ElemType)rangeEnd, seed));
auto tensorView = new TensorView<V1ElemType>(randomUniformMatrix, AsTensorViewShape(shape));
return MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), device, StorageFormat::Dense, shape, false, tensorView);
}
template <typename ElementType>
ElementType NDArrayView::AsScalar() const
{
return _AsScalar<ElementType, ElementType>();
}
template <>
float16 NDArrayView::AsScalar<float16>() const
{
return _AsScalar<float16, half>();
}
template <typename ElementType, typename V1ElemType>
ElementType NDArrayView::_AsScalar() const
{
auto scalarData = this->shared_from_this();
if (scalarData->Shape().TotalSize() != 1)
LogicError("NDArrayView::AsScalar: The NDArrayView shaped '%S' is not a scalar.", scalarData->Shape().AsString().c_str());
ElementType scalar = std::numeric_limits<ElementType>::quiet_NaN();
ElementType scalar = quiet_NaN<ElementType>();
std::shared_ptr<const NDArrayView> cpuData;
if (scalarData->Device() == DeviceDescriptor::CPUDevice())
cpuData = scalarData;
@ -635,6 +833,8 @@ namespace CNTK
scalar = *(cpuData->DataBuffer<float>());
else if (scalarData->GetDataType() == DataType::Double)
scalar = static_cast<ElementType>(*(cpuData->DataBuffer<double>()));
else if (scalarData->GetDataType() == DataType::Float16)
scalar = static_cast<ElementType>(*(cpuData->DataBuffer<float16>()));
else
LogicError("NDArrayView::AsScalar: Unsupported DataType");
@ -652,35 +852,44 @@ namespace CNTK
// Explicit template instantiations
template CNTK_API NDArrayViewPtr NDArrayView::RandomUniform<float>(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::UseDefaultDevice()*/);
template CNTK_API NDArrayViewPtr NDArrayView::RandomUniform<double>(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::UseDefaultDevice()*/);
template CNTK_API NDArrayViewPtr NDArrayView::RandomUniform<float16>(const NDShape& shape, double rangeBegin, double rangeEnd, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::UseDefaultDevice()*/);
template CNTK_API NDArrayViewPtr NDArrayView::RandomNormal<float>(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::UseDefaultDevice()*/);
template CNTK_API NDArrayViewPtr NDArrayView::RandomNormal<double>(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::UseDefaultDevice()*/);
template CNTK_API NDArrayViewPtr NDArrayView::RandomNormal<float16>(const NDShape& shape, double mean, double stdDev, unsigned long seed, const DeviceDescriptor& device/* = DeviceDescriptor::UseDefaultDevice()*/);
template CNTK_API const float* NDArrayView::DataBuffer<float>() const;
template CNTK_API const double* NDArrayView::DataBuffer<double>() const;
template CNTK_API const float16* NDArrayView::DataBuffer<float16>() const;
template CNTK_API const TensorView<float>* NDArrayView::GetTensorView<float>() const;
template CNTK_API const TensorView<double>* NDArrayView::GetTensorView<double>() const;
template CNTK_API const TensorView<half>* NDArrayView::GetTensorView<half>() const;
template CNTK_API std::tuple<const float*, const SparseIndexType*, const SparseIndexType*, size_t> NDArrayView::SparseCSCDataBuffers<float>() const;
template CNTK_API std::tuple<const double*, const SparseIndexType*, const SparseIndexType*, size_t> NDArrayView::SparseCSCDataBuffers<double>() const;
template CNTK_API std::tuple<const float16*, const SparseIndexType*, const SparseIndexType*, size_t> NDArrayView::SparseCSCDataBuffers<float16>() const;
template CNTK_API std::tuple<const void*, const SparseIndexType*, const SparseIndexType*, size_t, size_t, size_t> NDArrayView::SparseBlockColumnDataBuffers<float>() const;
template CNTK_API std::tuple<const void*, const SparseIndexType*, const SparseIndexType*, size_t, size_t, size_t> NDArrayView::SparseBlockColumnDataBuffers<double>() const;
template CNTK_API std::tuple<const void*, const SparseIndexType*, const SparseIndexType*, size_t, size_t, size_t> NDArrayView::SparseBlockColumnDataBuffers<float16>() const;
template CNTK_API float* NDArrayView::WritableDataBuffer<float>();
template CNTK_API double* NDArrayView::WritableDataBuffer<double>();
template CNTK_API float16* NDArrayView::WritableDataBuffer<float16>();
template std::shared_ptr<const Matrix<float>> NDArrayView::GetMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const;
template std::shared_ptr<const Matrix<double>> NDArrayView::GetMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const;
template std::shared_ptr<const Matrix<half>> NDArrayView::GetMatrix(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/) const;
template std::shared_ptr<Matrix<float>> NDArrayView::GetWritableMatrix<float>(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
template std::shared_ptr<Matrix<double>> NDArrayView::GetWritableMatrix<double>(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
template std::shared_ptr<Matrix<half>> NDArrayView::GetWritableMatrix<half>(size_t rowColSplitPoint/* = AutoSelectRowColSplitPoint*/);
template TensorView<float>* NDArrayView::GetWritableTensorView<float>();
template TensorView<double>* NDArrayView::GetWritableTensorView<double>();
template TensorView<half>* NDArrayView::GetWritableTensorView<half>();
template CNTK_API NDArrayView::NDArrayView(const NDShape& viewShape, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const float* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly/* = false*/);
template CNTK_API NDArrayView::NDArrayView(const NDShape& viewShape, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const double* nonZeroValues, size_t numNonZeroValues, const DeviceDescriptor& device, bool readOnly/* = false*/);
template float NDArrayView::AsScalar<float>() const;
template double NDArrayView::AsScalar<double>() const;
template float16 NDArrayView::AsScalar<float16>() const;
}
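These explicit instantiations make the float16 overloads available to callers. A hedged usage sketch of reading an FP16 scalar back through AsScalar (the value/dtype/shape/device constructor form appears later in this diff, in the Trainer changes):

    #include "CNTKLibrary.h"

    void ReadFp16Scalar()
    {
        using namespace CNTK;
        // Scalar FP16 view initialized from a double.
        auto scalarView = MakeSharedObject<NDArrayView>(
            2.5, DataType::Float16, NDShape{}, DeviceDescriptor::CPUDevice());

        float16 asHalf  = scalarView->AsScalar<float16>();  // FP16 read-back
        float   asFloat = scalarView->AsScalar<float>();    // widened from the FP16 buffer
        (void)asHalf; (void)asFloat;
    }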


@ -53,8 +53,14 @@ namespace CNTK
}
else
{
// batch normalization on FP16 requires 32-bit scale/bias/mean/variance, so specialize that case
bool batchNormSpecialCase =
(op == PrimitiveOpType::BatchNormalization) &&
(outputDataType == DataType::Float16) &&
(inputDataType == DataType::Float);
// The DataType of all operands should match except for Constants where we allow coercion
if ((inputDataType != DataType::Unknown) && (inputDataType != outputDataType) && !input.IsConstant())
if ((inputDataType != DataType::Unknown) && (inputDataType != outputDataType) && !input.IsConstant() && !batchNormSpecialCase)
InvalidArgument("Primitive op '%S' passed operands '%S' with different DataTypes '%s' and '%s'.",
PrimitiveOpTypeName(op).c_str(), NamedListString(inputs).c_str(), DataTypeName(outputDataType), DataTypeName(inputDataType));
}
@ -70,7 +76,18 @@ namespace CNTK
for (auto& input : inputs)
{
if ((input.GetDataType() == DataType::Unknown) && (input.IsConstant() || input.IsParameter()))
input.m_dataFields->m_dataType = outputDataType;
{
// batch normalization on FP16 requires 32-bit scale/bias/mean/variance, so specialize that case
if ((op == PrimitiveOpType::BatchNormalization) &&
(outputDataType == DataType::Float16))
{
input.m_dataFields->m_dataType = DataType::Float;
}
else
{
input.m_dataFields->m_dataType = outputDataType;
}
}
}
}
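Both changes above relax the operand type rules for BatchNormalization when the node computes in FP16, since its scale/bias/mean/variance operands stay in FP32. A standalone sketch of the relaxed check (assumed enum and function names, not the CNTK internals):

    enum class DType { Unknown, Float, Double, Float16 };
    enum class Op    { BatchNormalization, Times, Plus /* ... */ };

    // Returns true when an operand's type is acceptable for an op producing outputType.
    bool OperandTypeAllowed(Op op, DType outputType, DType inputType, bool inputIsConstant)
    {
        // FP16 batch normalization keeps its statistics and affine parameters in FP32,
        // so an FP32 operand feeding an FP16 BatchNormalization output is not a mismatch.
        bool batchNormSpecialCase = (op == Op::BatchNormalization)
                                 && (outputType == DType::Float16)
                                 && (inputType == DType::Float);
        return (inputType == DType::Unknown)
            || (inputType == outputType)
            || inputIsConstant
            || batchNormSpecialCase;
    }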
@ -235,6 +252,10 @@ namespace CNTK
else
{
DataType outputDataType = GetOutputDataType(m_op, m_inputs, true);
if (m_op == PrimitiveOpType::Cast)
outputDataType = static_cast<DataType>(m_attributes[PrimitiveFunction::AttributeNameNewDataType].Value<int>());
std::vector<Axis> outputDynamicAxes = GetOutputDynamicAxes(m_op, m_inputs, this, m_attributes);
bool needsGradient = std::any_of(m_inputs.begin(), m_inputs.end(), [](const Variable& input) { return input.NeedsGradient(); });
@ -333,6 +354,7 @@ namespace CNTK
case PrimitiveOpType::ELU:
case PrimitiveOpType::StableSigmoid:
case PrimitiveOpType::ConstantOp:
case PrimitiveOpType::Cast:
assert(m_inputs.size() == 1);
outputShape = UnaryElementwiseOpOutputShape(m_inputs[0].Shape());
break;


@ -116,6 +116,7 @@ namespace CNTK
{PrimitiveOpType::TopK, L"TopK"},
{PrimitiveOpType::ConstantOp, L"ConstantOp"},
{PrimitiveOpType::Squeeze, L"Squeeze"},
{PrimitiveOpType::Cast, L"Cast" },
};
inline const std::wstring& PrimitiveOpTypeName(PrimitiveOpType opType)


@ -100,6 +100,7 @@ namespace CNTK
Squeeze = 88,
ConstantOp = 89,
LatticeSequenceWithSoftmax = 90,
Cast = 91,
// New op types should only be appended to the end of this list
UnknownOP
// and UnknownOP should always be last.


@ -231,13 +231,17 @@ namespace CNTK
return DictionaryValue::Type(type);
}
template <typename T>
static void CopyData(const NDArrayView& src, RepeatedField<T>* dst)
template <typename SrcT, typename DstT=SrcT>
static void CopyData(const NDArrayView& src, RepeatedField<DstT>* dst)
{
auto size = src.Shape().TotalSize();
dst->Resize((int)size, T());
const T* buffer = src.DataBuffer<T>();
memcpy(dst->mutable_data(), buffer, (int)size * sizeof(T));
dst->Resize((int)size, DstT());
const SrcT* buffer = src.DataBuffer<SrcT>();
if (std::is_same<SrcT, DstT>::value)
memcpy(dst->mutable_data(), buffer, (int)size * sizeof(DstT));
else
for (size_t i = 0; i < size; i++)
dst->mutable_data()[i] = (DstT)buffer[i];
}
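The dual-type copy helper above keeps the memcpy fast path for same-type copies and converts element-wise otherwise, which is what lets FP16 tensors be serialized as float (CopyData&lt;float16, float&gt; on save and ReadData&lt;float, float16&gt; on load, as the branches further down show). A minimal standalone sketch of the same idea:

    #include <cstddef>
    #include <cstring>
    #include <type_traits>
    #include <vector>

    template <typename SrcT, typename DstT = SrcT>
    void CopyWithConversion(const SrcT* src, std::size_t count, std::vector<DstT>& dst)
    {
        dst.resize(count);
        if (std::is_same<SrcT, DstT>::value)
            std::memcpy(dst.data(), src, count * sizeof(DstT));  // identical types: raw copy
        else
            for (std::size_t i = 0; i < count; ++i)
                dst[i] = static_cast<DstT>(src[i]);              // e.g. half -> float when saving
    }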
template <typename T>
@ -250,32 +254,44 @@ namespace CNTK
{
auto value = buffer[i];
if (tSize <= sizeof(uint32))
output.WriteLittleEndian32(Encode<T, uint32>(value));
else
{
output.WriteLittleEndian32(Encode<T, uint32>((float)value));
}
else
{
output.WriteLittleEndian64(Encode<T, uint64>(value));
}
}
}
template <typename T>
template <typename SrcT, typename DstT = SrcT>
static bool ReadData(RenewableCodedStream& input, NDArrayView& dst)
{
auto size = dst.Shape().TotalSize();
T* buffer = dst.WritableDataBuffer<T>();
DstT* buffer = dst.WritableDataBuffer<DstT>();
for (auto i = 0; i < size; i++)
{
if (!input.Read<T>(buffer+i))
SrcT value;
if (!input.Read<SrcT>(&value))
return false;
buffer[i] = (DstT)value;
}
return true;
}
template <typename T>
static void CopyData(const RepeatedField<T>& src, NDArrayView* dst)
template <typename SrcT, typename DstT = SrcT>
static void CopyData(const RepeatedField<SrcT>& src, NDArrayView* dst)
{
auto size = src.size();
assert(size == dst->Shape().TotalSize());;
T* buffer = dst->WritableDataBuffer<T>();
memcpy(buffer, src.data(), size * sizeof(T));
DstT* buffer = dst->WritableDataBuffer<DstT>();
if (std::is_same<SrcT, DstT>::value)
memcpy(buffer, src.data(), size * sizeof(SrcT));
else
{
for (size_t i = 0; i < size; i++)
buffer[i] = (DstT)src.data()[i];
}
}
@ -312,6 +328,10 @@ namespace CNTK
{
CopyData<double>(src, dst->mutable_double_values()->mutable_value());
}
else if (src.GetDataType() == DataType::Float16)
{
CopyData<float16, float>(src, dst->mutable_float_values()->mutable_value());
}
}
}
@ -328,6 +348,10 @@ namespace CNTK
{
WriteData<double>(src, output);
}
else if (src.GetDataType() == DataType::Float16)
{
WriteData<float16>(src, output);
}
}
}
@ -350,6 +374,11 @@ namespace CNTK
if (!ReadData<double>(wrapper, dst))
return false;
}
else if (dst.GetDataType() == DataType::Float16)
{
if (!ReadData<float, float16>(wrapper, dst))
return false;
}
}
return true;
}
@ -446,6 +475,13 @@ namespace CNTK
else
m_arrayViews.push_back({ dst, nullptr });
}
else if(dataType == DataType::Float16)
{
if (src.float_values().value().size() == shape->TotalSize())
CopyData<float, float16>(src.float_values().value(), dst);
else
m_arrayViews.push_back({ dst, nullptr });
}
return dst;
}


@ -37,6 +37,7 @@ namespace CNTK
const std::wstring learningRateScheduleKey = L"learnig_rate_schedule";
const std::wstring smoothedGradientsKey = L"smoothed_gradients";
const std::wstring noiseInjectionSeedKey = L"noise_injection_seed";
const std::wstring masterParameterUpdatedKey = L"master_parameter_updated";
const std::wstring smoothedCountKey = L"smoothed_count";
const std::wstring stateKey = L"state";
const std::wstring rngSeedKey = L"rng_seed";


@ -51,6 +51,10 @@ namespace CNTK
combinedFunctionArgs = m_model->Outputs();
combinedFunctionArgs.push_back(m_lossFunction);
if (m_lossFunction->Output().GetDataType() == DataType::Float16)
fprintf(stderr, "WARNING: using Float16 for loss function may cause overflow, please cast to float");
if (!m_lossFunction->Output().DynamicAxes().empty())
{
m_aggregatedLossFunction = ReduceSum(lossFunction, Axis::AllAxes(), L"aggregateLoss");
@ -226,6 +230,9 @@ namespace CNTK
// Gradients are not existing.
for (const auto& parameter : m_learnerParameters)
gradients[parameter] = nullptr;
trainingLoss = MakeSharedObject<NDArrayView>(0, (m_aggregatedLossFunction ? m_aggregatedLossFunction->Output().GetDataType() : DataType::Float), NDShape{}, computeDevice);
evalCriterion = MakeSharedObject<NDArrayView>(0, (m_aggregatedEvaluationFunction ? m_aggregatedEvaluationFunction->Output().GetDataType() : DataType::Float), NDShape{}, computeDevice);
}
else
{
@ -363,8 +370,10 @@ namespace CNTK
if (m_aggregatedLossFunction->Output().GetDataType() == DataType::Float)
m_rootGradientValue->Data()->SetValue(1.0f);
else
else if (m_aggregatedLossFunction->Output().GetDataType() == DataType::Double)
m_rootGradientValue->Data()->SetValue(1.0);
else
m_rootGradientValue->Data()->SetValue(half(1.0));
for (const auto& parameter : m_learnerParameters)
parameterGradients[parameter] = nullptr;
@ -541,7 +550,7 @@ namespace CNTK
return m_parameterLearners->ParameterLearners().front()->TotalNumberOfSamplesSeen();
default:
//should not be here; whenever a new data unit is defined, there should be a new case in this function.
LogicError("Unsupported data unit: %d", unit);
LogicError("Unsupported data unit: %d", (int)unit);
}
}


@ -754,7 +754,7 @@ namespace CNTK
if (mask != nullptr)
Value::GetSequenceStartsAndLengths(mask, sequenceBeginIndices, sequenceLengths, numDynamicAxes);
bool hasTruncatedSequences = std::find_if(sequenceBeginIndices.begin(), sequenceBeginIndices.end(), [](const int& val) { return (val < 0); }) != sequenceBeginIndices.end();
bool hasTruncatedSequences = std::find_if(sequenceBeginIndices.begin(), sequenceBeginIndices.end(), [](const ptrdiff_t& val) { return (val < 0); }) != sequenceBeginIndices.end();
auto layout = std::make_shared<MBLayout>();
std::vector<std::pair<size_t, size_t>> placement;
@ -1178,12 +1178,15 @@ namespace CNTK
template std::pair<std::shared_ptr<const Matrix<float>>, MBLayoutPtr> Utils::GetCNTKImplMatrixAndMBLayoutFromValueObject<float>(const Variable& var, const ValuePtr& value, NDShape* inferredVarShape);
template std::pair<std::shared_ptr<const Matrix<double>>, MBLayoutPtr> Utils::GetCNTKImplMatrixAndMBLayoutFromValueObject<double>(const Variable& var, const ValuePtr& value, NDShape* inferredVarShape);
template std::pair<std::shared_ptr<const Matrix<half>>, MBLayoutPtr> Utils::GetCNTKImplMatrixAndMBLayoutFromValueObject<half>(const Variable& var, const ValuePtr& value, NDShape* inferredVarShape);
template ValuePtr Utils::GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(const NDShape& sampleShape, const std::vector<Axis>& sampleDynamicAxes, const Matrix<float>& matrix, const MBLayoutPtr& layout, bool readOnly /*= true*/);
template ValuePtr Utils::GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(const NDShape& sampleShape, const std::vector<Axis>& sampleDynamicAxes, const Matrix<double>& matrix, const MBLayoutPtr& layout, bool readOnly /*= true*/);
template ValuePtr Utils::GetValueObjectFromCNTKImplMatrixAndMBLayout<half>(const NDShape& sampleShape, const std::vector<Axis>& sampleDynamicAxes, const Matrix<half>& matrix, const MBLayoutPtr& layout, bool readOnly /*= true*/);
template ValuePtr Utils::GetValueObjectFromCNTKImplMatrixAndMBLayout<float>(const Variable& var, const ComputationNodeBasePtr& computationNode, const Matrix<float>& matrix, const MBLayoutPtr& layout, bool readOnly /*= true*/);
template ValuePtr Utils::GetValueObjectFromCNTKImplMatrixAndMBLayout<double>(const Variable& var, const ComputationNodeBasePtr& computationNode, const Matrix<double>& matrix, const MBLayoutPtr& layout, bool readOnly /*= true*/);
template ValuePtr Utils::GetValueObjectFromCNTKImplMatrixAndMBLayout<half>(const Variable& var, const ComputationNodeBasePtr& computationNode, const Matrix<half>& matrix, const MBLayoutPtr& layout, bool readOnly /*= true*/);
void Accumulator::Update(const ValuePtr& delta, const DeviceDescriptor& device)
{
@ -1206,8 +1209,10 @@ namespace CNTK
if (delta->GetDataType() == DataType::Float)
Data()->GetWritableTensorView<float>()->AddCopyOf(*delta->Data()->GetTensorView<float>());
else
else if(delta->GetDataType() == DataType::Double)
Data()->GetWritableTensorView<double>()->AddCopyOf(*delta->Data()->GetTensorView<double>());
else
RuntimeError("Unexpected data type in accumulator");
if (copied && m_numUpdates != 0)
RuntimeError("Accumulation values are created when accumulated num updates not zero");
@ -1228,8 +1233,10 @@ namespace CNTK
if (GetDataType() == DataType::Float)
Data()->SetValue(0.0f);
else
else if (GetDataType() == DataType::Double)
Data()->SetValue(0.0);
else
RuntimeError("Unsupported data type in Accumulator");
}
std::wstring DynamicAxesAsString(const std::vector<Axis>& axes, bool rowMajor)


@ -742,9 +742,18 @@ namespace CNTK
template <typename T> //T can be Variable or StreamInfo
static bool IsAtSweepEnd(const std::unordered_map<T, MinibatchData>& arguments)
{
if (arguments.empty()) return true;
return std::any_of(arguments.begin(), arguments.end(), [](const std::pair<const T, MinibatchData>& kv)
{
return kv.second.sweepEnd;
});
}
// half is V1 ElemType, so specialize here instead of in CNTKLibrary.h
template<>
inline DataType AsDataType<half>()
{
return DataType::Float16;
}
}


@ -108,7 +108,7 @@ namespace CNTK
}
colStarts[numCSCCols - 1] = (SparseIndexType)(nonZeroValues.size());
NDArrayViewPtr deviceValueData = MakeSharedObject<NDArrayView>(valueDataShape, colStarts.data(), rowIndices.data(), nonZeroValues.data(), nonZeroValues.size(), device, readOnly);
NDArrayViewPtr deviceValueData = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), valueDataShape, colStarts.data(), rowIndices.data(), nonZeroValues.data(), nonZeroValues.size(), device, readOnly);
return MakeSharedObject<Value>(deviceValueData, deviceValueMask);
}
@ -229,18 +229,7 @@ namespace CNTK
auto totalNumNonZeroValues = nonZeroValues.size() / DataTypeSize(dataType);
colStarts.push_back(totalNumNonZeroValues);
switch (dataType)
{
case DataType::Float:
// TODO: In case of sparse we can directly create on target device
valueData = MakeSharedObject<NDArrayView>(valueDataShape, colStarts.data(), rowIndices.data(), (float*)nonZeroValues.data(), totalNumNonZeroValues, device, readOnly);
break;
case DataType::Double:
valueData = MakeSharedObject<NDArrayView>(valueDataShape, colStarts.data(), rowIndices.data(), (double*)nonZeroValues.data(), totalNumNonZeroValues, device, readOnly);
break;
default:
NOT_IMPLEMENTED;
}
valueData = MakeSharedObject<NDArrayView>(dataType, valueDataShape, colStarts.data(), rowIndices.data(), (void*)nonZeroValues.data(), totalNumNonZeroValues, device, readOnly);
}
else
{
@ -372,7 +361,7 @@ namespace CNTK
/*static*/ ValuePtr Value::CreateSequence(const NDShape& sampleShape, size_t sequenceLength, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const ElementType* nonZeroValues, size_t numNonZeroValues, bool sequenceStartFlag, const DeviceDescriptor& device, bool readOnly/* = false*/)
{
auto sequenceShape = sampleShape.AppendShape({sequenceLength});
auto sequenceData = MakeSharedObject<NDArrayView>(sequenceShape, colStarts, rowIndices, nonZeroValues, numNonZeroValues, device, readOnly);
auto sequenceData = MakeSharedObject<NDArrayView>(AsDataType<ElementType>(), sequenceShape, colStarts, rowIndices, nonZeroValues, numNonZeroValues, device, readOnly);
return Create(sampleShape, {sequenceData}, {sequenceStartFlag}, device, readOnly, false);
}
@ -683,6 +672,9 @@ namespace CNTK
case DataType::Double:
valueObject = Utils::GetValueObjectFromCNTKImplMatrixAndMBLayout(m_sampleShape, m_sampleDynamicAxes, *(m_packedData->GetMatrix<double>()), m_packedDataLayout, m_isReadOnly);
break;
case DataType::Float16:
valueObject = Utils::GetValueObjectFromCNTKImplMatrixAndMBLayout(m_sampleShape, m_sampleDynamicAxes, *(m_packedData->GetMatrix<half>()), m_packedDataLayout, m_isReadOnly);
break;
default:
LogicError("Unsupported DataType %s", DataTypeName(dataType));
}
@ -728,7 +720,7 @@ namespace CNTK
bool found = false;
for (size_t i = 0; i < sampleSize; i++)
{
if (*currentp == 1)
if (*currentp == (ElementType)1)
{
if (found)
RuntimeError("CopyDenseToOneHot: Cannot convert to onehot vector; more than one non-zero value in the sample.");
@ -736,7 +728,7 @@ namespace CNTK
index = i;
found = true;
}
else if (*currentp != 0)
else if (*currentp != (ElementType)0)
RuntimeError("CopyDenseToOneHot: Cannot convert to onehot vector; contains value other than 0/1.");
currentp++;
@ -753,26 +745,38 @@ namespace CNTK
// Explicit template instantiations
template /*static*/ CNTK_API ValuePtr Value::Create<float>(const NDShape& sampleShape, const std::vector<std::vector<float>>& sequences, const std::vector<bool>& sequenceStartFlags, const DeviceDescriptor& device, bool readOnly/* = false*/);
template /*static*/ CNTK_API ValuePtr Value::Create<double>(const NDShape& sampleShape, const std::vector<std::vector<double>>& sequences, const std::vector<bool>& sequenceStartFlags, const DeviceDescriptor& device, bool readOnly/* = false*/);
template /*static*/ CNTK_API ValuePtr Value::Create<float16>(const NDShape& sampleShape, const std::vector<std::vector<float16>>& sequences, const std::vector<bool>& sequenceStartFlags, const DeviceDescriptor& device, bool readOnly/* = false*/);
template /*static*/ CNTK_API ValuePtr Value::Create<float>(const NDShape& sampleShape, const std::vector<std::vector<size_t>>& oneHotSequences, const std::vector<bool>& sequenceStartFlags, const DeviceDescriptor& device, bool readOnly/* = false*/);
template /*static*/ CNTK_API ValuePtr Value::Create<double>(const NDShape& sampleShape, const std::vector<std::vector<size_t>>& oneHotSequences, const std::vector<bool>& sequenceStartFlags, const DeviceDescriptor& device, bool readOnly/* = false*/);
template /*static*/ CNTK_API ValuePtr Value::Create<float16>(const NDShape& sampleShape, const std::vector<std::vector<size_t>>& oneHotSequences, const std::vector<bool>& sequenceStartFlags, const DeviceDescriptor& device, bool readOnly/* = false*/);
template /*static*/ CNTK_API ValuePtr Value::CreateBatch<float>(const NDShape& sampleShape, const std::vector<float>& batchData, const DeviceDescriptor& device, bool readOnly /*= false */);
template /*static*/ CNTK_API ValuePtr Value::CreateBatch<double>(const NDShape& sampleShape, const std::vector<double>& batchData, const DeviceDescriptor& device, bool readOnly /*= false */);
template /*static*/ CNTK_API ValuePtr Value::CreateBatch<float16>(const NDShape& sampleShape, const std::vector<float16>& batchData, const DeviceDescriptor& device, bool readOnly /*= false */);
template /*static*/ CNTK_API ValuePtr Value::CreateSequence<float>(const NDShape& sampleShape, const std::vector<float>& sequenceData, bool sequenceStartFlag, const DeviceDescriptor& device, bool readOnly /*= false */);
template /*static*/ CNTK_API ValuePtr Value::CreateSequence<double>(const NDShape& sampleShape, const std::vector<double>& sequenceData, bool sequenceStartFlag, const DeviceDescriptor& device, bool readOnly /*= false */);
template /*static*/ CNTK_API ValuePtr Value::CreateSequence<float16> (const NDShape& sampleShape, const std::vector<float16>& sequenceData, bool sequenceStartFlag, const DeviceDescriptor& device, bool readOnly /*= false */);
template /*static*/ CNTK_API ValuePtr Value::CreateBatch<float>(size_t dimension, const std::vector<size_t>& batchData, const DeviceDescriptor& device, bool readOnly/* = false*/);
template /*static*/ CNTK_API ValuePtr Value::CreateBatch<double>(size_t dimension, const std::vector<size_t>& batchData, const DeviceDescriptor& device, bool readOnly/* = false*/);
template /*static*/ CNTK_API ValuePtr Value::CreateBatch<float16> (size_t dimension, const std::vector<size_t>& batchData, const DeviceDescriptor& device, bool readOnly/* = false*/);
template /*static*/ CNTK_API ValuePtr Value::CreateSequence<float>(size_t dimension, const std::vector<size_t>& sequenceData, bool sequenceStartFlag, const DeviceDescriptor& device, bool readOnly/* = false*/);
template /*static*/ CNTK_API ValuePtr Value::CreateSequence<double>(size_t dimension, const std::vector<size_t>& sequenceData, bool sequenceStartFlag, const DeviceDescriptor& device, bool readOnly/* = false*/);
template /*static*/ CNTK_API ValuePtr Value::CreateSequence<float16>(size_t dimension, const std::vector<size_t>& sequenceData, bool sequenceStartFlag, const DeviceDescriptor& device, bool readOnly/* = false*/);
template /*static*/ CNTK_API ValuePtr Value::CreateSequence<float>(const NDShape& sampleShape, size_t sequenceLength, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const float* nonZeroValues, size_t numNonZeroValues, bool sequenceStartFlag, const DeviceDescriptor& device, bool readOnly/* = false*/);
template /*static*/ CNTK_API ValuePtr Value::CreateSequence<double>(const NDShape& sampleShape, size_t sequenceLength, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const double* nonZeroValues, size_t numNonZeroValues, bool sequenceStartFlag, const DeviceDescriptor& device, bool readOnly/* = false*/);
template /*static*/ CNTK_API ValuePtr Value::CreateSequence<float16>(const NDShape& sampleShape, size_t sequenceLength, const SparseIndexType* colStarts, const SparseIndexType* rowIndices, const float16* nonZeroValues, size_t numNonZeroValues, bool sequenceStartFlag, const DeviceDescriptor& device, bool readOnly/* = false*/);
template CNTK_API void Value::CopyVariableValueToVector<float>(const Variable& outputVariable, std::vector<std::vector<float>>& sequences);
template CNTK_API void Value::CopyVariableValueToVector<double>(const Variable& outputVariable, std::vector<std::vector<double>>& sequences);
template CNTK_API void Value::CopyVariableValueToVector<float16>(const Variable& outputVariable, std::vector<std::vector<float16>>& sequences);
template CNTK_API void Value::CopyVariableValueToVector<float>(const Variable& outputVariable, std::vector<std::vector<size_t>>& sequences);
template CNTK_API void Value::CopyVariableValueToVector<double>(const Variable& outputVariable, std::vector<std::vector<size_t>>& sequences);
template CNTK_API void Value::CopyVariableValueToVector<float16>(const Variable& outputVariable, std::vector<std::vector<size_t>>& sequences);
template CNTK_API std::tuple<size_t, size_t, size_t> Value::ValidateSparseCSCAndGetIndexBufferSizes<float>(const Variable& outputVariable);
template CNTK_API std::tuple<size_t, size_t, size_t> Value::ValidateSparseCSCAndGetIndexBufferSizes<double>(const Variable& outputVariable);
template CNTK_API std::tuple<size_t, size_t, size_t> Value::ValidateSparseCSCAndGetIndexBufferSizes<float16>(const Variable& outputVariable);
template CNTK_API void Value::CopyVariableValueToCSCSparse<float>(size_t sequenceLength, std::vector<SparseIndexType>& colStarts, std::vector<SparseIndexType>& rowIndices, std::vector<float>& nonZeroValues, size_t& numNonZeroValues);
template CNTK_API void Value::CopyVariableValueToCSCSparse<double>(size_t sequenceLength, std::vector<SparseIndexType>& colStarts, std::vector<SparseIndexType>& rowIndices, std::vector<double>& nonZeroValues, size_t& numNonZeroValues);
template CNTK_API void Value::CopyVariableValueToCSCSparse<float16>(size_t sequenceLength, std::vector<SparseIndexType>& colStarts, std::vector<SparseIndexType>& rowIndices, std::vector<float16>& nonZeroValues, size_t& numNonZeroValues);
template float Value::AsScalar<float>() const;
template double Value::AsScalar<double>() const;
template float16 Value::AsScalar<float16>() const;
}


@ -135,6 +135,11 @@ namespace CNTK
m_dataFields->m_value = CreateValueFromParameterInitializer<double>(Shape(), *m_dataFields->m_valueInitializer, *m_dataFields->m_valueInitializationDevice);
break;
}
case DataType::Float16:
{
m_dataFields->m_value = CreateValueFromParameterInitializer<half>(Shape(), *m_dataFields->m_valueInitializer, *m_dataFields->m_valueInitializationDevice);
break;
}
default:
LogicError("Variable '%S' Value(): Unsupported DataType %s", AsString().c_str(), DataTypeName(GetDataType()));
break;
@ -496,7 +501,8 @@ namespace CNTK
DataType dataType = DataType(dict[dataTypeKey].Value<std::size_t>());
if (dataType != DataType::Unknown &&
dataType != DataType::Float &&
dataType != DataType::Double)
dataType != DataType::Double &&
dataType != DataType::Float16)
{
LogicError("Unexpected variable datatype '%ls':'%u' (%s).",
dataTypeKey.c_str(),


@ -23,6 +23,7 @@ message NDArrayView {
Unknown = 0;
Float = 1;
Double = 2;
Float16 = 4;
}
enum StorageFormat {


@ -1497,7 +1497,7 @@ FunctionPtr ONNXToCNTKHelper::CreateFunction(const Node *node, const std::vector
CNTK::PaddingMode cntkPaddingMode;
double cntkConstantValue = 0.0;
auto mode = GetNamedAttributeAsString(node, "mode", "constant");
std::transform(mode.begin(), mode.end(), mode.begin(), ::tolower);
std::transform(mode.begin(), mode.end(), mode.begin(), [](char v) { return (char)::tolower(v); });
if (mode == "constant")
cntkPaddingMode = CNTK::PaddingMode::CONSTANTPAD;
else if (mode == "reflect")
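
Wrapping ::tolower in a lambda that casts the int result back to char keeps the conversion explicit (avoiding narrowing-conversion warnings on some compilers); behaviour is unchanged for the ASCII attribute strings involved. A minimal standalone version of the lowercasing step, assuming an ASCII mode string:

    #include <algorithm>
    #include <cctype>
    #include <iostream>
    #include <string>

    int main()
    {
        std::string mode = "Constant";
        std::transform(mode.begin(), mode.end(), mode.begin(),
                       [](char v) { return (char)::tolower(v); }); // tolower returns int; cast it back to char
        std::cout << mode << "\n"; // prints "constant"
        return 0;
    }
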


@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>


@ -34,6 +34,7 @@
#define WRITE_BUFFER_SIZE (1024 * 1024)
#include <boost/algorithm/string.hpp>
#include "half.hpp"
namespace Microsoft { namespace MSR { namespace CNTK {
@ -978,9 +979,11 @@ template <class ElemType>
template vector<float> File::LoadMatrixFromTextFile<float> (const std::wstring& filePath, size_t& /*out*/ numRows, size_t& /*out*/ numCols);
template vector<double> File::LoadMatrixFromTextFile<double>(const std::wstring& filePath, size_t& /*out*/ numRows, size_t& /*out*/ numCols);
template vector<half> File::LoadMatrixFromTextFile<half>(const std::wstring& filePath, size_t& /*out*/ numRows, size_t& /*out*/ numCols);
template vector<float> File::LoadMatrixFromStringLiteral<float> (const std::string& literal, size_t& /*out*/ numRows, size_t& /*out*/ numCols);
template vector<double> File::LoadMatrixFromStringLiteral<double>(const std::string& literal, size_t& /*out*/ numRows, size_t& /*out*/ numCols);
template vector<half> File::LoadMatrixFromStringLiteral<half>(const std::string& literal, size_t& /*out*/ numRows, size_t& /*out*/ numCols);
#ifndef CNTK_COMPONENT_VERSION
#error CNTK_COMPONENT_VERSION must be set


@ -296,7 +296,7 @@ struct utf8 : std::string
} // empty string
std::vector<char> buf(3 * len + 1); // max: 1 wchar => up to 3 mb chars
// ... TODO: this fill() should be unnecessary (a 0 is appended)--but verify
std::fill(buf.begin(), buf.end(), 0);
std::fill(buf.begin(), buf.end(), (char)0);
int rc = WideCharToMultiByte(CP_UTF8, 0, p.c_str(), (int) len,
&buf[0], (int) buf.size(), NULL, NULL);
if (rc == 0)
@ -335,7 +335,7 @@ static inline std::string wcstombs(const std::wstring& p) // output: MBCS
{
size_t len = p.length();
std::vector<char> buf(2 * len + 1); // max: 1 wchar => 2 mb chars
std::fill(buf.begin(), buf.end(), 0);
std::fill(buf.begin(), buf.end(), (char)0);
::wcstombs(&buf[0], p.c_str(), 2 * len + 1);
return std::string(&buf[0]);
}


@ -159,12 +159,10 @@ public:
template <typename T>
File& operator<<(T val)
{
{
if (IsTextBased())
fputText(m_file, val);
else
fput(m_file, val);
}
if (IsTextBased())
fputText(m_file, val);
else
fput(m_file, val);
return *this;
}
File& operator<<(const std::wstring& val);


@ -1158,8 +1158,10 @@ public:
void release(bool cpumode);
void setloglls(const Microsoft::MSR::CNTK::Matrix<float>& loglls);
void setloglls(const Microsoft::MSR::CNTK::Matrix<double>& loglls);
void setloglls(const Microsoft::MSR::CNTK::Matrix<half>& loglls);
void getgamma(Microsoft::MSR::CNTK::Matrix<float>& loglls);
void getgamma(Microsoft::MSR::CNTK::Matrix<double>& loglls);
void getgamma(Microsoft::MSR::CNTK::Matrix<half>& loglls);
};
// forward-backward function


@ -2099,7 +2099,7 @@ static inline std::string wcstombs(const std::wstring& p) // output: MBCS
{
size_t len = p.length();
vector<char> buf(2 * len + 1); // max: 1 wchar => 2 mb chars
fill(buf.begin(), buf.end(), 0);
fill(buf.begin(), buf.end(), (char)0);
::wcstombs(&buf[0], p.c_str(), 2 * len + 1);
return std::string(&buf[0]);
}


@ -47,6 +47,12 @@ vector<MemRequestInfo<double>>& MatrixPool::GetMemRequestInfoVec<double>()
return m_memRequestInfoDoubleVec;
}
template <>
vector<MemRequestInfo<half>>& MatrixPool::GetMemRequestInfoVec<half>()
{
return m_memRequestInfoHalfVec;
}
// -----------------------------------------------------------------------
// construction
// -----------------------------------------------------------------------
@ -129,6 +135,8 @@ void ComputationNetwork::SaveToFileImpl(const wstring& fileName, const FileOptio
precision = ElemTypeName<float>();
else if (nodePtr->Is<ComputationNode<double>>())
precision = ElemTypeName<double>();
else if (nodePtr->Is<ComputationNode<half>>())
precision = ElemTypeName<half>();
else LogicError("Unexpected node type.");
fstream << precision;
#endif
@ -229,7 +237,7 @@ void ComputationNetwork::ReadPersistableParameters(size_t modelVersion, File& fs
{
wstring precision;
if (modelVersion >= CNTK_MODEL_VERSION_7)
fstream >> precision; // "float" or "double"; default is "" meaning <ElemType> as passed in from outside
fstream >> precision; // "float" or "double" or "half"; default is "" meaning <ElemType> as passed in from outside
wstring opName, nodeName;
fstream >> opName >> nodeName;
@ -241,6 +249,8 @@ void ComputationNetwork::ReadPersistableParameters(size_t modelVersion, File& fs
node = ComputationNetworkBuilder<float>::NewNode(opName, m_deviceId, nodeName);
else if (precision == L"double")
node = ComputationNetworkBuilder<double>::NewNode(opName, m_deviceId, nodeName);
else if (precision == L"half")
node = ComputationNetworkBuilder<half>::NewNode(opName, m_deviceId, nodeName);
else if (precision == L"") // old file format: default to <ElemType>
node = ComputationNetworkBuilder<ElemType>::NewNode(opName, m_deviceId, nodeName);
else
@ -434,9 +444,10 @@ void ComputationNetwork::InitLearnableParameters(const ComputationNodeBasePtr& n
{
randomSeed += GetRandomSeedOffset();
if (TryPostInitParameters<float> (node, initString, initValue, randomSeed, initOnCPUOnly) ||
TryPostInitParameters<double>(node, initString, initValue, randomSeed, initOnCPUOnly))
TryPostInitParameters<double>(node, initString, initValue, randomSeed, initOnCPUOnly) ||
TryPostInitParameters<half> (node, initString, initValue, randomSeed, initOnCPUOnly))
return;
LogicError("InitLearnableParameters: Input node is not a LearnableParameter<float or double>");
LogicError("InitLearnableParameters: Input node is not a LearnableParameter<float or double or half>");
}
// non-static version needed because it accesses m_randomSeedOffset
@ -666,6 +677,9 @@ void ComputationNetwork::SetSeqParam(ComputationNetworkPtr net,
auto noded = dynamic_pointer_cast<ConvolutionNode<double>>(*nodeIter);
if (noded)
noded->SetmMaxTempMemSizeInSamples(maxTempMemSizeInSamples);
auto nodeh = dynamic_pointer_cast<ConvolutionNode<half>>(*nodeIter);
if (nodeh)
nodeh->SetmMaxTempMemSizeInSamples(maxTempMemSizeInSamples);
}
}
}
@ -1233,7 +1247,7 @@ void ComputationNetwork::SaveToDbnFile(ComputationNetworkPtr net, const std::wst
auto GetAllPriorNodes = [](ComputationNodeBasePtr node)->bool
{
std::wstring lowerName = node->GetName();
std::transform(lowerName.begin(), lowerName.end(), lowerName.begin(), ::tolower);
std::transform(lowerName.begin(), lowerName.end(), lowerName.begin(), [](wchar_t v) { return (wchar_t)::tolower(v); });
return node->OperationName() == OperationNameOf(LearnableParameter) && (lowerName.find(L"prior") != wstring::npos);
};
@ -1536,6 +1550,15 @@ template void ComputationNetwork::SetSeqParam<double>(ComputationNetworkPtr net,
const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR);
template void ComputationNetwork::SaveToDbnFile<double>(ComputationNetworkPtr net, const std::wstring& fileName) const;
template void ComputationNetwork::InitLearnableParametersWithBilinearFill<half>(const ComputationNodeBasePtr& node, size_t kernelWidth, size_t kernelHeight);
template void ComputationNetwork::Read<half>(const wstring& fileName);
template void ComputationNetwork::ReadPersistableParameters<half>(size_t modelVersion, File& fstream, bool create);
template void ComputationNetwork::PerformSVDecomposition<half>(const map<wstring, float>& SVDConfig, size_t alignedsize);
template /*static*/ void ComputationNetwork::SetBatchNormalizationTimeConstants<half>(ComputationNetworkPtr net, const ComputationNodeBasePtr& criterionNode, const double normalizationTimeConstant, double& prevNormalizationTimeConstant, double blendTimeConstant, double& prevBlendTimeConstant);
template void ComputationNetwork::SetSeqParam<half>(ComputationNetworkPtr net, const ComputationNodeBasePtr criterionNode, const double& hsmoothingWeight, const double& frameDropThresh, const bool& doreferencealign,
const double& amf, const double& lmf, const double& wp, const double& bMMIfactor, const bool& sMBR);
template void ComputationNetwork::SaveToDbnFile<half>(ComputationNetworkPtr net, const std::wstring& fileName) const;
// register ComputationNetwork with the ScriptableObject system
ScriptableObjects::ConfigurableRuntimeTypeRegister::Add<ComputationNetwork> registerComputationNetwork(L"ComputationNetwork");
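
ReadPersistableParameters above turns the serialized precision tag ("float", "double", "half", or empty for the old format) into the matching ComputationNetworkBuilder instantiation. A minimal sketch of that kind of string-keyed factory dispatch (toy types only, not the CNTK classes):

    #include <functional>
    #include <map>
    #include <memory>
    #include <stdexcept>
    #include <string>

    struct NodeBase { virtual ~NodeBase() = default; };
    template <typename ElemType> struct Node : NodeBase { };
    struct half16 { unsigned short bits; }; // stand-in for a 16-bit float element type

    // Map a precision tag read from the model file to a factory for the matching node type.
    inline std::shared_ptr<NodeBase> NewNodeForPrecision(const std::wstring& precision)
    {
        static const std::map<std::wstring, std::function<std::shared_ptr<NodeBase>()>> factories =
        {
            { L"float",  [] { return std::make_shared<Node<float>>();  } },
            { L"double", [] { return std::make_shared<Node<double>>(); } },
            { L"half",   [] { return std::make_shared<Node<half16>>(); } },
        };
        auto it = factories.find(precision);
        if (it == factories.end())
            throw std::invalid_argument("unknown precision tag");
        return it->second();
    }
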


@ -1309,10 +1309,12 @@ typedef ComputationNetwork::ComputationNetworkPtr ComputationNetworkPtr;
template <typename ElemType> static inline const wchar_t* ElemTypeName();
template <> /*static*/ inline const wchar_t* ElemTypeName<float>() { return L"float"; }
template <> /*static*/ inline const wchar_t* ElemTypeName<double>() { return L"double"; }
template <> /*static*/ inline const wchar_t* ElemTypeName<half>() { return L"half"; }
// The following emits the class and enables the BaseMatrix<double> to be available (used by EvalDll)
// The corresponding Matrix<float> is emitted in the SetDeviceId function above.
template class Matrix<double>;
template class Matrix<half>;
// TODOs:
// - automatic inference of time window w.r.t. delay nodes (and related nodes such as a temporal pooling)


@ -226,9 +226,10 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Creat
// this is used in V2
template <class ElemType>
shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::CreateLearnableParameter(const std::wstring& paramName, const TensorShape& tensorShape)
template <class ValueType>
shared_ptr<ComputationNode<ValueType>> ComputationNetworkBuilder<ElemType>::TypedCreateLearnableParameter(const std::wstring& paramName, const TensorShape& tensorShape)
{
return net.AddNodeToNetWithElemType(New<LearnableParameter<ElemType>>(net.GetDeviceId(), paramName, tensorShape));
return net.AddNodeToNetWithElemType(New<LearnableParameter<ValueType>>(net.GetDeviceId(), paramName, tensorShape));
}
// TODO: change these to take an actual object instead of a name for dynamicAxis
@ -964,5 +965,17 @@ shared_ptr<ComputationNode<ElemType>> ComputationNetworkBuilder<ElemType>::Batch
template class ComputationNetworkBuilder<float>;
template class ComputationNetworkBuilder<double>;
template class ComputationNetworkBuilder<half>;
// V2 allows mixed precision
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<float>::TypedCreateLearnableParameter<float>(const std::wstring& paramName, const TensorShape& tensorShape);
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<float>::TypedCreateLearnableParameter<double>(const std::wstring& paramName, const TensorShape& tensorShape);
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<float>::TypedCreateLearnableParameter<half>(const std::wstring& paramName, const TensorShape& tensorShape);
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<double>::TypedCreateLearnableParameter<float>(const std::wstring& paramName, const TensorShape& tensorShape);
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<double>::TypedCreateLearnableParameter<double>(const std::wstring& paramName, const TensorShape& tensorShape);
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<double>::TypedCreateLearnableParameter<half>(const std::wstring& paramName, const TensorShape& tensorShape);
template shared_ptr<ComputationNode<float>> ComputationNetworkBuilder<half>::TypedCreateLearnableParameter<float>(const std::wstring& paramName, const TensorShape& tensorShape);
template shared_ptr<ComputationNode<double>> ComputationNetworkBuilder<half>::TypedCreateLearnableParameter<double>(const std::wstring& paramName, const TensorShape& tensorShape);
template shared_ptr<ComputationNode<half>> ComputationNetworkBuilder<half>::TypedCreateLearnableParameter<half>(const std::wstring& paramName, const TensorShape& tensorShape);
}}}
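
The explicit instantiations above let a builder of one element type create LearnableParameter nodes of another, which is what allows a 16-bit graph to keep selected parameters in 32-bit. A toy sketch of the same shape, a class template with a member factory templated on a second type (hypothetical classes, not the CNTK API):

    #include <iostream>
    #include <memory>
    #include <typeinfo>

    template <typename ValueType>
    struct Parameter
    {
        void Describe() const { std::cout << "Parameter of " << typeid(ValueType).name() << "\n"; }
    };

    template <typename ElemType> // the precision the builder defaults to
    struct Builder
    {
        // Default creation uses the builder's own element type...
        std::shared_ptr<Parameter<ElemType>> CreateParameter() const
        {
            return this->template TypedCreateParameter<ElemType>();
        }
        // ...while the typed overload may produce any precision, enabling mixed-precision graphs.
        template <typename ValueType>
        std::shared_ptr<Parameter<ValueType>> TypedCreateParameter() const
        {
            return std::make_shared<Parameter<ValueType>>();
        }
    };

    int main()
    {
        Builder<float> builder;                             // imagine ElemType == half here
        builder.CreateParameter()->Describe();              // same precision as the builder
        builder.TypedCreateParameter<double>()->Describe(); // different precision on request
        return 0;
    }
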


@ -45,7 +45,14 @@ public:
// TODO: separate into nodes that have inputs and those that duplicate functions with input adding except just not adding inputs. Clear?
ComputationNodePtr CreateLearnableParameter(const std::wstring& paramName, const size_t rows, const size_t cols);
ComputationNodePtr CreateLearnableParameter(const std::wstring& paramName, const TensorShape& tensorShape); // V2
shared_ptr<ComputationNode<ElemType>> CreateLearnableParameter(const std::wstring& paramName, const TensorShape& tensorShape)
{
return this->template TypedCreateLearnableParameter<ElemType>(paramName, tensorShape);
}
template<class ValueType>
shared_ptr<ComputationNode<ValueType>> TypedCreateLearnableParameter(const std::wstring& paramName, const TensorShape& tensorShape); // V2
// sparse matrix size is optionally specified
// ComputationNodePtr CreateSparseLearnableParameter(const std::wstring & paramName, const size_t rows, const size_t cols, const size_t size = 0);
ComputationNodePtr CreateInputNode(const std::wstring& inputName, const size_t rows, const wstring& dynamicAxisName = L"");


@ -111,7 +111,7 @@ ComputationNodeBasePtr ComputationNetwork::GetNestedNetwork(const ComputationNod
// concurrent computation in bulk CUDA launches.
// -----------------------------------------------------------------------
template<class ElemType> static bool DumpNode(ComputationNodeBasePtr nodep, bool dumpGradient);
static bool DumpNode(ComputationNodeBasePtr nodep, bool dumpGradient);
ComputationNetwork::PARTraversalFlowControlNode::PARTraversalFlowControlNode(const std::vector<shared_ptr<SEQTraversalFlowControlNode>>& recurrentInfo, const std::list<ComputationNodeBasePtr>& allNodes /*must be in eval order*/)
{
@ -152,7 +152,7 @@ ComputationNetwork::PARTraversalFlowControlNode::PARTraversalFlowControlNode(con
// Extreme Tracing, part 1/4
if (node->HasEnvironmentPtr() && node->Environment().ShouldDumpNode())
DumpNode<float>(node, /*dumpGradient=*/false) || DumpNode<double>(node, false);
DumpNode(node, /*dumpGradient=*/false);
}
}
@ -188,7 +188,7 @@ ComputationNetwork::PARTraversalFlowControlNode::PARTraversalFlowControlNode(con
// Extreme Tracing, part 2/4
if (node->HasEnvironmentPtr() && node->Environment().ShouldDumpNode() && node->NeedsGradient())
DumpNode<float>(node, /*dumpGradient=*/true) || DumpNode<double>(node, true);
DumpNode(node, /*dumpGradient=*/true);
}
}
/*virtual*/ void ComputationNetwork::PARTraversalFlowControlNode::RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) /*override*/
@ -207,26 +207,36 @@ ComputationNetwork::PARTraversalFlowControlNode::PARTraversalFlowControlNode(con
{
}
// helper for logging. Returns false if it was not able to dynamic-cast nodep to ComputationNode<ElemType>
template<class ElemType>
static bool DumpNode(ComputationNodeBasePtr nodep, bool dumpGradient)
template<typename ElemType>
bool TypedDumpNode(shared_ptr<ComputationNode<ElemType>> node, bool dumpGradient)
{
let node = dynamic_pointer_cast<ComputationNode<ElemType>>(nodep);
if (!node)
return false;
let dataPtr = dumpGradient ? node->GradientPtr() : node->ValuePtr();
if (!dataPtr)
return true; // e.g. SEQ sentinel node
bool concise = !(nodep->Environment().IsLogLevelNodeTrace());
bool concise = !(node->Environment().IsLogLevelNodeTrace());
fprintf(stderr, "Dump --> %s%s\n", node->FormatOperationPrototype("").c_str(), dumpGradient ? " Grad" : "");
node->WriteMinibatchWithFormatting(stderr, FrameRange(), SIZE_MAX, SIZE_MAX, false/*transpose*/, /*isCategoryLabel=*/false, /*isSparse=*/false, std::vector<std::string>(),
""/*sequenceSeparator*/, " "/*sequencePrologue*/, "\n"/*sequenceEpilogue*/, " "/*elementSeparator*/, "\n "/*sampleSeparator*/,
"%13.10f"/*valueFormatString*/, dumpGradient, concise);
""/*sequenceSeparator*/, " "/*sequencePrologue*/, "\n"/*sequenceEpilogue*/, " "/*elementSeparator*/, "\n "/*sampleSeparator*/,
"%13.10f"/*valueFormatString*/, dumpGradient, concise);
return true;
}
// helper for logging. Returns false if it was not able to dump
static bool DumpNode(ComputationNodeBasePtr nodep, bool dumpGradient)
{
let nodef = dynamic_pointer_cast<ComputationNode<float>>(nodep);
if (nodef) return TypedDumpNode<float>(nodef, dumpGradient);
let noded = dynamic_pointer_cast<ComputationNode<double>>(nodep);
if (noded) return TypedDumpNode<double>(noded, dumpGradient);
let nodeh = dynamic_pointer_cast<ComputationNode<half>>(nodep);
if (nodeh) return TypedDumpNode<half>(nodeh, dumpGradient);
return false;
}
// -----------------------------------------------------------------------
// SEQTraversalFlowControlNode methods -- implements SEQ traversal (loop unrolling)
//
@ -280,7 +290,7 @@ static bool DumpNode(ComputationNodeBasePtr nodep, bool dumpGradient)
{
if (node->HasEnvironmentPtr() && node->Environment().ShouldDumpNode())
{
DumpNode<float>(node, /*dumpGradient=*/false) || DumpNode<double>(node, false);
DumpNode(node, /*dumpGradient=*/false);
}
}
}
@ -321,7 +331,7 @@ static bool DumpNode(ComputationNodeBasePtr nodep, bool dumpGradient)
{
if (node->HasEnvironmentPtr() && node->Environment().ShouldDumpNode() && node->NeedsGradient())
{
DumpNode<float>(node, /*dumpGradient=*/true) || DumpNode<double>(node, true);
DumpNode(node, /*dumpGradient=*/true);
}
}
}


@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" InitialTargets="CheckDependencies" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Project DefaultTargets="Build" InitialTargets="CheckDependencies" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>


@ -16,7 +16,7 @@
namespace Microsoft { namespace MSR { namespace CNTK {
using namespace std;
using namespace std;
// -----------------------------------------------------------------------
// subroutines for evaluation
@ -85,18 +85,18 @@ void ComputationNode<ElemType>::Backprop(const FrameRange& fr, bool childrenInTh
for (size_t i = 0; i < m_inputs.size(); i++)
{
ComputationNodePtr child = Input(i);
if (child->m_needsGradient &&
ComputationNodeBasePtr child = m_inputs[i];
if (child->NeedsGradient() &&
((childrenInThisLoop && child->IsPartOfLoop() == IsPartOfLoop()) ||
(childrenInOuterLoop && child->IsPartOfLoop() != IsPartOfLoop()) ))
{
// fprintf(stderr, "Backprop: %ls %ls operation -> child %d %ls %ls\n", NodeName().c_str(), OperationName().c_str(), (int)i, child->NodeName().c_str(), child->OperationName().c_str());
if (!m_needsGradient)
if (!NeedsGradient())
LogicError("%ls %ls operation has m_needsGradient set to false but children require it.", NodeName().c_str(), OperationName().c_str());
#if DUMPOUTPUT
fprintf(stderr, "Backprop%d_%ls\n", i, NodeName().c_str());
#endif
child->LazyZeroGradient(this); // set gradient to 0 if this is the first time
SMART_NODE_INVOKE(ComputationNode, child, LazyZeroGradient, this); // set gradient to 0 if this is the first time
// If we propagate from a loop to a node that is outside the loop, we are not efficient.
// This case is handled by SEQTraversalFlowControlNode::Backprop().
@ -108,7 +108,7 @@ void ComputationNode<ElemType>::Backprop(const FrameRange& fr, bool childrenInTh
}
// before backprop, verify gradient optimization info
Input(i)->VerifyGradientOptimization(this);
SMART_NODE_INVOKE(ComputationNode, child, VerifyGradientOptimization, this);
// fprintf(stderr, "BackpropTo %d %d %ls %ls\n", (int)fr.timeIdxInSeq, (int)i, NodeName().c_str(), OperationName().c_str());
BackpropTo(i, fr); // this computes partial wrt to the child and sums the gradient value in the child
@ -753,11 +753,14 @@ template <class ElemType>
{
for (size_t i = 0; i < m_inputs.size(); i++)
{
ComputationNodePtr child = Input(i);
if (child->m_needsGradient)
ComputationNodeBasePtr child = m_inputs[i];
if (child->NeedsGradient())
{
child->MaskMissingGradientColumnsToZero(FrameRange(child->GetMBLayout())); // HasNaN() operates on a whole matrix, so first flatten all gaps to 0
if (child->Gradient().HasNan("EndBackprop"))
SMART_NODE_INVOKE(ComputationNode, child, MaskMissingGradientColumnsToZero, FrameRange(child->GetMBLayout())); // HasNaN() operates on a whole matrix, so first flatten all gaps to 0
bool hasNan = false;
SMART_NODE_INVOKE_WITH_RET(ComputationNode, child, Gradient().HasNan, hasNan, "EndBackprop");
if (hasNan)
{
LogicError("%ls %ls operation unexpectedly produced NaN gradients on its input %ls.", NodeName().c_str(), OperationName().c_str(), child->NodeName().c_str());
}
@ -1004,7 +1007,7 @@ void ComputationNode<ElemType>::WriteMinibatchWithFormatting(FILE* f,
double absSumLocal = 0;
for (size_t j = 0; j < jend; j++) // loop over elements
{
absSumLocal += abs(seqData[i * istride + j * jstride]);
absSumLocal += (double)abs(seqData[i * istride + j * jstride]);
}
absSum += absSumLocal;
}
@ -1136,6 +1139,7 @@ atomic_ullong TimeStamp::s_timeStampCounter = ATOMIC_VAR_INIT(0);
template <> map<size_t, map<size_t, shared_ptr<SingleMatrix>>> ComputationNode<float>::s_constOnes{};
template <> map<size_t, map<size_t, shared_ptr<DoubleMatrix>>> ComputationNode<double>::s_constOnes{};
template <> map<size_t, map<size_t, shared_ptr<HalfMatrix>>> ComputationNode<half>::s_constOnes{};
// -----------------------------------------------------------------------
// instantiate the core class templates
@ -1143,6 +1147,7 @@ template <> map<size_t, map<size_t, shared_ptr<DoubleMatrix>>> ComputationNode<d
template class ComputationNode<float>;
template class ComputationNode<double>;
template class ComputationNode<half>;
}}}


@ -1375,7 +1375,7 @@ public:
m_inputs.resize(inputs.size());
for (size_t i = 0; i < m_inputs.size(); i++)
if (inputs[i])
m_inputs[i] = DownCast(inputs[i]); // (DownCast() checks the type; the assignment then downcasts it again)
m_inputs[i] = inputs[i]; // remove DownCast check here to allow CastNode to pass
else
m_inputs[i] = nullptr; // during network creation, nullptrs are possible
@ -1430,6 +1430,19 @@ protected:
return DownCast(m_inputs[inputIndex]);
}
template<typename InputType>
inline shared_ptr<ComputationNode<InputType>> TypedInput(const size_t inputIndex) const
{
if (inputIndex >= m_inputs.size())
LogicError("Inputs: inputIndex %d is out of range for %ls %ls operation.", (int)inputIndex, NodeName().c_str(), OperationName().c_str());
shared_ptr<ComputationNode<InputType>> node = dynamic_pointer_cast<ComputationNode<InputType>>(m_inputs[inputIndex]);
if (!node)
InvalidArgument("an TypedInput of mismatching precision was passed");
return node;
}
// Fast downcast without runtime type check of dynamic_pointer_cast.
// Meant to be used in Forward and BackPropTo, assuming that Validate() has already used Input() which validated the correct types.
inline ComputationNode<ElemType>& InputRef(const size_t inputIndex) const
@ -1929,24 +1942,36 @@ protected:
// if the matrix's size will scale with minibatch size, set mbScale = true
// if workspace flag is true, the memory request will be treated specially. We assume workspace memory will share their own pointers
// this is currently a workaround for workspace memory for convolutions
void RequestMatrixFromPool(shared_ptr<Matrix<ElemType>>& matrixPtr, MatrixPool& matrixPool, size_t matrixSize=0, bool mbScale=false, bool isWorkSpace=false, bool aliasing=false)
template<typename ValueType>
void TypedRequestMatrixFromPool(shared_ptr<Matrix<ValueType>>& matrixPtr, MatrixPool& matrixPool, size_t matrixSize=0, bool mbScale=false, bool isWorkSpace=false, bool aliasing=false)
{
if (matrixPtr == nullptr)
{
if (aliasing)
matrixPool.RequestAliasedAllocate<ElemType>(m_deviceId, this, &matrixPtr, matrixSize, mbScale);
matrixPool.RequestAliasedAllocate<ValueType>(m_deviceId, this, &matrixPtr, matrixSize, mbScale);
else
matrixPool.RequestAllocate<ElemType>(m_deviceId, &matrixPtr, matrixSize, mbScale, isWorkSpace);
matrixPool.RequestAllocate<ValueType>(m_deviceId, &matrixPtr, matrixSize, mbScale, isWorkSpace);
}
}
void ReleaseMatrixToPool(shared_ptr<Matrix<ElemType>>& matrixPtr, MatrixPool& matrixPool, bool aliasing=false)
template<typename ValueType>
void TypedReleaseMatrixToPool(shared_ptr<Matrix<ValueType>>& matrixPtr, MatrixPool& matrixPool, bool aliasing=false)
{
assert(matrixPtr != nullptr);
if (aliasing)
matrixPool.RequestAliasedRelease<ElemType>(this);
matrixPool.RequestAliasedRelease<ValueType>(this);
else
matrixPool.RequestRelease<ElemType>(&matrixPtr);
matrixPool.RequestRelease<ValueType>(&matrixPtr);
}
void RequestMatrixFromPool(shared_ptr<Matrix<ElemType>>& matrixPtr, MatrixPool& matrixPool, size_t matrixSize = 0, bool mbScale = false, bool isWorkSpace = false, bool aliasing = false)
{
TypedRequestMatrixFromPool<ElemType>(matrixPtr, matrixPool, matrixSize, mbScale, isWorkSpace, aliasing);
}
void ReleaseMatrixToPool(shared_ptr<Matrix<ElemType>>& matrixPtr, MatrixPool& matrixPool, bool aliasing = false)
{
TypedReleaseMatrixToPool<ElemType>(matrixPtr, matrixPool, aliasing);
}
public:
@ -2114,7 +2139,7 @@ public:
s_constOnes[rows].find(cols) == s_constOnes[rows].end()) // not found
{
shared_ptr<Matrix<ElemType>> matrix = make_shared<Matrix<ElemType>>(rows, cols, (DEVICEID_TYPE) deviceId);
matrix->SetValue(1);
matrix->SetValue((ElemType)1);
s_constOnes[rows][cols] = matrix;
}
@ -2516,4 +2541,26 @@ public:
#pragma endregion base computation class
#define SMART_NODE_INVOKE(nodeClass, node, func, ...) \
do { \
if (dynamic_pointer_cast<nodeClass<float>>(node)) \
dynamic_pointer_cast<nodeClass<float>>(node)->func(__VA_ARGS__); \
else if (dynamic_pointer_cast<nodeClass<double>>(node)) \
dynamic_pointer_cast<nodeClass<double>>(node)->func(__VA_ARGS__); \
else if (dynamic_pointer_cast<nodeClass<half>>(node)) \
dynamic_pointer_cast<nodeClass<half>>(node)->func(__VA_ARGS__); \
else \
LogicError("Unknown nodeClass type"); \
} while(0)
#define SMART_NODE_INVOKE_WITH_RET(nodeClass, node, func, ret, ...) \
do { \
if (dynamic_pointer_cast<nodeClass<float>>(node)) \
ret = dynamic_pointer_cast<nodeClass<float>>(node)->func(__VA_ARGS__); \
else if (dynamic_pointer_cast<nodeClass<double>>(node)) \
ret = dynamic_pointer_cast<nodeClass<double>>(node)->func(__VA_ARGS__); \
else if (dynamic_pointer_cast<nodeClass<half>>(node)) \
ret = dynamic_pointer_cast<nodeClass<half>>(node)->func(__VA_ARGS__); \
else LogicError("Unknown ComputationNode type"); \
} while(0)
}}}
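
SMART_NODE_INVOKE lets code that only holds a ComputationNodeBasePtr call a member function on whichever of the three precision instantiations the node actually is, failing loudly otherwise. A self-contained sketch of the same dispatch shape (toy class and macro names; the real macro targets ComputationNode<float/double/half>):

    #include <iostream>
    #include <memory>
    #include <stdexcept>

    struct NodeBase { virtual ~NodeBase() = default; };

    template <typename ElemType>
    struct TypedNode : NodeBase
    {
        void Scale(double factor) { std::cout << "scaling a node with " << sizeof(ElemType) << "-byte elements by " << factor << "\n"; }
    };

    // Try each precision in turn and call the member on the first one that matches.
    #define TOY_NODE_INVOKE(node, func, ...)                                                            \
        do {                                                                                            \
            if (auto f = std::dynamic_pointer_cast<TypedNode<float>>(node)) f->func(__VA_ARGS__);       \
            else if (auto d = std::dynamic_pointer_cast<TypedNode<double>>(node)) d->func(__VA_ARGS__); \
            else throw std::logic_error("unknown node precision");                                      \
        } while (0)

    int main()
    {
        std::shared_ptr<NodeBase> node = std::make_shared<TypedNode<double>>();
        TOY_NODE_INVOKE(node, Scale, 0.5); // resolves to TypedNode<double>::Scale at run time
        return 0;
    }
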


@ -873,6 +873,7 @@ protected:
template class OneHotNode<float>;
template class OneHotNode<double>;
template class OneHotNode<half>;
#ifdef COMING_SOON


@ -104,7 +104,7 @@ LearnableParameter<ElemType>::LearnableParameter(const ScriptableObjects::IConfi
static unsigned long randomSeed = 1;
int forcedRandomSeed = configp->Get(L"randomSeed"); // forcing a specific random seed is useful for testing to get repeatable initialization independent of evaluation order
m_randomSeed = forcedRandomSeed < 0 ? randomSeed++ : (unsigned long)forcedRandomSeed;
m_initValueScale = configp->Get(L"initValueScale");
m_initValueScale = (ElemType)(float)configp->Get(L"initValueScale");
m_initFilterRank = configp->Get(L"initFilterRank");
m_initOutputRank = configp->Get(L"initOutputRank");
m_initOnCPUOnly = configp->Get(L"initOnCPUOnly");
@ -112,12 +112,12 @@ LearnableParameter<ElemType>::LearnableParameter(const ScriptableObjects::IConfi
else if (initString == L"zero")
{
m_initString = L"fromValue";
m_initValue = 0;
m_initValue = (ElemType)0;
}
else if (initString == L"fromValue") // from 'initValue'
{
m_initString = initString;
m_initValue = initValue;
m_initValue = (ElemType)(float)initValue;
}
else if (initString == L"bilinear")
{
@ -138,7 +138,7 @@ LearnableParameter<ElemType>::LearnableParameter(const ScriptableObjects::IConfi
else if (initString == L"fixedValue") // deprecated. Use initValue=... instead
{
m_initString = L"fromValue";
m_initValue = (ElemType)configp->Get(L"value");
m_initValue = (ElemType)(float)configp->Get(L"value");
}
else if (initString == L"fromLiteral") // deprecated. Use initValue=array instead
{
@ -549,7 +549,7 @@ void LearnableParameter<ElemType>::LazyInitParameters()
if (m_initString == L"fromValue")
{
if (GetEnvironmentPtr() && Environment().traceLevel > 0) // note: this will not log before node is part of network
fprintf(stderr, "%ls: Initializing Parameter[%s] <- %f.\n", NodeDescription().c_str(), string(GetSampleLayout()).c_str(), m_initValue);
fprintf(stderr, "%ls: Initializing Parameter[%s] <- %f.\n", NodeDescription().c_str(), string(GetSampleLayout()).c_str(), (float)m_initValue);
Value().SetValue(m_initValue);
}
else if (ParseRandomizationType(m_initString).second != 0)
@ -651,5 +651,6 @@ template <class ElemType>
template class LearnableParameter<float>;
template class LearnableParameter<double>;
template class LearnableParameter<half>;
}}}
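
The configuration values above are narrowed through an intermediate float, (ElemType)(float)..., presumably so the conversion goes through a path the 16-bit element type actually provides and stays unambiguous for the config value type. A stand-in illustration of that pattern (half16 below is a mock with a toy encoding, not CNTK's half):

    #include <cstdint>

    // Mock 16-bit float that, like some half implementations, converts from float only.
    struct half16
    {
        std::uint16_t bits;
        explicit half16(float v) : bits((std::uint16_t)(v * 1024.0f)) {} // toy encoding, not IEEE half
    };

    int main()
    {
        double initValue = 0.5;     // configuration values arrive as double
        half16 h((float)initValue); // double -> float -> half16; there is no half16(double)
        (void)h;
        return 0;
    }
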


@ -108,7 +108,7 @@ private:
{
fprintf(stderr, "%ls: Initializing Parameter[%s] <- %ls(seed=%d, init dims=[%d x %d], range=%f(%f*%f), onCPU=%s.\n)",
NodeDescription().c_str(), string(GetSampleLayout()).c_str(), m_initString.c_str(),
(int)randomSeed, (int)fanOut, (int)fanIn, range, range/initValueScale, initValueScale, initOnCPUOnly ? "true" : "false");
(int)randomSeed, (int)fanOut, (int)fanIn, (float)range, (float)(range/initValueScale), (float)(initValueScale), initOnCPUOnly ? "true" : "false");
}
}


@ -41,6 +41,9 @@ template void Microsoft::MSR::CNTK::UpdateRunningAverage<float>(ComputationNode<
template void Microsoft::MSR::CNTK::UpdateRunningAverage<double>(ComputationNode<double>& newInput,
TensorView<double>& runningAverage,
size_t& runningCount);
template void Microsoft::MSR::CNTK::UpdateRunningAverage<half>(ComputationNode<half>& newInput,
TensorView<half>& runningAverage,
size_t& runningCount);
template <class ElemType>
EpochAccumulatorNode<ElemType>::EpochAccumulatorNode(DEVICEID_TYPE deviceId, const wstring& name)
@ -127,4 +130,5 @@ void EpochAccumulatorNode<ElemType>::Reset()
}
template class EpochAccumulatorNode<float>;
template class EpochAccumulatorNode<double>;
template class EpochAccumulatorNode<double>;
template class EpochAccumulatorNode<half>;
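
The new half instantiations above cover UpdateRunningAverage, which folds each minibatch into a running mean kept alongside a sample count. The sketch below shows a generic incremental element-wise mean with the same ingredients; it is illustrative only and not necessarily the exact formula used by the node, which accumulates tensors rather than vectors:

    #include <cstddef>
    #include <vector>

    // Fold one new sample into a running element-wise average.
    void UpdateRunningAverageSketch(const std::vector<float>& newInput,
                                    std::vector<float>& runningAverage,
                                    std::size_t& runningCount)
    {
        if (runningAverage.empty())
            runningAverage.assign(newInput.size(), 0.0f);
        ++runningCount;
        for (std::size_t i = 0; i < newInput.size(); ++i)
            runningAverage[i] += (newInput[i] - runningAverage[i]) / (float)runningCount;
    }
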


@ -91,6 +91,7 @@ public:
template class PlusNode<float>;
template class PlusNode<double>;
template class PlusNode<half>;
// -----------------------------------------------------------------------
// LogPlusNode (summand1, summand2)
@ -165,6 +166,7 @@ public:
template class LogPlusNode<float>;
template class LogPlusNode<double>;
template class LogPlusNode<half>;
// -----------------------------------------------------------------------
@ -225,6 +227,7 @@ public:
template class PowNode<float>;
template class PowNode<double>;
template class PowNode<half>;
// -----------------------------------------------------------------------
@ -287,6 +290,7 @@ public:
template class MinusNode<float>;
template class MinusNode<double>;
template class MinusNode<half>;
// -----------------------------------------------------------------------
// ElementTimesNode (factor1, factor2)
@ -360,6 +364,7 @@ public:
template class ElementTimesNode<float>;
template class ElementTimesNode<double>;
template class ElementTimesNode<half>;
// -----------------------------------------------------------------------
// TimesNodeBase (A, B, outputRank=1)
@ -606,7 +611,7 @@ private:
Matrix<ElemType> inputValueSlice = unpackedInputValue.ColumnSlice(s * maxNumTimeSteps, maxNumTimeSteps); // k x s*
inputValueSlice.Reshape(k * maxNumTimeSteps, 1); // (k * s*) x 1
Matrix<ElemType> gradientSlice = Gradient().ColumnSlice(s, 1); // m x 1
Matrix<ElemType>::MultiplyAndWeightedAdd(1, gradientSlice, false, inputValueSlice, true, unpacked[inputIndex] ? 0 : beta, inputGradientSlice);
Matrix<ElemType>::MultiplyAndWeightedAdd(1, gradientSlice, false, inputValueSlice, true, unpacked[inputIndex] ? (ElemType)0 : beta, inputGradientSlice);
}
if (unpacked[inputIndex])
@ -624,7 +629,7 @@ private:
Matrix<ElemType> inputValueSlice = unpackedInputValue.ColumnSlice(s * maxNumTimeSteps, maxNumTimeSteps); // (m * k) x s*
inputValueSlice.Reshape(m, k * maxNumTimeSteps); // m x (k * s*)
Matrix<ElemType> gradientSlice = Gradient().ColumnSlice(s, 1); // m x 1
Matrix<ElemType>::MultiplyAndWeightedAdd(1, inputValueSlice, true, gradientSlice, false, unpacked[inputIndex] ? 0 : beta, inputGradientSlice);
Matrix<ElemType>::MultiplyAndWeightedAdd(1, inputValueSlice, true, gradientSlice, false, unpacked[inputIndex] ? (ElemType)0 : beta, inputGradientSlice);
}
if (unpacked[inputIndex])
@ -1113,6 +1118,7 @@ public:
template class TimesNode<float>;
template class TimesNode<double>;
template class TimesNode<half>;
// -----------------------------------------------------------------------
// TransposeTimesNode (A', B)
@ -1142,6 +1148,7 @@ public:
template class TransposeTimesNode<float>;
template class TransposeTimesNode<double>;
template class TransposeTimesNode<half>;
// Fixed-point matrix product. This scales inputs to 16bit signed integers by Symmetric quantizers, performs
// integer multiplication using SSE/AVX2, and transforms the results back.
@ -1232,6 +1239,7 @@ public:
template class QuantizedTimesNode<float>;
template class QuantizedTimesNode<double>;
template class QuantizedTimesNode<half>;
// -----------------------------------------------------------------------
// SumElementsNode (input)
@ -1467,6 +1475,7 @@ private:
template class TransposeDimensionsNode<float>;
template class TransposeDimensionsNode<double>;
template class TransposeDimensionsNode<half>;
// -----------------------------------------------------------------------
// CosDistanceNode (left, right)
@ -1587,6 +1596,7 @@ private:
template class CosDistanceNode<float>;
template class CosDistanceNode<double>;
template class CosDistanceNode<half>;
// -----------------------------------------------------------------------
// KhatriRaoProductNode (left, right)
@ -1913,6 +1923,7 @@ private:
template class CosDistanceWithNegativeSamplesNode<float>;
template class CosDistanceWithNegativeSamplesNode<double>;
template class CosDistanceWithNegativeSamplesNode<half>;
template <class ElemType>
void UpdateRunningAverage(ComputationNode<ElemType>& newInput, TensorView<ElemType>& runningAverage,
@ -1992,4 +2003,48 @@ protected:
size_t m_numSamples;
};
// -----------------------------------------------------------------------
// CastNode converts data types from InputType to ElemType
// -----------------------------------------------------------------------
template <class ElemType, class InputType>
class CastNode : public UnaryElementWiseNode<ElemType>
{
typedef UnaryElementWiseNode<ElemType> Base; UsingUnaryElementwiseNodeBaseMembers;
static const std::wstring TypeName() { return L"Cast"; }
public:
CastNode(DEVICEID_TYPE deviceId, const wstring& name)
: Base(deviceId, name)
{
}
virtual void /*ComputationNode::*/ ForwardProp(const FrameRange& fr) override
{
auto result = ValueFor(fr);
auto input = static_cast<ComputationNode<InputType>&>(*m_inputs[0].get()).ValueFor(fr);
result.CastAssignValuesOf(input);
}
virtual void /*ComputationNode::*/ BackpropTo(const size_t /*inputIndex*/, const FrameRange& fr) override
{
auto grad = GradientFor(fr);
auto inputGrad = static_cast<ComputationNode<InputType>&>(*m_inputs[0].get()).GradientFor(fr);
inputGrad.CastAssignValuesOf(grad);
}
virtual void /*ComputationNodeBase::*/ Validate(bool isFinalValidationPass) override
{
ValidateUnaryMap(isFinalValidationPass);
}
virtual bool OutputUsedInComputingInputNodesGradients() const override { return false; }
virtual bool InputUsedInComputingInputNodesGradients(size_t /*childIndex*/) const override { return false; }
};
template class CastNode<half, float>;
template class CastNode<half, double>;
template class CastNode<float, half>;
template class CastNode<float, double>;
template class CastNode<double, half>;
template class CastNode<double, float>;
}}}
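
CastNode's ForwardProp converts the input's elements to ElemType and its BackpropTo converts the incoming gradient back to InputType, which is how mixed-precision graphs move data between 16- and 32-bit sections. A minimal element-wise version of that idea on plain buffers (hypothetical helpers, not the node itself):

    #include <algorithm>
    #include <cassert>
    #include <vector>

    // Forward: convert values from InputType to OutputType.
    template <typename OutputType, typename InputType>
    std::vector<OutputType> CastForward(const std::vector<InputType>& in)
    {
        std::vector<OutputType> out(in.size());
        std::transform(in.begin(), in.end(), out.begin(), [](InputType v) { return (OutputType)v; });
        return out;
    }

    // Backward: the gradient w.r.t. the input is the output gradient, converted back.
    template <typename InputType, typename OutputType>
    std::vector<InputType> CastBackward(const std::vector<OutputType>& outputGradient)
    {
        return CastForward<InputType>(outputGradient);
    }

    int main()
    {
        std::vector<double> x  = { 0.25, -1.5 };
        auto y  = CastForward<float>(x);   // like CastNode<float, double>::ForwardProp
        auto dx = CastBackward<double>(y); // gradient flows back in the input's precision
        assert(dx.size() == x.size());
        return 0;
    }
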


@ -72,6 +72,7 @@ public:
protected:
vector<MemRequestInfo<float>> m_memRequestInfoFloatVec;
vector<MemRequestInfo<double>> m_memRequestInfoDoubleVec;
vector<MemRequestInfo<half>> m_memRequestInfoHalfVec;
set<DEVICEID_TYPE> m_deviceIDSet;
int m_stepCounter;
@ -151,6 +152,7 @@ public:
// MatrixPool is not templated, so we call both float and double versions here
OptimizedMemoryAllocationFunc<float>();
OptimizedMemoryAllocationFunc<double>();
OptimizedMemoryAllocationFunc<half>();
return;
}
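
MatrixPool keeps an independent request list per element type, so float, double and half buffers are tracked and recycled separately. A toy pool with the same keyed-by-type structure (hypothetical, far simpler than CNTK's MatrixPool):

    #include <cstddef>
    #include <map>
    #include <memory>
    #include <typeindex>
    #include <vector>

    class ToyMatrixPool
    {
        // One free list per element type; buffers are type-erased while parked here.
        std::map<std::type_index, std::vector<std::shared_ptr<void>>> m_freeLists;

    public:
        template <typename ElemType>
        std::shared_ptr<std::vector<ElemType>> Request(std::size_t size)
        {
            auto& freeList = m_freeLists[std::type_index(typeid(ElemType))];
            if (!freeList.empty())
            {
                auto buf = std::static_pointer_cast<std::vector<ElemType>>(freeList.back());
                freeList.pop_back();
                buf->resize(size);
                return buf;
            }
            return std::make_shared<std::vector<ElemType>>(size);
        }

        template <typename ElemType>
        void Release(const std::shared_ptr<std::vector<ElemType>>& buf)
        {
            m_freeLists[std::type_index(typeid(ElemType))].push_back(buf);
        }
    };
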


@ -446,6 +446,7 @@ public:
template class HardmaxNode<float>;
template class HardmaxNode<double>;
template class HardmaxNode<half>;
@ -517,7 +518,8 @@ public:
CreateMatrixIfNull(m_steps);
auto dim = Input(0)->GetSampleLayout().GetDimPadded(0);
auto tmp = new ElemType[numCols];
std::generate(tmp, tmp + numCols, [i = ElemType(0), dim]() mutable { auto ret = i; i += dim; return ret; });
ElemType i = ElemType(0);
std::generate(tmp, tmp + numCols, [&i, dim]() mutable { auto ret = i; i += dim; return ret; });
m_steps->SetValue(1, numCols, this->m_deviceId, tmp);
delete[] tmp;
m_sortedIndices->ScaleAndAdd(ElemType(1), *m_steps, *m_sortedIndices);
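
The generate call above fills m_steps with 0, dim, 2*dim, ..., i.e. the linear offset of the first element of each column; hoisting i out of the lambda replaces the generalized init-capture, presumably for compatibility with the full range of supported compilers. A tiny standalone version of that fill, with hypothetical sizes:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    int main()
    {
        const std::size_t numCols = 5;
        const float dim = 3;      // rows per column in the flattened matrix
        std::vector<float> steps(numCols);
        float i = 0;
        std::generate(steps.begin(), steps.end(), [&i, dim] { float ret = i; i += dim; return ret; });
        // steps now holds 0, 3, 6, 9, 12
        return 0;
    }
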
@ -768,7 +770,8 @@ public: \
}; \
\
template class ClassName<float>; \
template class ClassName<double>;
template class ClassName<double>; \
template class ClassName<half>;
DefineComparisonNode(LessNode, -1, 0)
DefineComparisonNode(EqualNode, 0, 0)


@ -341,5 +341,6 @@ void OptimizedRNNStackNode<ElemType>::UnpackSequencesFromCuDNN(const Matrix<Elem
template class OptimizedRNNStackNode<float>;
template class OptimizedRNNStackNode<double>;
template class OptimizedRNNStackNode<half>;
}}}


@ -587,9 +587,11 @@ template<class ElemType, int direction>
// instantiate the classes that derive from the above
template class PastValueNode<float>;
template class PastValueNode<double>;
template class PastValueNode<half>;
template class FutureValueNode<float>;
template class FutureValueNode<double>;
template class FutureValueNode<half>;
// -----------------------------------------------------------------------
// DelayedValueNodeState -- helper class for exporting/importing state from/to DelayedValueNodes.


@ -373,6 +373,7 @@ template <class ElemType>
template class ReduceElementsNode<float>;
template class ReduceElementsNode<double>;
template class ReduceElementsNode<half>;
// -----------------------------------------------------------------------
// Where(bitVector) -- extract indices of non-0 values in a sequence
@ -490,6 +491,7 @@ template <class ElemType>
template class WhereNode<float>;
template class WhereNode<double>;
template class WhereNode<half>;
// -----------------------------------------------------------------------
// PackedIndexNode(targetObject, indexSequence) -- map sequence
@ -551,6 +553,7 @@ template <class ElemType>
template class PackedIndexNode<float>;
template class PackedIndexNode<double>;
template class PackedIndexNode<half>;
// -----------------------------------------------------------------------
// GatherPackedNode(packedIndex, sourceData) -- gather operation
@ -623,6 +626,7 @@ template <class ElemType>
template class GatherPackedNode<float>;
template class GatherPackedNode<double>;
template class GatherPackedNode<half>;
// -----------------------------------------------------------------------
// ScatterPackedNode(layoutData, packedIndex, sourceData) -- scatter operation
@ -686,6 +690,7 @@ template <class ElemType>
template class ScatterPackedNode<float>;
template class ScatterPackedNode<double>;
template class ScatterPackedNode<half>;
// -----------------------------------------------------------------------
// CropNode -- crop operation, crops first input according to shape of second
@ -1026,5 +1031,6 @@ bool CropNode<ElemType>::SupportsTransformOnInput(size_t inputIndex)
template class CropNode<float>;
template class CropNode<double>;
template class CropNode<half>;
}}}


@ -878,8 +878,8 @@ public:
public:
PaddingNode(DEVICEID_TYPE deviceId, const wstring& name, std::vector<size_t> head, std::vector<size_t> foot, PaddingType mode = PaddingType::CONSTANTPAD, ElemType constantValue = 0)
: Base(deviceId, name), m_head(head), m_foot(foot), m_mode(mode), m_constant_value(constantValue)
PaddingNode(DEVICEID_TYPE deviceId, const wstring& name, std::vector<size_t> head, std::vector<size_t> foot, PaddingType mode = PaddingType::CONSTANTPAD, double constantValue = 0)
: Base(deviceId, name), m_head(head), m_foot(foot), m_mode(mode), m_constant_value((ElemType)constantValue)
{
}


@ -312,8 +312,8 @@ class UnpackSequenceNode : public ComputationNodeNonLooping<ElemType>, public Mu
public:
DeclareConstructorFromConfig(UnpackSequenceNode);
UnpackSequenceNode(DEVICEID_TYPE deviceId, const wstring& name, ElemType paddingValue = 0, bool suppressMaskOutput = false)
: Base(deviceId, name), MultiOutputNode<ElemType>(suppressMaskOutput ? 1 : 2), m_paddingValue(paddingValue), m_suppressMaskOutput(suppressMaskOutput)
UnpackSequenceNode(DEVICEID_TYPE deviceId, const wstring& name, double paddingValue = 0, bool suppressMaskOutput = false)
: Base(deviceId, name), MultiOutputNode<ElemType>(suppressMaskOutput ? 1 : 2), m_paddingValue((ElemType)paddingValue), m_suppressMaskOutput(suppressMaskOutput)
{}
virtual void /*ComputationNodeNonLooping::*/ ForwardPropNonLooping() override


@ -144,5 +144,6 @@ template <class ElemType>
template class TraceNode<float>;
template class TraceNode<double>;
template class TraceNode<half>;
}}}


@ -47,6 +47,7 @@ template <class ElemType>
template class RandomDistributionNode<float>;
template class RandomDistributionNode<double>;
template class RandomDistributionNode<half>;
template<class ElemType>
void RandomSampleNodeBase<ElemType>::Validate(bool isFinalValidationPass)
@ -108,9 +109,9 @@ void RandomSampleNodeBase<ElemType>::UpdateWeightsPrefixSum()
{
ElemType currentWeight = samplingWeights.GetValue(iClass, 0);
if (currentWeight < 0)
InvalidArgument("Sampling weights contain negative number %f.", currentWeight);
InvalidArgument("Sampling weights contain negative number %f.", (float)currentWeight);
runningWeightsSum += currentWeight;
runningWeightsSum += (double)currentWeight;
m_samplingWeightsPrefixSum.push_back(runningWeightsSum);
}
}
@ -231,6 +232,7 @@ bool RandomSampleNode<ElemType>::IsOutOfDateWrtInputs() const
template class RandomSampleNode<float>;
template class RandomSampleNode<double>;
template class RandomSampleNode<half>;
template<class ElemType>
double RandomSampleInclusionFrequencyNode<ElemType>::EstimateNumberOfTries()
@ -303,6 +305,7 @@ void RandomSampleInclusionFrequencyNode<ElemType>::Validate(bool isFinalValidati
template class RandomSampleInclusionFrequencyNode<float>;
template class RandomSampleInclusionFrequencyNode<double>;
template class RandomSampleInclusionFrequencyNode<half>;
template<class ElemType>
void DropoutNode<ElemType>::Save(File& fstream) const
@ -336,8 +339,10 @@ void BatchNormalizationNode<ElemType>::AttachInputs(const std::vector<Computatio
template class DropoutNode<float>;
template class DropoutNode<double>;
template class DropoutNode<half>;
template class BatchNormalizationNode<float>;
template class BatchNormalizationNode<double>;
template class BatchNormalizationNode<half>;
}}}


@ -525,7 +525,7 @@ public:
{
Url& UrlJ = *itUrlJ;
discountJ = m_logWeights[UrlJ.m_rank];
if (abs(gainI - UrlJ.m_gain) < 0.0000001)
if (abs(gainI - UrlJ.m_gain) < (ElemType)0.0000001)
{
continue;
}
@ -534,7 +534,7 @@ public:
lambdaIJ = (gainI - UrlJ.m_gain) * (discountI - discountJ) / (discountI * discountJ);
// |delta NDCG|
lambdaIJ = (idealMetric == 0.0 ? (ElemType) 0.0 : abs(lambdaIJ / idealMetric));
lambdaIJ = (idealMetric == (ElemType)0.0 ? (ElemType) 0.0 : (ElemType)abs(lambdaIJ / idealMetric));
// Combine lambda
lambdaIJ = lambdas(0, pairsCount++) * lambdaIJ;
@ -2394,6 +2394,8 @@ class BatchNormalizationNode : public ComputationNodeNonLooping<ElemType>, publi
typedef ComputationNodeNonLooping<ElemType> Base; UsingComputationNodeMembersBoilerplate;
static const std::wstring TypeName() { return L"BatchNormalization"; }
typedef typename std::conditional<std::is_same<ElemType, half>::value, float, ElemType>::type StatType;
// inputs
// TODO: Change all of these throughout the codebase to 'class enum'. Also change all places where we still use integer constants.
static const size_t DATA = 0;
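
StatType resolves to float whenever ElemType is half, so scale, bias and the running statistics of batch normalization stay in 32-bit even inside an FP16 network; for float and double it is simply ElemType. A compile-time check of that mapping, using a stand-in for the half type:

    #include <type_traits>

    struct half16 {}; // stand-in for the 16-bit float element type

    template <typename ElemType>
    using StatTypeOf = typename std::conditional<std::is_same<ElemType, half16>::value, float, ElemType>::type;

    static_assert(std::is_same<StatTypeOf<half16>, float>::value,  "half data keeps float statistics");
    static_assert(std::is_same<StatTypeOf<float>,  float>::value,  "float data keeps float statistics");
    static_assert(std::is_same<StatTypeOf<double>, double>::value, "double data keeps double statistics");

    int main() { return 0; }
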
@ -2412,7 +2414,7 @@ public:
m_one(1, 1, deviceId),
m_convertRunningVariancePending(false)
{
m_one.SetValue((ElemType)1); // (constant value used for GPU-side update of runCount)
m_one.SetValue((StatType)1); // (constant value used for GPU-side update of runCount)
}
BatchNormalizationNode(const ScriptableObjects::IConfigRecordPtr configp) :
BatchNormalizationNode(configp->Get(L"deviceId"), L"<placeholder>", configp->Get(L"spatial"),
@ -2564,14 +2566,14 @@ private: // time-constant conversions
void ResetRunCount()
{
if (HasTiedRunCount())
Input(RUN_COUNT)->Value().SetValue(0);
this->template TypedInput<StatType>(RUN_COUNT)->Value().SetValue(0);
m_runCountUntied = 0;
}
void AggregateRunCount(size_t countToAdd)
{
if (HasTiedRunCount())
{
Input(RUN_COUNT)->Value().AddWithScaleOf(/*alpha=*/(ElemType)countToAdd, m_one); // this += countToAdd * (1)
this->template TypedInput<StatType>(RUN_COUNT)->Value().AddWithScaleOf(/*alpha=*/(StatType)countToAdd, m_one); // this += countToAdd * (1)
if (countToAdd != 0)
m_runCountUntied = SIZE_MAX; // we only need this for 0 checks, this value says we only know it's not 0
}
@ -2581,7 +2583,7 @@ private: // time-constant conversions
size_t RunCount() const // const version of above; keep identical
{
if (HasTiedRunCount())
m_runCountUntied = (size_t)Input(RUN_COUNT)->Value().Get00Element(); // if needed then cache it over
m_runCountUntied = (size_t)this->template TypedInput<StatType>(RUN_COUNT)->Value().Get00Element(); // if needed then cache it over
return m_runCountUntied;
}
bool IsRunCount0() const { return m_runCountUntied == 0 && RunCount() == 0; } // tied count >= untied one, so we can ask the untied one first to avoid GPU sync
@ -2664,10 +2666,10 @@ public:
FrameRange fr(Input(DATA)->GetMBLayout());
Matrix<ElemType> sliceInputValue = Input(DATA)->MaskedValueFor(fr);
const Matrix<ElemType>& scale = Input(SCALE)->Value();
const Matrix<ElemType>& bias = Input(BIAS)->Value();
Matrix<ElemType>& runMean = Input(RUN_MEAN)->Value();
Matrix<ElemType>& runVariance = Input(RUN_VAR)->Value();
const Matrix<StatType>& scale = this->template TypedInput<StatType>(SCALE)->Value();
const Matrix<StatType>& bias = this->template TypedInput<StatType>(BIAS)->Value();
Matrix<StatType>& runMean = this->template TypedInput<StatType>(RUN_MEAN)->Value();
Matrix<StatType>& runVariance = this->template TypedInput<StatType>(RUN_VAR)->Value();
Matrix<ElemType> sliceOutputValue = ValueFor(fr);
assert(scale.GetNumRows() == bias.GetNumRows());
@ -2718,8 +2720,8 @@ public:
{
auto sliceOutputGrad = MaskedGradientFor(fr);
auto sliceInputValue = Input(DATA)->ValueFor(fr);
const Matrix<ElemType>& scale = Input(SCALE)->Value();
const Matrix<ElemType>& bias = Input(BIAS)->Value();
const Matrix<StatType>& scale = this->template TypedInput<StatType>(SCALE)->Value();
const Matrix<StatType>& bias = this->template TypedInput<StatType>(BIAS)->Value();
// If inputIndex is not DATA and we get here, then it means that DATA receives no gradient.
// However, the underlying engine does not foresee this case, and thus always needs a place
@ -2752,19 +2754,19 @@ public:
{
assert(m_gradientValid);
if (Input(SCALE)->IsGradientInitializedBy(this))
Input(SCALE)->Gradient().AssignValuesOf(*m_dScale);
if (this->template TypedInput<StatType>(SCALE)->IsGradientInitializedBy(this))
this->template TypedInput<StatType>(SCALE)->Gradient().AssignValuesOf(*m_dScale);
else
Input(SCALE)->Gradient() += *m_dScale;
this->template TypedInput<StatType>(SCALE)->Gradient() += *m_dScale;
}
else if (inputIndex == BIAS) // derivative with respect to the bias, precomputed during input derivative computation
{
assert(m_gradientValid);
if (Input(BIAS)->IsGradientInitializedBy(this))
Input(BIAS)->Gradient().AssignValuesOf(*m_dBias);
if (this->template TypedInput<StatType>(BIAS)->IsGradientInitializedBy(this))
this->template TypedInput<StatType>(BIAS)->Gradient().AssignValuesOf(*m_dBias);
else
Input(BIAS)->Gradient() += *m_dBias;
this->template TypedInput<StatType>(BIAS)->Gradient() += *m_dBias;
}
// No derivatives with respect to running mean and variance.
}
@ -2797,7 +2799,7 @@ public:
// running statistics inputs must be learnable parameters, since we update them directly here
for (size_t i = RUN_MEAN; i < GetNumInputs(); i++)
//if (!Input(i)->Is<LearnableParameter<ElemType>>()) // somehow this does not compile on gcc (works on VS)
if (!dynamic_cast<LearnableParameter<ElemType>*>(Input(i).get()))
if (!dynamic_cast<LearnableParameter<StatType>*>(this->template TypedInput<StatType>(i).get()))
InvalidArgument("%ls: Inputs [%d..%d] must be learnable parameters.", NodeDescription().c_str(), (int)RUN_MEAN, (int)GetNumInputs());
// infer dimensions of learnable parameters
@ -2807,7 +2809,7 @@ public:
#if 1 // Workaround for today's definition: Trigger on [0 x 1] and infer that 0 as the total # elements needed.
for (size_t i = SCALE; i < RUN_COUNT; i++) // scale, bias, run_mean, and run_variance
{
auto paramLayout = Input(i)->GetSampleLayout();
auto paramLayout = this->template TypedInput<StatType>(i)->GetSampleLayout();
if (paramLayout.GetRank() == 2 && paramLayout[0] == 0 && paramLayout[1] == 1 && inputLayout.GetNumElements() > 0) // [0 x 1]
{
size_t total = m_spatial ? inputLayout.GetDims().back() : inputLayout.GetNumElements();
@ -2841,10 +2843,11 @@ public:
// check inputs
for (size_t i = SCALE; i < RUN_COUNT; i++) // scale, bias, run_mean, and run_variance
{
if (Input(i)->HasMBLayout())
auto inputPtr = this->template TypedInput<StatType>(i);
if (inputPtr->HasMBLayout())
InvalidArgument("%ls: Input[%d] has a dynamic axis. BatchNormalization parameters cannot have that.", NodeDescription().c_str(), (int)i);
auto paramLayout = Input(i)->GetSampleLayout();
if (paramLayout != Input(SCALE)->GetSampleLayout())
auto paramLayout = inputPtr->GetSampleLayout();
if (paramLayout != this->template TypedInput<StatType>(SCALE)->GetSampleLayout())
InvalidArgument("%ls: Input[%d] has a layout different from Input[1]. All must be identical.", NodeDescription().c_str(), (int)i);
#if 0 // BUGBUG: For this to work, parameter shapes must be correct (cf. comment above on inference).
if (paramLayout.GetRank() > inputLayout.GetRank())
@ -2857,9 +2860,9 @@ public:
if (HasTiedRunCount()) // 0-th order statistics (count) (optional for backcompat with old code which didn't correctly share it)
{
// This must always be a [1] tensor. No inference allowed.
size_t i = RUN_COUNT;
if (Input(i)->HasMBLayout() || (Input(i)->GetSampleLayout().GetRank() > 1) || (Input(i)->GetSampleLayout().GetNumElements() != 1))
InvalidArgument("%ls: Input[%d] must be a vector of 1 element without dynamic axis.", NodeDescription().c_str(), (int)i);
auto inputPtr = this->template TypedInput<StatType>(RUN_COUNT);
if (inputPtr->HasMBLayout() || (inputPtr->GetSampleLayout().GetRank() > 1) || (inputPtr->GetSampleLayout().GetNumElements() != 1))
InvalidArgument("%ls: Input[RUN_COUNT] must be a vector of 1 element without dynamic axis.", NodeDescription().c_str());
RunCount(); // cache the shared value into the local cache, for 0 checks
}
if (m_spatial && m_imageLayoutKind != CHW)
@ -2902,7 +2905,7 @@ public:
if (m_bnEng == nullptr)
{
auto shape = GetSampleLayout();
m_bnEng = BatchNormEngine<ElemType>::Create(m_deviceId, shape, m_spatial, m_imageLayoutKind,
m_bnEng = BatchNormEngine<ElemType, StatType>::Create(m_deviceId, shape, m_spatial, m_imageLayoutKind,
m_useCntkEngine ? BatchNormEngineKind::Cntk : BatchNormEngineKind::CuDnn);
}
@ -2916,26 +2919,26 @@ public:
void RequestMatricesBeforeForwardProp(MatrixPool& matrixPool) override
{
Base::RequestMatricesBeforeForwardProp(matrixPool);
RequestMatrixFromPool(m_savedMean, matrixPool);
RequestMatrixFromPool(m_savedInvStdDev, matrixPool);
this->template TypedRequestMatrixFromPool<StatType>(m_savedMean, matrixPool);
this->template TypedRequestMatrixFromPool<StatType>(m_savedInvStdDev, matrixPool);
}
void RequestMatricesBeforeBackprop(MatrixPool& matrixPool) override
{
Base::RequestMatricesBeforeBackprop(matrixPool);
RequestMatrixFromPool(m_dDataDummy, matrixPool);
RequestMatrixFromPool(m_dScale, matrixPool);
RequestMatrixFromPool(m_dBias, matrixPool);
this->template TypedRequestMatrixFromPool<StatType>(m_dScale, matrixPool);
this->template TypedRequestMatrixFromPool<StatType>(m_dBias, matrixPool);
}
void ReleaseMatricesAfterBackprop(MatrixPool& matrixPool) override
{
Base::ReleaseMatricesAfterBackprop(matrixPool);
ReleaseMatrixToPool(m_savedMean, matrixPool);
ReleaseMatrixToPool(m_savedInvStdDev, matrixPool);
this->template TypedReleaseMatrixToPool<StatType>(m_savedMean, matrixPool);
this->template TypedReleaseMatrixToPool<StatType>(m_savedInvStdDev, matrixPool);
ReleaseMatrixToPool(m_dDataDummy, matrixPool);
ReleaseMatrixToPool(m_dScale, matrixPool);
ReleaseMatrixToPool(m_dBias, matrixPool);
this->template TypedReleaseMatrixToPool<StatType>(m_dScale, matrixPool);
this->template TypedReleaseMatrixToPool<StatType>(m_dBias, matrixPool);
}
void SetNormalizationTimeConstants(double normalizationTimeConstant, double prevNormalizationTimeConstant,
@ -2970,8 +2973,8 @@ public:
// Turn off the L1 and L2 regularization
void DisableRegInBatchNormalization()
{
let scaleNode = dynamic_pointer_cast<LearnableParameter<ElemType>>(Input(SCALE));
let biasNode = dynamic_pointer_cast<LearnableParameter<ElemType>>(Input(BIAS));
let scaleNode = dynamic_pointer_cast<LearnableParameter<StatType>>(this->template TypedInput<StatType>(SCALE));
let biasNode = dynamic_pointer_cast<LearnableParameter<StatType>>(this->template TypedInput<StatType>(BIAS));
scaleNode->SetRegMultiplier(0.f);
biasNode->SetRegMultiplier(0.f);
}
@ -3046,20 +3049,20 @@ private:
// This value is not updated unless needed, so it may be out of date during most operations.
// It will be updated at start (Validate()), when saving models, and any time the true value is needed.
mutable size_t m_runCountUntied; // cached running sample count (mutable since it is a cache)
Matrix<ElemType> m_one; // constant [1x1] matrix that contains a 1 (used for updating the shared count)
Matrix<StatType> m_one; // constant [1x1] matrix that contains a 1 (used for updating the shared count)
// Interpolated actual mean/inverse stddev values. Pre-computed on forward pass, also used in gradient computation.
shared_ptr<Matrix<ElemType>> m_savedMean;
shared_ptr<Matrix<ElemType>> m_savedInvStdDev;
shared_ptr<Matrix<StatType>> m_savedMean;
shared_ptr<Matrix<StatType>> m_savedInvStdDev;
// Temp buffer for scale and bias derivatives. Only used in BackpropTo(), carrying info from first call to subsequent calls.
// Not used for blendFactor=1 in CNTK engine.
shared_ptr<Matrix<ElemType>> m_dDataDummy;
shared_ptr<Matrix<ElemType>> m_dScale;
shared_ptr<Matrix<ElemType>> m_dBias;
shared_ptr<Matrix<StatType>> m_dScale;
shared_ptr<Matrix<StatType>> m_dBias;
bool m_gradientValid = false;
std::unique_ptr<BatchNormEngine<ElemType>> m_bnEng;
std::unique_ptr<BatchNormEngine<ElemType, StatType>> m_bnEng;
bool m_convertRunningVariancePending;
};
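The net effect of the hunks above is that BatchNormalizationNode now carries two element types: the data path keeps the node's ElemType (which may be half), while scale, bias and the running statistics are read and written through TypedInput<StatType> (float when training in FP16). A minimal standalone sketch of that split, assuming only some 16-bit float type convertible to and from float (none of the CNTK classes are used):

    // Hedged sketch, not CNTK code: normalize half-precision activations with
    // float statistics, mirroring the ElemType/StatType split above.
    #include <cstddef>

    template <class Half>
    void ApplyBatchNorm(const Half* in, Half* out, size_t rows, size_t cols,
                        const float* scale, const float* bias,
                        const float* mean, const float* invStdDev)
    {
        for (size_t c = 0; c < cols; c++)          // one column per sample
            for (size_t r = 0; r < rows; r++)      // one row per channel element
            {
                float x = (float)in[c * rows + r]; // widen to float for the math
                float y = scale[r] * (x - mean[r]) * invStdDev[r] + bias[r];
                out[c * rows + r] = (Half)y;       // narrow only when storing
            }
    }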

View file

@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
@ -33,13 +33,11 @@
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>

View file

@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
@ -33,14 +33,12 @@
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<CLRSupport>true</CLRSupport>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<CLRSupport>true</CLRSupport>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
@ -64,7 +62,7 @@
<Link>
<AdditionalLibraryDirectories>$(OutDir)</AdditionalLibraryDirectories>
<AdditionalDependencies>Cntk.Eval-$(CntkComponentVersion).lib;Cntk.Math-$(CntkComponentVersion).lib;Cntk.Common-$(CntkComponentVersion).lib;$(MSMPI_LIB64)msmpi.lib;kernel32.lib;user32.lib;gdi32.lib;winspool.lib;comdlg32.lib;advapi32.lib;shell32.lib;ole32.lib;oleaut32.lib;uuid.lib;odbc32.lib;odbccp32.lib;%(AdditionalDependencies)</AdditionalDependencies>
<DelayLoadDLLs>Cntk.Eval-$(CntkComponentVersion).dll;Cntk.Math-$(CntkComponentVersion).dll</DelayLoadDLLs>
<DelayLoadDLLs>Cntk.Eval-$(CntkComponentVersion).dll;Cntk.Math-$(CntkComponentVersion).dll;msmpi.dll</DelayLoadDLLs>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(DebugBuild)">

View file

@ -1,5 +1,5 @@
<?xml version="1.0" encoding="utf-8"?>
<Project DefaultTargets="Build" ToolsVersion="14.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<Project DefaultTargets="Build" ToolsVersion="15.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup Label="ProjectConfigurations">
<ProjectConfiguration Include="Debug|x64">
<Configuration>Debug</Configuration>
@ -33,13 +33,11 @@
<PropertyGroup Condition="$(DebugBuild)" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType>
<UseDebugLibraries>true</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
<PropertyGroup Condition="$(ReleaseBuild)" Label="Configuration">
<ConfigurationType>DynamicLibrary</ConfigurationType>
<UseDebugLibraries>false</UseDebugLibraries>
<PlatformToolset>v140</PlatformToolset>
<WholeProgramOptimization>true</WholeProgramOptimization>
<CharacterSet>Unicode</CharacterSet>
</PropertyGroup>
@ -64,7 +62,7 @@
</ClCompile>
<Link>
<AdditionalLibraryDirectories>$(SolutionDir)$(Platform)\$(Configuration);$(OpenCvLibPath)</AdditionalLibraryDirectories>
<AdditionalDependencies>Cntk.Common-$(CntkComponentVersion).lib;$(OpenCvLib);%(AdditionalDependencies)</AdditionalDependencies>
<AdditionalDependencies>$(OpenCvLib);%(AdditionalDependencies)</AdditionalDependencies>
</Link>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="$(DebugBuild)">

View file

@ -2,9 +2,7 @@
<Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
<ItemGroup>
<ClCompile Include="ImageWriter.cpp" />
<ClCompile Include="dllmain.cpp">
<Filter>Misc</Filter>
</ClCompile>
<ClCompile Include="dllmain.cpp" />
</ItemGroup>
<ItemGroup>
<ClInclude Include="ImageWriter.h" />

View file

@ -6,13 +6,13 @@
#include "stdafx.h"
#include "BatchNormalizationEngine.h"
#include "CuDnnFactories.h"
#include "Mkl2017DnnCommon.h"
#include "MklDnnCommon.h"
namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
void BatchNormEngine<ElemType>::Forward(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
Mat& out, double epsilon, Mat& savedMean, Mat& savedInvStdDev)
template <class InoutType, class StatType>
void BatchNormEngine<InoutType, StatType>::Forward(const InoutMat& in, const StatMat& scale, const StatMat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, StatMat& runMean, StatMat& runVariance,
InoutMat& out, double epsilon, StatMat& savedMean, StatMat& savedInvStdDev)
{
assert(in.GetNumRows() == m_inOutT.GetNumElements());
assert(out.GetNumRows() == m_inOutT.GetNumElements());
@ -63,9 +63,9 @@ void BatchNormEngine<ElemType>::Forward(const Mat& in, const Mat& scale, const M
}
}
template <class ElemType>
void BatchNormEngine<ElemType>::Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor,
const Mat& savedMean, const Mat& savedInvStdDev, Mat& scaleGrad, Mat& biasGrad, bool accumulateDataGrad)
template <class InoutType, class StatType>
void BatchNormEngine<InoutType, StatType>::Backward(const InoutMat& in, const InoutMat& srcGrad, InoutMat& grad, const StatMat& scale, double blendFactor,
const StatMat& savedMean, const StatMat& savedInvStdDev, StatMat& scaleGrad, StatMat& biasGrad, bool accumulateDataGrad)
{
assert(!savedMean.IsEmpty());
assert(!savedInvStdDev.IsEmpty());
@ -73,12 +73,13 @@ void BatchNormEngine<ElemType>::Backward(const Mat& in, const Mat& srcGrad, Mat&
BackwardCore(in, srcGrad, grad, scale, blendFactor, savedMean, savedInvStdDev, scaleGrad, biasGrad, accumulateDataGrad);
}
template <class ElemType>
class CntkBatchNormEngine : public BatchNormEngine<ElemType>
template <class InoutType, class StatType>
class CntkBatchNormEngine : public BatchNormEngine<InoutType, StatType>
{
public:
using Base = BatchNormEngine<ElemType>;
using typename Base::Mat;
using Base = BatchNormEngine<InoutType, StatType>;
using typename Base::InoutMat;
using typename Base::StatMat;
public:
CntkBatchNormEngine(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
@ -99,28 +100,30 @@ protected:
InvalidArgument("CNTK batch normalization supports only cudnn(CHW) layout.");
}
void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
Mat& out, double epsilon, Mat& savedMean, Mat& savedInvStdDev) override
void ForwardCore(const InoutMat& in, const StatMat& scale, const StatMat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, StatMat& runMean, StatMat& runVariance,
InoutMat& out, double epsilon, StatMat& savedMean, StatMat& savedInvStdDev) override
{
#ifdef USE_MKL2017DNN
if (in.GetCurrentMatrixLocation() == CPU &&
ForwardCoreMKL(in, scale, bias, inferenceOnly, expAvgFactor, runMean, runVariance, out, epsilon, savedMean, savedInvStdDev))
std::is_same<InoutType, StatType>::value &&
ForwardCoreMKL(*(const StatMat*)&in, scale, bias, inferenceOnly, expAvgFactor, runMean, runVariance, *(StatMat*)&out, epsilon, savedMean, savedInvStdDev))
return;
#endif
in.BatchNormalizationForward(scale, bias, inferenceOnly, expAvgFactor, blendFactor, runMean, runVariance, out, epsilon, savedMean, savedInvStdDev);
}
void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& savedMean, const Mat& savedInvStdDev,
Mat& scaleGrad, Mat& biasGrad, bool accumulateDataGrad) override
void BackwardCore(const InoutMat& in, const InoutMat& srcGrad, InoutMat& grad, const StatMat& scale, double blendFactor, const StatMat& savedMean, const StatMat& savedInvStdDev,
StatMat& scaleGrad, StatMat& biasGrad, bool accumulateDataGrad) override
{
#ifdef USE_MKL2017DNN
if (srcGrad.GetCurrentMatrixLocation() == CPU &&
BackwardCoreMKL(in, srcGrad, grad, scale, savedMean, savedInvStdDev, scaleGrad, biasGrad, accumulateDataGrad))
std::is_same<InoutType, StatType>::value &&
BackwardCoreMKL(*(const StatMat*)&in, *(const StatMat*)&srcGrad, *(StatMat*)&grad, scale, savedMean, savedInvStdDev, scaleGrad, biasGrad, accumulateDataGrad))
return;
#endif
if (!accumulateDataGrad)
grad.SetValue((ElemType)0);
grad.SetValue((InoutType)0);
srcGrad.BatchNormalizationBackward(in, grad, scale, blendFactor, savedMean, savedInvStdDev, scaleGrad, biasGrad);
}
@ -147,7 +150,7 @@ private:
struct MKLScaleShiftAdapter
{
bool isInput;
std::shared_ptr<Matrix<ElemType>> mat;
std::shared_ptr<Matrix<StatType>> mat;
dnnResourceType_t resourceType;
size_t numChannels;
@ -155,19 +158,19 @@ private:
{
Clear();
numChannels = n;
mat = std::make_shared<Matrix<ElemType>>(numChannels, 2, CPUDEVICE);
mat = std::make_shared<Matrix<StatType>>(numChannels, 2, CPUDEVICE);
isInput = userToPrim;
resourceType = rt;
}
void PrepareForExecution(void* scale, void* bias, void* resources[dnnResourceNumber])
{
ElemType* buffer = mat->Data();
StatType* buffer = mat->Data();
resources[resourceType] = buffer;
if (isInput)
{
memcpy(buffer, scale, sizeof(ElemType) * numChannels);
memcpy(buffer + numChannels, bias, sizeof(ElemType) * numChannels);
memcpy(buffer, scale, sizeof(StatType) * numChannels);
memcpy(buffer + numChannels, bias, sizeof(StatType) * numChannels);
}
}
@ -176,9 +179,9 @@ private:
if (isInput)
RuntimeError("Cannot execute output ResourceAdapter for input");
ElemType* buffer = mat->Data();
memcpy(scale, buffer, sizeof(ElemType) * numChannels);
memcpy(bias, buffer + numChannels, sizeof(ElemType) * numChannels);
StatType* buffer = mat->Data();
memcpy(scale, buffer, sizeof(StatType) * numChannels);
memcpy(bias, buffer + numChannels, sizeof(StatType) * numChannels);
}
void Clear()
@ -194,21 +197,21 @@ private:
struct PrimitiveContext
{
MKLDnnResourceAdapter<ElemType> input;
MKLDnnResourceAdapter<ElemType> output;
MKLDnnResourceAdapter<StatType> input;
MKLDnnResourceAdapter<StatType> output;
MKLScaleShiftAdapter scaleShift;
std::shared_ptr<Mat> varianceMat; // variance matrix used for converting InvStdDev
std::shared_ptr<StatMat> varianceMat; // variance matrix used for converting InvStdDev
dnnPrimitive_t primitive = nullptr;
dnnPrimitiveAttributes_t attributes = nullptr;
void Clear()
{
if (primitive) { dnnDelete<ElemType>(primitive); primitive = nullptr; }
if (primitive) { dnnDelete<StatType>(primitive); primitive = nullptr; }
input.Clear();
scaleShift.Clear();
output.Clear();
if (attributes) { dnnPrimitiveAttributesDestroy<ElemType>(attributes); attributes = nullptr; }
if (attributes) { dnnPrimitiveAttributesDestroy<StatType>(attributes); attributes = nullptr; }
}
~PrimitiveContext()
@ -219,7 +222,7 @@ private:
TensorShape m_shape;
size_t m_numSamples;
ElemType m_epsilon;
StatType m_epsilon;
public:
MKLBatchNormalizationContext() :
@ -233,12 +236,12 @@ private:
return !!(m_contextFlags & (1 << contextIndex));
}
void Prepare(const TensorShape& shape, bool spatial, size_t numSamples, ContextIndex contextIndex, ElemType epsilon = 0)
void Prepare(const TensorShape& shape, bool spatial, size_t numSamples, ContextIndex contextIndex, StatType epsilon = 0)
{
int flag = (1 << contextIndex);
if (contextIndex == ContextIndex_Backward)
{
epsilon = HasPreparedFor(ContextIndex_ForwardTrain) ? m_epsilon : (ElemType)DEFAULT_EPSILON;
epsilon = HasPreparedFor(ContextIndex_ForwardTrain) ? m_epsilon : (StatType)DEFAULT_EPSILON;
}
bool same = (shape == m_shape) && (numSamples == m_numSamples) && (epsilon == m_epsilon);
@ -285,10 +288,10 @@ private:
{
case ContextIndex_ForwardInfer:
case ContextIndex_ForwardTrain:
CHECK_MKL(dnnLayoutCreate<ElemType>(&ltUserInput, inoutDim, inoutSizes, inoutStrides));
CHECK_MKL(dnnLayoutCreate<ElemType>(&ltUserOutput, inoutDim, inoutSizes, inoutStrides));
CHECK_MKL(dnnPrimitiveAttributesCreate<ElemType>(&ctx.attributes));
CHECK_MKL(dnnBatchNormalizationCreateForward_v2<ElemType>(
CHECK_MKL(dnnLayoutCreate<StatType>(&ltUserInput, inoutDim, inoutSizes, inoutStrides));
CHECK_MKL(dnnLayoutCreate<StatType>(&ltUserOutput, inoutDim, inoutSizes, inoutStrides));
CHECK_MKL(dnnPrimitiveAttributesCreate<StatType>(&ctx.attributes));
CHECK_MKL(dnnBatchNormalizationCreateForward_v2<StatType>(
&ctx.primitive,
ctx.attributes,
ltUserInput,
@ -299,10 +302,10 @@ private:
scaleShiftType = dnnResourceScaleShift;
break;
case ContextIndex_Backward:
CHECK_MKL(dnnLayoutCreate<ElemType>(&ltUserInput, inoutDim, inoutSizes, inoutStrides));
CHECK_MKL(dnnLayoutCreate<ElemType>(&ltUserOutput, inoutDim, inoutSizes, inoutStrides));
CHECK_MKL(dnnPrimitiveAttributesCreate<ElemType>(&ctx.attributes));
CHECK_MKL(dnnBatchNormalizationCreateBackward_v2<ElemType>(
CHECK_MKL(dnnLayoutCreate<StatType>(&ltUserInput, inoutDim, inoutSizes, inoutStrides));
CHECK_MKL(dnnLayoutCreate<StatType>(&ltUserOutput, inoutDim, inoutSizes, inoutStrides));
CHECK_MKL(dnnPrimitiveAttributesCreate<StatType>(&ctx.attributes));
CHECK_MKL(dnnBatchNormalizationCreateBackward_v2<StatType>(
&ctx.primitive,
ctx.attributes,
ltUserInput,
@ -311,16 +314,16 @@ private:
inputType = dnnResourceDiffDst;
outputType = dnnResourceDiffSrc;
scaleShiftType = dnnResourceDiffScaleShift;
ctx.varianceMat = std::make_shared<Mat>(numChannels, 1, CPUDEVICE);
ctx.varianceMat = std::make_shared<StatMat>(numChannels, 1, CPUDEVICE);
break;
default:
RuntimeError("Unexpected context type %d", (int)contextIndex);
}
CHECK_MKL(dnnLayoutCreateFromPrimitive<ElemType>(&ltPrimInput, ctx.primitive, inputType));
CHECK_MKL(dnnLayoutCreateFromPrimitive<StatType>(&ltPrimInput, ctx.primitive, inputType));
ctx.input.Create(ltUserInput, ltPrimInput, inputType, true);
CHECK_MKL(dnnLayoutCreateFromPrimitive<ElemType>(&ltPrimOutput, ctx.primitive, outputType));
CHECK_MKL(dnnLayoutCreateFromPrimitive<StatType>(&ltPrimOutput, ctx.primitive, outputType));
ctx.output.Create(ltUserOutput, ltPrimOutput, outputType, false);
ctx.scaleShift.Create(scaleShiftType, contextIndex != ContextIndex_Backward, numChannels);
@ -338,7 +341,7 @@ private:
resources[dnnResourceMean] = runMean;
resources[dnnResourceVariance] = runVariance;
CHECK_MKL(dnnExecute<ElemType>(ctx.primitive, resources));
CHECK_MKL(dnnExecute<StatType>(ctx.primitive, resources));
ctx.output.ConvertOutput(output);
}
@ -352,16 +355,16 @@ private:
ctx.output.PrepareForExecution(grad, resources);
ctx.scaleShift.PrepareForExecution(scaleGrad, biasGrad, resources);
std::shared_ptr<Mat> scaleShiftMat;
scaleShiftMat = std::make_shared<Mat>(ctx.scaleShift.numChannels, 2, CPUDEVICE);
memcpy(scaleShiftMat->Data(), scale, ctx.scaleShift.numChannels * sizeof(ElemType));
std::shared_ptr<StatMat> scaleShiftMat;
scaleShiftMat = std::make_shared<StatMat>(ctx.scaleShift.numChannels, 2, CPUDEVICE);
memcpy(scaleShiftMat->Data(), scale, ctx.scaleShift.numChannels * sizeof(StatType));
resources[dnnResourceScaleShift] = scaleShiftMat->Data();
// convert from InvStdDev to variance
for (size_t i = 0; i < ctx.scaleShift.numChannels; i++)
{
ElemType& v = ctx.varianceMat->Data()[i];
ElemType& s = ((ElemType*)savedInvStdDev)[i];
StatType& v = ctx.varianceMat->Data()[i];
StatType& s = ((StatType*)savedInvStdDev)[i];
v = (1 / (s * s) - m_epsilon);
}
@ -369,7 +372,7 @@ private:
resources[dnnResourceMean] = savedMean;
resources[dnnResourceVariance] = ctx.varianceMat->Data();
CHECK_MKL(dnnExecute<ElemType>(ctx.primitive, resources));
CHECK_MKL(dnnExecute<StatType>(ctx.primitive, resources));
ctx.output.ConvertOutput(grad);
ctx.scaleShift.ConvertOutput(scaleGrad, biasGrad);
@ -377,15 +380,15 @@ private:
};
MKLBatchNormalizationContext m_mklContext;
std::shared_ptr<Mat> m_dataGradWorkspace;
std::shared_ptr<StatMat> m_dataGradWorkspace;
bool ForwardCoreMKL(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, Mat& runMean, Mat& runVariance,
Mat& out, double epsilon, Mat& savedMean, Mat& savedInvStdDev)
bool ForwardCoreMKL(const StatMat& in, const StatMat& scale, const StatMat& bias, bool inferenceOnly, double expAvgFactor, StatMat& runMean, StatMat& runVariance,
StatMat& out, double epsilon, StatMat& savedMean, StatMat& savedInvStdDev)
{
ContextIndex contextIndex = inferenceOnly ?
ContextIndex_ForwardInfer :
ContextIndex_ForwardTrain;
m_mklContext.Prepare(m_inOutT, m_spatial, in.GetNumCols(), contextIndex, (ElemType)epsilon);
m_mklContext.Prepare(m_inOutT, m_spatial, in.GetNumCols(), contextIndex, (StatType)epsilon);
if (inferenceOnly)
{
@ -398,33 +401,33 @@ private:
m_mklContext.Forward(in.Data(), out.Data(), scale.Data(), bias.Data(), savedMean.Data(), savedInvStdDev.Data(), contextIndex);
// update savedMean, savedInvStdDev
ElemType OneMinusExpAvgFactor = (ElemType)(1.0 - expAvgFactor);
cblas_axpby((MKL_INT)runMean.GetNumElements(), (ElemType)expAvgFactor, savedMean.Data(), OneMinusExpAvgFactor, runMean.Data());
StatType OneMinusExpAvgFactor = (StatType)(1.0 - expAvgFactor);
cblas_axpby((MKL_INT)runMean.GetNumElements(), (StatType)expAvgFactor, savedMean.Data(), OneMinusExpAvgFactor, runMean.Data());
// note: savedInvStdDev currently holds the variance of in.Data(); it needs to be converted to InvStdDev and interpolated
ElemType numReduced = (ElemType)(in.GetNumElements() / runVariance.GetNumElements());
ElemType bcf = numReduced / (numReduced - 1);
StatType numReduced = (StatType)(in.GetNumElements() / runVariance.GetNumElements());
StatType bcf = numReduced / (numReduced - 1);
for (size_t i = 0; i < runVariance.GetNumElements(); i++)
{
ElemType& v = runVariance.Data()[i];
ElemType& s = savedInvStdDev.Data()[i];
v = v * OneMinusExpAvgFactor + bcf * s * (ElemType)expAvgFactor;
s = (ElemType)1 / sqrt(s + (ElemType)epsilon);
StatType& v = runVariance.Data()[i];
StatType& s = savedInvStdDev.Data()[i];
v = v * OneMinusExpAvgFactor + bcf * s * (StatType)expAvgFactor;
s = (StatType)1 / sqrt(s + (StatType)epsilon);
}
}
return true;
}
bool BackwardCoreMKL(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale,
const Mat& savedMean, const Mat& savedInvStdDev, Mat& scaleGrad, Mat& biasGrad, bool accumulateDataGrad)
bool BackwardCoreMKL(const StatMat& in, const StatMat& srcGrad, StatMat& grad, const StatMat& scale,
const StatMat& savedMean, const StatMat& savedInvStdDev, StatMat& scaleGrad, StatMat& biasGrad, bool accumulateDataGrad)
{
m_mklContext.Prepare(m_inOutT, m_spatial, srcGrad.GetNumCols(), ContextIndex_Backward);
if (accumulateDataGrad)
{
if (!m_dataGradWorkspace)
m_dataGradWorkspace = std::make_shared<Matrix<ElemType>>(0, 0, CPUDEVICE);
m_dataGradWorkspace = std::make_shared<Matrix<StatType>>(0, 0, CPUDEVICE);
m_dataGradWorkspace->SetValue(grad);
}
@ -432,23 +435,24 @@ private:
m_mklContext.Backward(in.Data(), srcGrad.Data(), grad.Data(), scale.Data(), savedMean.Data(), savedInvStdDev.Data(), scaleGrad.Data(), biasGrad.Data());
if (accumulateDataGrad)
cblas_axpby((MKL_INT)grad.GetNumElements(), (ElemType)1.0, m_dataGradWorkspace->Data(), (ElemType)1.0, grad.Data());
cblas_axpby((MKL_INT)grad.GetNumElements(), (StatType)1.0, m_dataGradWorkspace->Data(), (StatType)1.0, grad.Data());
return true;
}
#endif
};
template class CntkBatchNormEngine<float>;
template class CntkBatchNormEngine<double>;
template class CntkBatchNormEngine<float, float>;
template class CntkBatchNormEngine<double, double>;
template class CntkBatchNormEngine<half, float>;
template <typename T> bool HasFlag(T src, T testFlag)
{
return ((int)src & (int)testFlag) != 0;
}
template <class ElemType>
std::unique_ptr<BatchNormEngine<ElemType>> BatchNormEngine<ElemType>::Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
template <class InoutType, class StatType>
std::unique_ptr<BatchNormEngine<InoutType, StatType>> BatchNormEngine<InoutType, StatType>::Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
bool spatial, ImageLayoutKind imageLayout,
BatchNormEngineKind enabledEngines)
{
@ -458,7 +462,7 @@ std::unique_ptr<BatchNormEngine<ElemType>> BatchNormEngine<ElemType>::Create(DEV
if (GetMathLibTraceLevel() > 0)
fprintf(stderr, "Using CNTK batch normalization engine.\n");
return std::make_unique<CntkBatchNormEngine<ElemType>>(deviceId, inOutT, spatial, imageLayout);
return std::make_unique<CntkBatchNormEngine<InoutType, StatType>>(deviceId, inOutT, spatial, imageLayout);
}
if (HasFlag(enabledEngines, BatchNormEngineKind::CuDnn))
@ -466,13 +470,14 @@ std::unique_ptr<BatchNormEngine<ElemType>> BatchNormEngine<ElemType>::Create(DEV
if (GetMathLibTraceLevel() > 0)
fprintf(stderr, "Using cuDNN batch normalization engine.\n");
return CuDnnBatchNormEngineFactory<ElemType>::Create(deviceId, inOutT, spatial, imageLayout);
return CuDnnBatchNormEngineFactory<InoutType, StatType>::Create(deviceId, inOutT, spatial, imageLayout);
}
RuntimeError("Could not find appropriate batch normalization engine.");
}
template class BatchNormEngine<float>;
template class BatchNormEngine<double>;
template class BatchNormEngine<float, float>;
template class BatchNormEngine<double, double>;
template class BatchNormEngine<half, float>;
}}}
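One detail of the MKL path above that is easy to miss: CNTK caches the saved statistics as inverse standard deviations, while MKL-DNN produces and consumes plain variances, so the engine converts in both directions (and applies the Bessel correction numReduced / (numReduced - 1) when folding the batch variance into the running variance). The two conversions in isolation, under the assumption that they mirror the loops in ForwardCoreMKL and the backward preparation:

    // Hedged sketch of the conversions used around the MKL calls above.
    #include <cmath>

    inline float InvStdDevFromVariance(float variance, float epsilon)
    {
        return 1.0f / std::sqrt(variance + epsilon);     // forward: store 1/sqrt(var + eps)
    }

    inline float VarianceFromInvStdDev(float invStdDev, float epsilon)
    {
        return 1.0f / (invStdDev * invStdDev) - epsilon; // backward: MKL needs the variance back
    }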

View file

@ -25,22 +25,23 @@ enum class BatchNormEngineKind
#pragma warning(push)
#pragma warning(disable : 4251)
template <class ElemType>
template <class InoutType, class StatType = InoutType>
class MATH_API BatchNormEngine
{
public:
using Mat = Matrix<ElemType>;
using InoutMat = Matrix<InoutType>;
using StatMat = Matrix<StatType>;
public:
virtual ~BatchNormEngine() {};
void Forward(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev);
void Forward(const InoutMat& in, const StatMat& scale, const StatMat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, StatMat& runMean, StatMat& runVariance,
InoutMat& out, double epsilon, StatMat& saveMean, StatMat& saveInvStdDev);
void Backward(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev,
Mat& scaleGrad, Mat& biasGrad, bool accumulateDataGrad);
void Backward(const InoutMat& in, const InoutMat& srcGrad, InoutMat& grad, const StatMat& scale, double blendFactor, const StatMat& saveMean, const StatMat& saveInvStdDev,
StatMat& scaleGrad, StatMat& biasGrad, bool accumulateDataGrad);
static std::unique_ptr<BatchNormEngine<ElemType>> Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
static std::unique_ptr<BatchNormEngine<InoutType, StatType>> Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
bool spatial, ImageLayoutKind imageLayout,
BatchNormEngineKind enabledEngines = BatchNormEngineKind::All);
@ -56,11 +57,11 @@ protected:
virtual void EnsureCompatible() = 0;
// saveMean/saveInvStdDev return the actual mean/stddev used for normalization, except for blendFactor=1, these are unused and untouched
virtual void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
Mat& out, double epsilon, Mat& saveMean, Mat& saveInvStdDev) = 0;
virtual void ForwardCore(const InoutMat& in, const StatMat& scale, const StatMat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, StatMat& runMean, StatMat& runVariance,
InoutMat& out, double epsilon, StatMat& saveMean, StatMat& saveInvStdDev) = 0;
virtual void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& saveMean, const Mat& saveInvStdDev,
Mat& scaleGrad, Mat& biasGrad, bool accumulateDataGrad) = 0;
virtual void BackwardCore(const InoutMat& in, const InoutMat& srcGrad, InoutMat& grad, const StatMat& scale, double blendFactor, const StatMat& saveMean, const StatMat& saveInvStdDev,
StatMat& scaleGrad, StatMat& biasGrad, bool accumulateDataGrad) = 0;
protected:
DEVICEID_TYPE m_deviceId;

View file

@ -14,6 +14,7 @@
#include <ctime>
#include <limits.h>
#include "QuantizedOperations.h"
#include "half.hpp"
//#include "GPUMatrix.h"
//#include "CPUSparseMatrix.h"
@ -108,7 +109,9 @@ public:
const bool needAveMultiplier,
const bool initialized);
void AdaDelta(CPUMatrix<ElemType>& gradients, CPUMatrix<ElemType>& functionValues, ElemType learningRate, ElemType rho, ElemType epsilon);
template<typename GradType>
void AdaDelta(CPUMatrix<GradType>& gradients, CPUMatrix<ElemType>& functionValues, ElemType learningRate, ElemType rho, ElemType epsilon);
void AdaDeltaFlushTimestamps(size_t cols, ElemType rho, int* timestamps, int currentTimestamp);
void Reshape(const size_t numRows, const size_t numCols);
@ -417,10 +420,13 @@ public:
void AveragePoolingBackward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices,
CPUMatrix<ElemType>& grad, const bool poolIncludePad, bool accumulateGradient) const;
void BatchNormalizationForward(const CPUMatrix<ElemType>& scale, const CPUMatrix<ElemType>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, CPUMatrix<ElemType>& runMean, CPUMatrix<ElemType>& runVariance,
CPUMatrix<ElemType>& out, double epsilon, CPUMatrix<ElemType>& saveMean, CPUMatrix<ElemType>& saveInvStdDev) const;
void BatchNormalizationBackward(const CPUMatrix<ElemType>& in, CPUMatrix<ElemType>& grad, const CPUMatrix<ElemType>& scale, double blendFactor, const CPUMatrix<ElemType>& saveMean, const CPUMatrix<ElemType>& saveInvStdDev,
CPUMatrix<ElemType>& scaleGrad, CPUMatrix<ElemType>& biasGrad) const;
template<class StatType>
void BatchNormalizationForward(const CPUMatrix<StatType>& scale, const CPUMatrix<StatType>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, CPUMatrix<StatType>& runMean, CPUMatrix<StatType>& runVariance,
CPUMatrix<ElemType>& out, double epsilon, CPUMatrix<StatType>& saveMean, CPUMatrix<StatType>& saveInvStdDev) const;
template<class StatType>
void BatchNormalizationBackward(const CPUMatrix<ElemType>& in, CPUMatrix<ElemType>& grad, const CPUMatrix<StatType>& scale, double blendFactor, const CPUMatrix<StatType>& saveMean, const CPUMatrix<StatType>& saveInvStdDev,
CPUMatrix<StatType>& scaleGrad, CPUMatrix<StatType>& biasGrad) const;
public:
// This functions do not depend on <ElemType>, i.e. you can call them on any <ElemType>
@ -577,5 +583,30 @@ private:
typedef CPUMatrix<float> CPUSingleMatrix;
typedef CPUMatrix<double> CPUDoubleMatrix;
typedef CPUMatrix<half> CPUHalfMatrix;
template<typename ElemType>
void CPUMatrixTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, CPUMatrix<ElemType>& o, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
template<typename ElemType>
void CPUMatrixTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& o, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides);
template<typename ElemType>
void CPUMatrixTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& c, CPUMatrix<ElemType>& o, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 4>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides);
template<typename ElemType>
void CPUMatrixTensorArgOpImpl(const CPUMatrix<ElemType>& a, CPUMatrix<ElemType>& o, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
}}}
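The templated AdaDelta declaration above is the CPU side of mixed-precision training: gradients may arrive as half while the parameters and both accumulators stay in the node's ElemType. A hedged sketch of that idea using the standard AdaDelta update rule; the exact member-function implementation may order the updates differently:

    // Standalone sketch, not CPUMatrix::AdaDelta itself.
    #include <cmath>
    #include <cstddef>

    template <typename GradType, typename ElemType>
    void AdaDeltaStep(const GradType* grad, ElemType* value,
                      ElemType* smoothedGrad2, ElemType* smoothedDelta2, size_t n,
                      ElemType learningRate, ElemType rho, ElemType epsilon)
    {
        for (size_t i = 0; i < n; i++)
        {
            ElemType g = (ElemType)grad[i];                        // widen the (possibly half) gradient
            smoothedGrad2[i] = rho * smoothedGrad2[i] + (1 - rho) * g * g;
            ElemType delta = -std::sqrt(smoothedDelta2[i] + epsilon)
                             / std::sqrt(smoothedGrad2[i] + epsilon) * g;
            smoothedDelta2[i] = rho * smoothedDelta2[i] + (1 - rho) * delta * delta;
            value[i] += learningRate * delta;
        }
    }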

View file

@ -0,0 +1,132 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#include "stdafx.h"
#include "CPUMatrixImpl.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// General conversion function with no performance optimization
// This should only be used for half precision on the CPU.
// For CPU inference performance, users should convert the fp16 model to fp32 first, unless MKL supports half precision.
template<typename SrcT, typename DstT>
static void ConvertBuffer(DstT* dst, const SrcT* src, size_t count)
{
for (size_t i = 0; i < count; i++)
{
dst[i] = (DstT)src[i];
}
}
// specialization to convert from half to float for computation, and then store in half
template <>
void CPUMatrix<half>::MultiplyAndWeightedAdd(half alpha, const CPUMatrix<half>& a, const bool transposeA, const CPUMatrix<half>& b, const bool transposeB,
half beta, CPUMatrix<half>& c, shared_ptr<QuantizedMultiplier<half>> pQuantizedMultiplier)
{
CPUMatrix<float> af(a.GetNumRows(), a.GetNumCols());
CPUMatrix<float> bf(b.GetNumRows(), b.GetNumCols());
CPUMatrix<float> cf(c.GetNumRows(), c.GetNumCols());
if (alpha != 0)
{
ConvertBuffer<half, float>(af.Data(), a.Data(), a.GetNumElements());
ConvertBuffer<half, float>(bf.Data(), b.Data(), b.GetNumElements());
}
if (beta != 0)
{
ConvertBuffer<half, float>(cf.Data(), c.Data(), c.GetNumElements());
}
if (pQuantizedMultiplier)
RuntimeError("Quantized matrix multiply not supported for Half");
CPUMatrix<float>::MultiplyAndWeightedAdd((float)alpha, af, transposeA, bf, transposeB, (float)beta, cf, nullptr);
ConvertBuffer<float, half>(c.Data(), cf.Data(), c.GetNumElements());
}
// specializations that raise RuntimeError for now, because the OpenMP implementation only supports built-in types
template <>
void CPUMatrix<half>::AssignSoftmaxSum(const CPUMatrix<half>& softmax, CPUMatrix<half>& c)
{
RuntimeError("half AssignSoftmaxSum not supported.");
}
template <>
void CPUMatrix<half>::AssignNCEUnnormalizedEval(const CPUMatrix<half>& a,
const CPUMatrix<half>& b, const CPUMatrix<half>& bias, CPUMatrix<half>& c)
{
RuntimeError("half AssignNCEUnnormalizedEval not supported.");
}
template <>
void CPUMatrix<half>::VectorSum(const CPUMatrix<half>& a, CPUMatrix<half>& c, const bool isColWise)
{
RuntimeError("half VectorSum not supported.");
}
template <>
void CPUMatrix<half>::VectorNorm1(CPUMatrix<half>& c, const bool isColWise) const
{
RuntimeError("half VectorNorm1 not supported.");
}
template <>
half CPUMatrix<half>::SumOfElements() const
{
RuntimeError("half SumOfElements not supported.");
}
template <>
half CPUMatrix<half>::MatrixNorm1() const
{
RuntimeError("half MatrixNorm1 not supported.");
}
template <>
half CPUMatrix<half>::FrobeniusNorm() const
{
RuntimeError("half FrobeniusNorm not supported.");
}
template <>
void CPUMatrix<half>::MaxPoolingBackward(const CPUMatrix<half>& out, const CPUMatrix<half>& in,
const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices,
CPUMatrix<half>& grad, bool accumulateGradient) const
{
RuntimeError("half MaxPoolingBackward not supported.");
}
template <>
void CPUMatrix<half>::MaxROIPoolingBackward(const size_t numRois, const size_t numImg, const size_t channels, const size_t width, const size_t height,
const size_t pooledWidth, const size_t pooledHeight, const CPUMatrix<half>& roiData, CPUMatrix<half>& grad,
CPUMatrix<half>& argmax, double spatialScale) const
{
RuntimeError("half MaxROIPoolingBackward not supported.");
}
template <>
void CPUMatrix<half>::AveragePoolingBackward(const CPUMatrix<int>& mpRowCol, const CPUMatrix<int>& mpRowIndices, const CPUMatrix<int>& indices, CPUMatrix<half>& grad, const bool poolIncludePad, bool accumulateGradient) const
{
RuntimeError("half AveragePoolingBackward not supported.");
}
// explicit instantiations; CPUMatrix is too big and would otherwise crash the VS2015 compiler.
template class MATH_API CPUMatrix<half>;
// instantiate templated methods
template void CPUMatrix<float>::AdaDelta(CPUMatrix<float>& gradients, CPUMatrix<float>& functionValues, float learningRate, float rho, float epsilon);
template void CPUMatrix<double>::AdaDelta(CPUMatrix<double>& gradients, CPUMatrix<double>& functionValues, double learningRate, double rho, double epsilon);
template void CPUMatrix<float>::AdaDelta(CPUMatrix<half>& gradients, CPUMatrix<float>& functionValues, float learningRate, float rho, float epsilon);
template void CPUMatrix<float>::BatchNormalizationForward(const CPUMatrix<float>& scale, const CPUMatrix<float>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, CPUMatrix<float>& runMean, CPUMatrix<float>& runVariance, CPUMatrix<float>& out, double epsilon, CPUMatrix<float>& saveMean, CPUMatrix<float>& saveInvStdDev) const;
template void CPUMatrix<double>::BatchNormalizationForward(const CPUMatrix<double>& scale, const CPUMatrix<double>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, CPUMatrix<double>& runMean, CPUMatrix<double>& runVariance, CPUMatrix<double>& out, double epsilon, CPUMatrix<double>& saveMean, CPUMatrix<double>& saveInvStdDev) const;
template void CPUMatrix<half>::BatchNormalizationForward(const CPUMatrix<float>& scale, const CPUMatrix<float>& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, CPUMatrix<float>& runMean, CPUMatrix<float>& runVariance, CPUMatrix<half>& out, double epsilon, CPUMatrix<float>& saveMean, CPUMatrix<float>& saveInvStdDev) const;
template void CPUMatrix<float>::BatchNormalizationBackward(const CPUMatrix<float>& in, CPUMatrix<float>& grad, const CPUMatrix<float>& scale, double blendFactor, const CPUMatrix<float>& saveMean, const CPUMatrix<float>& saveInvStdDev, CPUMatrix<float>& scaleGrad, CPUMatrix<float>& biasGrad) const;
template void CPUMatrix<double>::BatchNormalizationBackward(const CPUMatrix<double>& in, CPUMatrix<double>& grad, const CPUMatrix<double>& scale, double blendFactor, const CPUMatrix<double>& saveMean, const CPUMatrix<double>& saveInvStdDev, CPUMatrix<double>& scaleGrad, CPUMatrix<double>& biasGrad) const;
template void CPUMatrix<half>::BatchNormalizationBackward(const CPUMatrix<half>& in, CPUMatrix<half>& grad, const CPUMatrix<float>& scale, double blendFactor, const CPUMatrix<float>& saveMean, const CPUMatrix<float>& saveInvStdDev, CPUMatrix<float>& scaleGrad, CPUMatrix<float>& biasGrad) const;
}}}
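All of these half specializations follow the same pattern: since there is no optimized half-precision math on the CPU, inputs are widened to float, the existing float kernel does the actual work, and the result is narrowed back to half; operations that are not worth emulating simply raise RuntimeError. The pattern in isolation (FloatKernel is a placeholder for whatever optimized float routine does the work, e.g. a GEMM):

    // Hedged sketch of the widen / compute-in-float / narrow pattern used above.
    #include <cstddef>
    #include <vector>

    template <class Half, class FloatKernel>
    void RunHalfViaFloat(const Half* in, Half* out, size_t n, FloatKernel kernel)
    {
        std::vector<float> inF(n), outF(n);
        for (size_t i = 0; i < n; i++) inF[i] = (float)in[i];  // half -> float
        kernel(inF.data(), outF.data(), n);                    // compute in float
        for (size_t i = 0; i < n; i++) out[i] = (Half)outF[i]; // float -> half
    }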

Diff not shown because of its large size.

View file

@ -0,0 +1,30 @@
#include "stdafx.h"
#include "CPUMatrixTensorImpl.h"
namespace Microsoft { namespace MSR { namespace CNTK {
template
void CPUMatrixTensorOpImpl(double beta, const CPUMatrix<double>& a, CPUMatrix<double>& o, double alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
template
void CPUMatrixTensorOpImpl(double beta, const CPUMatrix<double>& a, const CPUMatrix<double>& b, CPUMatrix<double>& o, double alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides);
template
void CPUMatrixTensorOpImpl(double beta, const CPUMatrix<double>& a, const CPUMatrix<double>& b, const CPUMatrix<double>& c, CPUMatrix<double>& o, double alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 4>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides);
template
void CPUMatrixTensorArgOpImpl(const CPUMatrix<double>& a, CPUMatrix<double>& o, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
}}}

View file

@ -0,0 +1,30 @@
#include "stdafx.h"
#include "CPUMatrixTensorImpl.h"
namespace Microsoft { namespace MSR { namespace CNTK {
template
void CPUMatrixTensorOpImpl(float beta, const CPUMatrix<float>& a, CPUMatrix<float>& o, float alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
template
void CPUMatrixTensorOpImpl(float beta, const CPUMatrix<float>& a, const CPUMatrix<float>& b, CPUMatrix<float>& o, float alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides);
template
void CPUMatrixTensorOpImpl(float beta, const CPUMatrix<float>& a, const CPUMatrix<float>& b, const CPUMatrix<float>& c, CPUMatrix<float>& o, float alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 4>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides);
template
void CPUMatrixTensorArgOpImpl(const CPUMatrix<float>& a, CPUMatrix<float>& o, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
}}}

View file

@ -0,0 +1,30 @@
#include "stdafx.h"
#include "CPUMatrixTensorImpl.h"
namespace Microsoft { namespace MSR { namespace CNTK {
template
void CPUMatrixTensorOpImpl(half beta, const CPUMatrix<half>& a, CPUMatrix<half>& o, half alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
template
void CPUMatrixTensorOpImpl(half beta, const CPUMatrix<half>& a, const CPUMatrix<half>& b, CPUMatrix<half>& o, half alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides);
template
void CPUMatrixTensorOpImpl(half beta, const CPUMatrix<half>& a, const CPUMatrix<half>& b, const CPUMatrix<half>& c, CPUMatrix<half>& o, half alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 4>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides);
template
void CPUMatrixTensorArgOpImpl(const CPUMatrix<half>& a, CPUMatrix<half>& o, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides);
}}}

View file

@ -0,0 +1,519 @@
// Moved out of CPUMatrixImpl.h to keep the compiler from running out of heap and crashing
#include "CPUMatrix.h"
#include "TensorOps.h"
namespace Microsoft { namespace MSR { namespace CNTK {
// =======================================================================
// TensorView support
// =======================================================================
// To save time, this makes extensive use of templates and macros.
// -----------------------------------------------------------------------
// function to compute the value for a given output location (perform reduction if needed)
// -----------------------------------------------------------------------
// perform loop over reduction index m
// This function is declared inside a wrapper struct to allow partial specialization (m = -1).
template <class ElemType, typename OPFN, typename ReductionOp, size_t N, int m>
struct TensorOpReduction
{
// reduction case (non-reduction case is specialized)
static inline ElemType Loop(array<ElemType*, N> pointers, const OPFN& opfn, const ReductionOp& reductionOp,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{
array<ptrdiff_t, N - 1> strides; // N-1 because last one is the result pointer, which is unused in reduction
for (size_t i = 0; i < N - 1; i++) // N = a small constant, this will be unrolled
strides[i] = reducingStrides[i][(size_t) m];
double aggregate = TensorOpReduction<ElemType, OPFN, ReductionOp, N, m - 1>::Loop(pointers, opfn, reductionOp, reducingOpDims, reducingStrides);
for (size_t dim = reducingOpDims[(size_t)m] - 1; dim-- > 0;)
{
// advance the pointers
for (size_t i = 0; i < N - 1; i++)
pointers[i] += strides[i]; // note: last pointer (result) is unused and untouched here
// need to descend into one loop deeper
aggregate = reductionOp(aggregate, TensorOpReduction<ElemType, OPFN, ReductionOp, N, m - 1>::Loop(pointers, opfn, reductionOp, reducingOpDims, reducingStrides));
}
// It would actually be nicer to return double, but we keep ElemType so that tests don't return different numbers than the previous implementation.
return static_cast<ElemType>(aggregate);
}
};
// perform loop over reduction index m
// This is the specialized version for m = -1, which terminates the recursion.
template <class ElemType, typename OPFN, typename ReductionOp, size_t N>
struct TensorOpReduction<ElemType, OPFN, ReductionOp, N, -1>
{
static inline ElemType Loop(array<ElemType*, N> pointers, const OPFN& opfn, const ReductionOp& /*reductionOp*/,
const SmallVector<size_t>&, const array<SmallVector<ptrdiff_t>, N>&)
{
return opfn(pointers); // finally we are doing some work!!!
}
};
// perform loop over reduction index m, while keeping track of the number of elements and their corresponding indices.
// This function is declared inside a wrapper struct to allow partial specialization (m = -1).
template <class ElemType, size_t N, int m>
struct TensorArgOpReduction
{
static inline std::pair<ElemType, size_t> ReduceAll(array<ElemType*, N> pointers, const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides,
ElementWiseOperator reductionOp)
{
size_t counter = 0;
size_t index = 0;
ElemType val = (ElemType)0;
switch (reducingOpDims.size())
{
case 3:
val = TensorArgOpReduction<ElemType, N, 2>::Loop(pointers, reducingOpDims, reducingStrides, reductionOp, counter, index);
break;
case 2:
val = TensorArgOpReduction<ElemType, N, 1>::Loop(pointers, reducingOpDims, reducingStrides, reductionOp, counter, index);
break;
case 1:
val = TensorArgOpReduction<ElemType, N, 0>::Loop(pointers, reducingOpDims, reducingStrides, reductionOp, counter, index);
break;
case 0:
val = TensorArgOpReduction<ElemType, N, -1>::Loop(pointers, reducingOpDims, reducingStrides, reductionOp, counter, index);
break;
default:
LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (int)reducingOpDims.size());
}
return make_pair(val, index);
}
// reduction case (non-reduction case is specialized)
static inline ElemType Loop(array<ElemType*, N> pointers, const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides,
ElementWiseOperator reductionOp, size_t& counter, size_t& index)
{
array<ptrdiff_t, N - 1> strides; // N-1 because last one is the result pointer, which is unused in reduction
for (size_t i = 0; i < N - 1; i++) // N = a small constant, this will be unrolled
strides[i] = reducingStrides[i][(size_t)m];
ElemType aggregate = TensorArgOpReduction<ElemType, N, m - 1>::Loop(pointers, reducingOpDims, reducingStrides, reductionOp, counter, index);
for (size_t dim = reducingOpDims[(size_t)m] - 1; dim-- > 0;)
{
// advance the pointers
for (size_t i = 0; i < N - 1; i++)
pointers[i] += strides[i]; // note: last pointer (result) is unused and untouched here
ElemType val = TensorArgOpReduction<ElemType, N, m - 1>::Loop(pointers, reducingOpDims, reducingStrides, reductionOp, counter, index);
bool update = false;
switch (reductionOp)
{
case ElementWiseOperator::opArgmin:
update = (aggregate > val);
break;
case ElementWiseOperator::opArgmax:
update = (aggregate < val);
break;
}
if (update)
{
aggregate = val;
index = counter - 1;
}
}
return aggregate;
}
};
// perform loop over reduction index m
// This is the specialized version for m = -1, which terminates the recursion.
template <class ElemType, size_t N>
struct TensorArgOpReduction<ElemType, N, -1>
{
static inline ElemType Loop(array<ElemType*, N> pointers,
const SmallVector<size_t>&, const array<SmallVector<ptrdiff_t>, N>&, ElementWiseOperator /*reductionOp*/, size_t& counter, size_t& /*index*/)
{
counter++;
return *pointers[0]; // finally we are doing some work!!!
}
};
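TensorArgOpReduction is the machinery behind argmin/argmax: it walks the reduction dimensions, keeps a running best value, and records the flat counter at which that value was seen. Stripped of the stride bookkeeping, the computation per output element reduces to something like:

    // Hedged flat sketch of the argmax case handled above.
    #include <cstddef>

    template <class ElemType>
    size_t ArgMax(const ElemType* p, size_t n)
    {
        size_t best = 0;
        for (size_t i = 1; i < n; i++)
            if (p[best] < p[i])  // opArgmax keeps the larger value; opArgmin flips the comparison
                best = i;
        return best;
    }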
// -----------------------------------------------------------------------
// perform loop over regular index k for N-nary operations (N counting the output)
// -----------------------------------------------------------------------
// perform loop over regular index k and reducing index m for N operands (counting the output)
template <class ElemType, typename OPFN, typename ReductionOp, size_t N, bool vectorizable, int m, int k>
struct TensorOpIteration
{
static inline void Loop(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{
// non-scalar case: still nested result loops left
array<ptrdiff_t, N> strides;
for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled
strides[i] = regularStrides[i][(size_t) k];
for (size_t dim = regularOpDims[(size_t) k]; dim-- > 0;)
{
// need to descend into one loop deeper
TensorOpIteration<ElemType, OPFN, ReductionOp, N, vectorizable, m, k - 1>::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
// advance the pointers
for (size_t i = 0; i < N; i++)
pointers[i] += strides[i];
}
}
};
// Special version for innermost loop with strides all being 1 and no further reduction. Compiler can use SSE.
// This is a very common case, e.g. adding vectors or computing the Sigmoid.
template <class ElemType, typename OPFN, typename ReductionOp>
struct TensorOpIteration<ElemType, OPFN, ReductionOp, 3, true /*vectorizable*/, -1 /*no reduction*/, 0 /*innermost loop*/>
{
static inline void Loop(ElemType beta, array<ElemType*, 3> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides)
{
ElemType* pa = pointers[0];
ElemType* pb = pointers[1];
ElemType* pc = pointers[2];
size_t K = regularOpDims[0];
// special-case beta and alpha to allow the compiler to short-circuit it
if (beta != 0)
#pragma omp parallel for
for (int k = 0; k < (int) K; k++)
TensorOpIteration<ElemType, OPFN, ReductionOp, 3, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(beta, array<ElemType*, 3>{pa + k, pb + k, pc + k}, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
else if (alpha != 1)
#pragma omp parallel for
for (int k = 0; k < (int) K; k++)
TensorOpIteration<ElemType, OPFN, ReductionOp, 3, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 3>{pa + k, pb + k, pc + k}, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
else
#pragma omp parallel for
for (int k = 0; k < (int) K; k++)
TensorOpIteration<ElemType, OPFN, ReductionOp, 3, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 3>{pa + k, pb + k, pc + k}, 1, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
// TODO: According to Amit, the VS compiler is not able to vectorize into lambdas. Solution: change the lambda to take an N, or to implement the loop inside (with 1 element by default).
// TODO: The signedness of k (required for omp) causes an extra sign-extend.
// TODO: OMP adds LOTS of overhead. Do we need a guard, a min size when to use it?
}
};
// and unary
template <class ElemType, typename OPFN, typename ReductionOp>
struct TensorOpIteration<ElemType, OPFN, ReductionOp, 2, true /*vectorizable*/, -1 /*no reduction*/, 0 /*innermost loop*/>
{
static inline void Loop(ElemType beta, array<ElemType*, 2> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
{
ElemType* pa = pointers[0];
ElemType* pb = pointers[1];
size_t K = regularOpDims[0];
// special-case beta and alpha to allow the compiler to short-circuit it
if (beta != 0)
#pragma omp parallel for
for (int k = 0; k < (int) K; k++)
TensorOpIteration<ElemType, OPFN, ReductionOp, 2, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(beta, array<ElemType*, 2>{pa + k, pb + k}, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
else if (alpha != 1)
#pragma omp parallel for
for (int k = 0; k < (int) K; k++)
TensorOpIteration<ElemType, OPFN, ReductionOp, 2, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 2>{pa + k, pb + k}, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
else
#pragma omp parallel for
for (int k = 0; k < (int) K; k++)
TensorOpIteration<ElemType, OPFN, ReductionOp, 2, true /*vectorizable*/, -1 /*no reduction*/, -1 /*scalar*/>::Loop(0, array<ElemType*, 2>{pa + k, pb + k}, 1, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
};
template <class ElemType, typename OPFN, typename ReductionOp, size_t N, bool vectorizable, int m>
struct TensorOpIteration<ElemType, OPFN, ReductionOp, N, vectorizable, m, -1>
{
static inline void Loop(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
const SmallVector<size_t>&, const array<SmallVector<ptrdiff_t>, N>&,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{
// we are at element level for the result: perform the op (there may still be reduction)
ElemType val = TensorOpReduction<ElemType, OPFN, ReductionOp, N, m>::Loop(pointers, opfn, reductionOp, reducingOpDims, reducingStrides);
// scale
val *= alpha;
// combine with previous value in target matrix, then write it out
auto* pout = pointers.back();
if (beta != 0)
val += beta * *pout;
// save
*pout = val;
return;
}
};
// Perform the loop over regular index k and reducing index m for N operands (counting the output). The difference
// between TensorOpIteration and TensorArgOpIteration is that the latter stores the index of the result instead of
// the result itself. They are kept separate for performance reasons.
template <class ElemType, size_t N, int k>
struct TensorArgOpIteration
{
static inline void Loop(array<ElemType*, N> pointers,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides, ElementWiseOperator reductionOp)
{
// non-scalar case: still nested result loops left
array<ptrdiff_t, N> strides;
for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled
strides[i] = regularStrides[i][(size_t)k];
for (size_t dim = regularOpDims[(size_t)k]; dim-- > 0;)
{
// need to descend into one loop deeper
TensorArgOpIteration<ElemType, N, k - 1>::Loop(pointers, regularOpDims, regularStrides, reducingOpDims, reducingStrides, reductionOp);
// advance the pointers
for (size_t i = 0; i < N; i++)
pointers[i] += strides[i];
}
}
};
template <class ElemType, size_t N>
struct TensorArgOpIteration<ElemType, N, -1>
{
static inline void Loop(array<ElemType*, N> pointers,
const SmallVector<size_t>&, const array<SmallVector<ptrdiff_t>, N>&,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides, ElementWiseOperator reductionOp)
{
// we are at element level for the result: perform the op (there may still be reduction)
auto val = TensorArgOpReduction<ElemType, N, 2>::ReduceAll(pointers, reducingOpDims, reducingStrides, reductionOp);
auto* pout = pointers.back();
*pout = (ElemType)val.second;
return;
}
};
// -----------------------------------------------------------------------
// map runtime parameters N to template parameters
// -----------------------------------------------------------------------
// tensor operation with k+1 dimensions (-1 means scalar)
template <class ElemType, typename OPFN, typename ReductionOp, size_t N, int k>
static void TensorOpWithRegularLoop(ElemType beta, const array<ElemType*, N>& pointers, ElemType alpha, const OPFN& opfn, ReductionOp reductionOp,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{
size_t dims = reducingOpDims.size();
switch (dims)
{
case 2:
return TensorOpIteration<ElemType, OPFN, ReductionOp, N, false /*vectorizable*/, 1, k>::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 1:
return TensorOpIteration<ElemType, OPFN, ReductionOp, N, false /*vectorizable*/, 0, k>::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 0:
{
// if all leading dimensions are 1, we can let the compiler do some unrolling
bool leadingAllOne = true;
for (size_t i = 0; i < N; i++)
leadingAllOne &= k >= 0 && regularStrides[i][0] == 1;
if (leadingAllOne) // special version that uses a hard-coded increment of 1 for all leading dimensions
return TensorOpIteration<ElemType, OPFN, ReductionOp, N, true /*vectorizable*/, -1, k>::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
else
return TensorOpIteration<ElemType, OPFN, ReductionOp, N, false /*vectorizable*/, -1, k>::Loop(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
}
default:
LogicError("TensorOp: %d non-flattened reduction dimensions are not supported.", (int) dims);
}
}
// tensor operation, generalized in number of arguments, operation already provided as a lambda
// This function now expands into different k.
template <class ElemType, typename OPFN, typename ReductionOp, size_t N>
static void TensorOpWithFnAndReduction(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn, const ReductionOp& reductionOp,
const array<size_t, N>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{
for (size_t i = 0; i < N; i++) // N = a small constant, this will be unrolled
pointers[i] += offsets[i];
size_t dims = regularOpDims.size();
switch (dims)
{
// N.B. consider code size impact when adding more cases.
case 5:
return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 4>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 4:
return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 3>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 3:
return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 2>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 2:
return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 1>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 1:
return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, 0>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
case 0:
return TensorOpWithRegularLoop<ElemType, OPFN, ReductionOp, N, -1>(beta, pointers, alpha, opfn, reductionOp, regularOpDims, regularStrides, reducingOpDims, reducingStrides);
default:
LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (int)dims);
}
}
// tensor operation, generalized in number of arguments, operation already provided as a lambda
// This function now expands into different reductionOps
template <class ElemType, typename OPFN, size_t N>
static void TensorOpWithFn(ElemType beta, array<ElemType*, N> pointers, ElemType alpha, const OPFN& opfn, ElementWiseOperator reductionOp,
const array<size_t, N>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, N>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, N>& reducingStrides)
{
// BUGBUG: We always use 'double' as the aggregator type, even for ElemType==float. Reason: otherwise some e2e tests would fail, as historically we
// used double as the aggregator for sum. But:
// * for min and max reductions this is meaningless.
// * It is not consistent with what we do on GPU, where we aggregate on ElemType.
// * It costs performance.
// TODO: adapt the e2e tests to run with an aggregator of type ElemType.
#define CaseTensorOpWithFnAndReduction(oper) \
case ElementWiseOperator::op##oper: \
return TensorOpWithFnAndReduction(beta, pointers, alpha, opfn, [](double a, double b) \
{ \
return Op##oper(a, b); \
}, \
offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
switch (reductionOp)
{
CaseTensorOpWithFnAndReduction(Sum);
CaseTensorOpWithFnAndReduction(LogSum);
CaseTensorOpWithFnAndReduction(Min);
CaseTensorOpWithFnAndReduction(Max);
CaseTensorOpWithFnAndReduction(ElementwiseProduct);
default:
LogicError("Specified ElementWiseOperator op %d not supported as reduction operation.", (int)reductionOp);
}
}
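// For reference, CaseTensorOpWithFnAndReduction(Sum) expands to
//     case ElementWiseOperator::opSum:
//         return TensorOpWithFnAndReduction(beta, pointers, alpha, opfn,
//                                           [](double a, double b) { return OpSum(a, b); },
//                                           offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);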
// -----------------------------------------------------------------------
// entry points from Matrix.cpp; also map op to a lambda
// -----------------------------------------------------------------------
// perform unary operation 'op' on a, giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
// This maps 'op' to a lambda.
template <class ElemType>
void CPUMatrixTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, CPUMatrix<ElemType>& o, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
{
if (reductionOp != ElementWiseOperator::opSum &&
reductionOp != ElementWiseOperator::opLogSum &&
reductionOp != ElementWiseOperator::opMin &&
reductionOp != ElementWiseOperator::opMax &&
reductionOp != ElementWiseOperator::opElementwiseProduct)
InvalidArgument("TensorOp: Unary reduction operations other than opMax, opMin, opSum, and opLogSum are not implemented.");
// TODO: Change the lambda to take a pointer and a number of elements, so that we can pass it 1 or 4 elements, in order for it to SSE-vectorize.
#define CaseUnaryTensorOp(oper) \
case ElementWiseOperator::op##oper: \
return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 2>& pp) \
{ \
return Op##oper((*(pp[0]))); \
}, \
reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
array<ElemType*, 2> pointers = {a.Data(), o.Data()};
switch (op)
{
ForAllUnaryOps(CaseUnaryTensorOp);
default:
LogicError("TensorOp: Unknown unary op code %d.", (int) op);
}
}
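// For reference (assuming Sigmoid is among ForAllUnaryOps), CaseUnaryTensorOp(Sigmoid) expands to
//     case ElementWiseOperator::opSigmoid:
//         return TensorOpWithFn(beta, pointers, alpha,
//                               [](const array<ElemType*, 2>& pp) { return OpSigmoid((*(pp[0]))); },
//                               reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides);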
// perform binary operation 'op' on a and b giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
// This maps 'op' to a lambda.
template <class ElemType>
void CPUMatrixTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, CPUMatrix<ElemType>& o, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 3>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 3>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 3>& reducingStrides)
{
if (reductionOp != ElementWiseOperator::opSum)
InvalidArgument("TensorOp (binary): The only permitted binary reduction operation is opSum.");
#define CaseBinaryTensorOp(oper) \
case ElementWiseOperator::op##oper: \
return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 3>& pp) \
{ \
return Op##oper((*(pp[0])), (*(pp[1]))); \
}, \
reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
array<ElemType*, 3> pointers = {a.Data(), b.Data(), o.Data()};
switch (op)
{
ForAllBinaryOps(CaseBinaryTensorOp);
default:
LogicError("TensorOp: Unknown op binary code %d.", (int) op);
}
}
// perform ternary operation 'op' on a, b, and c, giving 'this', reinterpreting the matrices as tensors as specified by the dims and strides
// This maps 'op' to a lambda.
template <class ElemType>
void CPUMatrixTensorOpImpl(ElemType beta, const CPUMatrix<ElemType>& a, const CPUMatrix<ElemType>& b, const CPUMatrix<ElemType>& c, CPUMatrix<ElemType>& o, ElemType alpha, ElementWiseOperator op, ElementWiseOperator reductionOp,
const array<size_t, 4>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 4>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 4>& reducingStrides)
{
if (reductionOp != ElementWiseOperator::opSum)
InvalidArgument("TensorOp: The only permitted ternary reduction operation is opSum.");
#define CaseTernaryTensorOp(oper) \
case ElementWiseOperator::op##oper: \
return TensorOpWithFn(beta, pointers, alpha, [](const array<ElemType*, 4>& pp) \
{ \
return Op##oper((*(pp[0])), (*(pp[1])), (*(pp[2]))); \
}, \
reductionOp, offsets, regularOpDims, regularStrides, reducingOpDims, reducingStrides)
array<ElemType*, 4> pointers = {a.Data(), b.Data(), c.Data(), o.Data()};
switch (op)
{
ForAllTernaryOps(CaseTernaryTensorOp);
default:
LogicError("TensorOp: Unknown ternary op code %d.", (int) op);
}
}
template <class ElemType>
void CPUMatrixTensorArgOpImpl(const CPUMatrix<ElemType>& a, CPUMatrix<ElemType>& o, ElementWiseOperator reductionOp,
const array<size_t, 2>& offsets,
const SmallVector<size_t>& regularOpDims, const array<SmallVector<ptrdiff_t>, 2>& regularStrides,
const SmallVector<size_t>& reducingOpDims, const array<SmallVector<ptrdiff_t>, 2>& reducingStrides)
{
if (reductionOp != ElementWiseOperator::opArgmin &&
reductionOp != ElementWiseOperator::opArgmax)
InvalidArgument("TensorOp: Arg reduction operations other than opArgmax, and opArgmin are not implemented.");
if (o.GetNumElements() == 1)
{
o.Data()[0] = (ElemType) a.ArgOp(reductionOp);
}
else
{
const size_t N = 2;
array<ElemType*, N> pointers = { a.Data(), o.Data() };
for (size_t i = 0; i < N; i++)
pointers[i] += offsets[i];
switch (regularOpDims.size())
{
case 2:
TensorArgOpIteration<ElemType, N, 1>::Loop(pointers, regularOpDims, regularStrides, reducingOpDims, reducingStrides, reductionOp);
break;
case 1:
TensorArgOpIteration<ElemType, N, 0>::Loop(pointers, regularOpDims, regularStrides, reducingOpDims, reducingStrides, reductionOp);
break;
case 0:
TensorArgOpIteration<ElemType, N, -1>::Loop(pointers, regularOpDims, regularStrides, reducingOpDims, reducingStrides, reductionOp);
break;
default:
LogicError("TensorOp: %d non-flattened input dimensions are not supported.", (int)regularOpDims.size());
}
}
}
}}}
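All of the dispatch above eventually reduces to walking the N operand pointers by their per-dimension strides and applying the op lambda at each output element. The following self-contained sketch (plain C++, not CNTK code; all names are illustrative) distills that innermost pattern for the two-operand case:

    #include <array>
    #include <cstddef>
    #include <cstdio>
    #include <vector>

    // Two-operand elementwise loop in the style of TensorOpIteration: 'opfn' receives the
    // pointer array, alpha scales the result, beta blends with the previous output value.
    template <typename ElemType, typename OPFN>
    void ElementwiseLoop(ElemType beta, std::array<ElemType*, 2> pointers, ElemType alpha,
                         const OPFN& opfn, std::size_t K, const std::array<std::ptrdiff_t, 2>& strides)
    {
        for (std::size_t k = 0; k < K; k++)
        {
            ElemType val = alpha * opfn(pointers); // apply the op at the current element
            ElemType* pout = pointers.back();
            if (beta != 0)
                val += beta * *pout;               // blend with the previous output value
            *pout = val;
            for (std::size_t i = 0; i < 2; i++)    // advance each operand by its stride
                pointers[i] += strides[i];
        }
    }

    int main()
    {
        std::vector<float> a = {1, 2, 3, 4}, o(4, 0.0f);
        std::array<float*, 2> pointers = {a.data(), o.data()};
        std::array<std::ptrdiff_t, 2> strides = {1, 1};
        // o = 2 * a: alpha == 2, beta == 0, opfn just reads the first operand
        ElementwiseLoop<float>(0.0f, pointers, 2.0f,
                               [](const std::array<float*, 2>& pp) { return *(pp[0]); },
                               a.size(), strides);
        for (float v : o)
            std::printf("%g ", v); // prints: 2 4 6 8
        std::printf("\n");
        return 0;
    }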

View file

@ -20,6 +20,7 @@
#ifdef LEAKDETECT
#include <vld.h>
#endif
#include "half.hpp"
#pragma warning(disable : 4127) // conditional expression is constant; "if (sizeof(ElemType)==sizeof(float))" triggers this
@ -480,7 +481,7 @@ CPUSparseMatrix<ElemType>& CPUSparseMatrix<ElemType>::DoScatterColumnsOf(ElemTyp
// TODO: Replace with std::exclusive_scan when we switch to C++17
for (size_t i = 1; i <= GetNumCols(); ++i)
SecondaryIndexLocation()[i] = SecondaryIndexLocation()[i - 1] + columnElementCounts[i - 1];
size_t offset = a.SecondaryIndexLocation()[0];
// TODO: Does it make sense to parallelize this?
for (long j = 0; j < numColsToWrite; j++)
@ -531,7 +532,7 @@ void CPUSparseMatrix<ElemType>::Print(const char* matrixName, ptrdiff_t /*rowSta
fprintf(stderr, "\n");
j++;
}
fprintf(stderr, "%d:%.f ", unCompressedIndex[i], dataBuffer[i]);
fprintf(stderr, "%d:%.f ", unCompressedIndex[i], (double)dataBuffer[i]);
}
fprintf(stderr, "\n");
}
@ -721,7 +722,7 @@ void CPUSparseMatrix<ElemType>::SetMatrixFromSBCFormat(const size_t* blockIds, c
template <class ElemType>
ElemType* CPUSparseMatrix<ElemType>::Data() const
{
return (Buffer() +
((GetFormat() == matrixFormatSparseCSC || GetFormat() == matrixFormatSparseCSR) ? GetCompIndex()[m_sliceViewOffset] : 0));
}
@ -810,7 +811,7 @@ template <class ElemType>
void CPUSparseMatrix<ElemType>::RequireSizeAndAllocate(const size_t numRows, const size_t numCols, const size_t numNZElemToReserve, const MatrixFormat matrixFormat, const bool growOnly /*= true*/, bool keepExistingValues /*= true*/)
{
RequireSize(numRows, numCols, numNZElemToReserve, matrixFormat, growOnly);
size_t newCompIndexSize = (numCols > numRows ? numCols : numRows) + 1;
bool reallocate = (GetSizeAllocated() < numNZElemToReserve || (GetSizeAllocated() > numNZElemToReserve && !growOnly) || GetCompIndexSize() < newCompIndexSize);
@ -964,7 +965,7 @@ public:
else if ( denseTimesSparse && transposeA) denseVal = dense( innerIndex, outerIndexDense);
else if (!denseTimesSparse && !transposeB) denseVal = dense( innerIndex, outerIndexDense);
else if (!denseTimesSparse && transposeB) denseVal = dense(outerIndexDense, innerIndex);
// Update matrix c.
if (denseTimesSparse)
@ -1312,7 +1313,7 @@ void CPUSparseMatrix<ElemType>::InnerProduct(const CPUSparseMatrix<ElemType>& a,
}
// A helper method used in MomentumSGDUpdate and NesterovAcceleratedMomentumSGDUpdate.
// Modifies the smoothed gradients "c", as well as the current gradients "this" on which this method is invoked.
// Classic momentum (unitGainFactor == 1.0):
// 1) c = momentum * c + this
// Unit-gain momentum (unitGainFactor == 1.0 - momentum):
@ -1423,7 +1424,8 @@ ElemType CPUSparseMatrix<ElemType>::Adagrad(CPUMatrix<ElemType>& c, const bool n
}
template <class ElemType>
void CPUSparseMatrix<ElemType>::AdaDelta(CPUMatrix<ElemType>& c, CPUMatrix<ElemType>& functionValues, ElemType learningRate, ElemType rho, ElemType epsilon, int* timestamps, int currentTimestamp)
template <class AccumType>
void CPUSparseMatrix<ElemType>::AdaDelta(CPUMatrix<AccumType>& c, CPUMatrix<AccumType>& functionValues, AccumType learningRate, AccumType rho, AccumType epsilon, int* timestamps, int currentTimestamp)
{
size_t numColsNeeded = 2 * GetNumCols();
@ -1441,9 +1443,9 @@ void CPUSparseMatrix<ElemType>::AdaDelta(CPUMatrix<ElemType>& c, CPUMatrix<ElemT
size_t n = GetNumElements();
ElemType* grad = Data();
ElemType* smoothAda = c.Data();
ElemType* smoothX2 = c.Data() + n;
ElemType* val = functionValues.Data();
AccumType* smoothAda = c.Data();
AccumType* smoothX2 = c.Data() + n;
AccumType* val = functionValues.Data();
auto rows = GetNumRows();
#pragma omp parallel for
@ -1459,10 +1461,10 @@ void CPUSparseMatrix<ElemType>::AdaDelta(CPUMatrix<ElemType>& c, CPUMatrix<ElemT
{
size_t denseIndex = columnOffset + row;
ElemType g = grad[blockOffset + row];
ElemType adaSqr = rho * decay * smoothAda[denseIndex] + (1 - rho) * g * g;
AccumType adaSqr = rho * decay * smoothAda[denseIndex] + (1 - rho) * g * g;
smoothAda[denseIndex] = adaSqr;
ElemType x2 = decay * smoothX2[denseIndex];
ElemType deltaX = -sqrt(x2 + epsilon) / sqrt(adaSqr + epsilon) * g;
AccumType x2 = decay * smoothX2[denseIndex];
AccumType deltaX = -sqrt(x2 + epsilon) / sqrt(adaSqr + epsilon) * g;
smoothX2[denseIndex] = rho * x2 + (1 - rho) * deltaX * deltaX;
val[denseIndex] += learningRate * deltaX;
}
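For reference, apart from the decay factor that ages the accumulators of columns not touched since their last update, the loop above implements the standard AdaDelta rule, with an additional learning-rate scale on the final step:

    E[g^2]_t = \rho\, E[g^2]_{t-1} + (1-\rho)\, g_t^2
    \Delta x_t = -\frac{\sqrt{E[\Delta x^2]_{t-1} + \epsilon}}{\sqrt{E[g^2]_t + \epsilon}}\, g_t
    E[\Delta x^2]_t = \rho\, E[\Delta x^2]_{t-1} + (1-\rho)\, \Delta x_t^2
    x_{t+1} = x_t + \eta\, \Delta x_t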
@ -1708,6 +1710,18 @@ ElemType CPUSparseMatrix<ElemType>::SumOfElements() const
return sum;
}
// Specialization that raises RuntimeError for now, because the OMP implementation only supports built-in types
template <>
half CPUSparseMatrix<half>::FrobeniusNorm() const
{
RuntimeError("half FrobeniusNorm not supported.");
}
template <>
half CPUSparseMatrix<half>::SumOfElements() const
{
RuntimeError("half SumOfElements not supported.");
}
template <typename ElemType>
MATH_API File& operator>>(File& stream, CPUSparseMatrix<ElemType>& us)
{
@ -1763,8 +1777,54 @@ MATH_API File& operator>>(File& stream, CPUSparseMatrix<ElemType>& us)
template MATH_API File& operator>>(File& stream, CPUSparseMatrix<float>& us);
template MATH_API File& operator>>(File& stream, CPUSparseMatrix<double>& us);
template <typename ElemType>
MATH_API File& operator<<(File& stream, const CPUSparseMatrix<ElemType>& us)
{
if (us.GetFormat() != matrixFormatSparseCSC && us.GetFormat() != matrixFormatSparseCSR)
NOT_IMPLEMENTED;
stream.PutMarker(fileMarkerBeginSection, std::wstring(L"BMAT"));
stream << sizeof(ElemType);
stream << std::wstring(L"nnmatrix"); // Note this is needed for compatability, and could potentially be an empty string
size_t nz = us.NzCount(), numRows = us.GetNumRows(), numCols = us.GetNumCols();
size_t compressedSize = us.SecondaryIndexCount();
int format = us.GetFormat();
stream << format << nz << numCols << numRows;
if (nz > 0)
{
ElemType* dataBuffer = us.NzValues();
CPUSPARSE_INDEX_TYPE* unCompressedIndex = us.MajorIndexLocation();
CPUSPARSE_INDEX_TYPE* compressedIndex = us.SecondaryIndexLocation();
for (size_t i = 0; i < nz; ++i)
{
stream << dataBuffer[i];
}
for (size_t i = 0; i < nz; ++i)
{
stream << unCompressedIndex[i];
}
for (size_t i = 0; i < compressedSize; ++i)
{
stream << compressedIndex[i];
}
}
stream.PutMarker(fileMarkerEndSection, std::wstring(L"EMAT"));
return stream;
}
template class CPUSparseMatrix<float>;
template class CPUSparseMatrix<double>;
template class CPUSparseMatrix<half>;
// instantiate learner methods
template void CPUSparseMatrix<float>::AdaDelta(CPUMatrix<float>& c, CPUMatrix<float>& functionValues, float learningRate, float rho, float epsilon, int* timestamps, int currentTimestamp);
template void CPUSparseMatrix<double>::AdaDelta(CPUMatrix<double>& c, CPUMatrix<double>& functionValues, double learningRate, double rho, double epsilon, int* timestamps, int currentTimestamp);
template void CPUSparseMatrix<half>::AdaDelta(CPUMatrix<float>& c, CPUMatrix<float>& functionValues, float learningRate, float rho, float epsilon, int* timestamps, int currentTimestamp);
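// i.e. an FP16 sparse gradient keeps its AdaDelta state and the model parameters in FP32 (sketch; names are illustrative):
//     CPUSparseMatrix<half>& gradient = ...;   // FP16 gradient
//     CPUMatrix<float>& smoothed = ...;        // holds both E[g^2] and E[dx^2]
//     CPUMatrix<float>& parameters = ...;
//     gradient.AdaDelta(smoothed, parameters, learningRate, rho, epsilon, timestamps, currentTimestamp);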
// We use Matrix<char> as the backing store for QuantizedMatrix
// Let's explicitly instantiate the methods we need for that purpose

View file

@ -232,7 +232,9 @@ public:
public:
void NormalGrad(CPUMatrix<ElemType>& c, const ElemType momentum, ElemType unitGainFactor);
ElemType Adagrad(CPUMatrix<ElemType>& c, const bool needAveMultiplier);
void AdaDelta(CPUMatrix<ElemType>& c, CPUMatrix<ElemType>& functionValues, ElemType learningRate, ElemType rho, ElemType epsilon, int* timestamps, int currentTimestamp);
template<typename AccumType>
void AdaDelta(CPUMatrix<AccumType>& c, CPUMatrix<AccumType>& functionValues, AccumType learningRate, AccumType rho, AccumType epsilon, int* timestamps, int currentTimestamp);
public:
CPUSparseMatrix<ElemType>& InplaceTruncateTop(const ElemType threshold);

View file

@ -1,23 +1,11 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
#pragma once
#ifdef _MSC_VER
#pragma warning(push)
#pragma warning(disable : 4100) // 'identifier': unreferenced formal parameter
#pragma warning(disable : 4127) // conditional expression is constant
#pragma warning(disable : 4201) // nonstandard extension used: nameless struct/union
#pragma warning(disable : 4458) // declaration of 'identifier' hides class member
#pragma warning(disable : 4515) // 'namespace': namespace uses itself
#endif
#include <cub/cub.cuh>
#ifdef _MSC_VER
#pragma warning(pop)
#endif
namespace Microsoft { namespace MSR { namespace CNTK {
size_t RoundUpToMultiple(size_t n, size_t blockSize)
@ -40,7 +28,7 @@ cudaError_t GetLastCudaError()
#endif
return cudaSuccess;
}
/*
template <int U, typename T>
__device__ __forceinline__ void LoadValues(const T* src, T dst[U])
{
@ -48,9 +36,17 @@ __device__ __forceinline__ void LoadValues(const T* src, T dst[U])
for (int i = 0; i < U; i++)
dst[i] = src[i];
}
*/
template <int U, typename T1, typename T2>
__device__ __forceinline__ void LoadValues(const T1* src, T2 dst[U])
{
#pragma unroll
for (int i = 0; i < U; i++)
dst[i] = (T2)src[i];
}
template <>
__device__ __forceinline__ void LoadValues<2, float>(const float* src, float dst[2])
__device__ __forceinline__ void LoadValues<2, float, float>(const float* src, float dst[2])
{
// src must be aligned at 8 bytes boundary.
assert(reinterpret_cast<uintptr_t>(src) % (sizeof(dst)) == 0);
@ -60,7 +56,7 @@ __device__ __forceinline__ void LoadValues<2, float>(const float* src, float dst
}
template <>
__device__ __forceinline__ void LoadValues<4, float>(const float* src, float dst[4])
__device__ __forceinline__ void LoadValues<4, float, float>(const float* src, float dst[4])
{
// src must be aligned at 16 bytes boundary.
assert(reinterpret_cast<uintptr_t>(src) % (sizeof(dst)) == 0);
@ -73,7 +69,7 @@ __device__ __forceinline__ void LoadValues<4, float>(const float* src, float dst
dst[2] = v.z;
dst[3] = v.w;
}
/*
template <int U, typename T>
__device__ __forceinline__ void StoreValues(const T src[U], T* dst)
{
@ -81,9 +77,17 @@ __device__ __forceinline__ void StoreValues(const T src[U], T* dst)
for (int i = 0; i < U; i++)
dst[i] = src[i];
}
*/
template <int U, typename T1, typename T2>
__device__ __forceinline__ void StoreValues(const T1 src[U], T2* dst)
{
#pragma unroll
for (int i = 0; i < U; i++)
dst[i] = (T2)src[i];
}
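// The two-type overloads above are what let the FP16 path keep storage in half while computing
// in float, e.g. (sketch):
//     const half* psrc; half* pdst; float acc[4];
//     LoadValues<4>(psrc, acc);  // per-element cast: acc[i] = (float)psrc[i]
//     // ... compute on acc in float ...
//     StoreValues<4>(acc, pdst); // per-element cast: pdst[i] = (half)acc[i]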
template <>
__device__ __forceinline__ void StoreValues<2, float>(const float src[2], float* dst)
__device__ __forceinline__ void StoreValues<2, float, float>(const float src[2], float* dst)
{
// dst must be aligned at 8 bytes boundary.
assert(reinterpret_cast<uintptr_t>(dst) % (sizeof(src)) == 0);
@ -94,7 +98,7 @@ __device__ __forceinline__ void StoreValues<2, float>(const float src[2], float*
}
template <>
__device__ __forceinline__ void StoreValues<4, float>(const float src[4], float* dst)
__device__ __forceinline__ void StoreValues<4, float, float>(const float src[4], float* dst)
{
// dst must be aligned at 16 bytes boundary.
assert(reinterpret_cast<uintptr_t>(dst) % (sizeof(src)) == 0);
@ -107,12 +111,16 @@ __device__ __forceinline__ void StoreValues<4, float>(const float src[4], float*
}
template <typename T>
__device__ __forceinline__ T Shuffle(T input, int srcLane)
__device__ __forceinline__ T Shuffle(T input, int srcLane, unsigned int mask)
{
#ifdef __CUDA_ARCH__
// shfl is supported only on Kepler+
static_assert(__CUDA_ARCH__ >= 300, "CNTK supports only Kepler GPU architecture or newer.");
#if CUDA_VERSION >= 9000
return cub::ShuffleIndex(input, srcLane, CUB_PTX_WARP_THREADS, mask); // Need cub > 1.7.0
#else
return cub::ShuffleIndex(input, srcLane);
#endif
#else
assert(false);
return input; // keep compiler happy
@ -136,6 +144,15 @@ namespace Operations
assert(::isfinite(a) && a > 0);
return rsqrt(a);
}
__device__ half RSqrt(half a)
{
#if __CUDA_ARCH__ >= 600
return hrsqrt(a);
#else
return __float2half(rsqrtf(__half2float(a)));
#endif
}
}
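// Note on the CUDA 9 change above: the *_sync warp intrinsics require an explicit mask of
// participating lanes. The reduction kernels below build it once from the per-lane predicate
// and pass it through Shuffle(), e.g.:
//     unsigned int mask = __ballot_sync(0xffffffff, n); // bit set for every lane with n != 0
//     int n2 = Shuffle(n, srcLane, mask);               // cub::ShuffleIndex with that mask on CUDA 9+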
// This function is used to select the correct unroll factor.
@ -151,6 +168,17 @@ void Call(size_t vectorSize, Targs... args)
Func<1>::template Call<T>(args...);
}
template <template <int> class Func, typename T1, typename T2, typename ...Targs>
void Call2(size_t vectorSize, Targs... args)
{
if ((vectorSize % 4) == 0)
Func<4>::template Call<T1, T2>(args...);
else if ((vectorSize % 2) == 0)
Func<2>::template Call<T1, T2>(args...);
else
Func<1>::template Call<T1, T2>(args...);
}
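// Usage sketch (argument list abbreviated): the leading size argument only picks the unroll
// factor U; the remaining arguments are forwarded to Func<U>::Call<T1, T2>, e.g.
//     Call2<ComputeBatchMeanAndInvStdDev, half, float>(vectorSize, /* forwarded: */ vectorSize, batchSize, x, ...);
//     // vectorSize % 4 == 0 -> U == 4, else vectorSize % 2 == 0 -> U == 2, else U == 1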
//--------------------------------------------------------------------
// Mean and variance computation
//--------------------------------------------------------------------
@ -186,15 +214,16 @@ void Call(size_t vectorSize, Targs... args)
// 0 < blendFactor < 1 - blend new running mean/var with averaged mean/var of the current minibatch, e.g.,
// new xMean = (1 - blendFactor) * actual batch mean + blendFactor * new runMean
// blendFactor == 0 - use actual batch mean/var
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
template <int BlockDimX, int BlockDimY, int U, typename ElemType, typename StatType>
__global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
const ElemType* x, // (in) input data
double expAvgFactor, // TODO why not ElemType? same for the other parameters, functions?
double blendFactor,
ElemType* runMean, ElemType* runVariance, // (in/out) running mean/variance, gets updated with current minibatch
StatType* runMean, StatType* runVariance, // (in/out) running mean/variance, gets updated with current minibatch
double epsilon,
ElemType* xMean, ElemType* xInvStdDev) // (out) this minibatch's mean and inverse stddev
StatType* xMean, StatType* xInvStdDev) // (out) this minibatch's mean and inverse stddev
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
assert((vectorSize % U) == 0);
@ -219,9 +248,9 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
// first estimate mean over all data for this thread
int n = 0;
ElemType mean[U]; // this thread's part of the mean vector (stored as a normalized mean also during accumulation)
ElemType m2[U]; // likewise for variance
ElemType im2[U]; // and inverse stddev
comp_t mean[U]; // this thread's part of the mean vector (stored as a normalized mean also during accumulation)
comp_t m2[U]; // likewise for variance
comp_t im2[U]; // and inverse stddev
#pragma unroll
for (int k = 0; k < U; k++)
{
@ -235,13 +264,13 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
for (; icolSrc < batchSize; icolSrc += BlockDimY)
{
n++;
ElemType curVal[U];
comp_t curVal[U];
LoadValues<U>(psrc, curVal);
// No need for separate unrolling, SASS looks good.
#pragma unroll
for (int k = 0; k < U; k++)
{
ElemType d = curVal[k] - mean[k];
comp_t d = curVal[k] - mean[k];
// REVIEW alexeyk: we enabled fast CUDA math in CNTK so division below will be approximate, is this a problem?
// Using precise math slows down the code by about 40%.
mean[k] += d / n; // mean_n = [mean_{n-1} * (n-1) + curVal] / n = mean_{n-1} *n/n - mean_{n-1} / n + curVal / n
@ -253,6 +282,12 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
// now reduce minibatch mean/variance across threads
const int tid = threadIdx.y * BlockDimX + threadIdx.x;
const int laneId = tid & 0x1f;
unsigned int mask;
#if CUDA_VERSION >= 9000
mask = __ballot_sync(0xffffffff, n);
#endif
// First, reduce within warp using shuffle.
if (n > 0)
{
@ -260,16 +295,16 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
for (int i = 1; i < CUB_PTX_WARP_THREADS / BlockDimX; i *= 2)
{
int srcLane = laneId + BlockDimX * i;
int n2 = Shuffle(n, srcLane);
int n2 = Shuffle(n, srcLane, mask);
int nsum = n + n2;
ElemType d[U];
comp_t d[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
d[k] = Shuffle(mean[k], srcLane) - mean[k];
ElemType dScaled = d[k] * n2 / nsum;
d[k] = Shuffle(mean[k], srcLane, mask) - mean[k];
comp_t dScaled = d[k] * n2 / nsum;
mean[k] += dScaled;
m2[k] += Shuffle(m2[k], srcLane) + d[k] * n * dScaled;
m2[k] += Shuffle(m2[k], srcLane, mask) + d[k] * n * dScaled;
}
n = nsum;
}
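// The shuffle loop above merges two partial streams A and B of the online (Welford-style)
// statistics with the pairwise update
//     delta = mean_B - mean_A
//     mean  = mean_A + delta * n_B / (n_A + n_B)
//     M2    = M2_A + M2_B + delta^2 * n_A * n_B / (n_A + n_B)
//     n     = n_A + n_B
// where M2 accumulates the sum of squared deviations, so that variance = M2 / (n - 1).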
@ -278,8 +313,8 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
// Storage for each warp in a thread block. First warp ("accumulator") holds
// final results so it does not need shared memory.
const int cwarp = BlockDimX * BlockDimY / CUB_PTX_WARP_THREADS;
__shared__ ElemType meanRes[BlockDimX * U][cwarp - 1];
__shared__ ElemType m2Res[BlockDimX * U][cwarp - 1];
__shared__ comp_t meanRes[BlockDimX * U][cwarp - 1];
__shared__ comp_t m2Res[BlockDimX * U][cwarp - 1];
__shared__ int nRes[cwarp - 1];
// Each warp (except warp0) will write accumulated results to shared memory.
@ -309,12 +344,12 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
{
int n2 = nRes[i];
int nsum = n + n2;
ElemType d[U];
comp_t d[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
d[k] = meanRes[threadIdx.x * U + k][i] - mean[k];
ElemType dScaled = d[k] * n2 / nsum;
comp_t dScaled = d[k] * n2 / nsum;
mean[k] += dScaled;
m2[k] += m2Res[threadIdx.x * U + k][i] + d[k] * n * dScaled;
}
@ -322,8 +357,8 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
}
size_t idxDstBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
ElemType run[U];
ElemType x[U];
comp_t run[U];
comp_t x[U];
// Compute running mean and batch mean.
LoadValues<U>(runMean + idxDstBase, run);
@ -344,14 +379,14 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
for (int k = 0; k < U; k++)
{
// Compute batch inverse standard deviation and variance
ElemType runVariance = batchSize == 1 ? 0 : m2[k] / (batchSize - 1);
comp_t runVariance = batchSize == 1 ? 0 : m2[k] / (batchSize - 1);
// Average
run[k] = expAvgFactor * runVariance + (1.0 - expAvgFactor) * run[k];
// Blend
im2[k] = Operations::RSqrt(static_cast<ElemType>(m2[k] / batchSize + epsilon));
im2[k] = Operations::RSqrt(static_cast<comp_t>(m2[k] / batchSize + epsilon));
if (blendFactor != 0)
{
ElemType runInvStdDev = Operations::RSqrt(static_cast<ElemType>(run[k] + epsilon));
comp_t runInvStdDev = Operations::RSqrt(static_cast<comp_t>(run[k] + epsilon));
im2[k] = blendFactor * runInvStdDev + (1.0 - blendFactor) * im2[k];
}
}
@ -363,7 +398,7 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
else if (threadIdx.y == 0)
{
size_t idxDstBase = (blockIdx.x * BlockDimX + threadIdx.x) * U;
ElemType run[U];
comp_t run[U];
// Copy mean
LoadValues<U>(runMean + idxDstBase, run);
@ -373,7 +408,7 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
LoadValues<U>(runVariance + idxDstBase, run);
#pragma unroll
for (int k = 0; k < U; k++)
run[k] = Operations::RSqrt(static_cast<ElemType>(run[k] + epsilon));
run[k] = Operations::RSqrt(static_cast<comp_t>(run[k] + epsilon));
StoreValues<U>(run, xInvStdDev + idxDstBase);
}
}
@ -381,12 +416,13 @@ __global__ void kComputeBatchMeanAndInvStdDev(int vectorSize, int batchSize,
// This kernel is very similar to kComputeBatchMeanAndInvStdDev except it reduces not just over N (minibatch)
// but also W and H dimensions.
// REVIEW alexeyk: is it possible to combine this and previous kernel into a single kernel without hurting performance/readability much?
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
template <int BlockDimX, int BlockDimY, int U, typename ElemType, typename StatType>
__global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatialSize, int batchSize, const ElemType* x,
double expAvgFactor, double blendFactor,
ElemType* runMean, ElemType* runVariance,
double epsilon, ElemType* xMean, ElemType* xInvStdDev)
StatType* runMean, StatType* runVariance,
double epsilon, StatType* xMean, StatType* xInvStdDev)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
assert(blockDim.x == BlockDimX);
@ -410,8 +446,8 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial
int irowSrcLim = (blockIdx.x + 1) * spatialSize;
int n = 0;
ElemType mean[U];
ElemType m2[U];
comp_t mean[U];
comp_t m2[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
@ -429,13 +465,13 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial
for (int irowSrc = irowSrcBase; irowSrc < irowSrcLim; irowSrc += BlockDimX * U, psrc += BlockDimX * U)
{
n++;
ElemType curVal[U];
comp_t curVal[U];
LoadValues<U>(psrc, curVal);
// No need for separate unrolling, SASS looks good.
#pragma unroll
for (int k = 0; k < U; k++)
{
ElemType d = curVal[k] - mean[k];
comp_t d = curVal[k] - mean[k];
// REVIEW alexeyk: we enabled fast CUDA math in CNTK so division below will be approximate, is this a problem?
// Using precise math slows down the code by about 40%.
mean[k] += d / n;
@ -448,22 +484,26 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial
const int tid = threadIdx.y * BlockDimX + threadIdx.x;
const int laneId = tid & 0x1f;
// First, reduce within warp using shuffle.
unsigned int mask;
#if CUDA_VERSION >= 9000
mask = __ballot_sync(0xffffffff, n);
#endif
if (n > 0)
{
#pragma unroll
for (int i = 1; i < CUB_PTX_WARP_THREADS; i *= 2)
{
int srcLane = laneId + i;
int n2 = Shuffle(n, srcLane);
int n2 = Shuffle(n, srcLane, mask);
int nsum = n + n2;
ElemType d[U];
comp_t d[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
d[k] = Shuffle(mean[k], srcLane) - mean[k];
ElemType dScaled = d[k] * n2 / nsum;
d[k] = Shuffle(mean[k], srcLane, mask) - mean[k];
comp_t dScaled = d[k] * n2 / nsum;
mean[k] += dScaled;
m2[k] += Shuffle(m2[k], srcLane) + d[k] * n * dScaled;
m2[k] += Shuffle(m2[k], srcLane, mask) + d[k] * n * dScaled;
}
n = nsum;
}
@ -472,8 +512,8 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial
// Storage for each warp in a thread block. First warp ("accumulator") holds
// final results so it does not need shared memory.
const int cwarp = BlockDimX * BlockDimY / CUB_PTX_WARP_THREADS;
__shared__ ElemType meanRes[U][cwarp - 1];
__shared__ ElemType m2Res[U][cwarp - 1];
__shared__ comp_t meanRes[U][cwarp - 1];
__shared__ comp_t m2Res[U][cwarp - 1];
__shared__ int nRes[cwarp - 1];
// Each warp (except warp0) will write accumulated results to shared memory.
@ -499,12 +539,12 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial
{
int n2 = nRes[i];
int nsum = n + n2;
ElemType d[U];
comp_t d[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
d[k] = meanRes[k][i] - mean[k];
ElemType dScaled = d[k] * n2 / nsum;
comp_t dScaled = d[k] * n2 / nsum;
mean[k] += dScaled;
m2[k] += m2Res[k][i] + d[k] * n * dScaled;
}
@ -515,8 +555,8 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial
#pragma unroll
for (int k = 1; k < U; k++)
{
ElemType d = mean[k] - mean[0];
ElemType dScaled = d * n / (n + k * n);
comp_t d = mean[k] - mean[0];
comp_t dScaled = d * n / (n + k * n);
mean[0] += dScaled;
m2[0] += m2[k] + d * k * n * dScaled;
}
@ -525,12 +565,12 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial
runMean[blockIdx.x] = expAvgFactor * mean[0] + (1.0 - expAvgFactor) * runMean[blockIdx.x];
xMean[blockIdx.x] = blendFactor * runMean[blockIdx.x] + (1.0 - blendFactor) * mean[0];
ElemType runV = batchSize * spatialSize == 1 ? 0 : m2[0] / (batchSize * spatialSize - 1);
comp_t runV = batchSize * spatialSize == 1 ? 0 : m2[0] / (batchSize * spatialSize - 1);
runVariance[blockIdx.x] = expAvgFactor * runV + (1.0 - expAvgFactor) * runVariance[blockIdx.x];
xInvStdDev[blockIdx.x] = Operations::RSqrt(static_cast<ElemType>(m2[0] / (batchSize * spatialSize) + epsilon));
xInvStdDev[blockIdx.x] = Operations::RSqrt(static_cast<comp_t>(m2[0] / (batchSize * spatialSize) + (comp_t)epsilon));
if (blendFactor != 0)
{
ElemType runInvStdDev = Operations::RSqrt(static_cast<ElemType>(runVariance[blockIdx.x] + epsilon));
comp_t runInvStdDev = Operations::RSqrt(static_cast<comp_t>((comp_t)runVariance[blockIdx.x] + (comp_t)epsilon));
xInvStdDev[blockIdx.x] = blendFactor * runInvStdDev + (1.0 - blendFactor) * xInvStdDev[blockIdx.x];
}
}
@ -538,7 +578,7 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial
else if (threadIdx.y == 0 && threadIdx.x == 0)
{
xMean[blockIdx.x] = runMean[blockIdx.x];
xInvStdDev[blockIdx.x] = Operations::RSqrt(static_cast<ElemType>(runVariance[blockIdx.x] + epsilon));
xInvStdDev[blockIdx.x] = Operations::RSqrt(static_cast<comp_t>((comp_t)runVariance[blockIdx.x] + (comp_t)epsilon));
}
}
@ -547,14 +587,14 @@ __global__ void kComputeSpatialBatchMeanAndInvStdDev(int vectorSize, int spatial
template <int U>
struct ComputeBatchMeanAndInvStdDev
{
template <typename ElemType>
template <typename ElemType, typename StatType>
static void Call(size_t vectorSize, size_t batchSize,
const ElemType* x, // (in) input data
double expAvgFactor,
double blendFactor,
ElemType* runMean, ElemType* runVariance, // (in/out) running mean/variance, gets updated with current minibatch
StatType* runMean, StatType* runVariance, // (in/out) running mean/variance, gets updated with current minibatch
double epsilon,
ElemType* xMean, ElemType* xInvStdDev, // (out) actual interpolated mean/stddev that are used to normalize. Returned since needed in backprop.
StatType* xMean, StatType* xInvStdDev, // (out) actual interpolated mean/stddev that are used to normalize. Returned since needed in backprop.
cudaStream_t stream)
{
assert((vectorSize % U) == 0);
@ -565,7 +605,7 @@ struct ComputeBatchMeanAndInvStdDev
auto bdim = dim3(BlockDimX, BlockDimY);
// Create grid with only one block in y(batch)-dimension as kernel uses striding.
auto gdim = dim3(static_cast<unsigned int>(RoundUpToMultiple(vectorSize, BlockDimX * U)));
kComputeBatchMeanAndInvStdDev<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
kComputeBatchMeanAndInvStdDev<BlockDimX, BlockDimY, U, ElemType, StatType><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(batchSize),
x, expAvgFactor, blendFactor, runMean, runVariance, epsilon, xMean, xInvStdDev);
}
@ -574,10 +614,10 @@ struct ComputeBatchMeanAndInvStdDev
template <int U>
struct ComputeSpatialBatchMeanAndInvStdDev
{
template <typename ElemType>
template <typename ElemType, typename StatType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, const ElemType* x,
double expAvgFactor, double blendFactor, ElemType* runMean, ElemType* runVariance,
double epsilon, ElemType* xMean, ElemType* xInvStdDev, cudaStream_t stream)
double expAvgFactor, double blendFactor, StatType* runMean, StatType* runVariance,
double epsilon, StatType* xMean, StatType* xInvStdDev, cudaStream_t stream)
{
assert((vectorSize % spatialSize) == 0);
assert((spatialSize % U) == 0);
@ -589,7 +629,7 @@ struct ComputeSpatialBatchMeanAndInvStdDev
// Create grid with only one block in y(batch)-dimension as kernel uses striding.
// Each thread block processes a single whole feature map independently (i.e. reduces over W, H and N dimensions).
auto gdim = dim3(static_cast<unsigned int>(vectorSize / spatialSize));
kComputeSpatialBatchMeanAndInvStdDev<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
kComputeSpatialBatchMeanAndInvStdDev<BlockDimX, BlockDimY, U, ElemType, StatType><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize),
x, expAvgFactor, blendFactor, runMean, runVariance, epsilon, xMean, xInvStdDev);
}
@ -603,14 +643,15 @@ struct ComputeSpatialBatchMeanAndInvStdDev
// or Cx1x1 in convolutional case.
//--------------------------------------------------------------------
template <int BlockDimX, int BlockDimY, bool Spatial, bool NormalizeRunningStats, int U, typename ElemType>
template <int BlockDimX, int BlockDimY, bool Spatial, bool NormalizeRunningStats, int U, typename ElemType, typename StatType>
__global__ void kNormalizeBatchTraining(int vectorSize, int spatialSize, int batchSize,
double epsilon,
const ElemType* x, ElemType* y,
const ElemType* bnScale, const ElemType* bnBias,
const ElemType* runningMean, const ElemType* runningVariance,
const ElemType* batchMean, ElemType* batchInvStdDev)
const StatType* bnScale, const StatType* bnBias,
const StatType* runningMean, const StatType* runningVariance,
const StatType* batchMean, StatType* batchInvStdDev)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
assert(blockDim.x == BlockDimX);
@ -627,10 +668,10 @@ __global__ void kNormalizeBatchTraining(int vectorSize, int spatialSize, int bat
return;
assert(irowBase + U <= vectorSize);
__shared__ ElemType meanS[BlockDimX * U];
__shared__ ElemType invStdDevS[BlockDimX * U];
__shared__ ElemType scaleS[BlockDimX * U];
__shared__ ElemType biasS[BlockDimX * U];
__shared__ comp_t meanS[BlockDimX * U];
__shared__ comp_t invStdDevS[BlockDimX * U];
__shared__ comp_t scaleS[BlockDimX * U];
__shared__ comp_t biasS[BlockDimX * U];
int offs = threadIdx.x * U;
// REVIEW alexeyk: optimize smem usage, reduce transaction count (is it worth it?).
@ -644,8 +685,8 @@ __global__ void kNormalizeBatchTraining(int vectorSize, int spatialSize, int bat
int imap = (irowBase + k) / spatialSize;
meanS[offs + k] = NormalizeRunningStats ? runningMean[imap] : batchMean[imap];
invStdDevS[offs + k] = NormalizeRunningStats
? Operations::RSqrt(static_cast<ElemType>(runningVariance[imap] + epsilon))
: batchInvStdDev[imap];
? Operations::RSqrt(static_cast<comp_t>((comp_t)runningVariance[imap] + (comp_t)epsilon))
: (comp_t)batchInvStdDev[imap];
scaleS[offs + k] = bnScale[imap];
biasS[offs + k] = bnBias[imap];
}
@ -657,18 +698,18 @@ __global__ void kNormalizeBatchTraining(int vectorSize, int spatialSize, int bat
for (int k = 0; k < U; k++)
{
invStdDevS[offs + k] = NormalizeRunningStats
? Operations::RSqrt(static_cast<ElemType>(runningVariance[irowBase + k] + epsilon))
: batchInvStdDev[irowBase + k];
? Operations::RSqrt(static_cast<comp_t>((comp_t)runningVariance[irowBase + k] + (comp_t)epsilon))
: (comp_t)batchInvStdDev[irowBase + k];
}
LoadValues<U>(bnScale + irowBase, scaleS + offs);
LoadValues<U>(bnBias + irowBase, biasS + offs);
}
}
__syncthreads();
ElemType mean[U];
ElemType invStdDev[U];
ElemType scale[U];
ElemType bias[U];
comp_t mean[U];
comp_t invStdDev[U];
comp_t scale[U];
comp_t bias[U];
LoadValues<U>(meanS + offs, mean);
LoadValues<U>(invStdDevS + offs, invStdDev);
LoadValues<U>(scaleS + offs, scale);
@ -681,7 +722,7 @@ __global__ void kNormalizeBatchTraining(int vectorSize, int spatialSize, int bat
size_t stride = static_cast<size_t>(gridDim.y * BlockDimY) * vectorSize;
for (; icol < batchSize; icol += gridDim.y * BlockDimY, psrc += stride, pdst += stride)
{
ElemType val[U];
comp_t val[U];
LoadValues<U>(psrc, val);
#pragma unroll
for (int k = 0; k < U; k++)
@ -695,13 +736,13 @@ __global__ void kNormalizeBatchTraining(int vectorSize, int spatialSize, int bat
template <int U>
struct NormalizeBatchTraining
{
template <typename ElemType>
template <typename ElemType, typename StatType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, bool spatial,
bool normalizeRunningStats, double epsilon,
const ElemType* x, ElemType* y, // (in, out) data to normalize -> normalized data
const ElemType* bnScale, const ElemType* bnBias, // (in) scale/bias to denormalize with
const ElemType* runningMean, const ElemType* runningVariance, // (in) running mean/variance
const ElemType* batchMean, ElemType* batchInvStdDev, // (in) batch mean/stddev to normalize with
const StatType* bnScale, const StatType* bnBias, // (in) scale/bias to denormalize with
const StatType* runningMean, const StatType* runningVariance, // (in) running mean/variance
const StatType* batchMean, StatType* batchInvStdDev, // (in) batch mean/stddev to normalize with
cudaStream_t stream)
{
assert((vectorSize % U) == 0);
@ -715,14 +756,14 @@ struct NormalizeBatchTraining
if (spatial)
{
if (normalizeRunningStats)
kNormalizeBatchTraining<BlockDimX, BlockDimY, true, true, U><<<gdim, bdim, 0, stream>>>(
kNormalizeBatchTraining<BlockDimX, BlockDimY, true, true, U, ElemType, StatType><<<gdim, bdim, 0, stream>>>(
(int)vectorSize, (int)spatialSize, (int)batchSize,
epsilon,
x, y, bnScale, bnBias,
runningMean, runningVariance,
batchMean, batchInvStdDev);
else
kNormalizeBatchTraining<BlockDimX, BlockDimY, true, false, U><<<gdim, bdim, 0, stream>>>(
kNormalizeBatchTraining<BlockDimX, BlockDimY, true, false, U, ElemType, StatType><<<gdim, bdim, 0, stream>>>(
(int)vectorSize, (int)spatialSize, (int)batchSize,
epsilon,
x, y, bnScale, bnBias,
@ -732,14 +773,14 @@ struct NormalizeBatchTraining
else
{
if (normalizeRunningStats)
kNormalizeBatchTraining<BlockDimX, BlockDimY, false, true, U><<<gdim, bdim, 0, stream>>>(
kNormalizeBatchTraining<BlockDimX, BlockDimY, false, true, U, ElemType, StatType><<<gdim, bdim, 0, stream>>>(
(int)vectorSize, (int)spatialSize, (int)batchSize,
epsilon,
x, y, bnScale, bnBias,
runningMean, runningVariance,
batchMean, batchInvStdDev);
else
kNormalizeBatchTraining<BlockDimX, BlockDimY, false, false, U><<<gdim, bdim, 0, stream>>>(
kNormalizeBatchTraining<BlockDimX, BlockDimY, false, false, U, ElemType, StatType><<<gdim, bdim, 0, stream>>>(
(int)vectorSize, (int)spatialSize, (int)batchSize,
epsilon,
x, y, bnScale, bnBias,
@ -757,10 +798,11 @@ struct NormalizeBatchTraining
// All tensor dimensions and assumptions are the same as in case of forward propagation.
//--------------------------------------------------------------------
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
__global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, const ElemType* x, const ElemType* dy, ElemType* dScale, ElemType* dBias,
const ElemType* savedMean, const ElemType* savedInvStdDev)
template <int BlockDimX, int BlockDimY, int U, typename ElemType, typename StatType>
__global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, const ElemType* x, const ElemType* dy, StatType* dScale, StatType* dBias,
const StatType* savedMean, const StatType* savedInvStdDev)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
static_assert(((BlockDimY - 1) & BlockDimY) == 0, "BlockDimY must be a power of 2.");
@ -777,10 +819,10 @@ __global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, con
return;
assert(irowSrcBase + U <= vectorSize);
ElemType mean[U];
ElemType invStdDev[U];
__shared__ ElemType meanS[BlockDimX * U];
__shared__ ElemType invStdDevS[BlockDimX * U];
comp_t mean[U];
comp_t invStdDev[U];
__shared__ comp_t meanS[BlockDimX * U];
__shared__ comp_t invStdDevS[BlockDimX * U];
// Read mean and inv std dev.
if (threadIdx.y == 0)
{
@ -796,8 +838,8 @@ __global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, con
LoadValues<U>(&invStdDevS[threadIdx.x * U], invStdDev);
}
ElemType ds[U];
ElemType db[U];
comp_t ds[U];
comp_t db[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
@ -813,21 +855,21 @@ __global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, con
// Stride over all vectors in the batch.
for (; icolSrc < batchSize; icolSrc += BlockDimY, px += stride, pdy += stride)
{
ElemType curX[U];
ElemType curdY[U];
comp_t curX[U];
comp_t curdY[U];
LoadValues<U>(px, curX);
LoadValues<U>(pdy, curdY);
#pragma unroll
for (int k = 0; k < U; k++)
{
ds[k] += pdy[k] * (curX[k] - mean[k]) * invStdDev[k];
db[k] += pdy[k];
ds[k] += (comp_t)pdy[k] * (curX[k] - mean[k]) * invStdDev[k];
db[k] += (comp_t)pdy[k];
}
}
// Final reduction.
__shared__ ElemType dsS[BlockDimY][BlockDimX * U];
__shared__ ElemType dbS[BlockDimY][BlockDimX * U];
__shared__ comp_t dsS[BlockDimY][BlockDimX * U];
__shared__ comp_t dbS[BlockDimY][BlockDimX * U];
StoreValues<U>(ds, &dsS[threadIdx.y][threadIdx.x * U]);
StoreValues<U>(db, &dbS[threadIdx.y][threadIdx.x * U]);
__syncthreads();
@ -861,10 +903,11 @@ __global__ void kComputeScaleAndBiasGradients(int vectorSize, int batchSize, con
}
}
template <int BlockDimX, int BlockDimY, int U, typename ElemType>
template <int BlockDimX, int BlockDimY, int U, typename ElemType, typename StatType>
__global__ void kComputeSpatialScaleAndBiasGradients(int vectorSize, int spatialSize, int batchSize, const ElemType* x, const ElemType* dy,
ElemType* dScale, ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev)
StatType* dScale, StatType* dBias, const StatType* savedMean, const StatType* savedInvStdDev)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
assert(blockDim.x == BlockDimX);
@ -881,10 +924,10 @@ __global__ void kComputeSpatialScaleAndBiasGradients(int vectorSize, int spatial
assert(irowBase + U <= vectorSize);
int irowLim = (blockIdx.x + 1) * spatialSize;
ElemType mean;
ElemType invStdDev;
__shared__ ElemType meanS;
__shared__ ElemType invStdDevS;
comp_t mean;
comp_t invStdDev;
__shared__ comp_t meanS;
__shared__ comp_t invStdDevS;
const int tid = threadIdx.y * BlockDimX + threadIdx.x;
// Read mean and inv std dev.
if (tid == 0)
@ -899,8 +942,8 @@ __global__ void kComputeSpatialScaleAndBiasGradients(int vectorSize, int spatial
invStdDev = invStdDevS;
}
ElemType ds[U];
ElemType db[U];
comp_t ds[U];
comp_t db[U];
#pragma unroll
for (int k = 0; k < U; k++)
{
@ -921,25 +964,25 @@ __global__ void kComputeSpatialScaleAndBiasGradients(int vectorSize, int spatial
// Stride over all values in feature map (W and H dimensions).
for (int irow = irowBase; irow < irowLim; irow += BlockDimX * U, px += BlockDimX * U, pdy += BlockDimX * U)
{
ElemType curX[U];
ElemType curdY[U];
comp_t curX[U];
comp_t curdY[U];
LoadValues<U>(px, curX);
LoadValues<U>(pdy, curdY);
#pragma unroll
for (int k = 0; k < U; k++)
{
ds[k] += pdy[k] * (curX[k] - mean) * invStdDev;
db[k] += pdy[k];
ds[k] += (comp_t)pdy[k] * (curX[k] - mean) * invStdDev;
db[k] += (comp_t)pdy[k];
}
}
}
__syncthreads();
using BlockReduce = cub::BlockReduce<ElemType, BlockDimX, cub::BLOCK_REDUCE_WARP_REDUCTIONS, BlockDimY>;
using BlockReduce = cub::BlockReduce<comp_t, BlockDimX, cub::BLOCK_REDUCE_WARP_REDUCTIONS, BlockDimY>;
// Note: must use separate temp storages for each reduction.
__shared__ typename BlockReduce::TempStorage tmp1;
ElemType dsRes = BlockReduce(tmp1).Sum(ds);
comp_t dsRes = BlockReduce(tmp1).Sum(ds);
__shared__ typename BlockReduce::TempStorage tmp2;
ElemType dbRes = BlockReduce(tmp2).Sum(db);
comp_t dbRes = BlockReduce(tmp2).Sum(db);
if (tid == 0)
{
dScale[blockIdx.x] = dsRes;
@ -950,9 +993,9 @@ __global__ void kComputeSpatialScaleAndBiasGradients(int vectorSize, int spatial
template <int U>
struct ComputeScaleAndBiasGradients
{
template <typename ElemType>
template <typename ElemType, typename StatType>
static void Call(size_t vectorSize, size_t batchSize, const ElemType* x, const ElemType* dy,
ElemType* dScale, ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev, cudaStream_t stream)
StatType* dScale, StatType* dBias, const StatType* savedMean, const StatType* savedInvStdDev, cudaStream_t stream)
{
assert((vectorSize % U) == 0);
assert(batchSize >= 1);
@ -961,7 +1004,7 @@ struct ComputeScaleAndBiasGradients
auto bdim = dim3(BlockDimX, BlockDimY);
// Create a grid that has uses striding in y-dimension to cover whole minibatch.
auto gdim = dim3(static_cast<unsigned int>(RoundUpToMultiple(vectorSize, BlockDimX * U)));
kComputeScaleAndBiasGradients<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
kComputeScaleAndBiasGradients<BlockDimX, BlockDimY, U, ElemType, StatType><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(batchSize), x, dy, dScale, dBias, savedMean, savedInvStdDev);
}
};
@ -969,9 +1012,9 @@ struct ComputeScaleAndBiasGradients
template <int U>
struct ComputeSpatialScaleAndBiasGradients
{
template <typename ElemType>
template <typename ElemType, typename StatType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, const ElemType* x, const ElemType* dy,
ElemType* dScale, ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev, cudaStream_t stream)
StatType* dScale, StatType* dBias, const StatType* savedMean, const StatType* savedInvStdDev, cudaStream_t stream)
{
assert((spatialSize % U) == 0);
assert((vectorSize % spatialSize) == 0);
@ -982,17 +1025,18 @@ struct ComputeSpatialScaleAndBiasGradients
auto bdim = dim3(BlockDimX, BlockDimY);
// Create a grid that has uses striding in y-dimension to cover whole minibatch.
auto gdim = dim3(static_cast<unsigned int>(vectorSize / spatialSize));
kComputeSpatialScaleAndBiasGradients<BlockDimX, BlockDimY, U><<<gdim, bdim, 0, stream>>>(
kComputeSpatialScaleAndBiasGradients<BlockDimX, BlockDimY, U, ElemType, StatType><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dScale, dBias, savedMean, savedInvStdDev);
}
};
// mbStatsWeight is the weight with which current MB's stats were used (0 means not at all, locked model).
template <int BlockDimX, int BlockDimY, bool Spatial, int U, typename ElemType>
template <int BlockDimX, int BlockDimY, bool Spatial, int U, typename ElemType, typename StatType>
__global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize, int batchSize, const ElemType* x, const ElemType* dy, ElemType* dx,
const ElemType* bnScale, ElemType mbStatsWeight, const ElemType* dScale, const ElemType* dBias,
const ElemType* savedMean, const ElemType* savedInvStdDev)
const StatType* bnScale, StatType mbStatsWeight, const StatType* dScale, const StatType* dBias,
const StatType* savedMean, const StatType* savedInvStdDev)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
static_assert(BlockDimX * U == CUB_PTX_WARP_THREADS, "BlockDimX * U must be equal to warp size (32).");
static_assert((BlockDimX * BlockDimY % CUB_PTX_WARP_THREADS) == 0, "Block size must be a multiple of warp size (32).");
assert(blockDim.x == BlockDimX);
@ -1008,11 +1052,11 @@ __global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize
if (irowBase >= vectorSize)
return;
assert(irowBase + U <= vectorSize);
ElemType scale[U];
ElemType ds[U];
ElemType db[U];
ElemType mean[U];
ElemType invStdDev[U];
comp_t scale[U];
comp_t ds[U];
comp_t db[U];
comp_t mean[U];
comp_t invStdDev[U];
// REVIEW alexeyk: here we're wasting some bandwidth but this might be ok as it's a one-timer.
if (Spatial)
{
@ -1044,9 +1088,9 @@ __global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize
size_t stride = static_cast<size_t>(gridDim.y * BlockDimY) * vectorSize;
for (; icol < batchSize; icol += gridDim.y * BlockDimY, px += stride, pdy += stride, pdx += stride)
{
ElemType xCur[U];
ElemType dyCur[U];
ElemType dxCur[U];
comp_t xCur[U];
comp_t dyCur[U];
comp_t dxCur[U];
LoadValues<U>(px, xCur);
LoadValues<U>(pdy, dyCur);
LoadValues<U>(pdx, dxCur);
@ -1060,12 +1104,12 @@ __global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize
// dBias = Reduce(dy)
// dScale = Reduce(dy * xHat)
// Simplifying this a bit more, we get the formula below.
ElemType val[U];
comp_t val[U];
int m = Spatial ? batchSize * spatialSize : batchSize;
#pragma unroll
for (int k = 0; k < U; k++)
{
ElemType xNorm = (xCur[k] - mean[k]) * invStdDev[k]; // xHat
comp_t xNorm = (xCur[k] - mean[k]) * invStdDev[k]; // xHat
// scale * invStdDev * (
// dL/dyi
// - mbStatsWeight * (xHat * dL/dScale + dL/dBias) / m
@ -1073,7 +1117,7 @@ __global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize
val[k] = dxCur[k] // (adding to gradient)
+ (scale[k] * invStdDev[k]) * (
dyCur[k]
- mbStatsWeight * (xNorm * ds[k] + db[k]) / m);
- (comp_t)mbStatsWeight * (xNorm * ds[k] + db[k]) / m);
}
StoreValues<U>(val, pdx);
}
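In symbols, the per-element update in the loop above is the standard batch-normalization backward pass. Writing gamma for bnScale, sigma^{-1} for invStdDev, w for mbStatsWeight and m for the reduction size (batchSize, times spatialSize in spatial mode):

\[
\hat{x}_i = (x_i - \mu)\,\sigma^{-1}, \qquad
d\gamma = \sum_{i=1}^{m} \frac{\partial L}{\partial y_i}\,\hat{x}_i, \qquad
d\beta = \sum_{i=1}^{m} \frac{\partial L}{\partial y_i},
\]
\[
\frac{\partial L}{\partial x_i} \mathrel{+}= \gamma\,\sigma^{-1}\left(\frac{\partial L}{\partial y_i} - \frac{w}{m}\bigl(\hat{x}_i\,d\gamma + d\beta\bigr)\right).
\]

For w = 1 the minibatch statistics are fully trusted and this is the usual formula; for w = 0 (locked statistics) the correction term vanishes and only the pass-through term gamma * sigma^{-1} * dL/dy remains.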
@ -1082,10 +1126,10 @@ __global__ void kBackpropagateBatchNormGradients(int vectorSize, int spatialSize
template <int U>
struct BackpropagateBatchNormGradients
{
template <typename ElemType>
template <typename ElemType, typename StatType>
static void Call(size_t vectorSize, size_t spatialSize, size_t batchSize, bool spatial, const ElemType* x, const ElemType* dy, ElemType* dx,
const ElemType* bnScale, ElemType mbStatsWeight, const ElemType* dScale,
const ElemType* dBias, const ElemType* savedMean, const ElemType* savedInvStdDev, cudaStream_t stream)
const StatType* bnScale, StatType mbStatsWeight, const StatType* dScale,
const StatType* dBias, const StatType* savedMean, const StatType* savedInvStdDev, cudaStream_t stream)
{
assert((vectorSize % U) == 0);
assert(batchSize >= 1);
@ -1096,7 +1140,7 @@ struct BackpropagateBatchNormGradients
static_cast<unsigned int>(RoundUpToMultiple(batchSize, BlockDimY)));
if (spatial)
{
kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, true/*spatial*/, U><<<gdim, bdim, 0, stream>>>(
kBackpropagateBatchNormGradients<BlockDimX, BlockDimY, true/*spatial*/, U, ElemType, StatType><<<gdim, bdim, 0, stream>>>(
static_cast<int>(vectorSize), static_cast<int>(spatialSize), static_cast<int>(batchSize), x, dy, dx, bnScale, mbStatsWeight, dScale, dBias, savedMean, savedInvStdDev);
}
else

View file

@ -51,6 +51,11 @@ namespace Microsoft { namespace MSR { namespace CNTK {
MATH_API void SetMathLibTraceLevel(int traceLevel);
MATH_API int GetMathLibTraceLevel();
inline bool IsGpu(DEVICEID_TYPE deviceId)
{
return deviceId > CPUDEVICE;
}
class MATH_API TracingGPUMemoryAllocator
{
private:

View file

@ -1,5 +1,6 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
// Licensed under the MIT license. See LICENSE.md file in the project root for full license information.
//
@ -8,6 +9,7 @@
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <math_constants.h>
#include "half.hpp"
namespace Microsoft { namespace MSR { namespace CNTK {
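The kernels below, like the batch-normalization kernels above, route their arithmetic through TypeSelector<ElemType>::comp_t. That alias is defined elsewhere in the tree and does not appear in this diff; assuming its only job is to accumulate half inputs in float while leaving float and double untouched, a minimal sketch of the idea (illustrative names only) would be:

    // Sketch only; the real definition lives in the CNTK headers, not in this commit.
    #include <cuda_fp16.h>

    template <typename ElemType>
    struct TypeSelectorSketch
    {
        typedef ElemType comp_t;   // float and double compute in their own precision
    };

    template <>
    struct TypeSelectorSketch<__half>
    {
        typedef float comp_t;      // half inputs accumulate in float
    };

    // Kernel-side usage then mirrors the pattern in this diff:
    //     typedef typename TypeSelectorSketch<ElemType>::comp_t comp_t;
    //     comp_t sum = 0;
    //     sum += (comp_t)kernel[i] * (comp_t)src[j];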
@ -28,6 +30,7 @@ __global__ void kConvolutionForward(int batchSize, const ElemType* __restrict__
const ElemType* __restrict__ src, int srcVecSize,
ElemType* dst, int dstVecSize)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= dstVecSize)
return;
@ -41,7 +44,7 @@ __global__ void kConvolutionForward(int batchSize, const ElemType* __restrict__
int ivBase = mpRowIwht[row];
assert(0 <= colBase && colBase < srcVecSize);
ElemType sum = 0;
comp_t sum = 0;
int i0 = mpRowRun[row];
int skip = runs[i0++];
int size = runs[i0++];
@ -52,7 +55,7 @@ __global__ void kConvolutionForward(int batchSize, const ElemType* __restrict__
continue;
int dcol = runs[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < srcVecSize);
sum += kernel[ivBase + skip + i] * src[colBase + dcol];
sum += (comp_t)kernel[ivBase + skip + i] * (comp_t)src[colBase + dcol];
}
dst[row] = sum;
@ -68,6 +71,7 @@ __global__ void kConvolutionBackwardData(int batchSize, const ElemType* __restri
const ElemType* __restrict__ srcGrad, int srcVecSize,
ElemType* grad, int dstVecSize)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= srcVecSize)
return;
@ -81,7 +85,7 @@ __global__ void kConvolutionBackwardData(int batchSize, const ElemType* __restri
int ivBase = mpRowIwht[row];
assert(0 <= colBase && colBase < dstVecSize);
ElemType g = srcGrad[row];
comp_t g = srcGrad[row];
int i0 = mpRowRun[row];
int skip = runs[i0++];
int size = runs[i0++];
@ -92,7 +96,7 @@ __global__ void kConvolutionBackwardData(int batchSize, const ElemType* __restri
continue;
int dcol = runs[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < dstVecSize);
atomicAdd(&grad[colBase + dcol], g * kernel[ivBase + skip + i]);
atomicAdd(&grad[colBase + dcol], (ElemType)((comp_t)g * (comp_t)kernel[ivBase + skip + i]));
}
srcGrad += blockDim.y * srcVecSize;
@ -108,6 +112,7 @@ __global__ void kConvolutionBackwardKernel(int batchSize, int inVecSize, int out
const ElemType* __restrict__ srcGrad,
ElemType* kernelGrad)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= outVecSize)
return;
@ -121,7 +126,7 @@ __global__ void kConvolutionBackwardKernel(int batchSize, int inVecSize, int out
int ivBase = mpRowIwht[row];
assert(0 <= colBase && colBase < inVecSize);
ElemType g = srcGrad[row];
comp_t g = srcGrad[row];
int i0 = mpRowRun[row];
int skip = runs[i0++];
int size = runs[i0++];
@ -132,7 +137,7 @@ __global__ void kConvolutionBackwardKernel(int batchSize, int inVecSize, int out
continue;
int dcol = runs[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < inVecSize);
atomicAdd(&kernelGrad[ivBase + skip + i], g * in[colBase + dcol]);
atomicAdd(&kernelGrad[ivBase + skip + i], (ElemType)((comp_t)g * (comp_t)in[colBase + dcol]));
}
in += blockDim.y * inVecSize;
@ -233,7 +238,7 @@ __device__ double round_(double a)
// corresponding to the ROI and which pixels in that subset should go into the
// output location, then takes the max value over that window.
// src: Images [W x H x C x N]
// roiData: ROIs [4 x numROIs x N],
// dst: Pooled ROIs [PW x PH x C x numROIs x N]
// argmax: max positions [PW x PH x C x numROIs x N]
// spatialScale ratio of input feature map to the original image.
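To make the geometry above concrete, the host-side sketch below walks one hypothetical ROI through the same coordinate mapping: the ROI arrives in original-image pixels, is scaled by spatialScale into feature-map cells, and each of the pooledWidth x pooledHeight output bins then covers a winW x winH window of those cells. The kernel additionally offsets by the ROI origin and clamps to the feature map; the exact rounding below is only illustrative.

    // Illustrative values only; rounding and clamping details follow the kernel loosely.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>

    int main()
    {
        const double spatialScale = 1.0 / 16.0;   // feature map is 1/16 of the image
        const int pooledW = 7, pooledH = 7;

        // One ROI in original-image pixels: (x, y, w, h).
        const double roi[4] = {128, 64, 256, 192};

        // Map the ROI into feature-map coordinates.
        int roiStartW = (int)std::round(roi[0] * spatialScale);              // 8
        int roiStartH = (int)std::round(roi[1] * spatialScale);              // 4
        int roiWidth  = std::max((int)std::round(roi[2] * spatialScale), 1); // 16
        int roiHeight = std::max((int)std::round(roi[3] * spatialScale), 1); // 12

        double winW = (double)roiWidth / pooledW;   // ~2.29 cells per output column
        double winH = (double)roiHeight / pooledH;  // ~1.71 cells per output row

        // Feature-map window feeding output bin (ph, pw) = (3, 2), relative to the ROI origin.
        int ph = 3, pw = 2;
        int hstart = (int)(ph * winH), hend = (int)std::ceil((ph + 1) * winH);
        int wstart = (int)(pw * winW), wend = (int)std::ceil((pw + 1) * winW);
        printf("bin (%d,%d) pools rows [%d,%d) x cols [%d,%d) of the ROI (offset by %d,%d)\n",
               ph, pw, hstart, hend, wstart, wend, roiStartH, roiStartW);
        return 0;
    }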
@ -245,6 +250,7 @@ __global__ void kMaxROIPoolingForward(const int totalIterations,
const int pooledWidth, const int pooledHeight, const ElemType* src,
const ElemType* roiData, ElemType* dst, ElemType* argmax, double spatialScale)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
// index loops over all totalRois*c*pooledHeight*pooledWidth output locations.
for (int index = blockIdx.x * blockDim.x + threadIdx.x;
index < (totalIterations); index += blockDim.x * gridDim.x)
@ -269,8 +275,8 @@ __global__ void kMaxROIPoolingForward(const int totalIterations,
int roiWidth = max(roiEndW - roiStartW + 1, (int)1);
int roiHeight = max(roiEndH - roiStartH + 1, (int)1);
ElemType winH = (ElemType)roiHeight / (ElemType)pooledHeight;
ElemType winW = (ElemType)roiWidth / (ElemType)pooledWidth;
comp_t winH = (comp_t)roiHeight / (comp_t)pooledHeight;
comp_t winW = (comp_t)roiWidth / (comp_t)pooledWidth;
// compute window for this output location.
int hstart = (int)(ph * winH);
@ -286,7 +292,7 @@ __global__ void kMaxROIPoolingForward(const int totalIterations,
bool isempty = (hend <= hstart) || (wend <= wstart);
// Define an empty pooling region to be zero
ElemType maxval = isempty ? (ElemType)0 : -CUDART_INF_F;
comp_t maxval = isempty ? (comp_t)0 : (comp_t)-CUDART_INF_F;
int maxidx = -1;
int imgIdx = n / numROIs;
@ -319,6 +325,7 @@ __global__ void kMaxROIPoolingBackward(const int totalIterations,
const int pooledWidth, const int pooledHeight, const ElemType* pooledGrad,
const ElemType* roiData, ElemType* grad, const ElemType* argmax, double spatialScale)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
// index loops over all input locations (locations in the original input tensor).
for (int index = blockIdx.x * blockDim.x + threadIdx.x;
index < (totalIterations); index += blockDim.x * gridDim.x)
@ -334,7 +341,7 @@ __global__ void kMaxROIPoolingBackward(const int totalIterations,
int roiMin = n * numROIs;
int roiMax = (n + 1) * numROIs;
ElemType gradient = 0;
comp_t gradient = 0;
for (int roiN = roiMin; roiN < roiMax; roiN++)
{
// each ROI is 4 elements: (x, y, w, h)
@ -355,8 +362,8 @@ __global__ void kMaxROIPoolingBackward(const int totalIterations,
if (!inROI)
continue;
ElemType winH = (ElemType)roiHeight / (ElemType)pooledHeight;
ElemType winW = (ElemType)roiWidth / (ElemType)pooledWidth;
comp_t winH = (comp_t)roiHeight / (comp_t)pooledHeight;
comp_t winW = (comp_t)roiWidth / (comp_t)pooledWidth;
// what pooled nodes in the output for this ROI could have pooled this input location?
// we use int here since the computation can yield a negative result
@ -381,13 +388,13 @@ __global__ void kMaxROIPoolingBackward(const int totalIterations,
{
if ((int)offsetArgmax[ph * pooledWidth + pw] == (h * width + w))
{
gradient += offsetPoolGrad[ph * pooledWidth + pw];
gradient += (comp_t)offsetPoolGrad[ph * pooledWidth + pw];
}
}
}
}
atomicAdd(&grad[index], gradient);
atomicAdd(&grad[index], (ElemType)gradient);
}
}
@ -443,6 +450,7 @@ __global__ void kAveragePoolingForward(int batchSize, const int* mpRowCol, const
const ElemType* __restrict__ src, int srcVecSize,
ElemType* dst, int dstVecSize)
{
typedef typename TypeSelector<ElemType>::comp_t comp_t;
int row = blockIdx.x * blockDim.x + threadIdx.x;
if (row >= dstVecSize)
return;
@ -457,14 +465,14 @@ __global__ void kAveragePoolingForward(int batchSize, const int* mpRowCol, const
int i0 = mpRowIndices[row];
int size = indices[i0++];
ElemType sum = 0;
comp_t sum = 0;
for (int i = 0; i < size; i++)
{
int dcol = indices[i0 + i];
assert(0 <= colBase + dcol && colBase + dcol < srcVecSize);
sum += src[colBase + dcol];
sum += (comp_t)src[colBase + dcol];
}
dst[row] = sum / size;
dst[row] = sum / (comp_t)size;
src += blockDim.y * srcVecSize;
dst += blockDim.y * dstVecSize;

View file

@ -6,7 +6,7 @@
#include "stdafx.h"
#include "ConvolutionEngine.h"
#include "CuDnnFactories.h"
#include "Mkl2017DnnCommon.h"
#include "MklDnnCommon.h"
namespace Microsoft { namespace MSR { namespace CNTK {
@ -131,7 +131,7 @@ void ConvolutionEngine<ElemType>::MaxUnpooling(const Mat& out, const Mat& poolIn
//------------------------------------------------------------------
// Reference convolution engine implementation.
// This engine supports arbitrary convolution geometry but does not provide efficient implementation.
// Its main purpose is to serve as a baseline for optimized engines (e.g. cuDNN) that
// usually implement only a subset of a general convolution geometry.
//------------------------------------------------------------------
template <class ElemType>
@ -169,11 +169,11 @@ protected:
if (m_mpRowIwht == nullptr)
{
auto flags = IsGpu(m_deviceId) ? matrixFlagNormal : matrixFlagDontOwnBuffer;
m_mpRowIwht = std::make_unique<Matrix<int>>(m_geometry->MpRowIwht().size(), 1,
const_cast<int*>(m_geometry->MpRowIwht().data()), m_deviceId, flags);
m_mpRowRun = std::make_unique<Matrix<int>>(m_geometry->MpRowRun().size(), 1,
const_cast<int*>(m_geometry->MpRowRun().data()), m_deviceId, flags);
m_runs = std::make_unique<Matrix<int>>(m_geometry->Runs().size(), 1,
const_cast<int*>(m_geometry->Runs().data()), m_deviceId, flags);
}
}
@ -239,12 +239,6 @@ protected:
out.MaxUnpooling(m_mpRowCol, *m_mpRowIndices, *m_indices, poolIn, in);
}
protected:
static bool IsGpu(DEVICEID_TYPE deviceId)
{
return deviceId >= 0;
}
protected:
using IntMatPtr = std::unique_ptr<Matrix<int>>;
// IMP NOTE: Make sure that in the declaration below m_isConvGeometryComputed is declared
@ -273,7 +267,7 @@ public:
public:
LegacyConvolutionEngine(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId, ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, PoolKind poolKind, bool poolIncludePad)
: Base(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind, poolIncludePad),
m_inT(m_geometry->InputShape(), ImageLayoutKind::CHW), m_outT(m_geometry->OutputShape(), ImageLayoutKind::CHW),
m_kernelT(m_geometry->KernelShape(), ImageLayoutKind::CHW), m_strideT(m_geometry->Stride(), ImageLayoutKind::CHW)
{
@ -559,7 +553,7 @@ private:
//------------------------------------------------------------------
// GEMM convolution engine implementation.
// This engine supports arbitrary convolution configuration with full
// sharing and implemented using unroll + GEMM technique
// (High performance convolutional neural networks for document processing; Chellapilla, Puri, Simard)
// Uses reference engine for pooling operations.
//------------------------------------------------------------------
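To illustrate the unroll + GEMM idea cited above: the receptive field of every output location is copied into one row of an unrolled matrix, after which the whole convolution collapses into a single matrix product. A self-contained toy sketch for one channel, one output map, stride 1 and no padding (not CNTK code):

    // 2-D convolution as unroll (im2col) + GEMM, row-major storage throughout.
    #include <vector>
    #include <cstdio>

    int main()
    {
        const int H = 4, W = 4, KH = 3, KW = 3;
        const int OH = H - KH + 1, OW = W - KW + 1;          // 2 x 2 output

        std::vector<float> in(H * W), kernel(KH * KW, 1.0f); // 3x3 box filter
        for (int i = 0; i < H * W; i++) in[i] = (float)i;

        // 1. Unroll: each output location becomes one row of [OH*OW x KH*KW].
        std::vector<float> unrolled(OH * OW * KH * KW);
        for (int oh = 0; oh < OH; oh++)
            for (int ow = 0; ow < OW; ow++)
                for (int kh = 0; kh < KH; kh++)
                    for (int kw = 0; kw < KW; kw++)
                        unrolled[(oh * OW + ow) * KH * KW + kh * KW + kw] = in[(oh + kh) * W + (ow + kw)];

        // 2. GEMM: [OH*OW x KH*KW] * [KH*KW x 1] -> [OH*OW x 1].
        std::vector<float> out(OH * OW, 0.0f);
        for (int r = 0; r < OH * OW; r++)
            for (int c = 0; c < KH * KW; c++)
                out[r] += unrolled[r * KH * KW + c] * kernel[c];

        for (int r = 0; r < OH * OW; r++) printf("%g ", out[r]);
        printf("\n"); // prints: 45 54 81 90 for this input
        return 0;
    }

The engine below applies the same idea per sub-batch with multiple input and output channels, reserving workspace for the unrolled inputs and, when subBatchSize > 1, for an intermediate output that is transposed into the final layout.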
@ -579,8 +573,6 @@ public:
protected:
using typename Base::IntMatPtr;
using Base::IsGpu;
using Base::m_geometry;
using Base::m_deviceId;
using Base::m_imageLayout;
@ -600,7 +592,7 @@ protected:
LogicError("GEMM convolution engine currently supports only CPU device.");
}
// A note on notation used in the documentation for the next 3 functions:
// for simplicity we use cuDNN-style notation for 2D convolutions (though this engine supports arbitrary convolution configuration)
// where N - is the number of samples in a batch, C, H, W are number of channels, height and width of the input respectively.
// For the output we use K as the number of output feature maps and H', W' as height and width of the output.
@ -632,7 +624,7 @@ protected:
size_t mapOutSize = m_geometry->OutputShape().GetNumElements() / mapCount;
size_t unrollRows = mapOutSize * subBatchSize;
size_t unrollCols = m_geometry->KernelShape().GetNumElements();
// Reserve space for unrolled inputs and, if needed, intermediate outputs.
// Intermediate outputs will be transposed to final outputs after GEMM operation.
// Transpose is not required if subBatchSize == 1.
workspace.Resize(unrollRows, unrollCols + (subBatchSize > 1 ? mapCount : 0));
@ -682,7 +674,7 @@ protected:
}
}
}
// The backward data method works by representing this operation as a "reverse" convolution
// in case kernel's last dimension is equal to input dimension. Gradients matrix (grad) becomes
// an output of such reverse convolution.
@ -734,7 +726,7 @@ protected:
workspace.Resize(1, kernCols + unrollRows * (unrollCols + (subBatchSize > 1 ? mapInCount : 0)));
auto kern = kernel.ColumnSlice(0, kernel.GetNumCols());
size_t kernTCols = kernT.GetNumElements();
// cudnn layout uses row-major kernel weight matrix.
kern.Reshape(kernTCols, kernCols/kernTCols);
// Now transpose and reshape to [KXY x C].
@ -862,7 +854,7 @@ protected:
// cudnn layout uses row-major kernel weight matrix.
auto kernGrad = kernelGrad.ColumnSlice(0, kernelGrad.GetNumCols());
kernGrad.Reshape(unrollRows, kernGrad.GetNumElements() / unrollRows);
// 3. Multiply.
Mat::MultiplyAndAdd(unrolledInputSlice, true, srcGradSlice, false, kernGrad);
}
@ -955,6 +947,11 @@ protected:
bool Supported(const ConvolveGeometry* geometry, bool forward)
{
//TODO: test code for linking with mkldnn.dll, will extend to support dilated convolution with MKL-DNN later
mkldnn_primitive_attr_t attr;
mkldnn_primitive_attr_create(&attr);
mkldnn_primitive_attr_destroy(attr);
//MKL2017 does not support asymmetric padding yet
if (geometry->IsAsymmetricPadding()) return false;
@ -1125,7 +1122,7 @@ public:
template <class ElemType>
std::unique_ptr<ConvolutionEngine<ElemType>> ConvolutionEngine<ElemType>::Create(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, PoolKind poolKind,
ConvolutionEngineKind enabledEngines, std::wstring logPrefix,
bool forceDeterministicAlgorithms, bool poolIncludePad,
bool inputHasFreeDimension)
{
@ -1134,7 +1131,7 @@ std::unique_ptr<ConvolutionEngine<ElemType>> ConvolutionEngine<ElemType>::Create
auto isEnabled = [=](ConvolutionEngineKind eng) { return ((int)enabledEngines & (int)eng) != 0; };
// Note: in some cases do not throw exception even if parameters do not match as Create
// can be called from places like MEL with default parameters and never be used.
// The check will be done later in engine's EnsureCompatible call if the engine is actually used.
auto engStr = (std::string)(*geometry);
// Only legacy engine supports HWC layout.
@ -1156,7 +1153,7 @@ std::unique_ptr<ConvolutionEngine<ElemType>> ConvolutionEngine<ElemType>::Create
if (GetMathLibTraceLevel() > 0)
fprintf(stderr, "%lsusing cuDNN convolution engine for geometry: %s.\n", logPrefix.c_str(), engStr.c_str());
return CuDnnConvolutionEngineFactory<ElemType>::Create(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind,
forceDeterministicAlgorithms, poolIncludePad, inputHasFreeDimension);
}
@ -1177,7 +1174,41 @@ std::unique_ptr<ConvolutionEngine<ElemType>> ConvolutionEngine<ElemType>::Create
return std::make_unique<ReferenceConvolutionEngine<ElemType>>(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind, poolIncludePad);
}
// only GPU supports fp16 convolution
template <>
std::unique_ptr<ConvolutionEngine<half>> ConvolutionEngine<half>::Create(ConvolveGeometryPtr geometry, DEVICEID_TYPE deviceId,
ImageLayoutKind imageLayout, size_t maxTempMemSizeInSamples, PoolKind poolKind,
ConvolutionEngineKind enabledEngines, std::wstring logPrefix,
bool forceDeterministicAlgorithms, bool poolIncludePad,
bool inputHasFreeDimension)
{
if (!logPrefix.empty())
logPrefix += L": ";
auto isEnabled = [=](ConvolutionEngineKind eng) { return ((int)enabledEngines & (int)eng) != 0; };
// Note: in some cases do not throw exception even if parameters do not match as Create
// can be called from places like MEL with default parameters and never be used.
// The check will be done later in engine's EnsureCompatible call if the engine is actually used.
auto engStr = (std::string)(*geometry);
// Check if we can use cuDNN engine. Do not need to validate tensors as ConvolveGeometry has already done that.
if (isEnabled(ConvolutionEngineKind::CuDnn) &&
CuDnnConvolutionEngineFactory<half>::IsSupported(deviceId, geometry, poolKind))
{
if (GetMathLibTraceLevel() > 0)
fprintf(stderr, "%lsusing cuDNN convolution engine for geometry: %s.\n", logPrefix.c_str(), engStr.c_str());
return CuDnnConvolutionEngineFactory<half>::Create(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind,
forceDeterministicAlgorithms, poolIncludePad, inputHasFreeDimension);
}
RuntimeError("FP16 convolution is only supported via cuDNN.");
return nullptr;
}
template class ConvolutionEngine<float>;
template class ConvolutionEngine<double>;
template class ConvolutionEngine<half>;
}}}

View file

@ -11,20 +11,21 @@
namespace Microsoft { namespace MSR { namespace CNTK {
template <class ElemType>
class CuDnnBatchNormEngine : public BatchNormEngine<ElemType>
template <class InoutType, class StatType>
class CuDnnBatchNormEngine : public BatchNormEngine<InoutType, StatType>
{
public:
using Base = BatchNormEngine<ElemType>;
using typename Base::Mat;
using Base = BatchNormEngine<InoutType, StatType>;
using typename Base::InoutMat;
using typename Base::StatMat;
public:
CuDnnBatchNormEngine(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
bool spatial, ImageLayoutKind imageLayout)
: Base(deviceId, inOutT, spatial, imageLayout),
m_cudnn(CuDnn::Instance()),
m_inOutCuDnnT(GetInOutTensor(inOutT), CuDnnTensor::GetDataType<ElemType>()),
m_scaleBiasCuDnnT(GetScaleBiasTensor(inOutT, spatial), CuDnnTensor::GetDataType<ElemType>()),
m_inOutCuDnnT(GetInOutTensor(inOutT), CuDnnTensor::GetDataType<InoutType>()),
m_scaleBiasCuDnnT(GetScaleBiasTensor(inOutT, spatial), CuDnnTensor::GetDataType<StatType>()),
m_cudnnEpsilon(CUDNN_BN_MIN_EPSILON)
{
}
@ -43,8 +44,8 @@ protected:
InvalidArgument("cuDNN batch normalization supports tensors of max 4 dimensions.");
}
void ForwardCore(const Mat& in, const Mat& scale, const Mat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, Mat& runMean, Mat& runVariance,
Mat& out, double epsilon, Mat& savedMean, Mat& savedInvStdDev) override
void ForwardCore(const InoutMat& in, const StatMat& scale, const StatMat& bias, bool inferenceOnly, double expAvgFactor, double blendFactor, StatMat& runMean, StatMat& runVariance,
InoutMat& out, double epsilon, StatMat& savedMean, StatMat& savedInvStdDev) override
{
// TODO batchSize == 1
@ -53,7 +54,8 @@ protected:
InvalidArgument("cuDNN batch normalization engine currently supports blendTimeConstant of 0 or 1 only.");
m_inOutCuDnnT.UpdateBatchSize(in.GetNumCols());
cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL_PERSISTENT : CUDNN_BATCHNORM_PER_ACTIVATION;
if (inferenceOnly) mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
// cuDNN will fail with BAD_PARAM if epsilon < CUDNN_BN_MIN_EPSILON.
m_cudnnEpsilon = max(epsilon, CUDNN_BN_MIN_EPSILON);
if (inferenceOnly)
@ -75,23 +77,26 @@ protected:
}
}
void BackwardCore(const Mat& in, const Mat& srcGrad, Mat& grad, const Mat& scale, double blendFactor, const Mat& savedMean, const Mat& savedInvStdDev,
Mat& scaleGrad, Mat& biasGrad, bool accumulateDataGrad) override
void BackwardCore(const InoutMat& in, const InoutMat& srcGrad, InoutMat& grad, const StatMat& scale, double blendFactor, const StatMat& savedMean, const StatMat& savedInvStdDev,
StatMat& scaleGrad, StatMat& biasGrad, bool accumulateDataGrad) override
{
UNUSED(blendFactor); // BUGBUG: It should be used.
m_inOutCuDnnT.UpdateBatchSize(srcGrad.GetNumCols());
cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL : CUDNN_BATCHNORM_PER_ACTIVATION;
cudnnBatchNormMode_t mode = m_spatial ? CUDNN_BATCHNORM_SPATIAL_PERSISTENT : CUDNN_BATCHNORM_PER_ACTIVATION;
// REVIEW alexeyk: change betaParamDiff to 1 and update CNTK BN engine.
CUDNN_CALL(cudnnBatchNormalizationBackward(*m_cudnn, mode, &C::One, accumulateDataGrad ? &C::One : &C::Zero, &C::One, &C::Zero, m_inOutCuDnnT, ptr(in), m_inOutCuDnnT, ptr(srcGrad), m_inOutCuDnnT, ptr(grad),
m_scaleBiasCuDnnT, ptr(scale), ptr(scaleGrad), ptr(biasGrad), m_cudnnEpsilon, ptr(savedMean), ptr(savedInvStdDev)));
}
private:
static ElemType* ptr(Mat& src)
template<typename ElemType>
static ElemType* ptr(Matrix<ElemType>& src)
{
return src.Data();
}
static const ElemType* ptr(const Mat& src)
template<typename ElemType>
static const ElemType* ptr(const Matrix<ElemType>& src)
{
return src.Data();
}
@ -123,7 +128,7 @@ private:
}
private:
using C = Consts<ElemType>;
using C = Consts<StatType>;
CuDnn::ptr_t m_cudnn;
CuDnnTensor m_inOutCuDnnT;
@ -131,18 +136,20 @@ private:
double m_cudnnEpsilon;
};
template class CuDnnBatchNormEngine<float>;
template class CuDnnBatchNormEngine<double>;
template class CuDnnBatchNormEngine<float, float>;
template class CuDnnBatchNormEngine<double, double>;
template class CuDnnBatchNormEngine<half, float>;
template <typename ElemType>
std::unique_ptr<BatchNormEngine<ElemType>> CuDnnBatchNormEngineFactory<ElemType>::Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
template <typename InoutType, typename StatType>
std::unique_ptr<BatchNormEngine<InoutType, StatType>> CuDnnBatchNormEngineFactory<InoutType, StatType>::Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
bool spatial, ImageLayoutKind imageLayout)
{
return std::make_unique<CuDnnBatchNormEngine<ElemType>>(deviceId, inOutT, spatial, imageLayout);
return std::make_unique<CuDnnBatchNormEngine<InoutType, StatType>>(deviceId, inOutT, spatial, imageLayout);
}
template class CuDnnBatchNormEngineFactory<float>;
template class CuDnnBatchNormEngineFactory<double>;
template class CuDnnBatchNormEngineFactory<float, float>;
template class CuDnnBatchNormEngineFactory<double, double>;
template class CuDnnBatchNormEngineFactory<half, float>;
CudaTimer::~CudaTimer()
{

View file

@ -6,6 +6,7 @@
#include "stdafx.h"
#include "GPUMatrix.h"
#include "CuDnnCommon.h"
#include "half.hpp"
namespace Microsoft { namespace MSR { namespace CNTK {
#ifndef CPUONLY
@ -23,6 +24,10 @@ const float Consts<float>::Zero = 0;
template <>
const double Consts<double>::Zero = 0;
const float Consts<half>::Zero = 0;
const float Consts<half>::One = 1;
CuDnnTensor::CuDnnTensor()
: m_tensor(nullptr)
{
@ -31,7 +36,7 @@ CuDnnTensor::CuDnnTensor()
CuDnnTensor::CuDnnTensor(const TensorShape& src, cudnnDataType_t dataType)
: m_tensor(nullptr)
{
Set(src, dataType);
}
CuDnnTensor::~CuDnnTensor()
@ -85,12 +90,15 @@ cudnnDataType_t CuDnnTensor::GetDataType()
return CUDNN_DATA_FLOAT;
else if (typeid(ElemType) == typeid(double))
return CUDNN_DATA_DOUBLE;
else if (typeid(ElemType) == typeid(half))
return CUDNN_DATA_HALF;
else
InvalidArgument("cuDNN engine currently supports only single and double precision data types.");
}
template cudnnDataType_t CuDnnTensor::GetDataType<float>();
template cudnnDataType_t CuDnnTensor::GetDataType<double>();
template cudnnDataType_t CuDnnTensor::GetDataType<half>();
CuDnn::ptr_t CuDnn::Instance()
{

View file

@ -12,6 +12,7 @@
#error CNTK requires the NVIDIA cuDNN library 5.0 or higher to build, cf. https://docs.microsoft.com/en-us/cognitive-toolkit/Setup-CNTK-on-Windows#cudnn or https://docs.microsoft.com/en-us/cognitive-toolkit/Setup-CNTK-on-Linux#cudnn for installation instructions.
#endif
#include <memory>
#include "half.hpp"
namespace Microsoft { namespace MSR { namespace CNTK {
@ -51,4 +52,11 @@ struct Consts
static const ElemType One;
};
template <>
struct Consts<half>
{
static const float Zero;
static const float One;
};
} } }

View file

@ -1,4 +1,3 @@
//
// Copyright (c) Microsoft. All rights reserved.
// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved.
@ -11,6 +10,10 @@
#include <typeinfo>
#include <typeindex>
#include "CuDnnCommon.h"
#include "half.hpp"
// We want tensor cores enabled so that the get(v7)/find calls return tensor-core algorithms. If an algorithm without tensor cores is faster, the only way to force that faster algorithm is to turn tensor cores off; since re-tuning can happen quite often in CNTK, that has to be done carefully, and it also requires moving to the get_v7 API, which cannot be tested until fp16 runs.
// For now, keep it simple and enable tensor cores all the time for fp16.
template <>
const char* CudaErrString<cudnnStatus_t>(cudnnStatus_t x)
@ -26,11 +29,6 @@ const char* CudaErrString<cudnnStatus_t>(cudnnStatus_t x)
namespace Microsoft { namespace MSR { namespace CNTK {
static bool IsGpu(DEVICEID_TYPE deviceId)
{
return deviceId >= 0;
}
class CuDnnKernel
{
public:
@ -54,6 +52,9 @@ public:
// Set map count(aka K) dimension.
dims[0] = (int)mapCount;
dims[1] = (int)filt[filt_size - 1];
int numElems = 1;
for(int i=0; i<(int)dim_size;i++) numElems *= dims[i];
m_isOdd = (numElems%2==1);
CUDNN_CALL(cudnnSetFilterNdDescriptor(m_kernel, dataType, FILTER_FORMAT, (int)dim_size, dims.data()));
}
@ -71,10 +72,16 @@ public:
return m_kernel;
}
bool isOdd()
{
return m_isOdd;
}
DISABLE_COPY_AND_MOVE(CuDnnKernel);
private:
cudnnFilterDescriptor_t m_kernel;
bool m_isOdd;
};
class CuDnnConv
@ -102,7 +109,10 @@ public:
}
CUDNN_CALL(cudnnSetConvolutionNdDescriptor(m_conv, (int)dim_size, pad.data(),
stride.data(), dilation.data(),
CUDNN_CROSS_CORRELATION, dataType));
CUDNN_CROSS_CORRELATION, dataType == CUDNN_DATA_HALF ? CUDNN_DATA_FLOAT : dataType));
// allow tensor core for fp16 by default
if(dataType == CUDNN_DATA_HALF)
CUDNN_CALL(cudnnSetConvolutionMathType(m_conv, CUDNN_TENSOR_OP_MATH));
}
~CuDnnConv()
@ -286,7 +296,7 @@ protected:
calgo = 1; // set count of algorithms
return result;
};
// find workspace size needed to auto-tune all algorithms, as well as the size needed for deterministic algorithm
auto workspaceSizeFinder = [&, this]() -> cudnnStatus_t
{
size_t tmpSize;
@ -306,6 +316,8 @@ protected:
return err;
};
FindBestAlgo(batchSize, m_fwdAlgo, workspaceSizeFinder, deterministicFinder, finder, staticFinder, workspace);
if(m_dataType == CUDNN_DATA_HALF) CUDNN_CALL(cudnnSetConvolutionMathType(*m_conv, m_fwdAlgo.AlgoMathType));
else CUDNN_CALL(cudnnSetConvolutionMathType(*m_conv, CUDNN_DEFAULT_MATH));
// Perform forward convolution operation.
CUDNN_CALL(cudnnConvolutionForward(*m_cudnn, &C::One, m_inT, ptr(in), *m_kernelT, ptr(kernel), *m_conv, m_fwdAlgo.selectedAlgo, ptr(workspace), workspace.BufferSize(), &C::Zero, m_outT, ptr(out)));
}
@ -369,6 +381,8 @@ protected:
};
FindBestAlgo(batchSize, m_backDataAlgo, workspaceSizeFinder, deterministicFinder, finder, staticFinder, workspace);
// Compute gradients with respect to the output tensor (data).
if(m_dataType == CUDNN_DATA_HALF) CUDNN_CALL(cudnnSetConvolutionMathType(*m_conv, m_backDataAlgo.AlgoMathType));
else CUDNN_CALL(cudnnSetConvolutionMathType(*m_conv, CUDNN_DEFAULT_MATH));
CUDNN_CALL(cudnnConvolutionBackwardData(*m_cudnn, &C::One, *m_kernelT, ptr(kernel), m_outT, ptr(srcGrad), *m_conv, m_backDataAlgo.selectedAlgo, ptr(workspace), workspace.BufferSize(), accumulateGradient ? &C::One : &C::Zero, m_inT, ptr(grad)));
}
@ -396,6 +410,15 @@ protected:
{
if(!noMem)
return cudnnGetConvolutionBackwardFilterAlgorithm(*m_cudnn, m_inT, m_outT, *m_conv, *m_kernelT, CUDNN_CONVOLUTION_BWD_FILTER_SPECIFY_WORKSPACE_LIMIT, workspace.BufferSize(), &algo);
// special case for half/odd filter
if(m_kernelT->isOdd() && m_dataType == CUDNN_DATA_HALF)
{
size_t tmpSize = 0;
algo = (cudnnConvolutionBwdFilterAlgo_t) 1;
auto err = cudnnGetConvolutionBackwardFilterWorkspaceSize(*m_cudnn, m_inT, m_outT, *m_conv, *m_kernelT, algo, &tmpSize);
workspace.Resize((tmpSize + sizeof(ElemType) - 1) / sizeof(ElemType), 1);
return err;
}
return cudnnGetConvolutionBackwardFilterAlgorithm(*m_cudnn, m_inT, m_outT, *m_conv, *m_kernelT, CUDNN_CONVOLUTION_BWD_FILTER_NO_WORKSPACE, 0, &algo);
};
// find deterministic algorithm
@ -431,6 +454,8 @@ protected:
};
FindBestAlgo(batchSize, m_backFiltAlgo, workspaceSizeFinder, deterministicFinder, finder, staticFinder, workspace);
// Compute gradients with respect to the output tensor (data).
if(m_dataType == CUDNN_DATA_HALF) CUDNN_CALL(cudnnSetConvolutionMathType(*m_conv, m_backFiltAlgo.AlgoMathType));
else CUDNN_CALL(cudnnSetConvolutionMathType(*m_conv, CUDNN_DEFAULT_MATH));
CUDNN_CALL(cudnnConvolutionBackwardFilter(*m_cudnn, &C::One, m_inT, ptr(in), m_outT, ptr(srcGrad), *m_conv, m_backFiltAlgo.selectedAlgo, ptr(workspace), workspace.BufferSize(), accumulateGradient ? &C::One : &C::Zero, *m_kernelT, ptr(kernelGrad)));
}
@ -512,7 +537,7 @@ private:
assert(calgo == 1); // only one deterministic algorithm will be returned
algo.RecordAlgoBatchSizeWorkspaceSize(true, (*algoPerf).algo, batchSize, (*algoPerf).memory);
algo.autotuningState = AutotuningState::Running; // no further need for tuning since this is deterministic, directly enter running state
}
else
{
// This branch handles two cases: a) When first MB comes through, and b) When input has free dimensions.
@ -556,6 +581,7 @@ private:
assert(calgo > 0);
auto res = algoPerf; // first returned algorithm is the fastest
algo.RecordAlgoBatchSizeWorkspaceSize(true, (*res).algo, batchSize, (*res).memory);
algo.AlgoMathType = (*res).mathType;
algo.autotuningState = AutotuningState::Running;
if (algo.MaxAlgoWorkspaceSize < curSize) // need to shrink the workspace
workspace.Resize((curSize + sizeof(ElemType) - 1) / sizeof(ElemType), 1, 0, false);
@ -573,6 +599,7 @@ private:
assert(calgo > 0);
auto res = algoPerf; // first returned algorithm is the fastest
algo.RecordAlgoBatchSizeWorkspaceSize(true, (*res).algo, batchSize, (*res).memory);
algo.AlgoMathType = (*res).mathType;
algo.autotuningState = AutotuningState::Running;
}
catch (...)
@ -613,7 +640,7 @@ private:
{
typedef T typeT;
ConvAlgoInfo()
: LastBatchAlgoMBSize(0), MaxAlgoMBSize(0), maxMBSizeSeen(0), autotuningState(AutotuningState::Init), MaxAlgoWorkspaceSize(0), LastBatchAlgoWorkspaceSize(0)
: LastBatchAlgoMBSize(0), MaxAlgoMBSize(0), maxMBSizeSeen(0), autotuningState(AutotuningState::Init), MaxAlgoWorkspaceSize(0), LastBatchAlgoWorkspaceSize(0), AlgoMathType(CUDNN_TENSOR_OP_MATH)
{
}
// Variables to stores states
@ -631,6 +658,8 @@ private:
decltype(T::algo) selectedAlgo; // currently selected algorithm
decltype(T::algo) maxAlgo; // algorithm that was selected when the current workspace is allocated
cudnnMathType_t AlgoMathType;
bool NeedAutotuning(size_t batchSize, size_t workspaceSize)
{
// NVIDIA:
@ -687,7 +716,7 @@ std::unique_ptr<ConvolutionEngine<ElemType>> CuDnnConvolutionEngineFactory<ElemT
bool forceDeterministicAlgorithms, bool poolIncludePad,
bool inputHasFreeDimension)
{
return std::make_unique<CuDnnConvolutionEngine<ElemType>>(geometry, deviceId, imageLayout, maxTempMemSizeInSamples, poolKind,
forceDeterministicAlgorithms, poolIncludePad, inputHasFreeDimension);
}
@ -741,5 +770,6 @@ bool CuDnnConvolutionEngineFactory<ElemType>::IsSupported(DEVICEID_TYPE deviceId
template class CuDnnConvolutionEngineFactory<float>;
template class CuDnnConvolutionEngineFactory<double>;
template class CuDnnConvolutionEngineFactory<half>;
} } }

View file

@ -21,11 +21,11 @@ public:
static bool IsSupported(DEVICEID_TYPE deviceId, ConvolveGeometryPtr geometry, PoolKind poolKind);
};
template <class ElemType>
template <class InoutType, class StatType>
class CuDnnBatchNormEngineFactory
{
public:
static std::unique_ptr<BatchNormEngine<ElemType>> Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
static std::unique_ptr<BatchNormEngine<InoutType, StatType>> Create(DEVICEID_TYPE deviceId, const TensorShape& inOutT,
bool spatial, ImageLayoutKind imageLayout);
};

Some files were not shown because too many files have changed in this diff.